
#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for WordEmbeddings."""


from sparknlp.common import *


class WordEmbeddings(AnnotatorApproach, HasEmbeddingsProperties, HasStorage):
    """Word Embeddings lookup annotator that maps tokens to vectors.

    For instantiated/pretrained models, see :class:`.WordEmbeddingsModel`.

    A custom token lookup dictionary for embeddings can be set with
    :meth:`.setStoragePath`. Each line of the provided file needs to have a
    token, followed by its vector representation, delimited by spaces::

        ...
        are 0.39658191506190343 0.630968081620067 0.5393722253731201 0.8428180123359783
        were 0.7535235923631415 0.9699218875629833 0.10397182122983872 0.11833962569383116
        stress 0.0492683418305907 0.9415954572751959 0.47624463167525755 0.16790967216778263
        induced 0.1535748762292387 0.33498936903209897 0.9235178224122094 0.1158772920395934
        ...

    If a token is not found in the dictionary, then the result will be a zero
    vector of the same dimension. Statistics about the rate of converted
    tokens can be retrieved with
    :meth:`WordEmbeddingsModel.withCoverageColumn() <sparknlp.annotator.WordEmbeddingsModel.withCoverageColumn>`
    and
    :meth:`WordEmbeddingsModel.overallCoverage() <sparknlp.annotator.WordEmbeddingsModel.overallCoverage>`.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/scala/training/NerDL/win/customNerDlPipeline/CustomForNerDLPipeline.java>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, TOKEN``    ``WORD_EMBEDDINGS``
    ====================== ======================

    Parameters
    ----------
    writeBufferSize
        Buffer size limit before dumping to disk storage while writing,
        by default 10000
    readCacheSize
        Cache size for items retrieved from storage. Increase for performance
        but higher memory consumption

    Examples
    --------
    In this example, the file ``random_embeddings_dim4.txt`` has the form of
    the content above.

    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = WordEmbeddings() \\
    ...     .setStoragePath("src/test/resources/random_embeddings_dim4.txt", ReadAs.TEXT) \\
    ...     .setStorageRef("glove_4d") \\
    ...     .setDimension(4) \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("embeddings")
    >>> embeddingsFinisher = EmbeddingsFinisher() \\
    ...     .setInputCols(["embeddings"]) \\
    ...     .setOutputCols("finished_embeddings") \\
    ...     .setOutputAsVector(True) \\
    ...     .setCleanAnnotations(False)
    >>> pipeline = Pipeline() \\
    ...     .setStages([
    ...         documentAssembler,
    ...         tokenizer,
    ...         embeddings,
    ...         embeddingsFinisher
    ...     ])
    >>> data = spark.createDataFrame([["The patient was diagnosed with diabetes."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(finished_embeddings) as result").show(truncate=False)
    +----------------------------------------------------------------------------------+
    |result                                                                            |
    +----------------------------------------------------------------------------------+
    |[0.9439099431037903,0.4707513153553009,0.806300163269043,0.16176554560661316]     |
    |[0.7966810464859009,0.5551124811172485,0.8861005902290344,0.28284206986427307]    |
    |[0.025029370561242104,0.35177749395370483,0.052506182342767715,0.1887107789516449]|
    |[0.08617766946554184,0.8399239182472229,0.5395117998123169,0.7864698767662048]    |
    |[0.6599600911140442,0.16109347343444824,0.6041093468666077,0.8913561105728149]    |
    |[0.5955275893211365,0.01899011991918087,0.4397728443145752,0.8911281824111938]    |
    |[0.9840458631515503,0.7599489092826843,0.9417727589607239,0.8624503016471863]     |
    +----------------------------------------------------------------------------------+

    See Also
    --------
    SentenceEmbeddings : to combine embeddings into a sentence-level
        representation
    """

    name = "WordEmbeddings"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS

    writeBufferSize = Param(Params._dummy(),
                            "writeBufferSize",
                            "buffer size limit before dumping to disk storage while writing",
                            typeConverter=TypeConverters.toInt)

    readCacheSize = Param(Params._dummy(),
                          "readCacheSize",
                          "cache size for items retrieved from storage. Increase for performance but higher memory consumption",
                          typeConverter=TypeConverters.toInt)

    def setWriteBufferSize(self, v):
        """Sets buffer size limit before dumping to disk storage while
        writing, by default 10000.

        Parameters
        ----------
        v : int
            Buffer size limit
        """
        return self._set(writeBufferSize=v)

    def setReadCacheSize(self, v):
        """Sets cache size for items retrieved from storage. Increase for
        performance but higher memory consumption.

        Parameters
        ----------
        v : int
            Cache size for items retrieved from storage
        """
        return self._set(readCacheSize=v)

    @keyword_only
    def __init__(self):
        super(WordEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddings")
        self._setDefault(
            caseSensitive=False,
            writeBufferSize=10000,
            storageRef=self.uid
        )

    def _create_model(self, java_model):
        return WordEmbeddingsModel(java_model=java_model)

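
# A minimal usage sketch (not part of the library API) tying the storage parameters
# above together. The embeddings file path, storage reference, and buffer/cache
# values below are illustrative placeholders; ``documentAssembler``, ``tokenizer``
# and ``data`` are assumed to be defined as in the class docstring example.
#
#   embeddings = WordEmbeddings() \
#       .setStoragePath("path/to/custom_embeddings.txt", ReadAs.TEXT) \
#       .setStorageRef("custom_glove_100d") \
#       .setDimension(100) \
#       .setWriteBufferSize(20000) \
#       .setReadCacheSize(50000) \
#       .setInputCols(["document", "token"]) \
#       .setOutputCol("embeddings")
#
#   pipeline_model = Pipeline(stages=[documentAssembler, tokenizer, embeddings]).fit(data)
#   word_embeddings_model = pipeline_model.stages[-1]  # fitted stage is a WordEmbeddingsModel
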
class WordEmbeddingsModel(AnnotatorModel, HasEmbeddingsProperties, HasStorageModel):
    """Word Embeddings lookup annotator that maps tokens to vectors.

    This is the instantiated model of :class:`.WordEmbeddings`.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> embeddings = WordEmbeddingsModel.pretrained() \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("embeddings")

    The default model is ``"glove_100d"``, if no name is provided. For
    available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Embeddings>`__.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_offline.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, TOKEN``    ``WORD_EMBEDDINGS``
    ====================== ======================

    Parameters
    ----------
    dimension
        Number of embedding dimensions
    readCacheSize
        Cache size for items retrieved from storage. Increase for performance
        but higher memory consumption

    Notes
    -----
    There are also two convenient functions to retrieve the embeddings
    coverage with respect to the transformed dataset:

    - :meth:`.withCoverageColumn`: Adds a custom column with word coverage
      stats for the embedded field. This creates a new column with statistics
      for each row.
    - :meth:`.overallCoverage`: Calculates overall word coverage for the
      whole data in the embedded field. This returns a single coverage object
      considering all rows in the field.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = WordEmbeddingsModel.pretrained() \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("embeddings")
    >>> embeddingsFinisher = EmbeddingsFinisher() \\
    ...     .setInputCols(["embeddings"]) \\
    ...     .setOutputCols("finished_embeddings") \\
    ...     .setOutputAsVector(True) \\
    ...     .setCleanAnnotations(False)
    >>> pipeline = Pipeline() \\
    ...     .setStages([
    ...         documentAssembler,
    ...         tokenizer,
    ...         embeddings,
    ...         embeddingsFinisher
    ...     ])
    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
    +--------------------------------------------------------------------------------+
    |                                                                          result|
    +--------------------------------------------------------------------------------+
    |[-0.570580005645752,0.44183000922203064,0.7010200023651123,-0.417129993438720...|
    |[-0.542639970779419,0.4147599935531616,1.0321999788284302,-0.4024400115013122...|
    |[-0.2708599865436554,0.04400600120425224,-0.020260000601410866,-0.17395000159...|
    |[0.6191999912261963,0.14650000631809235,-0.08592499792575836,-0.2629800140857...|
    |[-0.3397899866104126,0.20940999686717987,0.46347999572753906,-0.6479200124740...|
    +--------------------------------------------------------------------------------+

    See Also
    --------
    SentenceEmbeddings : to combine embeddings into a sentence-level
        representation
    """

    name = "WordEmbeddingsModel"

    databases = ['EMBEDDINGS']

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS

    readCacheSize = Param(Params._dummy(),
                          "readCacheSize",
                          "cache size for items retrieved from storage. Increase for performance but higher memory consumption",
                          typeConverter=TypeConverters.toInt)

    def setReadCacheSize(self, v):
        """Sets cache size for items retrieved from storage. Increase for
        performance but higher memory consumption.

        Parameters
        ----------
        v : int
            Cache size for items retrieved from storage
        """
        return self._set(readCacheSize=v)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel", java_model=None):
        super(WordEmbeddingsModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def overallCoverage(dataset, embeddings_col):
        """Calculates overall word coverage for the whole data in the
        embedded field.

        This returns a single coverage object considering all rows in the
        field.

        Parameters
        ----------
        dataset : :class:`pyspark.sql.DataFrame`
            The dataset with embeddings column
        embeddings_col : str
            Name of the embeddings column

        Returns
        -------
        :class:`.CoverageResult`
            CoverageResult object with extracted information

        Examples
        --------
        >>> wordsOverallCoverage = WordEmbeddingsModel.overallCoverage(
        ...     resultDF, "embeddings"
        ... ).percentage
        1.0
        """
        from sparknlp.internal import _EmbeddingsOverallCoverage
        from sparknlp.common import CoverageResult
        return CoverageResult(_EmbeddingsOverallCoverage(dataset, embeddings_col).apply())

    @staticmethod
    def withCoverageColumn(dataset, embeddings_col, output_col='coverage'):
        """Adds a custom column with word coverage stats for the embedded
        field. This creates a new column with statistics for each row.

        Parameters
        ----------
        dataset : :class:`pyspark.sql.DataFrame`
            The dataset with embeddings column
        embeddings_col : str
            Name of the embeddings column
        output_col : str, optional
            Name for the resulting column, by default 'coverage'

        Returns
        -------
        :class:`pyspark.sql.DataFrame`
            Dataframe with calculated coverage

        Examples
        --------
        >>> wordsCoverage = WordEmbeddingsModel.withCoverageColumn(resultDF, "embeddings", "cov_embeddings")
        >>> wordsCoverage.select("text", "cov_embeddings").show(truncate=False)
        +-------------------+--------------+
        |text               |cov_embeddings|
        +-------------------+--------------+
        |This is a sentence.|[5, 5, 1.0]   |
        +-------------------+--------------+
        """
        from sparknlp.internal import _EmbeddingsCoverageColumn
        from pyspark.sql import DataFrame
        return DataFrame(_EmbeddingsCoverageColumn(dataset, embeddings_col, output_col).apply(), dataset.sql_ctx)

    @staticmethod
    def pretrained(name="glove_100d", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "glove_100d"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        WordEmbeddingsModel
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(WordEmbeddingsModel, name, lang, remote_loc)

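    # Illustrative call with the documented defaults spelled out: "glove_100d" and
    # "en" are the default name and language noted above; any other values would
    # need to correspond to a model available on the Models Hub.
    #
    #   embeddings = WordEmbeddingsModel.pretrained("glove_100d", "en") \
    #       .setInputCols(["document", "token"]) \
    #       .setOutputCol("embeddings")
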
    @staticmethod
    def loadStorage(path, spark, storage_ref):
        """Loads the model from storage.

        Parameters
        ----------
        path : str
            Path to the model
        spark : :class:`pyspark.sql.SparkSession`
            The current SparkSession
        storage_ref : str
            Identifiers for the model parameters
        """
        HasStorageModel.loadStorages(path, spark, storage_ref, WordEmbeddingsModel.databases)
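
    # Hedged sketch of calling loadStorage directly: the path below is an assumed
    # placeholder for a previously saved embeddings database, and "glove_100d" must
    # match the storage reference of the model that will use it.
    #
    #   WordEmbeddingsModel.loadStorage("/tmp/glove_100d_storage", spark, "glove_100d")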