# Source code for sparknlp.annotator.embeddings.doc2vec

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for Doc2Vec."""

from sparknlp.common import *


class Doc2VecApproach(AnnotatorApproach, HasStorageRef, HasEnableCachingProperties):
    """Trains a Word2Vec model that creates vector representations of words in
    a text corpus.

    The algorithm first constructs a vocabulary from the corpus and then
    learns vector representation of words in the vocabulary. The vector
    representation can be used as features in natural language processing and
    machine learning algorithms.

    We use Word2Vec implemented in Spark ML. It uses skip-gram model in our
    implementation and a hierarchical softmax method to train the model. The
    variable names in the implementation match the original C implementation.

    For instantiated/pretrained models, see :class:`.Doc2VecModel`.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models>`__.

    ====================== =======================
    Input Annotation types Output Annotation type
    ====================== =======================
    ``TOKEN``              ``SENTENCE_EMBEDDINGS``
    ====================== =======================

    Parameters
    ----------
    vectorSize
        The dimension of codes after transforming from words (> 0), by
        default 100
    windowSize
        The window size (context words from [-window, window]) (> 0), by
        default 5
    numPartitions
        Number of partitions for sentences of words (> 0), by default 1
    minCount
        The minimum number of times a token must appear to be included in the
        word2vec model's vocabulary (>= 0), by default 1
    maxSentenceLength
        Maximum length (in words) of each sentence in the input data. Any
        sentence longer than this threshold will be divided into chunks up to
        the size (> 0), by default 1000
    stepSize
        Step size (learning rate) to be used for each iteration of
        optimization (> 0), by default 0.025
    maxIter
        Maximum number of iterations (>= 0), by default 1
    seed
        Random seed, by default 44

    References
    ----------
    For the original C implementation, see https://code.google.com/p/word2vec/

    For the research paper, see `Efficient Estimation of Word Representations
    in Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
    Representations of Words and Phrases and their Compositionality
    <https://arxiv.org/pdf/1310.4546v1.pdf>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = Doc2VecApproach() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("embeddings")
    >>> pipeline = Pipeline() \\
    ...     .setStages([
    ...       documentAssembler,
    ...       tokenizer,
    ...       embeddings
    ...     ])
    >>> path = "sherlockholmes.txt"
    >>> dataset = spark.read.text(path).toDF("text")
    >>> pipelineModel = pipeline.fit(dataset)
    """

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS

    vectorSize = Param(Params._dummy(),
                       "vectorSize",
                       "the dimension of codes after transforming from words (> 0)",
                       typeConverter=TypeConverters.toInt)

    windowSize = Param(Params._dummy(),
                       "windowSize",
                       "the window size (context words from [-window, window]) (> 0)",
                       typeConverter=TypeConverters.toInt)

    numPartitions = Param(Params._dummy(),
                          "numPartitions",
                          "number of partitions for sentences of words (> 0)",
                          typeConverter=TypeConverters.toInt)

    minCount = Param(Params._dummy(),
                     "minCount",
                     "the minimum number of times a token must " +
                     "appear to be included in the word2vec model's vocabulary (>= 0)",
                     typeConverter=TypeConverters.toInt)

    # NOTE(review): description previously started with a copy-pasted
    # "the window size (" fragment from windowSize; corrected here.
    maxSentenceLength = Param(Params._dummy(),
                              "maxSentenceLength",
                              "Maximum length (in words) of each sentence in the input data. "
                              "Any sentence longer than this threshold will " +
                              "be divided into chunks up to the size (> 0)",
                              typeConverter=TypeConverters.toInt)

    stepSize = Param(Params._dummy(),
                     "stepSize",
                     "Step size (learning rate) to be used for each iteration of optimization (> 0)",
                     typeConverter=TypeConverters.toFloat)

    maxIter = Param(Params._dummy(),
                    "maxIter",
                    "maximum number of iterations (>= 0)",
                    typeConverter=TypeConverters.toInt)

    seed = Param(Params._dummy(),
                 "seed",
                 "Random seed",
                 typeConverter=TypeConverters.toInt)

    def setVectorSize(self, vectorSize):
        """Sets vector size (default: 100)."""
        return self._set(vectorSize=vectorSize)

    def setWindowSize(self, windowSize):
        """Sets window size (default: 5)."""
        return self._set(windowSize=windowSize)

    def setStepSize(self, stepSize):
        """Sets initial learning rate (default: 0.025)."""
        return self._set(stepSize=stepSize)

    def setNumPartitions(self, numPartitions):
        """Sets number of partitions (default: 1). Use a small number for
        accuracy.
        """
        return self._set(numPartitions=numPartitions)

    def setMaxIter(self, numIterations):
        """Sets number of iterations (default: 1), which should be smaller
        than or equal to number of partitions.
        """
        return self._set(maxIter=numIterations)

    def setSeed(self, seed):
        """Sets random seed."""
        return self._set(seed=seed)

    def setMinCount(self, minCount):
        """Sets minCount, the minimum number of times a token must appear to
        be included in the word2vec model's vocabulary (default: 1).
        """
        return self._set(minCount=minCount)

    def setMaxSentenceLength(self, maxSentenceLength):
        """Sets the maximum length (in words) of each sentence in the input
        data (default: 1000). Any sentence longer than this threshold will be
        divided into chunks up to the size (> 0).
        """
        return self._set(maxSentenceLength=maxSentenceLength)

    @keyword_only
    def __init__(self):
        super(Doc2VecApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.embeddings.Doc2VecApproach")
        self._setDefault(
            vectorSize=100,
            windowSize=5,
            numPartitions=1,
            minCount=1,
            maxSentenceLength=1000,
            stepSize=0.025,
            maxIter=1,
            seed=44
        )

    def _create_model(self, java_model):
        # Wrap the fitted JVM model in its Python companion class.
        return Doc2VecModel(java_model=java_model)
class Doc2VecModel(AnnotatorModel, HasStorageRef, HasEmbeddingsProperties):
    """Word2Vec model that creates vector representations of words in a text
    corpus.

    The algorithm first constructs a vocabulary from the corpus and then
    learns vector representation of words in the vocabulary. The vector
    representation can be used as features in natural language processing and
    machine learning algorithms.

    We use Word2Vec implemented in Spark ML. It uses skip-gram model in our
    implementation and a hierarchical softmax method to train the model. The
    variable names in the implementation match the original C implementation.

    This is the instantiated model of the :class:`.Doc2VecApproach`. For
    training your own model, please see the documentation of that class.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> embeddings = Doc2VecModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("embeddings")

    The default model is ``"doc2vec_gigaword_300"``, if no name is provided.

    ====================== =======================
    Input Annotation types Output Annotation type
    ====================== =======================
    ``TOKEN``              ``SENTENCE_EMBEDDINGS``
    ====================== =======================

    Parameters
    ----------
    vectorSize
        The dimension of codes after transforming from words (> 0), by
        default 100

    References
    ----------
    For the original C implementation, see https://code.google.com/p/word2vec/

    For the research paper, see `Efficient Estimation of Word Representations
    in Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
    Representations of Words and Phrases and their Compositionality
    <https://arxiv.org/pdf/1310.4546v1.pdf>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = Doc2VecModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("embeddings")
    >>> embeddingsFinisher = EmbeddingsFinisher() \\
    ...     .setInputCols(["embeddings"]) \\
    ...     .setOutputCols("finished_embeddings") \\
    ...     .setOutputAsVector(True)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     embeddings,
    ...     embeddingsFinisher
    ... ])
    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
    +--------------------------------------------------------------------------------+
    |                                                                          result|
    +--------------------------------------------------------------------------------+
    |[0.06222493574023247,0.011579325422644615,0.009919632226228714,0.109361454844...|
    +--------------------------------------------------------------------------------+
    """

    name = "Doc2VecModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS

    vectorSize = Param(Params._dummy(),
                       "vectorSize",
                       "the dimension of codes after transforming from words (> 0)",
                       typeConverter=TypeConverters.toInt)

    def setVectorSize(self, vectorSize):
        """Sets vector size (default: 100)."""
        return self._set(vectorSize=vectorSize)

    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.Doc2VecModel", java_model=None):
        super(Doc2VecModel, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            vectorSize=100
        )

    @staticmethod
    def pretrained(name="doc2vec_gigaword_300", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "doc2vec_gigaword_300"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        Doc2VecModel
            The restored model
        """
        # Imported lazily to avoid a circular import at module load time.
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(Doc2VecModel, name, lang, remote_loc)

    def getVectors(self):
        """Returns the vector representation of the words as a dataframe with
        two fields, word and vector.
        """
        return self._call_java("getVectors")