# Source code for sparknlp.annotator.embeddings.doc2vec

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for Doc2Vec."""

from sparknlp.common import *


class Doc2VecApproach(AnnotatorApproach, HasStorageRef, HasEnableCachingProperties):
    """Trains a Word2Vec model that creates vector representations of words in
    a text corpus.

    The algorithm first constructs a vocabulary from the corpus and then
    learns vector representation of words in the vocabulary. The vector
    representation can be used as features in natural language processing and
    machine learning algorithms.

    We use Word2Vec implemented in Spark ML. It uses skip-gram model in our
    implementation and a hierarchical softmax method to train the model. The
    variable names in the implementation match the original C implementation.

    For instantiated/pretrained models, see :class:`.Doc2VecModel`.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models>`__.

    ====================== =======================
    Input Annotation types Output Annotation type
    ====================== =======================
    ``TOKEN``              ``SENTENCE_EMBEDDINGS``
    ====================== =======================

    Parameters
    ----------
    vectorSize
        The dimension of codes after transforming from words (> 0), by
        default 100
    windowSize
        The window size (context words from [-window, window]) (> 0), by
        default 5
    numPartitions
        Number of partitions for sentences of words (> 0), by default 1
    minCount
        The minimum number of times a token must appear to be included in the
        word2vec model's vocabulary (>= 0), by default 1
    maxSentenceLength
        Maximum length (in words) of each sentence in the input data. Any
        sentence longer than this threshold will be divided into chunks up to
        the size (> 0), by default 1000
    stepSize
        Step size (learning rate) to be used for each iteration of
        optimization (> 0), by default 0.025
    maxIter
        Maximum number of iterations (>= 0), by default 1
    seed
        Random seed, by default 44

    References
    ----------
    For the original C implementation, see https://code.google.com/p/word2vec/

    For the research paper, see `Efficient Estimation of Word Representations
    in Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
    Representations of Words and Phrases and their Compositionality
    <https://arxiv.org/pdf/1310.4546v1.pdf>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = Doc2VecApproach() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("embeddings")
    >>> pipeline = Pipeline() \\
    ...     .setStages([
    ...       documentAssembler,
    ...       tokenizer,
    ...       embeddings
    ...     ])
    >>> path = "sherlockholmes.txt"
    >>> dataset = spark.read.text(path).toDF("text")
    >>> pipelineModel = pipeline.fit(dataset)
    """

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS

    vectorSize = Param(Params._dummy(),
                       "vectorSize",
                       "the dimension of codes after transforming from words (> 0)",
                       typeConverter=TypeConverters.toInt)

    windowSize = Param(Params._dummy(),
                       "windowSize",
                       "the window size (context words from [-window, window]) (> 0)",
                       typeConverter=TypeConverters.toInt)

    numPartitions = Param(Params._dummy(),
                          "numPartitions",
                          "number of partitions for sentences of words (> 0)",
                          typeConverter=TypeConverters.toInt)

    minCount = Param(Params._dummy(),
                     "minCount",
                     "the minimum number of times a token must " +
                     "appear to be included in the word2vec model's vocabulary (>= 0)",
                     typeConverter=TypeConverters.toInt)

    # NOTE(review): description previously started with a copy-pasted
    # "the window size (" fragment from windowSize; corrected here.
    maxSentenceLength = Param(Params._dummy(),
                              "maxSentenceLength",
                              "Maximum length (in words) of each sentence in the input data. "
                              "Any sentence longer than this threshold will " +
                              "be divided into chunks up to the size (> 0)",
                              typeConverter=TypeConverters.toInt)

    stepSize = Param(Params._dummy(),
                     "stepSize",
                     "Step size (learning rate) to be used for each iteration of optimization (> 0)",
                     typeConverter=TypeConverters.toFloat)

    maxIter = Param(Params._dummy(),
                    "maxIter",
                    "maximum number of iterations (>= 0)",
                    typeConverter=TypeConverters.toInt)

    seed = Param(Params._dummy(),
                 "seed",
                 "Random seed",
                 typeConverter=TypeConverters.toInt)

    def setVectorSize(self, vectorSize):
        """Sets vector size (default: 100)."""
        return self._set(vectorSize=vectorSize)

    def setWindowSize(self, windowSize):
        """Sets window size (default: 5)."""
        return self._set(windowSize=windowSize)

    def setStepSize(self, stepSize):
        """Sets initial learning rate (default: 0.025)."""
        return self._set(stepSize=stepSize)

    def setNumPartitions(self, numPartitions):
        """Sets number of partitions (default: 1). Use a small number for
        accuracy.
        """
        return self._set(numPartitions=numPartitions)

    def setMaxIter(self, numIterations):
        """Sets number of iterations (default: 1), which should be smaller
        than or equal to number of partitions.
        """
        return self._set(maxIter=numIterations)

    def setSeed(self, seed):
        """Sets random seed."""
        return self._set(seed=seed)

    def setMinCount(self, minCount):
        """Sets minCount, the minimum number of times a token must appear to
        be included in the word2vec model's vocabulary (default: 1).
        """
        return self._set(minCount=minCount)

    def setMaxSentenceLength(self, maxSentenceLength):
        """Sets the maximum length (in words) of each sentence in the input
        data (default: 1000). Any sentence longer than this threshold will be
        divided into chunks up to the size (> 0).
        """
        return self._set(maxSentenceLength=maxSentenceLength)

    @keyword_only
    def __init__(self):
        super(Doc2VecApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.embeddings.Doc2VecApproach")
        self._setDefault(
            vectorSize=100,
            windowSize=5,
            numPartitions=1,
            minCount=1,
            maxSentenceLength=1000,
            stepSize=0.025,
            maxIter=1,
            seed=44
        )

    def _create_model(self, java_model):
        # Wrap the fitted JVM model in its Python companion class.
        return Doc2VecModel(java_model=java_model)
class Doc2VecModel(AnnotatorModel, HasStorageRef, HasEmbeddingsProperties):
    """Word2Vec model that creates vector representations of words in a text
    corpus.

    The algorithm first constructs a vocabulary from the corpus and then
    learns vector representation of words in the vocabulary. The vector
    representation can be used as features in natural language processing and
    machine learning algorithms.

    We use Word2Vec implemented in Spark ML. It uses skip-gram model in our
    implementation and a hierarchical softmax method to train the model. The
    variable names in the implementation match the original C implementation.

    This is the instantiated model of the :class:`.Doc2VecApproach`. For
    training your own model, please see the documentation of that class.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> embeddings = Doc2VecModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("embeddings")

    The default model is ``"doc2vec_gigaword_300"``, if no name is provided.

    ====================== =======================
    Input Annotation types Output Annotation type
    ====================== =======================
    ``TOKEN``              ``SENTENCE_EMBEDDINGS``
    ====================== =======================

    Parameters
    ----------
    vectorSize
        The dimension of codes after transforming from words (> 0), by
        default 100

    References
    ----------
    For the original C implementation, see https://code.google.com/p/word2vec/

    For the research paper, see `Efficient Estimation of Word Representations
    in Vector Space <https://arxiv.org/abs/1301.3781>`__ and `Distributed
    Representations of Words and Phrases and their Compositionality
    <https://arxiv.org/pdf/1310.4546v1.pdf>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = Doc2VecModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("embeddings")
    >>> embeddingsFinisher = EmbeddingsFinisher() \\
    ...     .setInputCols(["embeddings"]) \\
    ...     .setOutputCols("finished_embeddings") \\
    ...     .setOutputAsVector(True)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     embeddings,
    ...     embeddingsFinisher
    ... ])
    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
    +--------------------------------------------------------------------------------+
    |                                                                          result|
    +--------------------------------------------------------------------------------+
    |[0.06222493574023247,0.011579325422644615,0.009919632226228714,0.109361454844...|
    +--------------------------------------------------------------------------------+
    """

    name = "Doc2VecModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS

    vectorSize = Param(Params._dummy(),
                       "vectorSize",
                       "the dimension of codes after transforming from words (> 0)",
                       typeConverter=TypeConverters.toInt)

    def setVectorSize(self, vectorSize):
        """Sets vector size (default: 100)."""
        return self._set(vectorSize=vectorSize)

    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.Doc2VecModel", java_model=None):
        super(Doc2VecModel, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            vectorSize=100
        )

    @staticmethod
    def pretrained(name="doc2vec_gigaword_300", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "doc2vec_gigaword_300"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        Doc2VecModel
            The restored model
        """
        # Imported lazily to avoid a circular import at module load time.
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(Doc2VecModel, name, lang, remote_loc)

    def getVectors(self):
        """Returns the vector representation of the words as a dataframe with
        two fields, word and vector.
        """
        return self._call_java("getVectors")