# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for SentenceEmbeddings."""
from sparknlp.common import *


class SentenceEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasStorageRef):
"""Converts the results from WordEmbeddings, BertEmbeddings, or other word
embeddings into sentence or document embeddings by either summing up or
averaging all the word embeddings in a sentence or a document (depending on
the inputCols).

This can be configured with :meth:`.setPoolingStrategy`, which can be either
``"AVERAGE"`` or ``"SUM"``.
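
Both strategies reduce the token vectors element-wise. As a plain Python
sketch of the idea (illustrative only, not this annotator's implementation):

>>> token_vectors = [[1.0, 2.0], [3.0, 4.0]]
>>> [sum(dims) for dims in zip(*token_vectors)]  # SUM pooling
[4.0, 6.0]
>>> [sum(dims) / len(token_vectors) for dims in zip(*token_vectors)]  # AVERAGE pooling
[2.0, 3.0]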

For more extended examples see the `Examples
<https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb>`__.

============================= =======================
Input Annotation types Output Annotation type
============================= =======================
``DOCUMENT, WORD_EMBEDDINGS`` ``SENTENCE_EMBEDDINGS``
============================= =======================

Parameters
----------
dimension
Number of embedding dimensions
poolingStrategy
Choose how you would like to aggregate Word Embeddings to Sentence
Embeddings: AVERAGE or SUM, by default AVERAGE

Notes
-----
If you choose ``document`` as the input for Tokenizer,
WordEmbeddings/BertEmbeddings, and SentenceEmbeddings, then it averages/sums
all the embeddings into one array of embeddings. However, if you choose
``sentence`` as the inputCols, then SentenceEmbeddings generates one array of
embeddings per sentence.
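
For example, a sentence-level setup could look like the following sketch
(assuming the default pretrained WordEmbeddingsModel and a SentenceDetector
stage; illustrative only):

>>> sentenceDetector = SentenceDetector() \\
... .setInputCols(["document"]) \\
... .setOutputCol("sentence")
>>> tokenizer = Tokenizer() \\
... .setInputCols(["sentence"]) \\
... .setOutputCol("token")
>>> embeddings = WordEmbeddingsModel.pretrained() \\
... .setInputCols(["sentence", "token"]) \\
... .setOutputCol("embeddings")
>>> embeddingsSentence = SentenceEmbeddings() \\
... .setInputCols(["sentence", "embeddings"]) \\
... .setOutputCol("sentence_embeddings")
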
Examples
--------
>>> import sparknlp
>>> from sparknlp.base import *
>>> from sparknlp.annotator import *
>>> from pyspark.ml import Pipeline
>>> documentAssembler = DocumentAssembler() \\
... .setInputCol("text") \\
... .setOutputCol("document")
>>> tokenizer = Tokenizer() \\
... .setInputCols(["document"]) \\
... .setOutputCol("token")
>>> embeddings = WordEmbeddingsModel.pretrained() \\
... .setInputCols(["document", "token"]) \\
... .setOutputCol("embeddings")
>>> embeddingsSentence = SentenceEmbeddings() \\
... .setInputCols(["document", "embeddings"]) \\
... .setOutputCol("sentence_embeddings") \\
... .setPoolingStrategy("AVERAGE")
>>> embeddingsFinisher = EmbeddingsFinisher() \\
... .setInputCols(["sentence_embeddings"]) \\
... .setOutputCols("finished_embeddings") \\
... .setOutputAsVector(True) \\
... .setCleanAnnotations(False)
>>> pipeline = Pipeline() \\
... .setStages([
... documentAssembler,
... tokenizer,
... embeddings,
... embeddingsSentence,
... embeddingsFinisher
... ])
>>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
>>> result = pipeline.fit(data).transform(data)
>>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+--------------------------------------------------------------------------------+
| result|
+--------------------------------------------------------------------------------+
|[-0.22093398869037628,0.25130119919776917,0.41810303926467896,-0.380883991718...|
+--------------------------------------------------------------------------------+
"""
name = "SentenceEmbeddings"
inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.WORD_EMBEDDINGS]
outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
@keyword_only
def __init__(self):
super(SentenceEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.SentenceEmbeddings")
self._setDefault(
poolingStrategy="AVERAGE"
)
poolingStrategy = Param(Params._dummy(),
"poolingStrategy",
"Choose how you would like to aggregate Word Embeddings to Sentence Embeddings: AVERAGE or SUM",
typeConverter=TypeConverters.toString)
def setPoolingStrategy(self, strategy):
"""Sets how to aggregate the word embeddings into sentence embeddings, by
default AVERAGE.

Can either be AVERAGE or SUM. Any other value falls back to AVERAGE.

Parameters
----------
strategy : str
Pooling strategy, either AVERAGE or SUM

Returns
-------
SentenceEmbeddings
This instance, to allow chained setter calls
"""
# Unrecognized values fall back to the default AVERAGE strategy.
if strategy in ("AVERAGE", "SUM"):
return self._set(poolingStrategy=strategy)
else:
return self._set(poolingStrategy="AVERAGE")