# Source code for sparknlp.base.embeddings_finisher

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the EmbeddingsFinisher."""

from pyspark import keyword_only
from pyspark.ml.param import TypeConverters, Params, Param
from sparknlp.internal import AnnotatorTransformer


class EmbeddingsFinisher(AnnotatorTransformer):
    """Extracts embeddings from Annotations into a more easily usable form.

    This is useful, for example, with:

    - WordEmbeddings,
    - Transformer based embeddings such as BertEmbeddings,
    - SentenceEmbeddings and
    - ChunkEmbeddings, etc.

    By using ``EmbeddingsFinisher`` you can easily transform your embeddings
    into arrays of floats or vectors which are compatible with Spark ML
    functions such as LDA, K-means, Random Forest classifier or any other
    functions that require a ``featureCol``.

    For more extended examples see the
    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``EMBEDDINGS``         ``NONE``
    ====================== ======================

    Parameters
    ----------
    inputCols
        Names of input annotation columns containing embeddings
    outputCols
        Names of finished output columns
    cleanAnnotations
        Whether to remove all the existing annotation columns, by default False
    outputAsVector
        Whether to output the embeddings as Vectors instead of arrays,
        by default False

    Examples
    --------
    First extract embeddings.

    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...    .setInputCol("text") \\
    ...    .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...    .setInputCols("document") \\
    ...    .setOutputCol("token")
    >>> normalizer = Normalizer() \\
    ...    .setInputCols("token") \\
    ...    .setOutputCol("normalized")
    >>> stopwordsCleaner = StopWordsCleaner() \\
    ...    .setInputCols("normalized") \\
    ...    .setOutputCol("cleanTokens") \\
    ...    .setCaseSensitive(False)
    >>> gloveEmbeddings = WordEmbeddingsModel.pretrained() \\
    ...    .setInputCols("document", "cleanTokens") \\
    ...    .setOutputCol("embeddings") \\
    ...    .setCaseSensitive(False)
    >>> embeddingsFinisher = EmbeddingsFinisher() \\
    ...    .setInputCols("embeddings") \\
    ...    .setOutputCols("finished_sentence_embeddings") \\
    ...    .setOutputAsVector(True) \\
    ...    .setCleanAnnotations(False)
    >>> data = spark.createDataFrame([["Spark NLP is an open-source text processing library."]]) \\
    ...    .toDF("text")
    >>> pipeline = Pipeline().setStages([
    ...    documentAssembler,
    ...    tokenizer,
    ...    normalizer,
    ...    stopwordsCleaner,
    ...    gloveEmbeddings,
    ...    embeddingsFinisher
    ... ]).fit(data)
    >>> result = pipeline.transform(data)

    Show results.

    >>> resultWithSize = result.selectExpr("explode(finished_sentence_embeddings) as embeddings")
    >>> resultWithSize.show(5, 80)
    +--------------------------------------------------------------------------------+
    |                                                                      embeddings|
    +--------------------------------------------------------------------------------+
    |[0.1619900017976761,0.045552998781204224,-0.03229299932718277,-0.685609996318...|
    |[-0.42416998744010925,1.1378999948501587,-0.5717899799346924,-0.5078899860382...|
    |[0.08621499687433243,-0.15772999823093414,-0.06067200005054474,0.395359992980...|
    |[-0.4970499873161316,0.7164199948310852,0.40119001269340515,-0.05761000141501...|
    |[-0.08170200139284134,0.7159299850463867,-0.20677000284194946,0.0295659992843...|
    +--------------------------------------------------------------------------------+

    See Also
    --------
    EmbeddingsFinisher : for finishing embeddings
    """

    # Param declarations. The description strings are runtime text and are
    # kept exactly as they were.
    inputCols = Param(
        Params._dummy(),
        "inputCols",
        "name of input annotation cols containing embeddings",
        typeConverter=TypeConverters.toListString,
    )
    outputCols = Param(
        Params._dummy(),
        "outputCols",
        "output EmbeddingsFinisher ouput cols",
        typeConverter=TypeConverters.toListString,
    )
    cleanAnnotations = Param(
        Params._dummy(),
        "cleanAnnotations",
        "whether to remove all the existing annotation columns",
        typeConverter=TypeConverters.toBoolean,
    )
    outputAsVector = Param(
        Params._dummy(),
        "outputAsVector",
        "if enabled it will output the embeddings as Vectors instead of arrays",
        typeConverter=TypeConverters.toBoolean,
    )

    name = "EmbeddingsFinisher"

    @keyword_only
    def __init__(self):
        """Creates the finisher and registers its default parameter values.

        Defaults: ``cleanAnnotations=False``, ``outputAsVector=False`` and an
        empty ``outputCols`` list. An empty ``outputCols`` makes
        :meth:`getOutputCols` derive the names from the input columns.
        """
        super(EmbeddingsFinisher, self).__init__(classname="com.johnsnowlabs.nlp.EmbeddingsFinisher")
        self._setDefault(
            cleanAnnotations=False,
            outputAsVector=False,
            outputCols=[]
        )

    @keyword_only
    def setParams(self):
        """Applies the keyword arguments captured by ``keyword_only``.

        Returns
        -------
        The result of ``self._set`` for the captured keyword arguments.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setInputCols(self, *value):
        """Sets name of input annotation columns containing embeddings.

        The names may be passed either as separate arguments or as a single
        list.

        Parameters
        ----------
        *value : str
            Input columns for the annotator

        Returns
        -------
        The result of ``self._set``, which allows chaining setter calls as
        shown in the class example.
        """
        # A single list argument is unpacked so that both
        # setInputCols("a", "b") and setInputCols(["a", "b"]) are accepted.
        if len(value) == 1 and isinstance(value[0], list):
            return self._set(inputCols=value[0])
        return self._set(inputCols=list(value))

    def setOutputCols(self, *value):
        """Sets names of finished output columns.

        The names may be passed either as separate arguments or as a single
        list.

        Parameters
        ----------
        *value : str
            Names of the finished output columns

        Returns
        -------
        The result of ``self._set``, which allows chaining setter calls as
        shown in the class example.
        """
        # A single list argument is unpacked so that both
        # setOutputCols("a", "b") and setOutputCols(["a", "b"]) are accepted.
        if len(value) == 1 and isinstance(value[0], list):
            return self._set(outputCols=value[0])
        return self._set(outputCols=list(value))

    def setCleanAnnotations(self, value):
        """Sets whether to remove all the existing annotation columns, by default
        False.

        Parameters
        ----------
        value : bool
            Whether to remove all the existing annotation columns

        Returns
        -------
        The result of ``self._set``, which allows chaining setter calls as
        shown in the class example.
        """
        return self._set(cleanAnnotations=value)

    def setOutputAsVector(self, value):
        """Sets whether to output the embeddings as Vectors instead of arrays,
        by default False.

        Parameters
        ----------
        value : bool
            Whether to output the embeddings as Vectors instead of arrays

        Returns
        -------
        The result of ``self._set``, which allows chaining setter calls as
        shown in the class example.
        """
        return self._set(outputAsVector=value)

    def getInputCols(self):
        """Gets input columns name of annotations."""
        return self.getOrDefault(self.inputCols)

    def getOutputCols(self):
        """Gets output columns name of annotations.

        When no output columns are set (the default is an empty list), the
        names are derived by prefixing each input column with ``finished_``.
        """
        output_cols = self.getOrDefault(self.outputCols)
        if not output_cols:
            return ["finished_" + input_col for input_col in self.getInputCols()]
        return output_cols