# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for ChunkEmbeddings"""
from sparknlp.common import *

class ChunkEmbeddings(AnnotatorModel):
    """This annotator utilizes WordEmbeddings, BertEmbeddings etc. to generate
    chunk embeddings from either Chunker, NGramGenerator, or NerConverter
    outputs.

    For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/embeddings/ChunkEmbeddings.ipynb>`__.

    ========================== ======================
    Input Annotation types     Output Annotation type
    ========================== ======================
    ``CHUNK, WORD_EMBEDDINGS`` ``WORD_EMBEDDINGS``
    ========================== ======================

    Parameters
    ----------
    poolingStrategy
        Choose how you would like to aggregate Word Embeddings to Chunk
        Embeddings, by default AVERAGE.
        Possible Values: ``AVERAGE, SUM``
    skipOOV
        Whether to discard default vectors for out-of-vocabulary (OOV) words
        from the aggregation/pooling.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline

    Extract the embeddings from the NGrams:

    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentence = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["sentence"]) \\
    ...     .setOutputCol("token")
    >>> nGrams = NGramGenerator() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("chunk") \\
    ...     .setN(2)
    >>> embeddings = WordEmbeddingsModel.pretrained() \\
    ...     .setInputCols(["sentence", "token"]) \\
    ...     .setOutputCol("embeddings") \\
    ...     .setCaseSensitive(False)

    Convert the NGram chunks into word embeddings:

    >>> chunkEmbeddings = ChunkEmbeddings() \\
    ...     .setInputCols(["chunk", "embeddings"]) \\
    ...     .setOutputCol("chunk_embeddings") \\
    ...     .setPoolingStrategy("AVERAGE")
    >>> pipeline = Pipeline() \\
    ...     .setStages([
    ...         documentAssembler,
    ...         sentence,
    ...         tokenizer,
    ...         nGrams,
    ...         embeddings,
    ...         chunkEmbeddings
    ...     ])
    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(chunk_embeddings) as result") \\
    ...     .select("result.annotatorType", "result.result", "result.embeddings") \\
    ...     .show(5, 80)
    +---------------+----------+--------------------------------------------------------------------------------+
    |  annotatorType|    result|                                                                      embeddings|
    +---------------+----------+--------------------------------------------------------------------------------+
    |word_embeddings|   This is|[-0.55661, 0.42829502, 0.86661, -0.409785, 0.06316501, 0.120775, -0.0732005, ...|
    |word_embeddings|      is a|[-0.40674996, 0.22938299, 0.50597, -0.288195, 0.555655, 0.465145, 0.140118, 0...|
    |word_embeddings|a sentence|[0.17417, 0.095253006, -0.0530925, -0.218465, 0.714395, 0.79860497, 0.0129999...|
    |word_embeddings|sentence .|[0.139705, 0.177955, 0.1887775, -0.45545, 0.20030999, 0.461557, -0.07891501, ...|
    +---------------+----------+--------------------------------------------------------------------------------+
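
    As a minimal numeric sketch of the two pooling strategies (toy
    two-dimensional vectors chosen for illustration, not actual model
    output), AVERAGE takes the element-wise mean of a chunk's word vectors
    while SUM adds them:

    >>> v1, v2 = [1.0, 2.0], [3.0, 4.0]
    >>> [(a + b) / 2 for a, b in zip(v1, v2)]
    [2.0, 3.0]
    >>> [a + b for a, b in zip(v1, v2)]
    [4.0, 6.0]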
"""
name = "ChunkEmbeddings"
inputAnnotatorTypes = [AnnotatorType.CHUNK, AnnotatorType.WORD_EMBEDDINGS]
outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS

    @keyword_only
    def __init__(self):
        super(ChunkEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.ChunkEmbeddings")
        self._setDefault(
            poolingStrategy="AVERAGE"
        )

    poolingStrategy = Param(Params._dummy(),
                            "poolingStrategy",
                            "Choose how you would like to aggregate Word Embeddings to Chunk Embeddings: " +
                            "AVERAGE or SUM",
                            typeConverter=TypeConverters.toString)

    skipOOV = Param(Params._dummy(), "skipOOV",
                    "Whether to discard default vectors for OOV words from the aggregation/pooling",
                    typeConverter=TypeConverters.toBoolean)

    def setPoolingStrategy(self, strategy):
        """Sets how to aggregate Word Embeddings to Chunk Embeddings, by
        default AVERAGE.

        Possible Values: ``AVERAGE, SUM``. Any other value falls back to the
        default, AVERAGE.

        Parameters
        ----------
        strategy : str
            Aggregation strategy
        """
        # Unknown strategies silently fall back to the default, AVERAGE.
        if strategy in ("AVERAGE", "SUM"):
            return self._set(poolingStrategy=strategy)
        else:
            return self._set(poolingStrategy="AVERAGE")

    def setSkipOOV(self, value):
        """Sets whether to discard default vectors for OOV words from the
        aggregation/pooling.

        Parameters
        ----------
        value : bool
            Whether to discard default vectors for OOV words from the
            aggregation/pooling.
        """
        return self._set(skipOOV=value)
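

# What follows is a minimal, standalone sketch of the pooling arithmetic the
# class documentation describes, written against plain Python lists so it can
# run without Spark. It is an illustration under stated assumptions, not the
# annotator's actual implementation (which lives in the Scala backend), and
# the helper name ``_pool_chunk_sketch`` is hypothetical. ``skip_oov`` assumes
# OOV tokens receive all-zero default vectors.
def _pool_chunk_sketch(vectors, strategy="AVERAGE", skip_oov=False):
    """Illustrative only: pool equal-length word vectors into one chunk vector."""
    if skip_oov:
        # Drop all-zero (assumed OOV default) vectors; keep the input as-is
        # if every vector is zero, so the pooling below stays well-defined.
        vectors = [v for v in vectors if any(v)] or vectors
    summed = [sum(components) for components in zip(*vectors)]
    if strategy == "SUM":
        return summed
    return [component / len(vectors) for component in summed]  # AVERAGE

# For example, two token vectors pooled with each strategy:
#   _pool_chunk_sketch([[1.0, 2.0], [3.0, 4.0]])         -> [2.0, 3.0]
#   _pool_chunk_sketch([[1.0, 2.0], [3.0, 4.0]], "SUM")  -> [4.0, 6.0]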