#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for ElmoEmbeddings."""

from sparknlp.common import *


class ElmoEmbeddings(AnnotatorModel,
                     HasEmbeddingsProperties,
                     HasCaseSensitiveProperties,
                     HasStorageRef,
                     HasEngine):
    """Word embeddings from ELMo (Embeddings from Language Models), a language
    model trained on the 1 Billion Word Benchmark.

    Note that this is a very computationally expensive module compared to word
    embedding modules that only perform embedding lookups. The use of an
    accelerator is recommended.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> embeddings = ElmoEmbeddings.pretrained() \\
    ...     .setInputCols(["sentence", "token"]) \\
    ...     .setOutputCol("elmo_embeddings")

    The default model is ``"elmo"``, if no name is provided.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Embeddings>`__.

    The pooling layer can be set with :meth:`.setPoolingLayer` to the following
    values:

    - ``"word_emb"``: the character-based word representations with shape
      ``[batch_size, max_length, 512]``.
    - ``"lstm_outputs1"``: the first LSTM hidden state with shape
      ``[batch_size, max_length, 1024]``.
    - ``"lstm_outputs2"``: the second LSTM hidden state with shape
      ``[batch_size, max_length, 1024]``.
    - ``"elmo"``: the weighted sum of the 3 layers, where the weights are
      trainable. This tensor has shape ``[batch_size, max_length, 1024]``.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_elmo.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, TOKEN``    ``WORD_EMBEDDINGS``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Batch size. Larger values allow faster processing but require more
        memory, by default 32
    dimension
        Number of embedding dimensions
    caseSensitive
        Whether to ignore case in tokens for embeddings matching
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.
    poolingLayer
        Set ELMo pooling layer to: word_emb, lstm_outputs1, lstm_outputs2, or
        elmo, by default word_emb

    References
    ----------
    https://tfhub.dev/google/elmo/3

    `Deep contextualized word representations
    <https://arxiv.org/abs/1802.05365>`__

    **Paper abstract:**

    *We introduce a new type of deep contextualized word representation that
    models both (1) complex characteristics of word use (e.g., syntax and
    semantics), and (2) how these uses vary across linguistic contexts (i.e.,
    to model polysemy). Our word vectors are learned functions of the internal
    states of a deep bidirectional language model (biLM), which is pre-trained
    on a large text corpus. We show that these representations can be easily
    added to existing models and significantly improve the state of the art
    across six challenging NLP problems, including question answering, textual
    entailment and sentiment analysis. We also present an analysis showing that
    exposing the deep internals of the pre-trained network is crucial, allowing
    downstream models to mix different types of semi-supervision signals.*

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> embeddings = ElmoEmbeddings.pretrained() \\
    ...     .setPoolingLayer("word_emb") \\
    ...     .setInputCols(["token", "document"]) \\
    ...     .setOutputCol("embeddings")
    >>> embeddingsFinisher = EmbeddingsFinisher() \\
    ...     .setInputCols(["embeddings"]) \\
    ...     .setOutputCols("finished_embeddings") \\
    ...     .setOutputAsVector(True) \\
    ...     .setCleanAnnotations(False)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     embeddings,
    ...     embeddingsFinisher
    ... ])
    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
    +--------------------------------------------------------------------------------+
    |                                                                          result|
    +--------------------------------------------------------------------------------+
    |[6.662458181381226E-4,-0.2541114091873169,-0.6275503039360046,0.5787073969841...|
    |[0.19154725968837738,0.22998669743537903,-0.2894386649131775,0.21524395048618...|
    |[0.10400570929050446,0.12288510054349899,-0.07056470215320587,-0.246389418840...|
    |[0.49932169914245605,-0.12706467509269714,0.30969417095184326,0.2643227577209...|
    |[-0.8871506452560425,-0.20039963722229004,-1.0601330995559692,0.0348707810044...|
    +--------------------------------------------------------------------------------+
    """

    name = "ElmoEmbeddings"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS

    batchSize = Param(Params._dummy(),
                      "batchSize",
                      "Batch size. Larger values allow faster processing but require more memory.",
                      typeConverter=TypeConverters.toInt)

    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListInt)

    poolingLayer = Param(Params._dummy(),
                         "poolingLayer",
                         "Set ELMo pooling layer to: word_emb, lstm_outputs1, lstm_outputs2, or elmo",
                         typeConverter=TypeConverters.toString)
    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
        """
        return self._set(configProtoBytes=b)
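    # Hedged sketch: building the serialized ConfigProto with TensorFlow, as
    # the Param description above suggests. Assumes TensorFlow is installed
    # alongside Spark NLP; the GPU options shown are illustrative only.
    #
    #   import tensorflow as tf
    #   config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    #   config.gpu_options.allow_growth = True
    #   # SerializeToString() returns bytes; the Param expects a list of ints
    #   embeddings.setConfigProtoBytes([int(b) for b in config.SerializeToString()])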
    def setBatchSize(self, value):
        """Sets batch size, by default 32.

        Parameters
        ----------
        value : int
            Batch size
        """
        return self._set(batchSize=value)
    def setPoolingLayer(self, layer):
        """Sets ELMo pooling layer to: word_emb, lstm_outputs1, lstm_outputs2,
        or elmo, by default word_emb. Unknown values fall back to the default.

        Parameters
        ----------
        layer : str
            ELMo pooling layer
        """
        if layer in ("word_emb", "lstm_outputs1", "lstm_outputs2", "elmo"):
            return self._set(poolingLayer=layer)
        return self._set(poolingLayer="word_emb")
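    # Hedged usage sketch: selecting a pooling layer. "word_emb" yields
    # 512-dimensional vectors; the LSTM layers and "elmo" yield 1024. As the
    # setter above shows, unrecognized names silently fall back to "word_emb".
    #
    #   embeddings = ElmoEmbeddings.pretrained() \
    #       .setInputCols(["document", "token"]) \
    #       .setOutputCol("embeddings") \
    #       .setPoolingLayer("lstm_outputs2")  # second LSTM state, 1024-dim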
    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings", java_model=None):
        super(ElmoEmbeddings, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            batchSize=32,
            poolingLayer="word_emb"
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession

        Returns
        -------
        ElmoEmbeddings
            The restored model
        """
        from sparknlp.internal import _ElmoLoader
        jModel = _ElmoLoader(folder, spark_session._jsparkSession)._java_obj
        return ElmoEmbeddings(java_model=jModel)
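    # Hedged usage sketch: restoring a model exported to disk. The folder
    # path is hypothetical; it should contain the saved ELMo TensorFlow model.
    #
    #   elmo = ElmoEmbeddings.loadSavedModel("/models/elmo_saved", spark) \
    #       .setInputCols(["document", "token"]) \
    #       .setOutputCol("embeddings")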
    @staticmethod
    def pretrained(name="elmo", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "elmo"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        ElmoEmbeddings
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(ElmoEmbeddings, name, lang, remote_loc)
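    # Hedged usage sketch: fetching the default English model explicitly.
    # The first call downloads and caches the model; subsequent calls reuse
    # the local cache.
    #
    #   elmo = ElmoEmbeddings.pretrained("elmo", "en") \
    #       .setInputCols(["document", "token"]) \
    #       .setOutputCol("embeddings")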