# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for ElmoEmbeddings."""
from sparknlp.common import *
[docs]class ElmoEmbeddings(AnnotatorModel,
HasEmbeddingsProperties,
HasCaseSensitiveProperties,
HasStorageRef,
HasEngine):
"""Word embeddings from ELMo (Embeddings from Language Models), a language
model trained on the 1 Billion Word Benchmark.
Note that this is a very computationally expensive module compared to word
embedding modules that only perform embedding lookups. The use of an
accelerator is recommended.
Pretrained models can be loaded with :meth:`.pretrained` of the companion
object:
>>> embeddings = ElmoEmbeddings.pretrained() \\
... .setInputCols(["sentence", "token"]) \\
... .setOutputCol("elmo_embeddings")
The default model is ``"elmo"``, if no name is provided.
For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Embeddings>`__.
The pooling layer can be set with :meth:`.setPoolingLayer` to the following
values:
- ``"word_emb"``: the character-based word representations with shape
``[batch_size, max_length, 512]``.
- ``"lstm_outputs1"``: the first LSTM hidden state with shape
``[batch_size, max_length, 1024]``.
- ``"lstm_outputs2"``: the second LSTM hidden state with shape
``[batch_size, max_length, 1024]``.
- ``"elmo"``: the weighted sum of the 3 layers, where the weights are
trainable. This tensor has shape ``[batch_size, max_length, 1024]``.
For extended examples of usage, see the
`Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_elmo.ipynb>`__.
====================== ======================
Input Annotation types Output Annotation type
====================== ======================
``DOCUMENT, TOKEN`` ``WORD_EMBEDDINGS``
====================== ======================
Parameters
----------
batchSize
Batch size. Large values allows faster processing but requires more
memory, by default 32
dimension
Number of embedding dimensions
caseSensitive
Whether to ignore case in tokens for embeddings matching
configProtoBytes
ConfigProto from tensorflow, serialized into byte array.
poolingLayer
Set ELMO pooling layer to: word_emb, lstm_outputs1, lstm_outputs2, or
elmo, by default word_emb
References
----------
https://tfhub.dev/google/elmo/3
`Deep contextualized word representations <https://arxiv.org/abs/1802.05365>`__
**Paper abstract:**
*We introduce a new type of deep contextualized word representation that
models both (1) complex characteristics of word use (e.g., syntax and
semantics), and (2) how these uses vary across linguistic contexts (i.e.,
to model polysemy). Our word vectors are learned functions of the internal
states of a deep bidirectional language model (biLM), which is pre-trained
on a large text corpus. We show that these representations can be easily
added to existing models and significantly improve the state of the art
across six challenging NLP problems, including question answering, textual
entailment and sentiment analysis. We also present an analysis showing that
exposing the deep internals of the pre-trained network is crucial, allowing
downstream models to mix different types of semi-supervision signals.*
Examples
--------
>>> import sparknlp
>>> from sparknlp.base import *
>>> from sparknlp.annotator import *
>>> from pyspark.ml import Pipeline
>>> documentAssembler = DocumentAssembler() \\
... .setInputCol("text") \\
... .setOutputCol("document")
>>> tokenizer = Tokenizer() \\
... .setInputCols(["document"]) \\
... .setOutputCol("token")
>>> embeddings = ElmoEmbeddings.pretrained() \\
... .setPoolingLayer("word_emb") \\
... .setInputCols(["token", "document"]) \\
... .setOutputCol("embeddings")
>>> embeddingsFinisher = EmbeddingsFinisher() \\
... .setInputCols(["embeddings"]) \\
... .setOutputCols("finished_embeddings") \\
... .setOutputAsVector(True) \\
... .setCleanAnnotations(False)
>>> pipeline = Pipeline().setStages([
... documentAssembler,
... tokenizer,
... embeddings,
... embeddingsFinisher
... ])
>>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
>>> result = pipeline.fit(data).transform(data)
>>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+--------------------------------------------------------------------------------+
| result|
+--------------------------------------------------------------------------------+
|[6.662458181381226E-4,-0.2541114091873169,-0.6275503039360046,0.5787073969841...|
|[0.19154725968837738,0.22998669743537903,-0.2894386649131775,0.21524395048618...|
|[0.10400570929050446,0.12288510054349899,-0.07056470215320587,-0.246389418840...|
|[0.49932169914245605,-0.12706467509269714,0.30969417095184326,0.2643227577209...|
|[-0.8871506452560425,-0.20039963722229004,-1.0601330995559692,0.0348707810044...|
+--------------------------------------------------------------------------------+
"""
name = "ElmoEmbeddings"
inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS
batchSize = Param(Params._dummy(),
"batchSize",
"Batch size. Large values allows faster processing but requires more memory.",
typeConverter=TypeConverters.toInt)
configProtoBytes = Param(Params._dummy(),
"configProtoBytes",
"ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
TypeConverters.toListInt)
poolingLayer = Param(Params._dummy(),
"poolingLayer", "Set ELMO pooling layer to: word_emb, lstm_outputs1, lstm_outputs2, or elmo",
typeConverter=TypeConverters.toString)
[docs] def setConfigProtoBytes(self, b):
"""Sets configProto from tensorflow, serialized into byte array.
Parameters
----------
b : List[int]
ConfigProto from tensorflow, serialized into byte array
"""
return self._set(configProtoBytes=b)
[docs] def setBatchSize(self, value):
"""Sets batch size, by default 32.
Parameters
----------
value : int
Batch size
"""
return self._set(batchSize=value)
[docs] def setPoolingLayer(self, layer):
"""Sets ELMO pooling layer to: word_emb, lstm_outputs1, lstm_outputs2, or
elmo, by default word_emb
Parameters
----------
layer : str
ELMO pooling layer
"""
if layer == "word_emb":
return self._set(poolingLayer=layer)
elif layer == "lstm_outputs1":
return self._set(poolingLayer=layer)
elif layer == "lstm_outputs2":
return self._set(poolingLayer=layer)
elif layer == "elmo":
return self._set(poolingLayer=layer)
else:
return self._set(poolingLayer="word_emb")
@keyword_only
def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings", java_model=None):
super(ElmoEmbeddings, self).__init__(
classname=classname,
java_model=java_model
)
self._setDefault(
batchSize=32,
poolingLayer="word_emb"
)
@staticmethod
[docs] def loadSavedModel(folder, spark_session):
"""Loads a locally saved model.
Parameters
----------
folder : str
Folder of the saved model
spark_session : pyspark.sql.SparkSession
The current SparkSession
Returns
-------
ElmoEmbeddings
The restored model
"""
from sparknlp.internal import _ElmoLoader
jModel = _ElmoLoader(folder, spark_session._jsparkSession)._java_obj
return ElmoEmbeddings(java_model=jModel)
@staticmethod
[docs] def pretrained(name="elmo", lang="en", remote_loc=None):
"""Downloads and loads a pretrained model.
Parameters
----------
name : str, optional
Name of the pretrained model, by default "elmo"
lang : str, optional
Language of the pretrained model, by default "en"
remote_loc : str, optional
Optional remote address of the resource, by default None. Will use
Spark NLPs repositories otherwise.
Returns
-------
ElmoEmbeddings
The restored model
"""
from sparknlp.pretrained import ResourceDownloader
return ResourceDownloader.downloadModel(ElmoEmbeddings, name, lang, remote_loc)