# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for SentenceEmbeddings."""
from sparknlp.common import *


class SentenceEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasStorageRef):
"""Converts the results from WordEmbeddings, BertEmbeddings, or other word
embeddings into sentence or document embeddings by either summing up or
averaging all the word embeddings in a sentence or a document (depending on
the inputCols).

This can be configured with :meth:`.setPoolingStrategy`, which can be either
``"AVERAGE"`` or ``"SUM"``.
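
Both strategies reduce the token vectors element-wise. As a plain Python
sketch of the idea (illustrative only, not this annotator's implementation):

>>> token_vectors = [[1.0, 2.0], [3.0, 4.0]]
>>> [sum(dims) for dims in zip(*token_vectors)]  # SUM pooling
[4.0, 6.0]
>>> [sum(dims) / len(token_vectors) for dims in zip(*token_vectors)]  # AVERAGE pooling
[2.0, 3.0]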

For more extended examples see the `Examples
<https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/text-similarity/Spark_NLP_Spark_ML_Text_Similarity.ipynb>`__.

============================= =======================
Input Annotation types Output Annotation type
============================= =======================
``DOCUMENT, WORD_EMBEDDINGS`` ``SENTENCE_EMBEDDINGS``
============================= =======================

Parameters
----------
dimension
Number of embedding dimensions
poolingStrategy
Choose how you would like to aggregate Word Embeddings to Sentence
Embeddings: AVERAGE or SUM, by default AVERAGE

Notes
-----
If you choose ``document`` as the input for Tokenizer,
WordEmbeddings/BertEmbeddings, and SentenceEmbeddings, then it averages/sums
all the embeddings into one array of embeddings. However, if you choose
``sentence`` as the inputCols, then SentenceEmbeddings generates one array of
embeddings per sentence.
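
For example, a sentence-level setup could look like the following sketch
(assuming the default pretrained WordEmbeddingsModel and a SentenceDetector
stage; illustrative only):

>>> sentenceDetector = SentenceDetector() \\
... .setInputCols(["document"]) \\
... .setOutputCol("sentence")
>>> tokenizer = Tokenizer() \\
... .setInputCols(["sentence"]) \\
... .setOutputCol("token")
>>> embeddings = WordEmbeddingsModel.pretrained() \\
... .setInputCols(["sentence", "token"]) \\
... .setOutputCol("embeddings")
>>> embeddingsSentence = SentenceEmbeddings() \\
... .setInputCols(["sentence", "embeddings"]) \\
... .setOutputCol("sentence_embeddings")
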
Examples
--------
>>> import sparknlp
>>> from sparknlp.base import *
>>> from sparknlp.annotator import *
>>> from pyspark.ml import Pipeline
>>> documentAssembler = DocumentAssembler() \\
... .setInputCol("text") \\
... .setOutputCol("document")
>>> tokenizer = Tokenizer() \\
... .setInputCols(["document"]) \\
... .setOutputCol("token")
>>> embeddings = WordEmbeddingsModel.pretrained() \\
... .setInputCols(["document", "token"]) \\
... .setOutputCol("embeddings")
>>> embeddingsSentence = SentenceEmbeddings() \\
... .setInputCols(["document", "embeddings"]) \\
... .setOutputCol("sentence_embeddings") \\
... .setPoolingStrategy("AVERAGE")
>>> embeddingsFinisher = EmbeddingsFinisher() \\
... .setInputCols(["sentence_embeddings"]) \\
... .setOutputCols("finished_embeddings") \\
... .setOutputAsVector(True) \\
... .setCleanAnnotations(False)
>>> pipeline = Pipeline() \\
... .setStages([
... documentAssembler,
... tokenizer,
... embeddings,
... embeddingsSentence,
... embeddingsFinisher
... ])
>>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
>>> result = pipeline.fit(data).transform(data)
>>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+--------------------------------------------------------------------------------+
| result|
+--------------------------------------------------------------------------------+
|[-0.22093398869037628,0.25130119919776917,0.41810303926467896,-0.380883991718...|
+--------------------------------------------------------------------------------+
"""
name = "SentenceEmbeddings"
inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.WORD_EMBEDDINGS]
outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
@keyword_only
def __init__(self):
super(SentenceEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.SentenceEmbeddings")
self._setDefault(
poolingStrategy="AVERAGE"
)
poolingStrategy = Param(Params._dummy(),
"poolingStrategy",
"Choose how you would like to aggregate Word Embeddings to Sentence Embeddings: AVERAGE or SUM",
typeConverter=TypeConverters.toString)
def setPoolingStrategy(self, strategy):
"""Sets how to aggregate the word embeddings into sentence embeddings, by
default AVERAGE.

Can either be AVERAGE or SUM. Any other value falls back to AVERAGE.

Parameters
----------
strategy : str
Pooling strategy, either AVERAGE or SUM

Returns
-------
SentenceEmbeddings
This instance, to allow chained setter calls
"""
# Unrecognized values fall back to the default AVERAGE strategy.
if strategy in ("AVERAGE", "SUM"):
return self._set(poolingStrategy=strategy)
else:
return self._set(poolingStrategy="AVERAGE")