Source code for sparknlp.annotator.embeddings.chunk_embeddings

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for ChunkEmbeddings"""

from sparknlp.common import *


class ChunkEmbeddings(AnnotatorModel):
    """This annotator utilizes WordEmbeddings, BertEmbeddings, etc. to generate
    chunk embeddings from either Chunker, NGramGenerator, or NerConverter
    outputs.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/embeddings/ChunkEmbeddings.ipynb>`__.

    ========================== ======================
    Input Annotation types     Output Annotation type
    ========================== ======================
    ``CHUNK, WORD_EMBEDDINGS`` ``WORD_EMBEDDINGS``
    ========================== ======================

    Parameters
    ----------
    poolingStrategy
        Choose how you would like to aggregate Word Embeddings to Chunk
        Embeddings, by default AVERAGE.
        Possible Values: ``AVERAGE, SUM``
    skipOOV
        Whether to discard default vectors for OOV words from the
        aggregation/pooling.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline

    Extract the Embeddings from the NGrams

    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentence = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["sentence"]) \\
    ...     .setOutputCol("token")
    >>> nGrams = NGramGenerator() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("chunk") \\
    ...     .setN(2)
    >>> embeddings = WordEmbeddingsModel.pretrained() \\
    ...     .setInputCols(["sentence", "token"]) \\
    ...     .setOutputCol("embeddings") \\
    ...     .setCaseSensitive(False)

    Convert the NGram chunks into Word Embeddings

    >>> chunkEmbeddings = ChunkEmbeddings() \\
    ...     .setInputCols(["chunk", "embeddings"]) \\
    ...     .setOutputCol("chunk_embeddings") \\
    ...     .setPoolingStrategy("AVERAGE")
    >>> pipeline = Pipeline() \\
    ...     .setStages([
    ...         documentAssembler,
    ...         sentence,
    ...         tokenizer,
    ...         nGrams,
    ...         embeddings,
    ...         chunkEmbeddings
    ...     ])
    >>> data = spark.createDataFrame([["This is a sentence."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(chunk_embeddings) as result") \\
    ...     .select("result.annotatorType", "result.result", "result.embeddings") \\
    ...     .show(5, 80)
    +---------------+----------+--------------------------------------------------------------------------------+
    |  annotatorType|    result|                                                                      embeddings|
    +---------------+----------+--------------------------------------------------------------------------------+
    |word_embeddings|   This is|[-0.55661, 0.42829502, 0.86661, -0.409785, 0.06316501, 0.120775, -0.0732005, ...|
    |word_embeddings|      is a|[-0.40674996, 0.22938299, 0.50597, -0.288195, 0.555655, 0.465145, 0.140118, 0...|
    |word_embeddings|a sentence|[0.17417, 0.095253006, -0.0530925, -0.218465, 0.714395, 0.79860497, 0.0129999...|
    |word_embeddings|sentence .|[0.139705, 0.177955, 0.1887775, -0.45545, 0.20030999, 0.461557, -0.07891501, ...|
    +---------------+----------+--------------------------------------------------------------------------------+
    """

    name = "ChunkEmbeddings"

    inputAnnotatorTypes = [AnnotatorType.CHUNK, AnnotatorType.WORD_EMBEDDINGS]

    outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS

    @keyword_only
    def __init__(self):
        super(ChunkEmbeddings, self).__init__(
            classname="com.johnsnowlabs.nlp.embeddings.ChunkEmbeddings")
        self._setDefault(
            poolingStrategy="AVERAGE"
        )

    poolingStrategy = Param(Params._dummy(),
                            "poolingStrategy",
                            "Choose how you would like to aggregate Word Embeddings to Chunk Embeddings: " +
                            "AVERAGE or SUM",
                            typeConverter=TypeConverters.toString)

    skipOOV = Param(Params._dummy(), "skipOOV",
                    "Whether to discard default vectors for OOV words from the aggregation/pooling",
                    typeConverter=TypeConverters.toBoolean)
    def setPoolingStrategy(self, strategy):
        """Sets how to aggregate Word Embeddings to Chunk Embeddings, by
        default AVERAGE.

        Possible Values: ``AVERAGE, SUM``. Any other value falls back to
        ``AVERAGE``.

        Parameters
        ----------
        strategy : str
            Aggregation Strategy
        """
        # Only the two supported strategies are accepted; anything else
        # silently falls back to the default AVERAGE.
        if strategy in ("AVERAGE", "SUM"):
            return self._set(poolingStrategy=strategy)
        else:
            return self._set(poolingStrategy="AVERAGE")
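    # A minimal sketch of what the two strategies compute for a chunk covering
    # token vectors v_1 .. v_n (illustrative only, not part of the API):
    #
    #   AVERAGE: chunk_vector = (v_1 + ... + v_n) / n
    #   SUM:     chunk_vector =  v_1 + ... + v_n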
    def setSkipOOV(self, value):
        """Sets whether to discard default vectors for OOV words from the
        aggregation/pooling.

        Parameters
        ----------
        value : bool
            Whether to discard default vectors for OOV words from the
            aggregation/pooling.
        """
        return self._set(skipOOV=value)
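# A minimal usage sketch combining both setters (assumes a pipeline like the
# one in the class docstring above; the column names are illustrative):
#
#   chunkEmbeddings = ChunkEmbeddings() \
#       .setInputCols(["chunk", "embeddings"]) \
#       .setOutputCol("chunk_embeddings") \
#       .setPoolingStrategy("SUM") \
#       .setSkipOOV(True)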