Source code for sparknlp.annotator.n_gram_generator

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the NGramGenerator."""
from sparknlp.common import *


class NGramGenerator(AnnotatorModel):
    """A feature transformer that converts the input array of strings
    (annotatorType ``TOKEN``) into an array of n-grams (annotatorType
    ``CHUNK``).

    Null values in the input array are ignored. It returns an array of n-grams
    where each n-gram is represented by a space-separated string of words.

    When the input is empty, an empty array is returned. When the input array
    length is less than n (number of elements per n-gram), no n-grams are
    returned.

    For more extended examples see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/NgramGenerator.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    n
        Number of elements per n-gram (>=1), by default 2
    enableCumulative
        Whether to calculate all n-grams from 1 through n instead of just the
        n-grams of size n, by default False
    delimiter
        Character to use to join the tokens, by default " "

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentence = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["sentence"]) \\
    ...     .setOutputCol("token")
    >>> nGrams = NGramGenerator() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("ngrams") \\
    ...     .setN(2)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     sentence,
    ...     tokenizer,
    ...     nGrams
    ... ])
    >>> data = spark.createDataFrame([["This is my sentence."]]).toDF("text")
    >>> results = pipeline.fit(data).transform(data)
    >>> results.selectExpr("explode(ngrams) as result").show(truncate=False)
    +------------------------------------------------------------+
    |result                                                      |
    +------------------------------------------------------------+
    |[chunk, 0, 6, This is, [sentence -> 0, chunk -> 0], []]     |
    |[chunk, 5, 9, is my, [sentence -> 0, chunk -> 1], []]       |
    |[chunk, 8, 18, my sentence, [sentence -> 0, chunk -> 2], []]|
    |[chunk, 11, 19, sentence ., [sentence -> 0, chunk -> 3], []]|
    +------------------------------------------------------------+
    """

    name = "NGramGenerator"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.CHUNK

    @keyword_only
    def __init__(self):
        super(NGramGenerator, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.NGramGenerator")
        self._setDefault(
            n=2,
            enableCumulative=False
        )

    n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)",
              typeConverter=TypeConverters.toInt)

    enableCumulative = Param(Params._dummy(), "enableCumulative",
                             "whether to calculate just the actual n-grams "
                             "or all n-grams from 1 through n",
                             typeConverter=TypeConverters.toBoolean)

    delimiter = Param(Params._dummy(), "delimiter",
                      "character to use to join the tokens",
                      typeConverter=TypeConverters.toString)

    def setN(self, value):
        """Sets the number of elements per n-gram (>=1), by default 2.

        Parameters
        ----------
        value : int
            Number of elements per n-gram (>=1)
        """
        return self._set(n=value)

    def setEnableCumulative(self, value):
        """Sets whether to calculate all n-grams from 1 through n instead of
        just the n-grams of size n, by default False.

        Parameters
        ----------
        value : bool
            Whether to calculate all n-grams from 1 through n
        """
        return self._set(enableCumulative=value)

    def setDelimiter(self, value):
        """Sets the character used to join the tokens.

        Parameters
        ----------
        value : str
            Character used to join the tokens, must have length 1

        Raises
        ------
        Exception
            If the delimiter is longer than one character
        """
        if len(value) > 1:
            raise Exception("Delimiter should have length == 1")
        return self._set(delimiter=value)
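

# A minimal usage sketch, not part of the library source: it combines the
# ``enableCumulative`` and ``delimiter`` options, which the class docstring
# example does not exercise. It assumes Spark NLP and PySpark are installed
# and that ``sparknlp.start()`` can create a local SparkSession.
if __name__ == "__main__":
    import sparknlp
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import Tokenizer, NGramGenerator
    from pyspark.ml import Pipeline

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")
    # With enableCumulative=True, both unigrams and bigrams are produced;
    # tokens within each n-gram are joined with "_" instead of a space.
    nGrams = NGramGenerator() \
        .setInputCols(["token"]) \
        .setOutputCol("ngrams") \
        .setN(2) \
        .setEnableCumulative(True) \
        .setDelimiter("_")

    pipeline = Pipeline().setStages([documentAssembler, tokenizer, nGrams])
    data = spark.createDataFrame([["This is my sentence."]]).toDF("text")
    pipeline.fit(data).transform(data) \
        .selectExpr("explode(ngrams) as result") \
        .show(truncate=False)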