# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the TokenAssembler."""
from pyspark import keyword_only
from pyspark.ml.param import TypeConverters, Params, Param
from sparknlp.common.annotator_type import AnnotatorType
from sparknlp.internal import AnnotatorTransformer
from sparknlp.common import AnnotatorProperties


class TokenAssembler(AnnotatorTransformer, AnnotatorProperties):
    """This transformer reconstructs a ``DOCUMENT`` type annotation from tokens,
    usually after these have been normalized, lemmatized, spell checked, etc.,
    so that the document annotation can be used in further annotators.
    Requires ``DOCUMENT`` and ``TOKEN`` type annotations as input.

    For more extended examples on document pre-processing see the
    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/token-assembler/Assembling_Tokens_to_Documents.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, TOKEN``    ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    preservePosition
        Whether to preserve the actual position of the tokens or reduce them
        to one space

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline

    First, the text is tokenized and cleaned:

    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentenceDetector = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentences")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["sentences"]) \\
    ...     .setOutputCol("token")
    >>> normalizer = Normalizer() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("normalized") \\
    ...     .setLowercase(False)
    >>> stopwordsCleaner = StopWordsCleaner() \\
    ...     .setInputCols(["normalized"]) \\
    ...     .setOutputCol("cleanTokens") \\
    ...     .setCaseSensitive(False)

    Then the TokenAssembler turns the cleaned tokens into a ``DOCUMENT`` type
    structure:

    >>> tokenAssembler = TokenAssembler() \\
    ...     .setInputCols(["sentences", "cleanTokens"]) \\
    ...     .setOutputCol("cleanText")
    >>> data = spark.createDataFrame([["Spark NLP is an open-source text processing library for advanced natural language processing."]]) \\
    ...     .toDF("text")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     sentenceDetector,
    ...     tokenizer,
    ...     normalizer,
    ...     stopwordsCleaner,
    ...     tokenAssembler
    ... ]).fit(data)
    >>> result = pipeline.transform(data)
    >>> result.select("cleanText").show(truncate=False)
    +---------------------------------------------------------------------------------------------------------------------------+
    |cleanText                                                                                                                   |
    +---------------------------------------------------------------------------------------------------------------------------+
    |[[document, 0, 80, Spark NLP opensource text processing library advanced natural language processing, [sentence -> 0], []]]|
    +---------------------------------------------------------------------------------------------------------------------------+
    """
name = "TokenAssembler"
inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
outputAnnotatorType = AnnotatorType.DOCUMENT
preservePosition = Param(Params._dummy(), "preservePosition", "whether to preserve the actual position of the tokens or reduce them to one space", typeConverter=TypeConverters.toBoolean)
@keyword_only
def __init__(self):
super(TokenAssembler, self).__init__(classname="com.johnsnowlabs.nlp.TokenAssembler")
@keyword_only
def setParams(self):
kwargs = self._input_kwargs
return self._set(**kwargs)

    def setPreservePosition(self, value):
        """Sets whether to preserve the actual position of the tokens or reduce
        them to one space.

        Parameters
        ----------
        value : bool
            Whether to preserve the actual position of the tokens or reduce
            them to one space
        """
        return self._set(preservePosition=value)
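

# A minimal usage sketch (illustrative, not part of the library source):
# contrasts the two settings of ``preservePosition``. With ``True`` the
# assembled document should keep the original character positions of the
# remaining tokens (so gaps left by removed tokens are preserved); with
# ``False`` the tokens are reduced to single spaces between them. The
# pipeline mirrors the docstring example above; ``sparknlp.start()`` is
# assumed to be available to create the SparkSession.
if __name__ == "__main__":
    import sparknlp
    from pyspark.ml import Pipeline
    from sparknlp.annotator import SentenceDetector, StopWordsCleaner, Tokenizer
    from sparknlp.base import DocumentAssembler

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    sentenceDetector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentences")
    tokenizer = Tokenizer() \
        .setInputCols(["sentences"]) \
        .setOutputCol("token")
    stopwordsCleaner = StopWordsCleaner() \
        .setInputCols(["token"]) \
        .setOutputCol("cleanTokens") \
        .setCaseSensitive(False)

    data = spark.createDataFrame(
        [["Spark NLP is an open-source text processing library."]]
    ).toDF("text")

    # Build one pipeline per setting and compare the assembled documents.
    for preserve in (True, False):
        tokenAssembler = TokenAssembler() \
            .setInputCols(["sentences", "cleanTokens"]) \
            .setOutputCol("cleanText") \
            .setPreservePosition(preserve)
        model = Pipeline().setStages([
            documentAssembler,
            sentenceDetector,
            tokenizer,
            stopwordsCleaner,
            tokenAssembler
        ]).fit(data)
        print("preservePosition =", preserve)
        model.transform(data).selectExpr("cleanText.result").show(truncate=False)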