# Source code for sparknlp.base.multi_document_assembler

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
from pyspark import keyword_only
from pyspark.ml.param import TypeConverters, Params, Param

from sparknlp.common import AnnotatorType
from sparknlp.internal import AnnotatorTransformer


class MultiDocumentAssembler(AnnotatorTransformer):
    """Prepares data into a format that is processable by Spark NLP.

    This is the entry point for every Spark NLP pipeline. The
    ``MultiDocumentAssembler`` can read either a ``String`` column or an
    ``Array[String]``. Additionally, :meth:`.setCleanupMode` can be used to
    pre-process the text (Default: ``disabled``). For possible options please
    refer to the parameters section.

    For more extended examples on document pre-processing see the
    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-assembler/Loading_Multiple_Documents.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``NONE``               ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    inputCols: str or List[str]
        Input column name.
    outputCols: str or List[str]
        Output column name.
    idCol: str
        Name of String type column for row id.
    metadataCol: str
        Name of Map type column with metadata information
    cleanupMode: str
        How to cleanup the document, by default disabled.
        Possible values: ``disabled, inplace, inplace_full, shrink,
        shrink_full, each, each_full, delete_full``

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from pyspark.ml import Pipeline
    >>> data = spark.createDataFrame([[
    ...     "Spark NLP is an open-source text processing library.",
    ...     "Spark NLP is a state-of-the-art Natural Language Processing library built on top of Apache Spark"
    ... ]]).toDF("text", "text2")
    >>> documentAssembler = MultiDocumentAssembler() \\
    ...     .setInputCols(["text", "text2"]) \\
    ...     .setOutputCols(["document1", "document2"])
    >>> result = documentAssembler.transform(data)
    >>> result.select("document1").show(truncate=False)
    +----------------------------------------------------------------------------------------------+
    |document1                                                                                     |
    +----------------------------------------------------------------------------------------------+
    |[[document, 0, 51, Spark NLP is an open-source text processing library., [sentence -> 0], []]]|
    +----------------------------------------------------------------------------------------------+
    >>> result.select("document1").printSchema()
    root
     |-- document1: array (nullable = True)
     |    |-- element: struct (containsNull = True)
     |    |    |-- annotatorType: string (nullable = True)
     |    |    |-- begin: integer (nullable = False)
     |    |    |-- end: integer (nullable = False)
     |    |    |-- result: string (nullable = True)
     |    |    |-- metadata: map (nullable = True)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = True)
     |    |    |-- embeddings: array (nullable = True)
     |    |    |    |-- element: float (containsNull = False)
    """

    # Canonical set of cleanup modes accepted by setCleanupMode.
    _CLEANUP_MODES = (
        'disabled', 'inplace', 'inplace_full', 'shrink', 'shrink_full',
        'each', 'each_full', 'delete_full',
    )

    outputAnnotatorType = AnnotatorType.DOCUMENT

    inputCols = Param(Params._dummy(), "inputCols", "input annotations",
                      typeConverter=TypeConverters.toListString)
    outputCols = Param(Params._dummy(), "outputCols",
                       "output finished annotation cols",
                       typeConverter=TypeConverters.toListString)
    idCol = Param(Params._dummy(), "idCol",
                  "column for setting an id to such string in row",
                  typeConverter=TypeConverters.toString)
    metadataCol = Param(Params._dummy(), "metadataCol",
                        "String to String map column to use as metadata",
                        typeConverter=TypeConverters.toString)
    cleanupMode = Param(Params._dummy(), "cleanupMode",
                        "possible values: disabled, inplace, inplace_full, "
                        "shrink, shrink_full, each, each_full, delete_full",
                        typeConverter=TypeConverters.toString)

    name = 'MultiDocumentAssembler'

    @keyword_only
    def __init__(self):
        # Binds this transformer to its JVM counterpart.
        super(MultiDocumentAssembler, self).__init__(
            classname="com.johnsnowlabs.nlp.MultiDocumentAssembler")
        self._setDefault(cleanupMode='disabled')

    @keyword_only
    def setParams(self):
        """Sets the given parameters (captured by ``@keyword_only``)."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setInputCols(self, *value):
        """Sets column names of input annotations.

        Parameters
        ----------
        *value : List[str]
            Input columns for the annotator
        """
        # Accept either a single list argument or varargs of column names.
        if len(value) == 1 and isinstance(value[0], list):
            return self._set(inputCols=value[0])
        return self._set(inputCols=list(value))

    def setOutputCols(self, *value):
        """Sets column names of output annotations.

        Parameters
        ----------
        *value : List[str]
            List of output columns
        """
        # Accept either a single list argument or varargs of column names.
        if len(value) == 1 and isinstance(value[0], list):
            return self._set(outputCols=value[0])
        return self._set(outputCols=list(value))

    def setIdCol(self, value):
        """Sets name of string type column for row id.

        Parameters
        ----------
        value : str
            Name of the Id Column
        """
        return self._set(idCol=value)

    def setMetadataCol(self, value):
        """Sets name for Map type column with metadata information.

        Parameters
        ----------
        value : str
            Name of the metadata column
        """
        return self._set(metadataCol=value)

    def setCleanupMode(self, value):
        """Sets how to cleanup the document, by default disabled.
        Possible values: ``disabled, inplace, inplace_full, shrink,
        shrink_full, each, each_full, delete_full``

        Parameters
        ----------
        value : str
            Cleanup mode

        Raises
        ------
        ValueError
            If ``value`` is not one of the supported cleanup modes.
        """
        normalized = value.strip().lower()
        if normalized not in self._CLEANUP_MODES:
            raise ValueError(
                "Cleanup mode possible values: disabled, inplace, "
                "inplace_full, shrink, shrink_full, each, each_full, "
                "delete_full")
        # Store the normalized form so the value that was actually validated
        # (not the raw, possibly mixed-case/padded input) reaches the backend.
        return self._set(cleanupMode=normalized)

    def getOutputCols(self):
        """Gets output columns name of annotations."""
        return self.getOrDefault(self.outputCols)