# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the DocumentAssembler."""
from pyspark import keyword_only
from pyspark.ml.param import TypeConverters, Params, Param
from sparknlp.common import AnnotatorType
from sparknlp.internal import AnnotatorTransformer


class DocumentAssembler(AnnotatorTransformer):
"""Prepares data into a format that is processable by Spark NLP.
This is the entry point for every Spark NLP pipeline. The
`DocumentAssembler` reads ``String`` columns. Additionally,
:meth:`.setCleanupMode` can be used to pre-process the
text (Default: ``disabled``). For possible options please refer the
parameters section.
For more extended examples on document pre-processing see the
`Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-assembler/Loading_Documents_With_DocumentAssembler.ipynb>`__.
====================== ======================
Input Annotation types Output Annotation type
====================== ======================
``NONE`` ``DOCUMENT``
====================== ======================
Parameters
----------
inputCol
Input column name
outputCol
Output column name
idCol
Name of String type column for row id.
metadataCol
Name of Map type column with metadata information
cleanupMode
How to cleanup the document , by default disabled.
Possible values: ``disabled, inplace, inplace_full, shrink, shrink_full,
each, each_full, delete_full``

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from pyspark.ml import Pipeline
    >>> data = spark.createDataFrame([["Spark NLP is an open-source text processing library."]]).toDF("text")
    >>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    >>> result = documentAssembler.transform(data)
    >>> result.select("document").show(truncate=False)
    +----------------------------------------------------------------------------------------------+
    |document                                                                                      |
    +----------------------------------------------------------------------------------------------+
    |[[document, 0, 51, Spark NLP is an open-source text processing library., [sentence -> 0], []]]|
    +----------------------------------------------------------------------------------------------+
    >>> result.select("document").printSchema()
    root
     |-- document: array (nullable = true)
     |    |-- element: struct (containsNull = true)
     |    |    |-- annotatorType: string (nullable = true)
     |    |    |-- begin: integer (nullable = false)
     |    |    |-- end: integer (nullable = false)
     |    |    |-- result: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
     |    |    |-- embeddings: array (nullable = true)
     |    |    |    |-- element: float (containsNull = false)
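
    The ``cleanupMode`` parameter enables optional pre-processing of the raw
    text. A minimal sketch of the ``shrink`` mode, which merges repeated
    whitespace into single spaces (the shown output is illustrative):

    >>> messyData = spark.createDataFrame([["Spark   NLP  is   great."]]).toDF("text")
    >>> cleaningAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document").setCleanupMode("shrink")
    >>> cleaningAssembler.transform(messyData).selectExpr("document[0].result as cleaned").show(truncate=False)
    +-------------------+
    |cleaned            |
    +-------------------+
    |Spark NLP is great.|
    +-------------------+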
"""

    outputAnnotatorType = AnnotatorType.DOCUMENT

    inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString)
    outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString)
    idCol = Param(Params._dummy(), "idCol", "column for setting an id to such string in row", typeConverter=TypeConverters.toString)
    metadataCol = Param(Params._dummy(), "metadataCol", "String to String map column to use as metadata", typeConverter=TypeConverters.toString)
    cleanupMode = Param(Params._dummy(), "cleanupMode", "possible values: disabled, inplace, inplace_full, shrink, shrink_full, each, each_full, delete_full", typeConverter=TypeConverters.toString)

    name = 'DocumentAssembler'

    @keyword_only
    def __init__(self):
        super(DocumentAssembler, self).__init__(classname="com.johnsnowlabs.nlp.DocumentAssembler")
        self._setDefault(outputCol="document", cleanupMode='disabled')

    @keyword_only
    def setParams(self):
        kwargs = self._input_kwargs
        return self._set(**kwargs)
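
    # The ``inputCol`` and ``metadataCol`` params are declared above but had
    # no setters in this file, and the class docstring example relies on
    # ``setInputCol``. Minimal setters, mirroring the other setters of this
    # class, are sketched here.
    def setInputCol(self, value):
        """Sets input column name.

        Parameters
        ----------
        value : str
            Name of the Input Column
        """
        return self._set(inputCol=value)

    def setMetadataCol(self, value):
        """Sets name of Map type column with metadata information.

        Parameters
        ----------
        value : str
            Name of the metadata column
        """
        return self._set(metadataCol=value)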

    def setOutputCol(self, value):
        """Sets output column name.

        Parameters
        ----------
        value : str
            Name of the Output Column
        """
        return self._set(outputCol=value)

    def setIdCol(self, value):
        """Sets name of String type column for row id.

        Parameters
        ----------
        value : str
            Name of the Id Column
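
        Examples
        --------
        A minimal usage sketch, assuming the input DataFrame also carries a
        string ``doc_id`` column:

        >>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document").setIdCol("doc_id")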
"""
return self._set(idCol=value)

    def setCleanupMode(self, value):
        """Sets how to clean up the document, by default ``disabled``.

        Possible values: ``disabled, inplace, inplace_full, shrink,
        shrink_full, each, each_full, delete_full``

        Parameters
        ----------
        value : str
            Cleanup mode
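
        Examples
        --------
        For instance, ``shrink`` collapses new lines, tabs, and repeated
        spaces into single spaces:

        >>> documentAssembler = DocumentAssembler().setCleanupMode("shrink")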
"""
if value.strip().lower() not in ['disabled', 'inplace', 'inplace_full', 'shrink', 'shrink_full', 'each', 'each_full', 'delete_full']:
raise Exception("Cleanup mode possible values: disabled, inplace, inplace_full, shrink, shrink_full, each, each_full, delete_full")
return self._set(cleanupMode=value)

    def getOutputCol(self):
        """Gets output column name of annotations."""
        return self.getOrDefault(self.outputCol)

    # def getInputCol(self):
    #     """Gets current column name of input annotations."""
    #     return self.getOrDefault(self.inputCol)