# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the Finisher."""
from pyspark import keyword_only
from pyspark.ml.param import TypeConverters, Params, Param
from sparknlp.internal import AnnotatorTransformer
class Finisher(AnnotatorTransformer):
    """Converts annotation results into a format that is easier to use.

    It is useful to extract the results from Spark NLP Pipelines. The Finisher
    outputs annotation(s) values into ``String``.

    For more extended examples on document pre-processing see the
    `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/model-downloader/Create%20custom%20pipeline%20-%20NerDL.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``ANY``                ``NONE``
    ====================== ======================

Parameters
----------
inputCols
Input annotations
outputCols
Output finished annotation cols
valueSplitSymbol
Character separating values, by default #
annotationSplitSymbol
Character separating annotations, by default @
cleanAnnotations
Whether to remove annotation columns, by default True
includeMetadata
Whether to include annotation metadata, by default False
outputAsArray
Finisher generates an Array with the results instead of string, by
default True
parseEmbeddingsVectors
Whether to include embeddings vectors in the process, by default False

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from sparknlp.pretrained import PretrainedPipeline
    >>> data = spark.createDataFrame([[1, "New York and New Jersey aren't that far apart actually."]]).toDF("id", "text")

    Define a pretrained pipeline that extracts Named Entities (amongst other
    things) and apply the ``Finisher`` on it.

    >>> pipeline = PretrainedPipeline("explain_document_dl")
    >>> finisher = Finisher().setInputCols("entities").setOutputCols("output")
    >>> explainResult = pipeline.transform(data)

    Show results.

    >>> explainResult.selectExpr("explode(entities)").show(truncate=False)
+------------------------------------------------------------------------------------------------------------------------------------------------------+
|entities |
+------------------------------------------------------------------------------------------------------------------------------------------------------+
|[[chunk, 0, 7, New York, [entity -> LOC, sentence -> 0, chunk -> 0], []], [chunk, 13, 22, New Jersey, [entity -> LOC, sentence -> 0, chunk -> 1], []]]|
+------------------------------------------------------------------------------------------------------------------------------------------------------+
>>> result = finisher.transform(explainResult)
>>> result.select("output").show(truncate=False)
+----------------------+
|output |
+----------------------+
|[New York, New Jersey]|
+----------------------+
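
    The finished result can also be produced as a single delimited string
    instead of an array. The lines below are only an illustrative sketch (the
    ``stringFinisher`` and ``output_str`` names are not part of the original
    example): with ``outputAsArray`` disabled, the values are joined into one
    string, with annotations separated by ``annotationSplitSymbol`` (``@`` by
    default).

    >>> stringFinisher = Finisher().setInputCols("entities").setOutputCols("output_str").setOutputAsArray(False)
    >>> stringResult = stringFinisher.transform(explainResult)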

    See Also
    --------
    EmbeddingsFinisher : for finishing embeddings
    """
inputCols = Param(Params._dummy(), "inputCols", "input annotations", typeConverter=TypeConverters.toListString)
outputCols = Param(Params._dummy(), "outputCols", "output finished annotation cols", typeConverter=TypeConverters.toListString)
    valueSplitSymbol = Param(Params._dummy(), "valueSplitSymbol", "character separating values", typeConverter=TypeConverters.toString)
annotationSplitSymbol = Param(Params._dummy(), "annotationSplitSymbol", "character separating annotations", typeConverter=TypeConverters.toString)
cleanAnnotations = Param(Params._dummy(), "cleanAnnotations", "whether to remove annotation columns", typeConverter=TypeConverters.toBoolean)
    includeMetadata = Param(Params._dummy(), "includeMetadata", "whether to include annotation metadata", typeConverter=TypeConverters.toBoolean)
outputAsArray = Param(Params._dummy(), "outputAsArray", "finisher generates an Array with the results instead of string", typeConverter=TypeConverters.toBoolean)
parseEmbeddingsVectors = Param(Params._dummy(), "parseEmbeddingsVectors", "whether to include embeddings vectors in the process", typeConverter=TypeConverters.toBoolean)
name = "Finisher"
@keyword_only
def __init__(self):
super(Finisher, self).__init__(classname="com.johnsnowlabs.nlp.Finisher")
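        # Defaults below mirror the parameter defaults documented in the class docstring.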
self._setDefault(
cleanAnnotations=True,
includeMetadata=False,
outputAsArray=True,
parseEmbeddingsVectors=False,
valueSplitSymbol="#",
annotationSplitSymbol="@",
outputCols=[]
)
@keyword_only
def setParams(self):
kwargs = self._input_kwargs
return self._set(**kwargs)
    def setOutputCols(self, *value):
        """Sets column names of finished output annotations.

        Parameters
        ----------
        *value : List[str]
            List of output columns
        """
        if len(value) == 1 and isinstance(value[0], list):
return self._set(outputCols=value[0])
else:
return self._set(outputCols=list(value))
    def setValueSplitSymbol(self, value):
        """Sets character separating values, by default #.

        Parameters
        ----------
        value : str
            Character to separate values
        """
return self._set(valueSplitSymbol=value)
    def setAnnotationSplitSymbol(self, value):
        """Sets character separating annotations, by default @.

        Parameters
        ----------
        value : str
            Character to separate annotations
        """
return self._set(annotationSplitSymbol=value)
    def setCleanAnnotations(self, value):
        """Sets whether to remove annotation columns, by default True.

        Parameters
        ----------
        value : bool
            Whether to remove annotation columns
        """
return self._set(cleanAnnotations=value)
    def setOutputAsArray(self, value):
        """Sets whether to generate an array with the results instead of a
        string.

        Parameters
        ----------
        value : bool
            Whether to generate an array with the results instead of a string
        """
return self._set(outputAsArray=value)
    def setParseEmbeddingsVectors(self, value):
        """Sets whether to include embeddings vectors in the process.

        Parameters
        ----------
        value : bool
            Whether to include embeddings vectors in the process
        """
return self._set(parseEmbeddingsVectors=value)
    def getOutputCols(self):
        """Gets the output column names of the finished annotations."""
if len(self.getOrDefault(self.outputCols)) == 0:
return ["finished_" + input_col for input_col in self.getInputCols()]
else:
return self.getOrDefault(self.outputCols)