
#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for Chunk2Doc."""

from pyspark import keyword_only

from sparknlp.common import AnnotatorProperties
from sparknlp.common.annotator_type import AnnotatorType
from sparknlp.internal import AnnotatorTransformer


class Chunk2Doc(AnnotatorTransformer, AnnotatorProperties):
    """Converts a ``CHUNK`` type column back into ``DOCUMENT``.

    Useful when trying to re-tokenize or do further analysis on a ``CHUNK``
    result.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``CHUNK``              ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    None

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.pretrained import PretrainedPipeline

    Location entities are extracted and converted back into ``DOCUMENT`` type
    for further processing.

    >>> data = spark.createDataFrame([[1, "New York and New Jersey aren't that far apart actually."]]).toDF("id", "text")

    Define a pretrained pipeline that extracts Named Entities, among other
    things, and apply ``Chunk2Doc`` to it.

    >>> pipeline = PretrainedPipeline("explain_document_dl")
    >>> chunkToDoc = Chunk2Doc().setInputCols("entities").setOutputCol("chunkConverted")
    >>> explainResult = pipeline.transform(data)

    Show results.

    >>> result = chunkToDoc.transform(explainResult)
    >>> result.selectExpr("explode(chunkConverted)").show(truncate=False)
    +------------------------------------------------------------------------------+
    |col                                                                           |
    +------------------------------------------------------------------------------+
    |[document, 0, 7, New York, [entity -> LOC, sentence -> 0, chunk -> 0], []]    |
    |[document, 13, 22, New Jersey, [entity -> LOC, sentence -> 0, chunk -> 1], []]|
    +------------------------------------------------------------------------------+

    See Also
    --------
    Doc2Chunk : for converting ``DOCUMENT`` annotations to ``CHUNK``
    """

    name = "Chunk2Doc"

    inputAnnotatorTypes = [AnnotatorType.CHUNK]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    @keyword_only
    def __init__(self):
        super(Chunk2Doc, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Chunk2Doc")

    @keyword_only
    def setParams(self):
        kwargs = self._input_kwargs
        return self._set(**kwargs)
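
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the library source): assuming Spark NLP is
# installed, a Spark session can be started with sparknlp.start(), and the
# pretrained "explain_document_dl" pipeline can be downloaded, this shows how
# Chunk2Doc turns extracted entity chunks back into DOCUMENT annotations so
# they can be re-tokenized downstream. Column names such as "chunkConverted"
# and "chunkTokens" are illustrative choices, not fixed by the API.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sparknlp
    from sparknlp.annotator import Tokenizer
    from sparknlp.pretrained import PretrainedPipeline

    spark = sparknlp.start()

    data = spark.createDataFrame(
        [[1, "New York and New Jersey aren't that far apart actually."]]
    ).toDF("id", "text")

    # Extract named entities (the pipeline's "entities" column, of type CHUNK)
    # and convert them back into DOCUMENT annotations.
    pipeline = PretrainedPipeline("explain_document_dl")
    chunk_to_doc = Chunk2Doc().setInputCols("entities").setOutputCol("chunkConverted")
    converted = chunk_to_doc.transform(pipeline.transform(data))

    # The DOCUMENT output can now feed further stages, e.g. re-tokenization of
    # the extracted entities only.
    tokenizer = Tokenizer().setInputCols(["chunkConverted"]).setOutputCol("chunkTokens")
    tokenizer.fit(converted).transform(converted) \
        .selectExpr("explode(chunkTokens) as token").show(truncate=False)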