
#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for Chunk2Doc."""

from pyspark import keyword_only

from sparknlp.common import AnnotatorProperties
from sparknlp.common.annotator_type import AnnotatorType
from sparknlp.internal import AnnotatorTransformer


class Chunk2Doc(AnnotatorTransformer, AnnotatorProperties):
    """Converts a ``CHUNK`` type column back into ``DOCUMENT``.

    Useful when trying to re-tokenize or do further analysis on a ``CHUNK``
    result.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``CHUNK``              ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    None

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.pretrained import PretrainedPipeline

    Location entities are extracted and converted back into ``DOCUMENT`` type
    for further processing.

    >>> data = spark.createDataFrame([[1, "New York and New Jersey aren't that far apart actually."]]).toDF("id", "text")

    Define a pretrained pipeline that extracts Named Entities, among other
    things, and apply ``Chunk2Doc`` to it.

    >>> pipeline = PretrainedPipeline("explain_document_dl")
    >>> chunkToDoc = Chunk2Doc().setInputCols("entities").setOutputCol("chunkConverted")
    >>> explainResult = pipeline.transform(data)

    Show results.

    >>> result = chunkToDoc.transform(explainResult)
    >>> result.selectExpr("explode(chunkConverted)").show(truncate=False)
    +------------------------------------------------------------------------------+
    |col                                                                           |
    +------------------------------------------------------------------------------+
    |[document, 0, 7, New York, [entity -> LOC, sentence -> 0, chunk -> 0], []]    |
    |[document, 13, 22, New Jersey, [entity -> LOC, sentence -> 0, chunk -> 1], []]|
    +------------------------------------------------------------------------------+

    See Also
    --------
    Doc2Chunk : for converting ``DOCUMENT`` annotations to ``CHUNK``
    """

    name = "Chunk2Doc"

    inputAnnotatorTypes = [AnnotatorType.CHUNK]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    @keyword_only
    def __init__(self):
        super(Chunk2Doc, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Chunk2Doc")

    @keyword_only
    def setParams(self):
        kwargs = self._input_kwargs
        return self._set(**kwargs)
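
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the library source): assuming Spark NLP is
# installed, a Spark session can be started with sparknlp.start(), and the
# pretrained "explain_document_dl" pipeline can be downloaded, this shows how
# Chunk2Doc turns extracted entity chunks back into DOCUMENT annotations so
# they can be re-tokenized downstream. Column names such as "chunkConverted"
# and "chunkTokens" are illustrative choices, not fixed by the API.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sparknlp
    from sparknlp.annotator import Tokenizer
    from sparknlp.pretrained import PretrainedPipeline

    spark = sparknlp.start()

    data = spark.createDataFrame(
        [[1, "New York and New Jersey aren't that far apart actually."]]
    ).toDF("id", "text")

    # Extract named entities (the pipeline's "entities" column, of type CHUNK)
    # and convert them back into DOCUMENT annotations.
    pipeline = PretrainedPipeline("explain_document_dl")
    chunk_to_doc = Chunk2Doc().setInputCols("entities").setOutputCol("chunkConverted")
    converted = chunk_to_doc.transform(pipeline.transform(data))

    # The DOCUMENT output can now feed further stages, e.g. re-tokenization of
    # the extracted entities only.
    tokenizer = Tokenizer().setInputCols(["chunkConverted"]).setOutputCol("chunkTokens")
    tokenizer.fit(converted).transform(converted) \
        .selectExpr("explode(chunkTokens) as token").show(truncate=False)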