#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for Doc2Chunk."""

from pyspark import keyword_only
from pyspark.ml.param import TypeConverters, Params, Param

from sparknlp.internal import AnnotatorTransformer

from sparknlp.common import AnnotatorProperties, AnnotatorType


class Doc2Chunk(AnnotatorTransformer, AnnotatorProperties):
    """Converts ``DOCUMENT`` type annotations into ``CHUNK`` type with the
    contents of a ``chunkCol``.

    Chunk text must be contained within the input ``DOCUMENT``. The
    ``chunkCol`` may be either ``StringType`` or ``ArrayType[StringType]``
    (using :meth:`.setIsArray`). Useful for annotators that require a
    ``CHUNK`` type input.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    chunkCol
        Column that contains the string. Must be part of DOCUMENT
    startCol
        Column that has a reference of where the chunk begins
    startColByTokenIndex
        Whether the value in ``startCol`` is a whitespace-token index rather
        than a character index
    isArray
        Whether the chunkCol is an array of strings, by default False
    failOnMissing
        Whether to fail the job if a chunk is not found within the document.
        Returns empty otherwise
    lowerCase
        Whether to lower case the text for matching

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.common import *
    >>> from sparknlp.annotator import *
    >>> from sparknlp.training import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    >>> chunkAssembler = Doc2Chunk() \\
    ...     .setInputCols("document") \\
    ...     .setChunkCol("target") \\
    ...     .setOutputCol("chunk") \\
    ...     .setIsArray(True)
    >>> data = spark.createDataFrame([[
    ...     "Spark NLP is an open-source text processing library for advanced natural language processing.",
    ...     ["Spark NLP", "text processing library", "natural language processing"]
    ... ]]).toDF("text", "target")
    >>> pipeline = Pipeline().setStages([documentAssembler, chunkAssembler]).fit(data)
    >>> result = pipeline.transform(data)
    >>> result.selectExpr("chunk.result", "chunk.annotatorType").show(truncate=False)
    +-----------------------------------------------------------------+---------------------+
    |result                                                           |annotatorType        |
    +-----------------------------------------------------------------+---------------------+
    |[Spark NLP, text processing library, natural language processing]|[chunk, chunk, chunk]|
    +-----------------------------------------------------------------+---------------------+

    See Also
    --------
    Chunk2Doc : for converting `CHUNK` annotations to `DOCUMENT`
    """

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.CHUNK

    chunkCol = Param(Params._dummy(), "chunkCol",
                     "column that contains string. Must be part of DOCUMENT",
                     typeConverter=TypeConverters.toString)
    startCol = Param(Params._dummy(), "startCol",
                     "column that has a reference of where chunk begins",
                     typeConverter=TypeConverters.toString)
    startColByTokenIndex = Param(Params._dummy(), "startColByTokenIndex",
                                 "whether start col is by whitespace tokens",
                                 typeConverter=TypeConverters.toBoolean)
    isArray = Param(Params._dummy(), "isArray",
                    "whether the chunkCol is an array of strings",
                    typeConverter=TypeConverters.toBoolean)
    failOnMissing = Param(Params._dummy(), "failOnMissing",
                          "whether to fail the job if a chunk is not found within document. "
                          "Return empty otherwise",
                          typeConverter=TypeConverters.toBoolean)
    lowerCase = Param(Params._dummy(), "lowerCase",
                      "whether to lower case for matching case",
                      typeConverter=TypeConverters.toBoolean)

    name = "Doc2Chunk"

    @keyword_only
    def __init__(self):
        super(Doc2Chunk, self).__init__(classname="com.johnsnowlabs.nlp.Doc2Chunk")
        self._setDefault(isArray=False)

    @keyword_only
    def setParams(self):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setChunkCol(self, value):
        """Sets the column that contains the string. Must be part of DOCUMENT.

        Parameters
        ----------
        value : str
            Name of the chunk column
        """
        return self._set(chunkCol=value)

    def setIsArray(self, value):
        """Sets whether the chunkCol is an array of strings.

        Parameters
        ----------
        value : bool
            Whether the chunkCol is an array of strings
        """
        return self._set(isArray=value)

    def setStartCol(self, value):
        """Sets the column that has a reference of where the chunk begins.

        Parameters
        ----------
        value : str
            Name of the reference column
        """
        return self._set(startCol=value)

    def setStartColByTokenIndex(self, value):
        """Sets whether the start column is a whitespace-token index rather
        than a character index.

        Parameters
        ----------
        value : bool
            Whether the start column is a whitespace-token index
        """
        return self._set(startColByTokenIndex=value)

    def setFailOnMissing(self, value):
        """Sets whether to fail the job if a chunk is not found within the
        document. Returns empty otherwise.

        Parameters
        ----------
        value : bool
            Whether to fail the job on missing chunks
        """
        return self._set(failOnMissing=value)

    def setLowerCase(self, value):
        """Sets whether to lower case the text for matching.

        Parameters
        ----------
        value : bool
            Whether to lower case the text for matching
        """
        return self._set(lowerCase=value)
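
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the library source): runs
# the string-typed ``chunkCol`` path, i.e. without ``setIsArray(True)``, and
# exercises ``setLowerCase`` so the chunk text can be matched against the
# document regardless of casing. It assumes ``sparknlp.start()`` can
# bootstrap a local Spark session; the column names "text" and "target" are
# made up for the demo.
if __name__ == "__main__":
    from pyspark.ml import Pipeline

    import sparknlp
    from sparknlp.base import DocumentAssembler

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    # With lowerCase=True, the lowercased chunk "spark nlp" should still be
    # located inside the document text "Spark NLP ...".
    chunkAssembler = Doc2Chunk() \
        .setInputCols("document") \
        .setChunkCol("target") \
        .setOutputCol("chunk") \
        .setLowerCase(True)

    data = spark.createDataFrame(
        [["Spark NLP is an open-source text processing library.", "spark nlp"]]
    ).toDF("text", "target")

    result = Pipeline() \
        .setStages([documentAssembler, chunkAssembler]) \
        .fit(data) \
        .transform(data)

    # begin/end hold the character offsets of the matched chunk.
    result.selectExpr("chunk.result", "chunk.begin", "chunk.end").show(truncate=False)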