Source code for sparknlp.annotator.chunker

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the Chunker."""
from sparknlp.common import *


class Chunker(AnnotatorModel):
    """This annotator matches a pattern of part-of-speech tags in order to
    return meaningful phrases from a document. Extracted part-of-speech tags
    are mapped onto the sentence, which can then be parsed by regular
    expressions. The part-of-speech tags are wrapped by angle brackets ``<>``
    to be easily distinguishable in the text itself.

    This example sentence will result in the form:

    .. code-block:: none

        "Peter Pipers employees are picking pecks of pickled peppers."
        "<NNP><NNP><NNS><VBP><VBG><NNS><IN><JJ><NNS><.>"

    To then extract these tags, ``regexParsers`` needs to be set with e.g.:

    >>> chunker = Chunker() \\
    ...     .setInputCols(["sentence", "pos"]) \\
    ...     .setOutputCol("chunk") \\
    ...     .setRegexParsers(["<NNP>+", "<NNS>+"])

    When defining the regular expressions, tags enclosed in angle brackets are
    treated as groups, so here specifically ``"<NNP>+"`` means 1 or more nouns
    in succession.

    For more extended examples see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/chunking/Chunk_Extraction_with_Chunker.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, POS``      ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    regexParsers
        An array of grammar based chunk parsers

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentence = SentenceDetector() \\
    ...     .setInputCols("document") \\
    ...     .setOutputCol("sentence")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["sentence"]) \\
    ...     .setOutputCol("token")
    >>> POSTag = PerceptronModel.pretrained() \\
    ...     .setInputCols("document", "token") \\
    ...     .setOutputCol("pos")
    >>> chunker = Chunker() \\
    ...     .setInputCols("sentence", "pos") \\
    ...     .setOutputCol("chunk") \\
    ...     .setRegexParsers(["<NNP>+", "<NNS>+"])
    >>> pipeline = Pipeline() \\
    ...     .setStages([
    ...       documentAssembler,
    ...       sentence,
    ...       tokenizer,
    ...       POSTag,
    ...       chunker
    ...     ])
    >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(chunk) as result").show(truncate=False)
    +-------------------------------------------------------------+
    |result                                                       |
    +-------------------------------------------------------------+
    |[chunk, 0, 11, Peter Pipers, [sentence -> 0, chunk -> 0], []]|
    |[chunk, 13, 21, employees, [sentence -> 0, chunk -> 1], []]  |
    |[chunk, 35, 39, pecks, [sentence -> 0, chunk -> 2], []]      |
    |[chunk, 52, 58, peppers, [sentence -> 0, chunk -> 3], []]    |
    +-------------------------------------------------------------+

    See Also
    --------
    PerceptronModel : for Part-Of-Speech tagging
    """

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS]

    outputAnnotatorType = AnnotatorType.CHUNK

    regexParsers = Param(Params._dummy(),
                         "regexParsers",
                         "an array of grammar based chunk parsers",
                         typeConverter=TypeConverters.toListString)

    name = "Chunker"

    @keyword_only
    def __init__(self):
        super(Chunker, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Chunker")
    def setRegexParsers(self, value):
        """Sets an array of grammar based chunk parsers.

        POS classes should be enclosed in angle brackets, then treated as
        groups.

        Parameters
        ----------
        value : List[str]
            Array of grammar based chunk parsers

        Examples
        --------
        >>> chunker = Chunker() \\
        ...     .setInputCols("sentence", "pos") \\
        ...     .setOutputCol("chunk") \\
        ...     .setRegexParsers(["<NNP>+", "<NNS>+"])
        """
        return self._set(regexParsers=value)
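
# Illustrative sketch (not part of the original module): a minimal end-to-end
# run of the Chunker with a noun-phrase style grammar. It assumes Spark NLP and
# PySpark are installed and that `sparknlp.start()` can create a session; the
# grammar "<DT>?<JJ>*<NN>+" is only an example pattern, not a library default.
if __name__ == "__main__":
    import sparknlp
    from sparknlp.base import DocumentAssembler, LightPipeline
    from sparknlp.annotator import SentenceDetector, Tokenizer, PerceptronModel
    from pyspark.ml import Pipeline

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    sentence = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
    tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
    pos = PerceptronModel.pretrained().setInputCols(["document", "token"]).setOutputCol("pos")

    # Match an optional determiner, any number of adjectives, then one or more nouns.
    chunker = Chunker() \
        .setInputCols(["sentence", "pos"]) \
        .setOutputCol("chunk") \
        .setRegexParsers(["<DT>?<JJ>*<NN>+"])

    pipeline = Pipeline().setStages([documentAssembler, sentence, tokenizer, pos, chunker])
    data = spark.createDataFrame([["The quick brown fox jumped over the lazy dog."]]).toDF("text")
    model = pipeline.fit(data)

    # LightPipeline annotates plain strings without building a DataFrame first.
    light = LightPipeline(model)
    print(light.annotate("The quick brown fox jumped over the lazy dog.")["chunk"])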