# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the ChunkTokenizer."""
from sparknlp.common import *
from sparknlp.annotator.token.tokenizer import Tokenizer, TokenizerModel
[docs]class ChunkTokenizer(Tokenizer):
"""Tokenizes and flattens extracted NER chunks.
The ChunkTokenizer will split the extracted NER ``CHUNK`` type Annotations
and will create ``TOKEN`` type Annotations.
The result is then flattened, resulting in a single array.
====================== ======================
Input Annotation types Output Annotation type
====================== ======================
``CHUNK`` ``TOKEN``
====================== ======================
Parameters
----------
None
Examples
--------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from sparknlp.common import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentenceDetector = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["sentence"]) \\
    ...     .setOutputCol("token")
    >>> entityExtractor = TextMatcher() \\
    ...     .setInputCols(["sentence", "token"]) \\
    ...     .setEntities("src/test/resources/entity-extractor/test-chunks.txt", ReadAs.TEXT) \\
    ...     .setOutputCol("entity")
    >>> chunkTokenizer = ChunkTokenizer() \\
    ...     .setInputCols(["entity"]) \\
    ...     .setOutputCol("chunk_token")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     sentenceDetector,
    ...     tokenizer,
    ...     entityExtractor,
    ...     chunkTokenizer
    ... ])
    >>> data = spark.createDataFrame([
    ...     ["Hello world, my name is Michael, I am an artist and I work at Benezar"],
    ...     ["Robert, an engineer from Farendell, graduated last year. The other one, Lucas, graduated last week."]
    ... ]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("entity.result as entity", "chunk_token.result as chunk_token").show(truncate=False)
    +-----------------------------------------------+---------------------------------------------------+
    |entity                                         |chunk_token                                        |
    +-----------------------------------------------+---------------------------------------------------+
    |[world, Michael, work at Benezar]              |[world, Michael, work, at, Benezar]                |
    |[engineer from Farendell, last year, last week]|[engineer, from, Farendell, last, year, last, week]|
    +-----------------------------------------------+---------------------------------------------------+
    """
    name = 'ChunkTokenizer'

    inputAnnotatorTypes = [AnnotatorType.CHUNK]

    @keyword_only
    def __init__(self):
        # Bypass Tokenizer.__init__ so the base annotator is constructed with
        # the ChunkTokenizer Java class instead of the plain Tokenizer one.
        super(Tokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ChunkTokenizer")

    def _create_model(self, java_model):
        # Fitting a ChunkTokenizer yields a ChunkTokenizerModel wrapping the
        # trained Java-side model.
        return ChunkTokenizerModel(java_model=java_model)


class ChunkTokenizerModel(TokenizerModel):
"""Instantiated model of the ChunkTokenizer.
This is the instantiated model of the :class:`.ChunkTokenizer`.
For training your own model, please see the documentation of that class.
====================== ======================
Input Annotation types Output Annotation type
====================== ======================
``CHUNK`` ``TOKEN``
====================== ======================
Parameters
----------
None
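
    Examples
    --------
    A minimal sketch, assuming ``pipeline`` and ``data`` are defined as in
    the :class:`.ChunkTokenizer` example. Fitting that pipeline produces
    this model as its final stage.

    >>> pipelineModel = pipeline.fit(data)
    >>> chunkTokenizerModel = pipelineModel.stages[-1]
    >>> isinstance(chunkTokenizerModel, ChunkTokenizerModel)
    True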
"""
    name = 'ChunkTokenizerModel'

    inputAnnotatorTypes = [AnnotatorType.CHUNK]

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ChunkTokenizerModel", java_model=None):
        # Bypass TokenizerModel.__init__ so the Java-side class defaults to
        # the ChunkTokenizerModel annotator rather than the plain TokenizerModel.
        super(TokenizerModel, self).__init__(
            classname=classname,
            java_model=java_model
        )