
#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the RecursiveTokenizer."""

from sparknlp.common import *


class RecursiveTokenizer(AnnotatorApproach):
    """Tokenizes raw text recursively based on a handful of definable rules.

    Unlike the Tokenizer, the RecursiveTokenizer operates based only on these
    array string parameters:

    - ``prefixes``: Strings that will be split when found at the beginning of
      a token.
    - ``suffixes``: Strings that will be split when found at the end of a
      token.
    - ``infixes``: Strings that will be split when found in the middle of a
      token.
    - ``whitelist``: Whitelist of strings that will not be split.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    prefixes
        Strings to be considered independent tokens when found at the
        beginning of a word, by default ["'", '"', '(', '[', '\\n']
    suffixes
        Strings to be considered independent tokens when found at the end of
        a word, by default ['.', ':', '%', ',', ';', '?', "'", '"', ')', ']',
        '\\n', '!', "'s"]
    infixes
        Strings to be considered independent tokens when found in the middle
        of a word, by default ['\\n', '(', ')']
    whitelist
        Strings to be considered as single tokens, by default ["it\'s",
        "that\'s", "there\'s", "he\'s", "she\'s", "what\'s", "let\'s",
        "who\'s", "It\'s", "That\'s", "There\'s", "He\'s", "She\'s",
        "What\'s", "Let\'s", "Who\'s"]

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = RecursiveTokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer
    ... ])
    >>> data = spark.createDataFrame([["One, after the Other, (and) again. PO, QAM,"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("token.result").show(truncate=False)
    +------------------------------------------------------------------+
    |result                                                            |
    +------------------------------------------------------------------+
    |[One, ,, after, the, Other, ,, (, and, ), again, ., PO, ,, QAM, ,]|
    +------------------------------------------------------------------+
    """

    name = 'RecursiveTokenizer'

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.TOKEN

    prefixes = Param(Params._dummy(),
                     "prefixes",
                     "strings to be considered independent tokens when found at the beginning of a word",
                     typeConverter=TypeConverters.toListString)

    suffixes = Param(Params._dummy(),
                     "suffixes",
                     "strings to be considered independent tokens when found at the end of a word",
                     typeConverter=TypeConverters.toListString)

    infixes = Param(Params._dummy(),
                    "infixes",
                    "strings to be considered independent tokens when found in the middle of a word",
                    typeConverter=TypeConverters.toListString)

    whitelist = Param(Params._dummy(),
                      "whitelist",
                      "strings to be considered as single tokens",
                      typeConverter=TypeConverters.toListString)
    def setPrefixes(self, p):
        """Sets strings to be considered independent tokens when found at the
        beginning of a word, by default ["'", '"', '(', '[', '\\n'].

        Parameters
        ----------
        p : List[str]
            Strings to be considered independent tokens when found at the
            beginning of a word
        """
        return self._set(prefixes=p)
    def setSuffixes(self, s):
        """Sets strings to be considered independent tokens when found at the
        end of a word, by default ['.', ':', '%', ',', ';', '?', "'", '"',
        ')', ']', '\\n', '!', "'s"].

        Parameters
        ----------
        s : List[str]
            Strings to be considered independent tokens when found at the end
            of a word
        """
        return self._set(suffixes=s)
    def setInfixes(self, i):
        """Sets strings to be considered independent tokens when found in the
        middle of a word, by default ['\\n', '(', ')'].

        Parameters
        ----------
        i : List[str]
            Strings to be considered independent tokens when found in the
            middle of a word
        """
        return self._set(infixes=i)
    def setWhitelist(self, w):
        """Sets strings to be considered as single tokens, by default
        ["it\'s", "that\'s", "there\'s", "he\'s", "she\'s", "what\'s",
        "let\'s", "who\'s", "It\'s", "That\'s", "There\'s", "He\'s", "She\'s",
        "What\'s", "Let\'s", "Who\'s"].

        Parameters
        ----------
        w : List[str]
            Strings to be considered as single tokens
        """
        return self._set(whitelist=w)
    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizer"):
        super(RecursiveTokenizer, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizer")
        # Default split rules, plus a whitelist of common English contractions
        # that should be kept as single tokens.
        self._setDefault(
            prefixes=["'", "\"", "(", "[", "\n"],
            infixes=["\n", "(", ")"],
            suffixes=[".", ":", "%", ",", ";", "?", "'", "\"", ")", "]", "\n", "!", "'s"],
            whitelist=["it's", "that's", "there's", "he's", "she's", "what's", "let's", "who's",
                       "It's", "That's", "There's", "He's", "She's", "What's", "Let's", "Who's"]
        )

    def _create_model(self, java_model):
        return RecursiveTokenizerModel(java_model=java_model)
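
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: one way the setter
# methods above could be used to customize the split rules. The function name,
# column names, and example sentence are arbitrary; it assumes Spark NLP is
# installed and a SparkSession can be started with sparknlp.start().
def _example_custom_rules():
    import sparknlp
    from sparknlp.base import DocumentAssembler
    from pyspark.ml import Pipeline

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    # Extend the default prefixes with "#" so a leading hash is split into its
    # own token. Note that the setters replace the defaults rather than
    # appending to them.
    tokenizer = RecursiveTokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token") \
        .setPrefixes(["'", "\"", "(", "[", "\n", "#"])

    pipeline = Pipeline().setStages([documentAssembler, tokenizer])
    data = spark.createDataFrame([["Send a note to #support (today)."]]).toDF("text")
    pipeline.fit(data).transform(data).select("token.result").show(truncate=False)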
class RecursiveTokenizerModel(AnnotatorModel):
    """Instantiated model of the RecursiveTokenizer.

    This is the instantiated model of the :class:`.RecursiveTokenizer`. For
    training your own model, please see the documentation of that class.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    None
    """
    name = 'RecursiveTokenizerModel'

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.TOKEN

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizerModel", java_model=None):
        super(RecursiveTokenizerModel, self).__init__(
            classname=classname,
            java_model=java_model
        )
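
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: fitting a pipeline
# that contains a RecursiveTokenizer produces a PipelineModel whose
# corresponding stage is a RecursiveTokenizerModel, which can be reused as a
# transformer. Column names and the example sentence are arbitrary; assumes
# sparknlp.start() is available.
if __name__ == "__main__":
    import sparknlp
    from sparknlp.base import DocumentAssembler
    from pyspark.ml import Pipeline

    spark = sparknlp.start()

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    tokenizer = RecursiveTokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")

    data = spark.createDataFrame([["One, after the Other, (and) again."]]).toDF("text")
    pipelineModel = Pipeline().setStages([documentAssembler, tokenizer]).fit(data)

    # The fitted tokenizer stage is a RecursiveTokenizerModel.
    fittedTokenizer = pipelineModel.stages[-1]
    print(type(fittedTokenizer))
    pipelineModel.transform(data).select("token.result").show(truncate=False)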