# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the RecursiveTokenizer."""
from sparknlp.common import *


class RecursiveTokenizer(AnnotatorApproach):
"""Tokenizes raw text recursively based on a handful of definable rules.
Unlike the Tokenizer, the RecursiveTokenizer operates based on these array
string parameters only:
- ``prefixes``: Strings that will be split when found at the beginning of
token.
- ``suffixes``: Strings that will be split when found at the end of token.
- ``infixes``: Strings that will be split when found at the middle of token.
- ``whitelist``: Whitelist of strings not to split
For extended examples of usage, see the `Examples
<https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/italian/Training_Context_Spell_Checker_Italian.ipynb>`__.
====================== ======================
Input Annotation types Output Annotation type
====================== ======================
``DOCUMENT`` ``TOKEN``
====================== ======================
Parameters
----------
prefixes
Strings to be considered independent tokens when found at the beginning
of a word, by default ["'", '"', '(', '[', '\\n']
suffixes
Strings to be considered independent tokens when found at the end of a
word, by default ['.', ':', '%', ',', ';', '?', "'", '"', ')', ']',
'\\n', '!', "'s"]
infixes
Strings to be considered independent tokens when found in the middle of
a word, by default ['\\n', '(', ')']
whitelist
Strings to be considered as single tokens , by default ["it\'s",
"that\'s", "there\'s", "he\'s", "she\'s", "what\'s", "let\'s", "who\'s",
"It\'s", "That\'s", "There\'s", "He\'s", "She\'s", "What\'s", "Let\'s",
"Who\'s"]

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = RecursiveTokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer
    ... ])
    >>> data = spark.createDataFrame([["One, after the Other, (and) again. PO, QAM,"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("token.result").show(truncate=False)
    +------------------------------------------------------------------+
    |result                                                            |
    +------------------------------------------------------------------+
    |[One, ,, after, the, Other, ,, (, and, ), again, ., PO, ,, QAM, ,]|
    +------------------------------------------------------------------+
    """
    name = 'RecursiveTokenizer'

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.TOKEN

    prefixes = Param(Params._dummy(),
                     "prefixes",
                     "strings to be considered independent tokens when found at the beginning of a word",
                     typeConverter=TypeConverters.toListString)

    suffixes = Param(Params._dummy(),
                     "suffixes",
                     "strings to be considered independent tokens when found at the end of a word",
                     typeConverter=TypeConverters.toListString)

    infixes = Param(Params._dummy(),
                    "infixes",
                    "strings to be considered independent tokens when found in the middle of a word",
                    typeConverter=TypeConverters.toListString)

    whitelist = Param(Params._dummy(),
                      "whitelist",
                      "strings to be considered as single tokens",
                      typeConverter=TypeConverters.toListString)

    def setPrefixes(self, p):
        """Sets strings to be considered independent tokens when found at the
        beginning of a word, by default ["'", '"', '(', '[', '\\n'].

        Parameters
        ----------
        p : List[str]
            Strings to be considered independent tokens when found at the
            beginning of a word
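
        Examples
        --------
        A minimal sketch with illustrative values (the ``#`` is a hypothetical
        prefix added for demonstration, not one of the defaults):

        >>> tokenizer = RecursiveTokenizer() \\
        ...     .setInputCols(["document"]) \\
        ...     .setOutputCol("token") \\
        ...     .setPrefixes(["#", "'", '"', "(", "["])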
"""
return self._set(prefixes=p)

    def setSuffixes(self, s):
        """Sets strings to be considered independent tokens when found at the
        end of a word, by default ['.', ':', '%', ',', ';', '?', "'", '"', ')',
        ']', '\\n', '!', "'s"].

        Parameters
        ----------
        s : List[str]
            Strings to be considered independent tokens when found at the end of
            a word
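
        Examples
        --------
        A minimal sketch; the list shown is an illustrative subset of the
        defaults, not a recommendation:

        >>> tokenizer = RecursiveTokenizer() \\
        ...     .setInputCols(["document"]) \\
        ...     .setOutputCol("token") \\
        ...     .setSuffixes([".", ",", "!", "?"])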
"""
return self._set(suffixes=s)

    def setInfixes(self, i):
        """Sets strings to be considered independent tokens when found in the
        middle of a word, by default ['\\n', '(', ')'].

        Parameters
        ----------
        i : List[str]
            Strings to be considered independent tokens when found in the middle
            of a word
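
        Examples
        --------
        A minimal sketch; the hyphen is a hypothetical infix added for
        illustration, not a default:

        >>> tokenizer = RecursiveTokenizer() \\
        ...     .setInputCols(["document"]) \\
        ...     .setOutputCol("token") \\
        ...     .setInfixes(["-", "(", ")"])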
        """
        return self._set(infixes=i)

    def setWhitelist(self, w):
        """Sets strings to be considered as single tokens, by default ["it's",
        "that's", "there's", "he's", "she's", "what's", "let's", "who's",
        "It's", "That's", "There's", "He's", "She's", "What's", "Let's",
        "Who's"].

        Parameters
        ----------
        w : List[str]
            Strings to be considered as single tokens
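
        Examples
        --------
        A minimal sketch; the contractions shown are illustrative additions,
        not the default list:

        >>> tokenizer = RecursiveTokenizer() \\
        ...     .setInputCols(["document"]) \\
        ...     .setOutputCol("token") \\
        ...     .setWhitelist(["won't", "can't", "it's"])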
"""
return self._set(whitelist=w)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizer"):
        super(RecursiveTokenizer, self).__init__(classname=classname)
        self._setDefault(
            prefixes=["'", "\"", "(", "[", "\n"],
            infixes=["\n", "(", ")"],
            suffixes=[".", ":", "%", ",", ";", "?", "'", "\"", ")", "]", "\n", "!", "'s"],
            whitelist=["it's", "that's", "there's", "he's", "she's", "what's", "let's", "who's",
                       "It's", "That's", "There's", "He's", "She's", "What's", "Let's", "Who's"]
        )

    def _create_model(self, java_model):
        return RecursiveTokenizerModel(java_model=java_model)


class RecursiveTokenizerModel(AnnotatorModel):
"""Instantiated model of the RecursiveTokenizer.
This is the instantiated model of the :class:`.RecursiveTokenizer`.
For training your own model, please see the documentation of that class.
====================== ======================
Input Annotation types Output Annotation type
====================== ======================
``DOCUMENT`` ``TOKEN``
====================== ======================
Parameters
----------
None
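
    Examples
    --------
    A minimal sketch, reusing the ``pipeline`` and ``data`` objects from the
    :class:`.RecursiveTokenizer` example; the fitted pipeline holds this model
    as its last stage:

    >>> model = pipeline.fit(data).stages[-1]
    >>> isinstance(model, RecursiveTokenizerModel)
    True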
"""

    name = 'RecursiveTokenizerModel'

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.TOKEN

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizerModel", java_model=None):
        super(RecursiveTokenizerModel, self).__init__(
            classname=classname,
            java_model=java_model
        )