# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the RegexTokenizer."""
from sparknlp.common import *
class RegexTokenizer(AnnotatorModel):
"""A tokenizer that splits text by a regex pattern.
The pattern needs to be set with :meth:`.setPattern` and this sets the
delimiting pattern or how the tokens should be split. By default this
pattern is ``\\s+`` which means that tokens should be split by 1 or more
whitespace characters.
====================== ======================
Input Annotation types Output Annotation type
====================== ======================
``DOCUMENT`` ``TOKEN``
====================== ======================

Parameters
----------
minLength
Set the minimum allowed length for each token, by default 1
maxLength
Set the maximum allowed length for each token
toLowercase
Indicates whether to convert all characters to lowercase before
tokenizing, by default False
pattern
Regex pattern used for tokenizing, by default ``\\s+``
positionalMask
Whether to use a positional mask to guarantee the incremental progression
of the tokenization, by default False
trimWhitespace
Whether to remove whitespaces from identified tokens, by default False
preservePosition
Whether to preserve the initial indexes before eventual whitespace removal
in tokens, by default True

Examples
--------
>>> import sparknlp
>>> from sparknlp.base import *
>>> from sparknlp.annotator import *
>>> from pyspark.ml import Pipeline
>>> documentAssembler = DocumentAssembler() \\
... .setInputCol("text") \\
... .setOutputCol("document")
>>> regexTokenizer = RegexTokenizer() \\
... .setInputCols(["document"]) \\
... .setOutputCol("regexToken") \\
... .setToLowercase(True)
>>> pipeline = Pipeline().setStages([
... documentAssembler,
... regexTokenizer
... ])
>>> data = spark.createDataFrame([["This is my first sentence.\\nThis is my second."]]).toDF("text")
>>> result = pipeline.fit(data).transform(data)
>>> result.selectExpr("regexToken.result").show(truncate=False)
+-------------------------------------------------------+
|result |
+-------------------------------------------------------+
|[this, is, my, first, sentence., this, is, my, second.]|
+-------------------------------------------------------+
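
A custom delimiting pattern can be set with ``setPattern``. As a minimal
sketch (the variable name and input below are only illustrative), a
tokenizer splitting on commas and spaces could be configured as follows,
so that an input such as ``"one,two three"`` would be expected to yield
the tokens ``one``, ``two`` and ``three``:

>>> commaTokenizer = RegexTokenizer() \\
... .setInputCols(["document"]) \\
... .setOutputCol("regexToken") \\
... .setPattern("[ ,]+")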
"""
name = "RegexTokenizer"
inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
outputAnnotatorType = AnnotatorType.TOKEN
@keyword_only
def __init__(self):
super(RegexTokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.RegexTokenizer")
self._setDefault(
inputCols=["document"],
outputCol="regexToken",
toLowercase=False,
minLength=1,
pattern="\\s+",
positionalMask=False,
trimWhitespace=False,
preservePosition=True
)
minLength = Param(Params._dummy(),
"minLength",
"Set the minimum allowed length for each token",
typeConverter=TypeConverters.toInt)
maxLength = Param(Params._dummy(),
"maxLength",
"Set the maximum allowed length for each token",
typeConverter=TypeConverters.toInt)
toLowercase = Param(Params._dummy(),
"toLowercase",
"Indicates whether to convert all characters to lowercase before tokenizing.",
typeConverter=TypeConverters.toBoolean)
pattern = Param(Params._dummy(),
"pattern",
"regex pattern used for tokenizing. Defaults \S+",
typeConverter=TypeConverters.toString)
positionalMask = Param(Params._dummy(),
"positionalMask",
"Using a positional mask to guarantee the incremental progression of the tokenization.",
typeConverter=TypeConverters.toBoolean)
trimWhitespace = Param(Params._dummy(),
"trimWhitespace",
"Indicates whether to use a trimWhitespaces flag to remove whitespaces from identified tokens.",
typeConverter=TypeConverters.toBoolean)
preservePosition = Param(Params._dummy(),
"preservePosition",
"Indicates whether to use a preserve initial indexes before eventual whitespaces removal in tokens.",
typeConverter=TypeConverters.toBoolean)
def setMinLength(self, value):
"""Sets the minimum allowed length for each token, by default 1.

Parameters
----------
value : int
Minimum allowed length for each token
"""
return self._set(minLength=value)
def setMaxLength(self, value):
"""Sets the maximum allowed length for each token.

Parameters
----------
value : int
Maximum allowed length for each token
"""
return self._set(maxLength=value)
def setToLowercase(self, value):
"""Sets whether to convert all characters to lowercase before
tokenizing, by default False.

Parameters
----------
value : bool
Whether to convert all characters to lowercase before tokenizing
"""
return self._set(toLowercase=value)
def setPattern(self, value):
"""Sets the regex pattern used for tokenizing, by default ``\\s+``.

Parameters
----------
value : str
Regex pattern used for tokenizing
"""
return self._set(pattern=value)
def setPositionalMask(self, value):
"""Sets whether to use a positional mask to guarantee the incremental
progression of the tokenization, by default False.

Parameters
----------
value : bool
Whether to use a positional mask
"""
return self._set(positionalMask=value)
def setTrimWhitespace(self, value):
"""Sets whether to remove whitespaces from identified tokens, by default
False.

Parameters
----------
value : bool
Whether to remove whitespaces from identified tokens
"""
return self._set(trimWhitespace=value)
def setPreservePosition(self, value):
"""Sets whether to preserve the initial indexes before eventual whitespace
removal in tokens, by default True.

Parameters
----------
value : bool
Whether to preserve the initial indexes before whitespace removal in
tokens
"""
return self._set(preservePosition=value)