# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the Tokenizer."""
from sparknlp.common import *
class Tokenizer(AnnotatorApproach):
    """Tokenizes raw text in document type columns into ``TokenizedSentence``.

    This class represents a non-fitted tokenizer. Fitting it will cause the
    internal RuleFactory to construct the rules for tokenizing from the input
    configuration.

    Identifies tokens with tokenization open standards. A few rules will help
    customizing it if defaults do not fit user needs.

    For extended examples of usage see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``TOKEN``
    ====================== ======================

    Parameters
    ----------
targetPattern
Pattern to grab from text as token candidates, by default ``\\S+``
prefixPattern
Regex with groups and begins with ``\\A`` to match target prefix, by
default ``\\A([^\\s\\w\\$\\.]*)``
suffixPattern
Regex with groups and ends with ``\\z`` to match target suffix, by
default ``([^\\s\\w]?)([^\\s\\w]*)\\z``
    infixPatterns
        Regex patterns that match tokens within a single target. Groups
        identify different sub-tokens. Multiple defaults are provided.
exceptions
Words that won't be affected by tokenization rules
exceptionsPath
Path to file containing list of exceptions
caseSensitiveExceptions
Whether to care for case sensitiveness in exceptions, by default True
contextChars
Character list used to separate from token boundaries, by default ['.',
',', ';', ':', '!', '?', '*', '-', '(', ')', '"', "'"]
splitPattern
Pattern to separate from the inside of tokens. Takes priority over
splitChars.
splitChars
Character list used to separate from the inside of tokens
minLength
Set the minimum allowed length for each token, by default 0
maxLength
Set the maximum allowed length for each token, by default 99999

    Examples
    --------
>>> import sparknlp
>>> from sparknlp.base import *
>>> from sparknlp.annotator import *
>>> from pyspark.ml import Pipeline
>>> data = spark.createDataFrame([["I'd like to say we didn't expect that. Jane's boyfriend."]]).toDF("text")
>>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
>>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token").fit(data)
>>> pipeline = Pipeline().setStages([documentAssembler, tokenizer]).fit(data)
>>> result = pipeline.transform(data)
>>> result.selectExpr("token.result").show(truncate=False)
+-----------------------------------------------------------------------+
    |result                                                                 |
+-----------------------------------------------------------------------+
|[I'd, like, to, say, we, didn't, expect, that, ., Jane's, boyfriend, .]|
+-----------------------------------------------------------------------+
"""
name = 'Tokenizer'
inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
outputAnnotatorType = AnnotatorType.TOKEN
    targetPattern = Param(Params._dummy(),
                          "targetPattern",
                          "pattern to grab from text as token candidates. Defaults \\S+",
                          typeConverter=TypeConverters.toString)
    prefixPattern = Param(Params._dummy(),
                          "prefixPattern",
                          "regex with groups and begins with \\A to match target prefix. Defaults to \\A([^\\s\\w\\$\\.]*)",
                          typeConverter=TypeConverters.toString)
    suffixPattern = Param(Params._dummy(),
                          "suffixPattern",
                          "regex with groups and ends with \\z to match target suffix. Defaults to ([^\\s\\w]?)([^\\s\\w]*)\\z",
                          typeConverter=TypeConverters.toString)
infixPatterns = Param(Params._dummy(),
"infixPatterns",
"regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults",
typeConverter=TypeConverters.toListString)
exceptions = Param(Params._dummy(),
"exceptions",
"Words that won't be affected by tokenization rules",
typeConverter=TypeConverters.toListString)
exceptionsPath = Param(Params._dummy(),
"exceptionsPath",
"path to file containing list of exceptions",
typeConverter=TypeConverters.identity)
caseSensitiveExceptions = Param(Params._dummy(),
"caseSensitiveExceptions",
"Whether to care for case sensitiveness in exceptions",
typeConverter=TypeConverters.toBoolean)
contextChars = Param(Params._dummy(),
"contextChars",
"character list used to separate from token boundaries",
typeConverter=TypeConverters.toListString)
    splitPattern = Param(Params._dummy(),
                         "splitPattern",
                         "pattern to separate from the inside of tokens. Takes priority over splitChars.",
                         typeConverter=TypeConverters.toString)
splitChars = Param(Params._dummy(),
"splitChars",
"character list used to separate from the inside of tokens",
typeConverter=TypeConverters.toListString)
minLength = Param(Params._dummy(),
"minLength",
"Set the minimum allowed length for each token",
typeConverter=TypeConverters.toInt)
maxLength = Param(Params._dummy(),
"maxLength",
"Set the maximum allowed length for each token",
typeConverter=TypeConverters.toInt)
@keyword_only
def __init__(self):
super(Tokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Tokenizer")
self._setDefault(
targetPattern="\\S+",
contextChars=[".", ",", ";", ":", "!", "?", "*", "-", "(", ")", "\"", "'"],
caseSensitiveExceptions=True,
minLength=0,
maxLength=99999
)
    def getInfixPatterns(self):
"""Gets regex patterns that match tokens within a single target. Groups
identify different sub-tokens.
Returns
-------
List[str]
The infix patterns
"""
return self.getOrDefault("infixPatterns")
    def getSuffixPattern(self):
"""Gets regex with groups and ends with ``\\z`` to match target suffix.
Returns
-------
str
The suffix pattern
"""
return self.getOrDefault("suffixPattern")
    def getPrefixPattern(self):
"""Gets regex with groups and begins with ``\\A`` to match target
prefix.
Returns
-------
str
The prefix pattern
"""
return self.getOrDefault("prefixPattern")
    def getContextChars(self):
"""Gets character list used to separate from token boundaries.
Returns
-------
List[str]
Character list used to separate from token boundaries
"""
return self.getOrDefault("contextChars")
    def getSplitChars(self):
"""Gets character list used to separate from the inside of tokens.
Returns
-------
List[str]
Character list used to separate from the inside of tokens
"""
return self.getOrDefault("splitChars")
    def setTargetPattern(self, value):
"""Sets pattern to grab from text as token candidates, by default
``\\S+``.
Parameters
----------
value : str
Pattern to grab from text as token candidates
"""
return self._set(targetPattern=value)
    def setPrefixPattern(self, value):
"""Sets regex with groups and begins with ``\\A`` to match target prefix, by
default ``\\A([^\\s\\w\\$\\.]*)``.
Parameters
----------
value : str
Regex with groups and begins with ``\\A`` to match target prefix
"""
return self._set(prefixPattern=value)
    def setSuffixPattern(self, value):
"""Sets regex with groups and ends with ``\\z`` to match target suffix,
by default ``([^\\s\\w]?)([^\\s\\w]*)\\z``.
Parameters
----------
value : str
Regex with groups and ends with ``\\z`` to match target suffix
"""
return self._set(suffixPattern=value)
    def setInfixPatterns(self, value):
"""Sets regex patterns that match tokens within a single target. Groups
identify different sub-tokens.
Parameters
----------
value : List[str]
Regex patterns that match tokens within a single target
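
        Examples
        --------
        A minimal sketch; the pattern below is illustrative (not a library
        default) and splits a hyphenated word into three sub-tokens:

        >>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
        >>> tokenizer = tokenizer.setInfixPatterns(["([^-]+)(-)([^-]+)"])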
"""
return self._set(infixPatterns=value)
    def addInfixPattern(self, value):
        """Adds an additional regex pattern that matches tokens within a single
        target. Groups identify different sub-tokens.
Parameters
----------
value : str
            Regex pattern that matches tokens within a single target
"""
try:
infix_patterns = self.getInfixPatterns()
except KeyError:
infix_patterns = []
infix_patterns.insert(0, value)
return self._set(infixPatterns=infix_patterns)
    def setExceptions(self, value):
"""Sets words that won't be affected by tokenization rules.
Parameters
----------
value : List[str]
Words that won't be affected by tokenization rules
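
        Examples
        --------
        A minimal sketch; the exception values are illustrative and keep a
        multi-word term and a hyphenated word as single tokens:

        >>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
        >>> tokenizer = tokenizer.setExceptions(["New York", "e-mail"])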
"""
return self._set(exceptions=value)
    def getExceptions(self):
"""Gets words that won't be affected by tokenization rules.
Returns
-------
List[str]
Words that won't be affected by tokenization rules
"""
return self.getOrDefault("exceptions")
    def setExceptionsPath(self, path, read_as=ReadAs.TEXT, options={"format": "text"}):
        """Sets the path to a text file containing a list of token exceptions.
Parameters
----------
path : str
Path to the source file
read_as : str, optional
How to read the file, by default ReadAs.TEXT
options : dict, optional
Options to read the resource, by default {"format": "text"}
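
        Examples
        --------
        A minimal sketch; ``exceptions.txt`` is a hypothetical local file with
        one exception per line:

        >>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
        >>> tokenizer = tokenizer.setExceptionsPath("exceptions.txt")  # hypothetical file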
"""
opts = options.copy()
return self._set(exceptionsPath=ExternalResource(path, read_as, opts))
    def addException(self, value):
"""Adds an additional word that won't be affected by tokenization rules.
Parameters
----------
value : str
Additional word that won't be affected by tokenization rules
"""
try:
exception_tokens = self.getExceptions()
except KeyError:
exception_tokens = []
exception_tokens.append(value)
return self._set(exceptions=exception_tokens)
    def setCaseSensitiveExceptions(self, value):
"""Sets whether to care for case sensitiveness in exceptions, by default
True.
Parameters
----------
value : bool
Whether to care for case sensitiveness in exceptions
"""
return self._set(caseSensitiveExceptions=value)
    def getCaseSensitiveExceptions(self):
"""Gets whether to care for case sensitiveness in exceptions.
Returns
-------
bool
Whether to care for case sensitiveness in exceptions
"""
return self.getOrDefault("caseSensitiveExceptions")
    def setContextChars(self, value):
"""Sets character list used to separate from token boundaries, by
default ['.', ',', ';', ':', '!', '?', '*', '-', '(', ')', '"', "'"].
Parameters
----------
value : List[str]
Character list used to separate from token boundaries
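
        Examples
        --------
        A minimal sketch restricting boundary characters to an illustrative
        subset of the defaults:

        >>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
        >>> tokenizer = tokenizer.setContextChars([".", ",", "!", "?"])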
"""
return self._set(contextChars=value)
    def addContextChars(self, value):
"""Adds an additional character to the list used to separate from token
boundaries.
Parameters
----------
value : str
Additional context character
"""
try:
context_chars = self.getContextChars()
except KeyError:
context_chars = []
context_chars.append(value)
return self._set(contextChars=context_chars)
    def setSplitPattern(self, value):
"""Sets pattern to separate from the inside of tokens. Takes priority
over splitChars.
Parameters
----------
value : str
Pattern used to separate from the inside of tokens
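
        Examples
        --------
        A minimal sketch; the regex below is illustrative and splits tokens on
        hyphens or forward slashes:

        >>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
        >>> tokenizer = tokenizer.setSplitPattern("[-/]")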
"""
return self._set(splitPattern=value)
    def setSplitChars(self, value):
"""Sets character list used to separate from the inside of tokens.
Parameters
----------
value : List[str]
Character list used to separate from the inside of tokens
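
        Examples
        --------
        A minimal sketch; the characters below are illustrative:

        >>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
        >>> tokenizer = tokenizer.setSplitChars(["-", "/"])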
"""
return self._set(splitChars=value)
    def addSplitChars(self, value):
"""Adds an additional character to separate from the inside of tokens.
Parameters
----------
value : str
Additional character to separate from the inside of tokens
"""
try:
split_chars = self.getSplitChars()
except KeyError:
split_chars = []
split_chars.append(value)
return self._set(splitChars=split_chars)
    def setMinLength(self, value):
"""Sets the minimum allowed length for each token, by default 0.
Parameters
----------
value : int
Minimum allowed length for each token
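
        Examples
        --------
        A minimal sketch dropping tokens shorter than three characters; the
        threshold is illustrative:

        >>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
        >>> tokenizer = tokenizer.setMinLength(3)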
"""
return self._set(minLength=value)
    def setMaxLength(self, value):
"""Sets the maximum allowed length for each token, by default 99999.
Parameters
----------
value : int
Maximum allowed length for each token
"""
return self._set(maxLength=value)
def _create_model(self, java_model):
return TokenizerModel(java_model=java_model)
class TokenizerModel(AnnotatorModel):
    """Tokenizes raw text into word pieces, tokens. Identifies tokens with
    tokenization open standards. A few rules will help customizing it if
    defaults do not fit user needs.

    This class represents an already fitted :class:`.Tokenizer`.

    See the main class Tokenizer for more examples of usage.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    splitPattern
        Pattern to separate from the inside of tokens. Takes priority over
        splitChars.
    splitChars
        Character list used to separate from the inside of tokens
"""
name = "TokenizerModel"
inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
outputAnnotatorType = AnnotatorType.TOKEN
exceptions = Param(Params._dummy(),
"exceptions",
"Words that won't be affected by tokenization rules",
typeConverter=TypeConverters.toListString)
caseSensitiveExceptions = Param(Params._dummy(),
"caseSensitiveExceptions",
"Whether to care for case sensitiveness in exceptions",
typeConverter=TypeConverters.toBoolean)
    targetPattern = Param(Params._dummy(),
                          "targetPattern",
                          "pattern to grab from text as token candidates. Defaults \\S+",
                          typeConverter=TypeConverters.toString)
rules = Param(Params._dummy(),
"rules",
"Rules structure factory containing pre processed regex rules",
typeConverter=TypeConverters.identity)
    splitPattern = Param(Params._dummy(),
                         "splitPattern",
                         "pattern to separate from the inside of tokens. Takes priority over splitChars.",
                         typeConverter=TypeConverters.toString)
splitChars = Param(Params._dummy(),
"splitChars",
"character list used to separate from the inside of tokens",
typeConverter=TypeConverters.toListString)
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.TokenizerModel", java_model=None):
super(TokenizerModel, self).__init__(
classname=classname,
java_model=java_model
)
self._setDefault(
targetPattern="\\S+",
caseSensitiveExceptions=True
)
    def setSplitPattern(self, value):
"""Sets pattern to separate from the inside of tokens. Takes priority
over splitChars.
Parameters
----------
value : str
Pattern used to separate from the inside of tokens
"""
return self._set(splitPattern=value)
    def setSplitChars(self, value):
"""Sets character list used to separate from the inside of tokens.
Parameters
----------
value : List[str]
Character list used to separate from the inside of tokens
"""
return self._set(splitChars=value)
    def addSplitChars(self, value):
"""Adds an additional character to separate from the inside of tokens.
Parameters
----------
value : str
Additional character to separate from the inside of tokens
"""
try:
split_chars = self.getSplitChars()
except KeyError:
split_chars = []
split_chars.append(value)
return self._set(splitChars=split_chars)
@staticmethod
    def pretrained(name="token_rules", lang="en", remote_loc=None):
"""Downloads and loads a pretrained model.
Parameters
----------
name : str, optional
Name of the pretrained model, by default "token_rules"
lang : str, optional
Language of the pretrained model, by default "en"
remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.
Returns
-------
TokenizerModel
The restored model
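
        Examples
        --------
        A minimal sketch using the documented default name and language;
        downloading the model requires network access:

        >>> tokenizer = TokenizerModel.pretrained("token_rules", "en").setInputCols(["document"]).setOutputCol("token")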
"""
from sparknlp.pretrained import ResourceDownloader
return ResourceDownloader.downloadModel(TokenizerModel, name, lang, remote_loc)