
#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for SymmetricDelete."""

from sparknlp.common import *


class SymmetricDeleteApproach(AnnotatorApproach):
    """Trains a Symmetric Delete spelling correction algorithm.

    Retrieves tokens and utilizes distance metrics to compute possible derived
    words.

    The Symmetric Delete spelling correction algorithm reduces the complexity
    of edit candidate generation and dictionary lookup for a given
    Damerau-Levenshtein distance. It is six orders of magnitude faster (than
    the standard approach with deletes + transposes + replaces + inserts) and
    language independent.

    A dictionary of correct spellings must be provided with
    :meth:`.setDictionary` in the form of a text file, where each word is
    parsed by a regex pattern.

    For instantiated/pretrained models, see :class:`.SymmetricDeleteModel`.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    dictionary
        folder or file with text that teaches about the language
    maxEditDistance
        max edit distance characters to derive strings from a word, by default 3
    frequencyThreshold
        minimum frequency of words to be considered from training, by default 0
    deletesThreshold
        minimum frequency of corrections a word needs to have to be considered
        from training, by default 0

    References
    ----------
    Inspired by `SymSpell <https://github.com/wolfgarbe/SymSpell>`__.

    Examples
    --------
    In this example, the dictionary ``"words.txt"`` has the form of::

        ...
        gummy
        gummic
        gummier
        gummiest
        gummiferous
        ...

    This dictionary is then set to be the basis of the spell checker.

    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = SymmetricDeleteApproach() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell") \\
    ...     .setDictionary("src/test/resources/spell/words.txt")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> pipelineModel = pipeline.fit(trainingData)

    See Also
    --------
    NorvigSweetingApproach : for an alternative approach to spell checking
    ContextSpellCheckerApproach : for a DL based approach
    """

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    corpus = Param(Params._dummy(),
                   "corpus",
                   "folder or file with text that teaches about the language",
                   typeConverter=TypeConverters.identity)

    dictionary = Param(Params._dummy(),
                       "dictionary",
                       "folder or file with text that teaches about the language",
                       typeConverter=TypeConverters.identity)

    maxEditDistance = Param(Params._dummy(),
                            "maxEditDistance",
                            "max edit distance characters to derive strings from a word",
                            typeConverter=TypeConverters.toInt)

    frequencyThreshold = Param(Params._dummy(),
                               "frequencyThreshold",
                               "minimum frequency of words to be considered from training. " +
                               "Increase if training set is LARGE. Defaults to 0",
                               typeConverter=TypeConverters.toInt)

    deletesThreshold = Param(Params._dummy(),
                             "deletesThreshold",
                             "minimum frequency of corrections a word needs to have to be considered from training. " +
                             "Increase if training set is LARGE. Defaults to 0",
                             typeConverter=TypeConverters.toInt)

    dupsLimit = Param(Params._dummy(),
                      "dupsLimit",
                      "maximum duplicate of characters in a word to consider. Defaults to 2",
                      typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(SymmetricDeleteApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteApproach")
        self._setDefault(maxEditDistance=3,
                         frequencyThreshold=0,
                         deletesThreshold=0,
                         dupsLimit=2)
        self.dictionary_path = ""
    def setDictionary(self, path, token_pattern="\\S+", read_as=ReadAs.TEXT, options={"format": "text"}):
        """Sets folder or file with text that teaches about the language.

        Parameters
        ----------
        path : str
            Path to the resource
        token_pattern : str, optional
            Regex pattern to extract tokens, by default "\\S+"
        read_as : str, optional
            How to read the resource, by default ReadAs.TEXT
        options : dict, optional
            Options for reading the resource, by default {"format": "text"}
        """
        self.dictionary_path = path
        opts = options.copy()
        if "tokenPattern" not in opts:
            opts["tokenPattern"] = token_pattern
        return self._set(dictionary=ExternalResource(path, read_as, opts))
    def setMaxEditDistance(self, v):
        """Sets max edit distance characters to derive strings from a word, by
        default 3.

        Parameters
        ----------
        v : int
            Max edit distance characters to derive strings from a word
        """
        return self._set(maxEditDistance=v)
    def setFrequencyThreshold(self, v):
        """Sets minimum frequency of words to be considered from training, by
        default 0.

        Parameters
        ----------
        v : int
            Minimum frequency of words to be considered from training
        """
        return self._set(frequencyThreshold=v)
    def setDeletesThreshold(self, v):
        """Sets minimum frequency of corrections a word needs to have to be
        considered from training, by default 0.

        Parameters
        ----------
        v : int
            Minimum frequency of corrections a word needs to have to be
            considered from training
        """
        return self._set(deletesThreshold=v)
    def _create_model(self, java_model):
        return SymmetricDeleteModel(java_model=java_model)
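
# A minimal sketch of the candidate generation described in the class
# docstring above -- illustrative only, not part of the Spark NLP API (the
# actual training and lookup run on the JVM side). The premise behind
# SymSpell is that two words within a given edit distance share at least one
# delete-variant, so precomputing deletes for every dictionary word turns
# correction lookup into a cheap set intersection instead of enumerating
# inserts, replaces, and transposes for each query. The helper name
# ``_delete_variants_sketch`` is hypothetical.
def _delete_variants_sketch(word, max_edit_distance=1):
    """Illustration only: all strings reachable from ``word`` by deleting up
    to ``max_edit_distance`` characters (including ``word`` itself)."""
    variants = {word}
    for _ in range(max_edit_distance):
        variants |= {v[:i] + v[i + 1:] for v in variants for i in range(len(v))}
    return variants
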
class SymmetricDeleteModel(AnnotatorModel):
    """Symmetric Delete spelling correction algorithm.

    The Symmetric Delete spelling correction algorithm reduces the complexity
    of edit candidate generation and dictionary lookup for a given
    Damerau-Levenshtein distance. It is six orders of magnitude faster (than
    the standard approach with deletes + transposes + replaces + inserts) and
    language independent.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> spell = SymmetricDeleteModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell")

    The default model is ``"spellcheck_sd"``, if no name is provided.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Spell+Check>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    None

    References
    ----------
    Inspired by `SymSpell <https://github.com/wolfgarbe/SymSpell>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = SymmetricDeleteModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> data = spark.createDataFrame([["spmetimes i wrrite wordz erong."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("spell.result").show(truncate=False)
    +--------------------------------------+
    |result                                |
    +--------------------------------------+
    |[sometimes, i, write, words, wrong, .]|
    +--------------------------------------+

    See Also
    --------
    NorvigSweetingModel : for an alternative approach to spell checking
    ContextSpellCheckerModel : for a DL based approach
    """

    name = "SymmetricDeleteModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel",
                 java_model=None):
        super(SymmetricDeleteModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="spellcheck_sd", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "spellcheck_sd"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        SymmetricDeleteModel
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(SymmetricDeleteModel, name, lang, remote_loc)
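
# A tiny end-to-end illustration of the lookup side, again only a sketch of
# the idea rather than the annotator's implementation: a dictionary word is a
# correction candidate for a query word whenever their delete-variant sets
# intersect. The toy dictionary mirrors the words.txt excerpt shown in the
# SymmetricDeleteApproach docstring.
if __name__ == "__main__":
    toy_dictionary = {"gummy", "gummier", "gummiest", "gummiferous"}
    query = "gummmy"  # one spurious character
    candidates = [word for word in toy_dictionary
                  if _delete_variants_sketch(word) & _delete_variants_sketch(query)]
    print(candidates)  # ['gummy']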