# Source code for sparknlp.annotator.spell_check.norvig_sweeting

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the NorvigSweeting spell checker."""

from sparknlp.common import *


class NorvigSweetingApproach(AnnotatorApproach):
    """Trains annotator, that retrieves tokens and makes corrections automatically if
    not found in an English dictionary, based on the algorithm by Peter Norvig.

    The algorithm is based on a Bayesian approach to spell checking: Given the word we
    look in the provided dictionary to choose the word with the highest probability
    to be the correct one.

    A dictionary of correct spellings must be provided with :meth:`.setDictionary` in
    the form of a text file, where each word is parsed by a regex pattern.

    For instantiated/pretrained models, see :class:`.NorvigSweetingModel`.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    dictionary
        Dictionary needs 'tokenPattern' regex in dictionary for separating words
    caseSensitive
        Whether to ignore case sensitivity, by default False
    doubleVariants
        Whether to use more expensive spell checker, by default False

        Increase search at cost of performance. Enables extra check for word
        combinations.
    shortCircuit
        Whether to use faster mode, by default False

        Increase performance at cost of accuracy. Faster but less accurate.
    frequencyPriority
        Applies frequency over hamming in intersections, when false hamming
        takes priority, by default True
    wordSizeIgnore
        Minimum size of word before ignoring, by default 3
    dupsLimit
        Maximum duplicate of characters in a word to consider, by default 2
    reductLimit
        Word reductions limit, by default 3
    intersections
        Hamming intersections to attempt, by default 10
    vowelSwapLimit
        Vowel swap attempts, by default 6

    References
    ----------

    Inspired by the spell checker by Peter Norvig:
    `How to Write a Spelling Corrector <https://norvig.com/spell-correct.html>`__

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline

    In this example, the dictionary ``"words.txt"`` has the form of::

        ...
        gummy
        gummic
        gummier
        gummiest
        gummiferous
        ...

    This dictionary is then set to be the basis of the spell checker.

    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = NorvigSweetingApproach() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell") \\
    ...     .setDictionary("src/test/resources/spell/words.txt")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> pipelineModel = pipeline.fit(trainingData)

    See Also
    --------
    SymmetricDeleteApproach : for an alternative approach to spell checking
    ContextSpellCheckerApproach : for a DL based approach
    """
    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    dictionary = Param(Params._dummy(),
                       "dictionary",
                       "dictionary needs 'tokenPattern' regex in dictionary for separating words",
                       typeConverter=TypeConverters.identity)

    caseSensitive = Param(Params._dummy(),
                          "caseSensitive",
                          "whether to ignore case sensitivty",
                          typeConverter=TypeConverters.toBoolean)

    doubleVariants = Param(Params._dummy(),
                           "doubleVariants",
                           "whether to use more expensive spell checker",
                           typeConverter=TypeConverters.toBoolean)

    shortCircuit = Param(Params._dummy(),
                         "shortCircuit",
                         "whether to use faster mode",
                         typeConverter=TypeConverters.toBoolean)

    frequencyPriority = Param(Params._dummy(),
                              "frequencyPriority",
                              "applies frequency over hamming in intersections. When false hamming takes priority",
                              typeConverter=TypeConverters.toBoolean)

    wordSizeIgnore = Param(Params._dummy(),
                           "wordSizeIgnore",
                           "minimum size of word before ignoring. Defaults to 3",
                           typeConverter=TypeConverters.toInt)

    dupsLimit = Param(Params._dummy(),
                      "dupsLimit",
                      "maximum duplicate of characters in a word to consider. Defaults to 2",
                      typeConverter=TypeConverters.toInt)

    reductLimit = Param(Params._dummy(),
                        "reductLimit",
                        "word reductions limit. Defaults to 3",
                        typeConverter=TypeConverters.toInt)

    intersections = Param(Params._dummy(),
                          "intersections",
                          "hamming intersections to attempt. Defaults to 10",
                          typeConverter=TypeConverters.toInt)

    vowelSwapLimit = Param(Params._dummy(),
                           "vowelSwapLimit",
                           "vowel swap attempts. Defaults to 6",
                           typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(NorvigSweetingApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach")
        self._setDefault(caseSensitive=False, doubleVariants=False, shortCircuit=False, wordSizeIgnore=3, dupsLimit=2,
                         reductLimit=3, intersections=10, vowelSwapLimit=6, frequencyPriority=True)
        # Remembers the last path handed to setDictionary; empty until then.
        self.dictionary_path = ""

    def setDictionary(self, path, token_pattern=r"\S+", read_as=ReadAs.TEXT, options={"format": "text"}):
        """Sets dictionary which needs 'tokenPattern' regex for separating
        words.

        Parameters
        ----------
        path : str
            Path to the source file
        token_pattern : str, optional
            Pattern for token separation, by default ``\\S+``. Only used when
            ``options`` does not already contain a ``"tokenPattern"`` entry.
        read_as : str, optional
            How to read the file, by default ReadAs.TEXT
        options : dict, optional
            Options to read the resource, by default {"format": "text"}
        """
        self.dictionary_path = path
        # Work on a copy so neither the caller's dict nor the shared default
        # argument is mutated when the token pattern is injected.
        opts = options.copy()
        if "tokenPattern" not in opts:
            opts["tokenPattern"] = token_pattern
        return self._set(dictionary=ExternalResource(path, read_as, opts))

    def setCaseSensitive(self, value):
        """Sets whether to ignore case sensitivity, by default False.

        Parameters
        ----------
        value : bool
            Whether to ignore case sensitivity
        """
        return self._set(caseSensitive=value)

    def setDoubleVariants(self, value):
        """Sets whether to use more expensive spell checker, by default False.

        Increase search at cost of performance. Enables extra check for word
        combinations.

        Parameters
        ----------
        value : bool
            Whether to use more expensive spell checker
        """
        return self._set(doubleVariants=value)

    def setShortCircuit(self, value):
        """Sets whether to use faster mode, by default False.

        Increase performance at cost of accuracy. Faster but less accurate.

        Parameters
        ----------
        value : bool
            Whether to use faster mode
        """
        return self._set(shortCircuit=value)

    def setFrequencyPriority(self, value):
        """Sets whether to consider frequency over hamming in intersections,
        when false hamming takes priority, by default True.

        Parameters
        ----------
        value : bool
            Whether to consider frequency over hamming in intersections
        """
        return self._set(frequencyPriority=value)

    def setWordSizeIgnore(self, value):
        """Sets minimum size of word before ignoring, by default 3.

        Parameters
        ----------
        value : int
            Minimum size of word before ignoring
        """
        return self._set(wordSizeIgnore=value)

    def setDupsLimit(self, value):
        """Sets maximum duplicate of characters in a word to consider, by
        default 2.

        Parameters
        ----------
        value : int
            Maximum duplicate of characters in a word to consider
        """
        return self._set(dupsLimit=value)

    def setReductLimit(self, value):
        """Sets word reductions limit, by default 3.

        Parameters
        ----------
        value : int
            Word reductions limit
        """
        return self._set(reductLimit=value)

    def setIntersections(self, value):
        """Sets hamming intersections to attempt, by default 10.

        Parameters
        ----------
        value : int
            Hamming intersections to attempt
        """
        return self._set(intersections=value)

    def setVowelSwapLimit(self, value):
        """Sets vowel swap attempts, by default 6.

        Parameters
        ----------
        value : int
            Vowel swap attempts
        """
        return self._set(vowelSwapLimit=value)

    def _create_model(self, java_model):
        # Wraps the fitted JVM model in its Python counterpart.
        return NorvigSweetingModel(java_model=java_model)


class NorvigSweetingModel(AnnotatorModel):
    """This annotator retrieves tokens and makes corrections automatically if
    not found in an English dictionary, based on the algorithm by Peter Norvig.

    The algorithm is based on a Bayesian approach to spell checking: Given the
    word, the dictionary is searched for the word with the highest probability
    to be the correct one.

    This is the instantiated model of the :class:`.NorvigSweetingApproach`. For
    training your own model, please see the documentation of that class.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> spellChecker = NorvigSweetingModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell")


    The default model is ``"spellcheck_norvig"``, if no name is provided. For
    available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Spell+Check>`__.


    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    None

    References
    ----------
    Inspired by the spell checker by Peter Norvig,
    `How to Write a Spelling Corrector <https://norvig.com/spell-correct.html>`__,
    and `SymSpell <https://github.com/wolfgarbe/SymSpell>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = NorvigSweetingModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> data = spark.createDataFrame([["somtimes i wrrite wordz erong."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("spell.result").show(truncate=False)
    +--------------------------------------+
    |result                                |
    +--------------------------------------+
    |[sometimes, i, write, words, wrong, .]|
    +--------------------------------------+

    See Also
    --------
    SymmetricDeleteModel : for an alternative approach to spell checking
    ContextSpellCheckerModel : for a DL based approach
    """
    name = "NorvigSweetingModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel", java_model=None):
        super(NorvigSweetingModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="spellcheck_norvig", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "spellcheck_norvig"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        NorvigSweetingModel
            The restored model
        """
        # Imported at call time rather than at module import.
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(NorvigSweetingModel, name, lang, remote_loc)