
#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the NorvigSweeting spell checker."""

from sparknlp.common import *


class NorvigSweetingApproach(AnnotatorApproach):
    """Trains an annotator that retrieves tokens and makes corrections
    automatically if they are not found in an English dictionary, based on the
    algorithm by Peter Norvig.

    The algorithm takes a Bayesian approach to spell checking: given a word,
    it looks in the provided dictionary and chooses the candidate with the
    highest probability of being the correct one.

    A dictionary of correct spellings must be provided with
    :meth:`.setDictionary` in the form of a text file, where each word is
    parsed by a regex pattern.

    For instantiated/pretrained models, see :class:`.NorvigSweetingModel`.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    dictionary
        Dictionary needs a 'tokenPattern' regex for separating words
    caseSensitive
        Whether the spell checker is case sensitive, by default False
    doubleVariants
        Whether to use the more expensive spell checker, by default False.
        Increases search at the cost of performance by enabling an extra
        check for word combinations.
    shortCircuit
        Whether to use the faster mode, by default False.
        Increases performance at the cost of accuracy.
    frequencyPriority
        Applies frequency over hamming in intersections; when false, hamming
        takes priority. By default True
    wordSizeIgnore
        Minimum size of word before ignoring, by default 3
    dupsLimit
        Maximum duplicates of characters in a word to consider, by default 2
    reductLimit
        Word reductions limit, by default 3
    intersections
        Hamming intersections to attempt, by default 10
    vowelSwapLimit
        Vowel swap attempts, by default 6

    References
    ----------
    Inspired by the spell checker by Peter Norvig:
    `How to Write a Spelling Corrector <https://norvig.com/spell-correct.html>`__

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline

    In this example, the dictionary ``"words.txt"`` has the form of::

        ...
        gummy
        gummic
        gummier
        gummiest
        gummiferous
        ...

    This dictionary is then set to be the basis of the spell checker.

    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = NorvigSweetingApproach() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell") \\
    ...     .setDictionary("src/test/resources/spell/words.txt")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> pipelineModel = pipeline.fit(trainingData)

    See Also
    --------
    SymmetricDeleteApproach : for an alternative approach to spell checking
    ContextSpellCheckerApproach : for a DL based approach
    """

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    dictionary = Param(Params._dummy(),
                       "dictionary",
                       "dictionary needs 'tokenPattern' regex for separating words",
                       typeConverter=TypeConverters.identity)

    caseSensitive = Param(Params._dummy(),
                          "caseSensitive",
                          "whether the spell checker is case sensitive",
                          typeConverter=TypeConverters.toBoolean)

    doubleVariants = Param(Params._dummy(),
                           "doubleVariants",
                           "whether to use more expensive spell checker",
                           typeConverter=TypeConverters.toBoolean)

    shortCircuit = Param(Params._dummy(),
                         "shortCircuit",
                         "whether to use faster mode",
                         typeConverter=TypeConverters.toBoolean)

    frequencyPriority = Param(Params._dummy(),
                              "frequencyPriority",
                              "applies frequency over hamming in intersections. When false hamming takes priority",
                              typeConverter=TypeConverters.toBoolean)

    wordSizeIgnore = Param(Params._dummy(),
                           "wordSizeIgnore",
                           "minimum size of word before ignoring. Defaults to 3",
                           typeConverter=TypeConverters.toInt)

    dupsLimit = Param(Params._dummy(),
                      "dupsLimit",
                      "maximum duplicate of characters in a word to consider. Defaults to 2",
                      typeConverter=TypeConverters.toInt)

    reductLimit = Param(Params._dummy(),
                        "reductLimit",
                        "word reductions limit. Defaults to 3",
                        typeConverter=TypeConverters.toInt)

    intersections = Param(Params._dummy(),
                          "intersections",
                          "hamming intersections to attempt. Defaults to 10",
                          typeConverter=TypeConverters.toInt)

    vowelSwapLimit = Param(Params._dummy(),
                           "vowelSwapLimit",
                           "vowel swap attempts. Defaults to 6",
                           typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(NorvigSweetingApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach")
        self._setDefault(caseSensitive=False, doubleVariants=False, shortCircuit=False,
                         wordSizeIgnore=3, dupsLimit=2, reductLimit=3, intersections=10,
                         vowelSwapLimit=6, frequencyPriority=True)
        self.dictionary_path = ""

    def setDictionary(self, path, token_pattern="\\S+", read_as=ReadAs.TEXT, options={"format": "text"}):
        """Sets the dictionary, which needs a 'tokenPattern' regex for
        separating words.

        Parameters
        ----------
        path : str
            Path to the source file
        token_pattern : str, optional
            Pattern for token separation, by default ``\\S+``
        read_as : str, optional
            How to read the file, by default ReadAs.TEXT
        options : dict, optional
            Options to read the resource, by default {"format": "text"}
        """
        self.dictionary_path = path
        # Copy so the shared default dict is never mutated.
        opts = options.copy()
        if "tokenPattern" not in opts:
            opts["tokenPattern"] = token_pattern
        return self._set(dictionary=ExternalResource(path, read_as, opts))

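    # An illustrative sketch (not part of the library): the default token
    # pattern ``\S+`` treats every whitespace-separated chunk of the dictionary
    # file as one word. For a hypothetical comma-separated file "my_words.txt"
    # ("gummy,gummier,gummiest"), a custom pattern could be passed instead:
    #
    # >>> spellChecker = NorvigSweetingApproach() \
    # ...     .setInputCols(["token"]) \
    # ...     .setOutputCol("spell") \
    # ...     .setDictionary("my_words.txt", token_pattern="[^,]+")
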
    def setCaseSensitive(self, value):
        """Sets whether the spell checker should be case sensitive, by default
        False.

        Parameters
        ----------
        value : bool
            Whether the spell checker should be case sensitive
        """
        return self._set(caseSensitive=value)

    def setDoubleVariants(self, value):
        """Sets whether to use the more expensive spell checker, by default
        False.

        Increases search at the cost of performance by enabling an extra
        check for word combinations.

        Parameters
        ----------
        value : bool
            Whether to use the more expensive spell checker
        """
        return self._set(doubleVariants=value)

    def setShortCircuit(self, value):
        """Sets whether to use the faster but less accurate mode, by default
        False.

        Increases performance at the cost of accuracy.

        Parameters
        ----------
        value : bool
            Whether to use the faster mode
        """
        return self._set(shortCircuit=value)

    def setFrequencyPriority(self, value):
        """Sets whether to consider frequency over hamming in intersections;
        when false, hamming takes priority. By default True.

        Parameters
        ----------
        value : bool
            Whether to consider frequency over hamming in intersections
        """
        return self._set(frequencyPriority=value)

    def _create_model(self, java_model):
        return NorvigSweetingModel(java_model=java_model)

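
# A minimal sketch (not part of the library) of the candidate generation at
# the heart of Norvig's algorithm: build every string within one edit (delete,
# transpose, replace, insert) of the input and pick the known candidate with
# the highest dictionary frequency. The real annotator layers hamming
# intersections, vowel swaps and double variants on top of this idea.
# `_norvig_sketch` and `word_counts` are illustrative names, not library API.
def _norvig_sketch(word, word_counts):
    """Return the most frequent dictionary candidate within one edit of word.

    word_counts is assumed to map known words to their corpus frequencies,
    e.g. _norvig_sketch("wrrite", {"write": 100, "wrote": 80}) -> "write".
    """
    if word in word_counts:
        # Known words are left untouched.
        return word
    letters = "abcdefghijklmnopqrstuvwxyz"
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l, r in splits if r]
    transposes = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r) > 1]
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    inserts = [l + c + r for l, r in splits for c in letters]
    candidates = set(deletes + transposes + replaces + inserts)
    known = [c for c in candidates if c in word_counts]
    # Fall back to the original word when nothing in the dictionary is close.
    return max(known, key=word_counts.get) if known else word
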
class NorvigSweetingModel(AnnotatorModel):
    """This annotator retrieves tokens and makes corrections automatically if
    not found in an English dictionary.

    The algorithm takes a Bayesian approach to spell checking: given a word,
    it chooses the dictionary candidate with the highest probability of being
    the correct one.

    This is the instantiated model of the :class:`.NorvigSweetingApproach`.
    For training your own model, please see the documentation of that class.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> spellChecker = NorvigSweetingModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell")

    The default model is ``"spellcheck_norvig"``, if no name is provided.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Spell+Check>`__.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    None

    References
    ----------
    Inspired by Norvig model and `SymSpell
    <https://github.com/wolfgarbe/SymSpell>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = NorvigSweetingModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> data = spark.createDataFrame([["somtimes i wrrite wordz erong."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("spell.result").show(truncate=False)
    +--------------------------------------+
    |result                                |
    +--------------------------------------+
    |[sometimes, i, write, words, wrong, .]|
    +--------------------------------------+

    See Also
    --------
    SymmetricDeleteModel : for an alternative approach to spell checking
    ContextSpellCheckerModel : for a DL based approach
    """
    name = "NorvigSweetingModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel", java_model=None):
        super(NorvigSweetingModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="spellcheck_norvig", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "spellcheck_norvig"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        NorvigSweetingModel
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(NorvigSweetingModel, name, lang, remote_loc)
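

# A usage sketch (assumes Spark NLP is installed and the pretrained model can
# be downloaded; the sample sentence is the one from the class docstring).
# LightPipeline runs the fitted pipeline on plain strings without a cluster
# round-trip, which is handy for quick checks. Not executed on import.
if __name__ == "__main__":
    import sparknlp
    from sparknlp.base import DocumentAssembler, LightPipeline
    from sparknlp.annotator import Tokenizer
    from pyspark.ml import Pipeline

    spark = sparknlp.start()
    documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    spellChecker = NorvigSweetingModel.pretrained() \
        .setInputCols(["token"]) \
        .setOutputCol("spell")
    pipeline = Pipeline().setStages([documentAssembler, tokenizer, spellChecker])
    # Fit on an empty DataFrame: pretrained stages need no training data.
    empty = spark.createDataFrame([[""]]).toDF("text")
    light = LightPipeline(pipeline.fit(empty))
    print(light.annotate("somtimes i wrrite wordz erong.")["spell"])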