
#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for SymmetricDelete."""

from sparknlp.common import *


class SymmetricDeleteApproach(AnnotatorApproach):
    """Trains a Symmetric Delete spelling correction algorithm.

    Retrieves tokens and utilizes distance metrics to compute possible derived
    words.

    The Symmetric Delete spelling correction algorithm reduces the complexity
    of edit candidate generation and dictionary lookup for a given
    Damerau-Levenshtein distance. It is six orders of magnitude faster (than
    the standard approach with deletes + transposes + replaces + inserts) and
    language independent.

    A dictionary of correct spellings must be provided with
    :meth:`.setDictionary` in the form of a text file, where each word is
    parsed by a regex pattern.

    For instantiated/pretrained models, see :class:`.SymmetricDeleteModel`.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    dictionary
        folder or file with text that teaches about the language
    maxEditDistance
        max edit distance characters to derive strings from a word, by default 3
    frequencyThreshold
        minimum frequency of words to be considered from training, by default 0
    deletesThreshold
        minimum frequency of corrections a word needs to have to be considered
        from training, by default 0

    References
    ----------
    Inspired by `SymSpell <https://github.com/wolfgarbe/SymSpell>`__.

    Examples
    --------
    In this example, the dictionary ``"words.txt"`` has the form of::

        ...
        gummy
        gummic
        gummier
        gummiest
        gummiferous
        ...

    This dictionary is then set to be the basis of the spell checker.

    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = SymmetricDeleteApproach() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell") \\
    ...     .setDictionary("src/test/resources/spell/words.txt")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> pipelineModel = pipeline.fit(trainingData)

    See Also
    --------
    NorvigSweetingApproach : for an alternative approach to spell checking
    ContextSpellCheckerApproach : for a DL based approach
    """

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    corpus = Param(Params._dummy(),
                   "corpus",
                   "folder or file with text that teaches about the language",
                   typeConverter=TypeConverters.identity)

    dictionary = Param(Params._dummy(),
                       "dictionary",
                       "folder or file with text that teaches about the language",
                       typeConverter=TypeConverters.identity)

    maxEditDistance = Param(Params._dummy(),
                            "maxEditDistance",
                            "max edit distance characters to derive strings from a word",
                            typeConverter=TypeConverters.toInt)

    frequencyThreshold = Param(Params._dummy(),
                               "frequencyThreshold",
                               "minimum frequency of words to be considered from training. " +
                               "Increase if training set is LARGE. Defaults to 0",
                               typeConverter=TypeConverters.toInt)

    deletesThreshold = Param(Params._dummy(),
                             "deletesThreshold",
                             "minimum frequency of corrections a word needs to have to be considered from training. " +
                             "Increase if training set is LARGE. Defaults to 0",
                             typeConverter=TypeConverters.toInt)

    dupsLimit = Param(Params._dummy(),
                      "dupsLimit",
                      "maximum duplicate of characters in a word to consider. Defaults to 2",
                      typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(SymmetricDeleteApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteApproach")
        self._setDefault(maxEditDistance=3,
                         frequencyThreshold=0,
                         deletesThreshold=0,
                         dupsLimit=2)
        self.dictionary_path = ""
    def setDictionary(self, path, token_pattern="\\S+", read_as=ReadAs.TEXT, options={"format": "text"}):
        """Sets folder or file with text that teaches about the language.

        Parameters
        ----------
        path : str
            Path to the resource
        token_pattern : str, optional
            Regex pattern to extract tokens, by default "\\S+"
        read_as : str, optional
            How to read the resource, by default ReadAs.TEXT
        options : dict, optional
            Options for reading the resource, by default {"format": "text"}
        """
        self.dictionary_path = path
        opts = options.copy()
        if "tokenPattern" not in opts:
            opts["tokenPattern"] = token_pattern
        return self._set(dictionary=ExternalResource(path, read_as, opts))
    def setMaxEditDistance(self, v):
        """Sets max edit distance characters to derive strings from a word, by
        default 3.

        Parameters
        ----------
        v : int
            Max edit distance characters to derive strings from a word
        """
        return self._set(maxEditDistance=v)
    def setFrequencyThreshold(self, v):
        """Sets minimum frequency of words to be considered from training, by
        default 0.

        Parameters
        ----------
        v : int
            Minimum frequency of words to be considered from training
        """
        return self._set(frequencyThreshold=v)
    def setDeletesThreshold(self, v):
        """Sets minimum frequency of corrections a word needs to have to be
        considered from training, by default 0.

        Parameters
        ----------
        v : int
            Minimum frequency of corrections a word needs to have to be
            considered from training
        """
        return self._set(deletesThreshold=v)
    def _create_model(self, java_model):
        return SymmetricDeleteModel(java_model=java_model)
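
# A minimal sketch of the candidate generation described in the class
# docstring above -- illustrative only, not part of the Spark NLP API (the
# actual training and lookup run on the JVM side). The premise behind
# SymSpell is that two words within a given edit distance share at least one
# delete-variant, so precomputing deletes for every dictionary word turns
# correction lookup into a cheap set intersection instead of enumerating
# inserts, replaces, and transposes for each query. The helper name
# ``_delete_variants_sketch`` is hypothetical.
def _delete_variants_sketch(word, max_edit_distance=1):
    """Illustration only: all strings reachable from ``word`` by deleting up
    to ``max_edit_distance`` characters (including ``word`` itself)."""
    variants = {word}
    for _ in range(max_edit_distance):
        variants |= {v[:i] + v[i + 1:] for v in variants for i in range(len(v))}
    return variants
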
class SymmetricDeleteModel(AnnotatorModel):
    """Symmetric Delete spelling correction algorithm.

    The Symmetric Delete spelling correction algorithm reduces the complexity
    of edit candidate generation and dictionary lookup for a given
    Damerau-Levenshtein distance. It is six orders of magnitude faster (than
    the standard approach with deletes + transposes + replaces + inserts) and
    language independent.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> spell = SymmetricDeleteModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell")

    The default model is ``"spellcheck_sd"``, if no name is provided.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Spell+Check>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    None

    References
    ----------
    Inspired by `SymSpell <https://github.com/wolfgarbe/SymSpell>`__.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> spellChecker = SymmetricDeleteModel.pretrained() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("spell")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     spellChecker
    ... ])
    >>> data = spark.createDataFrame([["spmetimes i wrrite wordz erong."]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("spell.result").show(truncate=False)
    +--------------------------------------+
    |result                                |
    +--------------------------------------+
    |[sometimes, i, write, words, wrong, .]|
    +--------------------------------------+

    See Also
    --------
    NorvigSweetingModel : for an alternative approach to spell checking
    ContextSpellCheckerModel : for a DL based approach
    """

    name = "SymmetricDeleteModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel",
                 java_model=None):
        super(SymmetricDeleteModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="spellcheck_sd", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "spellcheck_sd"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        SymmetricDeleteModel
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(SymmetricDeleteModel, name, lang, remote_loc)
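
# A tiny end-to-end illustration of the lookup side, again only a sketch of
# the idea rather than the annotator's implementation: a dictionary word is a
# correction candidate for a query word whenever their delete-variant sets
# intersect. The toy dictionary mirrors the words.txt excerpt shown in the
# SymmetricDeleteApproach docstring.
if __name__ == "__main__":
    toy_dictionary = {"gummy", "gummier", "gummiest", "gummiferous"}
    query = "gummmy"  # one spurious character
    candidates = [word for word in toy_dictionary
                  if _delete_variants_sketch(word) & _delete_variants_sketch(query)]
    print(candidates)  # ['gummy']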