
#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the Normalizer."""
from sparknlp.common import *


class Normalizer(AnnotatorApproach):
    """Annotator that cleans out tokens. Requires stems, hence tokens.

    Removes all dirty characters from text following a regex pattern and
    transforms words based on a provided dictionary.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    cleanupPatterns
        Normalization regex patterns which match will be removed from token,
        by default ['[^\\pL+]']
    lowercase
        Whether to convert strings to lowercase, by default False
    slangDictionary
        Slang dictionary is a delimited text. Needs 'delimiter' in options
    minLength
        The minimum allowed length for each token, by default 0
    maxLength
        The maximum allowed length for each token

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> normalizer = Normalizer() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("normalized") \\
    ...     .setLowercase(True) \\
    ...     .setCleanupPatterns([\"\"\"[^\\w\\d\\s]\"\"\"])

    The pattern removes punctuation (keeps alphanumeric chars). If
    ``setCleanupPatterns`` is not called, only alphabet letters are kept
    (``[^A-Za-z]``).

    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     normalizer
    ... ])
    >>> data = spark.createDataFrame([["John and Peter are brothers. However they don't support each other that much."]]) \\
    ...     .toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("normalized.result").show(truncate = False)
    +----------------------------------------------------------------------------------------+
    |result                                                                                  |
    +----------------------------------------------------------------------------------------+
    |[john, and, peter, are, brothers, however, they, dont, support, each, other, that, much]|
    +----------------------------------------------------------------------------------------+
    """

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    cleanupPatterns = Param(Params._dummy(),
                            "cleanupPatterns",
                            "normalization regex patterns which match will be removed from token",
                            typeConverter=TypeConverters.toListString)

    lowercase = Param(Params._dummy(), "lowercase", "whether to convert strings to lowercase")

    slangMatchCase = Param(Params._dummy(), "slangMatchCase",
                           "whether or not to be case sensitive to match slangs. Defaults to false.")

    slangDictionary = Param(Params._dummy(), "slangDictionary",
                            "slang dictionary is a delimited text. needs 'delimiter' in options",
                            typeConverter=TypeConverters.identity)

    minLength = Param(Params._dummy(), "minLength",
                      "Set the minimum allowed length for each token",
                      typeConverter=TypeConverters.toInt)

    maxLength = Param(Params._dummy(), "maxLength",
                      "Set the maximum allowed length for each token",
                      typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(Normalizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Normalizer")
        self._setDefault(
            cleanupPatterns=["[^\\pL+]"],
            lowercase=False,
            slangMatchCase=False,
            minLength=0
        )
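    # Note on the defaults above: matches of the default cleanup pattern
    # "[^\\pL+]" are removed from each token, so digits and punctuation are
    # stripped (e.g. "don't" becomes "dont"). A minimal sketch, assuming a
    # "token" column produced by a Tokenizer:
    #
    #   normalizer = Normalizer() \
    #       .setInputCols(["token"]) \
    #       .setOutputCol("normalized")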
    def setCleanupPatterns(self, value):
        """Sets normalization regex patterns which match will be removed from
        token, by default ['[^\\pL+]'].

        Parameters
        ----------
        value : List[str]
            Normalization regex patterns which match will be removed from token
        """
        return self._set(cleanupPatterns=value)
    def setLowercase(self, value):
        """Sets whether to convert strings to lowercase, by default False.

        Parameters
        ----------
        value : bool
            Whether to convert strings to lowercase, by default False
        """
        return self._set(lowercase=value)
    def setSlangDictionary(self, path, delimiter, read_as=ReadAs.TEXT, options={"format": "text"}):
        """Sets the slang dictionary, a delimited text file. Needs 'delimiter'
        in options.

        Parameters
        ----------
        path : str
            Path to the source files
        delimiter : str
            Delimiter for the values
        read_as : str, optional
            How to read the file, by default ReadAs.TEXT
        options : dict, optional
            Options to read the resource, by default {"format": "text"}
        """
        opts = options.copy()
        if "delimiter" not in opts:
            opts["delimiter"] = delimiter
        return self._set(slangDictionary=ExternalResource(path, read_as, opts))
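    # A minimal usage sketch for the slang dictionary, assuming a hypothetical
    # local file "slangs.txt" with one comma-delimited pair per line (e.g.
    # "gr8,great"); the delimiter argument tells the reader how to split each
    # line into the slang form and its replacement:
    #
    #   normalizer = Normalizer() \
    #       .setInputCols(["token"]) \
    #       .setOutputCol("normalized") \
    #       .setSlangDictionary("slangs.txt", ",")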
    def setMinLength(self, value):
        """Sets the minimum allowed length for each token, by default 0.

        Parameters
        ----------
        value : int
            Minimum allowed length for each token.
        """
        return self._set(minLength=value)
    def setMaxLength(self, value):
        """Sets the maximum allowed length for each token.

        Parameters
        ----------
        value : int
            Maximum allowed length for each token
        """
        return self._set(maxLength=value)
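    # setMinLength and setMaxLength act together as a length filter on the
    # normalized tokens. A sketch that keeps only tokens of 2 to 20 characters:
    #
    #   normalizer = Normalizer() \
    #       .setInputCols(["token"]) \
    #       .setOutputCol("normalized") \
    #       .setMinLength(2) \
    #       .setMaxLength(20)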
    def _create_model(self, java_model):
        return NormalizerModel(java_model=java_model)
class NormalizerModel(AnnotatorModel):
    """Instantiated Model of the Normalizer.

    This is the instantiated model of the :class:`.Normalizer`. For training
    your own model, please see the documentation of that class.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    cleanupPatterns
        normalization regex patterns which match will be removed from token
    lowercase
        whether to convert strings to lowercase
    """

    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    outputAnnotatorType = AnnotatorType.TOKEN

    cleanupPatterns = Param(Params._dummy(),
                            "cleanupPatterns",
                            "normalization regex patterns which match will be removed from token",
                            typeConverter=TypeConverters.toListString)

    lowercase = Param(Params._dummy(), "lowercase", "whether to convert strings to lowercase")

    slangMatchCase = Param(Params._dummy(), "slangMatchCase",
                           "whether or not to be case sensitive to match slangs. Defaults to false.")

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.NormalizerModel", java_model=None):
        super(NormalizerModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    name = "NormalizerModel"
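# A NormalizerModel is produced by fitting a Normalizer inside a pipeline
# rather than being trained on its own. A sketch, assuming the pipeline and
# data from the Normalizer example above (the normalizer is the third stage):
#
#   fitted = pipeline.fit(data)
#   normalizer_model = fitted.stages[2]  # the NormalizerModel
#   fitted.transform(data).selectExpr("normalized.result").show(truncate=False)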