# Source code for sparknlp.annotator.stemmer

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the Stemmer."""
from sparknlp.common import *


class Stemmer(AnnotatorModel):
    """Returns hard-stems out of words with the objective of retrieving the
    meaningful part of the word.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    language
        Stemmer algorithm, by default ``"english"``. This class declares the
        param and its default but defines no dedicated setter for it.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> stemmer = Stemmer() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("stem")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     stemmer
    ... ])
    >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers."]]) \\
    ...     .toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("stem.result").show(truncate = False)
    +-------------------------------------------------------------+
    |result                                                       |
    +-------------------------------------------------------------+
    |[peter, piper, employe, ar, pick, peck, of, pickl, pepper, .]|
    +-------------------------------------------------------------+
    """

    # Annotation types this annotator consumes.
    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    # Annotation type this annotator emits.
    outputAnnotatorType = AnnotatorType.TOKEN

    # String-typed param described as "stemmer algorithm"; its default is
    # assigned in __init__.
    language = Param(Params._dummy(),
                     "language",
                     "stemmer algorithm",
                     typeConverter=TypeConverters.toString)

    name = "Stemmer"

    @keyword_only
    def __init__(self):
        """Create the annotator bound to its JVM class and set defaults."""
        super(Stemmer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Stemmer")
        self._setDefault(
            language="english"
        )