# Source code for sparknlp.annotator.pos.perceptron

#  Copyright 2017-2022 John Snow Labs
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
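#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.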
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the Perceptron Annotator."""

from sparknlp.common import *

class PerceptronApproach(AnnotatorApproach):
    """Trains an averaged Perceptron model to tag words part-of-speech. Sets a
    POS tag to each word within a sentence.

    For pretrained models please see the :class:`.PerceptronModel`.

    The training data needs to be in a Spark DataFrame, where the column needs
    to consist of Annotations of type ``POS``. The `Annotation` needs to have
    member ``result`` set to the POS tag and have a ``"word"`` mapping to its
    word inside of member ``metadata``. This DataFrame for training can easily
    be created by the helper class :class:`.POS`.

    >>> POS().readDataset(spark, datasetPath) \\
    ...     .selectExpr("explode(tags) as tags").show(truncate=False)
    +---------------------------------------------+
    |tags                                         |
    +---------------------------------------------+
    |[pos, 0, 5, NNP, [word -> Pierre], []]       |
    |[pos, 7, 12, NNP, [word -> Vinken], []]      |
    |[pos, 14, 14, ,, [word -> ,], []]            |
    |[pos, 31, 34, MD, [word -> will], []]        |
    |[pos, 36, 39, VB, [word -> join], []]        |
    |[pos, 41, 43, DT, [word -> the], []]         |
    |[pos, 45, 49, NN, [word -> board], []]       |
                        ...

    For extended examples of usage, see the Spark NLP Examples.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN, DOCUMENT``    ``POS``
    ====================== ======================

    Parameters
    ----------
    posCol
        Column name for Array of POS tags that match tokens
    nIterations
        Number of iterations in training, converges to better accuracy, by
        default 5

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from sparknlp.training import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentence = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["sentence"]) \\
    ...     .setOutputCol("token")
    >>> datasetPath = "src/test/resources/anc-pos-corpus-small/test-training.txt"
    >>> trainingPerceptronDF = POS().readDataset(spark, datasetPath)
    >>> trainedPos = PerceptronApproach() \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("pos") \\
    ...     .setPosColumn("tags") \\
    ...     .fit(trainingPerceptronDF)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     sentence,
    ...     tokenizer,
    ...     trainedPos
    ... ])
    >>> data = spark.createDataFrame([["To be or not to be, is this the question?"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("pos.result").show(truncate=False)
    +--------------------------------------------------+
    |result                                            |
    +--------------------------------------------------+
    |[NNP, NNP, CD, JJ, NNP, NNP, ,, MD, VB, DT, CD, .]|
    +--------------------------------------------------+
    """

    # Annotation types this annotator consumes and the type it produces.
    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.POS

    posCol = Param(Params._dummy(),
                   "posCol",
                   "column of Array of POS tags that match tokens",
                   typeConverter=TypeConverters.toString)

    nIterations = Param(Params._dummy(),
                        "nIterations",
                        "Number of iterations in training, converges to better accuracy",
                        typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(PerceptronApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach")
        # nIterations is the only parameter with a default; posCol must be
        # set explicitly through setPosColumn.
        self._setDefault(
            nIterations=5
        )

    def setPosColumn(self, value):
        """Sets column name for Array of POS tags that match tokens.

        Parameters
        ----------
        value : str
            Name of column for Array of POS tags
        """
        return self._set(posCol=value)

    def setIterations(self, value):
        """Sets number of iterations in training, by default 5.

        Parameters
        ----------
        value : int
            Number of iterations in training
        """
        return self._set(nIterations=value)

    def getNIterations(self):
        """Gets number of iterations in training, by default 5.

        Returns
        -------
        int
            Number of iterations in training
        """
        return self.getOrDefault(self.nIterations)

    def _create_model(self, java_model):
        # Wraps the given java_model in the Python PerceptronModel class.
        return PerceptronModel(java_model=java_model)
class PerceptronModel(AnnotatorModel):
    """Averaged Perceptron model to tag words part-of-speech. Sets a POS tag to
    each word within a sentence.

    This is the instantiated model of the :class:`.PerceptronApproach`. For
    training your own model, please see the documentation of that class.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> posTagger = PerceptronModel.pretrained() \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("pos")

    The default model is ``"pos_anc"``, if no name is provided.

    For available pretrained models please see the Spark NLP Models Hub.
    Additionally, pretrained pipelines are available for this module, see the
    Spark NLP Pipelines documentation.

    For extended examples of usage, see the Spark NLP Examples.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN, DOCUMENT``    ``POS``
    ====================== ======================

    Parameters
    ----------
    None

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> posTagger = PerceptronModel.pretrained() \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("pos")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     posTagger
    ... ])
    >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(pos) as pos").show(truncate=False)
    +-------------------------------------------+
    |pos                                        |
    +-------------------------------------------+
    |[pos, 0, 4, NNP, [word -> Peter], []]      |
    |[pos, 6, 11, NNP, [word -> Pipers], []]    |
    |[pos, 13, 21, NNS, [word -> employees], []]|
    |[pos, 23, 25, VBP, [word -> are], []]      |
    |[pos, 27, 33, VBG, [word -> picking], []]  |
    |[pos, 35, 39, NNS, [word -> pecks], []]    |
    |[pos, 41, 42, IN, [word -> of], []]        |
    |[pos, 44, 50, JJ, [word -> pickled], []]   |
    |[pos, 52, 58, NNS, [word -> peppers], []]  |
    +-------------------------------------------+
    """
    name = "PerceptronModel"

    # Annotation types this annotator consumes and the type it produces.
    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.POS

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronModel", java_model=None):
        super(PerceptronModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="pos_anc", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "pos_anc"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        PerceptronModel
            The restored model
        """
        # Imported here rather than at module level, as in the original.
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(PerceptronModel, name, lang, remote_loc)