Source code for sparknlp.annotator.pos.perceptron

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the Perceptron Annotator."""

from sparknlp.common import *


[docs]class PerceptronApproach(AnnotatorApproach):
    """Trains an averaged Perceptron model to tag words part-of-speech. Sets a
    POS tag to each word within a sentence.

    For pretrained models please see the :class:`.PerceptronModel`.

    The training data needs to be in a Spark DataFrame, where the column needs
    to consist of Annotations of type ``POS``. The `Annotation` needs to have
    member ``result`` set to the POS tag and have a ``"word"`` mapping to its
    word inside of member ``metadata``. This DataFrame for training can easily
    created by the helper class :class:`.POS`.


    >>> POS().readDataset(spark, datasetPath) \\
    ...     .selectExpr("explode(tags) as tags").show(truncate=False)
    +---------------------------------------------+
    |tags                                         |
    +---------------------------------------------+
    |[pos, 0, 5, NNP, [word -> Pierre], []]       |
    |[pos, 7, 12, NNP, [word -> Vinken], []]      |
    |[pos, 14, 14, ,, [word -> ,], []]            |
    |[pos, 31, 34, MD, [word -> will], []]        |
    |[pos, 36, 39, VB, [word -> join], []]        |
    |[pos, 41, 43, DT, [word -> the], []]         |
    |[pos, 45, 49, NN, [word -> board], []]       |
                            ...


    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/french/Train-Perceptron-French.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN, DOCUMENT``    ``POS``
    ====================== ======================

    Parameters
    ----------
    posCol
        Column name for Array of POS tags that match tokens
    nIterations
        Number of iterations in training, converges to better accuracy, by
        default 5

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from sparknlp.training import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentence = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["sentence"]) \\
    ...     .setOutputCol("token")
    >>> datasetPath = "src/test/resources/anc-pos-corpus-small/test-training.txt"
    >>> trainingPerceptronDF = POS().readDataset(spark, datasetPath)
    >>> trainedPos = PerceptronApproach() \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("pos") \\
    ...     .setPosColumn("tags") \\
    ...     .fit(trainingPerceptronDF)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     sentence,
    ...     tokenizer,
    ...     trainedPos
    ... ])
    >>> data = spark.createDataFrame([["To be or not to be, is this the question?"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("pos.result").show(truncate=False)
    +--------------------------------------------------+
    |result                                            |
    +--------------------------------------------------+
    |[NNP, NNP, CD, JJ, NNP, NNP, ,, MD, VB, DT, CD, .]|
    +--------------------------------------------------+
    """

[docs]    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]

[docs]    outputAnnotatorType = AnnotatorType.POS

[docs]    posCol = Param(Params._dummy(),
                   "posCol",
                   "column of Array of POS tags that match tokens",
                   typeConverter=TypeConverters.toString)

[docs]    nIterations = Param(Params._dummy(),
                        "nIterations",
                        "Number of iterations in training, converges to better accuracy",
                        typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(PerceptronApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach")
        self._setDefault(
            nIterations=5
        )

[docs]    def setPosColumn(self, value):
        """Sets column name for Array of POS tags that match tokens.

        Parameters
        ----------
        value : str
            Name of column for Array of POS tags
        """
        return self._set(posCol=value)

[docs]    def setIterations(self, value):
        """Sets number of iterations in training, by default 5.

        Parameters
        ----------
        value : int
            Number of iterations in training
        """
        return self._set(nIterations=value)

[docs]    def getNIterations(self):
        """Gets number of iterations in training, by default 5.

        Returns
        -------
        int
            Number of iterations in training
        """
        return self.getOrDefault(self.nIterations)

    def _create_model(self, java_model):
        return PerceptronModel(java_model=java_model)


[docs]class PerceptronModel(AnnotatorModel):
    """Averaged Perceptron model to tag words part-of-speech. Sets a POS tag to
    each word within a sentence.

    This is the instantiated model of the :class:`.PerceptronApproach`. For
    training your own model, please see the documentation of that class.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> posTagger = PerceptronModel.pretrained() \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("pos")


    The default model is ``"pos_anc"``, if no name is provided.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Part+of+Speech+Tagging>`__.
    Additionally, pretrained pipelines are available for this module, see
    `Pipelines <https://sparknlp.org/docs/en/pipelines>`__.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/french/Train-Perceptron-French.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN, DOCUMENT``    ``POS``
    ====================== ======================

    Parameters
    ----------
    None

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> tokenizer = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> posTagger = PerceptronModel.pretrained() \\
    ...     .setInputCols(["document", "token"]) \\
    ...     .setOutputCol("pos")
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     tokenizer,
    ...     posTagger
    ... ])
    >>> data = spark.createDataFrame([["Peter Pipers employees are picking pecks of pickled peppers"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(pos) as pos").show(truncate=False)
    +-------------------------------------------+
    |pos                                        |
    +-------------------------------------------+
    |[pos, 0, 4, NNP, [word -> Peter], []]      |
    |[pos, 6, 11, NNP, [word -> Pipers], []]    |
    |[pos, 13, 21, NNS, [word -> employees], []]|
    |[pos, 23, 25, VBP, [word -> are], []]      |
    |[pos, 27, 33, VBG, [word -> picking], []]  |
    |[pos, 35, 39, NNS, [word -> pecks], []]    |
    |[pos, 41, 42, IN, [word -> of], []]        |
    |[pos, 44, 50, JJ, [word -> pickled], []]   |
    |[pos, 52, 58, NNS, [word -> peppers], []]  |
    +-------------------------------------------+
    """
[docs]    name = "PerceptronModel"

[docs]    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]

[docs]    outputAnnotatorType = AnnotatorType.POS

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronModel", java_model=None):
        super(PerceptronModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
[docs]    def pretrained(name="pos_anc", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "pos_anc"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        PerceptronModel
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(PerceptronModel, name, lang, remote_loc)