Source code for sparknlp.training.pos

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains helper classes for part-of-speech tagging."""

from sparknlp.internal import ExtendedJavaWrapper


class POS(ExtendedJavaWrapper):
    """Helper class for creating DataFrames for training a part-of-speech
    tagger.

    The dataset needs to consist of sentences on each line, where each word is
    delimited with its respective tag.

    **Input File Format**::

        A|DT few|JJ months|NNS ago|RB you|PRP received|VBD a|DT letter|NN


    The sentence can then be parsed with :meth:`.readDataset` into a column with
    annotations of type ``POS``.

    Can be used to train a :class:`PerceptronApproach
    <sparknlp.annotator.PerceptronApproach>`.

    Examples
    --------
    In this example, the file ``test-training.txt`` contains sentences in the
    input file format shown above.

    >>> from sparknlp.training import POS
    >>> pos = POS()
    >>> path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
    >>> posDf = pos.readDataset(spark, path, "|", "tags")
    >>> posDf.selectExpr("explode(tags) as tags").show(truncate=False)
    +---------------------------------------------+
    |tags                                         |
    +---------------------------------------------+
    |[pos, 0, 5, NNP, [word -> Pierre], []]       |
    |[pos, 7, 12, NNP, [word -> Vinken], []]      |
    |[pos, 14, 14, ,, [word -> ,], []]            |
    |[pos, 16, 17, CD, [word -> 61], []]          |
    |[pos, 19, 23, NNS, [word -> years], []]      |
    |[pos, 25, 27, JJ, [word -> old], []]         |
    |[pos, 29, 29, ,, [word -> ,], []]            |
    |[pos, 31, 34, MD, [word -> will], []]        |
    |[pos, 36, 39, VB, [word -> join], []]        |
    |[pos, 41, 43, DT, [word -> the], []]         |
    |[pos, 45, 49, NN, [word -> board], []]       |
    |[pos, 51, 52, IN, [word -> as], []]          |
    |[pos, 47, 47, DT, [word -> a], []]           |
    |[pos, 56, 67, JJ, [word -> nonexecutive], []]|
    |[pos, 69, 76, NN, [word -> director], []]    |
    |[pos, 78, 81, NNP, [word -> Nov.], []]       |
    |[pos, 83, 84, CD, [word -> 29], []]          |
    |[pos, 81, 81, ., [word -> .], []]            |
    +---------------------------------------------+
    """

    def __init__(self):
        # Bind this Python wrapper to its Java counterpart by class name.
        super(POS, self).__init__("com.johnsnowlabs.nlp.training.POS")

    def readDataset(self, spark, path, delimiter="|", outputPosCol="tags", outputDocumentCol="document",
                    outputTextCol="text"):
        """Reads the dataset from an external resource.

        The call is delegated to the ``readDataset`` method of the wrapped
        Java object; its result is converted back into a PySpark DataFrame
        with :meth:`getDataFrame`.

        Parameters
        ----------
        spark : :class:`pyspark.sql.SparkSession`
            Initiated Spark Session with Spark NLP
        path : str
            Path to the resource
        delimiter : str, optional
            Delimiter of word and POS, by default "|"
        outputPosCol : str, optional
            Name of the output POS column, by default "tags"
        outputDocumentCol : str, optional
            Name of the output document column, by default "document"
        outputTextCol : str, optional
            Name of the output text column, by default "text"

        Returns
        -------
        :class:`pyspark.sql.DataFrame`
            Spark Dataframe with the data
        """
        # ToDo Replace with std pyspark
        # The Java-side reader is handed the session's ``_jsparkSession``
        # handle rather than the Python session object itself.
        jSession = spark._jsparkSession

        jdf = self._java_obj.readDataset(jSession, path, delimiter, outputPosCol, outputDocumentCol, outputTextCol)
        return self.getDataFrame(spark, jdf)