Source code for sparknlp.training.conllu

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for CoNLLU."""

from sparknlp.common import ReadAs
from sparknlp.internal import ExtendedJavaWrapper


class CoNLLU(ExtendedJavaWrapper):
    """Instantiates the class to read a CoNLL-U dataset.

    The dataset should be in the format of `CoNLL-U
    <https://universaldependencies.org/format.html>`_ and needs to be specified
    with :meth:`.readDataset`, which will create a dataframe with the data.

    Can be used to train a :class:`DependencyParserApproach
    <sparknlp.annotator.DependencyParserApproach>`.

    All constructor arguments are forwarded, in order, to the JVM class
    ``com.johnsnowlabs.nlp.training.CoNLLU``.

    Parameters
    ----------
    textCol : str, optional
        Name of the text column, by default 'text'
    documentCol : str, optional
        Name of the document column, by default 'document'
    sentenceCol : str, optional
        Name of the sentence column, by default 'sentence'
    formCol : str, optional
        Name of the column with the word forms, by default 'form'
    uposCol : str, optional
        Name of the column with the universal POS tags, by default 'upos'
    xposCol : str, optional
        Name of the column with the language-specific POS tags, by default
        'xpos'
    lemmaCol : str, optional
        Name of the column with the lemmas, by default 'lemma'
    explodeSentences : bool, optional
        Passed through to the JVM reader, by default True. Presumably
        controls whether every sentence ends up in its own row -- confirm
        against the Scala implementation.

    Notes
    -----
    ``explodeSentences`` is the *last* parameter. Pass it by keyword
    (``CoNLLU(explodeSentences=False)``); a lone positional argument is bound
    to ``textCol`` instead.

    **Input File Format**::

        # sent_id = 1
        # text = They buy and sell books.
        1   They     they    PRON    PRP    Case=Nom|Number=Plur               2   nsubj   2:nsubj|4:nsubj   _
        2   buy      buy     VERB    VBP    Number=Plur|Person=3|Tense=Pres    0   root    0:root            _
        3   and      and     CONJ    CC     _                                  4   cc      4:cc              _
        4   sell     sell    VERB    VBP    Number=Plur|Person=3|Tense=Pres    2   conj    0:root|2:conj     _
        5   books    book    NOUN    NNS    Number=Plur                        2   obj     2:obj|4:obj       SpaceAfter=No
        6   .        .       PUNCT   .      _                                  2   punct   2:punct           _

    Examples
    --------
    >>> from sparknlp.training import CoNLLU
    >>> conlluFile = "src/test/resources/conllu/en.test.conllu"
    >>> conllDataSet = CoNLLU(explodeSentences=False).readDataset(spark, conlluFile)
    >>> conllDataSet.selectExpr(
    ...     "text",
    ...     "form.result as form",
    ...     "upos.result as upos",
    ...     "xpos.result as xpos",
    ...     "lemma.result as lemma"
    ... ).show(1, False)
    +---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
    |text                                   |form                                          |upos                                         |xpos                          |lemma                                       |
    +---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
    |What if Google Morphed Into GoogleOS?  |[What, if, Google, Morphed, Into, GoogleOS, ?]|[PRON, SCONJ, PROPN, VERB, ADP, PROPN, PUNCT]|[WP, IN, NNP, VBD, IN, NNP, .]|[what, if, Google, morph, into, GoogleOS, ?]|
    +---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
    """

    def __init__(self,
                 textCol='text',
                 documentCol='document',
                 sentenceCol='sentence',
                 formCol='form',
                 uposCol='upos',
                 xposCol='xpos',
                 lemmaCol='lemma',
                 explodeSentences=True
                 ):
        # The wrapper receives the JVM class name followed by the constructor
        # arguments; their order must stay exactly as listed here.
        super(CoNLLU, self).__init__("com.johnsnowlabs.nlp.training.CoNLLU",
                                     textCol,
                                     documentCol,
                                     sentenceCol,
                                     formCol,
                                     uposCol,
                                     xposCol,
                                     lemmaCol,
                                     explodeSentences)

    def readDataset(self, spark, path, read_as=ReadAs.TEXT):
        """Reads the dataset from an external resource.

        Parameters
        ----------
        spark : :class:`pyspark.sql.SparkSession`
            Initiated Spark Session with Spark NLP
        path : str
            Path to the resource
        read_as : str, optional
            How to read the resource, by default ReadAs.TEXT

        Returns
        -------
        :class:`pyspark.sql.DataFrame`
            Spark Dataframe with the data
        """
        # TODO: Replace with std pyspark
        # The JVM reader needs the underlying Java session, not the Python one.
        jSession = spark._jsparkSession

        # Read on the JVM side, then wrap the Java DataFrame for Python use.
        jdf = self._java_obj.readDataset(jSession, path, read_as)
        return self.getDataFrame(spark, jdf)