Source code for sparknlp.training.conll

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for CoNLL."""

import pyspark

from sparknlp.common import ReadAs
from sparknlp.internal import ExtendedJavaWrapper


class CoNLL(ExtendedJavaWrapper):
    """Instantiates the class to read a CoNLL dataset.

    The dataset should be in the format of `CoNLL 2003
    <https://www.clips.uantwerpen.be/conll2003/ner/>`_ and needs to be specified
    with :meth:`.readDataset`, which will create a dataframe with the data.

    Can be used to train a :class:`NerDLApproach
    <sparknlp.annotator.NerDLApproach>`.

    **Input File Format**::

        -DOCSTART- -X- -X- O

        EU NNP B-NP B-ORG
        rejects VBZ B-VP O
        German JJ B-NP B-MISC
        call NN I-NP O
        to TO B-VP O
        boycott VB I-VP O
        British JJ B-NP B-MISC
        lamb NN I-NP O
        . . O O

    Parameters
    ----------
    documentCol : str, optional
        Name of the :class:`.DocumentAssembler` column, by default 'document'
    sentenceCol : str, optional
        Name of the :class:`.SentenceDetector` column, by default 'sentence'
    tokenCol : str, optional
        Name of the :class:`.Tokenizer` column, by default 'token'
    posCol : str, optional
        Name of the :class:`.PerceptronModel` column, by default 'pos'
    conllLabelIndex : int, optional
        Index of the label column in the dataset, by default 3
    conllPosIndex : int, optional
        Index of the POS tags in the dataset, by default 1
    conllDocIdCol : str, optional
        Name of the column holding the parsed document id, by default 'doc_id'
    textCol : str, optional
        Name of the text column in the dataset, by default 'text'
    labelCol : str, optional
        Name of the label column, by default 'label'
    explodeSentences : bool, optional
        Whether to explode sentences to separate rows, by default True
    delimiter : str, optional
        Delimiter used to separate columns inside the CoNLL file
    includeDocId : bool, optional
        Whether to try to parse the document id from the third item in the
        -DOCSTART- line (X if not found)

    Examples
    --------
    >>> from sparknlp.training import CoNLL
    >>> trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
    >>> trainingData.selectExpr(
    ...     "text",
    ...     "token.result as tokens",
    ...     "pos.result as pos",
    ...     "label.result as label"
    ... ).show(3, False)
    +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
    |text                                            |tokens                                                    |pos                                  |label                                    |
    +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
    |EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
    |Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
    |BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
    +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
    """

    def __init__(self,
                 documentCol='document',
                 sentenceCol='sentence',
                 tokenCol='token',
                 posCol='pos',
                 conllLabelIndex=3,
                 conllPosIndex=1,
                 conllDocIdCol="doc_id",
                 textCol='text',
                 labelCol='label',
                 explodeSentences=True,
                 delimiter=' ',
                 includeDocId=False
                 ):
        super(CoNLL, self).__init__("com.johnsnowlabs.nlp.training.CoNLL",
                                    documentCol,
                                    sentenceCol,
                                    tokenCol,
                                    posCol,
                                    conllLabelIndex,
                                    conllPosIndex,
                                    conllDocIdCol,
                                    textCol,
                                    labelCol,
                                    explodeSentences,
                                    delimiter,
                                    includeDocId)
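    # A hypothetical constructor sketch (values are illustrative, not the defaults):
    # reading a tab-separated CoNLL corpus and keeping the document id parsed from
    # the -DOCSTART- line in the 'doc_id' column, as described in the class docstring.
    #
    #     conll = CoNLL(delimiter='\t', includeDocId=True)
    #     data = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")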
    def readDataset(self, spark, path, read_as=ReadAs.TEXT, partitions=8,
                    storage_level=pyspark.StorageLevel.DISK_ONLY):
        # ToDo Replace with std pyspark
        """Reads the dataset from an external resource.

        Parameters
        ----------
        spark : :class:`pyspark.sql.SparkSession`
            Initiated Spark Session with Spark NLP
        path : str
            Path to the resource. It can take two forms: a path to a single
            CoNLL file, or a path to a folder containing multiple CoNLL files.
            When the path points to a folder, it must end in '*'.

            Examples::

                "/path/to/single/file.conll"
                "/path/to/folder/containing/multiple/files/*"

        read_as : str, optional
            How to read the resource, by default ReadAs.TEXT
        partitions : int, optional
            Sets the minimum number of partitions for the case of lifting multiple
            files in parallel into a single dataframe. Defaults to 8.
        storage_level : :class:`pyspark.StorageLevel`, optional
            Sets the persistence level according to PySpark definitions. Defaults
            to StorageLevel.DISK_ONLY. Applies only when lifting multiple files.

        Returns
        -------
        :class:`pyspark.sql.DataFrame`
            Spark Dataframe with the data
        """
        jSession = spark._jsparkSession
        jdf = self._java_obj.readDataset(jSession, path, read_as, partitions,
                                         spark.sparkContext._getJavaStorageLevel(storage_level))
        dataframe = self.getDataFrame(spark, jdf)
        return dataframe
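
# A minimal usage sketch, not part of the library itself: it assumes Spark NLP is
# installed and reuses the placeholder folder path from the readDataset docstring.
# It shows lifting a folder of CoNLL files in parallel with a non-default partition
# count and storage level.
if __name__ == "__main__":
    import sparknlp

    # Start a Spark session with Spark NLP on the classpath.
    spark = sparknlp.start()

    # The trailing '*' is required when the path points to a folder (see readDataset).
    training_data = CoNLL(explodeSentences=False).readDataset(
        spark,
        "/path/to/folder/containing/multiple/files/*",
        partitions=16,
        storage_level=pyspark.StorageLevel.MEMORY_AND_DISK,
    )

    # Each row carries the document text plus the annotated token, pos and label columns.
    training_data.selectExpr("text", "label.result as label").show(5, truncate=False)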