case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllDocIdCol: String = "doc_id", conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ", includeDocId: Boolean = false) extends Product with Serializable
Helper class to load a CoNLL type dataset for training.
The dataset should be in the format of
CoNLL 2003 and needs to be specified with
readDataset. Other CoNLL datasets are not supported.
Two types of input paths are supported,
Folder: this is a path ending in *, and representing a collection of CoNLL files within a
directory. E.g., 'path/to/multiple/conlls/*' Using this pattern will result in all the
files being read into a single Dataframe. Some constraints apply on the schemas of the
multiple files.
File: this is a path to a single file. E.g., 'path/to/single_file.conll'
Example
val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train") trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label") .show(3, false) +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+ |text |tokens |pos |label | +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+ |EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]| |Peter Blackburn |[Peter, Blackburn] |[NNP, NNP] |[B-PER, I-PER] | |BRUSSELS 1996-08-22 |[BRUSSELS, 1996-08-22] |[NNP, CD] |[B-LOC, O] | +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+ trainingData.printSchema root |-- text: string (nullable = true) |-- document: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- sentence: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string 
(valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- token: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- pos: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- label: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false)
- documentCol
Name of the
DOCUMENT annotator type column
- sentenceCol
Name of the Sentences of
DOCUMENT annotator type column
- tokenCol
Name of the
TOKEN annotator type column
- posCol
Name of the
POS annotator type column
- conllLabelIndex
Index of the column for NER Label in the dataset
- conllPosIndex
Index of the column for the POS tags in the dataset
- conllDocIdCol
Name of the column for the document id in the dataset
- conllTextCol
Name of the column for the text in the dataset
- labelCol
Name of the
NAMED_ENTITY annotator type column
- explodeSentences
Whether to explode each sentence to a separate row
- delimiter
Delimiter used to separate columns inside CoNLL file
- includeDocId
Whether to try to parse the document id from the third item in the -DOCSTART- line (X if not found)
- Alphabetic
- By Inheritance
- CoNLL
- Serializable
- Serializable
- Product
- Equals
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
-
new
CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllDocIdCol: String = "doc_id", conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ", includeDocId: Boolean = false)
- documentCol
Name of the
DOCUMENT annotator type column
- sentenceCol
Name of the Sentences of
DOCUMENT annotator type column
- tokenCol
Name of the
TOKEN annotator type column
- posCol
Name of the
POS annotator type column
- conllLabelIndex
Index of the column for NER Label in the dataset
- conllPosIndex
Index of the column for the POS tags in the dataset
- conllDocIdCol
Name of the column for the document id in the dataset
- conllTextCol
Name of the column for the text in the dataset
- labelCol
Name of the
NAMED_ENTITY annotator type column
- explodeSentences
Whether to explode each sentence to a separate row
- delimiter
Delimiter used to separate columns inside CoNLL file
- includeDocId
Whether to try to parse the document id from the third item in the -DOCSTART- line (X if not found)
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- val annotationType: ArrayType
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
- def clearTokens(tokens: Array[IndexedTaggedWord]): Array[IndexedTaggedWord]
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
- val conllDocIdCol: String
- val conllLabelIndex: Int
- val conllPosIndex: Int
- val conllTextCol: String
- val delimiter: String
- val documentCol: String
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- val explodeSentences: Boolean
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
- def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
- val includeDocId: Boolean
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
- val labelCol: String
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
- def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]
- def packDocs(docs: Seq[CoNLLDocument], spark: SparkSession): Dataset[_]
- def packNerTagged(sentences: Seq[NerTaggedSentence]): Seq[Annotation]
- def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]
- def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
- def packTokenized(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
- val posCol: String
- def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString, parallelism: Int = 8, storageLevel: StorageLevel = StorageLevel.DISK_ONLY): Dataset[_]
- def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]
- def readDocs(er: ExternalResource): Seq[CoNLLDocument]
- def readLines(lines: Array[String]): Seq[CoNLLDocument]
- def removeSurroundingHyphens(text: String): String
- def schema: StructType
- val sentenceCol: String
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
- val tokenCol: String
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()