case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllDocIdCol: String = "doc_id", conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ", includeDocId: Boolean = false) extends Product with Serializable
Helper class to load a CoNLL type dataset for training.
The dataset should be in the format of
CoNLL 2003 and needs to be specified with
readDataset
. Other CoNLL datasets are not supported.
Two types of input paths are supported:
Folder: this is a path ending in *
, and representing a collection of CoNLL files within a
directory. E.g., 'path/to/multiple/conlls/*' Using this pattern will result in all the
files being read into a single Dataframe. Some constraints apply on the schemas of the
multiple files.
File: this is a path to a single file. E.g., 'path/to/single_file.conll'
Example
val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train") trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label") .show(3, false) +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+ |text |tokens |pos |label | +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+ |EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]| |Peter Blackburn |[Peter, Blackburn] |[NNP, NNP] |[B-PER, I-PER] | |BRUSSELS 1996-08-22 |[BRUSSELS, 1996-08-22] |[NNP, CD] |[B-LOC, O] | +------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+ trainingData.printSchema root |-- text: string (nullable = true) |-- document: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- sentence: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string 
(valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- token: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- pos: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- label: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false)
- documentCol
Name of the
DOCUMENT
Annotator type column- sentenceCol
Name of the Sentences of
DOCUMENT
Annotator type column- tokenCol
Name of the
TOKEN
Annotator type column- posCol
Name of the
POS
Annotator type column- conllLabelIndex
Index of the column for NER Label in the dataset
- conllPosIndex
Index of the column for the POS tags in the dataset
- conllDocIdCol
Name of the column for the document id in the dataset
- conllTextCol
Name of the column for the text in the dataset
- labelCol
Name of the
NAMED_ENTITY
Annotator type column- explodeSentences
Whether to explode each sentence to a separate row
- delimiter
Delimiter used to separate columns inside CoNLL file
- includeDocId
Whether to try to parse the document id from the third item of the -DOCSTART- line (defaults to "X" if not found)
- Alphabetic
- By Inheritance
- CoNLL
- Serializable
- Serializable
- Product
- Equals
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
-
new
CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllDocIdCol: String = "doc_id", conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ", includeDocId: Boolean = false)
- documentCol
Name of the
DOCUMENT
Annotator type column- sentenceCol
Name of the Sentences of
DOCUMENT
Annotator type column- tokenCol
Name of the
TOKEN
Annotator type column- posCol
Name of the
POS
Annotator type column- conllLabelIndex
Index of the column for NER Label in the dataset
- conllPosIndex
Index of the column for the POS tags in the dataset
- conllDocIdCol
Name of the column for the document id in the dataset
- conllTextCol
Name of the column for the text in the dataset
- labelCol
Name of the
NAMED_ENTITY
Annotator type column- explodeSentences
Whether to explode each sentence to a separate row
- delimiter
Delimiter used to separate columns inside CoNLL file
- includeDocId
Whether to try to parse the document id from the third item of the -DOCSTART- line (defaults to "X" if not found)
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
- val annotationType: ArrayType
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
- def clearTokens(tokens: Array[IndexedTaggedWord]): Array[IndexedTaggedWord]
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
- val conllDocIdCol: String
- val conllLabelIndex: Int
- val conllPosIndex: Int
- val conllTextCol: String
- val delimiter: String
- val documentCol: String
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- val explodeSentences: Boolean
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
- def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
- val includeDocId: Boolean
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
- val labelCol: String
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
- def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]
- def packDocs(docs: Seq[CoNLLDocument], spark: SparkSession): Dataset[_]
- def packNerTagged(sentences: Seq[NerTaggedSentence]): Seq[Annotation]
- def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]
- def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
- def packTokenized(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
- val posCol: String
- def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString, parallelism: Int = 8, storageLevel: StorageLevel = StorageLevel.DISK_ONLY): Dataset[_]
- def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]
- def readDocs(er: ExternalResource): Seq[CoNLLDocument]
- def readLines(lines: Array[String]): Seq[CoNLLDocument]
- def removeSurroundingHyphens(text: String): String
- def schema: StructType
- val sentenceCol: String
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
- val tokenCol: String
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()