Packages

case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllDocIdCol: String = "doc_id", conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ", includeDocId: Boolean = false) extends Product with Serializable

Helper class to load a CoNLL type dataset for training.

The dataset should be in the CoNLL 2003 format, and its path needs to be passed to readDataset. Other CoNLL formats are not supported.

Two types of input paths are supported:

Folder: this is a path ending in * that represents a collection of CoNLL files within a directory, e.g., 'path/to/multiple/conlls/*'. Using this pattern will result in all the files being read into a single DataFrame. Some constraints apply on the schemas of the multiple files.

File: this is a path to a single file. E.g., 'path/to/single_file.conll'

Example

val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label")
  .show(3, false)
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|text                                            |tokens                                                    |pos                                  |label                                    |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+
|EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
|Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
|BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
+------------------------------------------------+----------------------------------------------------------+-------------------------------------+-----------------------------------------+

trainingData.printSchema
root
 |-- text: string (nullable = true)
 |-- document: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- pos: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- label: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
documentCol

Name of the DOCUMENT Annotator type column

sentenceCol

Name of the sentence column (of DOCUMENT Annotator type)

tokenCol

Name of the TOKEN Annotator type column

posCol

Name of the POS Annotator type column

conllLabelIndex

Index of the column for NER Label in the dataset

conllPosIndex

Index of the column for the POS tags in the dataset

conllDocIdCol

Name of the column for the document id in the dataset

conllTextCol

Name of the column for the text in the dataset

labelCol

Name of the NAMED_ENTITY Annotator type column

explodeSentences

Whether to explode each sentence to a separate row

delimiter

Delimiter used to separate columns inside CoNLL file

includeDocId

Whether to try and parse the document id from the third item in the -DOCSTART- line (X if not found)

Linear Supertypes
Serializable, Serializable, Product, Equals, AnyRef, Any
Ordering
  1. Alphabetic
  2. By Inheritance
Inherited
  1. CoNLL
  2. Serializable
  3. Serializable
  4. Product
  5. Equals
  6. AnyRef
  7. Any
  1. Hide All
  2. Show All
Visibility
  1. Public
  2. All

Instance Constructors

  1. new CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllDocIdCol: String = "doc_id", conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ", includeDocId: Boolean = false)

    documentCol

    Name of the DOCUMENT Annotator type column

    sentenceCol

    Name of the sentence column (of DOCUMENT Annotator type)

    tokenCol

    Name of the TOKEN Annotator type column

    posCol

    Name of the POS Annotator type column

    conllLabelIndex

    Index of the column for NER Label in the dataset

    conllPosIndex

    Index of the column for the POS tags in the dataset

    conllDocIdCol

    Name of the column for the document id in the dataset

    conllTextCol

    Name of the column for the text in the dataset

    labelCol

    Name of the NAMED_ENTITY Annotator type column

    explodeSentences

    Whether to explode each sentence to a separate row

    delimiter

    Delimiter used to separate columns inside CoNLL file

    includeDocId

    Whether to try and parse the document id from the third item in the -DOCSTART- line (X if not found)

Value Members

  1. final def !=(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  2. final def ##(): Int
    Definition Classes
    AnyRef → Any
  3. final def ==(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  4. val annotationType: ArrayType
  5. final def asInstanceOf[T0]: T0
    Definition Classes
    Any
  6. def clearTokens(tokens: Array[IndexedTaggedWord]): Array[IndexedTaggedWord]
  7. def clone(): AnyRef
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()
  8. val conllDocIdCol: String
  9. val conllLabelIndex: Int
  10. val conllPosIndex: Int
  11. val conllTextCol: String
  12. val delimiter: String
  13. val documentCol: String
  14. final def eq(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  15. val explodeSentences: Boolean
  16. def finalize(): Unit
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( classOf[java.lang.Throwable] )
  17. def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField
  18. final def getClass(): Class[_]
    Definition Classes
    AnyRef → Any
    Annotations
    @native()
  19. val includeDocId: Boolean
  20. final def isInstanceOf[T0]: Boolean
    Definition Classes
    Any
  21. val labelCol: String
  22. final def ne(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  23. final def notify(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  24. final def notifyAll(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  25. def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]
  26. def packDocs(docs: Seq[CoNLLDocument], spark: SparkSession): Dataset[_]
  27. def packNerTagged(sentences: Seq[NerTaggedSentence]): Seq[Annotation]
  28. def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]
  29. def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
  30. def packTokenized(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
  31. val posCol: String
  32. def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString, parallelism: Int = 8, storageLevel: StorageLevel = StorageLevel.DISK_ONLY): Dataset[_]
  33. def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]
  34. def readDocs(er: ExternalResource): Seq[CoNLLDocument]
  35. def readLines(lines: Array[String]): Seq[CoNLLDocument]
  36. def removeSurroundingHyphens(text: String): String
  37. def schema: StructType
  38. val sentenceCol: String
  39. final def synchronized[T0](arg0: ⇒ T0): T0
    Definition Classes
    AnyRef
  40. val tokenCol: String
  41. final def wait(): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  42. final def wait(arg0: Long, arg1: Int): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  43. final def wait(arg0: Long): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()

Inherited from Serializable

Inherited from Serializable

Inherited from Product

Inherited from Equals

Inherited from AnyRef

Inherited from Any

Ungrouped