
package training

  1. Alphabetic
  1. Public
  2. All

Type Members

  1. case class CoNLL(documentCol: String = "document", sentenceCol: String = "sentence", tokenCol: String = "token", posCol: String = "pos", conllLabelIndex: Int = 3, conllPosIndex: Int = 1, conllDocIdCol: String = "doc_id", conllTextCol: String = "text", labelCol: String = "label", explodeSentences: Boolean = true, delimiter: String = " ", includeDocId: Boolean = false) extends Product with Serializable

    Helper class to load a CoNLL type dataset for training.

    Helper class to load a CoNLL type dataset for training.

    The dataset should be in the format of CoNLL 2003 and needs to be specified with readDataset. Other CoNLL datasets are not supported.

    Two types of input paths are supported:

    Folder: this is a path ending in *, representing a collection of CoNLL files within a directory. E.g., 'path/to/multiple/conlls/*'. Using this pattern will result in all the files being read into a single DataFrame. Some constraints apply to the schemas of the multiple files.

    File: this is a path to a single file. E.g., 'path/to/single_file.conll'


    val trainingData = CoNLL().readDataset(spark, "src/test/resources/conll2003/eng.train")
    trainingData.selectExpr("text", "token.result as tokens", "pos.result as pos", "label.result as label")
      .show(3, false)
    |text                                            |tokens                                                    |pos                                  |label                                    |
    |EU rejects German call to boycott British lamb .|[EU, rejects, German, call, to, boycott, British, lamb, .]|[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]|[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]|
    |Peter Blackburn                                 |[Peter, Blackburn]                                        |[NNP, NNP]                           |[B-PER, I-PER]                           |
    |BRUSSELS 1996-08-22                             |[BRUSSELS, 1996-08-22]                                    |[NNP, CD]                            |[B-LOC, O]                               |
     |-- text: string (nullable = true)
     |-- document: array (nullable = false)
     |    |-- element: struct (containsNull = true)
     |    |    |-- annotatorType: string (nullable = true)
     |    |    |-- begin: integer (nullable = false)
     |    |    |-- end: integer (nullable = false)
     |    |    |-- result: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
     |    |    |-- embeddings: array (nullable = true)
     |    |    |    |-- element: float (containsNull = false)
     |-- sentence: array (nullable = false)
     |    |-- element: struct (containsNull = true)
     |    |    |-- annotatorType: string (nullable = true)
     |    |    |-- begin: integer (nullable = false)
     |    |    |-- end: integer (nullable = false)
     |    |    |-- result: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
     |    |    |-- embeddings: array (nullable = true)
     |    |    |    |-- element: float (containsNull = false)
     |-- token: array (nullable = false)
     |    |-- element: struct (containsNull = true)
     |    |    |-- annotatorType: string (nullable = true)
     |    |    |-- begin: integer (nullable = false)
     |    |    |-- end: integer (nullable = false)
     |    |    |-- result: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
     |    |    |-- embeddings: array (nullable = true)
     |    |    |    |-- element: float (containsNull = false)
     |-- pos: array (nullable = false)
     |    |-- element: struct (containsNull = true)
     |    |    |-- annotatorType: string (nullable = true)
     |    |    |-- begin: integer (nullable = false)
     |    |    |-- end: integer (nullable = false)
     |    |    |-- result: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
     |    |    |-- embeddings: array (nullable = true)
     |    |    |    |-- element: float (containsNull = false)
     |-- label: array (nullable = false)
     |    |-- element: struct (containsNull = true)
     |    |    |-- annotatorType: string (nullable = true)
     |    |    |-- begin: integer (nullable = false)
     |    |    |-- end: integer (nullable = false)
     |    |    |-- result: string (nullable = true)
     |    |    |-- metadata: map (nullable = true)
     |    |    |    |-- key: string
     |    |    |    |-- value: string (valueContainsNull = true)
     |    |    |-- embeddings: array (nullable = true)
     |    |    |    |-- element: float (containsNull = false)

    Name of the DOCUMENT Annotator type column


    Name of the Sentences of DOCUMENT Annotator type column


    Name of the TOKEN Annotator type column


    Name of the POS Annotator type column


    Index of the column for NER Label in the dataset


    Index of the column for the POS tags in the dataset


    Name of the column for the document id in the dataset


    Name of the column for the text in the dataset


    Name of the NAMED_ENTITY Annotator type column


    Whether to explode each sentence to a separate row


    Delimiter used to separate columns inside CoNLL file


    Whether to try and parse the document id from the third item in the -DOCSTART- line (X if not found)

  2. class CoNLL2003NerReader extends AnyRef

    Helper class for working with the CoNLL 2003 dataset for the NER task. The class is made for easy use from Java.

  3. case class CoNLLDocument(text: String, nerTagged: Seq[NerTaggedSentence], posTagged: Seq[PosTaggedSentence], docId: Option[String]) extends Product with Serializable
  4. case class CoNLLU(conllTextCol: String = "text", documentCol: String = "document", sentenceCol: String = "sentence", formCol: String = ..., uposCol: String = ..., xposCol: String = ..., lemmaCol: String = ..., explodeSentences: Boolean = true) extends Product with Serializable

    Instantiates the class to read a CoNLL-U dataset.

    Instantiates the class to read a CoNLL-U dataset.

    The dataset should be in the format of CoNLL-U and needs to be specified with readDataset, which will create a dataframe with the data.


    val conlluFile = "src/test/resources/conllu/en.test.conllu"
    val conllDataSet = CoNLLU(explodeSentences = false).readDataset(ResourceHelper.spark, conlluFile)
    conllDataSet.selectExpr("text", "form.result as form", "upos.result as upos", "xpos.result as xpos", "lemma.result as lemma")
      .show(1, false)
    |text                                   |form                                          |upos                                         |xpos                          |lemma                                       |
    |What if Google Morphed Into GoogleOS?  |[What, if, Google, Morphed, Into, GoogleOS, ?]|[PRON, SCONJ, PROPN, VERB, ADP, PROPN, PUNCT]|[WP, IN, NNP, VBD, IN, NNP, .]|[what, if, Google, morph, into, GoogleOS, ?]|

    Whether to split each sentence into a separate row

  5. case class CoNLLUDocument(text: String, uPosTagged: Seq[PosTaggedSentence], xPosTagged: Seq[PosTaggedSentence], lemma: Seq[PosTaggedSentence]) extends Product with Serializable
  6. case class POS() extends Product with Serializable

    Helper class for creating DataFrames for training a part-of-speech tagger.

    Helper class for creating DataFrames for training a part-of-speech tagger.

    The dataset needs to consist of one sentence per line, where each word is joined to its respective tag by a delimiter:

    Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS old|JJ ,|, will|MD join|VB the|DT board|NN as|IN a|DT nonexecutive|JJ director|NN Nov.|NNP 29|CD .|.

    The sentence can then be parsed with readDataset into a column with annotations of type POS.


    In this example, the file test-training.txt has the content of the sentence above.

    val pos = POS()
    val path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
    val posDf = pos.readDataset(spark, path, "|", "tags")
    posDf.selectExpr("explode(tags) as tags").show(false)
    |tags                                         |
    |[pos, 0, 5, NNP, [word -> Pierre], []]       |
    |[pos, 7, 12, NNP, [word -> Vinken], []]      |
    |[pos, 14, 14, ,, [word -> ,], []]            |
    |[pos, 16, 17, CD, [word -> 61], []]          |
    |[pos, 19, 23, NNS, [word -> years], []]      |
    |[pos, 25, 27, JJ, [word -> old], []]         |
    |[pos, 29, 29, ,, [word -> ,], []]            |
    |[pos, 31, 34, MD, [word -> will], []]        |
    |[pos, 36, 39, VB, [word -> join], []]        |
    |[pos, 41, 43, DT, [word -> the], []]         |
    |[pos, 45, 49, NN, [word -> board], []]       |
    |[pos, 51, 52, IN, [word -> as], []]          |
    |[pos, 47, 47, DT, [word -> a], []]           |
    |[pos, 56, 67, JJ, [word -> nonexecutive], []]|
    |[pos, 69, 76, NN, [word -> director], []]    |
    |[pos, 78, 81, NNP, [word -> Nov.], []]       |
    |[pos, 83, 84, CD, [word -> 29], []]          |
    |[pos, 81, 81, ., [word -> .], []]            |
  7. case class PubTator() extends Product with Serializable

    The PubTator format includes medical papers’ titles, abstracts, and tagged chunks.

    The PubTator format includes medical papers’ titles, abstracts, and tagged chunks.

    For more information see PubTator Docs and MedMentions Docs.

    readDataset is used to create a Spark DataFrame from a PubTator text file.


    val pubTatorFile = "./src/test/resources/corpus_pubtator_sample.txt"
    val pubTatorDataSet = PubTator().readDataset(ResourceHelper.spark, pubTatorFile)
    |  doc_id|      finished_token|        finished_pos|        finished_ner|finished_token_metadata|finished_pos_metadata|finished_label_metadata|
    |25763772|[DCTN4, as, a, mo...|[NNP, IN, DT, NN,...|[B-T116, O, O, O,...|   [[sentence, 0], [...| [[word, DCTN4], [...|   [[word, DCTN4], [...|
  8. class SpacyToAnnotation extends AnyRef

Value Members

  1. object CoNLLHelper
  2. object CoNLLUCols extends Enumeration
