CoNLLU

case class CoNLLU(conllTextCol: String = "text", documentCol: String = "document", sentenceCol: String = "sentence", formCol: String = ..., uposCol: String = ..., xposCol: String = ..., lemmaCol: String = ..., explodeSentences: Boolean = true) extends Product with Serializable

Instantiates the class to read a CoNLL-U dataset.

The dataset must be in CoNLL-U format and is loaded with readDataset, which creates a DataFrame containing the data.

Example

import com.johnsnowlabs.nlp.training.CoNLLU

val conlluFile = "src/test/resources/conllu/en.test.conllu"
val conllDataSet = CoNLLU(explodeSentences = false).readDataset(ResourceHelper.spark, conlluFile)
conllDataSet.selectExpr("text", "form.result as form", "upos.result as upos", "xpos.result as xpos", "lemma.result as lemma")
  .show(1, false)
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
|text                                   |form                                          |upos                                         |xpos                          |lemma                                       |
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
|What if Google Morphed Into GoogleOS?  |[What, if, Google, Morphed, Into, GoogleOS, ?]|[PRON, SCONJ, PROPN, VERB, ADP, PROPN, PUNCT]|[WP, IN, NNP, VBD, IN, NNP, .]|[what, if, Google, morph, into, GoogleOS, ?]|
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+

explodeSentences: Whether to split each sentence into a separate row (default: true)

Linear Supertypes

Serializable, Serializable, Product, Equals, AnyRef, Any

Ordering

Alphabetic
By Inheritance

Inherited

CoNLLU
Serializable
Serializable
Product
Equals
AnyRef
Any

Hide All
Show All

Visibility

Public
All

Instance Constructors

new CoNLLU(conllTextCol: String = "text", documentCol: String = "document", sentenceCol: String = "sentence", formCol: String = ..., uposCol: String = ..., xposCol: String = ..., lemmaCol: String = ..., explodeSentences: Boolean = true)
explodeSentences
Whether to split each sentence into a separate row

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def asInstanceOf[T0]: T0

Definition Classes
Any
def clone(): AnyRef

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( ... ) @native()
val conllTextCol: String
val documentCol: String
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
val explodeSentences: Boolean
def finalize(): Unit

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
val formCol: String
def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
Annotations
@native()
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
val lemmaCol: String
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
Annotations
@native()
final def notifyAll(): Unit

Definition Classes
AnyRef
Annotations
@native()
def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]
def packDocs(docs: Seq[CoNLLUDocument], spark: SparkSession): Dataset[_]
def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]
def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
def packTokenized(sentences: Seq[TaggedSentence]): Seq[Annotation]
def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString): Dataset[_]
def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]
def readDocs(er: ExternalResource): Seq[CoNLLUDocument]
def schema: StructType
val sentenceCol: String
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
val uposCol: String
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... ) @native()
val xposCol: String

Packages

CoNLLU

case class CoNLLU(conllTextCol: String = "text", documentCol: String = "document", sentenceCol: String = "sentence", formCol: String = ..., uposCol: String = ..., xposCol: String = ..., lemmaCol: String = ..., explodeSentences: Boolean = true) extends Product with Serializable

Example

Instance Constructors

Value Members

Inherited from Serializable

Inherited from Serializable

Inherited from Product

Inherited from Equals

Inherited from AnyRef

Inherited from Any

Ungrouped

Packages

CoNLLU 

case class CoNLLU(conllTextCol: String = "text", documentCol: String = "document", sentenceCol: String = "sentence", formCol: String = ..., uposCol: String = ..., xposCol: String = ..., lemmaCol: String = ..., explodeSentences: Boolean = true) extends Product with Serializable

Example

Instance Constructors

Value Members

Inherited from Serializable

Inherited from Serializable

Inherited from Product

Inherited from Equals

Inherited from AnyRef

Inherited from Any

Ungrouped

CoNLLU