case class CoNLLU(conllTextCol: String = "text", documentCol: String = "document", sentenceCol: String = "sentence", formCol: String = ..., uposCol: String = ..., xposCol: String = ..., lemmaCol: String = ..., explodeSentences: Boolean = true) extends Product with Serializable
Instantiates the class to read a CoNLL-U dataset.
The dataset must be in CoNLL-U format. Pass its path to readDataset, which creates a DataFrame with the data.
Example
import com.johnsnowlabs.nlp.training.CoNLLU

val conlluFile = "src/test/resources/conllu/en.test.conllu"
val conllDataSet = CoNLLU(explodeSentences = false).readDataset(ResourceHelper.spark, conlluFile)
conllDataSet.selectExpr("text", "form.result as form", "upos.result as upos", "xpos.result as xpos", "lemma.result as lemma")
  .show(1, false)
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
|text                                   |form                                          |upos                                         |xpos                          |lemma                                       |
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
|What if Google Morphed Into GoogleOS?  |[What, if, Google, Morphed, Into, GoogleOS, ?]|[PRON, SCONJ, PROPN, VERB, ADP, PROPN, PUNCT]|[WP, IN, NNP, VBD, IN, NNP, .]|[what, if, Google, morph, into, GoogleOS, ?]|
+---------------------------------------+----------------------------------------------+---------------------------------------------+------------------------------+--------------------------------------------+
- explodeSentences
Whether to split each sentence into a separate row
Linear Supertypes
Ordering
- Alphabetic
- By Inheritance
Inherited
- CoNLLU
- Serializable
- Serializable
- Product
- Equals
- AnyRef
- Any
- Hide All
- Show All
Visibility
- Public
- All
Instance Constructors
-
new
CoNLLU(conllTextCol: String = "text", documentCol: String = "document", sentenceCol: String = "sentence", formCol: String = ..., uposCol: String = ..., xposCol: String = ..., lemmaCol: String = ..., explodeSentences: Boolean = true)
- explodeSentences
Whether to split each sentence into a separate row
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
- val conllTextCol: String
- val documentCol: String
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
- val explodeSentences: Boolean
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
- val formCol: String
- def getAnnotationType(column: String, annotatorType: String, addMetadata: Boolean = true): StructField
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
- val lemmaCol: String
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
- def packAssembly(text: String, isTraining: Boolean = true): Seq[Annotation]
- def packDocs(docs: Seq[CoNLLUDocument], spark: SparkSession): Dataset[_]
- def packPosTagged(sentences: Seq[TaggedSentence]): Seq[Annotation]
- def packSentence(text: String, sentences: Seq[TaggedSentence]): Seq[Annotation]
- def packTokenized(sentences: Seq[TaggedSentence]): Seq[Annotation]
- def readDataset(spark: SparkSession, path: String, readAs: String = ReadAs.TEXT.toString): Dataset[_]
- def readDatasetFromLines(lines: Array[String], spark: SparkSession): Dataset[_]
- def readDocs(er: ExternalResource): Seq[CoNLLUDocument]
- def schema: StructType
- val sentenceCol: String
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
- val uposCol: String
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
- val xposCol: String