case class POS() extends Product with Serializable
Helper class for creating DataFrames for training a part-of-speech tagger.
The dataset needs to contain one sentence per line, where each word is joined to its respective tag by a delimiter:
Pierre|NNP Vinken|NNP ,|, 61|CD years|NNS old|JJ ,|, will|MD join|VB the|DT board|NN as|IN a|DT nonexecutive|JJ director|NN Nov.|NNP 29|CD .|.
The sentence can then be parsed with readDataset into a column with annotations of type POS.
Example
In this example, the file test-training.txt has the content of the sentence above.
import com.johnsnowlabs.nlp.training.POS

val pos = POS()
val path = "src/test/resources/anc-pos-corpus-small/test-training.txt"
val posDf = pos.readDataset(spark, path, "|", "tags")

posDf.selectExpr("explode(tags) as tags").show(false)
+---------------------------------------------+
|tags                                         |
+---------------------------------------------+
|[pos, 0, 5, NNP, [word -> Pierre], []]       |
|[pos, 7, 12, NNP, [word -> Vinken], []]      |
|[pos, 14, 14, ,, [word -> ,], []]            |
|[pos, 16, 17, CD, [word -> 61], []]          |
|[pos, 19, 23, NNS, [word -> years], []]      |
|[pos, 25, 27, JJ, [word -> old], []]         |
|[pos, 29, 29, ,, [word -> ,], []]            |
|[pos, 31, 34, MD, [word -> will], []]        |
|[pos, 36, 39, VB, [word -> join], []]        |
|[pos, 41, 43, DT, [word -> the], []]         |
|[pos, 45, 49, NN, [word -> board], []]       |
|[pos, 51, 52, IN, [word -> as], []]          |
|[pos, 47, 47, DT, [word -> a], []]           |
|[pos, 56, 67, JJ, [word -> nonexecutive], []]|
|[pos, 69, 76, NN, [word -> director], []]    |
|[pos, 78, 81, NNP, [word -> Nov.], []]       |
|[pos, 83, 84, CD, [word -> 29], []]          |
|[pos, 81, 81, ., [word -> .], []]            |
+---------------------------------------------+
- Alphabetic
- By Inheritance
- POS
- Serializable
- Serializable
- Product
- Equals
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
- new POS()
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
def
readDataset(sparkSession: SparkSession, path: String, delimiter: String = "|", outputPosCol: String = "tags", outputDocumentCol: String = "document", outputTextCol: String = "text"): DataFrame
Reads the provided dataset file with the given parameters and returns a DataFrame ready for training a part-of-speech tagger.
Reads the provided dataset file with the given parameters and returns a DataFrame ready for training a part-of-speech tagger.
- sparkSession
Current Spark session
- path
Path to the resource
- delimiter
Delimiter used to separate words from their tags in the text
- outputPosCol
Name for the output column of the part-of-speech tags
- outputDocumentCol
Name for the DocumentAssembler column
- outputTextCol
Name for the column of the raw text
- returns
DataFrame of parsed text
- def readFromDataframe(posDataframe: DataFrame, tokensCol: String = "tokens", labelsCol: String = "labels", outPutDocColName: String = "text", outPutPosColName: String = "tags"): DataFrame
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
- def wrapColumnMetadata(col: Column, annotatorType: String, outPutColName: String): Column