Description
This model classifies Vietnamese text as expressing POSITIVE or NEGATIVE sentiment.
Predicted Entities
`POSITIVE`, `NEGATIVE`
How to use
# Vietnamese sentiment pipeline (Python).
# Expects an active SparkSession bound to `spark`.

# Wrap the raw "text" column as "document" annotations (cleanup mode "shrink").
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Normalized tokens feed only the lemmatizer below.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# NOTE: the "lemma" column is produced here but no later stage reads it;
# the embeddings stage consumes the raw "token" column.
lemmatizer = (
    LemmatizerModel.pretrained("lemma", "vi")
    .setInputCols(["normalized"])
    .setOutputCol("lemma")
)

# Token-level DistilBERT embeddings computed from "document" + "token".
distilbert = (
    DistilBertEmbeddings.pretrained("distilbert_base_cased", "vi")
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Pool the token embeddings into one vector per document using "AVERAGE".
sentence_embeddings = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Pretrained classifier; its prediction is written to the "class" column.
sentiment_classifier = (
    ClassifierDLModel.pretrained("classifierdl_distilbert_sentiment", "vi")
    .setInputCols(["document", "sentence_embeddings"])
    .setOutputCol("class")
)

stages = [
    document_assembler,
    tokenizer,
    normalizer,
    lemmatizer,
    distilbert,
    sentence_embeddings,
    sentiment_classifier,
]
vi_sentiment_pipeline = Pipeline(stages=stages)

# Fit on a one-row DataFrame holding an empty string to obtain a PipelineModel,
# then wrap it in a LightPipeline so plain Python strings can be annotated.
empty_df = spark.createDataFrame([[""]]).toDF("text")
fitted_model = vi_sentiment_pipeline.fit(empty_df)
light_pipeline = LightPipeline(fitted_model)

result = light_pipeline.annotate("Chất cotton siêu đẹp mịn mát.")
result["class"]
// Vietnamese sentiment pipeline (Scala).
// Expects an active SparkSession bound to a stable `val spark`, plus the
// Spark ML and Spark NLP classes used below already imported.

// Wrap the raw "text" column as "document" annotations (cleanup mode "shrink").
val document = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")
  .setCleanupMode("shrink")

// Split each document into tokens.
val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

// Normalized tokens feed only the lemmatizer below.
val normalizer = new Normalizer()
  .setInputCols(Array("token"))
  .setOutputCol("normalized")

// NOTE: the "lemma" column is produced here but no later stage reads it;
// the embeddings stage consumes the raw "token" column.
val lemmatizer = LemmatizerModel.pretrained("lemma", "vi")
  .setInputCols(Array("normalized"))
  .setOutputCol("lemma")

// Token-level DistilBERT embeddings computed from "document" + "token".
val distilbert = DistilBertEmbeddings.pretrained("distilbert_base_cased", "vi")
  .setInputCols(Array("document", "token"))
  .setOutputCol("embeddings")
  .setCaseSensitive(false)

// Pool the token embeddings into one vector per document using "AVERAGE".
val embeddingsSentence = new SentenceEmbeddings()
  .setInputCols(Array("document", "embeddings"))
  .setOutputCol("sentence_embeddings")
  .setPoolingStrategy("AVERAGE")

// Pretrained classifier; its prediction is written to the "class" column.
val sentimentClassifier = ClassifierDLModel.pretrained("classifierdl_distilbert_sentiment", "vi")
  .setInputCols(Array("document", "sentence_embeddings"))
  .setOutputCol("class")

val pipeline = new Pipeline().setStages(Array(
  document,
  tokenizer,
  normalizer,
  lemmatizer,
  distilbert,
  embeddingsSentence,
  sentimentClassifier
))

// `toDF` on a Seq[String] comes from the session's implicits.
import spark.implicits._

// Fit on a one-row DataFrame holding an empty string to obtain a PipelineModel,
// then wrap it in a LightPipeline so plain strings can be annotated.
val light_pipeline = new LightPipeline(pipeline.fit(Seq("").toDF("text")))

// The predicted label(s) are available under the "class" key of the result.
val result = light_pipeline.annotate("Chất cotton siêu đẹp mịn mát.")
Results
['POSITIVE']
Model Information
|---|---|
| Model Name: | classifierdl_distilbert_sentiment |
| Compatibility: | Spark NLP 3.4.0+ |
| License: | Open Source |
| Edition: | Official |
| Input Labels: | [sentence_embeddings] |
| Output Labels: | [class] |
| Language: | vi |
| Size: | 23.6 MB |
References
https://www.kaggle.com/datvuthanh/vietnamese-sentiment
Benchmarking
```
       label  precision    recall  f1-score   support
    NEGATIVE       0.88      0.79      0.83       956
    POSITIVE       0.80      0.89      0.84       931
    accuracy          -         -      0.84      1887
   macro-avg       0.84      0.84      0.84      1887
weighted-avg       0.84      0.84      0.84      1887
```