Description
This model identifies positive or negative sentiments in Swahili texts.
Predicted Entities
Negative, Positive
Live Demo Open in Colab Download Copy S3 URI
How to use
# Swahili sentiment pipeline: text -> tokens -> normalized tokens -> stop-word
# filtering -> XLM-RoBERTa token embeddings -> averaged sentence embeddings
# -> sentiment class.

# Turns the raw "text" column into a "document" annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Splits each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Normalizes the tokens before stop-word removal.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# Drops Swahili stop words using the pretrained "stopwords_sw" list,
# matching case-insensitively.
stopwords_cleaner = (
    StopWordsCleaner.pretrained("stopwords_sw", "sw")
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Token-level embeddings computed on the cleaned tokens.
embeddings = (
    XlmRoBertaEmbeddings.pretrained("xlm_roberta_base_finetuned_swahili", "sw")
    .setInputCols(["document", "cleanTokens"])
    .setOutputCol("embeddings")
)

# Pools the token embeddings into one vector per document by averaging.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Pretrained classifier that consumes the sentence embeddings and writes
# its prediction to the "class" column.
sentimentClassifier = (
    ClassifierDLModel.pretrained("classifierdl_xlm_roberta_sentiment", "sw")
    .setInputCols(["document", "sentence_embeddings"])
    .setOutputCol("class")
)

sw_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        embeddings,
        embeddingsSentence,
        sentimentClassifier,
    ]
)

# Fit on a one-row placeholder DataFrame (a single empty string in "text"),
# then wrap the fitted model in a LightPipeline to annotate plain strings.
placeholder_df = spark.createDataFrame([['']]).toDF("text")
fitted_pipeline = sw_pipeline.fit(placeholder_df)
light_pipeline = LightPipeline(fitted_pipeline)

result1 = light_pipeline.annotate("Hadithi yenyewe ni ya kutabirika tu na ya uvivu.")
result2 = light_pipeline.annotate("Mtandao wa kushangaza wa 4G katika mji wa Mombasa pamoja na mipango nzuri sana na ya bei rahisi.")

# One predicted class list per line.
print(result1["class"], result2["class"], sep="\n")
// Swahili sentiment pipeline: text -> tokens -> normalized tokens -> stop-word
// filtering -> XLM-RoBERTa token embeddings -> averaged sentence embeddings
// -> sentiment class.

// Brings Seq(...).toDF into scope for the placeholder DataFrame below.
import spark.implicits._

// Turns the raw "text" column into a "document" annotation.
val document_assembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Splits each document into tokens.
val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

// Normalizes the tokens before stop-word removal.
val normalizer = new Normalizer()
  .setInputCols(Array("token"))
  .setOutputCol("normalized")

// Drops Swahili stop words using the pretrained "stopwords_sw" list,
// matching case-insensitively (Scala boolean literal is `false`, not `False`).
val stopwords_cleaner = StopWordsCleaner.pretrained("stopwords_sw", "sw")
  .setInputCols(Array("normalized"))
  .setOutputCol("cleanTokens")
  .setCaseSensitive(false)

// Token-level embeddings computed on the cleaned tokens.
val embeddings = XlmRoBertaEmbeddings.pretrained("xlm_roberta_base_finetuned_swahili", "sw")
  .setInputCols(Array("document", "cleanTokens"))
  .setOutputCol("embeddings")

// Pools the token embeddings into one vector per document by averaging.
val embeddingsSentence = new SentenceEmbeddings()
  .setInputCols(Array("document", "embeddings"))
  .setOutputCol("sentence_embeddings")
  .setPoolingStrategy("AVERAGE")

// Pretrained classifier that consumes the sentence embeddings and writes
// its prediction to the "class" column.
val sentimentClassifier = ClassifierDLModel.pretrained("classifierdl_xlm_roberta_sentiment", "sw")
  .setInputCols(Array("document", "sentence_embeddings"))
  .setOutputCol("class")

val sw_sentiment_pipeline = new Pipeline().setStages(Array(
  document_assembler,
  tokenizer,
  normalizer,
  stopwords_cleaner,
  embeddings,
  embeddingsSentence,
  sentimentClassifier
))

// Fit on a one-row placeholder DataFrame (a single empty string in "text"),
// then wrap the fitted model in a LightPipeline to annotate plain strings.
// The original used Python list syntax here, which does not compile in Scala.
val light_pipeline = new LightPipeline(sw_sentiment_pipeline.fit(Seq("").toDF("text")))

val result1 = light_pipeline.annotate("Hadithi yenyewe ni ya kutabirika tu na ya uvivu.")
val result2 = light_pipeline.annotate("Mtandao wa kushangaza wa 4G katika mji wa Mombasa pamoja na mipango nzuri sana na ya bei rahisi.")
import nlu

# Same sentiment model via the NLU one-liner API: load the pipeline by its
# NLU reference, then run prediction on a single Swahili sentence.
swahili_text = """Mtandao wa kushangaza wa 4G katika mji wa Mombasa pamoja na mipango nzuri sana na ya bei rahisi."""
sentiment_model = nlu.load("sw.classify.sentiment.")
sentiment_model.predict(swahili_text)
Results
['Negative']
['Positive']
Model Information
Model Name: | classifierdl_xlm_roberta_sentiment |
Compatibility: | Spark NLP 3.3.4+ |
License: | Open Source |
Edition: | Official |
Input Labels: | [sentence_embeddings] |
Output Labels: | [class] |
Language: | sw |
Size: | 23.0 MB |
Data Source
https://github.com/Jinamizi/Swahili-sentiment-analysis
Benchmarking
label precision recall f1-score support
Negative 0.79 0.84 0.81 85
Positive 0.86 0.82 0.84 103
accuracy - - 0.82 188
macro-avg 0.82 0.83 0.82 188
weighted-avg 0.83 0.82 0.82 188