# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for ViveknSentiment."""
from sparknlp.common import *
from sparknlp.common.annotator_type import AnnotatorType
class ViveknSentimentApproach(AnnotatorApproach):
    """Trains a sentiment analyser inspired by the algorithm by Vivek Narayanan.

    The analyzer requires sentence boundaries to give a score in context.
    Tokenization is needed to make sure tokens are within bounds. Transitivity
    requirements are also required.

    The training data needs to consist of a column for normalized text and a
    label column (either ``"positive"`` or ``"negative"``).

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN, DOCUMENT``    ``SENTIMENT``
    ====================== ======================

    Parameters
    ----------
    sentimentCol
        column with the sentiment result of every row. Must be 'positive' or 'negative'
    pruneCorpus
        Removes unfrequent scenarios from scope. The higher the better performance. Defaults 1

    References
    ----------
    The algorithm is based on the paper `"Fast and accurate sentiment
    classification using an enhanced Naive Bayes model"
    <https://arxiv.org/abs/1305.6143>`__.

    https://github.com/vivekn/sentiment/

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> document = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> token = Tokenizer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("token")
    >>> normalizer = Normalizer() \\
    ...     .setInputCols(["token"]) \\
    ...     .setOutputCol("normal")
    >>> vivekn = ViveknSentimentApproach() \\
    ...     .setInputCols(["document", "normal"]) \\
    ...     .setSentimentCol("train_sentiment") \\
    ...     .setOutputCol("result_sentiment")
    >>> finisher = Finisher() \\
    ...     .setInputCols(["result_sentiment"]) \\
    ...     .setOutputCols("final_sentiment")
    >>> pipeline = Pipeline().setStages([document, token, normalizer, vivekn, finisher])
    >>> training = spark.createDataFrame([
    ...     ("I really liked this movie!", "positive"),
    ...     ("The cast was horrible", "negative"),
    ...     ("Never going to watch this again or recommend it to anyone", "negative"),
    ...     ("It's a waste of time", "negative"),
    ...     ("I loved the protagonist", "positive"),
    ...     ("The music was really really good", "positive")
    ... ]).toDF("text", "train_sentiment")
    >>> pipelineModel = pipeline.fit(training)
    >>> data = spark.createDataFrame([
    ...     ["I recommend this movie"],
    ...     ["Dont waste your time!!!"]
    ... ]).toDF("text")
    >>> result = pipelineModel.transform(data)
    >>> result.select("final_sentiment").show(truncate=False)
    +---------------+
    |final_sentiment|
    +---------------+
    |[positive]     |
    |[negative]     |
    +---------------+
    """

    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.SENTIMENT

    sentimentCol = Param(Params._dummy(),
                         "sentimentCol",
                         "column with the sentiment result of every row. Must be 'positive' or 'negative'",
                         typeConverter=TypeConverters.toString)

    pruneCorpus = Param(Params._dummy(),
                        "pruneCorpus",
                        "Removes unfrequent scenarios from scope. The higher the better performance. Defaults 1",
                        typeConverter=TypeConverters.toInt)

    importantFeatureRatio = Param(Params._dummy(),
                                  "importantFeatureRatio",
                                  "proportion of feature content to be considered relevant. Defaults to 0.5",
                                  typeConverter=TypeConverters.toFloat)

    unimportantFeatureStep = Param(Params._dummy(),
                                   "unimportantFeatureStep",
                                   "proportion to lookahead in unimportant features. Defaults to 0.025",
                                   typeConverter=TypeConverters.toFloat)

    featureLimit = Param(Params._dummy(),
                         "featureLimit",
                         "content feature limit, to boost performance in very dirt text. Default disabled with -1",
                         typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(ViveknSentimentApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentApproach")
        self._setDefault(pruneCorpus=1, importantFeatureRatio=0.5, unimportantFeatureStep=0.025, featureLimit=-1)

    def setSentimentCol(self, value):
        """Sets column with the sentiment result of every row.

        Must be either 'positive' or 'negative'.

        Parameters
        ----------
        value : str
            Name of the column
        """
        return self._set(sentimentCol=value)

    def setPruneCorpus(self, value):
        """Sets the removal of unfrequent scenarios from scope, by default 1.

        The higher the better performance.

        Parameters
        ----------
        value : int
            The frequency
        """
        return self._set(pruneCorpus=value)

    def _create_model(self, java_model):
        # Wrap the fitted Java model in its Python counterpart.
        return ViveknSentimentModel(java_model=java_model)
class ViveknSentimentModel(AnnotatorModel):
    """Sentiment analyser inspired by the algorithm by Vivek Narayanan.

    This is the instantiated model of the :class:`.ViveknSentimentApproach`. For
    training your own model, please see the documentation of that class.

    The analyzer requires sentence boundaries to give a score in context.
    Tokenization is needed to make sure tokens are within bounds. Transitivity
    requirements are also required.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/vivekn-sentiment/VivekNarayanSentimentApproach.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN, DOCUMENT``    ``SENTIMENT``
    ====================== ======================

    Parameters
    ----------
    None

    References
    ----------
    The algorithm is based on the paper `"Fast and accurate sentiment
    classification using an enhanced Naive Bayes model"
    <https://arxiv.org/abs/1305.6143>`__.

    https://github.com/vivekn/sentiment/
    """

    name = "ViveknSentimentModel"

    inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.SENTIMENT

    importantFeatureRatio = Param(Params._dummy(),
                                  "importantFeatureRatio",
                                  "proportion of feature content to be considered relevant. Defaults to 0.5",
                                  typeConverter=TypeConverters.toFloat)

    unimportantFeatureStep = Param(Params._dummy(),
                                   "unimportantFeatureStep",
                                   "proportion to lookahead in unimportant features. Defaults to 0.025",
                                   typeConverter=TypeConverters.toFloat)

    featureLimit = Param(Params._dummy(),
                         "featureLimit",
                         "content feature limit, to boost performance in very dirt text. Default disabled with -1",
                         typeConverter=TypeConverters.toInt)

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentModel", java_model=None):
        super(ViveknSentimentModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    @staticmethod
    def pretrained(name="sentiment_vivekn", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "sentiment_vivekn"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLPs repositories otherwise.

        Returns
        -------
        ViveknSentimentModel
            The restored model
        """
        # Local import avoids a circular dependency between annotator and
        # pretrained-resource modules at package import time.
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(ViveknSentimentModel, name, lang, remote_loc)