# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for LanguageDetectorDL."""
from sparknlp.common import *

class LanguageDetectorDL(AnnotatorModel, HasStorageRef, HasEngine):
"""Language Identification and Detection by using CNN and RNN architectures
in TensorFlow.
``LanguageDetectorDL`` is an annotator that detects the language of
documents or sentences depending on the inputCols. The models are trained on
large datasets such as Wikipedia and Tatoeba. Depending on the language
(how similar the characters are), the LanguageDetectorDL works best with
text longer than 140 characters. The output is a language code in
`Wiki Code style <https://en.wikipedia.org/wiki/List_of_Wikipedias>`__.
Pretrained models can be loaded with :meth:`.pretrained` of the companion
object:
>>> languageDetector = LanguageDetectorDL.pretrained() \\
... .setInputCols(["sentence"]) \\
... .setOutputCol("language")
The default model is ``"ld_wiki_tatoeba_cnn_21"``, default language is
``"xx"`` (meaning multi-lingual), if no values are provided.
For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Language+Detection>`__.
For extended examples of usage, see the `Examples <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/language-detection/Language_Detection_and_Indentification.ipynb>`__.
====================== ======================
Input Annotation types Output Annotation type
====================== ======================
``DOCUMENT`` ``LANGUAGE``
====================== ======================
Parameters
----------
configProtoBytes
ConfigProto from tensorflow, serialized into byte array.
threshold
The minimum threshold for the final result otheriwse it will be either
neutral or the value set in thresholdLabel, by default 0.5
thresholdLabel
In case the score is less than threshold, what should be the label, by
default Unknown
coalesceSentences
If sets to true the output of all sentences will be averaged to one
output instead of one output per sentence, by default True.
languages
The languages used to trained the model
Examples
--------
>>> import sparknlp
>>> from sparknlp.base import *
>>> from sparknlp.annotator import *
>>> from pyspark.ml import Pipeline
>>> documentAssembler = DocumentAssembler() \\
... .setInputCol("text") \\
... .setOutputCol("document")
>>> languageDetector = LanguageDetectorDL.pretrained() \\
... .setInputCols("document") \\
... .setOutputCol("language")
>>> pipeline = Pipeline() \\
... .setStages([
... documentAssembler,
... languageDetector
... ])
>>> data = spark.createDataFrame([
... ["Spark NLP is an open-source text processing library for advanced natural language processing for the Python, Java and Scala programming languages."],
... ["Spark NLP est une bibliothèque de traitement de texte open source pour le traitement avancé du langage naturel pour les langages de programmation Python, Java et Scala."],
... ["Spark NLP ist eine Open-Source-Textverarbeitungsbibliothek für fortgeschrittene natürliche Sprachverarbeitung für die Programmiersprachen Python, Java und Scala."]
... ]).toDF("text")
>>> result = pipeline.fit(data).transform(data)
>>> result.select("language.result").show(truncate=False)
+------+
|result|
+------+
|[en] |
|[fr] |
|[de] |
+------+
"""
name = "LanguageDetectorDL"
inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
outputAnnotatorType = AnnotatorType.LANGUAGE
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ld.dl.LanguageDetectorDL", java_model=None):
        super(LanguageDetectorDL, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            threshold=0.5,
            thresholdLabel="Unknown",
            coalesceSentences=True
        )

    configProtoBytes = Param(Params._dummy(), "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
                             TypeConverters.toListInt)

    threshold = Param(Params._dummy(), "threshold",
                      "The minimum threshold for the final result, otherwise it will be either neutral or the value set in thresholdLabel.",
                      TypeConverters.toFloat)

    thresholdLabel = Param(Params._dummy(), "thresholdLabel",
                           "The label to use in case the score is less than threshold. Default is Unknown.",
                           TypeConverters.toString)

    coalesceSentences = Param(Params._dummy(), "coalesceSentences",
                              "If set to true, the output of all sentences will be averaged to one output instead of one output per sentence. Default is true.",
                              TypeConverters.toBoolean)

    languages = Param(Params._dummy(), "languages",
                      "The languages the model was trained on.",
                      TypeConverters.toListString)

    def setConfigProtoBytes(self, b):
        """Sets configProto from tensorflow, serialized into byte array.

        Parameters
        ----------
        b : List[int]
            ConfigProto from tensorflow, serialized into byte array
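
        Examples
        --------
        An illustrative sketch, assuming TensorFlow is installed and
        ``languageDetector`` is an instance of this annotator (the session
        options shown are placeholders, not recommendations):

        >>> from tensorflow.compat.v1 import ConfigProto
        >>> config = ConfigProto(intra_op_parallelism_threads=1)
        >>> languageDetector.setConfigProtoBytes(list(config.SerializeToString()))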
"""
return self._set(configProtoBytes=b)
    def setThreshold(self, v):
        """Sets the minimum threshold for the final result, otherwise it will
        be either neutral or the value set in thresholdLabel, by default 0.5.

        Parameters
        ----------
        v : float
            Minimum threshold for the final result
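
        Examples
        --------
        For instance, to accept lower-confidence predictions (0.3 is an
        illustrative value, assuming a pretrained model can be downloaded):

        >>> languageDetector = LanguageDetectorDL.pretrained() \\
        ...     .setThreshold(0.3)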
"""
self._set(threshold=v)
return self
    def setThresholdLabel(self, p):
        """Sets the label to use in case the score is less than threshold, by
        default Unknown.

        Parameters
        ----------
        p : str
            The replacement label.
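
        Examples
        --------
        For example, to label low-confidence predictions as ``"unk"`` (an
        illustrative label, not a model requirement):

        >>> languageDetector = LanguageDetectorDL.pretrained() \\
        ...     .setThresholdLabel("unk")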
"""
return self._set(thresholdLabel=p)
    def setCoalesceSentences(self, value):
        """Sets whether the output of all sentences should be averaged to one
        output instead of one output per sentence, by default True.

        Parameters
        ----------
        value : bool
            Whether the output of all sentences should be averaged to one
            output
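
        Examples
        --------
        For example, to keep one prediction per sentence instead of a single
        averaged result (a sketch, assuming sentence-level input such as the
        output of a ``SentenceDetector``):

        >>> languageDetector = LanguageDetectorDL.pretrained() \\
        ...     .setInputCols(["sentence"]) \\
        ...     .setOutputCol("language") \\
        ...     .setCoalesceSentences(False)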
"""
return self._set(coalesceSentences=value)
    @staticmethod
    def pretrained(name="ld_wiki_tatoeba_cnn_21", lang="xx", remote_loc=None):
"""Downloads and loads a pretrained model.
Parameters
----------
name : str, optional
Name of the pretrained model, by default "ld_wiki_tatoeba_cnn_21"
lang : str, optional
Language of the pretrained model, by default "xx"
remote_loc : str, optional
Optional remote address of the resource, by default None. Will use
Spark NLPs repositories otherwise.
Returns
-------
LanguageDetectorDL
The restored model
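
        Examples
        --------
        For example, loading the default model with its name and language
        spelled out explicitly:

        >>> languageDetector = LanguageDetectorDL.pretrained("ld_wiki_tatoeba_cnn_21", "xx") \\
        ...     .setInputCols(["document"]) \\
        ...     .setOutputCol("language")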
"""
from sparknlp.pretrained import ResourceDownloader
return ResourceDownloader.downloadModel(LanguageDetectorDL, name, lang, remote_loc)