# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for SentenceDetectorDl."""
from sparknlp.common import *
[docs]class SentenceDetectorDLApproach(AnnotatorApproach):
"""Trains an annotator that detects sentence boundaries using a deep
learning approach.
Currently, only the CNN model is supported for training, but in the future
the architecture of the model can be set with :meth:`.setModel`.
For pretrained models see :class:`.SentenceDetectorDLModel`.
Each extracted sentence can be returned in an Array or exploded to separate
rows, if ``explodeSentences`` is set to ``True``.
====================== ======================
Input Annotation types Output Annotation type
====================== ======================
``DOCUMENT`` ``DOCUMENT``
====================== ======================
Parameters
----------
modelArchitecture
Model architecture (CNN)
impossiblePenultimates
Impossible penultimates - list of strings which a sentence can't end
with
validationSplit
Choose the proportion of training dataset to be validated against the
model on each
epochsNumber
Number of epochs for the optimization process
outputLogsPath
Path to folder where logs will be saved. If no path is specified, no
logs are generated
explodeSentences
Whether to explode each sentence into a different row, for better
parallelization. Defaults to False.
References
----------
The default model ``"cnn"`` is based on the paper `Deep-EOS: General-Purpose
Neural Networks for Sentence Boundary Detection (2020, Stefan Schweter,
Sajawel Ahmed)
<https://konvens.org/proceedings/2019/papers/KONVENS2019_paper_41.pdf>`__
using a CNN architecture. We also modified the original implementation a
little bit to cover broken sentences and some impossible end of line chars.
Examples
--------
The training process needs data, where each data point is a sentence.
In this example the ``train.txt`` file has the form of::
        ...
        Slightly more moderate language would make our present situation – namely the lack of progress – a little easier.
        His political successors now have great responsibilities to history and to the heritage of values bequeathed to them by Nelson Mandela.
        ...

    where each line is one sentence.

    Training can then be started like so:

>>> import sparknlp
>>> from sparknlp.base import *
>>> from sparknlp.annotator import *
>>> from pyspark.ml import Pipeline
>>> trainingData = spark.read.text("train.txt").toDF("text")
>>> documentAssembler = DocumentAssembler() \\
... .setInputCol("text") \\
... .setOutputCol("document")
>>> sentenceDetector = SentenceDetectorDLApproach() \\
... .setInputCols(["document"]) \\
... .setOutputCol("sentences") \\
... .setEpochsNumber(100)
>>> pipeline = Pipeline().setStages([documentAssembler, sentenceDetector])
>>> model = pipeline.fit(trainingData)
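
    The fitted pipeline can be saved to disk and reloaded for later use. A
    minimal sketch, with an illustrative path:

    >>> model.write().overwrite().save("sentence_detector_dl_pipeline")
    >>> from pyspark.ml import PipelineModel
    >>> restored = PipelineModel.load("sentence_detector_dl_pipeline")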
"""
name = "SentenceDetectorDLApproach"
inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
outputAnnotatorType = AnnotatorType.DOCUMENT
modelArchitecture = Param(Params._dummy(),
"modelArchitecture",
"Model architecture (CNN)",
typeConverter=TypeConverters.toString)
impossiblePenultimates = Param(Params._dummy(),
"impossiblePenultimates",
"Impossible penultimates - list of strings which a sentence can't end with",
typeConverter=TypeConverters.toListString)
validationSplit = Param(Params._dummy(),
"validationSplit",
"Choose the proportion of training dataset to be validated against the model on each "
"Epoch. The value should be between 0.0 and 1.0 and by default it is 0.0 and off.",
TypeConverters.toFloat)
epochsNumber = Param(Params._dummy(),
"epochsNumber",
"Number of epochs for the optimization process",
TypeConverters.toInt)
outputLogsPath = Param(Params._dummy(),
"outputLogsPath",
"Path to folder where logs will be saved. If no path is specified, no logs are generated",
TypeConverters.toString)
explodeSentences = Param(Params._dummy(),
"explodeSentences",
"whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
TypeConverters.toBoolean)
    def setModel(self, model_architecture):
        """Sets the model architecture. Currently only ``"cnn"`` is available.

        Parameters
        ----------
        model_architecture : str
            Model architecture
        """
        return self._set(modelArchitecture=model_architecture)
    def setValidationSplit(self, validation_split):
        """Sets the proportion of the training dataset to be validated against
        the model on each epoch, by default 0.0 (off). The value should be
        between 0.0 and 1.0.

        Parameters
        ----------
        validation_split : float
            Proportion of training dataset to be validated
        """
        return self._set(validationSplit=validation_split)
    def setEpochsNumber(self, epochs_number):
        """Sets number of epochs to train.

        Parameters
        ----------
        epochs_number : int
            Number of epochs
        """
        return self._set(epochsNumber=epochs_number)
    def setOutputLogsPath(self, output_logs_path):
        """Sets folder path to save training logs.

        Parameters
        ----------
        output_logs_path : str
            Folder path to save training logs
        """
        return self._set(outputLogsPath=output_logs_path)
    def setImpossiblePenultimates(self, impossible_penultimates):
        """Sets impossible penultimates - list of strings which a sentence
        can't end with.

        Parameters
        ----------
        impossible_penultimates : List[str]
            List of strings which a sentence can't end with
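
        Examples
        --------
        A sketch with illustrative abbreviations that should never end a
        sentence:

        >>> sentenceDetector = sentenceDetector.setImpossiblePenultimates(["Mr", "Dr", "vs"])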
"""
return self._set(impossiblePenultimates=impossible_penultimates)
    def setExplodeSentences(self, value):
        """Sets whether to explode each sentence into a different row, for
        better parallelization, by default False.

        Parameters
        ----------
        value : bool
            Whether to explode each sentence into a different row
        """
        return self._set(explodeSentences=value)
def _create_model(self, java_model):
return SentenceDetectorDLModel(java_model=java_model)
@keyword_only
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLApproach"):
super(SentenceDetectorDLApproach, self).__init__(classname=classname)


class SentenceDetectorDLModel(AnnotatorModel, HasEngine):
"""Annotator that detects sentence boundaries using a deep learning approach.
Instantiated Model of the :class:`.SentenceDetectorDLApproach`.
Detects sentence boundaries using a deep learning approach.
Pretrained models can be loaded with :meth:`.pretrained` of the companion
object:
>>> sentenceDL = SentenceDetectorDLModel.pretrained() \\
... .setInputCols(["document"]) \\
... .setOutputCol("sentencesDL")
The default model is ``"sentence_detector_dl"``, if no name is provided. For
available pretrained models please see the `Models Hub
<https://sparknlp.org/models?task=Sentence+Detection>`__.
Each extracted sentence can be returned in an Array or exploded to separate
rows, if ``explodeSentences`` is set to ``true``.
For extended examples of usage, see the `Examples
<https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/multilingual/SentenceDetectorDL.ipynb>`__.
====================== ======================
Input Annotation types Output Annotation type
====================== ======================
``DOCUMENT`` ``DOCUMENT``
====================== ======================
Parameters
----------
modelArchitecture
Model architecture (CNN)
explodeSentences
whether to explode each sentence into a different row, for better
parallelization. Defaults to false.
customBounds
characters used to explicitly mark sentence bounds, by default []
useCustomBoundsOnly
Only utilize custom bounds in sentence detection, by default False
splitLength
length at which sentences will be forcibly split
minLength
Set the minimum allowed length for each sentence, by default 0
maxLength
Set the maximum allowed length for each sentence, by default 99999
impossiblePenultimates
Impossible penultimates - list of strings which a sentence can't end
with
Examples
--------
In this example, the normal `SentenceDetector` is compared to the
`SentenceDetectorDLModel`. In a pipeline, `SentenceDetectorDLModel` can be
used as a replacement for the `SentenceDetector`.
>>> import sparknlp
>>> from sparknlp.base import *
>>> from sparknlp.annotator import *
>>> from pyspark.ml import Pipeline
>>> documentAssembler = DocumentAssembler() \\
... .setInputCol("text") \\
... .setOutputCol("document")
>>> sentence = SentenceDetector() \\
... .setInputCols(["document"]) \\
... .setOutputCol("sentences")
>>> sentenceDL = SentenceDetectorDLModel \\
... .pretrained("sentence_detector_dl", "en") \\
... .setInputCols(["document"]) \\
... .setOutputCol("sentencesDL")
>>> pipeline = Pipeline().setStages([
... documentAssembler,
... sentence,
... sentenceDL
... ])
>>> data = spark.createDataFrame([[\"\"\"John loves Mary.Mary loves Peter
... Peter loves Helen .Helen loves John;
... Total: four people involved.\"\"\"]]).toDF("text")
>>> result = pipeline.fit(data).transform(data)
>>> result.selectExpr("explode(sentences.result) as sentences").show(truncate=False)
+----------------------------------------------------------+
|sentences |
+----------------------------------------------------------+
|John loves Mary.Mary loves Peter\\n Peter loves Helen .|
|Helen loves John; |
|Total: four people involved. |
+----------------------------------------------------------+
>>> result.selectExpr("explode(sentencesDL.result) as sentencesDL").show(truncate=False)
+----------------------------+
|sentencesDL |
+----------------------------+
|John loves Mary. |
|Mary loves Peter |
|Peter loves Helen . |
|Helen loves John; |
|Total: four people involved.|
+----------------------------+
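
    To produce one row per detected sentence instead of an array, the model's
    ``explodeSentences`` parameter can be enabled (a minimal sketch):

    >>> sentenceDL = sentenceDL.setExplodeSentences(True)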
"""
name = "SentenceDetectorDLModel"
inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
outputAnnotatorType = AnnotatorType.DOCUMENT
modelArchitecture = Param(Params._dummy(), "modelArchitecture", "Model architecture (CNN)",
typeConverter=TypeConverters.toString)
explodeSentences = Param(Params._dummy(),
"explodeSentences",
"whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
TypeConverters.toBoolean)
customBounds = Param(Params._dummy(),
"customBounds",
"characters used to explicitly mark sentence bounds",
typeConverter=TypeConverters.toListString)
useCustomBoundsOnly = Param(Params._dummy(),
"useCustomBoundsOnly",
"Only utilize custom bounds in sentence detection",
typeConverter=TypeConverters.toBoolean)
splitLength = Param(Params._dummy(),
"splitLength",
"length at which sentences will be forcibly split.",
typeConverter=TypeConverters.toInt)
minLength = Param(Params._dummy(),
"minLength",
"Set the minimum allowed length for each sentence.",
typeConverter=TypeConverters.toInt)
maxLength = Param(Params._dummy(),
"maxLength",
"Set the maximum allowed length for each sentence",
typeConverter=TypeConverters.toInt)
impossiblePenultimates = Param(Params._dummy(),
"impossiblePenultimates",
"Impossible penultimates - list of strings which a sentence can't end with",
typeConverter=TypeConverters.toListString)
    def setModel(self, modelArchitecture):
        """Sets the model architecture. Currently only ``"cnn"`` is available.

        Parameters
        ----------
        modelArchitecture : str
            Model architecture
        """
        return self._set(modelArchitecture=modelArchitecture)
    def setExplodeSentences(self, value):
        """Sets whether to explode each sentence into a different row, for
        better parallelization, by default False.

        Parameters
        ----------
        value : bool
            Whether to explode each sentence into a different row
        """
        return self._set(explodeSentences=value)
    def setCustomBounds(self, value):
        """Sets characters used to explicitly mark sentence bounds, by default
        [].

        Parameters
        ----------
        value : List[str]
            Characters used to explicitly mark sentence bounds
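
        Examples
        --------
        A sketch of marking an additional boundary character; the semicolon is
        purely illustrative:

        >>> sentenceDL = sentenceDL.setCustomBounds([";"])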
"""
return self._set(customBounds=value)
    def setUseCustomBoundsOnly(self, value):
        """Sets whether to only utilize custom bounds in sentence detection, by
        default False.

        Parameters
        ----------
        value : bool
            Whether to only utilize custom bounds
        """
        return self._set(useCustomBoundsOnly=value)
    def setSplitLength(self, value):
        """Sets length at which sentences will be forcibly split.

        Parameters
        ----------
        value : int
            Length at which sentences will be forcibly split
        """
        return self._set(splitLength=value)
    def setMinLength(self, value):
        """Sets minimum allowed length for each sentence, by default 0.

        Parameters
        ----------
        value : int
            Minimum allowed length for each sentence
        """
        return self._set(minLength=value)
    def setMaxLength(self, value):
        """Sets the maximum allowed length for each sentence, by default
        99999.

        Parameters
        ----------
        value : int
            Maximum allowed length for each sentence
        """
        return self._set(maxLength=value)
    def setImpossiblePenultimates(self, impossible_penultimates):
        """Sets impossible penultimates - list of strings which a sentence
        can't end with.

        Parameters
        ----------
        impossible_penultimates : List[str]
            List of strings which a sentence can't end with
        """
        return self._set(impossiblePenultimates=impossible_penultimates)
def __init__(self, classname="com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLModel",
java_model=None):
super(SentenceDetectorDLModel, self).__init__(
classname=classname,
java_model=java_model
)
self._setDefault(
useCustomBoundsOnly=False,
customBounds=[],
explodeSentences=False,
minLength=0,
maxLength=99999
)
    @staticmethod
    def pretrained(name="sentence_detector_dl", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "sentence_detector_dl"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        SentenceDetectorDLModel
            The restored model
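
        Examples
        --------
        Downloading the default English model (requires an active Spark NLP
        session):

        >>> from sparknlp.annotator import SentenceDetectorDLModel
        >>> sentenceDL = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en")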
"""
from sparknlp.pretrained import ResourceDownloader
return ResourceDownloader.downloadModel(SentenceDetectorDLModel, name, lang, remote_loc)