# Source code for sparknlp.annotator.sentence.sentence_detector

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the SentenceDetector."""

from sparknlp.common import *


class SentenceDetectorParams:
    """Base class holding the shared parameters of ``SentenceDetector``.

    The attributes below are PySpark ``Param`` declarations only; concrete
    default values are assigned by the annotator that mixes this class in.
    """

    # Whether abbreviation handling is applied when locating boundaries.
    useAbbreviations = Param(Params._dummy(),
                             "useAbbreviations",
                             "whether to apply abbreviations at sentence detection",
                             typeConverter=TypeConverters.toBoolean)

    # Characters/strings that explicitly mark a sentence boundary.
    customBounds = Param(Params._dummy(),
                         "customBounds",
                         "characters used to explicitly mark sentence bounds",
                         typeConverter=TypeConverters.toListString)

    # When True, only the custom bounds are used for detection.
    useCustomBoundsOnly = Param(Params._dummy(),
                                "useCustomBoundsOnly",
                                "Only utilize custom bounds in sentence detection",
                                typeConverter=TypeConverters.toBoolean)

    # How a matched custom bound is returned ("none", "prepend" or "append").
    customBoundsStrategy = Param(Params._dummy(),
                                 "customBoundsStrategy",
                                 "How to return matched custom bounds",
                                 typeConverter=TypeConverters.toString)

    # Emit one output row per sentence instead of an array of sentences.
    explodeSentences = Param(Params._dummy(),
                             "explodeSentences",
                             "whether to explode each sentence into a different row, for better parallelization. Defaults to false.",
                             typeConverter=TypeConverters.toBoolean)

    # Length at which a sentence is forcibly split.
    splitLength = Param(Params._dummy(),
                        "splitLength",
                        "length at which sentences will be forcibly split.",
                        typeConverter=TypeConverters.toInt)

    # Minimum allowed length for each sentence.
    minLength = Param(Params._dummy(),
                      "minLength",
                      "Set the minimum allowed length for each sentence.",
                      typeConverter=TypeConverters.toInt)

    # Maximum allowed length for each sentence.
    maxLength = Param(Params._dummy(),
                      "maxLength",
                      "Set the maximum allowed length for each sentence",
                      typeConverter=TypeConverters.toInt)
class SentenceDetector(AnnotatorModel, SentenceDetectorParams):
    """Annotator that detects sentence boundaries using regular expressions.

    The following characters are checked as sentence boundaries:

    1. Lists ("(i), (ii)", "(a), (b)", "1., 2.")
    2. Numbers
    3. Abbreviations
    4. Punctuations
    5. Multiple Periods
    6. Geo-Locations/Coordinates ("N°. 1026.253.553.")
    7. Ellipsis ("...")
    8. In-between punctuations
    9. Quotation marks
    10. Exclamation Points
    11. Basic Breakers (".", ";")

    For the explicit regular expressions used for detection, refer to source of
    `PragmaticContentFormatter <https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala>`__.

    To add additional custom bounds, the parameter ``customBounds`` can be set
    with an array:

    >>> sentence = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence") \\
    ...     .setCustomBounds(["\\n\\n"])

    If only the custom bounds should be used, then the parameter
    ``useCustomBoundsOnly`` should be set to ``true``.

    Each extracted sentence can be returned in an Array or exploded to separate
    rows, if ``explodeSentences`` is set to ``true``.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/sentence-detection/SentenceDetector_advanced_examples.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    useAbbreviations
        whether to apply abbreviations at sentence detection, by default True
    customBounds
        characters used to explicitly mark sentence bounds, by default []
    useCustomBoundsOnly
        Only utilize custom bounds in sentence detection, by default False
    customBoundsStrategy
        Sets how to return matched custom bounds, by default "none".

        Will have no effect if no custom bounds are used.
        Possible values are:

        - "none" - Will not return the matched bound
        - "prepend" - Prepends a sentence break to the match
        - "append" - Appends a sentence break to the match
    explodeSentences
        whether to explode each sentence into a different row, for better
        parallelization, by default False
    splitLength
        length at which sentences will be forcibly split
    minLength
        Set the minimum allowed length for each sentence, by default 0
    maxLength
        Set the maximum allowed length for each sentence, by default 99999
    detectLists
        whether detect lists during sentence detection, by default True

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> sentence = SentenceDetector() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence") \\
    ...     .setCustomBounds(["\\n\\n"])
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     sentence
    ... ])
    >>> data = spark.createDataFrame([["This is my first sentence. This my second.\\n\\nHow about a third?"]]).toDF("text")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.selectExpr("explode(sentence) as sentences").show(truncate=False)
    +------------------------------------------------------------------+
    |sentences                                                         |
    +------------------------------------------------------------------+
    |[document, 0, 25, This is my first sentence., [sentence -> 0], []]|
    |[document, 27, 41, This my second., [sentence -> 1], []]          |
    |[document, 43, 60, How about a third?, [sentence -> 2], []]       |
    +------------------------------------------------------------------+
    """

    name = 'SentenceDetector'

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    # this one is exclusive to this detector
    detectLists = Param(Params._dummy(),
                        "detectLists",
                        "whether detect lists during sentence detection",
                        typeConverter=TypeConverters.toBoolean)

    def setCustomBounds(self, value):
        """Sets characters used to explicitly mark sentence bounds, by default
        [].

        Parameters
        ----------
        value : List[str]
            Characters used to explicitly mark sentence bounds
        """
        return self._set(customBounds=value)

    def setCustomBoundsStrategy(self, value):
        """Sets how to return matched custom bounds, by default "none".

        Will have no effect if no custom bounds are used.
        Possible values are:

        - "none" - Will not return the matched bound
        - "prepend" - Prepends a sentence break to the match
        - "append" - Appends a sentence break to the match

        Parameters
        ----------
        value : str
            Strategy to use
        """
        return self._set(customBoundsStrategy=value)

    def setUseAbbreviations(self, value):
        """Sets whether to apply abbreviations at sentence detection, by
        default True

        Parameters
        ----------
        value : bool
            Whether to apply abbreviations at sentence detection
        """
        return self._set(useAbbreviations=value)

    def setDetectLists(self, value):
        """Sets whether detect lists during sentence detection, by default
        True

        Parameters
        ----------
        value : bool
            Whether detect lists during sentence detection
        """
        return self._set(detectLists=value)

    def setUseCustomBoundsOnly(self, value):
        """Sets whether to only utilize custom bounds in sentence detection,
        by default False.

        Parameters
        ----------
        value : bool
            Whether to only utilize custom bounds
        """
        return self._set(useCustomBoundsOnly=value)

    def setExplodeSentences(self, value):
        """Sets whether to explode each sentence into a different row, for
        better parallelization, by default False.

        Parameters
        ----------
        value : bool
            Whether to explode each sentence into a different row
        """
        return self._set(explodeSentences=value)

    def setSplitLength(self, value):
        """Sets length at which sentences will be forcibly split.

        Parameters
        ----------
        value : int
            Length at which sentences will be forcibly split.
        """
        return self._set(splitLength=value)

    def setMinLength(self, value):
        """Sets minimum allowed length for each sentence, by default 0

        Parameters
        ----------
        value : int
            Minimum allowed length for each sentence
        """
        return self._set(minLength=value)

    def setMaxLength(self, value):
        """Sets the maximum allowed length for each sentence, by default
        99999

        Parameters
        ----------
        value : int
            Maximum allowed length for each sentence
        """
        return self._set(maxLength=value)

    @keyword_only
    def __init__(self):
        # Bind this Python annotator to its Scala implementation on the JVM.
        super(SentenceDetector, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector")
        # splitLength intentionally has no default: sentences are only
        # force-split when the user sets it explicitly.
        self._setDefault(
            useAbbreviations=True,
            detectLists=True,
            useCustomBoundsOnly=False,
            customBounds=[],
            customBoundsStrategy="none",
            explodeSentences=False,
            minLength=0,
            maxLength=99999
        )