# Source code for sparknlp.annotator.document_title_splitter

#  Copyright 2017-2026 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

"""Contains classes for the DocumentTitleSplitter"""

from sparknlp.common import *


class DocumentTitleSplitter(AnnotatorModel):
    """Annotator that groups element-level documents into title-aware sections.

    ``DocumentTitleSplitter`` is intended to work with element-level ``DOCUMENT``
    annotations, such as those produced by
    ``Reader2Doc().setOutputAsDocument(False)``. Whenever an input annotation has
    ``metadata["elementType"] == "Title"``, it starts a new semantic section and
    the title stays with the following content.

    Optionally, oversized sections can be split by character length after the
    semantic grouping phase.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    joinString
        String used to join element texts inside a section, by default ``" "``.
    splitOnPageChange
        Whether to start a new section when page number changes, by default
        ``False``.
    enableOverflowSplitting
        Whether to split oversized sections after title grouping, by default
        ``False``.
    maxCharacters
        Maximum size of an overflow-split chunk, by default ``500``.
    explodeSplits
        Whether to explode split chunks to separate rows, by default ``False``.

    Notes
    -----
    This Python class only declares the parameters, their defaults and their
    setters. The grouping/splitting logic itself is not implemented here; it
    is presumably provided by the class named in ``__init__`` (to be confirmed
    against the parent ``AnnotatorModel`` implementation).
    """

    # Annotation types consumed and produced by this annotator.
    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
    outputAnnotatorType = AnnotatorType.DOCUMENT

    # Parameter declarations. Each one pairs a name and description with a
    # type converter that is applied to values assigned to the parameter.
    joinString = Param(
        Params._dummy(),
        "joinString",
        "String used to join element texts inside a section",
        typeConverter=TypeConverters.toString,
    )
    splitOnPageChange = Param(
        Params._dummy(),
        "splitOnPageChange",
        "Whether to start a new section when page number changes",
        typeConverter=TypeConverters.toBoolean,
    )
    enableOverflowSplitting = Param(
        Params._dummy(),
        "enableOverflowSplitting",
        "Whether to split oversized sections after title grouping",
        typeConverter=TypeConverters.toBoolean,
    )
    maxCharacters = Param(
        Params._dummy(),
        "maxCharacters",
        "Maximum size of an overflow-split chunk",
        typeConverter=TypeConverters.toInt,
    )
    explodeSplits = Param(
        Params._dummy(),
        "explodeSplits",
        "Whether to explode split chunks to separate rows",
        typeConverter=TypeConverters.toBoolean,
    )

    @keyword_only
    def __init__(self):
        """Creates the annotator and registers the default parameter values.

        The defaults registered here are the ones listed in the class
        docstring: ``joinString=" "``, ``splitOnPageChange=False``,
        ``enableOverflowSplitting=False``, ``maxCharacters=500`` and
        ``explodeSplits=False``.
        """
        # NOTE(review): ``classname`` looks like the fully qualified name of
        # the backing JVM implementation -- confirm against AnnotatorModel.
        super(DocumentTitleSplitter, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.DocumentTitleSplitter"
        )
        self._setDefault(
            joinString=" ",
            splitOnPageChange=False,
            enableOverflowSplitting=False,
            maxCharacters=500,
            explodeSplits=False,
        )

    def setJoinString(self, value):
        """Sets the string used to join element texts inside a section.

        Parameters
        ----------
        value : str
            Join string used between element texts

        Returns
        -------
        object
            Whatever ``self._set`` returns (presumably this annotator, so
            that setter calls can be chained -- confirm against the parent
            class).
        """
        return self._set(joinString=value)

    def setSplitOnPageChange(self, value):
        """Sets whether to start a new section when page number changes.

        Parameters
        ----------
        value : bool
            Whether to start a new section when page number changes

        Returns
        -------
        object
            Whatever ``self._set`` returns (presumably this annotator, so
            that setter calls can be chained -- confirm against the parent
            class).
        """
        return self._set(splitOnPageChange=value)

    def setEnableOverflowSplitting(self, value):
        """Sets whether to split oversized sections after title grouping.

        Parameters
        ----------
        value : bool
            Whether to split oversized sections after title grouping

        Returns
        -------
        object
            Whatever ``self._set`` returns (presumably this annotator, so
            that setter calls can be chained -- confirm against the parent
            class).
        """
        return self._set(enableOverflowSplitting=value)

    def setMaxCharacters(self, value):
        """Sets the maximum size of an overflow-split chunk.

        Parameters
        ----------
        value : int
            Maximum size of an overflow-split chunk; must be at least 1

        Returns
        -------
        object
            Whatever ``self._set`` returns (presumably this annotator, so
            that setter calls can be chained -- confirm against the parent
            class).

        Raises
        ------
        ValueError
            If ``value`` is smaller than 1.
        """
        # Reject non-positive sizes up front, before the value is stored.
        if value < 1:
            raise ValueError("maxCharacters should be larger than 0.")
        return self._set(maxCharacters=value)

    def setExplodeSplits(self, value):
        """Sets whether to explode split chunks to separate rows.

        Parameters
        ----------
        value : bool
            Whether to explode split chunks to separate rows

        Returns
        -------
        object
            Whatever ``self._set`` returns (presumably this annotator, so
            that setter calls can be chained -- confirm against the parent
            class).
        """
        return self._set(explodeSplits=value)