Source code for sparknlp.annotator.document_title_splitter
# Copyright 2017-2026 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for the DocumentTitleSplitter"""
from sparknlp.common import *
class DocumentTitleSplitter(AnnotatorModel):
    """Annotator that groups element-level documents into title-aware sections.

    ``DocumentTitleSplitter`` is intended to work with element-level
    ``DOCUMENT`` annotations, such as those produced by
    ``Reader2Doc().setOutputAsDocument(False)``. Whenever an input annotation
    has ``metadata["elementType"] == "Title"``, it starts a new semantic
    section and the title stays with the following content.

    Optionally, oversized sections can be split by character length after the
    semantic grouping phase.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    joinString
        String used to join element texts inside a section, by default
        ``" "``.
    splitOnPageChange
        Whether to start a new section when page number changes, by default
        ``False``.
    enableOverflowSplitting
        Whether to split oversized sections after title grouping, by default
        ``False``.
    maxCharacters
        Maximum size of an overflow-split chunk, by default ``500``.
    explodeSplits
        Whether to explode split chunks to separate rows, by default
        ``False``.
    """

    # Declares the single DOCUMENT input documented in the table above.
    # NOTE(review): the Spark NLP base class is expected to check the number
    # of columns passed to setInputCols against this list -- confirm against
    # sparknlp.common.AnnotatorProperties.
    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    joinString = Param(
        Params._dummy(),
        "joinString",
        "String used to join element texts inside a section",
        typeConverter=TypeConverters.toString,
    )

    splitOnPageChange = Param(
        Params._dummy(),
        "splitOnPageChange",
        "Whether to start a new section when page number changes",
        typeConverter=TypeConverters.toBoolean,
    )

    enableOverflowSplitting = Param(
        Params._dummy(),
        "enableOverflowSplitting",
        "Whether to split oversized sections after title grouping",
        typeConverter=TypeConverters.toBoolean,
    )

    maxCharacters = Param(
        Params._dummy(),
        "maxCharacters",
        "Maximum size of an overflow-split chunk",
        typeConverter=TypeConverters.toInt,
    )

    explodeSplits = Param(
        Params._dummy(),
        "explodeSplits",
        "Whether to explode split chunks to separate rows",
        typeConverter=TypeConverters.toBoolean,
    )

    @keyword_only
    def __init__(self):
        """Creates the annotator and registers the default parameter values."""
        super(DocumentTitleSplitter, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.DocumentTitleSplitter"
        )
        # Keep these defaults in sync with the values stated in the class
        # docstring.
        self._setDefault(
            joinString=" ",
            splitOnPageChange=False,
            enableOverflowSplitting=False,
            maxCharacters=500,
            explodeSplits=False,
        )

    def setJoinString(self, value):
        """Sets the string used to join element texts inside a section.

        Parameters
        ----------
        value : str
            Join string used between element texts

        Returns
        -------
        The result of ``self._set(joinString=value)``.
        """
        return self._set(joinString=value)

    def setSplitOnPageChange(self, value):
        """Sets whether to start a new section when page number changes.

        Parameters
        ----------
        value : bool
            Whether to start a new section when page number changes

        Returns
        -------
        The result of ``self._set(splitOnPageChange=value)``.
        """
        return self._set(splitOnPageChange=value)

    def setEnableOverflowSplitting(self, value):
        """Sets whether to split oversized sections after title grouping.

        Parameters
        ----------
        value : bool
            Whether to split oversized sections after title grouping

        Returns
        -------
        The result of ``self._set(enableOverflowSplitting=value)``.
        """
        return self._set(enableOverflowSplitting=value)

    def setMaxCharacters(self, value):
        """Sets the maximum size of an overflow-split chunk.

        Parameters
        ----------
        value : int
            Maximum size of an overflow-split chunk, must be at least 1

        Returns
        -------
        The result of ``self._set(maxCharacters=value)``.

        Raises
        ------
        ValueError
            If ``value`` is smaller than 1.
        """
        # Reject non-positive sizes here, before the value is stored.
        if value < 1:
            raise ValueError("maxCharacters should be larger than 0.")
        return self._set(maxCharacters=value)

    def setExplodeSplits(self, value):
        """Sets whether to explode split chunks to separate rows.

        Parameters
        ----------
        value : bool
            Whether to explode split chunks to separate rows

        Returns
        -------
        The result of ``self._set(explodeSplits=value)``.
        """
        return self._set(explodeSplits=value)