# Source code for sparknlp.reader.layout_aligner_for_text
# Copyright 2017-2026 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pyspark import keyword_only
from pyspark.ml.param import Param, Params, TypeConverters
from sparknlp.common import AnnotatorType, AnnotatorProperties
from sparknlp.internal import AnnotatorTransformer
class LayoutAlignerForText(AnnotatorTransformer, AnnotatorProperties):
    """Rebuilds final text by combining aligned document chunks and image captions.

    This transformer is designed to consume ``aligned_doc`` + ``image_caption`` pairs and produce
    coherent text output with re-computed ``begin`` and ``end`` indexes.

    ======================= ======================
    Input Annotation types  Output Annotation type
    ======================= ======================
    ``DOCUMENT, DOCUMENT``  ``DOCUMENT``
    ======================= ======================

    Parameters
    ----------
    joinDelimiter
        Delimiter used to join rebuilt text segments, a single newline by
        default.
    inlinePrefixThreshold
        Inline images with x <= threshold are inserted before paragraph text,
        by default 10.
    explodeElements
        Whether to emit one output row per aligned text element, by default
        False.

    Notes
    -----
    The output column defaults to ``aligned_text``. The transformation itself
    is delegated to the JVM class
    ``com.johnsnowlabs.reader.LayoutAlignerForText``; this Python class only
    declares and forwards the parameters.
    """

    name = "LayoutAlignerForText"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    joinDelimiter = Param(
        Params._dummy(),
        "joinDelimiter",
        "Delimiter used to join rebuilt text segments.",
        typeConverter=TypeConverters.toString,
    )

    inlinePrefixThreshold = Param(
        Params._dummy(),
        "inlinePrefixThreshold",
        "Inline images with x <= threshold are inserted before paragraph text.",
        typeConverter=TypeConverters.toInt,
    )

    explodeElements = Param(
        Params._dummy(),
        "explodeElements",
        "Whether to emit one output row per aligned text element.",
        typeConverter=TypeConverters.toBoolean,
    )

    @keyword_only
    def __init__(self):
        """Creates the transformer bound to its JVM class and registers defaults."""
        super(LayoutAlignerForText, self).__init__(
            classname="com.johnsnowlabs.reader.LayoutAlignerForText"
        )
        # Defaults are registered here rather than set explicitly, so they can
        # still be overridden through the setters below.
        self._setDefault(
            outputCol="aligned_text",
            joinDelimiter="\n",
            inlinePrefixThreshold=10,
            explodeElements=False,
        )

    @keyword_only
    def setParams(self):
        """Applies the keyword arguments captured by ``keyword_only``.

        Returns
        -------
        The result of ``self._set`` (in PySpark this is the instance itself,
        which allows call chaining).
        """
        # ``keyword_only`` stores the call's keyword arguments on the instance
        # as ``_input_kwargs`` before invoking this method.
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setJoinDelimiter(self, value):
        """Sets the delimiter used to join rebuilt text segments.

        Parameters
        ----------
        value : str
            Delimiter placed between rebuilt text segments.
        """
        return self._set(joinDelimiter=value)

    def setInlinePrefixThreshold(self, value):
        """Sets the threshold deciding where inline images are inserted.

        Inline images with x <= threshold are inserted before paragraph text.

        Parameters
        ----------
        value : int
            Threshold compared against the inline image's x value.
        """
        return self._set(inlinePrefixThreshold=value)

    def setExplodeElements(self, value):
        """Sets whether to emit one output row per aligned text element.

        Parameters
        ----------
        value : bool
            True to emit one output row per aligned text element.
        """
        return self._set(explodeElements=value)