# Source code for sparknlp.reader.layout_aligner_for_text

#  Copyright 2017-2026 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

from pyspark import keyword_only
from pyspark.ml.param import Param, Params, TypeConverters

from sparknlp.common import AnnotatorType, AnnotatorProperties
from sparknlp.internal import AnnotatorTransformer


class LayoutAlignerForText(AnnotatorTransformer, AnnotatorProperties):
    """Rebuilds final text by combining aligned document chunks and image captions.

    This transformer is designed to consume ``aligned_doc`` + ``image_caption``
    pairs and produce coherent text output with re-computed ``begin`` and
    ``end`` indexes.

    ======================= ======================
    Input Annotation types  Output Annotation type
    ======================= ======================
    ``DOCUMENT, DOCUMENT``  ``DOCUMENT``
    ======================= ======================

    Parameters
    ----------
    joinDelimiter
        Delimiter used to join rebuilt text segments. Defaults to a single
        newline character.
    inlinePrefixThreshold
        Inline images with x <= threshold are inserted before paragraph text.
        Defaults to 10.
    explodeElements
        Whether to emit one output row per aligned text element. Defaults to
        False.

    Notes
    -----
    The output column defaults to ``aligned_text``.
    """

    name = "LayoutAlignerForText"

    # Two DOCUMENT inputs are consumed and a single DOCUMENT column is produced.
    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
    outputAnnotatorType = AnnotatorType.DOCUMENT

    # String placed between the rebuilt text segments.
    joinDelimiter = Param(
        Params._dummy(),
        "joinDelimiter",
        "Delimiter used to join rebuilt text segments.",
        typeConverter=TypeConverters.toString,
    )

    # Integer cut-off on an inline image's x value (see the Param description).
    inlinePrefixThreshold = Param(
        Params._dummy(),
        "inlinePrefixThreshold",
        "Inline images with x <= threshold are inserted before paragraph text.",
        typeConverter=TypeConverters.toInt,
    )

    # Boolean switch for one-row-per-element output.
    explodeElements = Param(
        Params._dummy(),
        "explodeElements",
        "Whether to emit one output row per aligned text element.",
        typeConverter=TypeConverters.toBoolean,
    )

    @keyword_only
    def __init__(self):
        """Initializes the annotator and registers its default parameter values.

        The parent initializer receives the backing class name
        ``com.johnsnowlabs.reader.LayoutAlignerForText``; afterwards the
        defaults for ``outputCol``, ``joinDelimiter``,
        ``inlinePrefixThreshold`` and ``explodeElements`` are set.
        """
        super().__init__(classname="com.johnsnowlabs.reader.LayoutAlignerForText")
        self._setDefault(
            outputCol="aligned_text",
            joinDelimiter="\n",
            inlinePrefixThreshold=10,
            explodeElements=False,
        )

    @keyword_only
    def setParams(self):
        """Applies the keyword arguments captured by ``keyword_only``.

        The decorator stores the call's keyword arguments on
        ``self._input_kwargs``; they are forwarded unchanged to ``_set``.

        Returns
        -------
        The result of ``_set`` (in PySpark this is the instance itself).
        """
        return self._set(**self._input_kwargs)

    def setJoinDelimiter(self, value):
        """Sets the delimiter used to join rebuilt text segments.

        Parameters
        ----------
        value : str
            Delimiter to use between text segments.

        Returns
        -------
        The result of ``_set`` (in PySpark this is the instance itself).
        """
        return self._set(joinDelimiter=value)

    def setInlinePrefixThreshold(self, value):
        """Sets the threshold below which inline images precede paragraph text.

        Parameters
        ----------
        value : int
            Inline images with x <= this value are inserted before the
            paragraph text.

        Returns
        -------
        The result of ``_set`` (in PySpark this is the instance itself).
        """
        return self._set(inlinePrefixThreshold=value)

    def setExplodeElements(self, value):
        """Sets whether to emit one output row per aligned text element.

        Parameters
        ----------
        value : bool
            True to emit one output row per aligned text element.

        Returns
        -------
        The result of ``_set`` (in PySpark this is the instance itself).
        """
        return self._set(explodeElements=value)