Source code for sparknlp.reader.layout_aligner_for_vision

#  Copyright 2017-2025 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

from pyspark import keyword_only
from pyspark.ml.param import Param, Params, TypeConverters

from sparknlp.common import AnnotatorType, AnnotatorProperties
from sparknlp.internal import AnnotatorTransformer


class LayoutAlignerForVision(AnnotatorTransformer, AnnotatorProperties):
    """Aligns document chunks with nearby images and emits paired outputs.

    The output is written to three derived columns based on ``outputCol``:
    ``<outputCol>_doc``, ``<outputCol>_image``, and ``<outputCol>_prompt``.

    ======================= ======================
    Input Annotation types  Output Annotation type
    ======================= ======================
    ``DOCUMENT, IMAGE``     ``DOCUMENT``
    ======================= ======================

    Parameters
    ----------
    maxDistance
        Maximum vertical distance (px) to align image with paragraph,
        by default 40
    paragraphSpacingY
        Vertical spacing heuristic used during parsing, by default 25
    includeContextWindow
        Include paragraph +/-1 as context for floating images, by default
        True
    confidenceThreshold
        Minimum confidence required to emit alignment, by default 0.0
    explodeDocs
        Whether to explode aligned doc/image pairs into separate rows, by
        default True
    mergeImagesPerChunk
        When true, keep one primary image per paragraph and store all
        matches in doc metadata, by default False
    addNeighborText
        When true, include aligned text in the prompt output, by default
        False
    imageCaptionBasePrompt
        Base prompt used for captioning aligned images
    neighborTextCharsWindow
        When > 0, include this many characters before and after aligned
        text in prompt context, by default 0
    """

    name = "LayoutAlignerForVision"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.IMAGE]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    # --- Param declarations -------------------------------------------------
    # Each Param is declared against the dummy parent; defaults are assigned
    # in ``__init__`` via ``_setDefault``.

    maxDistance = Param(
        Params._dummy(),
        "maxDistance",
        "Maximum vertical distance (px) to align image with paragraph.",
        typeConverter=TypeConverters.toInt,
    )

    paragraphSpacingY = Param(
        Params._dummy(),
        "paragraphSpacingY",
        "Vertical spacing heuristic used during parsing.",
        typeConverter=TypeConverters.toInt,
    )

    includeContextWindow = Param(
        Params._dummy(),
        "includeContextWindow",
        "Include paragraph +/-1 as context for floating images.",
        typeConverter=TypeConverters.toBoolean,
    )

    confidenceThreshold = Param(
        Params._dummy(),
        "confidenceThreshold",
        "Minimum confidence required to emit alignment.",
        typeConverter=TypeConverters.toFloat,
    )

    explodeDocs = Param(
        Params._dummy(),
        "explodeDocs",
        "Whether to explode aligned doc/image pairs into separate rows.",
        typeConverter=TypeConverters.toBoolean,
    )

    mergeImagesPerChunk = Param(
        Params._dummy(),
        "mergeImagesPerChunk",
        "When true, keep one primary image per paragraph and store all matches in doc metadata.",
        typeConverter=TypeConverters.toBoolean,
    )

    addNeighborText = Param(
        Params._dummy(),
        "addNeighborText",
        "When true, include aligned text in the prompt output.",
        typeConverter=TypeConverters.toBoolean,
    )

    imageCaptionBasePrompt = Param(
        Params._dummy(),
        "imageCaptionBasePrompt",
        "Base prompt used for captioning aligned images.",
        typeConverter=TypeConverters.toString,
    )

    neighborTextCharsWindow = Param(
        Params._dummy(),
        "neighborTextCharsWindow",
        "When > 0, include this many characters before and after aligned text in prompt context.",
        typeConverter=TypeConverters.toInt,
    )

    @keyword_only
    def __init__(self):
        """Create the transformer and register the default parameter values."""
        super().__init__(classname="com.johnsnowlabs.reader.LayoutAlignerForVision")
        # Collected in one place so the defaults can be read at a glance.
        defaults = {
            "outputCol": "aligned",
            "maxDistance": 40,
            "paragraphSpacingY": 25,
            "includeContextWindow": True,
            "confidenceThreshold": 0.0,
            "explodeDocs": True,
            "mergeImagesPerChunk": False,
            "addNeighborText": False,
            "imageCaptionBasePrompt": "Describe in a short and easy to understand sentence what you see in the image",
            "neighborTextCharsWindow": 0,
        }
        self._setDefault(**defaults)

    @keyword_only
    def setParams(self):
        """Apply the keyword arguments captured by ``keyword_only``.

        Returns
        -------
        The value returned by ``_set``.
        """
        # ``keyword_only`` stores the call's keyword arguments on the
        # instance as ``_input_kwargs`` before invoking this method.
        return self._set(**self._input_kwargs)

    def setMaxDistance(self, value):
        """Sets the maximum vertical distance (px) to align image with
        paragraph.

        Parameters
        ----------
        value : int
            Maximum vertical distance in pixels
        """
        return self._set(maxDistance=value)

    def setParagraphSpacingY(self, value):
        """Sets the vertical spacing heuristic used during parsing.

        Parameters
        ----------
        value : int
            Vertical spacing heuristic
        """
        return self._set(paragraphSpacingY=value)

    def setIncludeContextWindow(self, value):
        """Sets whether to include paragraph +/-1 as context for floating
        images.

        Parameters
        ----------
        value : bool
            Whether to include the context window
        """
        return self._set(includeContextWindow=value)

    def setConfidenceThreshold(self, value):
        """Sets the minimum confidence required to emit alignment.

        Parameters
        ----------
        value : float
            Minimum confidence
        """
        return self._set(confidenceThreshold=value)

    def setExplodeDocs(self, value):
        """Sets whether to explode aligned doc/image pairs into separate
        rows.

        Parameters
        ----------
        value : bool
            Whether to explode aligned pairs
        """
        return self._set(explodeDocs=value)

    def setMergeImagesPerChunk(self, value):
        """Sets whether to keep one primary image per paragraph and store
        all matches in doc metadata.

        Parameters
        ----------
        value : bool
            Whether to merge images per chunk
        """
        return self._set(mergeImagesPerChunk=value)

    def setAddNeighborText(self, value):
        """Sets whether to include aligned text in the prompt output.

        Parameters
        ----------
        value : bool
            Whether to add neighbor text
        """
        return self._set(addNeighborText=value)

    def setImageCaptionBasePrompt(self, value):
        """Sets the base prompt used for captioning aligned images.

        Parameters
        ----------
        value : str
            Base prompt
        """
        return self._set(imageCaptionBasePrompt=value)

    def setNeighborTextCharsWindow(self, value):
        """Sets how many characters before and after aligned text to include
        in prompt context (applies when > 0).

        Parameters
        ----------
        value : int
            Number of characters of surrounding text
        """
        return self._set(neighborTextCharsWindow=value)