# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pyspark import keyword_only
from pyspark.ml.param import Param, Params, TypeConverters
from sparknlp.common import AnnotatorType, AnnotatorProperties
from sparknlp.internal import AnnotatorTransformer
class LayoutAlignerForVision(AnnotatorTransformer, AnnotatorProperties):
    """Aligns document chunks with nearby images and emits paired outputs.

    The output is written to three derived columns based on ``outputCol``:
    ``<outputCol>_doc``, ``<outputCol>_image``, and ``<outputCol>_prompt``.

    ======================= ======================
    Input Annotation types  Output Annotation type
    ======================= ======================
    ``DOCUMENT, IMAGE``     ``DOCUMENT``
    ======================= ======================

    Parameters
    ----------
    maxDistance
        Maximum vertical distance (px) to align image with paragraph,
        by default 40.
    paragraphSpacingY
        Vertical spacing heuristic used during parsing, by default 25.
    includeContextWindow
        Include paragraph +/-1 as context for floating images, by default
        True.
    confidenceThreshold
        Minimum confidence required to emit alignment, by default 0.0.
    explodeDocs
        Whether to explode aligned doc/image pairs into separate rows, by
        default True.
    mergeImagesPerChunk
        When true, keep one primary image per paragraph and store all
        matches in doc metadata, by default False.
    addNeighborText
        When true, include aligned text in the prompt output, by default
        False.
    imageCaptionBasePrompt
        Base prompt used for captioning aligned images.
    neighborTextCharsWindow
        When > 0, include this many characters before and after aligned
        text in prompt context, by default 0.
    """

    name = "LayoutAlignerForVision"

    outputAnnotatorType = AnnotatorType.DOCUMENT

    maxDistance = Param(
        Params._dummy(),
        "maxDistance",
        "Maximum vertical distance (px) to align image with paragraph.",
        typeConverter=TypeConverters.toInt,
    )

    paragraphSpacingY = Param(
        Params._dummy(),
        "paragraphSpacingY",
        "Vertical spacing heuristic used during parsing.",
        typeConverter=TypeConverters.toInt,
    )

    includeContextWindow = Param(
        Params._dummy(),
        "includeContextWindow",
        "Include paragraph +/-1 as context for floating images.",
        typeConverter=TypeConverters.toBoolean,
    )

    confidenceThreshold = Param(
        Params._dummy(),
        "confidenceThreshold",
        "Minimum confidence required to emit alignment.",
        typeConverter=TypeConverters.toFloat,
    )

    explodeDocs = Param(
        Params._dummy(),
        "explodeDocs",
        "Whether to explode aligned doc/image pairs into separate rows.",
        typeConverter=TypeConverters.toBoolean,
    )

    mergeImagesPerChunk = Param(
        Params._dummy(),
        "mergeImagesPerChunk",
        "When true, keep one primary image per paragraph and store all matches in doc metadata.",
        typeConverter=TypeConverters.toBoolean,
    )

    addNeighborText = Param(
        Params._dummy(),
        "addNeighborText",
        "When true, include aligned text in the prompt output.",
        typeConverter=TypeConverters.toBoolean,
    )

    imageCaptionBasePrompt = Param(
        Params._dummy(),
        "imageCaptionBasePrompt",
        "Base prompt used for captioning aligned images.",
        typeConverter=TypeConverters.toString,
    )

    neighborTextCharsWindow = Param(
        Params._dummy(),
        "neighborTextCharsWindow",
        "When > 0, include this many characters before and after aligned text in prompt context.",
        typeConverter=TypeConverters.toInt,
    )

    @keyword_only
    def __init__(self):
        """Creates the annotator and registers the default parameter values."""
        # The fully qualified class name is handed to the base transformer;
        # presumably it identifies the backing JVM implementation.
        super(LayoutAlignerForVision, self).__init__(
            classname="com.johnsnowlabs.reader.LayoutAlignerForVision"
        )
        self._setDefault(
            outputCol="aligned",
            maxDistance=40,
            paragraphSpacingY=25,
            includeContextWindow=True,
            confidenceThreshold=0.0,
            explodeDocs=True,
            mergeImagesPerChunk=False,
            addNeighborText=False,
            imageCaptionBasePrompt="Describe in a short and easy to understand sentence what you see in the image",
            neighborTextCharsWindow=0,
        )

    @keyword_only
    def setParams(self, **kwargs):
        """Sets several parameters at once from keyword arguments.

        Parameters
        ----------
        **kwargs
            Parameter names mapped to the values to set, e.g.
            ``maxDistance=60``. Positional arguments are rejected by the
            ``keyword_only`` decorator.

        Returns
        -------
        The result of applying the values with ``_set``.
        """
        # ``keyword_only`` forwards the caller's keyword arguments to this
        # method, so the signature has to accept them; it also stores them
        # on the instance as ``_input_kwargs``, which is what gets applied.
        return self._set(**self._input_kwargs)

    def setMaxDistance(self, value):
        """Sets the maximum vertical distance (px) to align image with
        paragraph.

        Parameters
        ----------
        value : int
            Maximum vertical distance in pixels.
        """
        return self._set(maxDistance=value)

    def setParagraphSpacingY(self, value):
        """Sets the vertical spacing heuristic used during parsing.

        Parameters
        ----------
        value : int
            Vertical spacing heuristic.
        """
        return self._set(paragraphSpacingY=value)

    def setIncludeContextWindow(self, value):
        """Sets whether to include paragraph +/-1 as context for floating
        images.

        Parameters
        ----------
        value : bool
            Whether to include the context window.
        """
        return self._set(includeContextWindow=value)

    def setConfidenceThreshold(self, value):
        """Sets the minimum confidence required to emit alignment.

        Parameters
        ----------
        value : float
            Minimum confidence.
        """
        return self._set(confidenceThreshold=value)

    def setExplodeDocs(self, value):
        """Sets whether to explode aligned doc/image pairs into separate
        rows.

        Parameters
        ----------
        value : bool
            Whether to explode the aligned pairs.
        """
        return self._set(explodeDocs=value)

    def setMergeImagesPerChunk(self, value):
        """Sets whether to keep one primary image per paragraph and store
        all matches in doc metadata.

        Parameters
        ----------
        value : bool
            Whether to merge images per chunk.
        """
        return self._set(mergeImagesPerChunk=value)

    def setAddNeighborText(self, value):
        """Sets whether to include aligned text in the prompt output.

        Parameters
        ----------
        value : bool
            Whether to add the neighbor text.
        """
        return self._set(addNeighborText=value)

    def setImageCaptionBasePrompt(self, value):
        """Sets the base prompt used for captioning aligned images.

        Parameters
        ----------
        value : str
            Base prompt text.
        """
        return self._set(imageCaptionBasePrompt=value)

    def setNeighborTextCharsWindow(self, value):
        """Sets how many characters before and after aligned text are
        included in prompt context (applies when > 0).

        Parameters
        ----------
        value : int
            Number of characters of surrounding text.
        """
        return self._set(neighborTextCharsWindow=value)