# Source code for sparknlp.reader.layout_aligner_for_vision

#  Copyright 2017-2025 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

from pyspark import keyword_only
from pyspark.ml.param import Param, Params, TypeConverters

from sparknlp.common import AnnotatorType, AnnotatorProperties
from sparknlp.internal import AnnotatorTransformer


class LayoutAlignerForVision(AnnotatorTransformer, AnnotatorProperties):
    """Aligns document chunks with nearby images and emits paired outputs.

    The output is written to three derived columns based on ``outputCol``:
    ``<outputCol>_doc``, ``<outputCol>_image``, and ``<outputCol>_prompt``.

    ======================= ======================
    Input Annotation types  Output Annotation type
    ======================= ======================
    ``DOCUMENT, IMAGE``     ``DOCUMENT``
    ======================= ======================

    Parameters
    ----------
    maxDistance : int
        Maximum vertical distance (px) to align image with paragraph,
        by default 40.
    paragraphSpacingY : int
        Vertical spacing heuristic used during parsing, by default 25.
    includeContextWindow : bool
        Include paragraph +/-1 as context for floating images, by default
        True.
    confidenceThreshold : float
        Minimum confidence required to emit alignment, by default 0.0.
    explodeDocs : bool
        Whether to explode aligned doc/image pairs into separate rows, by
        default True.
    mergeImagesPerChunk : bool
        When true, keep one primary image per paragraph and store all
        matches in doc metadata, by default False.
    addNeighborText : bool
        When true, include aligned text in the prompt output, by default
        False.
    imageCaptionBasePrompt : str
        Base prompt used for captioning aligned images.
    neighborTextCharsWindow : int
        When > 0, include this many characters before and after aligned
        text in prompt context, by default 0.

    Notes
    -----
    ``outputCol`` defaults to ``"aligned"``.
    """

    name = "LayoutAlignerForVision"

    # Annotation types consumed from the input columns and produced on output.
    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.IMAGE]
    outputAnnotatorType = AnnotatorType.DOCUMENT

    maxDistance = Param(
        Params._dummy(),
        "maxDistance",
        "Maximum vertical distance (px) to align image with paragraph.",
        typeConverter=TypeConverters.toInt,
    )

    paragraphSpacingY = Param(
        Params._dummy(),
        "paragraphSpacingY",
        "Vertical spacing heuristic used during parsing.",
        typeConverter=TypeConverters.toInt,
    )

    includeContextWindow = Param(
        Params._dummy(),
        "includeContextWindow",
        "Include paragraph +/-1 as context for floating images.",
        typeConverter=TypeConverters.toBoolean,
    )

    confidenceThreshold = Param(
        Params._dummy(),
        "confidenceThreshold",
        "Minimum confidence required to emit alignment.",
        typeConverter=TypeConverters.toFloat,
    )

    explodeDocs = Param(
        Params._dummy(),
        "explodeDocs",
        "Whether to explode aligned doc/image pairs into separate rows.",
        typeConverter=TypeConverters.toBoolean,
    )

    mergeImagesPerChunk = Param(
        Params._dummy(),
        "mergeImagesPerChunk",
        "When true, keep one primary image per paragraph and store all matches in doc metadata.",
        typeConverter=TypeConverters.toBoolean,
    )

    addNeighborText = Param(
        Params._dummy(),
        "addNeighborText",
        "When true, include aligned text in the prompt output.",
        typeConverter=TypeConverters.toBoolean,
    )

    imageCaptionBasePrompt = Param(
        Params._dummy(),
        "imageCaptionBasePrompt",
        "Base prompt used for captioning aligned images.",
        typeConverter=TypeConverters.toString,
    )

    neighborTextCharsWindow = Param(
        Params._dummy(),
        "neighborTextCharsWindow",
        "When > 0, include this many characters before and after aligned text in prompt context.",
        typeConverter=TypeConverters.toInt,
    )

    @keyword_only
    def __init__(self):
        """Initializes the annotator and registers its default parameter values.

        The parent constructor is called with the class name
        ``com.johnsnowlabs.reader.LayoutAlignerForVision``; afterwards the
        defaults listed in the class docstring are set via ``_setDefault``.
        """
        super(LayoutAlignerForVision, self).__init__(
            classname="com.johnsnowlabs.reader.LayoutAlignerForVision"
        )
        self._setDefault(
            outputCol="aligned",
            maxDistance=40,
            paragraphSpacingY=25,
            includeContextWindow=True,
            confidenceThreshold=0.0,
            explodeDocs=True,
            mergeImagesPerChunk=False,
            addNeighborText=False,
            imageCaptionBasePrompt="Describe in a short and easy to understand sentence what you see in the image",
            neighborTextCharsWindow=0,
        )

    @keyword_only
    def setParams(self):
        """Applies the keyword arguments captured in ``_input_kwargs``.

        Returns
        -------
        The value returned by ``self._set``.
        """
        # NOTE(review): the signature declares no keyword parameters, so this
        # presumably only ever forwards an empty mapping - confirm against
        # the ``keyword_only`` decorator and the parent constructor.
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setMaxDistance(self, value):
        """Sets the maximum vertical distance (px) to align image with paragraph.

        Parameters
        ----------
        value : int
            Maximum vertical distance in pixels.

        Returns
        -------
        The value returned by ``self._set``.
        """
        return self._set(maxDistance=value)

    def setParagraphSpacingY(self, value):
        """Sets the vertical spacing heuristic used during parsing.

        Parameters
        ----------
        value : int
            Vertical spacing heuristic.

        Returns
        -------
        The value returned by ``self._set``.
        """
        return self._set(paragraphSpacingY=value)

    def setIncludeContextWindow(self, value):
        """Sets whether to include paragraph +/-1 as context for floating images.

        Parameters
        ----------
        value : bool
            True to include the context window.

        Returns
        -------
        The value returned by ``self._set``.
        """
        return self._set(includeContextWindow=value)

    def setConfidenceThreshold(self, value):
        """Sets the minimum confidence required to emit alignment.

        Parameters
        ----------
        value : float
            Minimum confidence.

        Returns
        -------
        The value returned by ``self._set``.
        """
        return self._set(confidenceThreshold=value)

    def setExplodeDocs(self, value):
        """Sets whether to explode aligned doc/image pairs into separate rows.

        Parameters
        ----------
        value : bool
            True to explode the aligned pairs.

        Returns
        -------
        The value returned by ``self._set``.
        """
        return self._set(explodeDocs=value)

    def setMergeImagesPerChunk(self, value):
        """Sets whether to keep one primary image per paragraph.

        When true, one primary image is kept per paragraph and all matches
        are stored in doc metadata.

        Parameters
        ----------
        value : bool
            True to merge images per chunk.

        Returns
        -------
        The value returned by ``self._set``.
        """
        return self._set(mergeImagesPerChunk=value)

    def setAddNeighborText(self, value):
        """Sets whether to include aligned text in the prompt output.

        Parameters
        ----------
        value : bool
            True to include the aligned text.

        Returns
        -------
        The value returned by ``self._set``.
        """
        return self._set(addNeighborText=value)

    def setImageCaptionBasePrompt(self, value):
        """Sets the base prompt used for captioning aligned images.

        Parameters
        ----------
        value : str
            Base prompt text.

        Returns
        -------
        The value returned by ``self._set``.
        """
        return self._set(imageCaptionBasePrompt=value)

    def setNeighborTextCharsWindow(self, value):
        """Sets the character window of neighbor text added to the prompt context.

        When > 0, this many characters before and after the aligned text are
        included in the prompt context.

        Parameters
        ----------
        value : int
            Number of characters before and after the aligned text.

        Returns
        -------
        The value returned by ``self._set``.
        """
        return self._set(neighborTextCharsWindow=value)