Source code for sparknlp.base.multi_column_assembler

#  Copyright 2017-2026 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the MultiColumnAssembler."""

from pyspark import keyword_only
from pyspark.ml.param import TypeConverters, Params, Param

from sparknlp.internal import AnnotatorTransformer
from sparknlp.common import AnnotatorProperties, AnnotatorType


class MultiColumnAssembler(AnnotatorTransformer, AnnotatorProperties):
    """Merges multiple annotation columns into a single annotation column.

    This is useful when multiple annotators produce separate annotation columns
    (e.g., ``document_text``, ``document_table`` from ``ReaderAssembler``) and a
    downstream annotator (e.g., ``AutoGGUFVisionModel``) expects a single input
    column containing all annotations.

    Annotations from all input columns are collected and concatenated into the
    output column. The output annotator type defaults to ``DOCUMENT`` but can be
    configured. Each annotation's metadata is preserved, and a ``source_column``
    key is added to track the original column name.

    **Note:** All input columns must use the ``Annotation`` schema. Columns using
    ``AnnotationImage`` schema (e.g., IMAGE-typed columns from ``ReaderAssembler``)
    are not supported.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    inputCols
        Input annotation columns to merge
    outputCol
        Output annotation column name
    outputAsAnnotatorType
        The annotator type to use for the output annotations (Default: ``document``)
    sortByBegin
        Whether to sort merged annotations by their begin position (Default: ``False``)

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler1 = DocumentAssembler() \\
    ...     .setInputCol("text1") \\
    ...     .setOutputCol("document_text")
    >>> documentAssembler2 = DocumentAssembler() \\
    ...     .setInputCol("text2") \\
    ...     .setOutputCol("document_table")
    >>> multiColumnAssembler = MultiColumnAssembler() \\
    ...     .setInputCols(["document_text", "document_table"]) \\
    ...     .setOutputCol("merged_document")
    >>> data = spark.createDataFrame([("Hello world", "Name | Age")]).toDF("text1", "text2")
    >>> pipeline = Pipeline().setStages([documentAssembler1, documentAssembler2, multiColumnAssembler]).fit(data)
    >>> result = pipeline.transform(data)
    >>> result.selectExpr("merged_document.result").show(truncate=False)
    +---------------------------+
    |result                     |
    +---------------------------+
    |[Hello world, Name | Age]  |
    +---------------------------+
    """

    # Annotator types declared for the input columns and the output column.
    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    # Param descriptor: annotator type written on the merged annotations.
    outputAsAnnotatorType = Param(
        Params._dummy(),
        "outputAsAnnotatorType",
        "The annotator type to use for the output annotations (Default: document)",
        typeConverter=TypeConverters.toString,
    )

    # Param descriptor: whether merged annotations are ordered by begin position.
    sortByBegin = Param(
        Params._dummy(),
        "sortByBegin",
        "Whether to sort merged annotations by their begin position (Default: False)",
        typeConverter=TypeConverters.toBoolean,
    )

    name = "MultiColumnAssembler"

    @keyword_only
    def __init__(self):
        """Creates the assembler and registers the parameter defaults.

        The parent initializer is given the class name
        ``com.johnsnowlabs.nlp.MultiColumnAssembler``; afterwards
        ``outputAsAnnotatorType`` defaults to ``"document"`` and
        ``sortByBegin`` defaults to ``False``.
        """
        super(MultiColumnAssembler, self).__init__(
            classname="com.johnsnowlabs.nlp.MultiColumnAssembler"
        )
        self._setDefault(outputAsAnnotatorType="document", sortByBegin=False)

    @keyword_only
    def setParams(self):
        """Applies the keyword arguments stored in ``self._input_kwargs``.

        Returns
        -------
        The result of ``self._set(**kwargs)``.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setInputCols(self, *value):
        """Sets input annotation columns to merge.

        The columns can be passed either as separate string arguments
        (``setInputCols("a", "b")``) or as one list
        (``setInputCols(["a", "b"])``).

        Parameters
        ----------
        *value : str
            Input column names

        Returns
        -------
        The result of ``self._set(inputCols=...)``.

        Raises
        ------
        TypeError
            If no column is given, or if the first argument is neither a
            ``str`` nor a ``list``.
        """
        # Guard the empty call explicitly: indexing value[0] on an empty tuple
        # would otherwise surface as an unhelpful IndexError.
        if not value:
            raise TypeError("setInputCols requires at least one input column name")

        first = value[0]
        if not isinstance(first, (str, list)):
            raise TypeError("InputCols datatype not supported. It must be either str or list")

        # A single list argument is used as-is; anything else is treated as
        # varargs and packed into a new list.
        if len(value) == 1 and isinstance(first, list):
            return self._set(inputCols=first)
        return self._set(inputCols=list(value))

    def setOutputAsAnnotatorType(self, value):
        """Sets the annotator type for the output annotations.

        Parameters
        ----------
        value : str
            The annotator type (e.g., "document", "chunk", "table")

        Returns
        -------
        The result of ``self._set(outputAsAnnotatorType=value)``.
        """
        return self._set(outputAsAnnotatorType=value)

    def setSortByBegin(self, value):
        """Sets whether to sort merged annotations by begin position.

        Parameters
        ----------
        value : bool
            Whether to sort by begin position

        Returns
        -------
        The result of ``self._set(sortByBegin=value)``.
        """
        return self._set(sortByBegin=value)