Source code for sparknlp.partition.partition_transformer

#  Copyright 2017-2025 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains the PartitionTransformer class for reading various types of documents into chunks."""
from sparknlp.common import *
from sparknlp.partition.partition_properties import *


class PartitionTransformer(
    AnnotatorModel,
    HasEmailReaderProperties,
    HasExcelReaderProperties,
    HasHTMLReaderProperties,
    HasPowerPointProperties,
    HasTextReaderProperties,
    HasChunkerProperties
):
    """The PartitionTransformer annotator allows you to use the Partition
    feature more smoothly within existing Spark NLP workflows, enabling
    seamless reuse of your pipelines. It supports reading from files, URLs,
    in-memory strings, or byte arrays, and works within a Spark NLP pipeline.

    Supported formats include:

    - Plain text
    - HTML
    - Word (.doc/.docx)
    - Excel (.xls/.xlsx)
    - PowerPoint (.ppt/.pptx)
    - Email files (.eml, .msg)
    - PDFs

    Parameters
    ----------
    inputCols : list of str
        Names of input columns (typically from DocumentAssembler).
    outputCol : str
        Name of the column to store the output.
    contentType : str
        The type of content: e.g., "text", "url", "file", etc.
    headers : dict, optional
        Headers to be used if the content type is a URL.

    Examples
    --------
    >>> dataset = spark.createDataFrame([
    ...     ("https://www.blizzard.com",),
    ...     ("https://www.google.com/",),
    ... ], ["text"])
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> partition = PartitionTransformer() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("partition") \\
    ...     .setContentType("url") \\
    ...     .setHeaders({"Accept-Language": "es-ES"})
    >>> pipeline = Pipeline(stages=[documentAssembler, partition])
    >>> pipelineModel = pipeline.fit(dataset)
    >>> resultDf = pipelineModel.transform(dataset)
    >>> resultDf.show()
    +--------------------+--------------------+--------------------+
    |                text|            document|           partition|
    +--------------------+--------------------+--------------------+
    |https://www.blizz...|[{Title, Juegos d...|[{document, 0, 16...|
    |https://www.googl...|[{Title, Gmail Im...|[{document, 0, 28...|
    +--------------------+--------------------+--------------------+
    """
    name = "PartitionTransformer"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.DOCUMENT
    contentPath = Param(
        Params._dummy(),
        "contentPath",
        "Path to the content source",
        typeConverter=TypeConverters.toString
    )

    def setContentPath(self, value):
        """Sets the path to the content source."""
        return self._set(contentPath=value)

    def getContentPath(self):
        """Gets the path to the content source."""
        return self.getOrDefault(self.contentPath)
    contentType = Param(
        Params._dummy(),
        "contentType",
        "Set the content type to load following MIME specification",
        typeConverter=TypeConverters.toString
    )

    def setContentType(self, value):
        """Sets the content type to load, following the MIME specification."""
        return self._set(contentType=value)

    def getContentType(self):
        """Gets the content type to load, following the MIME specification."""
        return self.getOrDefault(self.contentType)
    storeContent = Param(
        Params._dummy(),
        "storeContent",
        "Whether to include the raw file content in the output DataFrame as a "
        "separate 'content' column, alongside the structured output.",
        typeConverter=TypeConverters.toBoolean
    )

    def setStoreContent(self, value):
        """Sets whether to include the raw file content in the output DataFrame."""
        return self._set(storeContent=value)

    def getStoreContent(self):
        """Gets whether the raw file content is included in the output DataFrame."""
        return self.getOrDefault(self.storeContent)
    titleFontSize = Param(
        Params._dummy(),
        "titleFontSize",
        "Minimum font size threshold used as part of heuristic rules to detect "
        "title elements based on formatting (e.g., bold, centered, capitalized).",
        typeConverter=TypeConverters.toInt
    )

    def setTitleFontSize(self, value):
        """Sets the minimum font size threshold for detecting title elements."""
        return self._set(titleFontSize=value)

    def getTitleFontSize(self):
        """Gets the minimum font size threshold for detecting title elements."""
        return self.getOrDefault(self.titleFontSize)
    inferTableStructure = Param(
        Params._dummy(),
        "inferTableStructure",
        "Whether to generate an HTML table representation from structured table "
        "content. When enabled, a full <table> element is added alongside "
        "cell-level elements, based on row and column layout.",
        typeConverter=TypeConverters.toBoolean
    )

    def setInferTableStructure(self, value):
        """Sets whether to generate an HTML table representation from structured table content."""
        return self._set(inferTableStructure=value)

    def getInferTableStructure(self):
        """Gets whether an HTML table representation is generated from structured table content."""
        return self.getOrDefault(self.inferTableStructure)
    includePageBreaks = Param(
        Params._dummy(),
        "includePageBreaks",
        "Whether to detect and tag content with page break metadata. In Word "
        "documents, this includes manual and section breaks. In Excel files, "
        "this includes page breaks based on column boundaries.",
        typeConverter=TypeConverters.toBoolean
    )

    def setIncludePageBreaks(self, value):
        """Sets whether to detect and tag content with page break metadata."""
        return self._set(includePageBreaks=value)

    def getIncludePageBreaks(self):
        """Gets whether content is detected and tagged with page break metadata."""
        return self.getOrDefault(self.includePageBreaks)
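    # A minimal usage sketch combining the reader properties above, kept as a
    # comment so it does not execute at class-definition time. The
    # "./word-files" path and the choice of .docx input are illustrative
    # assumptions, not part of this class:
    #
    #   partition = PartitionTransformer() \
    #       .setInputCols(["document"]) \
    #       .setOutputCol("partition") \
    #       .setContentType("application/vnd.openxmlformats-officedocument.wordprocessingml.document") \
    #       .setContentPath("./word-files") \
    #       .setStoreContent(True) \
    #       .setInferTableStructure(True) \
    #       .setIncludePageBreaks(True)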
    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer", java_model=None):
        super(PartitionTransformer, self).__init__(
            classname=classname,
            java_model=java_model
        )
        # Two or more consecutive newlines (with optional surrounding whitespace)
        # mark a paragraph boundary for the default paragraphSplit pattern.
        DOUBLE_PARAGRAPH_PATTERN = r"(?:\s*\n\s*){2,}"
        self._setDefault(
            contentPath="",
            contentType="text/plain",
            storeContent=False,
            titleFontSize=9,
            inferTableStructure=False,
            includePageBreaks=False,
            addAttachmentContent=False,
            cellSeparator="\t",
            appendCells=False,
            timeout=0,
            includeSlideNotes=False,
            titleLengthSize=50,
            groupBrokenParagraphs=False,
            paragraphSplit=DOUBLE_PARAGRAPH_PATTERN,
            shortLineWordThreshold=5,
            maxLineCount=2000,
            threshold=0.1,
            chunkingStrategy="",
            maxCharacters=100,
            newAfterNChars=-1,
            overlap=0,
            combineTextUnderNChars=0,
            overlapAll=False
        )
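

# A self-contained sketch of the plain-text flow, guarded so it only runs when
# this file is executed directly. The "./txt-files" directory, the dummy row,
# and the "basic" chunking strategy are illustrative assumptions, not part of
# this module.
if __name__ == "__main__":
    import sparknlp
    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler

    spark = sparknlp.start()
    # The transformer reads from contentPath, so the input rows only need to
    # satisfy the pipeline's document column.
    data = spark.createDataFrame([("dummy",)], ["text"])

    documentAssembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    partition = PartitionTransformer() \
        .setInputCols(["document"]) \
        .setOutputCol("partition") \
        .setContentType("text/plain") \
        .setContentPath("./txt-files") \
        .setChunkingStrategy("basic") \
        .setMaxCharacters(512)

    pipeline = Pipeline(stages=[documentAssembler, partition])
    result = pipeline.fit(data).transform(data)
    result.select("partition").show(truncate=False)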