# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for partition properties used in reading various document types."""
from typing import Dict
from pyspark.ml.param import Param, Params, TypeConverters
class HasReaderProperties(Params):
    """Parameter mixin with the options common to the document readers.

    Every option is declared as a class-level ``Param`` and is paired with a
    ``set<Name>`` method that forwards the value to ``self._set`` and returns
    that call's result.
    """

    outputCol = Param(
        Params._dummy(),
        "outputCol",
        "output column name",
        typeConverter=TypeConverters.toString
    )

    def setOutputCol(self, value: str):
        """Sets output column name.

        Parameters
        ----------
        value : str
            Name of the Output Column
        """
        return self._set(outputCol=value)

    contentPath = Param(
        Params._dummy(),
        "contentPath",
        "Path to the content source.",
        typeConverter=TypeConverters.toString
    )

    def setContentPath(self, value: str):
        """Sets content path.

        Parameters
        ----------
        value : str
            Path to the content source.
        """
        return self._set(contentPath=value)

    contentType = Param(
        Params._dummy(),
        "contentType",
        "Set the content type to load following MIME specification.",
        typeConverter=TypeConverters.toString
    )

    def setContentType(self, value: str):
        """Sets content type following MIME specification.

        Parameters
        ----------
        value : str
            Content type string (MIME format).
        """
        return self._set(contentType=value)

    storeContent = Param(
        Params._dummy(),
        "storeContent",
        "Whether to include the raw file content in the output DataFrame "
        "as a separate 'content' column, alongside the structured output.",
        typeConverter=TypeConverters.toBoolean
    )

    def setStoreContent(self, value: bool):
        """Sets whether to store raw file content.

        Parameters
        ----------
        value : bool
            True to include raw file content, False otherwise.
        """
        return self._set(storeContent=value)

    titleFontSize = Param(
        Params._dummy(),
        "titleFontSize",
        "Minimum font size threshold used as part of heuristic rules to detect "
        "title elements based on formatting (e.g., bold, centered, capitalized).",
        typeConverter=TypeConverters.toInt
    )

    def setTitleFontSize(self, value: int):
        """Sets minimum font size for detecting titles.

        Parameters
        ----------
        value : int
            Minimum font size threshold for title detection.
        """
        return self._set(titleFontSize=value)

    inferTableStructure = Param(
        Params._dummy(),
        "inferTableStructure",
        "Whether to generate an HTML table representation from structured table content. "
        "When enabled, a full <table> element is added alongside cell-level elements, "
        "based on row and column layout.",
        typeConverter=TypeConverters.toBoolean
    )

    def setInferTableStructure(self, value: bool):
        """Sets whether to infer table structure.

        Parameters
        ----------
        value : bool
            True to generate HTML table representation, False otherwise.
        """
        return self._set(inferTableStructure=value)

    includePageBreaks = Param(
        Params._dummy(),
        "includePageBreaks",
        "Whether to detect and tag content with page break metadata. "
        "In Word documents, this includes manual and section breaks. "
        "In Excel files, this includes page breaks based on column boundaries.",
        typeConverter=TypeConverters.toBoolean
    )

    def setIncludePageBreaks(self, value: bool):
        """Sets whether to include page break metadata.

        Parameters
        ----------
        value : bool
            True to detect and tag page breaks, False otherwise.
        """
        return self._set(includePageBreaks=value)

    ignoreExceptions = Param(
        Params._dummy(),
        "ignoreExceptions",
        "Whether to ignore exceptions during processing.",
        typeConverter=TypeConverters.toBoolean
    )

    def setIgnoreExceptions(self, value: bool):
        """Sets whether to ignore exceptions during processing.

        Parameters
        ----------
        value : bool
            True to ignore exceptions, False otherwise.
        """
        return self._set(ignoreExceptions=value)

    explodeDocs = Param(
        Params._dummy(),
        "explodeDocs",
        "Whether to explode the documents into separate rows.",
        typeConverter=TypeConverters.toBoolean
    )

    def setExplodeDocs(self, value: bool):
        """Sets whether to explode the documents into separate rows.

        Parameters
        ----------
        value : bool
            True to split documents into multiple rows, False to keep them in one row.
        """
        return self._set(explodeDocs=value)

    flattenOutput = Param(
        Params._dummy(),
        "flattenOutput",
        "If true, output is flattened to plain text with minimal metadata",
        typeConverter=TypeConverters.toBoolean
    )

    def setFlattenOutput(self, value: bool):
        """Sets whether to flatten the output to plain text with minimal metadata.

        Parameters
        ----------
        value : bool
            If true, output is flattened to plain text with minimal metadata
        """
        return self._set(flattenOutput=value)

    titleThreshold = Param(
        Params._dummy(),
        "titleThreshold",
        "Minimum font size threshold for title detection in PDF docs",
        typeConverter=TypeConverters.toFloat
    )

    def setTitleThreshold(self, value: float):
        """Sets the minimum font size threshold for title detection in PDF documents.

        Parameters
        ----------
        value : float
            Minimum font size threshold for title detection in PDF docs
        """
        return self._set(titleThreshold=value)

    outputAsDocument = Param(
        Params._dummy(),
        "outputAsDocument",
        "Whether to return all sentences joined into a single document",
        typeConverter=TypeConverters.toBoolean
    )

    def setOutputAsDocument(self, value: bool):
        """Sets whether to return all sentences joined into a single document.

        Parameters
        ----------
        value : bool
            Whether to return all sentences joined into a single document
        """
        return self._set(outputAsDocument=value)
class HasEmailReaderProperties(Params):
    """Parameter mixin holding the email-reader specific option."""

    addAttachmentContent = Param(Params._dummy(), "addAttachmentContent",
                                 "Whether to extract and include the textual content of "
                                 "plain-text attachments in the output",
                                 typeConverter=TypeConverters.toBoolean)

    def setAddAttachmentContent(self, value):
        """Stores the ``addAttachmentContent`` flag.

        Parameters
        ----------
        value : bool
            Whether the text of plain-text attachments should be extracted
            and included in the output.
        """
        return self._set(addAttachmentContent=value)

    def getAddAttachmentContent(self):
        """Reads the ``addAttachmentContent`` flag through ``getOrDefault``.

        Returns
        -------
        bool
            Whether the text of plain-text attachments is included.
        """
        return self.getOrDefault(self.addAttachmentContent)
class HasExcelReaderProperties(Params):
    """Parameter mixin holding the Excel-reader specific options."""

    cellSeparator = Param(Params._dummy(), "cellSeparator",
                          "String used to join cell values in a row when assembling textual output.",
                          typeConverter=TypeConverters.toString)

    def setCellSeparator(self, value):
        """Stores the ``cellSeparator`` string.

        Parameters
        ----------
        value : str
            Delimiter placed between the cell values of a row.
        """
        return self._set(cellSeparator=value)

    def getCellSeparator(self):
        """Reads the ``cellSeparator`` string through ``getOrDefault``.

        Returns
        -------
        str
            Delimiter placed between the cell values of a row.
        """
        return self.getOrDefault(self.cellSeparator)

    appendCells = Param(Params._dummy(), "appendCells",
                        "Whether to append all rows into a single content block "
                        "instead of creating separate elements per row.",
                        typeConverter=TypeConverters.toBoolean)

    def setAppendCells(self, value):
        """Stores the ``appendCells`` flag.

        Parameters
        ----------
        value : bool
            True to append all rows into one content block, False to create
            a separate element per row.
        """
        return self._set(appendCells=value)

    def getAppendCells(self):
        """Reads the ``appendCells`` flag through ``getOrDefault``.

        Returns
        -------
        bool
            True when all rows are appended into one content block.
        """
        return self.getOrDefault(self.appendCells)
class HasHTMLReaderProperties(Params):
    """Parameter mixin holding the HTML-reader specific option."""

    timeout = Param(Params._dummy(), "timeout",
                    "Timeout value in seconds for reading remote HTML resources. "
                    "Applied when fetching content from URLs.",
                    typeConverter=TypeConverters.toInt)

    def setTimeout(self, value):
        """Stores the ``timeout`` used when reading remote HTML resources.

        Parameters
        ----------
        value : int
            Timeout in seconds.
        """
        return self._set(timeout=value)

    def getTimeout(self):
        """Reads the ``timeout`` value through ``getOrDefault``.

        Returns
        -------
        int
            Timeout in seconds for reading remote HTML resources.
        """
        return self.getOrDefault(self.timeout)
class HasPowerPointProperties(Params):
    """Parameter mixin holding the PowerPoint-reader specific option."""

    includeSlideNotes = Param(Params._dummy(), "includeSlideNotes",
                              "Whether to extract speaker notes from slides. "
                              "When enabled, notes are included as narrative text elements.",
                              typeConverter=TypeConverters.toBoolean)

    def setIncludeSlideNotes(self, value):
        """Stores the ``includeSlideNotes`` flag.

        Parameters
        ----------
        value : bool
            True to extract speaker notes and include them as narrative
            text elements.
        """
        return self._set(includeSlideNotes=value)

    def getIncludeSlideNotes(self):
        """Reads the ``includeSlideNotes`` flag through ``getOrDefault``.

        Returns
        -------
        bool
            True when speaker notes are extracted from slides.
        """
        return self.getOrDefault(self.includeSlideNotes)
class HasTextReaderProperties(Params):
    """Parameter mixin holding the plain-text reader options.

    Each option is a class-level ``Param`` with a setter that forwards to
    ``self._set`` and a getter that reads it through ``self.getOrDefault``.
    """

    titleLengthSize = Param(
        Params._dummy(),
        "titleLengthSize",
        "Maximum character length used to determine if a text block qualifies as a title during parsing.",
        typeConverter=TypeConverters.toInt
    )

    def setTitleLengthSize(self, value: int):
        """Sets the maximum character length for a text block to qualify as a title.

        Parameters
        ----------
        value : int
            Maximum character length of a title.
        """
        return self._set(titleLengthSize=value)

    def getTitleLengthSize(self) -> int:
        """Gets the maximum character length for a text block to qualify as a title.

        Returns
        -------
        int
            Maximum character length of a title.
        """
        return self.getOrDefault(self.titleLengthSize)

    groupBrokenParagraphs = Param(
        Params._dummy(),
        "groupBrokenParagraphs",
        "Whether to merge fragmented lines into coherent paragraphs using heuristics based on line length and structure.",
        typeConverter=TypeConverters.toBoolean
    )

    def setGroupBrokenParagraphs(self, value: bool):
        """Sets whether to merge fragmented lines into coherent paragraphs.

        Parameters
        ----------
        value : bool
            True to merge fragmented lines into paragraphs, False otherwise.
        """
        return self._set(groupBrokenParagraphs=value)

    def getGroupBrokenParagraphs(self) -> bool:
        """Gets whether fragmented lines are merged into coherent paragraphs.

        Returns
        -------
        bool
            True if fragmented lines are merged into paragraphs.
        """
        return self.getOrDefault(self.groupBrokenParagraphs)

    paragraphSplit = Param(
        Params._dummy(),
        "paragraphSplit",
        "Regex pattern used to detect paragraph boundaries when grouping broken paragraphs.",
        typeConverter=TypeConverters.toString
    )

    def setParagraphSplit(self, value: str):
        """Sets the regex pattern used to detect paragraph boundaries.

        Parameters
        ----------
        value : str
            Regex pattern applied when grouping broken paragraphs.
        """
        return self._set(paragraphSplit=value)

    def getParagraphSplit(self) -> str:
        """Gets the regex pattern used to detect paragraph boundaries.

        Returns
        -------
        str
            Regex pattern applied when grouping broken paragraphs.
        """
        return self.getOrDefault(self.paragraphSplit)

    shortLineWordThreshold = Param(
        Params._dummy(),
        "shortLineWordThreshold",
        "Maximum word count for a line to be considered 'short' during broken paragraph grouping.",
        typeConverter=TypeConverters.toInt
    )

    def setShortLineWordThreshold(self, value: int):
        """Sets the maximum word count for a line to be considered 'short'.

        Parameters
        ----------
        value : int
            Maximum word count of a short line.
        """
        return self._set(shortLineWordThreshold=value)

    def getShortLineWordThreshold(self) -> int:
        """Gets the maximum word count for a line to be considered 'short'.

        Returns
        -------
        int
            Maximum word count of a short line.
        """
        return self.getOrDefault(self.shortLineWordThreshold)

    maxLineCount = Param(
        Params._dummy(),
        "maxLineCount",
        "Maximum number of lines to evaluate when estimating paragraph layout characteristics.",
        typeConverter=TypeConverters.toInt
    )

    def setMaxLineCount(self, value: int):
        """Sets the maximum number of lines evaluated when estimating paragraph layout.

        Parameters
        ----------
        value : int
            Maximum number of lines to evaluate.
        """
        return self._set(maxLineCount=value)

    def getMaxLineCount(self) -> int:
        """Gets the maximum number of lines evaluated when estimating paragraph layout.

        Returns
        -------
        int
            Maximum number of lines to evaluate.
        """
        return self.getOrDefault(self.maxLineCount)

    threshold = Param(
        Params._dummy(),
        "threshold",
        "Threshold ratio of empty lines used to decide between new line-based or broken-paragraph grouping.",
        typeConverter=TypeConverters.toFloat
    )

    def setThreshold(self, value: float):
        """Sets the empty-line ratio used to choose the grouping mode.

        Parameters
        ----------
        value : float
            Threshold ratio of empty lines used to decide between new
            line-based or broken-paragraph grouping.
        """
        return self._set(threshold=value)

    def getThreshold(self) -> float:
        """Gets the empty-line ratio used to choose the grouping mode.

        Returns
        -------
        float
            Threshold ratio of empty lines.
        """
        return self.getOrDefault(self.threshold)
class HasChunkerProperties(Params):
    """Parameter mixin holding the chunking options.

    Each option is a class-level ``Param`` with a setter that forwards the
    value to ``self._set``.
    """

    chunkingStrategy = Param(
        Params._dummy(),
        "chunkingStrategy",
        "Set the chunking strategy",
        typeConverter=TypeConverters.toString
    )

    def setChunkingStrategy(self, value: str):
        """Sets the chunking strategy.

        Parameters
        ----------
        value : str
            Name of the chunking strategy.
        """
        return self._set(chunkingStrategy=value)

    maxCharacters = Param(
        Params._dummy(),
        "maxCharacters",
        "Set the maximum number of characters",
        typeConverter=TypeConverters.toInt
    )

    def setMaxCharacters(self, value: int):
        """Sets the maximum number of characters.

        Parameters
        ----------
        value : int
            Maximum number of characters.
        """
        return self._set(maxCharacters=value)

    newAfterNChars = Param(
        Params._dummy(),
        "newAfterNChars",
        "Insert a new chunk after N characters",
        typeConverter=TypeConverters.toInt
    )

    def setNewAfterNChars(self, value: int):
        """Sets the number of characters after which a new chunk is inserted.

        Parameters
        ----------
        value : int
            Number of characters (N) after which a new chunk is inserted.
        """
        return self._set(newAfterNChars=value)

    overlap = Param(
        Params._dummy(),
        "overlap",
        "Set the number of overlapping characters between chunks",
        typeConverter=TypeConverters.toInt
    )

    def setOverlap(self, value: int):
        """Sets the number of overlapping characters between chunks.

        Parameters
        ----------
        value : int
            Number of overlapping characters between chunks.
        """
        return self._set(overlap=value)

    combineTextUnderNChars = Param(
        Params._dummy(),
        "combineTextUnderNChars",
        "Threshold to merge adjacent small sections",
        typeConverter=TypeConverters.toInt
    )

    def setCombineTextUnderNChars(self, value: int):
        """Sets the threshold used to merge adjacent small sections.

        Parameters
        ----------
        value : int
            Threshold to merge adjacent small sections.
        """
        return self._set(combineTextUnderNChars=value)

    overlapAll = Param(
        Params._dummy(),
        "overlapAll",
        "Apply overlap context between all sections, not just split chunks",
        typeConverter=TypeConverters.toBoolean
    )

    def setOverlapAll(self, value: bool):
        """Sets whether overlap context is applied between all sections.

        Parameters
        ----------
        value : bool
            True to apply overlap between all sections, not just split chunks.
        """
        return self._set(overlapAll=value)
from pyspark.ml.param import Param, Params, TypeConverters
class HasPdfProperties(Params):
    """Parameter mixin holding the PDF-reader specific options.

    Each option is a class-level ``Param`` followed by the setter that
    forwards its value to ``self._set``.
    """

    pageNumCol = Param(Params._dummy(), "pageNumCol",
                       "Page number output column name.",
                       typeConverter=TypeConverters.toString)

    def setPageNumCol(self, value: str):
        """Stores the name of the page number output column.

        Parameters
        ----------
        value : str
            Page number output column name.
        """
        return self._set(pageNumCol=value)

    originCol = Param(Params._dummy(), "originCol",
                      "Input column name with original path of file.",
                      typeConverter=TypeConverters.toString)

    def setOriginCol(self, value: str):
        """Stores the name of the input column with the original file path.

        Parameters
        ----------
        value : str
            Input column name holding the original path of the file.
        """
        return self._set(originCol=value)

    partitionNum = Param(Params._dummy(), "partitionNum",
                         "Number of partitions.",
                         typeConverter=TypeConverters.toInt)

    def setPartitionNum(self, value: int):
        """Stores the number of partitions.

        Parameters
        ----------
        value : int
            Number of partitions.
        """
        return self._set(partitionNum=value)

    storeSplittedPdf = Param(Params._dummy(), "storeSplittedPdf",
                             "Force to store bytes content of splitted pdf.",
                             typeConverter=TypeConverters.toBoolean)

    def setStoreSplittedPdf(self, value: bool):
        """Stores the flag forcing the bytes of the split PDF to be kept.

        Parameters
        ----------
        value : bool
            True to store the bytes content of the split PDF, False otherwise.
        """
        return self._set(storeSplittedPdf=value)

    splitPage = Param(Params._dummy(), "splitPage",
                      "Enable/disable splitting per page to identify page numbers and improve performance.",
                      typeConverter=TypeConverters.toBoolean)

    def setSplitPage(self, value: bool):
        """Stores the flag enabling or disabling per-page splitting.

        Parameters
        ----------
        value : bool
            True to enable splitting per page, False to disable it.
        """
        return self._set(splitPage=value)

    onlyPageNum = Param(Params._dummy(), "onlyPageNum",
                        "Extract only page numbers.",
                        typeConverter=TypeConverters.toBoolean)

    def setOnlyPageNum(self, value: bool):
        """Stores the flag restricting extraction to page numbers.

        Parameters
        ----------
        value : bool
            True to extract only page numbers, False otherwise.
        """
        return self._set(onlyPageNum=value)

    textStripper = Param(Params._dummy(), "textStripper",
                         "Text stripper type used for output layout and formatting.",
                         typeConverter=TypeConverters.toString)

    def setTextStripper(self, value: str):
        """Stores the text stripper type.

        Parameters
        ----------
        value : str
            Text stripper type used for output layout and formatting.
        """
        return self._set(textStripper=value)

    sort = Param(Params._dummy(), "sort",
                 "Enable/disable sorting content on the page.",
                 typeConverter=TypeConverters.toBoolean)

    def setSort(self, value: bool):
        """Stores the flag enabling or disabling sorting of page content.

        Parameters
        ----------
        value : bool
            True to enable sorting content on the page, False to disable it.
        """
        return self._set(sort=value)

    normalizeLigatures = Param(Params._dummy(), "normalizeLigatures",
                               "Whether to convert ligature chars such as 'fl' into its corresponding chars (e.g., {'f', 'l'}).",
                               typeConverter=TypeConverters.toBoolean)

    def setNormalizeLigatures(self, value: bool):
        """Stores the flag controlling ligature conversion.

        Parameters
        ----------
        value : bool
            True to convert ligature characters into their corresponding
            characters, False otherwise.
        """
        return self._set(normalizeLigatures=value)

    readAsImage = Param(Params._dummy(), "readAsImage",
                        "Read PDF pages as images.",
                        typeConverter=TypeConverters.toBoolean)

    def setReadAsImage(self, value: bool):
        """Stores the flag for reading PDF pages as images.

        Parameters
        ----------
        value : bool
            True to read PDF pages as images, False otherwise.
        """
        return self._set(readAsImage=value)