# Source code for sparknlp.partition.partition_properties

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for partition properties used in reading various document types."""
from typing import Dict, List

from pyspark.ml.param import Param, Params, TypeConverters


class HasReaderProperties(Params):
    """Declares the parameters shared by the document readers.

    Every parameter is a class-level :class:`pyspark.ml.param.Param` paired
    with a ``set<Name>`` method. Each setter stores the value through
    ``self._set`` and returns whatever ``_set`` returns (the instance itself
    in PySpark), which allows the setters to be chained.
    """

    inputCol = Param(
        Params._dummy(),
        "inputCol",
        "input column name",
        typeConverter=TypeConverters.toString
    )

    def setInputCol(self, value: str):
        """Sets input column name.

        Parameters
        ----------
        value : str
            Name of the Input Column
        """
        return self._set(inputCol=value)

    outputCol = Param(
        Params._dummy(),
        "outputCol",
        "output column name",
        typeConverter=TypeConverters.toString
    )

    def setOutputCol(self, value: str):
        """Sets output column name.

        Parameters
        ----------
        value : str
            Name of the Output Column
        """
        return self._set(outputCol=value)

    contentPath = Param(
        Params._dummy(),
        "contentPath",
        "Path to the content source.",
        typeConverter=TypeConverters.toString
    )

    def setContentPath(self, value: str):
        """Sets content path.

        Parameters
        ----------
        value : str
            Path to the content source.
        """
        return self._set(contentPath=value)

    contentType = Param(
        Params._dummy(),
        "contentType",
        "Set the content type to load following MIME specification.",
        typeConverter=TypeConverters.toString
    )

    def setContentType(self, value: str):
        """Sets content type following MIME specification.

        Parameters
        ----------
        value : str
            Content type string (MIME format).
        """
        return self._set(contentType=value)

    storeContent = Param(
        Params._dummy(),
        "storeContent",
        "Whether to include the raw file content in the output DataFrame "
        "as a separate 'content' column, alongside the structured output.",
        typeConverter=TypeConverters.toBoolean
    )

    def setStoreContent(self, value: bool):
        """Sets whether to store raw file content.

        Parameters
        ----------
        value : bool
            True to include raw file content, False otherwise.
        """
        return self._set(storeContent=value)

    titleFontSize = Param(
        Params._dummy(),
        "titleFontSize",
        "Minimum font size threshold used as part of heuristic rules to detect "
        "title elements based on formatting (e.g., bold, centered, capitalized).",
        typeConverter=TypeConverters.toInt
    )

    def setTitleFontSize(self, value: int):
        """Sets minimum font size for detecting titles.

        Parameters
        ----------
        value : int
            Minimum font size threshold for title detection.
        """
        return self._set(titleFontSize=value)

    inferTableStructure = Param(
        Params._dummy(),
        "inferTableStructure",
        "Whether to generate an HTML table representation from structured table content. "
        "When enabled, a full <table> element is added alongside cell-level elements, "
        "based on row and column layout.",
        typeConverter=TypeConverters.toBoolean
    )

    def setInferTableStructure(self, value: bool):
        """Sets whether to infer table structure.

        Parameters
        ----------
        value : bool
            True to generate HTML table representation, False otherwise.
        """
        return self._set(inferTableStructure=value)

    includePageBreaks = Param(
        Params._dummy(),
        "includePageBreaks",
        "Whether to detect and tag content with page break metadata. "
        "In Word documents, this includes manual and section breaks. "
        "In Excel files, this includes page breaks based on column boundaries.",
        typeConverter=TypeConverters.toBoolean
    )

    def setIncludePageBreaks(self, value: bool):
        """Sets whether to include page break metadata.

        Parameters
        ----------
        value : bool
            True to detect and tag page breaks, False otherwise.
        """
        return self._set(includePageBreaks=value)

    ignoreExceptions = Param(
        Params._dummy(),
        "ignoreExceptions",
        "Whether to ignore exceptions during processing.",
        typeConverter=TypeConverters.toBoolean
    )

    def setIgnoreExceptions(self, value: bool):
        """Sets whether to ignore exceptions during processing.

        Parameters
        ----------
        value : bool
            True to ignore exceptions, False otherwise.
        """
        return self._set(ignoreExceptions=value)

    explodeDocs = Param(
        Params._dummy(),
        "explodeDocs",
        "Whether to explode the documents into separate rows.",
        typeConverter=TypeConverters.toBoolean
    )

    def setExplodeDocs(self, value: bool):
        """Sets whether to explode the documents into separate rows.

        Parameters
        ----------
        value : bool
            True to split documents into multiple rows, False to keep them in one row.
        """
        return self._set(explodeDocs=value)

    flattenOutput = Param(
        Params._dummy(),
        "flattenOutput",
        "If true, output is flattened to plain text with minimal metadata",
        typeConverter=TypeConverters.toBoolean
    )

    def setFlattenOutput(self, value: bool):
        """Sets whether to flatten the output to plain text with minimal metadata.

        Parameters
        ----------
        value : bool
            If true, output is flattened to plain text with minimal metadata
        """
        return self._set(flattenOutput=value)

    titleThreshold = Param(
        Params._dummy(),
        "titleThreshold",
        "Minimum font size threshold for title detection in PDF docs",
        typeConverter=TypeConverters.toFloat
    )

    def setTitleThreshold(self, value: float):
        """Sets the minimum font size threshold for title detection in PDF documents.

        Parameters
        ----------
        value : float
            Minimum font size threshold for title detection in PDF docs
        """
        return self._set(titleThreshold=value)

    outputAsDocument = Param(
        Params._dummy(),
        "outputAsDocument",
        "Whether to return all sentences joined into a single document",
        typeConverter=TypeConverters.toBoolean
    )

    def setOutputAsDocument(self, value: bool):
        """Sets whether to return all sentences joined into a single document.

        Parameters
        ----------
        value : bool
            Whether to return all sentences joined into a single document
        """
        return self._set(outputAsDocument=value)


class HasEmailReaderProperties(Params):
    """Declares the email-reader parameter ``addAttachmentContent``.

    The setter stores the value through ``self._set``; the getter reads it
    back with ``self.getOrDefault``.
    """

    addAttachmentContent = Param(
        Params._dummy(),
        "addAttachmentContent",
        "Whether to extract and include the textual content of plain-text attachments in the output",
        typeConverter=TypeConverters.toBoolean
    )

    def setAddAttachmentContent(self, value: bool):
        """
        Sets whether to extract and include the textual content of plain-text attachments in the output.

        Parameters
        ----------
        value : bool
            Whether to include text from plain-text attachments.
        """
        return self._set(addAttachmentContent=value)

    def getAddAttachmentContent(self):
        """
        Gets whether to extract and include the textual content of plain-text attachments in the output.

        Returns
        -------
        bool
            Whether to include text from plain-text attachments.
        """
        return self.getOrDefault(self.addAttachmentContent)


class HasExcelReaderProperties(Params):
    """Declares the Excel-reader parameters ``cellSeparator`` and ``appendCells``.

    Setters store the value through ``self._set``; getters read it back
    with ``self.getOrDefault``.
    """

    cellSeparator = Param(
        Params._dummy(),
        "cellSeparator",
        "String used to join cell values in a row when assembling textual output.",
        typeConverter=TypeConverters.toString
    )

    def setCellSeparator(self, value: str):
        """
        Sets the string used to join cell values in a row when assembling textual output.

        Parameters
        ----------
        value : str
            Delimiter used to concatenate cell values.
        """
        return self._set(cellSeparator=value)

    def getCellSeparator(self):
        """
        Gets the string used to join cell values in a row when assembling textual output.

        Returns
        -------
        str
            Delimiter used to concatenate cell values.
        """
        return self.getOrDefault(self.cellSeparator)

    appendCells = Param(
        Params._dummy(),
        "appendCells",
        "Whether to append all rows into a single content block instead of creating separate elements per row.",
        typeConverter=TypeConverters.toBoolean
    )

    def setAppendCells(self, value: bool):
        """
        Sets whether to append all rows into a single content block.

        Parameters
        ----------
        value : bool
            True to merge rows into one block, False for individual elements.
        """
        return self._set(appendCells=value)

    def getAppendCells(self):
        """
        Gets whether to append all rows into a single content block.

        Returns
        -------
        bool
            True to merge rows into one block, False for individual elements.
        """
        return self.getOrDefault(self.appendCells)


class HasHTMLReaderProperties(Params):
    """Declares the HTML-reader parameters ``timeout`` and ``outputFormat``.

    Also exposes ``setHeaders``, which does not use a ``Param`` but forwards
    its argument through ``self._call_java``.
    """

    timeout = Param(
        Params._dummy(),
        "timeout",
        "Timeout value in seconds for reading remote HTML resources. Applied when fetching content from URLs.",
        typeConverter=TypeConverters.toInt
    )

    def setTimeout(self, value: int):
        """
        Sets the timeout (in seconds) for reading remote HTML resources.

        Parameters
        ----------
        value : int
            Timeout in seconds for remote content retrieval.
        """
        return self._set(timeout=value)

    def getTimeout(self):
        """
        Gets the timeout value for reading remote HTML resources.

        Returns
        -------
        int
            Timeout in seconds.
        """
        return self.getOrDefault(self.timeout)

    def setHeaders(self, headers: Dict[str, str]):
        """Forwards ``headers`` to the ``setHeadersPython`` Java method.

        Unlike the other setters, the value is not stored in a ``Param``;
        it is passed to ``self._call_java("setHeadersPython", headers)``.

        Parameters
        ----------
        headers : Dict[str, str]
            Mapping of header names to header values.
            NOTE(review): presumably the HTTP request headers used when
            fetching remote HTML; confirm against the Scala implementation.

        Returns
        -------
        self
            This instance, so calls can be chained.
        """
        # NOTE(review): `_call_java` is not defined by `Params`; the class
        # mixing this in must supply it (e.g. a PySpark Java wrapper).
        self._call_java("setHeadersPython", headers)
        return self

    # NOTE(review): the description below lists only two options yet names
    # 'json-table' as the default, while the setter docstring treats
    # 'json-table' as a third option; confirm which is correct.
    outputFormat = Param(
        Params._dummy(),
        "outputFormat",
        "Output format for the table content. Options are 'plain-text' or 'html-table'. Default is 'json-table'.",
        typeConverter=TypeConverters.toString
    )

    def setOutputFormat(self, value: str):
        """Sets output format for the table content.

        Options
        -------
        - 'plain-text'
        - 'html-table'
        - 'json-table' (default)

        Parameters
        ----------
        value : str
            Output format for the table content.
        """
        return self._set(outputFormat=value)


class HasPowerPointProperties(Params):
    """Declares the PowerPoint-reader parameter ``includeSlideNotes``.

    The setter stores the value through ``self._set``; the getter reads it
    back with ``self.getOrDefault``.
    """

    includeSlideNotes = Param(
        Params._dummy(),
        "includeSlideNotes",
        "Whether to extract speaker notes from slides. When enabled, notes are included as narrative text elements.",
        typeConverter=TypeConverters.toBoolean
    )

    def setIncludeSlideNotes(self, value: bool):
        """
        Sets whether to extract speaker notes from slides.

        Parameters
        ----------
        value : bool
            If True, notes are included as narrative text elements.
        """
        return self._set(includeSlideNotes=value)

    def getIncludeSlideNotes(self):
        """
        Gets whether to extract speaker notes from slides.

        Returns
        -------
        bool
            True if notes are included as narrative text elements.
        """
        return self.getOrDefault(self.includeSlideNotes)


class HasTextReaderProperties(Params):
    """Declares the text-reader parameters and their accessors.

    Covers title detection (``titleLengthSize``), broken-paragraph grouping
    (``groupBrokenParagraphs``, ``paragraphSplit``, ``shortLineWordThreshold``,
    ``maxLineCount``, ``threshold``) and tag attribute extraction
    (``extractTagAttributes``). Setters store the value through ``self._set``;
    getters read it back with ``self.getOrDefault``.
    """

    titleLengthSize = Param(
        Params._dummy(),
        "titleLengthSize",
        "Maximum character length used to determine if a text block qualifies as a title during parsing.",
        typeConverter=TypeConverters.toInt
    )

    def setTitleLengthSize(self, value: int):
        """Set the maximum character length used to identify title blocks.

        Parameters
        ----------
        value : int
            Maximum number of characters a text block can have to be considered a title.

        Returns
        -------
        self
            The instance with updated `titleLengthSize` parameter.
        """
        return self._set(titleLengthSize=value)

    def getTitleLengthSize(self):
        """Get the configured maximum title length.

        Returns
        -------
        int
            The maximum character length used to detect title blocks.
        """
        return self.getOrDefault(self.titleLengthSize)

    groupBrokenParagraphs = Param(
        Params._dummy(),
        "groupBrokenParagraphs",
        "Whether to merge fragmented lines into coherent paragraphs using heuristics based on line length and structure.",
        typeConverter=TypeConverters.toBoolean
    )

    def setGroupBrokenParagraphs(self, value: bool):
        """Enable or disable grouping of broken paragraphs.

        Parameters
        ----------
        value : bool
            True to merge fragmented lines into paragraphs, False to leave lines as-is.

        Returns
        -------
        self
            The instance with updated `groupBrokenParagraphs` parameter.
        """
        return self._set(groupBrokenParagraphs=value)

    def getGroupBrokenParagraphs(self):
        """Get whether broken paragraph grouping is enabled.

        Returns
        -------
        bool
            True if grouping of broken paragraphs is enabled, False otherwise.
        """
        return self.getOrDefault(self.groupBrokenParagraphs)

    paragraphSplit = Param(
        Params._dummy(),
        "paragraphSplit",
        "Regex pattern used to detect paragraph boundaries when grouping broken paragraphs.",
        typeConverter=TypeConverters.toString
    )

    def setParagraphSplit(self, value: str):
        """Set the regex pattern used to split paragraphs when grouping broken paragraphs.

        Parameters
        ----------
        value : str
            Regular expression string used to detect paragraph boundaries.

        Returns
        -------
        self
            The instance with updated `paragraphSplit` parameter.
        """
        return self._set(paragraphSplit=value)

    def getParagraphSplit(self):
        """Get the paragraph-splitting regex pattern.

        Returns
        -------
        str
            The regex pattern used to detect paragraph boundaries.
        """
        return self.getOrDefault(self.paragraphSplit)

    shortLineWordThreshold = Param(
        Params._dummy(),
        "shortLineWordThreshold",
        "Maximum word count for a line to be considered 'short' during broken paragraph grouping.",
        typeConverter=TypeConverters.toInt
    )

    def setShortLineWordThreshold(self, value: int):
        """Set the maximum word count for a line to be considered short.

        Parameters
        ----------
        value : int
            Number of words under which a line is considered 'short'.

        Returns
        -------
        self
            The instance with updated `shortLineWordThreshold` parameter.
        """
        return self._set(shortLineWordThreshold=value)

    def getShortLineWordThreshold(self):
        """Get the short line word threshold.

        Returns
        -------
        int
            Word count threshold for short lines used in paragraph grouping.
        """
        return self.getOrDefault(self.shortLineWordThreshold)

    maxLineCount = Param(
        Params._dummy(),
        "maxLineCount",
        "Maximum number of lines to evaluate when estimating paragraph layout characteristics.",
        typeConverter=TypeConverters.toInt
    )

    def setMaxLineCount(self, value: int):
        """Set the maximum number of lines to inspect when estimating paragraph layout.

        Parameters
        ----------
        value : int
            Maximum number of lines to evaluate for layout heuristics.

        Returns
        -------
        self
            The instance with updated `maxLineCount` parameter.
        """
        return self._set(maxLineCount=value)

    def getMaxLineCount(self):
        """Get the maximum number of lines used for layout heuristics.

        Returns
        -------
        int
            The configured maximum number of lines to consider.
        """
        return self.getOrDefault(self.maxLineCount)

    threshold = Param(
        Params._dummy(),
        "threshold",
        "Threshold ratio of empty lines used to decide between new line-based or broken-paragraph grouping.",
        typeConverter=TypeConverters.toFloat
    )

    def setThreshold(self, value: float):
        """Set the empty-line ratio threshold for paragraph grouping decision.

        Parameters
        ----------
        value : float
            Ratio (0.0-1.0) of empty lines used to switch grouping strategies.

        Returns
        -------
        self
            The instance with updated `threshold` parameter.
        """
        return self._set(threshold=value)

    def getThreshold(self):
        """Get the configured empty-line threshold ratio.

        Returns
        -------
        float
            The ratio used to decide paragraph grouping strategy.
        """
        return self.getOrDefault(self.threshold)

    extractTagAttributes = Param(
        Params._dummy(),
        "extractTagAttributes",
        "Extract attribute values into separate lines when parsing tag-based formats (e.g., HTML or XML).",
        typeConverter=TypeConverters.toListString
    )

    # `typing.List` is used instead of the builtin generic `list[str]`: the
    # builtin form is evaluated when the class body runs and raises TypeError
    # on Python versions before 3.9.
    def setExtractTagAttributes(self, attributes: List[str]):
        """Specify which tag attributes should have their values extracted as text.

        Applies when parsing tag-based formats (e.g., HTML or XML).

        Parameters
        ----------
        attributes : List[str]
            Attribute names to extract.

        Returns
        -------
        self
            The instance with updated `extractTagAttributes` parameter.
        """
        return self._set(extractTagAttributes=attributes)

    def getExtractTagAttributes(self):
        """Get the list of tag attribute names configured to be extracted.

        Returns
        -------
        list[str]
            The attribute names whose values will be extracted as text.
        """
        return self.getOrDefault(self.extractTagAttributes)


class HasChunkerProperties(Params):
    """Declares the chunking parameters and their setters.

    Each setter stores the value through ``self._set`` and returns whatever
    ``_set`` returns (the instance itself in PySpark).
    """

    chunkingStrategy = Param(
        Params._dummy(),
        "chunkingStrategy",
        "Set the chunking strategy",
        typeConverter=TypeConverters.toString
    )

    def setChunkingStrategy(self, value: str):
        """Sets the chunking strategy.

        Parameters
        ----------
        value : str
            Name of the chunking strategy.
        """
        return self._set(chunkingStrategy=value)

    maxCharacters = Param(
        Params._dummy(),
        "maxCharacters",
        "Set the maximum number of characters",
        typeConverter=TypeConverters.toInt
    )

    def setMaxCharacters(self, value: int):
        """Sets the maximum number of characters.

        Parameters
        ----------
        value : int
            Maximum number of characters.
        """
        return self._set(maxCharacters=value)

    newAfterNChars = Param(
        Params._dummy(),
        "newAfterNChars",
        "Insert a new chunk after N characters",
        typeConverter=TypeConverters.toInt
    )

    def setNewAfterNChars(self, value: int):
        """Sets the number of characters after which a new chunk is inserted.

        Parameters
        ----------
        value : int
            Number of characters (N) after which a new chunk is inserted.
        """
        return self._set(newAfterNChars=value)

    overlap = Param(
        Params._dummy(),
        "overlap",
        "Set the number of overlapping characters between chunks",
        typeConverter=TypeConverters.toInt
    )

    def setOverlap(self, value: int):
        """Sets the number of overlapping characters between chunks.

        Parameters
        ----------
        value : int
            Number of overlapping characters between chunks.
        """
        return self._set(overlap=value)

    combineTextUnderNChars = Param(
        Params._dummy(),
        "combineTextUnderNChars",
        "Threshold to merge adjacent small sections",
        typeConverter=TypeConverters.toInt
    )

    def setCombineTextUnderNChars(self, value: int):
        """Sets the threshold used to merge adjacent small sections.

        Parameters
        ----------
        value : int
            Threshold to merge adjacent small sections.
        """
        return self._set(combineTextUnderNChars=value)

    overlapAll = Param(
        Params._dummy(),
        "overlapAll",
        "Apply overlap context between all sections, not just split chunks",
        typeConverter=TypeConverters.toBoolean
    )

    def setOverlapAll(self, value: bool):
        """Sets whether overlap context is applied between all sections.

        Parameters
        ----------
        value : bool
            True to apply overlap context between all sections, not just split chunks.
        """
        return self._set(overlapAll=value)


from pyspark.ml.param import Param, Params, TypeConverters


class HasPdfProperties(Params):
    """Declares the PDF-reader parameters and their setters.

    Each setter stores the value through ``self._set`` and returns whatever
    ``_set`` returns (the instance itself in PySpark).
    """

    pageNumCol = Param(
        Params._dummy(),
        "pageNumCol",
        "Page number output column name.",
        typeConverter=TypeConverters.toString
    )

    def setPageNumCol(self, value: str):
        """Sets page number output column name.

        Parameters
        ----------
        value : str
            Name of the column for page numbers.
        """
        return self._set(pageNumCol=value)

    originCol = Param(
        Params._dummy(),
        "originCol",
        "Input column name with original path of file.",
        typeConverter=TypeConverters.toString
    )

    def setOriginCol(self, value: str):
        """Sets input column with original file path.

        Parameters
        ----------
        value : str
            Column name that stores the file path.
        """
        return self._set(originCol=value)

    partitionNum = Param(
        Params._dummy(),
        "partitionNum",
        "Number of partitions.",
        typeConverter=TypeConverters.toInt
    )

    def setPartitionNum(self, value: int):
        """Sets number of partitions.

        Parameters
        ----------
        value : int
            Number of partitions to use.
        """
        return self._set(partitionNum=value)

    storeSplittedPdf = Param(
        Params._dummy(),
        "storeSplittedPdf",
        "Force to store bytes content of splitted pdf.",
        typeConverter=TypeConverters.toBoolean
    )

    def setStoreSplittedPdf(self, value: bool):
        """Sets whether to store byte content of split PDF pages.

        Parameters
        ----------
        value : bool
            True to store PDF page bytes, False otherwise.
        """
        return self._set(storeSplittedPdf=value)

    splitPage = Param(
        Params._dummy(),
        "splitPage",
        "Enable/disable splitting per page to identify page numbers and improve performance.",
        typeConverter=TypeConverters.toBoolean
    )

    def setSplitPage(self, value: bool):
        """Sets whether to split PDF into pages.

        Parameters
        ----------
        value : bool
            True to split per page, False otherwise.
        """
        return self._set(splitPage=value)

    onlyPageNum = Param(
        Params._dummy(),
        "onlyPageNum",
        "Extract only page numbers.",
        typeConverter=TypeConverters.toBoolean
    )

    def setOnlyPageNum(self, value: bool):
        """Sets whether to extract only page numbers.

        Parameters
        ----------
        value : bool
            True to extract only page numbers, False otherwise.
        """
        return self._set(onlyPageNum=value)

    textStripper = Param(
        Params._dummy(),
        "textStripper",
        "Text stripper type used for output layout and formatting.",
        typeConverter=TypeConverters.toString
    )

    def setTextStripper(self, value: str):
        """Sets text stripper type.

        Parameters
        ----------
        value : str
            Text stripper type for layout and formatting.
        """
        return self._set(textStripper=value)

    sort = Param(
        Params._dummy(),
        "sort",
        "Enable/disable sorting content on the page.",
        typeConverter=TypeConverters.toBoolean
    )

    def setSort(self, value: bool):
        """Sets whether to sort content on the page.

        Parameters
        ----------
        value : bool
            True to sort content, False otherwise.
        """
        return self._set(sort=value)

    extractCoordinates = Param(
        Params._dummy(),
        "extractCoordinates",
        "Force extract coordinates of text.",
        typeConverter=TypeConverters.toBoolean
    )

    def setExtractCoordinates(self, value: bool):
        """Sets whether to extract coordinates of text.

        Parameters
        ----------
        value : bool
            True to extract coordinates, False otherwise.
        """
        return self._set(extractCoordinates=value)

    normalizeLigatures = Param(
        Params._dummy(),
        "normalizeLigatures",
        "Whether to convert ligature chars such as 'ﬂ' into its corresponding chars (e.g., {'f', 'l'}).",
        typeConverter=TypeConverters.toBoolean
    )

    def setNormalizeLigatures(self, value: bool):
        """Sets whether to normalize ligatures (e.g., ﬂ → f + l).

        Parameters
        ----------
        value : bool
            True to normalize ligatures, False otherwise.
        """
        return self._set(normalizeLigatures=value)

    readAsImage = Param(
        Params._dummy(),
        "readAsImage",
        "Read PDF pages as images.",
        typeConverter=TypeConverters.toBoolean
    )

    def setReadAsImage(self, value: bool):
        """Sets whether to read PDF pages as images.

        Parameters
        ----------
        value : bool
            True to read as images, False otherwise.
        """
        return self._set(readAsImage=value)