Source code for sparknlp.partition.partition

#  Copyright 2017-2025 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains the Partition annotator for reading and processing various document types."""
import sparknlp
from sparknlp.internal import ExtendedJavaWrapper


class Partition(ExtendedJavaWrapper):
    """A unified interface for extracting structured content from various document
    types using Spark NLP readers.

    This class supports reading from files, URLs, in-memory strings, or byte
    arrays, and returns parsed output as a structured Spark DataFrame.

    Supported formats include:

    - Plain text
    - HTML
    - Word (.doc/.docx)
    - Excel (.xls/.xlsx)
    - PowerPoint (.ppt/.pptx)
    - Email files (.eml, .msg)
    - PDFs

    Parameters
    ----------
    params : dict, optional
        Configuration parameters, including:

        - content_type : str
            Override automatic file type detection.
        - store_content : bool
            Include raw file content in the output DataFrame.
        - timeout : int
            Timeout for fetching HTML content.
        - title_font_size : int
            Font size used to identify titles.
        - include_page_breaks : bool
            Tag content with page break metadata.
        - group_broken_paragraphs : bool
            Merge broken lines into full paragraphs.
        - title_length_size : int
            Maximum character length for a line to qualify as a title.
        - paragraph_split : str
            Regex to detect paragraph boundaries.
        - short_line_word_threshold : int
            Maximum words in a line for it to be considered short.
        - threshold : float
            Ratio of empty lines at which paragraph grouping switches strategy.
        - max_line_count : int
            Maximum number of lines evaluated in paragraph analysis.
        - include_slide_notes : bool
            Include speaker notes in the output.
        - infer_table_structure : bool
            Generate HTML table structure.
        - append_cells : bool
            Merge Excel rows into one block.
        - cell_separator : str
            String used to join cell values in a row.
        - add_attachment_content : bool
            Include the text of plain-text attachments.
        - headers : dict
            Request headers when reading from URLs.

    Examples
    --------
    Reading Text Files

    >>> txt_directory = "/content/txtfiles/reader/txt"
    >>> partition_df = Partition(content_type="text/plain").partition(txt_directory)
    >>> partition_df.show()
    +--------------------+--------------------+
    |                path|                 txt|
    +--------------------+--------------------+
    |file:/content/txt...|[{Title, BIG DATA...|
    +--------------------+--------------------+

    Reading Email Files

    >>> partition_df = Partition().partition("./email-files/test-several-attachments.eml")
    >>> partition_df.show()
    +--------------------+--------------------+
    |                path|               email|
    +--------------------+--------------------+
    |file:/content/ema...|[{Title, Test Sev...|
    +--------------------+--------------------+

    Reading Webpages

    >>> partition_df = Partition().partition(
    ...     "https://www.wikipedia.com",
    ...     headers={"Accept-Language": "es-ES"}
    ... )
    >>> partition_df.show()
    +--------------------+--------------------+
    |                 url|                html|
    +--------------------+--------------------+
    |https://www.wikip...|[{Title, Wikipedi...|
    +--------------------+--------------------+

    For more examples, refer to:
    `examples/python/data-preprocessing/SparkNLP_Partition_Reader_Demo.ipynb`
    """

    def __init__(self, **kwargs):
        self.spark = sparknlp.start()
        params = {}
        for key, value in kwargs.items():
            try:
                params[key] = str(value)
            except Exception as e:
                raise ValueError(
                    f"Invalid value for key '{key}': Cannot cast {type(value)} "
                    f"to string. Original error: {e}"
                )
        super(Partition, self).__init__("com.johnsnowlabs.partition.Partition", params)
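
    # A minimal usage sketch (illustrative, not part of the API surface): because
    # every keyword argument is cast to str above before being forwarded to the
    # JVM, Python booleans and numbers can be passed directly:
    #
    #     part = Partition(content_type="text/plain", include_page_breaks=True)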

    def partition(self, path, headers=None):
        """Reads and parses content from a URL, file, or directory path.

        Parameters
        ----------
        path : str
            Path to a file or directory. URLs and DFS paths are supported.
        headers : dict, optional
            Headers for URL requests.

        Returns
        -------
        pyspark.sql.DataFrame
            DataFrame with parsed content.
        """
        if headers is None:
            headers = {}
        jdf = self._java_obj.partition(path, headers)
        dataframe = self.getDataFrame(self.spark, jdf)
        return dataframe
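
    # A minimal usage sketch (the file path below is hypothetical): content_type
    # is auto-detected when not given, and ``headers`` may be omitted for file or
    # directory input since it defaults to an empty dict:
    #
    #     doc_df = Partition().partition("./word-files/fake_table.docx")
    #     doc_df.show()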

    def partition_urls(self, path, headers=None):
        """Reads and parses content from multiple URLs.

        Parameters
        ----------
        path : list[str]
            List of URLs.
        headers : dict, optional
            Request headers for the URLs.

        Returns
        -------
        pyspark.sql.DataFrame
            DataFrame with parsed URL content.

        Examples
        --------
        >>> urls_df = Partition().partition_urls([
        ...     "https://www.wikipedia.org", "https://example.com/"
        ... ])
        >>> urls_df.show()
        +--------------------+--------------------+
        |                 url|                html|
        +--------------------+--------------------+
        |https://www.wikip...|[{Title, Wikipedi...|
        |https://example.com/|[{Title, Example ...|
        +--------------------+--------------------+
        >>> urls_df.printSchema()
        root
         |-- url: string (nullable = true)
         |-- html: array (nullable = true)
         |    |-- element: struct (containsNull = true)
         |    |    |-- elementType: string (nullable = true)
         |    |    |-- content: string (nullable = true)
         |    |    |-- metadata: map (nullable = true)
         |    |    |    |-- key: string
         |    |    |    |-- value: string (valueContainsNull = true)
        """
        if headers is None:
            headers = {}
        jdf = self._java_obj.partitionUrlsJava(path, headers)
        dataframe = self.getDataFrame(self.spark, jdf)
        return dataframe

    def partition_text(self, text):
        """Parses content from a raw text string.

        Parameters
        ----------
        text : str
            Raw text input.

        Returns
        -------
        pyspark.sql.DataFrame
            DataFrame with parsed text.

        Examples
        --------
        >>> raw_text = (
        ...     "The big brown fox\\n"
        ...     "was walking down the lane.\\n"
        ...     "\\n"
        ...     "At the end of the lane,\\n"
        ...     "the fox met a bear."
        ... )
        >>> text_df = Partition(group_broken_paragraphs=True).partition_text(text=raw_text)
        >>> text_df.show()
        +--------------------------------------+
        |txt                                   |
        +--------------------------------------+
        |[{NarrativeText, The big brown fox was|
        +--------------------------------------+
        >>> text_df.printSchema()
        root
         |-- txt: array (nullable = true)
         |    |-- element: struct (containsNull = true)
         |    |    |-- elementType: string (nullable = true)
         |    |    |-- content: string (nullable = true)
         |    |    |-- metadata: map (nullable = true)
         |    |    |    |-- key: string
         |    |    |    |-- value: string (valueContainsNull = true)
        """
        jdf = self._java_obj.partitionText(text)
        dataframe = self.getDataFrame(self.spark, jdf)
        return dataframe
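
# A minimal post-processing sketch (not part of this module): the DataFrames
# returned above hold an array of {elementType, content, metadata} structs, as
# shown in the printSchema output, so standard PySpark functions can flatten
# them. ``raw_text`` is assumed to be any Python string:
#
#     from pyspark.sql.functions import explode, col
#
#     text_df = Partition(group_broken_paragraphs=True).partition_text(text=raw_text)
#     elements = text_df.select(explode(col("txt")).alias("el"))
#     elements.select("el.elementType", "el.content").show(truncate=False)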