Source code for sparknlp.partition.partition

#  Copyright 2017-2025 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains the Partition annotator for reading and processing various document types."""
import sparknlp
from sparknlp.internal import ExtendedJavaWrapper


class Partition(ExtendedJavaWrapper):
    """A unified interface for extracting structured content from various document
    types using Spark NLP readers.

    This class supports reading from files, URLs, in-memory strings, or byte
    arrays, and returns parsed output as a structured Spark DataFrame.

    Supported formats include:

    - Plain text
    - HTML
    - Word (.doc/.docx)
    - Excel (.xls/.xlsx)
    - PowerPoint (.ppt/.pptx)
    - Email files (.eml, .msg)
    - PDFs

    Parameters
    ----------
    params : dict, optional
        Configuration parameters, including:

        - content_type : str
            Override automatic file type detection.
        - store_content : bool
            Include raw file content in the output DataFrame.
        - timeout : int
            Timeout for fetching HTML content.
        - title_font_size : int
            Font size used to identify titles.
        - include_page_breaks : bool
            Tag content with page break metadata.
        - group_broken_paragraphs : bool
            Merge broken lines into full paragraphs.
        - title_length_size : int
            Maximum character length for a line to qualify as a title.
        - paragraph_split : str
            Regex to detect paragraph boundaries.
        - short_line_word_threshold : int
            Maximum words in a line for it to be considered short.
        - threshold : float
            Ratio of empty lines at which paragraph grouping switches strategy.
        - max_line_count : int
            Maximum number of lines evaluated in paragraph analysis.
        - include_slide_notes : bool
            Include speaker notes in the output.
        - infer_table_structure : bool
            Generate HTML table structure.
        - append_cells : bool
            Merge Excel rows into one block.
        - cell_separator : str
            String used to join cell values in a row.
        - add_attachment_content : bool
            Include the text of plain-text attachments.
        - headers : dict
            Request headers when reading from URLs.

    Examples
    --------
    Reading Text Files

    >>> txt_directory = "/content/txtfiles/reader/txt"
    >>> partition_df = Partition(content_type="text/plain").partition(txt_directory)
    >>> partition_df.show()
    +--------------------+--------------------+
    |                path|                 txt|
    +--------------------+--------------------+
    |file:/content/txt...|[{Title, BIG DATA...|
    +--------------------+--------------------+

    Reading Email Files

    >>> partition_df = Partition().partition("./email-files/test-several-attachments.eml")
    >>> partition_df.show()
    +--------------------+--------------------+
    |                path|               email|
    +--------------------+--------------------+
    |file:/content/ema...|[{Title, Test Sev...|
    +--------------------+--------------------+

    Reading Webpages

    >>> partition_df = Partition().partition(
    ...     "https://www.wikipedia.com",
    ...     headers={"Accept-Language": "es-ES"}
    ... )
    >>> partition_df.show()
    +--------------------+--------------------+
    |                 url|                html|
    +--------------------+--------------------+
    |https://www.wikip...|[{Title, Wikipedi...|
    +--------------------+--------------------+

    For more examples, refer to:
    `examples/python/data-preprocessing/SparkNLP_Partition_Reader_Demo.ipynb`
    """

    def __init__(self, **kwargs):
        self.spark = sparknlp.start()
        params = {}
        for key, value in kwargs.items():
            try:
                params[key] = str(value)
            except Exception as e:
                raise ValueError(
                    f"Invalid value for key '{key}': Cannot cast {type(value)} "
                    f"to string. Original error: {e}"
                )
        super(Partition, self).__init__("com.johnsnowlabs.partition.Partition", params)
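
    # A minimal usage sketch (illustrative, not part of the API surface): because
    # every keyword argument is cast to str above before being forwarded to the
    # JVM, Python booleans and numbers can be passed directly:
    #
    #     part = Partition(content_type="text/plain", include_page_breaks=True)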

    def partition(self, path, headers=None):
        """Reads and parses content from a URL, file, or directory path.

        Parameters
        ----------
        path : str
            Path to a file or directory. URLs and DFS paths are supported.
        headers : dict, optional
            Headers for URL requests.

        Returns
        -------
        pyspark.sql.DataFrame
            DataFrame with parsed content.
        """
        if headers is None:
            headers = {}
        jdf = self._java_obj.partition(path, headers)
        dataframe = self.getDataFrame(self.spark, jdf)
        return dataframe
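
    # A minimal usage sketch (the file path below is hypothetical): content_type
    # is auto-detected when not given, and ``headers`` may be omitted for file or
    # directory input since it defaults to an empty dict:
    #
    #     doc_df = Partition().partition("./word-files/fake_table.docx")
    #     doc_df.show()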

    def partition_urls(self, path, headers=None):
        """Reads and parses content from multiple URLs.

        Parameters
        ----------
        path : list[str]
            List of URLs.
        headers : dict, optional
            Request headers for the URLs.

        Returns
        -------
        pyspark.sql.DataFrame
            DataFrame with parsed URL content.

        Examples
        --------
        >>> urls_df = Partition().partition_urls([
        ...     "https://www.wikipedia.org", "https://example.com/"
        ... ])
        >>> urls_df.show()
        +--------------------+--------------------+
        |                 url|                html|
        +--------------------+--------------------+
        |https://www.wikip...|[{Title, Wikipedi...|
        |https://example.com/|[{Title, Example ...|
        +--------------------+--------------------+
        >>> urls_df.printSchema()
        root
         |-- url: string (nullable = true)
         |-- html: array (nullable = true)
         |    |-- element: struct (containsNull = true)
         |    |    |-- elementType: string (nullable = true)
         |    |    |-- content: string (nullable = true)
         |    |    |-- metadata: map (nullable = true)
         |    |    |    |-- key: string
         |    |    |    |-- value: string (valueContainsNull = true)
        """
        if headers is None:
            headers = {}
        jdf = self._java_obj.partitionUrlsJava(path, headers)
        dataframe = self.getDataFrame(self.spark, jdf)
        return dataframe

    def partition_text(self, text):
        """Parses content from a raw text string.

        Parameters
        ----------
        text : str
            Raw text input.

        Returns
        -------
        pyspark.sql.DataFrame
            DataFrame with parsed text.

        Examples
        --------
        >>> raw_text = (
        ...     "The big brown fox\\n"
        ...     "was walking down the lane.\\n"
        ...     "\\n"
        ...     "At the end of the lane,\\n"
        ...     "the fox met a bear."
        ... )
        >>> text_df = Partition(group_broken_paragraphs=True).partition_text(text=raw_text)
        >>> text_df.show()
        +--------------------------------------+
        |txt                                   |
        +--------------------------------------+
        |[{NarrativeText, The big brown fox was|
        +--------------------------------------+
        >>> text_df.printSchema()
        root
         |-- txt: array (nullable = true)
         |    |-- element: struct (containsNull = true)
         |    |    |-- elementType: string (nullable = true)
         |    |    |-- content: string (nullable = true)
         |    |    |-- metadata: map (nullable = true)
         |    |    |    |-- key: string
         |    |    |    |-- value: string (valueContainsNull = true)
        """
        jdf = self._java_obj.partitionText(text)
        dataframe = self.getDataFrame(self.spark, jdf)
        return dataframe
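
# A minimal post-processing sketch (not part of this module): the DataFrames
# returned above hold an array of {elementType, content, metadata} structs, as
# shown in the printSchema output, so standard PySpark functions can flatten
# them. ``raw_text`` is assumed to be any Python string:
#
#     from pyspark.sql.functions import explode, col
#
#     text_df = Partition(group_broken_paragraphs=True).partition_text(text=raw_text)
#     elements = text_df.select(explode(col("txt")).alias("el"))
#     elements.select("el.elementType", "el.content").show(truncate=False)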