Source code for sparknlp.reader.sparknlp_reader

#  Copyright 2017-2024 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
from sparknlp.internal import ExtendedJavaWrapper


[docs]class SparkNLPReader(ExtendedJavaWrapper):
    """Instantiates class to read HTML, email, and document files.

    Two types of input paths are supported:

    - `htmlPath`: A path to a directory of HTML files or a single HTML file (e.g., `"path/html/files"`).
    - `url`: A single URL or a set of URLs (e.g., `"https://www.wikipedia.org"`).

    Parameters
    ----------
    spark : SparkSession
        The active Spark session.
    params : dict, optional
        A dictionary with custom configurations.
    """

    def __init__(self, spark, params=None):
        if params is None:
            params = {}
        super(SparkNLPReader, self).__init__("com.johnsnowlabs.reader.SparkNLPReader", params)
        self.spark = spark

[docs]    def html(self, htmlPath):
        """Reads HTML files or URLs and returns a Spark DataFrame.

        Parameters
        ----------
        htmlPath : str or list of str
            Path(s) to HTML file(s) or a list of URLs.

        Returns
        -------
        pyspark.sql.DataFrame
            A DataFrame containing the parsed HTML content.

        Examples
        --------
        >>> from sparknlp.reader import SparkNLPReader
        >>> html_df = SparkNLPReader(spark).html("https://www.wikipedia.org")

        You can also use SparkNLP to simplify the process:

        >>> import sparknlp
        >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
        >>> html_df.show(truncate=False)
        """
        if not isinstance(htmlPath, (str, list)) or (isinstance(htmlPath, list) and not all(isinstance(item, str) for item in htmlPath)):
            raise TypeError("htmlPath must be a string or a list of strings")
        jdf = self._java_obj.html(htmlPath)
        return self.getDataFrame(self.spark, jdf)

[docs]    def email(self, filePath):
        """Reads email files and returns a Spark DataFrame.

        Parameters
        ----------
        filePath : str
            Path to an email file or a directory containing emails.

        Returns
        -------
        pyspark.sql.DataFrame
            A DataFrame containing parsed email data.

        Examples
        --------
        >>> from sparknlp.reader import SparkNLPReader
        >>> email_df = SparkNLPReader(spark).email("home/user/emails-directory")

        Using SparkNLP:

        >>> import sparknlp
        >>> email_df = sparknlp.read().email("home/user/emails-directory")
        >>> email_df.show(truncate=False)
        """
        if not isinstance(filePath, str):
            raise TypeError("filePath must be a string")
        jdf = self._java_obj.email(filePath)
        return self.getDataFrame(self.spark, jdf)

[docs]    def doc(self, docPath):
        """Reads document files and returns a Spark DataFrame.

        Parameters
        ----------
        docPath : str
            Path to a document file.

        Returns
        -------
        pyspark.sql.DataFrame
            A DataFrame containing parsed document content.
        """
        if not isinstance(docPath, str):
            raise TypeError("docPath must be a string")
        jdf = self._java_obj.doc(docPath)
        return self.getDataFrame(self.spark, jdf)