Source code for sparknlp.reader.sparknlp_reader

#  Copyright 2017-2024 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
from sparknlp.internal import ExtendedJavaWrapper


class SparkNLPReader(ExtendedJavaWrapper):
    """Reader that loads HTML, email, and document files as Spark DataFrames.

    The reader wraps the JVM class ``com.johnsnowlabs.reader.SparkNLPReader``.
    Each reading method validates its argument, forwards it to the Java
    method of the same name, and hands the result to ``getDataFrame``.

    For HTML input, two types of paths are supported:

    - `htmlPath`: A path to a directory of HTML files or a single HTML file
      (e.g., `"path/html/files"`).
    - `url`: A single URL or a set of URLs
      (e.g., `"https://www.wikipedia.org"`).

    Parameters
    ----------
    spark : SparkSession
        The active Spark session.
    params : dict, optional
        A dictionary with custom configurations. When omitted, an empty
        dictionary is passed to the Java constructor.
    """

    def __init__(self, spark, params=None):
        # Build a fresh dict per call rather than sharing a mutable default.
        config = {} if params is None else params
        super(SparkNLPReader, self).__init__(
            "com.johnsnowlabs.reader.SparkNLPReader", config
        )
        self.spark = spark

    def html(self, htmlPath):
        """Read HTML files or URLs into a Spark DataFrame.

        Parameters
        ----------
        htmlPath : str or list of str
            Path(s) to HTML file(s) or a list of URLs.

        Returns
        -------
        pyspark.sql.DataFrame
            A DataFrame containing the parsed HTML content.

        Raises
        ------
        TypeError
            If `htmlPath` is neither a string nor a list made up only of
            strings.

        Examples
        --------
        >>> from sparknlp.reader import SparkNLPReader
        >>> html_df = SparkNLPReader(spark).html("https://www.wikipedia.org")

        You can also use SparkNLP to simplify the process:

        >>> import sparknlp
        >>> html_df = sparknlp.read().html("https://www.wikipedia.org")
        >>> html_df.show(truncate=False)
        """
        if isinstance(htmlPath, list):
            # A list is accepted only when every entry is a string.
            is_valid = all(isinstance(entry, str) for entry in htmlPath)
        else:
            is_valid = isinstance(htmlPath, str)
        if not is_valid:
            raise TypeError("htmlPath must be a string or a list of strings")

        java_df = self._java_obj.html(htmlPath)
        return self.getDataFrame(self.spark, java_df)

    def email(self, filePath):
        """Read email files into a Spark DataFrame.

        Parameters
        ----------
        filePath : str
            Path to an email file or a directory containing emails.

        Returns
        -------
        pyspark.sql.DataFrame
            A DataFrame containing parsed email data.

        Raises
        ------
        TypeError
            If `filePath` is not a string.

        Examples
        --------
        >>> from sparknlp.reader import SparkNLPReader
        >>> email_df = SparkNLPReader(spark).email("home/user/emails-directory")

        Using SparkNLP:

        >>> import sparknlp
        >>> email_df = sparknlp.read().email("home/user/emails-directory")
        >>> email_df.show(truncate=False)
        """
        if isinstance(filePath, str):
            java_df = self._java_obj.email(filePath)
            return self.getDataFrame(self.spark, java_df)
        raise TypeError("filePath must be a string")

    def doc(self, docPath):
        """Read document files into a Spark DataFrame.

        Parameters
        ----------
        docPath : str
            Path to a document file.

        Returns
        -------
        pyspark.sql.DataFrame
            A DataFrame containing parsed document content.

        Raises
        ------
        TypeError
            If `docPath` is not a string.
        """
        if isinstance(docPath, str):
            java_df = self._java_obj.doc(docPath)
            return self.getDataFrame(self.spark, java_df)
        raise TypeError("docPath must be a string")