# Copyright 2017-2024 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sparknlp.internal import ExtendedJavaWrapper


class SparkNLPReader(ExtendedJavaWrapper):
"""Instantiates class to read documents in various formats.
Parameters
----------
params : spark
Spark session
params : dict, optional
Parameter with custom configuration
Notes
-----
This class can read HTML, email, PDF, MS Word, Excel, PowerPoint, and text files.
Examples
--------
>>> from sparknlp.reader import SparkNLPReader
>>> reader = SparkNLPReader(spark)
Reading HTML
>>> html_df = reader.html("https://www.wikipedia.org")
>>> # Or with shorthand
>>> import sparknlp
>>> html_df = sparknlp.read().html("https://www.wikipedia.org")
Reading PDF
>>> pdf_df = reader.pdf("home/user/pdfs-directory")
>>> # Or with shorthand
>>> pdf_df = sparknlp.read().pdf("home/user/pdfs-directory")
Reading Email
>>> email_df = reader.email("home/user/emails-directory")
>>> # Or with shorthand
>>> email_df = sparknlp.read().email("home/user/emails-directory")
"""
def __init__(self, spark, params=None, headers=None):
if params is None:
params = {}
super(SparkNLPReader, self).__init__("com.johnsnowlabs.reader.SparkNLPReader", params, headers)
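
    # Construction sketch (assumes an active SparkSession named ``spark``;
    # the parameter key and header shown are illustrative, not authoritative):
    #
    #     params = {"titleFontSize": "12"}
    #     headers = {"User-Agent": "Mozilla/5.0"}
    #     reader = SparkNLPReader(spark, params=params, headers=headers)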

    def html(self, htmlPath):
"""Reads HTML files or URLs and returns a Spark DataFrame.
Parameters
----------
htmlPath : str or list of str
Path(s) to HTML file(s) or a list of URLs.
Returns
-------
pyspark.sql.DataFrame
A DataFrame containing the parsed HTML content.
Examples
--------
>>> from sparknlp.reader import SparkNLPReader
>>> html_df = SparkNLPReader().html("https://www.wikipedia.org")
You can also use SparkNLP to simplify the process:
>>> import sparknlp
>>> html_df = sparknlp.read().html("https://www.wikipedia.org")
>>> html_df.show(truncate=False)
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|url |html |
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|https://example.com/|[{Title, Example Domain, {pageNumber -> 1}}, {NarrativeText, 0, This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission., {pageNumber -> 1}}, {NarrativeText, 0, More information... More information..., {pageNumber -> 1}}] |
+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
>>> html_df.printSchema()
root
|-- url: string (nullable = true)
|-- html: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- elementType: string (nullable = true)
| | |-- content: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
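
        To work with individual parsed elements, the ``html`` array can be
        exploded with standard PySpark functions (a usage sketch, not part of
        this reader's API):

        >>> from pyspark.sql import functions as F
        >>> elements_df = html_df.select(F.explode("html").alias("element"))
        >>> elements_df.select("element.elementType", "element.content").show()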
"""
        if not isinstance(htmlPath, (str, list)) or (
            isinstance(htmlPath, list)
            and not all(isinstance(item, str) for item in htmlPath)
        ):
            raise TypeError("htmlPath must be a string or a list of strings")
jdf = self._java_obj.html(htmlPath)
dataframe = self.getDataFrame(self.spark, jdf)
return dataframe

    def email(self, filePath):
"""Reads email files and returns a Spark DataFrame.
Parameters
----------
filePath : str
Path to an email file or a directory containing emails.
Returns
-------
pyspark.sql.DataFrame
A DataFrame containing parsed email data.
Examples
--------
>>> from sparknlp.reader import SparkNLPReader
>>> email_df = SparkNLPReader(spark).email("home/user/emails-directory")
You can also use SparkNLP to simplify the process:
>>> import sparknlp
>>> email_df = sparknlp.read().email("home/user/emails-directory")
>>> email_df.show()
+---------------------------------------------------+
|email |
+---------------------------------------------------+
|[{Title, Email Text Attachments, {sent_to -> Danilo|
+---------------------------------------------------+
>>> email_df.printSchema()
root
|-- path: string (nullable = true)
|-- content: array (nullable = true)
|-- email: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- elementType: string (nullable = true)
| | |-- content: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
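
        Metadata values such as the recipient can then be extracted with
        standard PySpark functions (a sketch; the ``sent_to`` key is taken
        from the sample output above):

        >>> from pyspark.sql import functions as F
        >>> elements = email_df.select(F.explode("email").alias("el"))
        >>> recipients = elements.select(F.col("el.metadata").getItem("sent_to"))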
"""
if not isinstance(filePath, str):
raise TypeError("filePath must be a string")
jdf = self._java_obj.email(filePath)
dataframe = self.getDataFrame(self.spark, jdf)
return dataframe

    def doc(self, docPath):
"""Reads word document files and returns a Spark DataFrame.
Parameters
----------
docPath : str
Path to a word document file.
Returns
-------
pyspark.sql.DataFrame
A DataFrame containing parsed document content.
Examples
--------
>>> from sparknlp.reader import SparkNLPReader
>>> doc_df = SparkNLPReader().doc(spark, "home/user/word-directory")
You can use SparkNLP for one line of code
>>> import sparknlp
>>> doc_df = sparknlp.read().doc("home/user/word-directory")
>>> doc_df.show()
+-------------------------------------------------+
        |doc                                              |
+-------------------------------------------------+
|[{Table, Header Col 1, {}}, {Table, Header Col 2,|
+-------------------------------------------------+
>>> doc_df.printSchema()
root
|-- path: string (nullable = true)
|-- content: array (nullable = true)
|-- doc: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- elementType: string (nullable = true)
| | |-- content: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
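
        Element types can be tallied with standard PySpark functions (a usage
        sketch, not part of this reader's API):

        >>> from pyspark.sql import functions as F
        >>> elements = doc_df.select(F.explode("doc").alias("el"))
        >>> elements.groupBy("el.elementType").count().show()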
"""
if not isinstance(docPath, str):
raise TypeError("docPath must be a string")
jdf = self._java_obj.doc(docPath)
dataframe = self.getDataFrame(self.spark, jdf)
return dataframe

    def pdf(self, pdfPath):
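        """Reads PDF files and returns a Spark DataFrame.

        Parameters
        ----------
        pdfPath : str
            Path to a PDF file or a directory containing PDF files.

        Returns
        -------
        pyspark.sql.DataFrame
            A DataFrame containing parsed PDF content.

        Examples
        --------
        >>> import sparknlp
        >>> pdf_df = sparknlp.read().pdf("home/user/pdfs-directory")
        """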
        if not isinstance(pdfPath, str):
            raise TypeError("pdfPath must be a string")
jdf = self._java_obj.pdf(pdfPath)
dataframe = self.getDataFrame(self.spark, jdf)
return dataframe

    def xls(self, docPath):
"""Reads excel document files and returns a Spark DataFrame.
Parameters
----------
docPath : str
Path to an excel document file.
Returns
-------
pyspark.sql.DataFrame
A DataFrame containing parsed document content.
Examples
--------
>>> from sparknlp.reader import SparkNLPReader
>>> xlsDf = SparkNLPReader().xls(spark, "home/user/excel-directory")
You can use SparkNLP for one line of code
>>> import sparknlp
>>> xlsDf = sparknlp.read().xls("home/user/excel-directory")
>>> xlsDf.show()
+--------------------------------------------+
|xls |
+--------------------------------------------+
|[{Title, Financial performance, {SheetNam}}]|
+--------------------------------------------+
>>> xlsDf.printSchema()
root
|-- path: string (nullable = true)
|-- content: binary (nullable = true)
|-- xls: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- elementType: string (nullable = true)
| | |-- content: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
"""
if not isinstance(docPath, str):
raise TypeError("docPath must be a string")
jdf = self._java_obj.xls(docPath)
dataframe = self.getDataFrame(self.spark, jdf)
return dataframe

    def ppt(self, docPath):
"""
Reads power point document files and returns a Spark DataFrame.
Parameters
----------
docPath : str
Path to an power point document file.
Returns
-------
pyspark.sql.DataFrame
A DataFrame containing parsed document content.
Examples
--------
>>> from sparknlp.reader import SparkNLPReader
>>> pptDf = SparkNLPReader().ppt(spark, "home/user/powerpoint-directory")
You can use SparkNLP for one line of code
>>> import sparknlp
>>> pptDf = sparknlp.read().ppt("home/user/powerpoint-directory")
>>> pptDf.show(truncate=False)
+-------------------------------------+
|ppt |
+-------------------------------------+
|[{Title, Adding a Bullet Slide, {}},]|
+-------------------------------------+
"""
if not isinstance(docPath, str):
raise TypeError("docPath must be a string")
jdf = self._java_obj.ppt(docPath)
dataframe = self.getDataFrame(self.spark, jdf)
return dataframe

    def txt(self, docPath):
"""Reads TXT files and returns a Spark DataFrame.
Parameters
----------
docPath : str
Path to a TXT file.
Returns
-------
pyspark.sql.DataFrame
A DataFrame containing parsed document content.
Examples
--------
>>> from sparknlp.reader import SparkNLPReader
>>> txtDf = SparkNLPReader().txt(spark, "home/user/txt/files")
You can use SparkNLP for one line of code
>>> import sparknlp
>>> txtDf = sparknlp.read().txt("home/user/txt/files")
>>> txtDf.show(truncate=False)
+-----------------------------------------------+
|txt |
+-----------------------------------------------+
|[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}]|
+-----------------------------------------------+
"""
if not isinstance(docPath, str):
raise TypeError("docPath must be a string")
jdf = self._java_obj.txt(docPath)
return self.getDataFrame(self.spark, jdf)

    def xml(self, docPath):
"""Reads XML files and returns a Spark DataFrame.
Parameters
----------
docPath : str
Path to an XML file or a directory containing XML files.
Returns
-------
pyspark.sql.DataFrame
A DataFrame containing parsed XML content.
Examples
--------
>>> from sparknlp.reader import SparkNLPReader
>>> xml_df = SparkNLPReader(spark).xml("home/user/xml-directory")
You can use SparkNLP for one line of code
>>> import sparknlp
>>> xml_df = sparknlp.read().xml("home/user/xml-directory")
>>> xml_df.show(truncate=False)
+-----------------------------------------------------------+
|xml |
+-----------------------------------------------------------+
|[{Title, John Smith, {elementId -> ..., tag -> title}}] |
+-----------------------------------------------------------+
>>> xml_df.printSchema()
root
|-- path: string (nullable = true)
|-- xml: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- elementType: string (nullable = true)
| | |-- content: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
"""
if not isinstance(docPath, str):
raise TypeError("docPath must be a string")
jdf = self._java_obj.xml(docPath)
return self.getDataFrame(self.spark, jdf)

    def md(self, filePath):
"""Reads Markdown files and returns a Spark DataFrame.
Parameters
----------
filePath : str
Path to a Markdown file or a directory containing Markdown files.
Returns
-------
pyspark.sql.DataFrame
A DataFrame containing parsed Markdown content.
Examples
--------
>>> from sparknlp.reader import SparkNLPReader
>>> md_df = SparkNLPReader(spark).md("home/user/markdown-directory")
You can use SparkNLP for one line of code
>>> import sparknlp
>>> md_df = sparknlp.read().md("home/user/markdown-directory")
>>> md_df.show(truncate=False)
        +---------------------------------------------------------------------+
        |md                                                                   |
        +---------------------------------------------------------------------+
        |[{Title, Sample Markdown Document, {elementId -> ..., tag -> title}}]|
        +---------------------------------------------------------------------+
>>> md_df.printSchema()
root
|-- path: string (nullable = true)
|-- md: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- elementType: string (nullable = true)
| | |-- content: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
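
        Elements can be filtered on metadata keys such as ``tag`` (a sketch;
        the key is taken from the sample output above):

        >>> from pyspark.sql import functions as F
        >>> elements = md_df.select(F.explode("md").alias("el"))
        >>> titles = elements.where(F.col("el.metadata").getItem("tag") == "title")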
"""
if not isinstance(filePath, str):
raise TypeError("filePath must be a string")
jdf = self._java_obj.md(filePath)
return self.getDataFrame(self.spark, jdf)

    def csv(self, csvPath):
"""Reads CSV files and returns a Spark DataFrame.
Parameters
----------
docPath : str
Path to an CSV file or a directory containing CSV files.
Returns
-------
pyspark.sql.DataFrame
A DataFrame containing parsed CSV content.
Examples
--------
>>> from sparknlp.reader import SparkNLPReader
>>> csv_df = SparkNLPReader(spark).csv("home/user/csv-directory")
You can use SparkNLP for one line of code
>>> import sparknlp
>>> csv_df = sparknlp.read().csv("home/user/csv-directory")
>>> csv_df.show(truncate=False)
+-----------------------------------------------------------------------------------------------------------------------------------------+
|csv |
+-----------------------------------------------------------------------------------------------------------------------------------------+
|[{NarrativeText, Alice 100 Bob 95, {}}, {Table, <table><tr><td>Alice</td><td>100</td></tr><tr><td>Bob</td><td>95</td></tr></table>, {}}] |
+-----------------------------------------------------------------------------------------------------------------------------------------+
>>> csv_df.printSchema()
root
|-- path: string (nullable = true)
|-- csv: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- elementType: string (nullable = true)
| | |-- content: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
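
        The HTML rendering of each table can be selected by element type (a
        sketch; the ``Table`` type is taken from the sample output above):

        >>> from pyspark.sql import functions as F
        >>> elements = csv_df.select(F.explode("csv").alias("el"))
        >>> tables = elements.where(F.col("el.elementType") == "Table").select("el.content")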
"""
        if not isinstance(csvPath, str):
            raise TypeError("csvPath must be a string")
jdf = self._java_obj.csv(csvPath)
return self.getDataFrame(self.spark, jdf)