HTMLReader

class HTMLReader extends Serializable

Class to parse and read HTML files.

Linear Supertypes

Serializable, Serializable, AnyRef, Any

Ordering

Alphabetic
By Inheritance

Inherited

HTMLReader
Serializable
Serializable
AnyRef
Any

Hide All
Show All

Visibility

Public
All

Instance Constructors

new HTMLReader(titleFontSize: Int = 16, storeContent: Boolean = false, timeout: Int = 0, includeTitleTag: Boolean = false, outputFormat: String = "plain-text", headers: Map[String, String] = Map.empty, ignoreUrlErrors: Boolean = true)
titleFontSize
Minimum font size threshold in pixels used as part of heuristic rules to detect title elements based on formatting (e.g., bold, centered, capitalized). By default, it is set to 16.
storeContent
Whether to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output. By default, it is set to false.
timeout
Timeout value in seconds for reading remote HTML resources. Applied when fetching content from URLs. By default, it is set to 0.
headers
Headers to include in the URL request.
ignoreUrlErrors
When true, remote URL fetch failures return a synthetic HTML fallback instead of failing the Spark job. By default, it is set to true. Two types of input paths are supported by the reader: htmlPath, a path to a directory of HTML files or to a single HTML file (e.g., "path/html/files"); and url, the URL or set of URLs of a website (e.g., "https://www.wikipedia.org").
Example
```
val path = "./html-files/fake-html.html"
val HTMLReader = new HTMLReader()
val htmlDF = HTMLReader.read(path)
```
```
htmlDF.show()
+--------------------+--------------------+
|                path|                html|
+--------------------+--------------------+
|file:/content/htm...|[{Title, My First...|
+--------------------+--------------------+

htmlDF.printSchema()
root
 |-- path: string (nullable = true)
 |-- html: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- elementType: string (nullable = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
```
For more examples please refer to this notebook.

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def asInstanceOf[T0]: T0

Definition Classes
Any
def clone(): AnyRef

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( ... ) @native()
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def equals(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def finalize(): Unit

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
Annotations
@native()
def getOutputColumn: String
def hashCode(): Int

Definition Classes
AnyRef → Any
Annotations
@native()
def htmlToHTMLElement(html: String): Array[HTMLElement]
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
Annotations
@native()
final def notifyAll(): Unit

Definition Classes
AnyRef
Annotations
@native()
def read(inputURLs: Array[String]): DataFrame
inputURLs
A list of URLs to read, e.g. [www.wikipedia.com, www.example.com].
returns
DataFrame with parsed URL content.
def read(inputSource: String): DataFrame
inputSource
The URL to read, e.g. www.wikipedia.com.
returns
DataFrame with parsed URL content.
def setOutputColumn(value: String): HTMLReader.this.type
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
def toString(): String

Definition Classes
AnyRef → Any
def urlToHTMLElement(url: String): Array[HTMLElement]
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... ) @native()

Packages

HTMLReader

class HTMLReader extends Serializable

Instance Constructors

Example

Value Members

Inherited from Serializable

Inherited from Serializable

Inherited from AnyRef

Inherited from Any

Ungrouped

Packages

HTMLReader 

class HTMLReader extends Serializable

Instance Constructors

Example

Value Members

Inherited from Serializable

Inherited from Serializable

Inherited from AnyRef

Inherited from Any

Ungrouped

HTMLReader