class HTMLReader extends Serializable
Class to parse and read HTML files.
- Alphabetic
- By Inheritance
- HTMLReader
- Serializable
- Serializable
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
-
new
HTMLReader(titleFontSize: Int = 16, storeContent: Boolean = false, timeout: Int = 0, includeTitleTag: Boolean = false, headers: Map[String, String] = Map.empty)
- titleFontSize
Minimum font size threshold in pixels used as part of heuristic rules to detect title elements based on formatting (e.g., bold, centered, capitalized). By default, it is set to 16.
- storeContent
Whether to include the raw file content in the output DataFrame as a separate 'content' column, alongside the structured output. By default, it is set to false.
- timeout
Timeout value in seconds for reading remote HTML resources. Applied when fetching content from URLs. By default, it is set to 0.
- headers
Sets the necessary headers for the URL request. Two types of input are supported by the reader: (1) htmlPath: a path to a directory of HTML files or a path to a single HTML file, e.g. "path/html/files"; (2) url: the URL or set of URLs of a website, e.g. "https://www.wikipedia.org".
Example
val path = "./html-files/fake-html.html"
val htmlReader = new HTMLReader()
val htmlDF = htmlReader.read(path)
htmlDF.show()
+--------------------+--------------------+
|                path|                html|
+--------------------+--------------------+
|file:/content/htm...|[{Title, My First...|
+--------------------+--------------------+

htmlDF.printSchema()
root
 |-- path: string (nullable = true)
 |-- html: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- elementType: string (nullable = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
For more examples please refer to this notebook.
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
- def getOutputColumn: String
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
- def htmlToHTMLElement(html: String): Array[HTMLElement]
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
def
read(inputURLs: Array[String]): DataFrame
- inputURLs
A list of URLs to read, e.g. [www.wikipedia.com, www.example.com].
- returns
Dataframe with parsed URL content.
-
def
read(inputSource: String): DataFrame
- inputSource
The input to read: a path to an HTML file or a directory of HTML files (e.g. "path/html/files"), or a URL (e.g. www.wikipedia.com).
- returns
DataFrame with the parsed HTML content.
- def setOutputColumn(value: String): HTMLReader.this.type
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
- def urlToHTMLElement(url: String): Array[HTMLElement]
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()