class PdfReader extends Serializable
Class to parse and read PDF files.
Linear Supertypes
Ordering
- Alphabetic
- By Inheritance
Inherited
- PdfReader
- Serializable
- Serializable
- AnyRef
- Any
- Hide All
- Show All
Visibility
- Public
- All
Instance Constructors
-
new
PdfReader(storeContent: Boolean = false, titleThreshold: Double = 18.0)
- storeContent
Whether to include the raw file content in the output DataFrame as a separate 'content' column. pdfPath: this is a path to a directory of PDF files or a path to a single PDF file, e.g. "path/pdf/files"
Example
val path = "./pdf-files/pdf-doc.pdf" val pdfReader = new PdfReader() val pdfDF = pdfReader.pdf(path)
pdfDF.show() +--------------------+--------------------+ | path| pdf| +--------------------+--------------------+ |file:/content/htm...|[{Title, My First...| +--------------------+--------------------+ pdfDF.printSchema() root |-- path: string (nullable = true) |-- pdf: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- elementType: string (nullable = true) | | |-- content: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true)
For more examples please refer to this notebook.
- titleThreshold
Minimum font size threshold used as part of heuristic rules to detect title elements based on formatting (e.g., bold, centered, capitalized). By default, it is set to 18.
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
- def getOutputColumn: String
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
- def pdf(filePath: String): DataFrame
- def pdfToHTMLElement(content: Array[Byte]): Seq[HTMLElement]
- def setOutputColumn(name: String): PdfReader.this.type
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()