Reader2Doc

Companion object Reader2Doc

class Reader2Doc extends Transformer with DefaultParamsWritable with HasOutputAnnotatorType with HasOutputAnnotationCol with HasBinaryReaderProperties with HasTextReaderProperties with HasReaderContent

The Reader2Doc annotator lets you read files directly within existing Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Doc can be used for extracting structured content from various document types using Spark NLP readers. It supports reading many file types and returns the parsed output as a structured Spark DataFrame.

By default, the annotator combines all extracted elements into a single document annotation. For more fine-grained control, you can use setOutputAsDocument(false) and setExplodeDocs(true) to filter individual elements.

Supported formats include plain text, HTML, Word (.doc/.docx), Excel (.xls/.xlsx), PowerPoint (.ppt/.pptx), email files (.eml, .msg), and PDFs.

Example

import com.johnsnowlabs.reader.Reader2Doc
import com.johnsnowlabs.nlp.base.DocumentAssembler
import org.apache.spark.ml.Pipeline

val reader2Doc = new Reader2Doc()
  .setContentType("application/pdf")
  .setContentPath(s"$pdfDirectory/")
  .setExplodeDocs(true)

val pipeline = new Pipeline()
  .setStages(Array(reader2Doc))

val pipelineModel = pipeline.fit(emptyDataSet)
val resultDf = pipelineModel.transform(emptyDataSet)

resultDf.show()
+------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                                                                            |
+------------------------------------------------------------------------------------------------------------------------------------+
|[{document, 0, 14, This is a Title, {pageNumber -> 1, elementType -> Title, fileName -> pdf-title.pdf}, []}]                        |
|[{document, 15, 38, This is a narrative text, {pageNumber -> 1, elementType -> NarrativeText, fileName -> pdf-title.pdf}, []}]      |
|[{document, 39, 68, This is another narrative text, {pageNumber -> 1, elementType -> NarrativeText, fileName -> pdf-title.pdf}, []}]|
+------------------------------------------------------------------------------------------------------------------------------------+

Linear Supertypes

HasReaderContent, HasTagsReaderProperties, HasXmlReaderProperties, HasHTMLReaderProperties, HasReaderProperties, HasTextReaderProperties, HasBinaryReaderProperties, HasPowerPointProperties, HasPdfReaderProperties, HasExcelReaderProperties, HasEmailReaderProperties, ParamsAndFeaturesWritable, HasFeatures, HasOutputAnnotationCol, HasOutputAnnotatorType, DefaultParamsWritable, MLWritable, Transformer, PipelineStage, Logging, Params, Serializable, Serializable, Identifiable, AnyRef, Any

Known Subclasses

Reader2Table

Ordering

Grouped
Alphabetic
By Inheritance

Inherited

Reader2Doc
HasReaderContent
HasTagsReaderProperties
HasXmlReaderProperties
HasHTMLReaderProperties
HasReaderProperties
HasTextReaderProperties
HasBinaryReaderProperties
HasPowerPointProperties
HasPdfReaderProperties
HasExcelReaderProperties
HasEmailReaderProperties
ParamsAndFeaturesWritable
HasFeatures
HasOutputAnnotationCol
HasOutputAnnotatorType
DefaultParamsWritable
MLWritable
Transformer
PipelineStage
Logging
Params
Serializable
Serializable
Identifiable
AnyRef
Any

Hide All
Show All

Visibility

Public
All

Instance Constructors

new Reader2Doc()
new Reader2Doc(uid: String)

Type Members

type AnnotatorType = String

Definition Classes
HasOutputAnnotatorType

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def $[T](param: Param[T]): T

Attributes
protected
Definition Classes
Params
def $$[T](feature: StructFeature[T]): T

Attributes
protected
Definition Classes
HasFeatures
def $$[K, V](feature: MapFeature[K, V]): Map[K, V]

Attributes
protected
Definition Classes
HasFeatures
def $$[T](feature: SetFeature[T]): Set[T]

Attributes
protected
Definition Classes
HasFeatures
def $$[T](feature: ArrayFeature[T]): Array[T]

Attributes
protected
Definition Classes
HasFeatures
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
val addAttachmentContent: Param[Boolean]

Definition Classes
HasEmailReaderProperties
def afterAnnotate(dataset: DataFrame): DataFrame
val appendCells: Param[Boolean]

Definition Classes
HasExcelReaderProperties
final def asInstanceOf[T0]: T0

Definition Classes
Any
def buildEmptyDataFrame(dataset: Dataset[_]): DataFrame

Definition Classes
HasReaderContent
def buildErrorDataFrame(dataset: Dataset[_], contentPath: String, ext: String): DataFrame

Definition Classes
HasReaderContent
val cellSeparator: Param[String]

Definition Classes
HasExcelReaderProperties
final def clear(param: Param[_]): Reader2Doc.this.type

Definition Classes
Params
def clone(): AnyRef

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( ... ) @native()
val contentPath: Param[String]

Definition Classes
HasReaderProperties
val contentType: Param[String]

Definition Classes
HasReaderProperties
def copy(extra: ParamMap): Transformer

Definition Classes
Reader2Doc → Transformer → PipelineStage → Params
def copyValues[T <: Params](to: T, extra: ParamMap): T

Attributes
protected
Definition Classes
Params
final def defaultCopy[T <: Params](extra: ParamMap): T

Attributes
protected
Definition Classes
Params
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def equals(arg0: Any): Boolean

Definition Classes
AnyRef → Any
val excludeNonText: BooleanParam
def explainParam(param: Param[_]): String

Definition Classes
Params
def explainParams(): String

Definition Classes
Params
val explodeDocs: BooleanParam

Definition Classes
HasReaderProperties
final def extractParamMap(): ParamMap

Definition Classes
Params
final def extractParamMap(extra: ParamMap): ParamMap

Definition Classes
Params
val extractTagAttributes: StringArrayParam

Definition Classes
HasTextReaderProperties
val features: ArrayBuffer[Feature[_, _, _]]

Definition Classes
HasFeatures
def finalize(): Unit

Attributes
protected[lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
val flattenOutput: BooleanParam

Definition Classes
HasReaderProperties
def get[T](feature: StructFeature[T]): Option[T]

Attributes
protected
Definition Classes
HasFeatures
def get[K, V](feature: MapFeature[K, V]): Option[Map[K, V]]

Attributes
protected
Definition Classes
HasFeatures
def get[T](feature: SetFeature[T]): Option[Set[T]]

Attributes
protected
Definition Classes
HasFeatures
def get[T](feature: ArrayFeature[T]): Option[Array[T]]

Attributes
protected
Definition Classes
HasFeatures
final def get[T](param: Param[T]): Option[T]

Definition Classes
Params
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
Annotations
@native()
def getContentType: String

Definition Classes
HasReaderContent
final def getDefault[T](param: Param[T]): Option[T]

Definition Classes
Params
val getFileName: UserDefinedFunction

Definition Classes
HasReaderContent
final def getInputCol: String

Definition Classes
HasReaderProperties
final def getOrDefault[T](param: Param[T]): T

Definition Classes
Params
final def getOutputCol: String
Gets the name of the annotation column that will be generated
Gets the name of the annotation column that will be generated

Definition Classes
HasOutputAnnotationCol
def getParam(paramName: String): Param[Any]

Definition Classes
Params
val groupBrokenParagraphs: Param[Boolean]

Definition Classes
HasTextReaderProperties
final def hasDefault[T](param: Param[T]): Boolean

Definition Classes
Params
def hasParam(paramName: String): Boolean

Definition Classes
Params
def hashCode(): Int

Definition Classes
AnyRef → Any
Annotations
@native()
val headers: Param[Map[String, String]]

Definition Classes
HasHTMLReaderProperties
val ignoreExceptions: BooleanParam

Definition Classes
HasReaderProperties
val includePageBreaks: Param[Boolean]

Definition Classes
HasReaderProperties
val includeSlideNotes: Param[Boolean]

Definition Classes
HasPowerPointProperties
val includeTitleTag: Param[Boolean]

Definition Classes
HasHTMLReaderProperties
val inferTableStructure: Param[Boolean]

Definition Classes
HasReaderProperties
def initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean

Attributes
protected
Definition Classes
Logging
def initializeLogIfNecessary(isInterpreter: Boolean): Unit

Attributes
protected
Definition Classes
Logging
final val inputCol: Param[String]

Attributes
protected
Definition Classes
HasReaderProperties
final def isDefined(param: Param[_]): Boolean

Definition Classes
Params
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
final def isSet(param: Param[_]): Boolean

Definition Classes
Params
def isTraceEnabled(): Boolean

Attributes
protected
Definition Classes
Logging
val joinString: Param[String]
def listAllFilesRecursively(dir: File): Seq[File]

Definition Classes
HasReaderContent
def log: Logger

Attributes
protected
Definition Classes
Logging
def logDebug(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logDebug(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logError(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logError(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logInfo(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logInfo(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logName: String

Attributes
protected
Definition Classes
Logging
def logTrace(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logTrace(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
def logWarning(msg: ⇒ String, throwable: Throwable): Unit

Attributes
protected
Definition Classes
Logging
def logWarning(msg: ⇒ String): Unit

Attributes
protected
Definition Classes
Logging
val maxLineCount: Param[Int]

Definition Classes
HasTextReaderProperties
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
Annotations
@native()
final def notifyAll(): Unit

Definition Classes
AnyRef
Annotations
@native()
def onWrite(path: String, spark: SparkSession): Unit

Attributes
protected
Definition Classes
ParamsAndFeaturesWritable
val onlyLeafNodes: Param[Boolean]

Definition Classes
HasXmlReaderProperties
val outputAnnotatorType: AnnotatorType

Definition Classes
Reader2Doc → HasOutputAnnotatorType
val outputAsDocument: BooleanParam
Whether to return all sentences joined into a single document
final val outputCol: Param[String]

Attributes
protected
Definition Classes
HasOutputAnnotationCol
val outputFormat: Param[String]

Definition Classes
HasHTMLReaderProperties
val paragraphSplit: Param[String]

Definition Classes
HasTextReaderProperties
lazy val params: Array[Param[_]]

Definition Classes
Params
def partitionBuilder: Partition

Attributes
protected
def partitionContent(partition: Partition, contentPath: String, isText: Boolean, dataset: Dataset[_]): DataFrame

Definition Classes
HasReaderContent
def partitionContentFromPath(partition: Partition, contentPath: String, isText: Boolean, dataset: Dataset[_]): DataFrame

Definition Classes
HasReaderContent
def partitionMixedContent(dataset: Dataset[_], dirPath: String, partitionParams: Map[String, String]): DataFrame

Definition Classes
HasReaderContent
def partitionToAnnotation: UserDefinedFunction
final val readAsImage: BooleanParam

Definition Classes
HasPdfReaderProperties
def retrieveFileName(path: String): String

Definition Classes
HasReaderContent
def save(path: String): Unit

Definition Classes
MLWritable
Annotations
@Since( "1.6.0" ) @throws( ... )
def set[T](feature: StructFeature[T], value: T): Reader2Doc.this.type

Attributes
protected
Definition Classes
HasFeatures
def set[K, V](feature: MapFeature[K, V], value: Map[K, V]): Reader2Doc.this.type

Attributes
protected
Definition Classes
HasFeatures
def set[T](feature: SetFeature[T], value: Set[T]): Reader2Doc.this.type

Attributes
protected
Definition Classes
HasFeatures
def set[T](feature: ArrayFeature[T], value: Array[T]): Reader2Doc.this.type

Attributes
protected
Definition Classes
HasFeatures
final def set(paramPair: ParamPair[_]): Reader2Doc.this.type

Attributes
protected
Definition Classes
Params
final def set(param: String, value: Any): Reader2Doc.this.type

Attributes
protected
Definition Classes
Params
final def set[T](param: Param[T], value: T): Reader2Doc.this.type

Definition Classes
Params
def setAddAttachmentContent(value: Boolean): Reader2Doc.this.type

Definition Classes
HasEmailReaderProperties
def setAppendCells(value: Boolean): Reader2Doc.this.type

Definition Classes
HasExcelReaderProperties
def setCellSeparator(value: String): Reader2Doc.this.type

Definition Classes
HasExcelReaderProperties
def setContentPath(value: String): Reader2Doc.this.type

Definition Classes
HasReaderProperties
def setContentType(value: String): Reader2Doc.this.type

Definition Classes
HasReaderProperties
def setDefault[T](feature: StructFeature[T], value: () ⇒ T): Reader2Doc.this.type

Attributes
protected
Definition Classes
HasFeatures
def setDefault[K, V](feature: MapFeature[K, V], value: () ⇒ Map[K, V]): Reader2Doc.this.type

Attributes
protected
Definition Classes
HasFeatures
def setDefault[T](feature: SetFeature[T], value: () ⇒ Set[T]): Reader2Doc.this.type

Attributes
protected
Definition Classes
HasFeatures
def setDefault[T](feature: ArrayFeature[T], value: () ⇒ Array[T]): Reader2Doc.this.type

Attributes
protected
Definition Classes
HasFeatures
final def setDefault(paramPairs: ParamPair[_]*): Reader2Doc.this.type

Attributes
protected
Definition Classes
Params
final def setDefault[T](param: Param[T], value: T): Reader2Doc.this.type

Attributes
protected[org.apache.spark.ml]
Definition Classes
Params
def setExcludeNonText(value: Boolean): Reader2Doc.this.type
Excludes rows that are not text data.
Excludes rows that are not text data, e.g. tables.
def setExplodeDocs(value: Boolean): Reader2Doc.this.type

Definition Classes
HasReaderProperties
def setExtractTagAttributes(attributes: Array[String]): Reader2Doc.this.type
Specify which tag attributes should have their values extracted as text when parsing tag-based formats (e.g., HTML or XML).
Specify which tag attributes should have their values extracted as text when parsing tag-based formats (e.g., HTML or XML).
attributes
array of attribute names to extract
returns
this instance with the updated extractTagAttributes parameter

Definition Classes
HasTextReaderProperties
def setFlattenOutput(value: Boolean): Reader2Doc.this.type

Definition Classes
HasReaderProperties
def setGroupBrokenParagraphs(value: Boolean): Reader2Doc.this.type
Enable or disable merging of fragmented lines into coherent paragraphs when parsing text.
Enable or disable merging of fragmented lines into coherent paragraphs when parsing text. When enabled, heuristics based on line length and structure are used to group lines.
value
true to group broken paragraphs, false to preserve original line breaks
returns
this instance with the updated groupBrokenParagraphs parameter

Definition Classes
HasTextReaderProperties
def setHeaders(value: Map[String, String]): Reader2Doc.this.type

Definition Classes
HasHTMLReaderProperties
def setHeadersPython(headers: Map[String, String]): Reader2Doc.this.type

Definition Classes
HasHTMLReaderProperties
def setIgnoreExceptions(value: Boolean): Reader2Doc.this.type

Definition Classes
HasReaderProperties
def setIncludePageBreaks(value: Boolean): Reader2Doc.this.type

Definition Classes
HasReaderProperties
def setIncludeSlideNotes(value: Boolean): Reader2Doc.this.type

Definition Classes
HasPowerPointProperties
def setIncludeTitleTag(value: Boolean): Reader2Doc.this.type

Definition Classes
HasHTMLReaderProperties
def setInferTableStructure(value: Boolean): Reader2Doc.this.type

Definition Classes
HasReaderProperties
final def setInputCol(value: String): Reader2Doc.this.type

Definition Classes
HasReaderProperties
def setJoinString(value: String): Reader2Doc.this.type
If outputAsDocument is true, specifies the string used to join elements into a single document.
def setMaxLineCount(value: Int): Reader2Doc.this.type
Set the maximum number of lines to evaluate when estimating paragraph layout characteristics.
Set the maximum number of lines to evaluate when estimating paragraph layout characteristics. This limits the amount of text inspected for layout heuristics.
value
maximum number of lines to inspect
returns
this instance with the updated maxLineCount parameter

Definition Classes
HasTextReaderProperties
def setOnlyLeafNodes(value: Boolean): Reader2Doc.this.type

Definition Classes
HasXmlReaderProperties
def setOutputAsDocument(value: Boolean): Reader2Doc.this.type
Whether to return all sentences joined into a single document
final def setOutputCol(value: String): Reader2Doc.this.type
Overrides annotation column name when transforming
Overrides annotation column name when transforming

Definition Classes
HasOutputAnnotationCol
def setOutputFormat(value: String): Reader2Doc.this.type

Definition Classes
HasHTMLReaderProperties
def setParagraphSplit(value: String): Reader2Doc.this.type
Set the regular expression used to detect paragraph boundaries when grouping broken paragraphs.
Set the regular expression used to detect paragraph boundaries when grouping broken paragraphs.
value
regex pattern string to detect paragraph boundaries
returns
this instance with the updated paragraphSplit parameter

Definition Classes
HasTextReaderProperties
def setShortLineWordThreshold(value: Int): Reader2Doc.this.type
Set the maximum number of words for a line to be considered "short" when grouping broken paragraphs.
Set the maximum number of words for a line to be considered "short" when grouping broken paragraphs. Short lines often indicate line-wrapping within a paragraph rather than a real paragraph break.
value
maximum word count for a line to be considered short
returns
this instance with the updated shortLineWordThreshold parameter

Definition Classes
HasTextReaderProperties
def setStoreContent(value: Boolean): Reader2Doc.this.type

Definition Classes
HasReaderProperties
def setThreshold(value: Double): Reader2Doc.this.type
Set the threshold ratio of empty lines used to decide between newline-based and broken-paragraph grouping.
Set the threshold ratio of empty lines used to decide between newline-based and broken-paragraph grouping. Lower values make it easier to choose broken-paragraph grouping.
value
ratio between 0.0 and 1.0 representing the empty-line threshold
returns
this instance with the updated threshold parameter

Definition Classes
HasTextReaderProperties
def setTimeout(value: Int): Reader2Doc.this.type

Definition Classes
HasHTMLReaderProperties
def setTitleFontSize(value: Int): Reader2Doc.this.type

Definition Classes
HasReaderProperties
def setTitleLengthSize(value: Int): Reader2Doc.this.type
Set the maximum character length used to determine if a text block qualifies as a title during parsing.
Set the maximum character length used to determine if a text block qualifies as a title during parsing.
value
maximum number of characters to treat a block as a title
returns
this instance with the updated titleLengthSize parameter

Definition Classes
HasTextReaderProperties
def setTitleThreshold(value: Double): Reader2Doc.this.type

Definition Classes
HasPdfReaderProperties
def setXmlKeepTags(value: Boolean): Reader2Doc.this.type

Definition Classes
HasXmlReaderProperties
val shortLineWordThreshold: Param[Int]

Definition Classes
HasTextReaderProperties
val storeContent: Param[Boolean]

Definition Classes
HasReaderProperties
val supportedTypes: Map[String, (String, Boolean)]

Definition Classes
HasReaderContent
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
val threshold: Param[Double]

Definition Classes
HasTextReaderProperties
val timeout: Param[Int]

Definition Classes
HasHTMLReaderProperties
val titleFontSize: Param[Int]

Definition Classes
HasReaderProperties
val titleLengthSize: Param[Int]

Definition Classes
HasTextReaderProperties
val titleThreshold: Param[Double]

Definition Classes
HasPdfReaderProperties
def toString(): String

Definition Classes
Identifiable → AnyRef → Any
def transform(dataset: Dataset[_]): DataFrame

Definition Classes
Reader2Doc → Transformer
def transform(dataset: Dataset[_], paramMap: ParamMap): DataFrame

Definition Classes
Transformer
Annotations
@Since( "2.0.0" )
def transform(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): DataFrame

Definition Classes
Transformer
Annotations
@Since( "2.0.0" ) @varargs()
def transformSchema(schema: StructType): StructType

Definition Classes
Reader2Doc → PipelineStage
def transformSchema(schema: StructType, logging: Boolean): StructType

Attributes
protected
Definition Classes
PipelineStage
Annotations
@DeveloperApi()
val uid: String

Definition Classes
Reader2Doc → Identifiable
def validateRequiredParameters(): Unit

Attributes
protected
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... ) @native()
def write: MLWriter

Definition Classes
ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable
val xmlKeepTags: Param[Boolean]

Definition Classes
HasXmlReaderProperties

Packages

Reader2Doc

Companion object Reader2Doc

class Reader2Doc extends Transformer with DefaultParamsWritable with HasOutputAnnotatorType with HasOutputAnnotationCol with HasBinaryReaderProperties with HasTextReaderProperties with HasReaderContent

Example

Instance Constructors

Type Members

Value Members

Inherited from HasReaderContent

Inherited from HasTagsReaderProperties

Inherited from HasXmlReaderProperties

Inherited from HasHTMLReaderProperties

Inherited from HasReaderProperties

Inherited from HasTextReaderProperties

Inherited from HasBinaryReaderProperties

Inherited from HasPowerPointProperties

Inherited from HasPdfReaderProperties

Inherited from HasExcelReaderProperties

Inherited from HasEmailReaderProperties

Inherited from ParamsAndFeaturesWritable

Inherited from HasFeatures

Inherited from HasOutputAnnotationCol

Inherited from HasOutputAnnotatorType

Inherited from DefaultParamsWritable

Inherited from MLWritable

Inherited from Transformer

Inherited from PipelineStage

Inherited from Logging

Inherited from Params

Inherited from Serializable

Inherited from Serializable

Inherited from Identifiable

Inherited from AnyRef

Inherited from Any

param

Ungrouped

Packages

Reader2Doc 

Companion object Reader2Doc

class Reader2Doc extends Transformer with DefaultParamsWritable with HasOutputAnnotatorType with HasOutputAnnotationCol with HasBinaryReaderProperties with HasTextReaderProperties with HasReaderContent

Example

Instance Constructors

Type Members

Value Members

Inherited from HasReaderContent

Inherited from HasTagsReaderProperties

Inherited from HasXmlReaderProperties

Inherited from HasHTMLReaderProperties

Inherited from HasReaderProperties

Inherited from HasTextReaderProperties

Inherited from HasBinaryReaderProperties

Inherited from HasPowerPointProperties

Inherited from HasPdfReaderProperties

Inherited from HasExcelReaderProperties

Inherited from HasEmailReaderProperties

Inherited from ParamsAndFeaturesWritable

Inherited from HasFeatures

Inherited from HasOutputAnnotationCol

Inherited from HasOutputAnnotatorType

Inherited from DefaultParamsWritable

Inherited from MLWritable

Inherited from Transformer

Inherited from PipelineStage

Inherited from Logging

Inherited from Params

Inherited from Serializable

Inherited from Serializable

Inherited from Identifiable

Inherited from AnyRef

Inherited from Any

param

Ungrouped

Reader2Doc