class Reader2Doc extends Transformer with DefaultParamsWritable with HasOutputAnnotatorType with HasOutputAnnotationCol with HasReaderProperties with HasEmailReaderProperties with HasExcelReaderProperties with HasHTMLReaderProperties with HasPowerPointProperties with HasTextReaderProperties with HasXmlReaderProperties

The Reader2Doc annotator lets you read files directly within existing Spark NLP workflows, enabling seamless reuse of your pipelines. Reader2Doc can be used to extract structured content from various document types using Spark NLP readers. It supports reading many file types and returns the parsed output as a structured Spark DataFrame.

Supported formats include plain text, HTML, Word (.doc/.docx), Excel (.xls/.xlsx), PowerPoint (.ppt/.pptx), email files (.eml, .msg), and PDFs.

Example

import com.johnsnowlabs.reader.Reader2Doc
import com.johnsnowlabs.nlp.base.DocumentAssembler
import org.apache.spark.ml.Pipeline

val reader2Doc = new Reader2Doc()
  .setContentType("application/pdf")
  .setContentPath(s"$pdfDirectory/")

val pipeline = new Pipeline()
  .setStages(Array(reader2Doc))

val pipelineModel = pipeline.fit(emptyDataSet)
val resultDf = pipelineModel.transform(emptyDataSet)

resultDf.show()
+------------------------------------------------------------------------------------------------------------------------------------+
|document                                                                                                                            |
+------------------------------------------------------------------------------------------------------------------------------------+
|[{document, 0, 14, This is a Title, {pageNumber -> 1, elementType -> Title, fileName -> pdf-title.pdf}, []}]                        |
|[{document, 15, 38, This is a narrative text, {pageNumber -> 1, elementType -> NarrativeText, fileName -> pdf-title.pdf}, []}]      |
|[{document, 39, 68, This is another narrative text, {pageNumber -> 1, elementType -> NarrativeText, fileName -> pdf-title.pdf}, []}]|
+------------------------------------------------------------------------------------------------------------------------------------+
Linear Supertypes
Ordering
  1. Alphabetic
  2. By Inheritance
Inherited
  1. Reader2Doc
  2. HasXmlReaderProperties
  3. HasTextReaderProperties
  4. HasPowerPointProperties
  5. HasHTMLReaderProperties
  6. HasExcelReaderProperties
  7. HasEmailReaderProperties
  8. HasReaderProperties
  9. ParamsAndFeaturesWritable
  10. HasFeatures
  11. HasOutputAnnotationCol
  12. HasOutputAnnotatorType
  13. DefaultParamsWritable
  14. MLWritable
  15. Transformer
  16. PipelineStage
  17. Logging
  18. Params
  19. Serializable
  20. Serializable
  21. Identifiable
  22. AnyRef
  23. Any
  1. Hide All
  2. Show All
Visibility
  1. Public
  2. All

Instance Constructors

  1. new Reader2Doc()
  2. new Reader2Doc(uid: String)

Type Members

  1. type AnnotatorType = String
    Definition Classes
    HasOutputAnnotatorType

Value Members

  1. final def !=(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  2. final def ##(): Int
    Definition Classes
    AnyRef → Any
  3. final def $[T](param: Param[T]): T
    Attributes
    protected
    Definition Classes
    Params
  4. def $$[T](feature: StructFeature[T]): T
    Attributes
    protected
    Definition Classes
    HasFeatures
  5. def $$[K, V](feature: MapFeature[K, V]): Map[K, V]
    Attributes
    protected
    Definition Classes
    HasFeatures
  6. def $$[T](feature: SetFeature[T]): Set[T]
    Attributes
    protected
    Definition Classes
    HasFeatures
  7. def $$[T](feature: ArrayFeature[T]): Array[T]
    Attributes
    protected
    Definition Classes
    HasFeatures
  8. final def ==(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  9. val addAttachmentContent: Param[Boolean]
    Definition Classes
    HasEmailReaderProperties
  10. val appendCells: Param[Boolean]
    Definition Classes
    HasExcelReaderProperties
  11. final def asInstanceOf[T0]: T0
    Definition Classes
    Any
  12. val cellSeparator: Param[String]
    Definition Classes
    HasExcelReaderProperties
  13. final def clear(param: Param[_]): Reader2Doc.this.type
    Definition Classes
    Params
  14. def clone(): AnyRef
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()
  15. val contentPath: Param[String]
    Definition Classes
    HasReaderProperties
  16. val contentType: Param[String]
    Definition Classes
    HasReaderProperties
  17. def copy(extra: ParamMap): Transformer
    Definition Classes
    Reader2Doc → Transformer → PipelineStage → Params
  18. def copyValues[T <: Params](to: T, extra: ParamMap): T
    Attributes
    protected
    Definition Classes
    Params
  19. final def defaultCopy[T <: Params](extra: ParamMap): T
    Attributes
    protected
    Definition Classes
    Params
  20. final def eq(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  21. def equals(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  22. def explainParam(param: Param[_]): String
    Definition Classes
    Params
  23. def explainParams(): String
    Definition Classes
    Params
  24. val explodeDocs: BooleanParam
  25. final def extractParamMap(): ParamMap
    Definition Classes
    Params
  26. final def extractParamMap(extra: ParamMap): ParamMap
    Definition Classes
    Params
  27. val features: ArrayBuffer[Feature[_, _, _]]
    Definition Classes
    HasFeatures
  28. def finalize(): Unit
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( classOf[java.lang.Throwable] )
  29. val flattenOutput: BooleanParam
  30. def get[T](feature: StructFeature[T]): Option[T]
    Attributes
    protected
    Definition Classes
    HasFeatures
  31. def get[K, V](feature: MapFeature[K, V]): Option[Map[K, V]]
    Attributes
    protected
    Definition Classes
    HasFeatures
  32. def get[T](feature: SetFeature[T]): Option[Set[T]]
    Attributes
    protected
    Definition Classes
    HasFeatures
  33. def get[T](feature: ArrayFeature[T]): Option[Array[T]]
    Attributes
    protected
    Definition Classes
    HasFeatures
  34. final def get[T](param: Param[T]): Option[T]
    Definition Classes
    Params
  35. final def getClass(): Class[_]
    Definition Classes
    AnyRef → Any
    Annotations
    @native()
  36. final def getDefault[T](param: Param[T]): Option[T]
    Definition Classes
    Params
  37. final def getOrDefault[T](param: Param[T]): T
    Definition Classes
    Params
  38. final def getOutputCol: String

    Gets the name of the annotation column that will be generated

    Gets the name of the annotation column that will be generated

    Definition Classes
    HasOutputAnnotationCol
  39. def getParam(paramName: String): Param[Any]
    Definition Classes
    Params
  40. val groupBrokenParagraphs: Param[Boolean]
    Definition Classes
    HasTextReaderProperties
  41. final def hasDefault[T](param: Param[T]): Boolean
    Definition Classes
    Params
  42. def hasParam(paramName: String): Boolean
    Definition Classes
    Params
  43. def hashCode(): Int
    Definition Classes
    AnyRef → Any
    Annotations
    @native()
  44. val headers: Param[Map[String, String]]
    Definition Classes
    HasHTMLReaderProperties
  45. val includePageBreaks: Param[Boolean]
    Definition Classes
    HasReaderProperties
  46. val includeSlideNotes: Param[Boolean]
    Definition Classes
    HasPowerPointProperties
  47. val includeTitleTag: Param[Boolean]
    Definition Classes
    HasHTMLReaderProperties
  48. val inferTableStructure: Param[Boolean]
    Definition Classes
    HasReaderProperties
  49. def initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean
    Attributes
    protected
    Definition Classes
    Logging
  50. def initializeLogIfNecessary(isInterpreter: Boolean): Unit
    Attributes
    protected
    Definition Classes
    Logging
  51. final def isDefined(param: Param[_]): Boolean
    Definition Classes
    Params
  52. final def isInstanceOf[T0]: Boolean
    Definition Classes
    Any
  53. final def isSet(param: Param[_]): Boolean
    Definition Classes
    Params
  54. def isTraceEnabled(): Boolean
    Attributes
    protected
    Definition Classes
    Logging
  55. def log: Logger
    Attributes
    protected
    Definition Classes
    Logging
  56. def logDebug(msg: ⇒ String, throwable: Throwable): Unit
    Attributes
    protected
    Definition Classes
    Logging
  57. def logDebug(msg: ⇒ String): Unit
    Attributes
    protected
    Definition Classes
    Logging
  58. def logError(msg: ⇒ String, throwable: Throwable): Unit
    Attributes
    protected
    Definition Classes
    Logging
  59. def logError(msg: ⇒ String): Unit
    Attributes
    protected
    Definition Classes
    Logging
  60. def logInfo(msg: ⇒ String, throwable: Throwable): Unit
    Attributes
    protected
    Definition Classes
    Logging
  61. def logInfo(msg: ⇒ String): Unit
    Attributes
    protected
    Definition Classes
    Logging
  62. def logName: String
    Attributes
    protected
    Definition Classes
    Logging
  63. def logTrace(msg: ⇒ String, throwable: Throwable): Unit
    Attributes
    protected
    Definition Classes
    Logging
  64. def logTrace(msg: ⇒ String): Unit
    Attributes
    protected
    Definition Classes
    Logging
  65. def logWarning(msg: ⇒ String, throwable: Throwable): Unit
    Attributes
    protected
    Definition Classes
    Logging
  66. def logWarning(msg: ⇒ String): Unit
    Attributes
    protected
    Definition Classes
    Logging
  67. val maxLineCount: Param[Int]
    Definition Classes
    HasTextReaderProperties
  68. final def ne(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  69. final def notify(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  70. final def notifyAll(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native()
  71. def onWrite(path: String, spark: SparkSession): Unit
    Attributes
    protected
    Definition Classes
    ParamsAndFeaturesWritable
  72. val onlyLeafNodes: Param[Boolean]
    Definition Classes
    HasXmlReaderProperties
  73. val outputAnnotatorType: AnnotatorType
    Definition Classes
    Reader2Doc → HasOutputAnnotatorType
  74. final val outputCol: Param[String]
    Attributes
    protected
    Definition Classes
    HasOutputAnnotationCol
  75. val paragraphSplit: Param[String]
    Definition Classes
    HasTextReaderProperties
  76. lazy val params: Array[Param[_]]
    Definition Classes
    Params
  77. def save(path: String): Unit
    Definition Classes
    MLWritable
    Annotations
    @Since( "1.6.0" ) @throws( ... )
  78. def set[T](feature: StructFeature[T], value: T): Reader2Doc.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  79. def set[K, V](feature: MapFeature[K, V], value: Map[K, V]): Reader2Doc.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  80. def set[T](feature: SetFeature[T], value: Set[T]): Reader2Doc.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  81. def set[T](feature: ArrayFeature[T], value: Array[T]): Reader2Doc.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  82. final def set(paramPair: ParamPair[_]): Reader2Doc.this.type
    Attributes
    protected
    Definition Classes
    Params
  83. final def set(param: String, value: Any): Reader2Doc.this.type
    Attributes
    protected
    Definition Classes
    Params
  84. final def set[T](param: Param[T], value: T): Reader2Doc.this.type
    Definition Classes
    Params
  85. def setAddAttachmentContent(value: Boolean): Reader2Doc.this.type
    Definition Classes
    HasEmailReaderProperties
  86. def setAppendCells(value: Boolean): Reader2Doc.this.type
    Definition Classes
    HasExcelReaderProperties
  87. def setCellSeparator(value: String): Reader2Doc.this.type
    Definition Classes
    HasExcelReaderProperties
  88. def setContentPath(value: String): Reader2Doc.this.type
    Definition Classes
    HasReaderProperties
  89. def setContentType(value: String): Reader2Doc.this.type
    Definition Classes
    HasReaderProperties
  90. def setDefault[T](feature: StructFeature[T], value: () ⇒ T): Reader2Doc.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  91. def setDefault[K, V](feature: MapFeature[K, V], value: () ⇒ Map[K, V]): Reader2Doc.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  92. def setDefault[T](feature: SetFeature[T], value: () ⇒ Set[T]): Reader2Doc.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  93. def setDefault[T](feature: ArrayFeature[T], value: () ⇒ Array[T]): Reader2Doc.this.type
    Attributes
    protected
    Definition Classes
    HasFeatures
  94. final def setDefault(paramPairs: ParamPair[_]*): Reader2Doc.this.type
    Attributes
    protected
    Definition Classes
    Params
  95. final def setDefault[T](param: Param[T], value: T): Reader2Doc.this.type
    Attributes
    protected[org.apache.spark.ml]
    Definition Classes
    Params
  96. def setExplodeDocs(value: Boolean): Reader2Doc.this.type
  97. def setFlattenOutput(value: Boolean): Reader2Doc.this.type
  98. def setGroupBrokenParagraphs(value: Boolean): Reader2Doc.this.type
    Definition Classes
    HasTextReaderProperties
  99. def setHeaders(value: Map[String, String]): Reader2Doc.this.type
    Definition Classes
    HasHTMLReaderProperties
  100. def setHeadersPython(headers: Map[String, String]): Reader2Doc.this.type
    Definition Classes
    HasHTMLReaderProperties
  101. def setIncludePageBreaks(value: Boolean): Reader2Doc.this.type
    Definition Classes
    HasReaderProperties
  102. def setIncludeSlideNotes(value: Boolean): Reader2Doc.this.type
    Definition Classes
    HasPowerPointProperties
  103. def setIncludeTitleTag(value: Boolean): Reader2Doc.this.type
    Definition Classes
    HasHTMLReaderProperties
  104. def setInferTableStructure(value: Boolean): Reader2Doc.this.type
    Definition Classes
    HasReaderProperties
  105. def setMaxLineCount(value: Int): Reader2Doc.this.type
    Definition Classes
    HasTextReaderProperties
  106. def setOnlyLeafNodes(value: Boolean): Reader2Doc.this.type
    Definition Classes
    HasXmlReaderProperties
  107. final def setOutputCol(value: String): Reader2Doc.this.type

    Overrides annotation column name when transforming

    Overrides annotation column name when transforming

    Definition Classes
    HasOutputAnnotationCol
  108. def setParagraphSplit(value: String): Reader2Doc.this.type
    Definition Classes
    HasTextReaderProperties
  109. def setShortLineWordThreshold(value: Int): Reader2Doc.this.type
    Definition Classes
    HasTextReaderProperties
  110. def setStoreContent(value: Boolean): Reader2Doc.this.type
    Definition Classes
    HasReaderProperties
  111. def setThreshold(value: Double): Reader2Doc.this.type
    Definition Classes
    HasTextReaderProperties
  112. def setTimeout(value: Int): Reader2Doc.this.type
    Definition Classes
    HasHTMLReaderProperties
  113. def setTitleFontSize(value: Int): Reader2Doc.this.type
    Definition Classes
    HasReaderProperties
  114. def setTitleLengthSize(value: Int): Reader2Doc.this.type
    Definition Classes
    HasTextReaderProperties
  115. def setTitleThreshold(value: Float): Reader2Doc.this.type
  116. def setXmlKeepTags(value: Boolean): Reader2Doc.this.type
    Definition Classes
    HasXmlReaderProperties
  117. val shortLineWordThreshold: Param[Int]
    Definition Classes
    HasTextReaderProperties
  118. val storeContent: Param[Boolean]
    Definition Classes
    HasReaderProperties
  119. final def synchronized[T0](arg0: ⇒ T0): T0
    Definition Classes
    AnyRef
  120. val threshold: Param[Double]
    Definition Classes
    HasTextReaderProperties
  121. val timeout: Param[Int]
    Definition Classes
    HasHTMLReaderProperties
  122. val titleFontSize: Param[Int]
    Definition Classes
    HasReaderProperties
  123. val titleLengthSize: Param[Int]
    Definition Classes
    HasTextReaderProperties
  124. val titleThreshold: Param[Float]
  125. def toString(): String
    Definition Classes
    Identifiable → AnyRef → Any
  126. def transform(dataset: Dataset[_]): DataFrame
    Definition Classes
    Reader2Doc → Transformer
  127. def transform(dataset: Dataset[_], paramMap: ParamMap): DataFrame
    Definition Classes
    Transformer
    Annotations
    @Since( "2.0.0" )
  128. def transform(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): DataFrame
    Definition Classes
    Transformer
    Annotations
    @Since( "2.0.0" ) @varargs()
  129. def transformSchema(schema: StructType): StructType
    Definition Classes
    Reader2Doc → PipelineStage
  130. def transformSchema(schema: StructType, logging: Boolean): StructType
    Attributes
    protected
    Definition Classes
    PipelineStage
    Annotations
    @DeveloperApi()
  131. val uid: String
    Definition Classes
    Reader2Doc → Identifiable
  132. final def wait(): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  133. final def wait(arg0: Long, arg1: Int): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  134. final def wait(arg0: Long): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()
  135. def write: MLWriter
    Definition Classes
    ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable
  136. val xmlKeepTags: Param[Boolean]
    Definition Classes
    HasXmlReaderProperties

Inherited from HasXmlReaderProperties

Inherited from HasTextReaderProperties

Inherited from HasPowerPointProperties

Inherited from HasHTMLReaderProperties

Inherited from HasExcelReaderProperties

Inherited from HasEmailReaderProperties

Inherited from HasReaderProperties

Inherited from ParamsAndFeaturesWritable

Inherited from HasFeatures

Inherited from HasOutputAnnotationCol

Inherited from HasOutputAnnotatorType

Inherited from DefaultParamsWritable

Inherited from MLWritable

Inherited from Transformer

Inherited from PipelineStage

Inherited from Logging

Inherited from Params

Inherited from Serializable

Inherited from Serializable

Inherited from Identifiable

Inherited from AnyRef

Inherited from Any

Ungrouped