# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pyspark import keyword_only
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from pyspark.ml.wrapper import JavaTransformer
from sparknlp.reader.enums import TextStripperType
class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
                JavaMLReadable, JavaMLWritable):
    """
    Extract text from PDF documents as either a single string or multiple strings per page.

    Input is a column with binary content of PDF files. Output is a column with extracted text,
    with options to include page numbers or split pages.

    Parameters
    ----------
    pageNumCol : str, optional
        Page number output column name.
    partitionNum : int, optional
        Number of partitions (default is 0).
    storeSplittedPdf : bool, optional
        Whether to store content of split PDFs (default is False).
    splitPage : bool, optional
        Enable/disable splitting per page (default is True).
    onlyPageNum : bool, optional
        Whether to extract only page numbers (default is False).
    textStripper : str or TextStripperType, optional
        Defines layout and formatting type.
    sort : bool, optional
        Enable/disable sorting content per page (default is False).
    normalizeLigatures : bool, optional
        Whether to convert ligature chars such as 'fl' into its corresponding
        chars (e.g., {'f', 'l'}).

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.reader import *
    >>> from pyspark.ml import Pipeline
    >>> pdf_path = "Documents/files/pdf"
    >>> data_frame = spark.read.format("binaryFile").load(pdf_path)
    >>> pdf_to_text = PdfToText().setStoreSplittedPdf(True)
    >>> pipeline = Pipeline(stages=[pdf_to_text])
    >>> pipeline_model = pipeline.fit(data_frame)
    >>> pdf_df = pipeline_model.transform(data_frame)
    >>> pdf_df.show()
    +--------------------+--------------------+
    |                path|    modificationTime|
    +--------------------+--------------------+
    |file:/Users/paula...|2025-05-15 11:33:...|
    |file:/Users/paula...|2025-05-15 11:33:...|
    +--------------------+--------------------+
    >>> pdf_df.printSchema()
    root
     |-- path: string (nullable = true)
     |-- modificationTime: timestamp (nullable = true)
     |-- length: long (nullable = true)
     |-- text: string (nullable = true)
     |-- height_dimension: integer (nullable = true)
     |-- width_dimension: integer (nullable = true)
     |-- content: binary (nullable = true)
     |-- exception: string (nullable = true)
     |-- pagenum: integer (nullable = true)
    """

    # Param descriptors. Each is declared against the shared dummy parent, as
    # is conventional for class-level pyspark Params.
    pageNumCol = Param(Params._dummy(), "pageNumCol",
                       "Page number output column name.",
                       typeConverter=TypeConverters.toString)

    partitionNum = Param(Params._dummy(), "partitionNum",
                         "Number of partitions.",
                         typeConverter=TypeConverters.toInt)

    storeSplittedPdf = Param(Params._dummy(), "storeSplittedPdf",
                             "Force to store splitted pdf.",
                             typeConverter=TypeConverters.toBoolean)

    splitPage = Param(Params._dummy(), "splitPage",
                      "Param for enable/disable splitting document per page",
                      typeConverter=TypeConverters.toBoolean)

    textStripper = Param(Params._dummy(), "textStripper",
                         "Text stripper type used for output layout and formatting",
                         typeConverter=TypeConverters.toString)

    sort = Param(Params._dummy(), "sort",
                 "Param for enable/disable sort lines",
                 typeConverter=TypeConverters.toBoolean)

    onlyPageNum = Param(Params._dummy(), "onlyPageNum",
                        "Force to extract only number of pages",
                        typeConverter=TypeConverters.toBoolean)

    normalizeLigatures = Param(Params._dummy(), "normalizeLigatures",
                               "Whether to convert ligature chars such as 'fl' into its corresponding chars (e.g., {'f', 'l'}).",
                               typeConverter=TypeConverters.toBoolean)

    @keyword_only
    def __init__(self):
        """
        __init__(self)

        Create the transformer and its backing JVM object
        ``com.johnsnowlabs.reader.PdfToText``, registered under this
        instance's ``uid``.
        """
        super(PdfToText, self).__init__()
        self._java_obj = self._new_java_obj("com.johnsnowlabs.reader.PdfToText", self.uid)

    def setInputCol(self, value):
        """
        Sets the value of :py:attr:`inputCol`.

        Parameters
        ----------
        value : str
            Name of the input column.
        """
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.

        Parameters
        ----------
        value : str
            Name of the output column.
        """
        return self._set(outputCol=value)

    def setPageNumCol(self, value):
        """
        Sets the value of :py:attr:`pageNumCol`.

        Parameters
        ----------
        value : str
            Page number output column name.
        """
        return self._set(pageNumCol=value)

    def setPartitionNum(self, value):
        """
        Sets the value of :py:attr:`partitionNum`.

        Parameters
        ----------
        value : int
            Number of partitions.
        """
        return self._set(partitionNum=value)

    def setStoreSplittedPdf(self, value):
        """
        Sets the value of :py:attr:`storeSplittedPdf`.

        Parameters
        ----------
        value : bool
            Whether to store the content of split PDFs.
        """
        return self._set(storeSplittedPdf=value)

    def setSplitPage(self, value):
        """
        Sets the value of :py:attr:`splitPage`.

        Parameters
        ----------
        value : bool
            Enable/disable splitting the document per page.
        """
        return self._set(splitPage=value)

    def setOnlyPageNum(self, value):
        """
        Sets the value of :py:attr:`onlyPageNum`.

        Parameters
        ----------
        value : bool
            Whether to extract only the number of pages.
        """
        return self._set(onlyPageNum=value)

    def setTextStripper(self, value):
        """
        Sets the value of :py:attr:`textStripper`.

        Parameters
        ----------
        value : str or TextStripperType
            Either a ``TextStripperType`` member or the raw value of one.

        Raises
        ------
        ValueError
            If ``value`` is neither a ``TextStripperType`` member nor equal to
            the value of one.
        """
        # Normalise enum members to their raw value so that both spellings
        # go through the same validation below.
        if isinstance(value, TextStripperType):
            value = value.value

        allowed = [member.value for member in TextStripperType]
        if value not in allowed:
            # Report the rejected value itself: a bare type such as
            # "<class 'str'>" is misleading, since valid strings are accepted.
            raise ValueError(
                "Param textStripper must be a 'TextStripperType' enum or one of "
                f"{allowed} but got {value!r} of type {type(value).__name__}."
            )
        return self._set(textStripper=str(value))

    def setSort(self, value):
        """
        Sets the value of :py:attr:`sort`.

        Parameters
        ----------
        value : bool
            Enable/disable sorting lines.
        """
        return self._set(sort=value)

    def setNormalizeLigatures(self, value):
        """
        Sets the value of :py:attr:`normalizeLigatures`.

        Parameters
        ----------
        value : bool
            Whether to convert ligature chars such as 'fl' into its
            corresponding chars (e.g., {'f', 'l'}).
        """
        return self._set(normalizeLigatures=value)