Source code for sparknlp.reader.pdf_to_text

#  Copyright 2017-2025 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
from pyspark import keyword_only
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from pyspark.ml.wrapper import JavaTransformer

from sparknlp.reader.enums import TextStripperType


class PdfToText(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
    """Extract text from PDF documents.

    The text is produced either as a single string per document or as one
    string per page. Input is a column with the binary content of PDF files.
    Output is a column with the extracted text, with options to include page
    numbers or to split the document per page.

    This class is a thin Python wrapper: every setter only stores a param
    value, and the work is delegated to the JVM class
    ``com.johnsnowlabs.reader.PdfToText``.

    Parameters
    ----------
    inputCol : str, optional
        Input column name (inherited from ``HasInputCol``).
    outputCol : str, optional
        Output column name (inherited from ``HasOutputCol``).
    pageNumCol : str, optional
        Page number output column name.
    partitionNum : int, optional
        Number of partitions (default is 0).
    storeSplittedPdf : bool, optional
        Whether to store content of split PDFs (default is False).
    splitPage : bool, optional
        Enable/disable splitting per page (default is True).
    onlyPageNum : bool, optional
        Whether to extract only page numbers (default is False).
    textStripper : str or TextStripperType, optional
        Defines layout and formatting type.
    sort : bool, optional
        Enable/disable sorting content per page (default is False).
    extractCoordinates : bool, optional
        Whether to extract the coordinates of the text.
    normalizeLigatures : bool, optional
        Whether to convert ligature characters into their corresponding
        individual characters.

    Notes
    -----
    The defaults quoted above are not set by this Python wrapper (it never
    calls ``_setDefault``); presumably they are applied by the JVM
    implementation -- confirm against the Scala source before relying on them.

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.reader import *
    >>> from pyspark.ml import Pipeline
    >>> pdf_path = "Documents/files/pdf"
    >>> data_frame = spark.read.format("binaryFile").load(pdf_path)
    >>> pdf_to_text = PdfToText().setStoreSplittedPdf(True)
    >>> pipeline = Pipeline(stages=[pdf_to_text])
    >>> pipeline_model = pipeline.fit(data_frame)
    >>> pdf_df = pipeline_model.transform(data_frame)
    >>> pdf_df.show()
    +--------------------+--------------------+
    |                path|    modificationTime|
    +--------------------+--------------------+
    |file:/Users/paula...|2025-05-15 11:33:...|
    |file:/Users/paula...|2025-05-15 11:33:...|
    +--------------------+--------------------+
    >>> pdf_df.printSchema()
    root
     |-- path: string (nullable = true)
     |-- modificationTime: timestamp (nullable = true)
     |-- length: long (nullable = true)
     |-- text: string (nullable = true)
     |-- height_dimension: integer (nullable = true)
     |-- width_dimension: integer (nullable = true)
     |-- content: binary (nullable = true)
     |-- exception: string (nullable = true)
     |-- pagenum: integer (nullable = true)
    """

    # Name of the output column that receives the page number.
    pageNumCol = Param(
        Params._dummy(),
        "pageNumCol",
        "Page number output column name.",
        typeConverter=TypeConverters.toString,
    )

    # Number of partitions.
    partitionNum = Param(
        Params._dummy(),
        "partitionNum",
        "Number of partitions.",
        typeConverter=TypeConverters.toInt,
    )

    # Whether the content of the split PDFs is stored.
    storeSplittedPdf = Param(
        Params._dummy(),
        "storeSplittedPdf",
        "Force to store splitted pdf.",
        typeConverter=TypeConverters.toBoolean,
    )

    # Whether the document is split per page.
    splitPage = Param(
        Params._dummy(),
        "splitPage",
        "Param for enable/disable splitting document per page",
        typeConverter=TypeConverters.toBoolean,
    )

    # Stored as a plain string; setTextStripper validates it against
    # TextStripperType before it is set.
    textStripper = Param(
        Params._dummy(),
        "textStripper",
        "Text stripper type used for output layout and formatting",
        typeConverter=TypeConverters.toString,
    )

    # Whether lines are sorted.
    sort = Param(
        Params._dummy(),
        "sort",
        "Param for enable/disable sort lines",
        typeConverter=TypeConverters.toBoolean,
    )

    # Whether only the number of pages is extracted.
    onlyPageNum = Param(
        Params._dummy(),
        "onlyPageNum",
        "Force to extract only number of pages",
        typeConverter=TypeConverters.toBoolean,
    )

    # Whether the coordinates of the text are extracted.
    extractCoordinates = Param(
        Params._dummy(),
        "extractCoordinates",
        "Force extract coordinates of text.",
        typeConverter=TypeConverters.toBoolean,
    )

    # Whether ligature characters are converted to their component characters.
    normalizeLigatures = Param(
        Params._dummy(),
        "normalizeLigatures",
        "Whether to convert ligature chars such as 'fl' into its corresponding chars (e.g., {'f', 'l'}).",
        typeConverter=TypeConverters.toBoolean,
    )

    @keyword_only
    def __init__(self):
        """Create the wrapper and its backing ``com.johnsnowlabs.reader.PdfToText`` Java object.

        Takes no arguments; params are configured afterwards through the
        ``set*`` methods.
        """
        super(PdfToText, self).__init__()
        self._java_obj = self._new_java_obj("com.johnsnowlabs.reader.PdfToText", self.uid)

    def setInputCol(self, value):
        """Sets the value of :py:attr:`inputCol`.

        Parameters
        ----------
        value : str
            Input column name.

        Returns
        -------
        The result of ``self._set`` (allows chaining of setters).
        """
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        """Sets the value of :py:attr:`outputCol`.

        Parameters
        ----------
        value : str
            Output column name.

        Returns
        -------
        The result of ``self._set`` (allows chaining of setters).
        """
        return self._set(outputCol=value)

    def setPageNumCol(self, value):
        """Sets the value of :py:attr:`pageNumCol`.

        Parameters
        ----------
        value : str
            Page number output column name.

        Returns
        -------
        The result of ``self._set`` (allows chaining of setters).
        """
        return self._set(pageNumCol=value)

    def setPartitionNum(self, value):
        """Sets the value of :py:attr:`partitionNum`.

        Parameters
        ----------
        value : int
            Number of partitions.

        Returns
        -------
        The result of ``self._set`` (allows chaining of setters).
        """
        return self._set(partitionNum=value)

    def setStoreSplittedPdf(self, value):
        """Sets the value of :py:attr:`storeSplittedPdf`.

        Parameters
        ----------
        value : bool
            Whether to store the content of the split PDFs.

        Returns
        -------
        The result of ``self._set`` (allows chaining of setters).
        """
        return self._set(storeSplittedPdf=value)

    def setSplitPage(self, value):
        """Sets the value of :py:attr:`splitPage`.

        Parameters
        ----------
        value : bool
            Whether to split the document per page.

        Returns
        -------
        The result of ``self._set`` (allows chaining of setters).
        """
        return self._set(splitPage=value)

    def setOnlyPageNum(self, value):
        """Sets the value of :py:attr:`onlyPageNum`.

        Parameters
        ----------
        value : bool
            Whether to extract only the number of pages.

        Returns
        -------
        The result of ``self._set`` (allows chaining of setters).
        """
        return self._set(onlyPageNum=value)

    def setTextStripper(self, value):
        """Sets the value of :py:attr:`textStripper`.

        Parameters
        ----------
        value : TextStripperType or str
            Either a ``TextStripperType`` member or the string value of one
            of its members.

        Returns
        -------
        The result of ``self._set`` (allows chaining of setters).

        Raises
        ------
        ValueError
            If ``value`` is neither a ``TextStripperType`` member nor equal
            to the value of one.
        """
        # Normalize enum members to their underlying value so both accepted
        # input forms go through the same validation.
        if isinstance(value, TextStripperType):
            value = value.value

        valid_values = [member.value for member in TextStripperType]
        if value not in valid_values:
            # Report the rejected value and the accepted ones: a plain string
            # is a legal input type, so naming only the type would mislead.
            raise ValueError(
                f"Param textStripper must be a 'TextStripperType' enum or one of "
                f"{valid_values} but got {value!r}."
            )
        return self._set(textStripper=str(value))

    def setSort(self, value):
        """Sets the value of :py:attr:`sort`.

        Parameters
        ----------
        value : bool
            Whether to sort lines.

        Returns
        -------
        The result of ``self._set`` (allows chaining of setters).
        """
        return self._set(sort=value)

    def setExtractCoordinates(self, value):
        """Sets the value of :py:attr:`extractCoordinates`.

        Parameters
        ----------
        value : bool
            Whether to extract the coordinates of the text.

        Returns
        -------
        The result of ``self._set`` (allows chaining of setters).
        """
        return self._set(extractCoordinates=value)

    def setNormalizeLigatures(self, value):
        """Sets the value of :py:attr:`normalizeLigatures`.

        Parameters
        ----------
        value : bool
            Whether to convert ligature characters into their corresponding
            individual characters.

        Returns
        -------
        The result of ``self._set`` (allows chaining of setters).
        """
        return self._set(normalizeLigatures=value)