Source code for sparknlp.reader.pdf_to_text
from pyspark import keyword_only
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from pyspark.ml.wrapper import JavaTransformer
from sparknlp.reader.enums import TextStripperType
[docs]class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
JavaMLReadable, JavaMLWritable):
"""
Extract text from Pdf document to single string or to several strings per each page.
Input is a column with binary representation of PDF document.
As output generate column with text and page number.
Explode each page as separate row if split to page enabled.
"""
pageNumCol = Param(Params._dummy(), "pageNumCol",
"Page number output column name.",
typeConverter=TypeConverters.toString)
partitionNum = Param(Params._dummy(), "partitionNum",
"Number of partitions.",
typeConverter=TypeConverters.toInt)
storeSplittedPdf = Param(Params._dummy(), "storeSplittedPdf",
"Force to store splitted pdf.",
typeConverter=TypeConverters.toBoolean)
splitPage = Param(Params._dummy(), "splitPage",
"Param for enable/disable splitting document per page",
typeConverter=TypeConverters.toBoolean)
textStripper = Param(Params._dummy(), "textStripper",
"Text stripper type used for output layout and formatting",
typeConverter=TypeConverters.toString)
sort = Param(Params._dummy(), "sort",
"Param for enable/disable sort lines",
typeConverter=TypeConverters.toBoolean)
onlyPageNum = Param(Params._dummy(), "onlyPageNum",
"Force to extract only number of pages",
typeConverter=TypeConverters.toBoolean)
@keyword_only
def __init__(self):
"""
__init__(self)
"""
super(PdfToText, self).__init__()
self._java_obj = self._new_java_obj("com.johnsnowlabs.reader.PdfToText", self.uid)
[docs] def setInputCol(self, value):
"""
Sets the value of :py:attr:`inputCol`.
"""
return self._set(inputCol=value)
[docs] def setOutputCol(self, value):
"""
Sets the value of :py:attr:`outputCol`.
"""
return self._set(outputCol=value)
[docs] def setPageNumCol(self, value):
"""
Sets the value of :py:attr:`pageNumCol`.
"""
return self._set(pageNumCol=value)
[docs] def setPartitionNum(self, value):
"""
Sets the value of :py:attr:`partitionNum`.
"""
return self._set(partitionNum=value)
[docs] def setStoreSplittedPdf(self, value):
"""
Sets the value of :py:attr:`storeSplittedPdf`.
"""
return self._set(storeSplittedPdf=value)
[docs] def setSplitPage(self, value):
"""
Sets the value of :py:attr:`splitPage`.
"""
return self._set(splitPage=value)
[docs] def setOnlyPageNum(self, value):
"""
Sets the value of :py:attr:`onlyPageNum`.
"""
return self._set(onlyPageNum=value)
[docs] def setTextStripper(self, value):
"""
Sets the value of :py:attr:`textStripper`.
"""
if isinstance(value, TextStripperType):
value = value.value
if value not in [i.value for i in TextStripperType]:
type_value = type(value)
raise ValueError(f"Param textStripper must be a 'TextStripperType' enum but got {type_value}.")
return self._set(textStripper=str(value))
[docs] def setSort(self, value):
"""
Sets the value of :py:attr:`sort`.
"""
return self._set(sort=value)