# Source code for sparknlp.reader.reader2table
# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pyspark import keyword_only
from pyspark.ml.param import TypeConverters, Params, Param
from sparknlp.common import AnnotatorType
from sparknlp.internal import AnnotatorTransformer
from sparknlp.partition.partition_properties import *
class Reader2Table(
    AnnotatorTransformer,
    HasEmailReaderProperties,
    HasExcelReaderProperties,
    HasHTMLReaderProperties,
    HasPowerPointProperties,
    HasTextReaderProperties
):
    """Python-side wrapper for ``com.johnsnowlabs.reader.Reader2Table``.

    The parameters declared directly on this class select the files to read
    (``contentPath``, ``contentType``), name the output column
    (``outputCol``) and tune the shape of the output (``explodeDocs``,
    ``flattenOutput``, ``titleThreshold``). Further options are inherited
    from the ``Has*Properties`` mixins listed in the class bases.

    The output annotator type is ``AnnotatorType.DOCUMENT`` and ``outputCol``
    defaults to ``"document"``.
    """

    outputAnnotatorType = AnnotatorType.DOCUMENT

    # Path to the files to read.
    contentPath = Param(
        Params._dummy(),
        "contentPath",
        "contentPath path to files to read",
        typeConverter=TypeConverters.toString
    )

    # Name of the column that receives the output.
    outputCol = Param(
        Params._dummy(),
        "outputCol",
        "output column name",
        typeConverter=TypeConverters.toString
    )

    # Content type of the input, expressed as a MIME type.
    contentType = Param(
        Params._dummy(),
        "contentType",
        "Set the content type to load following MIME specification",
        typeConverter=TypeConverters.toString
    )

    # Whether each document is exploded into its own row.
    explodeDocs = Param(
        Params._dummy(),
        "explodeDocs",
        "whether to explode the documents into separate rows",
        typeConverter=TypeConverters.toBoolean
    )

    # Whether the output is flattened to plain text with minimal metadata.
    flattenOutput = Param(
        Params._dummy(),
        "flattenOutput",
        "If true, output is flattened to plain text with minimal metadata",
        typeConverter=TypeConverters.toBoolean
    )

    # Minimum font size used for title detection in PDF documents.
    titleThreshold = Param(
        Params._dummy(),
        "titleThreshold",
        "Minimum font size threshold for title detection in PDF docs",
        typeConverter=TypeConverters.toFloat
    )

    # NOTE(review): decorated with keyword_only but declares no keyword
    # parameters, so passing any keyword presumably raises TypeError -- confirm
    # whether constructor keywords are meant to be supported.
    @keyword_only
    def __init__(self):
        """Bind the wrapper to its JVM class and default ``outputCol`` to ``"document"``."""
        super(Reader2Table, self).__init__(classname="com.johnsnowlabs.reader.Reader2Table")
        self._setDefault(outputCol="document")

    # NOTE(review): same remark as on __init__ -- no keyword parameters are
    # declared here, so this presumably only works when called without
    # arguments; confirm the intended signature.
    @keyword_only
    def setParams(self):
        """Apply the keyword arguments captured by ``keyword_only`` via ``_set``."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setContentPath(self, value):
        """Sets content path.

        Parameters
        ----------
        value : str
            contentPath path to files to read
        """
        return self._set(contentPath=value)

    def setContentType(self, value):
        """Sets the content type to load following MIME specification.

        Parameters
        ----------
        value : str
            content type to load following MIME specification
        """
        return self._set(contentType=value)

    def setExplodeDocs(self, value):
        """Sets whether to explode the documents into separate rows.

        Parameters
        ----------
        value : bool
            Whether to explode the documents into separate rows
        """
        return self._set(explodeDocs=value)

    def setOutputCol(self, value):
        """Sets output column name.

        Parameters
        ----------
        value : str
            Name of the Output Column
        """
        return self._set(outputCol=value)

    def setFlattenOutput(self, value):
        """Sets whether to flatten the output to plain text with minimal metadata.

        Parameters
        ----------
        value : bool
            If true, output is flattened to plain text with minimal metadata
        """
        return self._set(flattenOutput=value)

    def setTitleThreshold(self, value):
        """Sets the minimum font size threshold for title detection in PDF documents.

        Parameters
        ----------
        value : float
            Minimum font size threshold for title detection in PDF docs
        """
        return self._set(titleThreshold=value)