# Source code for sparknlp.annotator.cleaners.extractor
# Copyright 2017-2025 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains classes for Extractor."""
from sparknlp.common import *
class Extractor(AnnotatorModel):
    """Annotator that extracts pattern-based chunks from documents.

    Consumes ``DOCUMENT`` annotations and produces ``CHUNK`` annotations.
    What gets extracted is selected through ``extractorMode``; the
    remaining parameters override the pattern used by the individual
    modes. The extraction itself is performed by the JVM class
    ``com.johnsnowlabs.nlp.annotators.cleaners.Extractor``; this Python
    class only declares the parameters and forwards their values.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    emailDateTimeTzPattern
        Date-time pattern for email timestamps, including time zone
        formatting.
    emailAddress
        Pattern for email addresses.
    ipAddressPattern
        Pattern for IP addresses.
    ipAddressNamePattern
        Pattern for IP addresses with names.
    mapiIdPattern
        Pattern for MAPI IDs.
    usPhoneNumbersPattern
        Pattern for US phone numbers.
    imageUrlPattern
        Pattern for image URLs.
    textPattern
        Pattern for text after and before.
    extractorMode
        One of ``email_date``, ``email_address``, ``ip_address``,
        ``ip_address_name``, ``mapi_id``, ``us_phone_numbers``,
        ``image_urls``, ``bullets``, ``text_after``, ``text_before``.
    index
        Index of the pattern to extract in text after or before.
    """

    name = "Extractor"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.CHUNK

    # NOTE: the second argument of every Param is the parameter name that is
    # transferred to the JVM object, so these strings must not be renamed.
    emailDateTimeTzPattern = Param(
        Params._dummy(),
        "emailDateTimeTzPattern",
        "Specifies the date-time pattern for email timestamps, including time zone formatting.",
        typeConverter=TypeConverters.toString
    )

    emailAddress = Param(
        Params._dummy(),
        "emailAddress",
        "Specifies the pattern for email addresses.",
        typeConverter=TypeConverters.toString
    )

    ipAddressPattern = Param(
        Params._dummy(),
        "ipAddressPattern",
        "Specifies the pattern for IP addresses.",
        typeConverter=TypeConverters.toString
    )

    ipAddressNamePattern = Param(
        Params._dummy(),
        "ipAddressNamePattern",
        "Specifies the pattern for IP addresses with names.",
        typeConverter=TypeConverters.toString
    )

    mapiIdPattern = Param(
        Params._dummy(),
        "mapiIdPattern",
        "Specifies the pattern for MAPI IDs.",
        typeConverter=TypeConverters.toString
    )

    usPhoneNumbersPattern = Param(
        Params._dummy(),
        "usPhoneNumbersPattern",
        "Specifies the pattern for US phone numbers.",
        typeConverter=TypeConverters.toString
    )

    imageUrlPattern = Param(
        Params._dummy(),
        "imageUrlPattern",
        "Specifies the pattern for image URLs.",
        typeConverter=TypeConverters.toString
    )

    textPattern = Param(
        Params._dummy(),
        "textPattern",
        "Specifies the pattern for text after and before.",
        typeConverter=TypeConverters.toString
    )

    extractorMode = Param(
        Params._dummy(),
        "extractorMode",
        "possible values: " +
        "email_date, email_address, ip_address, ip_address_name, mapi_id, us_phone_numbers, image_urls, bullets, text_after, text_before",
        typeConverter=TypeConverters.toString
    )

    index = Param(
        Params._dummy(),
        "index",
        "Specifies the index of the pattern to extract in text after or before",
        typeConverter=TypeConverters.toInt
    )

    def setEmailDateTimeTzPattern(self, value):
        """Sets the date-time pattern for email timestamps, including time zone formatting.

        Parameters
        ----------
        value : str
            Specifies the date-time pattern for email timestamps, including time zone formatting.
        """
        return self._set(emailDateTimeTzPattern=value)

    def setEmailAddress(self, value):
        """Sets the pattern for email addresses.

        Parameters
        ----------
        value : str
            Specifies the pattern for email addresses.
        """
        return self._set(emailAddress=value)

    def setIpAddressPattern(self, value):
        """Sets the pattern for IP addresses.

        Parameters
        ----------
        value : str
            Specifies the pattern for IP addresses.
        """
        return self._set(ipAddressPattern=value)

    def setIpAddressNamePattern(self, value):
        """Sets the pattern for IP addresses with names.

        Parameters
        ----------
        value : str
            Specifies the pattern for IP addresses with names.
        """
        return self._set(ipAddressNamePattern=value)

    def setMapiIdPattern(self, value):
        """Sets the pattern for MAPI IDs.

        Parameters
        ----------
        value : str
            Specifies the pattern for MAPI IDs.
        """
        return self._set(mapiIdPattern=value)

    def setUsPhoneNumbersPattern(self, value):
        """Sets the pattern for US phone numbers.

        Parameters
        ----------
        value : str
            Specifies the pattern for US phone numbers.
        """
        return self._set(usPhoneNumbersPattern=value)

    def setImageUrlPattern(self, value):
        """Sets the pattern for image URLs.

        Parameters
        ----------
        value : str
            Specifies the pattern for image URLs.
        """
        return self._set(imageUrlPattern=value)

    def setTextPattern(self, value):
        """Sets the pattern for text after and before.

        Parameters
        ----------
        value : str
            Specifies the pattern for text after and before.
        """
        return self._set(textPattern=value)

    def setExtractorMode(self, value):
        """Sets the extraction mode.

        Parameters
        ----------
        value : str
            Possible values: email_date, email_address, ip_address,
            ip_address_name, mapi_id, us_phone_numbers, image_urls, bullets,
            text_after, text_before.
        """
        # No validation here: the value is forwarded unchanged, exactly as
        # the other setters do.
        return self._set(extractorMode=value)

    def setIndex(self, value):
        """Sets the index of the pattern to extract in text after or before.

        Parameters
        ----------
        value : int
            Specifies the index of the pattern to extract in text after or before.
        """
        return self._set(index=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cleaners.Extractor", java_model=None):
        """Creates the annotator bound to its JVM implementation.

        Parameters
        ----------
        classname : str
            Fully qualified name of the JVM class backing this annotator.
        java_model : object, optional
            Existing JVM model instance to wrap, by default None.
        """
        super(Extractor, self).__init__(
            classname=classname,
            java_model=java_model
        )