# Source code for sparknlp.annotator.cleaners.cleaner

#  Copyright 2017-2025 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for Cleaner."""
from sparknlp.annotator import MarianTransformer
from sparknlp.common import *

# NOTE(review): the base class is MarianTransformer; the reason for that choice
# is not visible in this file -- confirm it is the intended parent.
class Cleaner(MarianTransformer):
    """Python-side declaration of the ``Cleaner`` annotator.

    This class only declares the annotator's parameters and their setters and
    forwards the Java class name
    ``com.johnsnowlabs.nlp.annotators.cleaners.Cleaner`` (or a caller-supplied
    override) to the parent initializer. No cleaning logic is implemented in
    this class.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``TOKEN``              ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    encoding
        The encoding to be used for decoding the byte string (default is
        utf-8).
    cleanPrefixPattern
        The pattern for the prefix. Can be a simple string or a regex pattern.
    cleanPostfixPattern
        The pattern for the postfix. Can be a simple string or a regex pattern.
    cleanerMode
        One of: clean, bytes_string_to_string, clean_non_ascii_chars,
        clean_ordered_bullets, clean_postfix, clean_prefix,
        remove_punctuation, replace_unicode_quotes.
    extraWhitespace
        Whether to remove extra whitespace.
    dashes
        Whether to handle dashes in text.
    bullets
        Whether to handle bullets in text.
    trailingPunctuation
        Whether to remove trailing punctuation from text.
    lowercase
        Whether to convert text to lowercase.
    ignoreCase
        If true, ignores case in the pattern.
    strip
        If true, removes leading or trailing whitespace from the cleaned
        string.

    Notes
    -----
    Every setter returns the result of ``self._set(...)``; presumably that is
    this instance, so that setter calls can be chained -- confirm against the
    parent class.
    """

    # Name of this annotator.
    name = "Cleaner"

    # Annotation types this annotator declares as its input.
    inputAnnotatorTypes = [AnnotatorType.TOKEN]

    # Annotation type this annotator declares as its output.
    outputAnnotatorType = AnnotatorType.CHUNK

    encoding = Param(
        Params._dummy(),
        "encoding",
        "The encoding to be used for decoding the byte string (default is utf-8)",
        typeConverter=TypeConverters.toString
    )

    cleanPrefixPattern = Param(
        Params._dummy(),
        "cleanPrefixPattern",
        "The pattern for the prefix. Can be a simple string or a regex pattern.",
        typeConverter=TypeConverters.toString
    )

    cleanPostfixPattern = Param(
        Params._dummy(),
        "cleanPostfixPattern",
        "The pattern for the postfix. Can be a simple string or a regex pattern.",
        typeConverter=TypeConverters.toString
    )

    cleanerMode = Param(
        Params._dummy(),
        "cleanerMode",
        "possible values: " +
        "clean, bytes_string_to_string, clean_non_ascii_chars, clean_ordered_bullets, clean_postfix, clean_prefix, remove_punctuation, replace_unicode_quotes",
        typeConverter=TypeConverters.toString
    )

    extraWhitespace = Param(
        Params._dummy(),
        "extraWhitespace",
        "Whether to remove extra whitespace.",
        typeConverter=TypeConverters.toBoolean
    )

    dashes = Param(
        Params._dummy(),
        "dashes",
        "Whether to handle dashes in text.",
        typeConverter=TypeConverters.toBoolean
    )

    bullets = Param(
        Params._dummy(),
        "bullets",
        "Whether to handle bullets in text.",
        typeConverter=TypeConverters.toBoolean
    )

    trailingPunctuation = Param(
        Params._dummy(),
        "trailingPunctuation",
        "Whether to remove trailing punctuation from text.",
        typeConverter=TypeConverters.toBoolean
    )

    lowercase = Param(
        Params._dummy(),
        "lowercase",
        "Whether to convert text to lowercase.",
        typeConverter=TypeConverters.toBoolean
    )

    ignoreCase = Param(
        Params._dummy(),
        "ignoreCase",
        "If true, ignores case in the pattern.",
        typeConverter=TypeConverters.toBoolean
    )

    strip = Param(
        Params._dummy(),
        "strip",
        "If true, removes leading or trailing whitespace from the cleaned string.",
        typeConverter=TypeConverters.toBoolean
    )

    def setEncoding(self, value):
        """Sets the encoding to be used for decoding the byte string (default is utf-8).

        Parameters
        ----------
        value : str
            The encoding to be used for decoding the byte string (default is utf-8)

        Returns
        -------
        The value returned by ``self._set(encoding=value)``.
        """
        return self._set(encoding=value)

    def setCleanPrefixPattern(self, value):
        """Sets the pattern for the prefix. Can be a simple string or a regex pattern.

        Parameters
        ----------
        value : str
            The pattern for the prefix. Can be a simple string or a regex pattern.

        Returns
        -------
        The value returned by ``self._set(cleanPrefixPattern=value)``.
        """
        return self._set(cleanPrefixPattern=value)

    def setCleanPostfixPattern(self, value):
        """Sets the pattern for the postfix. Can be a simple string or a regex pattern.

        Parameters
        ----------
        value : str
            The pattern for the postfix. Can be a simple string or a regex pattern.

        Returns
        -------
        The value returned by ``self._set(cleanPostfixPattern=value)``.
        """
        return self._set(cleanPostfixPattern=value)

    def setCleanerMode(self, value):
        """Sets the cleaner mode.

        Possible values:
            clean, bytes_string_to_string, clean_non_ascii_chars, clean_ordered_bullets,
            clean_postfix, clean_prefix, remove_punctuation, replace_unicode_quotes

        The value is passed on as given; this method does not check it against
        the list above.

        Parameters
        ----------
        value : str
            The mode for cleaning operations.

        Returns
        -------
        The value returned by ``self._set(cleanerMode=value)``.
        """
        return self._set(cleanerMode=value)

    def setExtraWhitespace(self, value):
        """Sets whether to remove extra whitespace.

        Parameters
        ----------
        value : bool
            Whether to remove extra whitespace.

        Returns
        -------
        The value returned by ``self._set(extraWhitespace=value)``.
        """
        return self._set(extraWhitespace=value)

    def setDashes(self, value):
        """Sets whether to handle dashes in text.

        Parameters
        ----------
        value : bool
            Whether to handle dashes in text.

        Returns
        -------
        The value returned by ``self._set(dashes=value)``.
        """
        return self._set(dashes=value)

    def setBullets(self, value):
        """Sets whether to handle bullets in text.

        Parameters
        ----------
        value : bool
            Whether to handle bullets in text.

        Returns
        -------
        The value returned by ``self._set(bullets=value)``.
        """
        return self._set(bullets=value)

    def setTrailingPunctuation(self, value):
        """Sets whether to remove trailing punctuation from text.

        Parameters
        ----------
        value : bool
            Whether to remove trailing punctuation from text.

        Returns
        -------
        The value returned by ``self._set(trailingPunctuation=value)``.
        """
        return self._set(trailingPunctuation=value)

    def setLowercase(self, value):
        """Sets whether to convert text to lowercase.

        Parameters
        ----------
        value : bool
            Whether to convert text to lowercase.

        Returns
        -------
        The value returned by ``self._set(lowercase=value)``.
        """
        return self._set(lowercase=value)

    def setIgnoreCase(self, value):
        """Sets whether to ignore case in the pattern.

        Parameters
        ----------
        value : bool
            If true, ignores case in the pattern.

        Returns
        -------
        The value returned by ``self._set(ignoreCase=value)``.
        """
        return self._set(ignoreCase=value)

    def setStrip(self, value):
        """Sets whether to remove leading or trailing whitespace from the cleaned string.

        Parameters
        ----------
        value : bool
            If true, removes leading or trailing whitespace from the cleaned string.

        Returns
        -------
        The value returned by ``self._set(strip=value)``.
        """
        return self._set(strip=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cleaners.Cleaner", java_model=None):
        """Initializes the annotator by delegating to the parent initializer.

        Parameters
        ----------
        classname : str
            Fully qualified name of the Java class; defaults to
            ``com.johnsnowlabs.nlp.annotators.cleaners.Cleaner``.
        java_model : optional
            Forwarded unchanged to the parent initializer; defaults to None.
        """
        super(Cleaner, self).__init__(
            classname=classname,
            java_model=java_model
        )