Source code for sparknlp.annotator.token.tokenizer

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the Tokenizer."""


from sparknlp.common import *


class Tokenizer(AnnotatorApproach):
    """Tokenizes raw text in document type columns into ``TokenizedSentence``.

    This class represents a non-fitted tokenizer. Fitting it will cause the
    internal RuleFactory to construct the rules for tokenizing from the input
    configuration.

    Identifies tokens with tokenization open standards. A few rules will help
    customizing it if defaults do not fit user needs.

    For extended examples of usage see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    targetPattern
        Pattern to grab from text as token candidates, by default ``\\S+``
    prefixPattern
        Regex with groups and begins with ``\\A`` to match target prefix, by
        default ``\\A([^\\s\\w\\$\\.]*)``
    suffixPattern
        Regex with groups and ends with ``\\z`` to match target suffix, by
        default ``([^\\s\\w]?)([^\\s\\w]*)\\z``
    infixPatterns
        Regex patterns that match tokens within a single target. Groups
        identify different sub-tokens.
    exceptions
        Words that won't be affected by tokenization rules
    exceptionsPath
        Path to file containing list of exceptions
    caseSensitiveExceptions
        Whether to care for case sensitiveness in exceptions, by default True
    contextChars
        Character list used to separate from token boundaries, by default
        ['.', ',', ';', ':', '!', '?', '*', '-', '(', ')', '"', "'"]
    splitPattern
        Pattern to separate from the inside of tokens. Takes priority over
        splitChars.
    splitChars
        Character list used to separate from the inside of tokens
    minLength
        Set the minimum allowed length for each token, by default 0
    maxLength
        Set the maximum allowed length for each token, by default 99999

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> data = spark.createDataFrame([["I'd like to say we didn't expect that. Jane's boyfriend."]]).toDF("text")
    >>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    >>> tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token").fit(data)
    >>> pipeline = Pipeline().setStages([documentAssembler, tokenizer]).fit(data)
    >>> result = pipeline.transform(data)
    >>> result.selectExpr("token.result").show(truncate=False)
    +-----------------------------------------------------------------------+
    |result                                                                 |
    +-----------------------------------------------------------------------+
    |[I'd, like, to, say, we, didn't, expect, that, ., Jane's, boyfriend, .]|
    +-----------------------------------------------------------------------+
    """

    name = 'Tokenizer'

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.TOKEN

    targetPattern = Param(Params._dummy(),
                          "targetPattern",
                          "pattern to grab from text as token candidates. Defaults \\S+",
                          typeConverter=TypeConverters.toString)

    prefixPattern = Param(Params._dummy(),
                          "prefixPattern",
                          "regex with groups and begins with \\A to match target prefix. Defaults to \\A([^\\s\\w\\$\\.]*)",
                          typeConverter=TypeConverters.toString)

    suffixPattern = Param(Params._dummy(),
                          "suffixPattern",
                          "regex with groups and ends with \\z to match target suffix. Defaults to ([^\\s\\w]?)([^\\s\\w]*)\\z",
                          typeConverter=TypeConverters.toString)

    infixPatterns = Param(Params._dummy(),
                          "infixPatterns",
                          "regex patterns that match tokens within a single target. groups identify different sub-tokens. multiple defaults",
                          typeConverter=TypeConverters.toListString)

    exceptions = Param(Params._dummy(),
                       "exceptions",
                       "Words that won't be affected by tokenization rules",
                       typeConverter=TypeConverters.toListString)

    exceptionsPath = Param(Params._dummy(),
                           "exceptionsPath",
                           "path to file containing list of exceptions",
                           typeConverter=TypeConverters.identity)

    caseSensitiveExceptions = Param(Params._dummy(),
                                    "caseSensitiveExceptions",
                                    "Whether to care for case sensitiveness in exceptions",
                                    typeConverter=TypeConverters.toBoolean)

    contextChars = Param(Params._dummy(),
                         "contextChars",
                         "character list used to separate from token boundaries",
                         typeConverter=TypeConverters.toListString)

    splitPattern = Param(Params._dummy(),
                         "splitPattern",
                         "pattern to separate from the inside of tokens. Takes priority over splitChars",
                         typeConverter=TypeConverters.toString)

    splitChars = Param(Params._dummy(),
                       "splitChars",
                       "character list used to separate from the inside of tokens",
                       typeConverter=TypeConverters.toListString)

    minLength = Param(Params._dummy(),
                      "minLength",
                      "Set the minimum allowed length for each token",
                      typeConverter=TypeConverters.toInt)

    maxLength = Param(Params._dummy(),
                      "maxLength",
                      "Set the maximum allowed length for each token",
                      typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(Tokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Tokenizer")
        self._setDefault(
            targetPattern="\\S+",
            contextChars=[".", ",", ";", ":", "!", "?", "*", "-", "(", ")", "\"", "'"],
            caseSensitiveExceptions=True,
            minLength=0,
            maxLength=99999
        )

    def getInfixPatterns(self):
        """Gets regex patterns that match tokens within a single target. Groups
        identify different sub-tokens.

        Returns
        -------
        List[str]
            The infix patterns
        """
        return self.getOrDefault("infixPatterns")

    def getSuffixPattern(self):
        """Gets regex with groups and ends with ``\\z`` to match target suffix.

        Returns
        -------
        str
            The suffix pattern
        """
        return self.getOrDefault("suffixPattern")

    def getPrefixPattern(self):
        """Gets regex with groups and begins with ``\\A`` to match target
        prefix.

        Returns
        -------
        str
            The prefix pattern
        """
        return self.getOrDefault("prefixPattern")

    def getContextChars(self):
        """Gets character list used to separate from token boundaries.

        Returns
        -------
        List[str]
            Character list used to separate from token boundaries
        """
        return self.getOrDefault("contextChars")

    def getSplitChars(self):
        """Gets character list used to separate from the inside of tokens.

        Returns
        -------
        List[str]
            Character list used to separate from the inside of tokens
        """
        return self.getOrDefault("splitChars")

    def setTargetPattern(self, value):
        """Sets pattern to grab from text as token candidates, by default
        ``\\S+``.

        Parameters
        ----------
        value : str
            Pattern to grab from text as token candidates
        """
        return self._set(targetPattern=value)

    def setPrefixPattern(self, value):
        """Sets regex with groups and begins with ``\\A`` to match target
        prefix, by default ``\\A([^\\s\\w\\$\\.]*)``.

        Parameters
        ----------
        value : str
            Regex with groups and begins with ``\\A`` to match target prefix
        """
        return self._set(prefixPattern=value)

    def setSuffixPattern(self, value):
        """Sets regex with groups and ends with ``\\z`` to match target suffix,
        by default ``([^\\s\\w]?)([^\\s\\w]*)\\z``.

        Parameters
        ----------
        value : str
            Regex with groups and ends with ``\\z`` to match target suffix
        """
        return self._set(suffixPattern=value)

    def setInfixPatterns(self, value):
        """Sets regex patterns that match tokens within a single target. Groups
        identify different sub-tokens.

        Parameters
        ----------
        value : List[str]
            Regex patterns that match tokens within a single target
        """
        return self._set(infixPatterns=value)

    def addInfixPattern(self, value):
        """Adds an additional regex pattern that matches tokens within a single
        target. Groups identify different sub-tokens.

        Parameters
        ----------
        value : str
            Regex pattern that matches tokens within a single target
        """
        try:
            infix_patterns = self.getInfixPatterns()
        except KeyError:
            infix_patterns = []
        infix_patterns.insert(0, value)
        return self._set(infixPatterns=infix_patterns)

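    # Note: addInfixPattern prepends the new pattern (insert at index 0), so patterns
    # added this way are tried before any previously configured or default infix rules.
    # A minimal sketch of the call pattern; the regex shown is only an illustration,
    # not a library default:
    #
    #     tokenizer = Tokenizer() \
    #         .setInputCols(["document"]) \
    #         .setOutputCol("token") \
    #         .addInfixPattern("(\\p{L}+)(-)(\\p{L}+)")  # each group becomes a sub-token
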
    def setExceptions(self, value):
        """Sets words that won't be affected by tokenization rules.

        Parameters
        ----------
        value : List[str]
            Words that won't be affected by tokenization rules
        """
        return self._set(exceptions=value)

    def getExceptions(self):
        """Gets words that won't be affected by tokenization rules.

        Returns
        -------
        List[str]
            Words that won't be affected by tokenization rules
        """
        return self.getOrDefault("exceptions")

    def setExceptionsPath(self, path, read_as=ReadAs.TEXT, options={"format": "text"}):
        """Sets the path to a text file with a list of token exceptions.

        Parameters
        ----------
        path : str
            Path to the source file
        read_as : str, optional
            How to read the file, by default ReadAs.TEXT
        options : dict, optional
            Options to read the resource, by default {"format": "text"}
        """
        opts = options.copy()
        return self._set(exceptionsPath=ExternalResource(path, read_as, opts))

    def addException(self, value):
        """Adds an additional word that won't be affected by tokenization
        rules.

        Parameters
        ----------
        value : str
            Additional word that won't be affected by tokenization rules
        """
        try:
            exception_tokens = self.getExceptions()
        except KeyError:
            exception_tokens = []
        exception_tokens.append(value)
        return self._set(exceptions=exception_tokens)

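    # A short illustrative sketch of the exceptions API (the exception string below is
    # only an example): text spans matching an exception are kept intact rather than
    # being split by the tokenization rules.
    #
    #     tokenizer = Tokenizer() \
    #         .setInputCols(["document"]) \
    #         .setOutputCol("token") \
    #         .addException("e-mail") \
    #         .setCaseSensitiveExceptions(False)
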
    def setCaseSensitiveExceptions(self, value):
        """Sets whether to care for case sensitiveness in exceptions, by
        default True.

        Parameters
        ----------
        value : bool
            Whether to care for case sensitiveness in exceptions
        """
        return self._set(caseSensitiveExceptions=value)

    def getCaseSensitiveExceptions(self):
        """Gets whether to care for case sensitiveness in exceptions.

        Returns
        -------
        bool
            Whether to care for case sensitiveness in exceptions
        """
        return self.getOrDefault("caseSensitiveExceptions")

    def setContextChars(self, value):
        """Sets character list used to separate from token boundaries, by
        default ['.', ',', ';', ':', '!', '?', '*', '-', '(', ')', '"', "'"].

        Parameters
        ----------
        value : List[str]
            Character list used to separate from token boundaries
        """
        return self._set(contextChars=value)

    def addContextChars(self, value):
        """Adds an additional character to the list used to separate from token
        boundaries.

        Parameters
        ----------
        value : str
            Additional context character
        """
        try:
            context_chars = self.getContextChars()
        except KeyError:
            context_chars = []
        context_chars.append(value)
        return self._set(contextChars=context_chars)

    def setSplitPattern(self, value):
        """Sets pattern to separate from the inside of tokens. Takes priority
        over splitChars.

        Parameters
        ----------
        value : str
            Pattern used to separate from the inside of tokens
        """
        return self._set(splitPattern=value)

    def setSplitChars(self, value):
        """Sets character list used to separate from the inside of tokens.

        Parameters
        ----------
        value : List[str]
            Character list used to separate from the inside of tokens
        """
        return self._set(splitChars=value)

    def addSplitChars(self, value):
        """Adds an additional character to separate from the inside of tokens.

        Parameters
        ----------
        value : str
            Additional character to separate from the inside of tokens
        """
        try:
            split_chars = self.getSplitChars()
        except KeyError:
            split_chars = []
        split_chars.append(value)
        return self._set(splitChars=split_chars)

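    # A sketch of the splitChars setters, showing only the call pattern; the characters
    # listed here are examples, not library defaults:
    #
    #     tokenizer = Tokenizer() \
    #         .setInputCols(["document"]) \
    #         .setOutputCol("token") \
    #         .setSplitChars(["-", "/"]) \
    #         .addSplitChars("_")
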
    def setMinLength(self, value):
        """Sets the minimum allowed length for each token, by default 0.

        Parameters
        ----------
        value : int
            Minimum allowed length for each token
        """
        return self._set(minLength=value)

    def setMaxLength(self, value):
        """Sets the maximum allowed length for each token, by default 99999.

        Parameters
        ----------
        value : int
            Maximum allowed length for each token
        """
        return self._set(maxLength=value)

    def _create_model(self, java_model):
        return TokenizerModel(java_model=java_model)

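# A minimal end-to-end sketch of the Approach/Model split, assuming a running
# SparkSession `spark` and a DataFrame `data` with a "text" column (as in the
# Tokenizer docstring example): fitting the unfitted Tokenizer builds the internal
# rules and returns a TokenizerModel that then transforms data on its own.
#
#     documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
#     tokenizer = Tokenizer() \
#         .setInputCols(["document"]) \
#         .setOutputCol("token") \
#         .setMinLength(2) \
#         .setMaxLength(30)
#     tokenizerModel = tokenizer.fit(data)  # returns a TokenizerModel
#     tokenized = tokenizerModel.transform(documentAssembler.transform(data))
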
class TokenizerModel(AnnotatorModel):
    """Tokenizes raw text into word pieces, tokens. Identifies tokens with
    tokenization open standards. A few rules will help customizing it if
    defaults do not fit user needs.

    This class represents an already fitted :class:`.Tokenizer`.

    See the main class Tokenizer for more examples of usage.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``TOKEN``
    ====================== ======================

    Parameters
    ----------
    splitPattern
        Pattern to separate from the inside of tokens. Takes priority over
        splitChars.
    splitChars
        Character list used to separate from the inside of tokens
    """

    name = "TokenizerModel"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.TOKEN

    exceptions = Param(Params._dummy(),
                       "exceptions",
                       "Words that won't be affected by tokenization rules",
                       typeConverter=TypeConverters.toListString)

    caseSensitiveExceptions = Param(Params._dummy(),
                                    "caseSensitiveExceptions",
                                    "Whether to care for case sensitiveness in exceptions",
                                    typeConverter=TypeConverters.toBoolean)

    targetPattern = Param(Params._dummy(),
                          "targetPattern",
                          "pattern to grab from text as token candidates. Defaults \\S+",
                          typeConverter=TypeConverters.toString)

    rules = Param(Params._dummy(),
                  "rules",
                  "Rules structure factory containing pre processed regex rules",
                  typeConverter=TypeConverters.identity)

    splitPattern = Param(Params._dummy(),
                         "splitPattern",
                         "pattern to separate from the inside of tokens. Takes priority over splitChars",
                         typeConverter=TypeConverters.toString)

    splitChars = Param(Params._dummy(),
                       "splitChars",
                       "character list used to separate from the inside of tokens",
                       typeConverter=TypeConverters.toListString)

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.TokenizerModel", java_model=None):
        super(TokenizerModel, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            targetPattern="\\S+",
            caseSensitiveExceptions=True
        )

    def setSplitPattern(self, value):
        """Sets pattern to separate from the inside of tokens. Takes priority
        over splitChars.

        Parameters
        ----------
        value : str
            Pattern used to separate from the inside of tokens
        """
        return self._set(splitPattern=value)

    def setSplitChars(self, value):
        """Sets character list used to separate from the inside of tokens.

        Parameters
        ----------
        value : List[str]
            Character list used to separate from the inside of tokens
        """
        return self._set(splitChars=value)

    def addSplitChars(self, value):
        """Adds an additional character to separate from the inside of tokens.

        Parameters
        ----------
        value : str
            Additional character to separate from the inside of tokens
        """
        try:
            split_chars = self.getSplitChars()
        except KeyError:
            split_chars = []
        split_chars.append(value)
        return self._set(splitChars=split_chars)

    @staticmethod
    def pretrained(name="token_rules", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "token_rules"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        TokenizerModel
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(TokenizerModel, name, lang, remote_loc)
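
# A hedged usage sketch for the pretrained model. Downloading requires network access;
# "token_rules" is the default name from the signature above, and `documentAssembler`
# and `Pipeline` are assumed to be set up as in the Tokenizer docstring example.
#
#     tokenizerModel = TokenizerModel.pretrained("token_rules", lang="en") \
#         .setInputCols(["document"]) \
#         .setOutputCol("token")
#     pipeline = Pipeline().setStages([documentAssembler, tokenizerModel])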