# Source code for sparknlp.annotator.matcher.regex_matcher

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the RegexMatcher."""

from sparknlp.common import *


class RegexMatcher(AnnotatorApproach):
    """Uses rules to match a set of regular expressions and associate them with a
    provided identifier.

    A rule consists of a regex pattern and an identifier, delimited by a character of
    choice. An example could be `"\\d{4}\\/\\d\\d\\/\\d\\d,date"` which will match
    strings like `"1970/01/01"` to the identifier `"date"`.

    Rules must be provided by either :meth:`.setRules` (followed by
    :meth:`.setDelimiter`) or an external file.

    To use an external file, a dictionary of predefined regular expressions must be
    provided with :meth:`.setExternalRules`. The dictionary can be set in the form of a
    delimited text file.

    Pretrained pipelines are available for this module, see `Pipelines
    <https://sparknlp.org/docs/en/pipelines>`__.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/regex-matcher/Matching_Text_with_RegexMatcher.ipynb>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    strategy
        Can be either MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE, by default
        "MATCH_ALL"
    rules
        Regex rules to match the identifier with
    delimiter
        Delimiter for rules provided with setRules
    externalRules
        external resource to rules, needs 'delimiter' in options

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline

    In this example, the ``rules.txt`` has the form of::

        the\\s\\w+, followed by 'the'
        ceremonies, ceremony

    where each regex is separated from its identifier by the delimiter ``","``

    >>> documentAssembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    >>> sentence = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
    >>> regexMatcher = RegexMatcher() \\
    ...     .setExternalRules("src/test/resources/regex-matcher/rules.txt", ",") \\
    ...     .setInputCols(["sentence"]) \\
    ...     .setOutputCol("regex") \\
    ...     .setStrategy("MATCH_ALL")
    >>> pipeline = Pipeline().setStages([documentAssembler, sentence, regexMatcher])
    >>> data = spark.createDataFrame([[
    ...     "My first sentence with the first rule. This is my second sentence with ceremonies rule."
    ... ]]).toDF("text")
    >>> results = pipeline.fit(data).transform(data)
    >>> results.selectExpr("explode(regex) as result").show(truncate=False)
    +--------------------------------------------------------------------------------------------+
    |result                                                                                      |
    +--------------------------------------------------------------------------------------------+
    |[chunk, 23, 31, the first, [identifier -> followed by 'the', sentence -> 0, chunk -> 0], []]|
    |[chunk, 71, 80, ceremonies, [identifier -> ceremony, sentence -> 1, chunk -> 0], []]        |
    +--------------------------------------------------------------------------------------------+
    """

    # Annotation types consumed and produced by this annotator.
    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.CHUNK

    # Matching strategy; __init__ sets the default to "MATCH_ALL".
    strategy = Param(Params._dummy(),
                     "strategy",
                     "MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE",
                     typeConverter=TypeConverters.toString)

    # External resource holding the rules; populated by setExternalRules.
    externalRules = Param(Params._dummy(),
                          "externalRules",
                          "external resource to rules, needs 'delimiter' in options",
                          typeConverter=TypeConverters.identity)

    # Inline list of "<regex><delimiter><identifier>" rule strings.
    rules = Param(Params._dummy(),
                  "rules",
                  "Regex rules to match the identifier with",
                  typeConverter=TypeConverters.toListString)

    # Delimiter separating the regex from the identifier in `rules`.
    delimiter = Param(Params._dummy(),
                      "delimiter",
                      "Delimiter for rules",
                      typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self):
        super(RegexMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.RegexMatcher")
        self._setDefault(
            strategy="MATCH_ALL"
        )

    def setStrategy(self, value):
        """Sets matching strategy, by default "MATCH_ALL".

        Can be either MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE.

        Parameters
        ----------
        value : str
            Matching Strategy
        """
        return self._set(strategy=value)

    def setExternalRules(self, path, delimiter, read_as=ReadAs.TEXT, options={"format": "text"}):
        """Sets external resource to rules, needs 'delimiter' in options.

        Only one of either parameter `rules` or `externalRules` must be set.

        Parameters
        ----------
        path : str
            Path to the source files
        delimiter : str
            Delimiter for the dictionary file. Can also be set in `options`;
            a "delimiter" entry in `options` takes precedence over this
            argument.
        read_as : str, optional
            How to read the file, by default ReadAs.TEXT
        options : dict, optional
            Options to read the resource, by default {"format": "text"}
        """
        # Work on a copy so neither the caller's dict nor the shared default
        # dict is modified when the delimiter is added.
        opts = options.copy()
        if "delimiter" not in opts:
            opts["delimiter"] = delimiter
        return self._set(externalRules=ExternalResource(path, read_as, opts))

    def setRules(self, value):
        """Sets the regex rules to match the identifier with.

        The rules must consist of a regex pattern and an identifier for that
        pattern. The regex pattern and the identifier must be delimited by a
        character that also has to be set with `setDelimiter`.

        Only one of either parameter `rules` or `externalRules` must be set.

        Examples
        --------
        >>> regexMatcher = RegexMatcher() \\
        ...     .setRules(["\\d{4}\\/\\d\\d\\/\\d\\d,date", "\\d{2}\\/\\d\\d\\/\\d\\d,short_date"]) \\
        ...     .setDelimiter(",") \\
        ...     .setInputCols(["sentence"]) \\
        ...     .setOutputCol("regex") \\
        ...     .setStrategy("MATCH_ALL")

        Parameters
        ----------
        value : List[str]
            List of rules
        """
        return self._set(rules=value)

    def setDelimiter(self, value):
        """Sets the delimiter for rules.

        Parameters
        ----------
        value : str
            Delimiter for the rules
        """
        return self._set(delimiter=value)

    def _create_model(self, java_model):
        # Wrap the given Java model in its Python counterpart.
        return RegexMatcherModel(java_model=java_model)
class RegexMatcherModel(AnnotatorModel):
    """Instantiated model of the RegexMatcher.

    This is the instantiated model of the :class:`.RegexMatcher`.
    For training your own model, please see the documentation of that class.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    None
    """

    # Annotation types consumed and produced by this model.
    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.CHUNK

    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RegexMatcherModel", java_model=None):
        # Both arguments are forwarded unchanged to the AnnotatorModel base.
        super(RegexMatcherModel, self).__init__(
            classname=classname,
            java_model=java_model
        )

    name = "RegexMatcherModel"