Source code for sparknlp.annotator.param.evaluation_dl_params

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

from sparknlp.common import *
from sparknlp.internal import ParamsGettersSetters


class EvaluationDLParams(ParamsGettersSetters):
    """Parameters controlling logging and evaluation during training.

    Declares the ``verbose``, ``validationSplit``, ``evaluationLogExtended``,
    ``enableOutputLogs``, ``outputLogsPath`` and ``testDataset`` params
    together with a fluent setter for each of them.
    """

    verbose = Param(
        Params._dummy(),
        "verbose",
        "Level of verbosity during training",
        TypeConverters.toInt,
    )

    validationSplit = Param(
        Params._dummy(),
        "validationSplit",
        "Choose the proportion of training dataset to be validated against the model on each Epoch. The value should be between 0.0 and 1.0 and by default it is 0.0 and off.",
        TypeConverters.toFloat,
    )

    evaluationLogExtended = Param(
        Params._dummy(),
        "evaluationLogExtended",
        "Whether logs for validation to be extended: it displays time and evaluation of each label. Default is False.",
        TypeConverters.toBoolean,
    )

    enableOutputLogs = Param(
        Params._dummy(),
        "enableOutputLogs",
        "Whether to use stdout in addition to Spark logs.",
        TypeConverters.toBoolean,
    )

    outputLogsPath = Param(
        Params._dummy(),
        "outputLogsPath",
        "Folder path to save training logs",
        TypeConverters.toString,
    )

    # Holds an ExternalResource (see setTestDataset), hence the identity
    # converter rather than a primitive type converter.
    testDataset = Param(
        Params._dummy(),
        "testDataset",
        "Path to test dataset. If set used to calculate statistic on it during training.",
        TypeConverters.identity,
    )

    def setVerbose(self, value):
        """Sets level of verbosity during training

        Parameters
        ----------
        value : int
            Level of verbosity

        Returns
        -------
        EvaluationDLParams
            This instance, so that setter calls can be chained
        """
        return self._set(verbose=value)

    def setValidationSplit(self, v):
        """Sets the proportion of training dataset to be validated against the
        model on each Epoch, by default it is 0.0 and off. The value should be
        between 0.0 and 1.0.

        Parameters
        ----------
        v : float
            Proportion of training dataset to be validated

        Returns
        -------
        EvaluationDLParams
            This instance, so that setter calls can be chained
        """
        return self._set(validationSplit=v)

    def setEvaluationLogExtended(self, v):
        """Sets whether logs for validation to be extended, by default False.
        Displays time and evaluation of each label.

        Parameters
        ----------
        v : bool
            Whether logs for validation to be extended

        Returns
        -------
        EvaluationDLParams
            This instance, so that setter calls can be chained
        """
        return self._set(evaluationLogExtended=v)

    def setEnableOutputLogs(self, value):
        """Sets whether to use stdout in addition to Spark logs, by default
        False.

        Parameters
        ----------
        value : bool
            Whether to use stdout in addition to Spark logs

        Returns
        -------
        EvaluationDLParams
            This instance, so that setter calls can be chained
        """
        return self._set(enableOutputLogs=value)

    def setOutputLogsPath(self, p):
        """Sets folder path to save training logs

        Parameters
        ----------
        p : str
            Folder path to save training logs

        Returns
        -------
        EvaluationDLParams
            This instance, so that setter calls can be chained
        """
        return self._set(outputLogsPath=p)

    def setTestDataset(self, path, read_as=ReadAs.SPARK, options={"format": "parquet"}):
        """Path to a parquet file of a test dataset. If set, it is used to calculate
        statistics on it during training.

        The parquet file must be a dataframe that has the same columns as the model that
        is being trained. For example, if the model needs as input `DOCUMENT`, `TOKEN`,
        `WORD_EMBEDDINGS` (Features) and `NAMED_ENTITY` (label) then these columns also
        need to be present while saving the dataframe. The pre-processing steps for the
        training dataframe should also be applied to the test dataframe.

        An example on how to create such a parquet file could be:

        >>> # assuming preProcessingPipeline
        >>> (train, test) = data.randomSplit([0.8, 0.2])
        >>> (preProcessingPipeline
        ...     .fit(test)
        ...     .transform(test)
        ...     .write
        ...     .mode("overwrite")
        ...     .parquet("test_data"))
        >>> annotator.setTestDataset("test_data")

        Parameters
        ----------
        path : str
            Path to test dataset
        read_as : str, optional
            How to read the resource, by default ReadAs.SPARK
        options : dict, optional
            Options for reading the resource, by default {"format": "parquet"}

        Returns
        -------
        EvaluationDLParams
            This instance, so that setter calls can be chained
        """
        # Copy the options so that neither the shared default dict nor a
        # caller-owned dict is aliased by the stored resource.
        resource = ExternalResource(path, read_as, options.copy())
        return self._set(testDataset=resource)