# Source code for sparknlp.base.gguf_ranking_finisher

#  Copyright 2017-2024 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the GGUFRankingFinisher."""

from pyspark import keyword_only
from pyspark.ml.param import TypeConverters, Params, Param
from sparknlp.internal import AnnotatorTransformer


class GGUFRankingFinisher(AnnotatorTransformer):
    """Finisher for AutoGGUFReranker outputs that provides ranking capabilities
    including top-k selection, sorting by relevance score, and score normalization.

    This finisher processes the output of AutoGGUFReranker, which contains documents with
    relevance scores in their metadata. It provides several options for post-processing:

    - Top-k selection: Select only the top k documents by relevance score
    - Score thresholding: Filter documents by minimum relevance score
    - Min-max scaling: Normalize relevance scores to 0-1 range
    - Sorting: Sort documents by relevance score in descending order
    - Ranking: Add rank information to document metadata

    The finisher preserves the document annotation structure while adding ranking information
    to the metadata and optionally filtering/sorting the documents.

    For extended examples of usage, see the `Examples
    <https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/finisher/gguf_ranking_finisher_example.py>`__.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT``           ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    inputCols
        Name of input annotation columns containing reranked documents
    outputCol
        Name of output annotation column containing ranked documents, by default "ranked_documents"
    topK
        Maximum number of top documents to return based on relevance score (-1 for no limit), by default -1
    minRelevanceScore
        Minimum relevance score threshold for filtering documents, by default
        negative infinity (``float('-inf')``), which lies below every finite score
    minMaxScaling
        Whether to apply min-max scaling to normalize relevance scores to 0-1 range, by default False

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> reranker = AutoGGUFReranker.pretrained() \\
    ...     .setInputCols("document") \\
    ...     .setOutputCol("reranked_documents") \\
    ...     .setQuery("A man is eating pasta.")
    >>> finisher = GGUFRankingFinisher() \\
    ...     .setInputCols("reranked_documents") \\
    ...     .setOutputCol("ranked_documents") \\
    ...     .setTopK(3) \\
    ...     .setMinMaxScaling(True)
    >>> pipeline = Pipeline().setStages([documentAssembler, reranker, finisher])
    >>> data = spark.createDataFrame([
    ...     ("A man is eating food.",),
    ...     ("A man is eating a piece of bread.",),
    ...     ("The girl is carrying a baby.",),
    ...     ("A man is riding a horse.",)
    ... ], ["text"])
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("ranked_documents").show(truncate=False)
    # Documents will be sorted by relevance with rank information in metadata
    """

    name = "GGUFRankingFinisher"

    inputCols = Param(Params._dummy(),
                      "inputCols",
                      "Name of input annotation columns containing reranked documents",
                      typeConverter=TypeConverters.toListString)

    # Declared with a list-of-strings converter: the single output column name
    # is stored wrapped in a one-element list (see setOutputCol/getOutputCol).
    outputCol = Param(Params._dummy(),
                      "outputCol",
                      "Name of output annotation column containing ranked documents",
                      typeConverter=TypeConverters.toListString)

    topK = Param(Params._dummy(),
                 "topK",
                 "Maximum number of top documents to return based on relevance score (-1 for no limit)",
                 typeConverter=TypeConverters.toInt)

    minRelevanceScore = Param(Params._dummy(),
                              "minRelevanceScore",
                              "Minimum relevance score threshold for filtering documents",
                              typeConverter=TypeConverters.toFloat)

    minMaxScaling = Param(Params._dummy(),
                          "minMaxScaling",
                          "Whether to apply min-max scaling to normalize relevance scores to 0-1 range",
                          typeConverter=TypeConverters.toBoolean)

    @keyword_only
    def __init__(self):
        """Creates the finisher and registers the default parameter values."""
        super().__init__(
            classname="com.johnsnowlabs.nlp.finisher.GGUFRankingFinisher")
        self._setDefault(
            topK=-1,
            # Negative infinity lies below every finite score. Note that this
            # is NOT the same value as the JVM's Double.MinValue (the most
            # negative finite double).
            minRelevanceScore=float('-inf'),
            minMaxScaling=False,
            outputCol=["ranked_documents"]
        )

    @keyword_only
    def setParams(self):
        """Sets the parameters captured by the ``keyword_only`` decorator.

        The signature declares no keyword parameters, so the captured
        keyword dictionary is forwarded to ``_set`` unchanged.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setInputCols(self, *value):
        """Sets input annotation column names.

        The names may be given either as separate positional arguments or as
        one list/tuple of names.

        Parameters
        ----------
        value : List[str]
            Input annotation column names containing reranked documents

        Returns
        -------
        GGUFRankingFinisher
            The result of ``_set``, so calls can be chained
        """
        # A single sequence argument is unpacked rather than nested; a tuple is
        # copied into a list so the stored value is always a list of names.
        if len(value) == 1 and isinstance(value[0], (list, tuple)):
            return self._set(inputCols=list(value[0]))
        return self._set(inputCols=list(value))

    def getInputCols(self):
        """Gets input annotation column names.

        Returns
        -------
        List[str]
            Input annotation column names
        """
        return self.getOrDefault(self.inputCols)

    def setOutputCol(self, value):
        """Sets output annotation column name.

        Parameters
        ----------
        value : str
            Output annotation column name

        Returns
        -------
        GGUFRankingFinisher
            The result of ``_set``, so calls can be chained
        """
        # The param is declared as a list of strings, so wrap the single name.
        return self._set(outputCol=[value])

    def getOutputCol(self):
        """Gets output annotation column name.

        Returns
        -------
        str
            Output annotation column name; "ranked_documents" when the stored
            list is empty
        """
        output_cols = self.getOrDefault(self.outputCol)
        return output_cols[0] if output_cols else "ranked_documents"

    def setTopK(self, value):
        """Sets maximum number of top documents to return.

        Parameters
        ----------
        value : int
            Maximum number of top documents to return (-1 for no limit)

        Returns
        -------
        GGUFRankingFinisher
            The result of ``_set``, so calls can be chained
        """
        return self._set(topK=value)

    def getTopK(self):
        """Gets maximum number of top documents to return.

        Returns
        -------
        int
            Maximum number of top documents to return
        """
        return self.getOrDefault(self.topK)

    def setMinRelevanceScore(self, value):
        """Sets minimum relevance score threshold.

        Parameters
        ----------
        value : float
            Minimum relevance score threshold

        Returns
        -------
        GGUFRankingFinisher
            The result of ``_set``, so calls can be chained
        """
        return self._set(minRelevanceScore=value)

    def getMinRelevanceScore(self):
        """Gets minimum relevance score threshold.

        Returns
        -------
        float
            Minimum relevance score threshold
        """
        return self.getOrDefault(self.minRelevanceScore)

    def setMinMaxScaling(self, value):
        """Sets whether to apply min-max scaling.

        Parameters
        ----------
        value : bool
            Whether to apply min-max scaling to normalize scores

        Returns
        -------
        GGUFRankingFinisher
            The result of ``_set``, so calls can be chained
        """
        return self._set(minMaxScaling=value)

    def getMinMaxScaling(self):
        """Gets whether to apply min-max scaling.

        Returns
        -------
        bool
            Whether min-max scaling is enabled
        """
        return self.getOrDefault(self.minMaxScaling)