Source code for sparknlp.pretrained.resource_downloader

#  Copyright 2017-2022 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for the ResourceDownloader."""

import sys
import threading

from py4j.protocol import Py4JJavaError
from pyspark.ml import PipelineModel

import sparknlp.internal as _internal
from sparknlp.pretrained.utils import printProgress


[docs]class ResourceDownloader(object):
    """Downloads and manages resources, pretrained models/pipelines.

    Usually you will not need to use this class directly. It is called by the
    `pretrained()` function of annotators.

    However, you can use this class to list the available pretrained resources.

    Examples
    --------
    If you want to list all NerDLModels for the english language you can run:

    >>> ResourceDownloader.showPublicModels("NerDLModel", "en")
    +-------------+------+---------+
    | Model       | lang | version |
    +-------------+------+---------+
    | onto_100    | en   | 2.1.0   |
    | onto_300    | en   | 2.1.0   |
    | ner_dl_bert | en   | 2.2.0   |
    |  ...        | ...  | ...     |


    Similarly for Pipelines:

    >>> ResourceDownloader.showPublicPipelines("en")
    +------------------+------+---------+
    | Pipeline         | lang | version |
    +------------------+------+---------+
    | dependency_parse | en   | 2.0.2   |
    | check_spelling   | en   | 2.1.0   |
    | match_datetime   | en   | 2.1.0   |
    |  ...             | ...  | ...     |

    """

    @staticmethod
[docs]    def downloadModel(reader, name, language, remote_loc=None, j_dwn='PythonResourceDownloader'):
        """Downloads and loads a model with the default downloader. Usually this method
        does not need to be called directly, as it is called by the `pretrained()`
        method of the annotator.

        Parameters
        ----------
        reader : obj
           Class to read the model for
        name : str
            Name of the pretrained model
        language : str
            Language of the model
        remote_loc : str, optional
            Directory of the Spark NLP Folder, by default None
        j_dwn : str, optional
            Which java downloader to use, by default 'PythonResourceDownloader'

        Returns
        -------
        AnnotatorModel
            Loaded pretrained annotator/pipeline
        """
        print(name + " download started this may take some time.")
        file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
        if file_size == "-1":
            print("Can not find the model to download please check the name!")
        else:
            print("Approximate size to download " + file_size)
            stop_threads = False
            t1 = threading.Thread(target=printProgress, args=(lambda: stop_threads,))
            t1.start()
            try:
                j_obj = _internal._DownloadModel(reader.name, name, language, remote_loc, j_dwn).apply()
            except Py4JJavaError as e:
                sys.stdout.write("\n" + str(e))
                raise e
            finally:
                stop_threads = True
                t1.join()

            return reader(classname=None, java_model=j_obj)

    @staticmethod
[docs]    def downloadModelDirectly(name, remote_loc="public/models", unzip=True):
        """Downloads a model directly to the cache folder.
        You can use to copy-paste the s3 URI from the model hub  and download the model.
        For available s3 URI and models, please see the `Models Hub <https://sparknlp.org/models>`__.
        Parameters
        ----------
        name : str
            Name of the model or s3 URI
        remote_loc : str, optional
            Directory of the remote Spark NLP Folder, by default "public/models"
        unzip : Bool, optional
            Used to unzip model, by default 'True'
        """
        _internal._DownloadModelDirectly(name, remote_loc, unzip).apply()


    @staticmethod
[docs]    def downloadPipeline(name, language, remote_loc=None):
        """Downloads and loads a pipeline with the default downloader.

        Parameters
        ----------
        name : str
            Name of the pipeline
        language : str
            Language of the pipeline
        remote_loc : str, optional
            Directory of the remote Spark NLP Folder, by default None

        Returns
        -------
        PipelineModel
            The loaded pipeline
        """
        print(name + " download started this may take some time.")
        file_size = _internal._GetResourceSize(name, language, remote_loc).apply()
        if file_size == "-1":
            print("Can not find the model to download please check the name!")
        else:
            print("Approx size to download " + file_size)
            stop_threads = False
            t1 = threading.Thread(target=printProgress, args=(lambda: stop_threads,))
            t1.start()
            try:
                j_obj = _internal._DownloadPipeline(name, language, remote_loc).apply()
                jmodel = PipelineModel._from_java(j_obj)
            finally:
                stop_threads = True
                t1.join()

            return jmodel

    @staticmethod
[docs]    def clearCache(name, language, remote_loc=None):
        """Clears the cache entry of a model.

        Parameters
        ----------
        name : str
            Name of the model
        language : en
            Language of the model
        remote_loc : str, optional
            Directory of the remote Spark NLP Folder, by default None
        """
        _internal._ClearCache(name, language, remote_loc).apply()

    @staticmethod
[docs]    def showPublicModels(annotator=None, lang=None, version=None):
        """Prints all pretrained models for a particular annotator model, that are
        compatible with a version of Spark NLP. If any of the optional arguments are not
        set, the filter is not considered.

        Parameters
        ----------
        annotator : str, optional
            Name of the annotator to filer, by default None
        lang : str, optional
            Language of the models to filter, by default None
        version : str, optional
            Version of Spark NLP to filter, by default None
        """
        print(_internal._ShowPublicModels(annotator, lang, version).apply())

    @staticmethod
[docs]    def showPublicPipelines(lang=None, version=None):
        """Prints all pretrained models for a particular annotator model, that are
        compatible with a version of Spark NLP. If any of the optional arguments are not
        set, the filter is not considered.

        Parameters
        ----------
        lang : str, optional
            Language of the models to filter, by default None
        version : str, optional
            Version of Spark NLP to filter, by default None
        """
        print(_internal._ShowPublicPipelines(lang, version).apply())

    @staticmethod
[docs]    def showUnCategorizedResources():
        """Shows models or pipelines in the metadata which has not been categorized yet.
        """
        print(_internal._ShowUnCategorizedResources().apply())

    @staticmethod
[docs]    def showAvailableAnnotators():
        """Shows all available annotators in Spark NLP.
        """
        print(_internal._ShowAvailableAnnotators().apply())