all-MiniLM-L6-v2 converted for Spark NLP

Description

See the original model card at https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 for details.

Predicted Entities

Download Copy S3 URI

How to use

/** Computes one averaged embedding per input row using a Spark NLP pipeline.
  *
  * Pipeline stages, in order:
  *  1. `DocumentAssembler` reads the `sentence` column and writes `document`.
  *  2. `Tokenizer` reads `document` and writes `token`.
  *  3. `BertEmbeddings` (pretrained `all_minilm_l6`, language `en`) reads
  *     `document` and `token` and writes `embeddings`.
  *  4. `SentenceEmbeddings` reads `document` and `embeddings` and writes
  *     `sentence_embeddings`, pooled with the `AVERAGE` strategy.
  *
  * @param sentences input DataFrame; must contain a `sentence` column
  * @return the input DataFrame transformed by the fitted pipeline, i.e. with
  *         the `document`, `token`, `embeddings` and `sentence_embeddings`
  *         columns produced by the stages above
  */
def embedSentences(sentences: DataFrame): DataFrame = {

    val documentAssembler = new DocumentAssembler()
      .setInputCol("sentence")
      .setOutputCol("document")

    val tokenizer = new Tokenizer()
      .setInputCols("document")
      .setOutputCol("token")

    val embeddings = BertEmbeddings
      .pretrained("all_minilm_l6", "en")
      .setInputCols(Array("document", "token"))
      .setOutputCol("embeddings")

    val sentenceEmbeddings = new SentenceEmbeddings()
      .setInputCols("document", "embeddings")
      .setOutputCol("sentence_embeddings")
      .setPoolingStrategy("AVERAGE")

    val pipeline = new Pipeline().setStages(Array(
      documentAssembler,
      tokenizer,
      embeddings,
      sentenceEmbeddings
    ))

    // The last expression is the method's result: fit the pipeline on the
    // input and return the transformed DataFrame (the declared return type).
    pipeline.fit(sentences).transform(sentences)
}
/** Computes one averaged embedding per input row using a Spark NLP pipeline.
  *
  * Pipeline stages, in order:
  *  1. `DocumentAssembler` reads the `sentence` column and writes `document`.
  *  2. `Tokenizer` reads `document` and writes `token`.
  *  3. `BertEmbeddings` (pretrained `all_minilm_l6`, language `en`) reads
  *     `document` and `token` and writes `embeddings`.
  *  4. `SentenceEmbeddings` reads `document` and `embeddings` and writes
  *     `sentence_embeddings`, pooled with the `AVERAGE` strategy.
  *
  * @param sentences input DataFrame; must contain a `sentence` column
  * @return the input DataFrame transformed by the fitted pipeline, i.e. with
  *         the `document`, `token`, `embeddings` and `sentence_embeddings`
  *         columns produced by the stages above
  */
def embedSentences(sentences: DataFrame): DataFrame = {

    val documentAssembler = new DocumentAssembler()
      .setInputCol("sentence")
      .setOutputCol("document")

    val tokenizer = new Tokenizer()
      .setInputCols("document")
      .setOutputCol("token")

    val embeddings = BertEmbeddings
      .pretrained("all_minilm_l6", "en")
      .setInputCols(Array("document", "token"))
      .setOutputCol("embeddings")

    val sentenceEmbeddings = new SentenceEmbeddings()
      .setInputCols("document", "embeddings")
      .setOutputCol("sentence_embeddings")
      .setPoolingStrategy("AVERAGE")

    val pipeline = new Pipeline().setStages(Array(
      documentAssembler,
      tokenizer,
      embeddings,
      sentenceEmbeddings
    ))

    // The last expression is the method's result: fit the pipeline on the
    // input and return the transformed DataFrame (the declared return type).
    pipeline.fit(sentences).transform(sentences)
}

Model Information

Model Name: all_MiniLM_L6_v2
Compatibility: Spark NLP 5.5.1+
License: Open Source
Edition: Community
Input Labels: [document, token]
Output Labels: [embeddings]
Language: en
Size: 84.6 MB
Case sensitive: false