MiniLM L6 V2

Description

Sentence embeddings using MiniLM.

MiniLM is a lightweight and efficient sentence embedding model that can generate text embeddings for various NLP tasks (e.g., classification, retrieval, clustering, and text evaluation).

Predicted Entities

Download Copy S3 URI

How to use

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Stage 1: read the raw "text" column and emit it as the "document" column.
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Stage 2: load the pretrained MiniLM model. The name and language are spelled
# out so this example matches the Scala example and the "Model Name" field of
# this card.
embeddings = MiniLMEmbeddings.pretrained("minilm_l6_v2", "en") \
    .setInputCols(["document"]) \
    .setOutputCol("minilm_embeddings")

# Stage 3: convert the embedding annotations into the "finished_embeddings"
# column, with vector output enabled.
embeddingsFinisher = EmbeddingsFinisher() \
    .setInputCols(["minilm_embeddings"]) \
    .setOutputCols("finished_embeddings") \
    .setOutputAsVector(True)

pipeline = Pipeline().setStages([
    documentAssembler,
    embeddings,
    embeddingsFinisher
])

# One inner list per row: each sentence becomes its own row of the single
# "text" column. (Putting both sentences in one inner list would create a
# single two-column row, and toDF("text") would then fail because only one
# column name is supplied.)
data = spark.createDataFrame([
    ["This is a sample sentence for embedding generation."],
    ["Another example sentence to demonstrate MiniLM embeddings."],
]).toDF("text")

result = pipeline.fit(data).transform(data)

# Show up to 5 embedding rows, truncating each cell to 80 characters.
result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
import spark.implicits._
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.embeddings.MiniLMEmbeddings
import com.johnsnowlabs.nlp.EmbeddingsFinisher
import org.apache.spark.ml.Pipeline

// Sample input: every sentence becomes one row of a single "text" column.
val sentences = Seq(
  "This is a sample sentence for embedding generation.",
  "Another example sentence to demonstrate MiniLM embeddings."
)
val data = sentences.toDF("text")

// Reads the raw "text" column and writes the "document" column.
val documentAssembler = new DocumentAssembler().setInputCol("text").setOutputCol("document")

// Pretrained English MiniLM model; reads "document", writes "minilm_embeddings".
val embeddings = MiniLMEmbeddings
  .pretrained("minilm_l6_v2", "en")
  .setInputCols("document")
  .setOutputCol("minilm_embeddings")

// Converts the embedding annotations into "finished_embeddings", with vector output enabled.
val embeddingsFinisher = new EmbeddingsFinisher()
  .setInputCols("minilm_embeddings")
  .setOutputCols("finished_embeddings")
  .setOutputAsVector(true)

// Stages run in the order listed: assemble documents, embed, then finish.
val pipeline = new Pipeline().setStages(Array(documentAssembler, embeddings, embeddingsFinisher))

val result = pipeline.fit(data).transform(data)

// Show one embedding row, truncating each cell to 80 characters.
result.selectExpr("explode(finished_embeddings) as result").show(1, 80)

Model Information

Model Name: minilm_l6_v2
Compatibility: Spark NLP 5.5.1+
License: Open Source
Edition: Official
Input Labels: [documents]
Output Labels: [minilm]
Language: en
Size: 17.2 MB