Description
Sentence embeddings using MiniLM.
MiniLM is a lightweight and efficient sentence embedding model that generates text embeddings for various NLP tasks (e.g., classification, retrieval, clustering, and text evaluation).
Predicted Entities
How to use
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
# Example pipeline: raw text -> document annotations -> MiniLM sentence embeddings -> plain vectors.
# Assumes an active SparkSession is bound to `spark` (for example, one created with sparknlp.start()).

# Stage 1: read the "text" column and emit it as the "document" annotation column.
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Stage 2: load the pretrained model by explicit name and language so the example
# matches the Scala snippet and the Model Information section of this card.
embeddings = MiniLMEmbeddings.pretrained("minilm_l6_v2", "en") \
    .setInputCols(["document"]) \
    .setOutputCol("minilm_embeddings")

# Stage 3: convert the embedding annotations into a "finished_embeddings" column of vectors.
embeddingsFinisher = EmbeddingsFinisher() \
    .setInputCols(["minilm_embeddings"]) \
    .setOutputCols("finished_embeddings") \
    .setOutputAsVector(True)

pipeline = Pipeline().setStages([
    documentAssembler,
    embeddings,
    embeddingsFinisher
])

# One inner list per row: each sentence is its own single-column row.
# (Putting both sentences in the same inner list would create one row with two
# columns, and .toDF("text") would then fail because only one column name is given.)
data = spark.createDataFrame([
    ["This is a sample sentence for embedding generation."],
    ["Another example sentence to demonstrate MiniLM embeddings."],
]).toDF("text")

result = pipeline.fit(data).transform(data)

# Flatten the per-row array of vectors and print up to 5 rows, truncating cells at 80 characters.
result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
import spark.implicits._
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.embeddings.MiniLMEmbeddings
import com.johnsnowlabs.nlp.EmbeddingsFinisher
import org.apache.spark.ml.Pipeline
// Example pipeline: raw text -> document annotations -> MiniLM sentence embeddings -> plain vectors.

// Stage 1: read the "text" column and emit it as the "document" annotation column.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Stage 2: pretrained MiniLM model, selected by name and language.
val embeddings = MiniLMEmbeddings.pretrained("minilm_l6_v2", "en")
  .setInputCols("document")
  .setOutputCol("minilm_embeddings")

// Stage 3: convert the embedding annotations into a "finished_embeddings" column of vectors.
val embeddingsFinisher = new EmbeddingsFinisher()
  .setInputCols("minilm_embeddings")
  .setOutputCols("finished_embeddings")
  .setOutputAsVector(true)

val pipeline = new Pipeline()
  .setStages(Array(documentAssembler, embeddings, embeddingsFinisher))

// Each element of the Seq becomes one row of the single-column "text" DataFrame.
val sentences = Seq(
  "This is a sample sentence for embedding generation.",
  "Another example sentence to demonstrate MiniLM embeddings.")
val data = sentences.toDF("text")

// Fit and apply the pipeline as two explicit steps.
val fittedPipeline = pipeline.fit(data)
val result = fittedPipeline.transform(data)

// Flatten the per-row array of vectors and print 1 row, truncating cells at 80 characters.
result
  .selectExpr("explode(finished_embeddings) as result")
  .show(1, 80)
Model Information
Model Name: | minilm_l6_v2 |
Compatibility: | Spark NLP 5.5.1+ |
License: | Open Source |
Edition: | Official |
Input Labels: | [documents] |
Output Labels: | [minilm] |
Language: | en |
Size: | 17.2 MB |