Description
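FlagEmbedding is a project focused on retrieval-augmented LLMs. This model, `bge_base_en_v1_5_onnx`, is an English BGE embedding model packaged for use with the Spark NLP `BGEEmbeddings` annotator.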
How to use
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
# Stage 1: read the raw "text" column and emit it as the "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: load the pretrained "bge_base_en_v1_5_onnx" model for English and
# have it read "document" and write its output to "embeddings".
embeddings = (
    BGEEmbeddings.pretrained("bge_base_en_v1_5_onnx", "en")
    .setInputCols(["document"])
    .setOutputCol("embeddings")
)

# Chain both stages into a single Spark ML pipeline.
nlp_pipeline = Pipeline(stages=[document_assembler, embeddings])

# One-row DataFrame with a single "text" column to run the pipeline on.
data = spark.createDataFrame([["This is a test sentence for BGE embeddings."]]).toDF("text")

# Fit and apply the pipeline, then print the embedding vectors.
result = nlp_pipeline.fit(data).transform(data)
result.select("embeddings.embeddings").show()
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.embeddings._
import org.apache.spark.ml.Pipeline
// Stage 1: read the raw "text" column and emit it as the "document" column.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Stage 2: load the pretrained "bge_base_en_v1_5_onnx" model for English and
// have it read "document" and write its output to "embeddings".
val embeddings = BGEEmbeddings.pretrained("bge_base_en_v1_5_onnx", "en")
  .setInputCols(Array("document"))
  .setOutputCol("embeddings")

// Chain both stages into a single Spark ML pipeline.
val pipeline = new Pipeline().setStages(Array(
  documentAssembler,
  embeddings
))

// createDataFrame on a local Seq[A] requires A <: Product. Parentheses around a
// single value do not build a tuple, so the row is wrapped in Tuple1 explicitly.
val data = spark.createDataFrame(Seq(
  Tuple1("This is a test sentence for BGE embeddings.")
)).toDF("text")

// Fit and apply the pipeline, then print the embedding vectors.
val result = pipeline.fit(data).transform(data)
result.select("embeddings.embeddings").show()
Results
+--------------------+
| embeddings|
+--------------------+
|[[0.023069248, -0...|
+--------------------+
Model Information
| Model Name: | bge_base_en_v1_5_onnx |
| Compatibility: | Spark NLP 6.0.0+ |
| License: | Open Source |
| Edition: | Official |
| Input Labels: | [document] |
| Output Labels: | [embeddings] |
| Language: | en |
| Size: | 256.0 MB |
| Case sensitive: | false |
| Max sentence length: | 512 |