Description
Universal multimodal embeddings using E5-V.
E5-V is a multimodal embedding model that bridges the modality gap between text and images, enabling strong performance in cross-modal retrieval, classification, clustering, and more. It supports both image+text and text-only embedding scenarios, and is fine-tuned from lmms-lab/llama3-llava-next-8b. The default model is "e5v_int4".
Pretrained models can be loaded with the `pretrained` method of the companion object, e.g. `E5VEmbeddings.pretrained()`.
Predicted Entities
How to use
# Image + Text Embedding
# Loads images from a folder, attaches the prompt below as a "text" column,
# and runs ImageAssembler -> E5VEmbeddings to show the resulting embeddings.
# NOTE: `spark` and `imageFolder` are not defined in this snippet; they must
# already exist in the session before it is run.
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit

# Prompt placed next to every image row; "<image>" is the image placeholder.
imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"

image_df = (
    spark.read.format("image")
    .option("dropInvalid", True)
    .load(imageFolder)
)
test_df = image_df.withColumn("text", lit(imagePrompt))

# Stage 1: turn the raw "image" column into the "image_assembler" annotation.
imageAssembler = (
    ImageAssembler()
    .setInputCol("image")
    .setOutputCol("image_assembler")
)

# Stage 2: default pretrained E5-V model, writing its output to "e5v".
e5vEmbeddings = (
    E5VEmbeddings.pretrained()
    .setInputCols(["image_assembler"])
    .setOutputCol("e5v")
)

stages = [imageAssembler, e5vEmbeddings]
pipeline = Pipeline().setStages(stages)

# Fit and transform on the same DataFrame, then print the full embeddings.
result = pipeline.fit(test_df).transform(test_df)
result.select("e5v.embeddings").show(truncate=False)
# Text-Only Embedding
# Embeds a sentence without a real image: a DataFrame is built from
# EmbeddingsDataFrameUtils.emptyImageRow / imageSchema, and the filled-in
# prompt is attached as the "text" column.
# NOTE: relies on `spark`, `lit` and `E5VEmbeddings` already being in scope.
from sparknlp.util import EmbeddingsDataFrameUtils

textDesc = "A cat sitting in a box."
# "<sent>" is substituted with the sentence to embed before use.
textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"

# One-row DataFrame carrying the image schema with an empty image row.
emptyImageRows = spark.sparkContext.parallelize(
    [EmbeddingsDataFrameUtils.emptyImageRow]
)
nullImageDF = spark.createDataFrame(
    emptyImageRows, EmbeddingsDataFrameUtils.imageSchema
)

filledPrompt = textPrompt.replace("<sent>", textDesc)
textDF = nullImageDF.withColumn("text", lit(filledPrompt))

# The annotator reads the "image" column directly here (no ImageAssembler).
e5vEmbeddings = (
    E5VEmbeddings.pretrained()
    .setInputCols(["image"])
    .setOutputCol("e5v")
)

result = e5vEmbeddings.transform(textDF)
result.select("e5v.embeddings").show(truncate=False)
// Image + Text Embedding
// Loads images from a folder, attaches the prompt below as a "text" column,
// and runs ImageAssembler -> E5VEmbeddings to show the resulting embeddings.
// NOTE: `spark` and `imageFolder` are not defined in this snippet; they must
// already exist in the session before it is run.
import org.apache.spark.sql.functions.lit
import com.johnsnowlabs.nlp.base.ImageAssembler
import com.johnsnowlabs.nlp.embeddings.E5VEmbeddings
import org.apache.spark.ml.Pipeline

// Prompt placed next to every image row; "<image>" is the image placeholder.
val imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"

val imageDF = spark.read
  .format("image")
  .option("dropInvalid", value = true)
  .load(imageFolder)
val testDF = imageDF.withColumn("text", lit(imagePrompt))

// Stage 1: turn the raw "image" column into the "image_assembler" annotation.
val imageAssembler = new ImageAssembler()
  .setInputCol("image")
  .setOutputCol("image_assembler")

// Stage 2: default pretrained E5-V model, writing its output to "e5v".
val e5vEmbeddings = E5VEmbeddings
  .pretrained()
  .setInputCols("image_assembler")
  .setOutputCol("e5v")

val pipeline = new Pipeline().setStages(Array(imageAssembler, e5vEmbeddings))

// Fit and transform on the same DataFrame, then print the full embeddings.
val pipelineModel = pipeline.fit(testDF)
val result = pipelineModel.transform(testDF)
result.select("e5v.embeddings").show(truncate = false)
// Text-Only Embedding
// Embeds a sentence without a real image: a DataFrame is built from
// emptyImageRow / imageSchema, and the filled-in prompt is attached as the
// "text" column.
// NOTE: relies on `spark`, `lit` and `E5VEmbeddings` already being in scope.
import com.johnsnowlabs.nlp.util.EmbeddingsDataFrameUtils.{emptyImageRow, imageSchema}

// "<sent>" is substituted with the sentence to embed before use.
val textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
val textDesc = "A cat sitting in a box."

// One-row DataFrame carrying the image schema with an empty image row.
val nullImageDF = spark.createDataFrame(spark.sparkContext.parallelize(Seq(emptyImageRow)), imageSchema)
val textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))

// Named distinctly from the image example's `e5vEmbeddings` (as `result2` is
// from `result`) so both examples compile in one scope without a duplicate
// `val` definition. The annotator reads the "image" column directly here.
val e5vTextEmbeddings = E5VEmbeddings.pretrained()
  .setInputCols("image")
  .setOutputCol("e5v")

val result2 = e5vTextEmbeddings.transform(textDF)
result2.select("e5v.embeddings").show(truncate = false)
Model Information
Model Name: | e5v_int4 |
Compatibility: | Spark NLP 5.5.1+ |
License: | Open Source |
Edition: | Official |
Input Labels: | [image_assembler] |
Output Labels: | [answer] |
Language: | en |
Size: | 4.9 GB |