E5V Embeddings

Description

Universal multimodal embeddings using E5-V.

E5-V is a multimodal embedding model that bridges the modality gap between text and images, enabling strong performance in cross-modal retrieval, classification, clustering, and more. It supports both image+text and text-only embedding scenarios, and is fine-tuned from lmms-lab/llama3-llava-next-8b. The default model is "e5v_int4".

Pretrained models can be loaded with the `pretrained` method of the companion object, for example `E5VEmbeddings.pretrained()`.

Predicted Entities

Download Copy S3 URI

How to use

# Image + Text Embedding
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit

# Load the images under `imageFolder` with Spark's "image" data source.
image_df = (
    spark.read.format("image")
    .option("dropInvalid", True)
    .load(imageFolder)
)
# The same prompt is attached to every image row as the "text" column.
imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
test_df = image_df.withColumn("text", lit(imagePrompt))

# Stage 1: turn the raw "image" column into the "image_assembler" column.
imageAssembler = ImageAssembler().setInputCol("image").setOutputCol("image_assembler")
# Stage 2: the default pretrained E5-V model reads "image_assembler" and writes "e5v".
e5vEmbeddings = (
    E5VEmbeddings.pretrained()
    .setInputCols(["image_assembler"])
    .setOutputCol("e5v")
)

pipeline = Pipeline(stages=[imageAssembler, e5vEmbeddings])
result = pipeline.fit(test_df).transform(test_df)
result.select("e5v.embeddings").show(truncate=False)

# Text-Only Embedding
from sparknlp.util import EmbeddingsDataFrameUtils
# Prompt template; the "<sent>" placeholder is replaced with the sentence to embed.
textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
textDesc = "A cat sitting in a box."

# The annotator below reads its input from the "image" column, so build a one-row
# DataFrame from the empty image row and schema provided by EmbeddingsDataFrameUtils.
emptyImageRows = spark.sparkContext.parallelize([EmbeddingsDataFrameUtils.emptyImageRow])
nullImageDF = spark.createDataFrame(emptyImageRows, EmbeddingsDataFrameUtils.imageSchema)
textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))

e5vEmbeddings = (
    E5VEmbeddings.pretrained()
    .setInputCols(["image"])
    .setOutputCol("e5v")
)
# No Pipeline is needed here: the pretrained annotator is applied directly.
result = e5vEmbeddings.transform(textDF)
result.select("e5v.embeddings").show(truncate=False)
// Image + Text Embedding
import org.apache.spark.sql.functions.lit
import com.johnsnowlabs.nlp.base.ImageAssembler
import com.johnsnowlabs.nlp.embeddings.E5VEmbeddings
import org.apache.spark.ml.Pipeline

// Load the images under `imageFolder` with Spark's "image" data source.
val imageDF = spark.read
  .format("image")
  .option("dropInvalid", value = true)
  .load(imageFolder)
// The same prompt is attached to every image row as the "text" column.
val imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
val testDF = imageDF.withColumn("text", lit(imagePrompt))

// Stage 1: turn the raw "image" column into the "image_assembler" column.
val imageAssembler = new ImageAssembler()
  .setInputCol("image")
  .setOutputCol("image_assembler")
// Stage 2: the default pretrained E5-V model reads "image_assembler" and writes "e5v".
val e5vEmbeddings = E5VEmbeddings.pretrained()
  .setInputCols("image_assembler")
  .setOutputCol("e5v")

val pipeline = new Pipeline().setStages(Array(imageAssembler, e5vEmbeddings))
val result = pipeline.fit(testDF).transform(testDF)
result.select("e5v.embeddings").show(truncate = false)

// Text-Only Embedding
import com.johnsnowlabs.nlp.util.EmbeddingsDataFrameUtils.{emptyImageRow, imageSchema}
// Prompt template; the "<sent>" placeholder is replaced with the sentence to embed.
val textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
val textDesc = "A cat sitting in a box."
// The annotator below reads its input from the "image" column, so build a one-row
// DataFrame from the empty image row and schema imported from EmbeddingsDataFrameUtils.
val nullImageDF = spark.createDataFrame(spark.sparkContext.parallelize(Seq(emptyImageRow)), imageSchema)
val textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))
// Uses its own name (like `result2` below) because `e5vEmbeddings` is already defined by
// the image + text example; a second `val e5vEmbeddings` in the same scope does not compile.
val e5vTextEmbeddings = E5VEmbeddings.pretrained()
  .setInputCols("image")
  .setOutputCol("e5v")
// No Pipeline is needed here: the pretrained annotator is applied directly.
val result2 = e5vTextEmbeddings.transform(textDF)
result2.select("e5v.embeddings").show(truncate = false)

Model Information

Model Name: e5v_int4
Compatibility: Spark NLP 5.5.1+
License: Open Source
Edition: Official
Input Labels: [image_assembler]
Output Labels: [answer]
Language: en
Size: 4.9 GB