Description
Qwen2-VL-2B-Instruct is a 2-billion-parameter vision-language model fine-tuned to follow instructions over text, image, and video inputs. It supports tasks such as image captioning, visual question answering, and multimodal dialogue.
Originally from Qwen/Qwen2-VL-2B-Instruct
How to use
# Python (PySpark) example: run the qwen2_vl_2b_instruct_q4_gguf model over
# every image in a folder, using one fixed text prompt for all images.
# The script expects an existing `spark` session to be in scope.
from sparknlp.base import DocumentAssembler, ImageAssembler
from sparknlp.annotator import AutoGGUFVisionModel
from pyspark.sql.functions import lit
from pyspark.ml import Pipeline

image_dir = "path/to/images/folder"
caption_prompt = "Caption this image."

# Load the images and attach the same prompt to every row in a "caption" column.
images_df = ImageAssembler.loadImagesAsBytes(spark, image_dir).withColumn(
    "caption", lit(caption_prompt)
)

# Stage 1: turn the prompt column into the document annotation the model reads.
prompt_stage = DocumentAssembler().setInputCol("caption").setOutputCol("caption_document")

# Stage 2: turn the image column into the image annotation the model reads.
image_stage = ImageAssembler().setInputCol("image").setOutputCol("image_assembler")

# Chat template passed to the model; kept exactly as a multi-line literal.
chat_template = """<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
"""

# Stage 3: the pretrained vision model. Setters are applied in the same order
# as a single fluent chain would apply them; they are only grouped for reading.
vision_model = AutoGGUFVisionModel.pretrained("qwen2_vl_2b_instruct_q4_gguf")

# Input/output columns and prompt template.
vision_model = (
    vision_model
    .setInputCols(["caption_document", "image_assembler"])
    .setOutputCol("completions")
    .setChatTemplate(chat_template)
)

# Runtime settings.
vision_model = (
    vision_model
    .setBatchSize(4)
    .setNGpuLayers(32)
    .setNCtx(4096)
)

# Generation settings.
vision_model = (
    vision_model
    .setMinKeep(0)
    .setMinP(0.05)
    .setNPredict(64)
    .setNProbs(0)
    .setPenalizeNl(False)
    .setRepeatLastN(256)
    .setRepeatPenalty(1.1)
    .setStopStrings(["</s>", "<|im_end|>", "User:"])
    .setTemperature(0.2)
    .setTfsZ(1)
    .setTypicalP(1)
    .setTopK(40)
    .setTopP(0.95)
)

caption_pipeline = Pipeline(stages=[prompt_stage, image_stage, vision_model])

# Fit and apply the pipeline on the same DataFrame.
captions = caption_pipeline.fit(images_df).transform(images_df)

# Show the last path segment of each image's origin next to the generated text.
output_columns = [
    "reverse(split(image.origin, '/'))[0] as image_name",
    "completions.result",
]
captions.selectExpr(*output_columns).show(truncate=False)
// Scala example: run the qwen2_vl_2b_instruct_q4_gguf model over every image
// in a folder, using one fixed text prompt for all images.
// The script expects an existing `spark` session to be in scope.
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotators._
// Explicit imports: a wildcard import of `annotators._` does not reach classes
// in sub-packages such as `annotators.seq2seq`, so the two classes used below
// are imported by their full names.
import com.johnsnowlabs.nlp.ImageAssembler
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFVisionModel
import org.apache.spark.sql.functions.lit
import org.apache.spark.ml.Pipeline

val imagesPath = "path/to/images/folder"
val prompt = "Caption this image."

// Load the images and attach the same prompt to every row in a "caption"
// column. Built as one expression so no `var` reassignment is needed.
val data = ImageAssembler
  .loadImagesAsBytes(spark, imagesPath)
  .withColumn("caption", lit(prompt))

// Stage 1: turn the prompt column into the document annotation the model reads.
val documentAssembler = new DocumentAssembler()
  .setInputCol("caption")
  .setOutputCol("caption_document")

// Stage 2: turn the image column into the image annotation the model reads.
val imageAssembler = new ImageAssembler()
  .setInputCol("image")
  .setOutputCol("image_assembler")

// Chat template passed to the model; kept exactly as a multi-line literal.
val qwenChatTemplate = """<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
"""

// Stage 3: the pretrained vision model with its runtime and generation settings.
val autoGGUFVisionModel = AutoGGUFVisionModel.pretrained("qwen2_vl_2b_instruct_q4_gguf")
  .setInputCols(Array("caption_document", "image_assembler"))
  .setOutputCol("completions")
  .setChatTemplate(qwenChatTemplate)
  .setBatchSize(4)
  .setNGpuLayers(32)
  .setNCtx(4096)
  .setMinKeep(0)
  .setMinP(0.05)
  .setNPredict(64)
  .setNProbs(0)
  .setPenalizeNl(false)
  .setRepeatLastN(256)
  .setRepeatPenalty(1.1)
  .setStopStrings(Array("</s>", "<|im_end|>", "User:"))
  .setTemperature(0.2)
  .setTfsZ(1)
  .setTypicalP(1)
  .setTopK(40)
  .setTopP(0.95)

val pipeline = new Pipeline().setStages(Array(
  documentAssembler,
  imageAssembler,
  autoGGUFVisionModel
))

// Fit and apply the pipeline on the same DataFrame.
val model = pipeline.fit(data)
val result = model.transform(data)

// Show the last path segment of each image's origin next to the generated text.
result.selectExpr(
  "reverse(split(image.origin, '/'))[0] as image_name",
  "completions.result"
).show(truncate = false)
Results
+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
|image_name         |result                                                                                                                                   |
+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
|prescription_02.png|["Outpatient Summary: Rheumatology Consultation for Systemic Lupus Erythematosus and Scleroderma Overlap with Interstitial Lung Disease"]|
|prescription_01.png|["Medical prescription for treatment of fever and headache with medication details."]                                                    |
+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------+
Model Information
|---|---|
| Model Name: | qwen2_vl_2b_instruct_q4_gguf |
| Compatibility: | Spark NLP 6.1.1+ |
| License: | Open Source |
| Edition: | Official |
| Input Labels: | [caption_document, image_assembler] |
| Output Labels: | [completions] |
| Language: | en |
| Size: | 1.6 GB |
Previous: Qwen3-Embedding-0.6B-GGUF