Description
Qwen2.5-VL-7B-Instruct (Q16 GGUF Quantized) is a 7-billion-parameter multimodal instruction-tuned model supporting text, image, and video understanding. Compared to Qwen2-VL, it introduces major enhancements in fine-grained visual analysis (objects, text, charts, layouts), structured outputs (tables, invoices, forms), visual localization (bounding boxes, points with JSON), and long-video comprehension (over 1 hour with temporal reasoning).
It also adds agentic capabilities, enabling tool use such as computer and phone control. This version is provided in GGUF Q16 format for efficient inference in Spark NLP pipelines and lightweight runtimes, balancing speed and accuracy.
Originally from Qwen/Qwen2.5-VL-7B-Instruct.
How to use
from sparknlp.base import DocumentAssembler, ImageAssembler
from sparknlp.annotator import AutoGGUFVisionModel
from pyspark.sql.functions import lit
from pyspark.ml import Pipeline
# Python example: caption every image in a folder with the pretrained
# Qwen2.5-VL GGUF model.
# Assumes an active SparkSession is already available as `spark`.

images_path = "path/to/images/folder"
prompt = "Caption this image."

# Load the images as raw bytes and attach the same prompt text to every row
# in a "caption" column.
data = ImageAssembler.loadImagesAsBytes(spark, images_path).withColumn(
    "caption", lit(prompt)
)

# Turns the "caption" text column into a document annotation column.
caption_stage = DocumentAssembler()
caption_stage = caption_stage.setInputCol("caption")
caption_stage = caption_stage.setOutputCol("caption_document")

# Turns the "image" column into an image annotation column.
image_stage = ImageAssembler()
image_stage = image_stage.setInputCol("image")
image_stage = image_stage.setOutputCol("image_assembler")

# ChatML-style template handed to the model.
# NOTE(review): the word "prompt" is passed literally here; confirm whether a
# placeholder was intended.
qwen_chat_template = (
    "<|im_start|>user\n"
    "prompt<|im_end|>\n"
    "<|im_start|>assistant\n"
)

vision_model = (
    AutoGGUFVisionModel.pretrained("qwen2.5_vl_7b_instruct_q16_gguf")
    # Wiring: consumes both assembler outputs, writes to "completions".
    .setInputCols(["caption_document", "image_assembler"])
    .setOutputCol("completions")
    .setChatTemplate(qwen_chat_template)
    # Runtime / context settings.
    .setBatchSize(4)
    .setNGpuLayers(32)
    .setNCtx(4096)
    # Generation and sampling settings.
    .setMinKeep(0)
    .setMinP(0.05)
    .setNPredict(64)
    .setNProbs(0)
    .setPenalizeNl(False)
    .setRepeatLastN(256)
    .setRepeatPenalty(1.1)
    .setStopStrings(["</s>", "<|im_end|>", "User:"])
    .setTemperature(0.2)
    .setTfsZ(1)
    .setTypicalP(1)
    .setTopK(40)
    .setTopP(0.95)
)

# Assemble the three stages, fit on the data, then run inference on it.
pipeline = Pipeline(stages=[caption_stage, image_stage, vision_model])
model = pipeline.fit(data)
result = model.transform(data)

# Show the file name (last path segment of image.origin) next to the
# generated text, without truncating long cells.
output_columns = [
    "reverse(split(image.origin, '/'))[0] as image_name",
    "completions.result",
]
result.selectExpr(*output_columns).show(truncate=False)
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotators._
import org.apache.spark.sql.functions.lit
import org.apache.spark.ml.Pipeline
// Scala example: caption every image in a folder with the pretrained
// Qwen2.5-VL GGUF model.
// Assumes an active SparkSession is already available as `spark`.

val imagesPath = "path/to/images/folder"
val prompt = "Caption this image."

// Load the images as raw bytes and attach the same prompt text to every row
// in a "caption" column. Built as a single immutable value (no `var`).
val data = ImageAssembler
  .loadImagesAsBytes(spark, imagesPath)
  .withColumn("caption", lit(prompt))

// Turns the "caption" text column into a document annotation column.
val documentAssembler = new DocumentAssembler()
  .setInputCol("caption")
  .setOutputCol("caption_document")

// Turns the "image" column into an image annotation column.
val imageAssembler = new ImageAssembler()
  .setInputCol("image")
  .setOutputCol("image_assembler")

// ChatML-style template handed to the model.
// NOTE(review): the word "prompt" is passed literally here; confirm whether a
// placeholder was intended.
val qwenChatTemplate =
  "<|im_start|>user\n" +
    "prompt<|im_end|>\n" +
    "<|im_start|>assistant\n"

// Fractional sampling parameters use Float literals (`f` suffix): a bare
// Double literal such as 0.05 is not narrowed to Float by the compiler.
val autoGGUFVisionModel = AutoGGUFVisionModel
  .pretrained("qwen2.5_vl_7b_instruct_q16_gguf")
  .setInputCols(Array("caption_document", "image_assembler"))
  .setOutputCol("completions")
  .setChatTemplate(qwenChatTemplate)
  .setBatchSize(4)
  .setNGpuLayers(32)
  .setNCtx(4096)
  .setMinKeep(0)
  .setMinP(0.05f)
  .setNPredict(64)
  .setNProbs(0)
  .setPenalizeNl(false)
  .setRepeatLastN(256)
  .setRepeatPenalty(1.1f)
  .setStopStrings(Array("</s>", "<|im_end|>", "User:"))
  .setTemperature(0.2f)
  .setTfsZ(1.0f)
  .setTypicalP(1.0f)
  .setTopK(40)
  .setTopP(0.95f)

// Assemble the three stages, fit on the data, then run inference on it.
val pipeline = new Pipeline().setStages(
  Array(documentAssembler, imageAssembler, autoGGUFVisionModel))

val model = pipeline.fit(data)
val result = model.transform(data)

// Show the file name (last path segment of image.origin) next to the
// generated text, without truncating long cells.
result
  .selectExpr(
    "reverse(split(image.origin, '/'))[0] as image_name",
    "completions.result")
  .show(truncate = false)
Results
+-------------------+-----------------------------------------------------------------------------------------------------------------------------------+
|image_name |result |
+-------------------+-----------------------------------------------------------------------------------------------------------------------------------+
|prescription_02.png|["Medical prescription for systemic lupus erythematosus and scleroderma overlap with interstitial lung disease, dated 02/07/2021."]|
|prescription_01.png|["Prescription for malaria treatment, dated 30-Aug-2023, from SMS Hospital."] |
+-------------------+-----------------------------------------------------------------------------------------------------------------------------------+
Model Information
Model Name: | qwen2.5_vl_7b_instruct_q16_gguf |
Compatibility: | Spark NLP 6.1.1+ |
License: | Open Source |
Edition: | Official |
Input Labels: | [caption_document, image_assembler] |
Output Labels: | [completions] |
Language: | en |
Size: | 13.3 GB |
PREVIOUS: BGE Reranker V2 M3 Q4_K_M GGUF