Qwen2.5-VL-7B-Instruct (Q16 GGUF Quantized)

Description

Qwen2.5-VL-7B-Instruct (Q16 GGUF Quantized) is a 7-billion-parameter multimodal instruction-tuned model supporting text, image, and video understanding. Compared to Qwen2-VL, it introduces major enhancements in fine-grained visual analysis (objects, text, charts, layouts), structured outputs (tables, invoices, forms), visual localization (bounding boxes, points with JSON), and long-video comprehension (over 1 hour with temporal reasoning).

It also adds agentic capabilities, enabling tool use such as computer and phone control. This version is provided in GGUF Q16 format for efficient inference in Spark NLP pipelines and lightweight runtimes, balancing speed and accuracy.

Originally from Qwen/Qwen2.5-VL-7B-Instruct.

Download Copy S3 URI

How to use

# Python example: caption every image in a folder with AutoGGUFVisionModel.
# NOTE: `spark` is not created in this snippet; an active Spark session must
# already be bound to that name before running it.
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit
from sparknlp.annotator import AutoGGUFVisionModel
from sparknlp.base import DocumentAssembler, ImageAssembler

images_path = "path/to/images/folder"
prompt = "Caption this image."

# Load the images and attach the same prompt text to every row as a "caption" column.
data = ImageAssembler.loadImagesAsBytes(spark, images_path).withColumn(
    "caption", lit(prompt)
)

# Turns the "caption" column into the document annotation the model consumes.
document_assembler = DocumentAssembler()
document_assembler.setInputCol("caption").setOutputCol("caption_document")

# Turns the "image" column into the image annotation the model consumes.
image_assembler = ImageAssembler()
image_assembler.setInputCol("image").setOutputCol("image_assembler")

# Chat template handed to the model via setChatTemplate below.
qwen_chat_template = """<|im_start|>user
prompt<|im_end|>
<|im_start|>assistant
"""

autoGGUFVisionModel = (
    AutoGGUFVisionModel.pretrained("qwen2.5_vl_7b_instruct_q16_gguf")
    # Column wiring: both assembler outputs in, completions out.
    .setInputCols(["caption_document", "image_assembler"])
    .setOutputCol("completions")
    .setChatTemplate(qwen_chat_template)
    # Runtime and generation settings, as published with the model card.
    .setBatchSize(4)
    .setNGpuLayers(32)
    .setNCtx(4096)
    .setMinKeep(0)
    .setMinP(0.05)
    .setNPredict(64)
    .setNProbs(0)
    .setPenalizeNl(False)
    .setRepeatLastN(256)
    .setRepeatPenalty(1.1)
    .setStopStrings(["</s>", "<|im_end|>", "User:"])
    .setTemperature(0.2)
    .setTfsZ(1)
    .setTypicalP(1)
    .setTopK(40)
    .setTopP(0.95)
)

pipeline = Pipeline(stages=[document_assembler, image_assembler, autoGGUFVisionModel])

model = pipeline.fit(data)
result = model.transform(data)

# Show the file name (last path segment of image.origin) next to the generated text.
output_columns = [
    "reverse(split(image.origin, '/'))[0] as image_name",
    "completions.result",
]
result.selectExpr(*output_columns).show(truncate=False)
// Scala example: caption every image in a folder with AutoGGUFVisionModel.
// NOTE: `spark` is not created in this snippet; an active SparkSession must
// already be in scope under that name before running it.
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotators._
// Imported explicitly: a wildcard import of `annotators` does not reach classes
// declared in its sub-packages.
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFVisionModel
import org.apache.spark.sql.functions.lit
import org.apache.spark.ml.Pipeline

val images_path = "path/to/images/folder"
val prompt = "Caption this image."

// Load the images and attach the same prompt text to every row as a "caption"
// column, in a single expression so no `var` is needed.
val data = ImageAssembler
  .loadImagesAsBytes(spark, images_path)
  .withColumn("caption", lit(prompt))

// Turns the "caption" column into the document annotation the model consumes.
val document_assembler = new DocumentAssembler()
  .setInputCol("caption")
  .setOutputCol("caption_document")

// Turns the "image" column into the image annotation the model consumes.
val image_assembler = new ImageAssembler()
  .setInputCol("image")
  .setOutputCol("image_assembler")

// Chat template handed to the model via setChatTemplate below.
val qwen_chat_template = """<|im_start|>user
prompt<|im_end|>
<|im_start|>assistant
"""

// The fractional settings below are Float parameters. Scala does not narrow a
// Double literal (e.g. 0.2) to Float, so each one carries the `f` suffix.
val autoGGUFVisionModel = AutoGGUFVisionModel.pretrained("qwen2.5_vl_7b_instruct_q16_gguf")
  .setInputCols(Array("caption_document", "image_assembler"))
  .setOutputCol("completions")
  .setChatTemplate(qwen_chat_template)
  .setBatchSize(4)
  .setNGpuLayers(32)
  .setNCtx(4096)
  .setMinKeep(0)
  .setMinP(0.05f)
  .setNPredict(64)
  .setNProbs(0)
  .setPenalizeNl(false)
  .setRepeatLastN(256)
  .setRepeatPenalty(1.1f)
  .setStopStrings(Array("</s>", "<|im_end|>", "User:"))
  .setTemperature(0.2f)
  .setTfsZ(1f)
  .setTypicalP(1f)
  .setTopK(40)
  .setTopP(0.95f)

val pipeline = new Pipeline().setStages(Array(
  document_assembler,
  image_assembler,
  autoGGUFVisionModel
))

val model = pipeline.fit(data)
val result = model.transform(data)

// Show the file name (last path segment of image.origin) next to the generated text.
result.selectExpr(
  "reverse(split(image.origin, '/'))[0] as image_name",
  "completions.result"
).show(false)

Results


+-------------------+-----------------------------------------------------------------------------------------------------------------------------------+
|image_name         |result                                                                                                                             |
+-------------------+-----------------------------------------------------------------------------------------------------------------------------------+
|prescription_02.png|["Medical prescription for systemic lupus erythematosus and scleroderma overlap with interstitial lung disease, dated 02/07/2021."]|
|prescription_01.png|["Prescription for malaria treatment, dated 30-Aug-2023, from SMS Hospital."]                                                      |
+-------------------+-----------------------------------------------------------------------------------------------------------------------------------+

Model Information

Model Name: qwen2.5_vl_7b_instruct_q16_gguf
Compatibility: Spark NLP 6.1.1+
License: Open Source
Edition: Official
Input Labels: [caption_document, image_assembler]
Output Labels: [completions]
Language: en
Size: 13.3 GB