Description
This is an image captioning model using ViT to encode images and GPT2 to generate captions. Original model from https://huggingface.co/nlpconnect/vit-gpt2-image-captioning
Predicted Entities
How to use
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Read the test images; dropInvalid skips files Spark cannot decode as images.
imageDF = spark.read \
    .format("image") \
    .option("dropInvalid", value=True) \
    .load("src/test/resources/image/")

# Wrap raw image rows into the annotation format Spark NLP expects.
imageAssembler = ImageAssembler() \
    .setInputCol("image") \
    .setOutputCol("image_assembler")

# ViT encoder + GPT2 decoder caption generator (default pretrained model).
imageCaptioning = VisionEncoderDecoderForImageCaptioning \
    .pretrained() \
    .setBeamSize(2) \
    .setDoSample(False) \
    .setInputCols(["image_assembler"]) \
    .setOutputCol("caption")

pipeline = Pipeline().setStages([imageAssembler, imageCaptioning])
pipelineDF = pipeline.fit(imageDF).transform(imageDF)

# Show each image file name next to its generated caption.
pipelineDF \
    .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "caption.result") \
    .show(truncate=False)
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.ImageAssembler
import org.apache.spark.ml.Pipeline

// Read the test images; dropInvalid skips files Spark cannot decode as images.
val imageDF: DataFrame = spark.read
  .format("image")
  .option("dropInvalid", value = true)
  .load("src/test/resources/image/")

// Wrap raw image rows into the annotation format Spark NLP expects.
// FIX: this stage was previously misnamed `imageCaptioning`.
val imageAssembler = new ImageAssembler()
  .setInputCol("image")
  .setOutputCol("image_assembler")

// ViT encoder + GPT2 decoder caption generator (default pretrained model).
// FIX: this stage was previously misnamed `imageClassifier`, leaving
// `imageAssembler` undefined in the Pipeline stages below.
val imageCaptioning = VisionEncoderDecoderForImageCaptioning
  .pretrained()
  .setBeamSize(2)
  .setDoSample(false)
  .setInputCols("image_assembler")
  .setOutputCol("caption")

val pipeline = new Pipeline().setStages(Array(imageAssembler, imageCaptioning))
val pipelineDF = pipeline.fit(imageDF).transform(imageDF)

// Show each image file name next to its generated caption.
pipelineDF
  .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "caption.result")
  .show(truncate = false)
Model Information
Model Name: | image_captioning_vit_gpt2 |
Compatibility: | Spark NLP 5.5.1+ |
License: | Open Source |
Edition: | Official |
Input Labels: | [image_assembler] |
Output Labels: | [caption] |
Language: | en |
Size: | 1.0 GB |
References
https://huggingface.co/nlpconnect/vit-gpt2-image-captioning