Try it out!

The base versions of the llava-gemma models are available on HuggingFace (HF) at Intel/llava-gemma-2b. While these checkpoints have been converted to the HF version of LLaVA, their usage currently requires a modified preprocessor (available here).
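If you don't already have the preprocessor file locally, one way to fetch it is with huggingface_hub. This is a minimal sketch, assuming processing_llavagemma.py is published at the top level of the Intel/llava-gemma-2b model repository:

from huggingface_hub import hf_hub_download

# Download the modified preprocessor into the current directory
# (assumes the file sits at the top level of the model repo).
hf_hub_download(
    repo_id="Intel/llava-gemma-2b",
    filename="processing_llavagemma.py",
    local_dir=".",
)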
With processing_llavagemma.py copied to the appropriate location (e.g. the directory you are running your script from), you can try out the llava-gemma checkpoint using the following code snippet:

import requests
import torch
from PIL import Image
from transformers import (
    LlavaForConditionalGeneration,
    AutoTokenizer,
    CLIPImageProcessor
)
from processing_llavagemma import LlavaGemmaProcessor  # Provided in the Intel/llava-gemma-2b model repo

checkpoint = "Intel/llava-gemma-2b"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and processor
model = LlavaForConditionalGeneration.from_pretrained(checkpoint).to(device)
processor = LlavaGemmaProcessor(
    tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    image_processor=CLIPImageProcessor.from_pretrained(checkpoint)
)

# Prepare inputs, using the Gemma chat template
prompt = processor.tokenizer.apply_chat_template(
    [{'role': 'user', 'content': "What's the content of the image?<image>"}],
    tokenize=False,
    add_generation_prompt=True
)
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=prompt, images=image, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the same device as the model

# Generate
generate_ids = model.generate(**inputs, max_length=30)
output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(output)
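The decoded output includes the echoed prompt. If you only want the model's reply, an optional refinement (not part of the original snippet; `prompt_length` and `reply` are illustrative names) is to decode just the newly generated tokens:

# Optional: decode only the tokens generated after the prompt
# (reuses `inputs`, `generate_ids`, and `processor` from the snippet above).
prompt_length = inputs["input_ids"].shape[-1]
reply = processor.tokenizer.decode(
    generate_ids[0][prompt_length:], skip_special_tokens=True
)
print(reply)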