Adding vision capabilities to large language models through CLIP, BLIP-2, and other multimodal architectures
Chapter 9 explores multimodal large language models that can process and understand both images and text, opening up new possibilities for AI applications.
Multimodal models combine vision and language understanding, enabling AI systems to perform tasks like image captioning, visual question answering, and image-text retrieval.
from urllib.request import urlopen

from PIL import Image

# Fetch a demo image (an AI-generated puppy) straight from the book's repository.
puppy_path = "https://raw.githubusercontent.com/HandsOnLLM/Hands-On-Large-Language-Models/main/chapter09/images/puppy.png"

# Download and decode it, forcing a 3-channel RGB representation.
image = Image.open(urlopen(puppy_path)).convert("RGB")

# A ground-truth caption to compare against the image embedding later.
caption = "a puppy playing in the snow"
from transformers import CLIPTokenizerFast, CLIPProcessor, CLIPModel

# OpenAI's base CLIP checkpoint (ViT-B/32 vision tower).
model_id = "openai/clip-vit-base-patch32"

# Text side: turns captions into token ids for the text encoder.
clip_tokenizer = CLIPTokenizerFast.from_pretrained(model_id)

# Image side: resizes/normalizes raw pixels for the vision encoder.
clip_processor = CLIPProcessor.from_pretrained(model_id)

# The joint model that maps both modalities into one embedding space.
model = CLIPModel.from_pretrained(model_id)
Compare multiple images with multiple captions to find the best matches.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load multiple images from the book's repository.
puppy_path = "https://raw.githubusercontent.com/HandsOnLLM/Hands-On-Large-Language-Models/main/chapter09/images/puppy.png"
cat_path = "https://raw.githubusercontent.com/HandsOnLLM/Hands-On-Large-Language-Models/main/chapter09/images/cat.png"
car_path = "https://raw.githubusercontent.com/HandsOnLLM/Hands-On-Large-Language-Models/main/chapter09/images/car.png"
paths = [puppy_path, cat_path, car_path]

# FIX: convert to "RGB" (3 channels) rather than "RGBA" — CLIP's vision
# encoder expects RGB input, and this matches the earlier single-image example.
images = [Image.open(urlopen(path)).convert("RGB") for path in paths]

captions = [
    "a puppy playing in the snow",
    "a pixelated image of a cute cat",
    "A supercar on the road with the sunset in the background",
]

# Embed all images into CLIP's shared embedding space.
image_embeddings = []
for image in images:
    pixel_values = clip_processor(images=image, return_tensors="pt")["pixel_values"]
    image_embeddings.append(
        model.get_image_features(pixel_values).detach().cpu().numpy()[0]
    )
image_embeddings = np.array(image_embeddings)

# Embed all captions with the text encoder.
text_embeddings = []
for caption in captions:
    inputs = clip_tokenizer(caption, return_tensors="pt")
    text_embeddings.append(
        model.get_text_features(**inputs).detach().cpu().numpy()[0]
    )
text_embeddings = np.array(text_embeddings)

# Similarity matrix: rows are images, columns are captions;
# the diagonal should dominate when captions match their images.
sim_matrix = cosine_similarity(image_embeddings, text_embeddings)
Ask questions about images and get specific answers.
# Load the car image and ask BLIP-2 an open-ended question about it.
image = Image.open(urlopen(car_path)).convert("RGB")
prompt = "Question: Write down what you see in this picture. Answer:"

# Preprocess the image/prompt pair and move the tensors to the model device.
inputs = blip_processor(
    image, text=prompt, return_tensors="pt"
).to(device, torch.float16)

# Generate up to 30 new tokens, then decode them back into plain text.
output_ids = model.generate(**inputs, max_new_tokens=30)
decoded = blip_processor.batch_decode(output_ids, skip_special_tokens=True)
answer = decoded[0].strip()
print(answer)
# Visual question answering: ask about the car's color.
prompt = "Question: What color is the car? Answer:"

# Build the model inputs, then cast/move them to the target device.
vqa_inputs = blip_processor(image, text=prompt, return_tensors="pt")
vqa_inputs = vqa_inputs.to(device, torch.float16)

# A short answer needs only a few tokens.
token_ids = model.generate(**vqa_inputs, max_new_tokens=10)
texts = blip_processor.batch_decode(token_ids, skip_special_tokens=True)
answer = texts[0].strip()
print(answer)
Output:
"orange"
# Visual question answering: ask what time of day the scene shows.
prompt = "Question: What time of day is it? Answer:"

# Preprocess, move to device, generate a short answer, and decode it.
vqa_inputs = blip_processor(image, text=prompt, return_tensors="pt")
vqa_inputs = vqa_inputs.to(device, torch.float16)
token_ids = model.generate(**vqa_inputs, max_new_tokens=10)
texts = blip_processor.batch_decode(token_ids, skip_special_tokens=True)
answer = texts[0].strip()
print(answer)