import functools
import os

import requests
import torch
from PIL import Image
from smolagents.tools import tool
from transformers import BlipForQuestionAnswering, BlipProcessor

DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Hugging Face model id used for visual question answering.
_VQA_MODEL_ID = "Salesforce/blip-vqa-base"

# Per-request timeout (seconds) so a stalled server cannot hang the tool.
_DOWNLOAD_TIMEOUT_SECONDS = 30


def _download_file(file_name: str) -> None:
    """Download ``file_name`` from the scoring API unless it already exists.

    The remote identifier is taken from the base name of ``file_name``: the
    dot-separated component just before the extension, or the whole base name
    when it contains no dot.

    Args:
        file_name: Local path the file is expected at / should be saved to.

    Raises:
        ValueError: If no identifier can be derived from ``file_name``.
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    if os.path.exists(file_name):
        return

    # Work on the base name only so directory parts (which may themselves
    # contain dots, e.g. "./abc.png") never leak into the URL.
    base_name = os.path.basename(file_name)
    parts = base_name.split(".")
    file_id = parts[-2] if len(parts) >= 2 else base_name
    if not file_id:
        raise ValueError(f"Cannot derive a file id from {file_name!r}")

    url = f"{DEFAULT_API_URL}/files/{file_id}"
    response = requests.get(url, timeout=_DOWNLOAD_TIMEOUT_SECONDS)
    # Fail before touching the disk: otherwise an error page would be saved
    # under the image's name and treated as "already downloaded" next time.
    response.raise_for_status()

    with open(file_name, "wb") as f:
        f.write(response.content)


@functools.lru_cache(maxsize=1)
def _load_vqa_model():
    """Load the BLIP processor and model once and reuse them across calls.

    Returns:
        A ``(processor, model, device)`` tuple, where ``device`` is ``'cuda'``
        when CUDA is available and ``'cpu'`` otherwise, and ``model`` has
        already been moved to that device.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    processor = BlipProcessor.from_pretrained(_VQA_MODEL_ID)
    model = BlipForQuestionAnswering.from_pretrained(_VQA_MODEL_ID)
    model = model.to(device)
    return processor, model, device


# NOTE: the docstring below is consumed by the smolagents ``@tool`` decorator
# to build the tool description, so its wording and ``Args:`` layout are kept.
@tool
def ask_question_about_image(question: str, path_to_image: str) -> str:
    """
    Ask a question about an image and return the answer.

    Args:
        question: the question to ask about the image.
        path_to_image: The path to the image to ask the question about.

    Returns:
        A string with the answer to the question.
    """
    # Download the file if it doesn't exist
    _download_file(path_to_image)

    # Processor/model are cached, so only the first call pays the load cost.
    processor, model, device = _load_vqa_model()

    # ``convert`` returns an independent in-memory copy, so the underlying
    # file handle can be released as soon as the ``with`` block exits.
    with Image.open(path_to_image) as raw_image:
        image = raw_image.convert('RGB')

    # Process the inputs and move every tensor to the model's device
    inputs = processor(image, question, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate the answer (inference only, no gradients needed)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=50, num_beams=5)

    # Decode and return the answer
    return processor.decode(outputs[0], skip_special_tokens=True)