from PIL import Image, ImageDraw, ImageFont
import numpy as np
import torch
from transformers import AutoProcessor, Owlv2ForObjectDetection
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD

# Local checkpoint directory. Shared by the processor and the model so the two
# can never be loaded from different locations by accident.
MODEL_PATH = "/home/share3/mayunchuan/google/owlv2-large-patch14-ensemble"
# Frame on which detection is run.
IMAGE_PATH = "/home/mayunchuan/lavad/dataset/Thumos14_25fps/frames/video_test_0000293/004902.jpg"

# Place the model on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = Owlv2ForObjectDetection.from_pretrained(MODEL_PATH).to(device)

image = Image.open(IMAGE_PATH)
# Alternative inputs kept for quick experiments:
# image = Image.open('/home/mayunchuan/lavad/dataset/Thumos14_25fps/frames/video_validation_0000990/001388.jpg')
# texts = [["a photo of a volleyball", "a photo of a man"]]
# One list of text queries per image; a single image with a single query here.
texts = [[" javelin"]]
inputs = processor(text=texts, images=image, return_tensors="pt")
# Move every tensor in the batch to wherever the model lives in one call,
# instead of moving input_ids / attention_mask / pixel_values one by one.
inputs = inputs.to(model.device)

# Forward pass; gradients are not needed for inference.
with torch.no_grad():
    outputs = model(**inputs)


# Note: boxes need to be visualized on the padded, unnormalized image,
# hence we'll set the target image sizes (height, width) based on that.
def get_preprocessed_image(pixel_values: torch.Tensor) -> Image.Image:
    """Turn the processor's normalized pixel tensor back into a PIL image.

    The tensor is squeezed, moved to the CPU, de-normalized with the CLIP
    mean/std constants, scaled to 0-255, and converted from channel-first to
    channel-last layout before being wrapped in a PIL image.

    Args:
        pixel_values: Normalized image tensor produced by the processor.
            Assumed to hold exactly one image in channel-first layout
            (the per-channel constants are broadcast along axis 0 after
            squeezing) -- TODO confirm for batched use.

    Returns:
        The de-normalized image as the model saw it (including any padding
        or resizing the processor applied).
    """
    array = pixel_values.squeeze().cpu().numpy()
    std = np.array(OPENAI_CLIP_STD)[:, None, None]
    mean = np.array(OPENAI_CLIP_MEAN)[:, None, None]
    restored = array * std + mean
    # Clip before the uint8 cast so values that drift marginally outside
    # [0, 255] through float round-off cannot wrap around.
    restored = np.clip(restored * 255, 0, 255).astype(np.uint8)
    restored = np.moveaxis(restored, 0, -1)
    return Image.fromarray(restored)


unnormalized_image = get_preprocessed_image(inputs.pixel_values)
# PIL reports (width, height); the post-processing step expects (height, width).
target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
results = processor.post_process_object_detection(
    outputs=outputs, threshold=0.2, target_sizes=target_sizes
)

# Retrieve predictions for the first image for the corresponding text queries.
i = 0
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

# Report each detection and draw its bounding box on the de-normalized image.
# Both steps share one loop so the rounded box is computed only once.
draw = ImageDraw.Draw(unnormalized_image)
for box, score, label in zip(boxes, scores, labels):
    box = [round(coord, 2) for coord in box.tolist()]
    # Convert the label tensor to a plain int before indexing the query list.
    query = text[label.item()]
    print(f"Detected {query} with confidence {round(score.item(), 3)} at location {box}")

    x, y, x2, y2 = box
    draw.rectangle((x, y, x2, y2), outline="red", width=1)
    # Label the box with its query text at the top-left corner.
    draw.text((x, y), query, font_size=20, fill="black")

# Save the annotated image.
unnormalized_image.save("marked_image.jpg")