使用transformers调用owlv2实现开放目标检测

news/2024/12/21 22:09:18/

安装

pip install transformers

Demo

from PIL import Image, ImageDraw, ImageFont
import numpy as np
import torch
from transformers import AutoProcessor, Owlv2ForObjectDetection
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STDprocessor = AutoProcessor.from_pretrained("/home/share3/mayunchuan/google/owlv2-large-patch14-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("/home/share3/mayunchuan/google/owlv2-large-patch14-ensemble").cuda()image = Image.open('/home/mayunchuan/lavad/dataset/Thumos14_25fps/frames/video_test_0000293/004902.jpg')
# image = Image.open('/home/mayunchuan/lavad/dataset/Thumos14_25fps/frames/video_validation_0000990/001388.jpg')
# texts = [["a photo of a volleyball", "a photo of a man"]]
texts = [[" javelin"]]
inputs = processor(text=texts, images=image, return_tensors="pt")
inputs['input_ids'] = inputs['input_ids'].cuda()
inputs['attention_mask'] = inputs['attention_mask'].cuda()
inputs['pixel_values'] = inputs['pixel_values'].cuda()
# forward pass
with torch.no_grad():outputs = model(**inputs)# Note: boxes need to be visualized on the padded, unnormalized image
# hence we'll set the target image sizes (height, width) based on thatdef get_preprocessed_image(pixel_values):pixel_values = pixel_values.squeeze().cpu().numpy()unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]unnormalized_image = (unnormalized_image * 255).astype(np.uint8)unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)unnormalized_image = Image.fromarray(unnormalized_image)return unnormalized_imageunnormalized_image = get_preprocessed_image(inputs.pixel_values)target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
# Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
results = processor.post_process_object_detection(outputs=outputs, threshold=0.2, target_sizes=target_sizes
)i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]for box, score, label in zip(boxes, scores, labels):box = [round(i, 2) for i in box.tolist()]print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")# 绘制边界框
draw = ImageDraw.Draw(unnormalized_image)for score, label, box in zip(scores, labels, boxes):box = [round(i, 2) for i in box.tolist()]x, y, x2, y2 = tuple(box)draw.rectangle((x, y, x2, y2), outline="red", width=1)draw.text((x, y), text[label.item()], font_size=20, fill="black")# 保存标记好的图片
unnormalized_image.save("marked_image.jpg")