link
在使用摄像头直接检测目标时,检测的实时画面还是有点慢,下面是tensorrt加速过程记录。
一、设备
1、设备jetson agx xavier
2、jetpack4.6.1
3、tensorrt 8.2.1.8
4、conda虚拟环境 python=3.6
二、虚拟环境搭建及依赖
1、参考此博客安装torch
Nvidia jetson xavier agx 安装pytorch1.9.0 Gpu版_Ponnyao的博客-CSDN博客_xavier安装pytorch
2、安装pycuda
- conda activate pytorch #我的虚拟环境名字是pytorch
- pip3 install pycuda
3、虚拟环境中使用tensorrt
- #查看tensorrt路径
- sudo find / -name "tensorrt*"
-
- #进入虚拟环境的此路径
- cd /home/nvidia/archiconda/envs/pytorch/lib/python3.6/site-packages
-
- #设置软连接
- ln -s /usr/lib/python3.6/dist-packages/tensorrt
-
- #上一步不行的话用这个
- ln -s /usr/lib/python3.6/dist-packages/tensorrt/tensorrt.so
三、加速过程
我的项目yolov5_tensorrt-深度学习文档类资源-CSDN下载
1、下载项目
以yolov5 v6.0为例
- mkdir yolov5_tensorrt
- cd yolov5_tensorrt
- git clone -b v6.0 https://github.com/ultralytics/yolov5.git
- git clone https://github.com/wang-xinyu/tensorrtx.git
2、下载yolov5s.pt文件
下载后,放到 yolov5_tensorrt/yolov5文件夹下
https://github.com/ultralytics/yolov5/releases/tag/v6.0
3、转换模型pt->wts
- cp yolov5_tensorrt/tensorrtx/yolov5/gen_wts.py yolov5_tensorrt/yolov5
- cd yolov5_tensorrt/yolov5
- python3 gen_wts.py -w yolov5s.pt -o yolov5s.wts
4、生成引擎文件
- cd yolov5_tensorrt/tensorrtx/yolov5/
- mkdir build
- cd build
- cp ../../../yolov5/yolov5s.wts .
- cmake ..
- make
- sudo ./yolov5 -s yolov5s.wts yolov5s.engine s
生成yolov5s.engine。
5、摄像头加速
原作者只有图片加速,下面是大神修改的摄像头加速文件。
yolov5_trt_cam.py
- """
- An example that uses TensorRT's Python api to make inferences.
- """
- import ctypes
- import os
- import shutil
- import random
- import sys
- import threading
- import time
- import cv2
- import numpy as np
- import pycuda.autoinit
- import pycuda.driver as cuda
- import tensorrt as trt
- import torch
- import torchvision
- import argparse
-
- CONF_THRESH = 0.5
- IOU_THRESHOLD = 0.4
-
-
def get_img_path_batches(batch_size, img_dir):
    """
    description: Walk img_dir recursively and group every file path found
                 into consecutive batches.
    param:
        batch_size: int, number of paths per batch
        img_dir:    str, root directory to walk
    return:
        a list of batches, each batch being a list of file paths; the last
        batch may hold fewer than batch_size paths
    """
    batches = []
    current = []
    for folder, _subdirs, filenames in os.walk(img_dir):
        for path in (os.path.join(folder, fname) for fname in filenames):
            # Flush the running batch *before* adding the next path once it
            # has reached the requested size.
            if len(current) == batch_size:
                batches.append(current)
                current = []
            current.append(path)
    # Keep whatever is left over as a final (possibly short) batch.
    if current:
        batches.append(current)
    return batches
-
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    description: Draw a single bounding box, and optionally a filled label
                 tag above its top-left corner, onto img in place.
                 Adapted from the YOLOv5 project.
    param:
        x:              a box given as [x1, y1, x2, y2]
        img:            an OpenCV image object, modified in place
        color:          rectangle color such as (0, 255, 0); a random color
                        is picked when omitted
        label:          str, text to draw next to the box (skipped if empty)
        line_thickness: int, line thickness; derived from the image size
                        when omitted
    return:
        no return
    """
    # Line/font thickness: explicit value, else scaled from the image size.
    thickness = line_thickness
    if not thickness:
        thickness = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # Filled background for the label, sitting just above the box corner.
    tag_corner = top_left[0] + text_w, top_left[1] - text_h - 3
    cv2.rectangle(img, top_left, tag_corner, color, -1, cv2.LINE_AA)
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )
-
-
class YoLov5TRT(object):
    """
    description: A YOLOv5 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        """
        description: Create a CUDA context, deserialize the TensorRT engine
                     and allocate host/device buffers for every binding.
        param:
            engine_file_path: str, path of the serialized .engine file
        """
        # Create a Context on this device; destroy() pops it again.
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            binding_shape = engine.get_binding_shape(binding)
            print('bingding:', binding, binding_shape)
            size = trt.volume(binding_shape) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The last two dims of the input binding are taken as (h, w).
                self.input_w = binding_shape[-1]
                self.input_h = binding_shape[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, input_image_path):
        """
        description: Run detection on one frame and draw the results on it.
        param:
            input_image_path: the frame to process. Despite the name this is
                              an image array (it is passed straight to
                              preprocess_image, which reads its .shape),
                              not a file path.
        return:
            image_raw: the input frame with boxes and labels drawn on it
            use_time:  seconds spent on host<->device copies plus inference
        note:
            Labels are looked up in the module-level list `categories`,
            which must be defined before this method is called.
        """
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            self.input_image_path = input_image_path
            # Do image preprocess
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(input_image_path)
            batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
            np.copyto(batch_input_image, input_image)

            # Copy input image to host buffer
            np.copyto(self.host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
            # Run inference.
            self.context.execute_async(
                batch_size=self.batch_size,
                bindings=self.bindings,
                stream_handle=self.stream.handle,
            )
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
            # Synchronize the stream
            self.stream.synchronize()
            end = time.time()
        finally:
            # Always deactivate the context again, even when a frame fails,
            # so the context stack stays balanced across calls.
            self.ctx.pop()

        # Here we use the first row of output in that batch_size = 1
        output = self.host_outputs[0]
        # Do postprocess
        result_boxes, result_scores, result_classid = self.post_process(
            output, origin_h, origin_w)
        # Draw rectangles and labels on the original image
        for box, score, class_id in zip(result_boxes, result_scores, result_classid):
            plot_one_box(
                box,
                image_raw,
                label="{}:{:.2f}".format(categories[int(class_id)], score),
            )
        return image_raw, end - start

    def destroy(self):
        """
        description: Release the CUDA context created in __init__.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

    def get_raw_image(self, image_path_batch):
        """
        description: Read an image from image path
        param:
            image_path_batch: iterable of image file paths
        return:
            a generator yielding the result of cv2.imread for each path
        """
        for img_path in image_path_batch:
            yield cv2.imread(img_path)

    def get_raw_image_zeros(self, image_path_batch=None):
        """
        description: Ready data for warmup
        param:
            image_path_batch: unused, kept for signature compatibility
        return:
            a generator yielding batch_size all-zero uint8 images of shape
            (input_h, input_w, 3)
        """
        for _ in range(self.batch_size):
            yield np.zeros([self.input_h, self.input_w, 3], dtype=np.uint8)

    def preprocess_image(self, input_image_path):
        """
        description: Convert BGR image to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            input_image_path: the BGR image array to process (not a path,
                              despite the parameter name)
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """
        image_raw = input_image_path
        h, w, _ = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width and height and paddings
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: pad top and bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: pad left and right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128). The color must be passed as
        # `value=`: the positional slot after borderType is `dst`, so passing
        # the tuple positionally never sets the border color.
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right,
                        undoing the padding and scaling applied by preprocess_image so that the result is in
                        original-image coordinates
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes tensor, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes tensor, each row is a box [x1, y1, x2, y2]
        """
        y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            # Image was padded vertically: remove the top padding from y coords.
            pad_y = (self.input_h - r_w * origin_h) / 2
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - pad_y
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - pad_y
            y /= r_w
        else:
            # Image was padded horizontally: remove the left padding from x coords.
            pad_x = (self.input_w - r_h * origin_w) / 2
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - pad_x
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - pad_x
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A tensor likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, a boxes tensor, each row is a box [x1, y1, x2, y2]
            result_scores: final scores, a tensor, each element is the score corresponding to box
            result_classid: final classid, a tensor, each element is the classid corresponding to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray
        pred = np.reshape(output[1:], (-1, 6))[:num, :]
        # to a torch Tensor
        pred = torch.Tensor(pred).cuda()
        # Get the boxes
        boxes = pred[:, :4]
        # Get the scores
        scores = pred[:, 4]
        # Get the classid
        classid = pred[:, 5]
        # Choose those boxes that score > CONF_THRESH
        si = scores > CONF_THRESH
        boxes = boxes[si, :]
        scores = scores[si]
        classid = classid[si]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes = self.xywh2xyxy(origin_h, origin_w, boxes)
        # Do nms
        indices = torchvision.ops.nms(boxes, scores, iou_threshold=IOU_THRESHOLD).cpu()
        result_boxes = boxes[indices, :].cpu()
        result_scores = scores[indices].cpu()
        result_classid = classid[indices].cpu()
        return result_boxes, result_scores, result_classid
-
-
class inferThread(threading.Thread):
    """
    description: Thin Thread subclass that holds a YoLov5TRT wrapper and
                 forwards frames to it through infer().
    """

    def __init__(self, yolov5_wrapper):
        super().__init__()
        self.yolov5_wrapper = yolov5_wrapper

    def infer(self, frame):
        """
        description: Run the wrapped detector on one frame.
        param:
            frame: the image to pass to yolov5_wrapper.infer
        return:
            the annotated image and the elapsed time reported by the wrapper
        """
        annotated, elapsed = self.yolov5_wrapper.infer(frame)
        return annotated, elapsed
-
class warmUpThread(threading.Thread):
    """
    description: Thread that runs one inference on an all-zero frame so the
                 first real frame does not pay the start-up cost.
    """

    def __init__(self, yolov5_wrapper):
        threading.Thread.__init__(self)
        self.yolov5_wrapper = yolov5_wrapper

    def run(self):
        # get_raw_image_zeros() is a generator; infer() expects a single
        # image, so take one blank frame from it instead of passing the
        # generator object itself.
        blank_frame = next(self.yolov5_wrapper.get_raw_image_zeros())
        image_raw, use_time = self.yolov5_wrapper.infer(blank_frame)
        # infer() returns one image, so report its shape directly.
        print('warm_up->{}, time->{:.2f}ms'.format(image_raw.shape, use_time * 1000))
-
-
-
if __name__ == "__main__":
    # Command-line options. `--engine` takes a single path: with nargs='+'
    # argparse would hand back a list, which cannot be opened as a file.
    parser = argparse.ArgumentParser()
    parser.add_argument('--engine', type=str, default="build/yolov5s.engine", help='.engine path')
    # NOTE: --save is parsed but not used anywhere in this script.
    parser.add_argument('--save', type=int, default=0, help='save?')
    opt = parser.parse_args()
    PLUGIN_LIBRARY = "build/libmyplugins.so"
    engine_file_path = opt.engine

    # load custom plugins
    ctypes.CDLL(PLUGIN_LIBRARY)

    # load coco labels (read as a module-level name by YoLov5TRT.infer)
    categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
                  "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
                  "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
                  "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
                  "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                  "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
                  "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
                  "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
                  "hair drier", "toothbrush"]

    # a YoLov5TRT instance
    yolov5_wrapper = YoLov5TRT(engine_file_path)
    cap = cv2.VideoCapture(0)
    try:
        # inferThread defines no run(), so it is used as a plain helper
        # object; starting and joining it would do nothing.
        thread1 = inferThread(yolov5_wrapper)
        while True:
            ok, frame = cap.read()
            if not ok:
                # Camera unavailable or stream ended: stop instead of
                # feeding an invalid frame to the detector.
                break
            img, t = thread1.infer(frame)
            cv2.imshow("result", img)
            if cv2.waitKey(1) & 0xFF == ord('q'):  # 1 millisecond
                break
    finally:
        # destroy the instance
        cap.release()
        cv2.destroyAllWindows()
        yolov5_wrapper.destroy()
参考
tensorrtx/yolov5 at master · wang-xinyu/tensorrtx · GitHub
Jetson AGX Xavier实现TensorRT加速YOLOv5进行实时检测_围白的尾巴的博客-CSDN博客