NVIDIA's VPI can dispatch work to a range of backends: CPU, GPU (CUDA), PVA, VIC, NVENC, OFA, and so on. Used appropriately, VPI can offload part of the computation that would otherwise need the GPU onto these other engines, reducing CPU load and with it the risk that an overloaded CPU hangs the device, produces abnormal results, and so on.
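As a tiny illustration of what "backend" means here (a sketch of mine, not part of the original sample; it assumes a VPIStream `stream` and two VPIImage handles `input` and `output` already exist, and reuses the CHECK_STATUS macro defined further down): the same algorithm submission is routed to a different engine simply by changing the backend flag.

VPIBackend backend = VPI_BACKEND_CUDA; // could equally be VPI_BACKEND_CPU, or VPI_BACKEND_VIC where the algorithm supports it
// Submit the same format-conversion job; only the backend flag decides which engine runs it
CHECK_STATUS(vpiSubmitConvertImageFormat(stream, backend, input, output, NULL));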
Below I describe how to compute LK sparse optical flow using VPI with the CUDA backend. NVIDIA provides an official sparse optical flow sample, but it first extracts feature points with VPI and then tracks them, so the tracker's input is already in NVIDIA's own format; for a VPI beginner like me that was not very friendly. After some effort I implemented optical flow with VPI + CUDA backend that takes sparse points in cv::Point2f format as input, wrapped it in a class, and am sharing it here. (A short usage sketch follows the class listing.)
#include <unistd.h>
#include <string>
#include <unordered_set>
#include <opencv2/core/version.hpp>
#if CV_MAJOR_VERSION >= 3
# include <opencv2/imgcodecs.hpp>
# include <opencv2/videoio.hpp>
#else
# include <opencv2/highgui/highgui.hpp>
#endif

#include <opencv2/imgproc/imgproc.hpp>
#include <vpi/OpenCVInterop.hpp>
#include <opencv2/opencv.hpp>
#include "time.h"
#include <vpi/Array.h>
#include <vpi/Image.h>
#include <vpi/Pyramid.h>
#include <vpi/Status.h>
#include <vpi/Stream.h>
#include <vpi/algo/ConvertImageFormat.h>
#include <vpi/algo/GaussianPyramid.h>
#include <vpi/algo/HarrisCorners.h>
#include <vpi/algo/OpticalFlowPyrLK.h>

#include <algorithm>
#include <cstring> // for memset
#include <fstream>
#include <iostream>
#include <map>
#include <numeric>
#include <sstream>
#include <vector>

#define CHECK_STATUS(STMT)                                          \
    do                                                              \
    {                                                               \
        VPIStatus status__ = (STMT);                                \
        if (status__ != VPI_SUCCESS)                                \
        {                                                           \
            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];             \
            vpiGetLastStatusMessage(buffer, sizeof(buffer));        \
            std::ostringstream ss;                                  \
            ss << vpiStatusGetName(status__) << ": " << buffer;     \
            throw std::runtime_error(ss.str());                     \
        }                                                           \
    } while (0);

using namespace cv;
using namespace std;
class VPILKOPTFLOW
{
public:
    VPILKOPTFLOW() {}

    void Init(int cols, int rows)
    {
        // Grayscale images written by the format-conversion step
        CHECK_STATUS(vpiImageCreate(cols, rows, VPI_IMAGE_FORMAT_U8, 0, &preImgFrame));
        CHECK_STATUS(vpiImageCreate(cols, rows, VPI_IMAGE_FORMAT_U8, 0, &curImgFrame));

        // The "temp" frames wrap cv::Mat data, so they are created as OpenCV wrappers;
        // vpiImageSetWrappedOpenCVMat() later redirects them to each incoming frame.
        //cv::Mat pre_img = cv::imread("/home/poincare/code/vpi_sample/samples/12-optflow_lk/tmp/0.jpg");
        //cv::Mat cur_img = cv::imread("/home/poincare/code/vpi_sample/samples/12-optflow_lk/tmp/1.jpg");
        cv::Mat pre_img = cv::Mat(cv::Size(cols, rows), CV_8UC1, cv::Scalar(0));
        cv::Mat cur_img = cv::Mat(cv::Size(cols, rows), CV_8UC1, cv::Scalar(0));
        CHECK_STATUS(vpiImageCreateOpenCVMatWrapper(pre_img, 0, &preImgTempFrame));
        CHECK_STATUS(vpiImageCreateOpenCVMatWrapper(cur_img, 0, &curImgTempFrame));

        // Create the image pyramids used by the algorithm
        CHECK_STATUS(vpiPyramidCreate(cols, rows, VPI_IMAGE_FORMAT_U8, pyrLevel, 0.5, 0, &pyrPrevFrame));
        CHECK_STATUS(vpiPyramidCreate(cols, rows, VPI_IMAGE_FORMAT_U8, pyrLevel, 0.5, 0, &pyrCurFrame));

        // Create input and output arrays
        CHECK_STATUS(vpiArrayCreate(MAX_HARRIS_CORNERS, VPI_ARRAY_TYPE_KEYPOINT, 0, &prevFeatures));
        CHECK_STATUS(vpiArrayCreate(MAX_HARRIS_CORNERS, VPI_ARRAY_TYPE_KEYPOINT, 0, &curFeatures));
        CHECK_STATUS(vpiArrayCreate(MAX_HARRIS_CORNERS, VPI_ARRAY_TYPE_U8, 0, &status));

        // Create the Optical Flow payload
        backend = VPI_BACKEND_CUDA;
        CHECK_STATUS(vpiCreateOpticalFlowPyrLK(backend, cols, rows, VPI_IMAGE_FORMAT_U8, pyrLevel, 0.5, &optflow));
        CHECK_STATUS(vpiInitOpticalFlowPyrLKParams(&lkParams));
        //lkParams.windowDimension = 21;

        CHECK_STATUS(vpiStreamCreate(0, &stream));
    }

    void CalOpticalLK(const cv::Mat& pre_image, const cv::Mat& curr_image,
                      const std::vector<cv::Point2f>& pre_pts,
                      std::vector<cv::Point2f>& curr_pts, std::vector<uint8_t>& valid_ids)
    {
        std::cout << "optical flow using vpi with cuda backend. " << std::endl;
        curr_pts.clear();
        valid_ids.clear();

        // Redirect the wrapper images to the current pair of input frames (no copy)
        CHECK_STATUS(vpiImageSetWrappedOpenCVMat(preImgTempFrame, pre_image));
        CHECK_STATUS(vpiImageSetWrappedOpenCVMat(curImgTempFrame, curr_image));
        //t01 = clock();

        // Convert the inputs to grayscale U8
        std::cout << "vpi optical flow 1. " << std::endl;
        CHECK_STATUS(vpiSubmitConvertImageFormat(stream, backend, preImgTempFrame, preImgFrame, NULL));
        CHECK_STATUS(vpiSubmitConvertImageFormat(stream, backend, curImgTempFrame, curImgFrame, NULL));
        //t02 = clock();

        // Generate a Gaussian pyramid out of each frame
        std::cout << "vpi with cuda backend 2. " << std::endl;
        CHECK_STATUS(vpiSubmitGaussianPyramidGenerator(stream, backend, preImgFrame, pyrPrevFrame));
        CHECK_STATUS(vpiSubmitGaussianPyramidGenerator(stream, backend, curImgFrame, pyrCurFrame));
        //t03 = clock();

        // Convert the cv::Point2f input into VPIKeypoint and expose it through a host-memory wrapper
        std::cout << "vpi optical flow 3. " << std::endl;
        std::vector<VPIKeypoint> kpt;
        VPIKeypoint vpi_pt;
        for (const auto& pt : pre_pts)
        {
            vpi_pt.x = pt.x;
            vpi_pt.y = pt.y;
            kpt.push_back(vpi_pt);
        }

        VPIArrayData data = {};
        data.type        = VPI_ARRAY_TYPE_KEYPOINT;
        data.capacity    = 4096; //pre_pts.size();
        int s            = pre_pts.size();
        data.sizePointer = &s;
        data.strideBytes = sizeof(VPIKeypoint);
        data.data        = &kpt[0];
        //t04 = clock();

        std::cout << "optical flow 4. " << std::endl;
        // A new wrapper array is created on every call, so destroy the previous handle first
        // (including the one created in Init()) to avoid leaking it.
        if (prevFeatures != NULL)
        {
            vpiArrayDestroy(prevFeatures);
            prevFeatures = NULL;
        }
        CHECK_STATUS(vpiArrayCreateHostMemWrapper(&data, 0, &prevFeatures));
        //vpiArraySetWrappedHostMem(prevFeatures, &data);
        //t1 = clock();

        std::cout << "optical flow 5. " << std::endl;
        // Estimate the features' position in the current frame given their position in the previous frame
        CHECK_STATUS(vpiSubmitOpticalFlowPyrLK(stream, backend, optflow, pyrPrevFrame, pyrCurFrame, prevFeatures,
                                               curFeatures, status, &lkParams));

        std::cout << "optical flow 6. " << std::endl;
        CHECK_STATUS(vpiStreamSync(stream));
        //t2 = clock();

        // Read back the tracked points and their status (status 0 means successfully tracked)
        std::cout << "optical flow 7. " << std::endl;
        VPIArrayData curFeaturesData, statusData;
        CHECK_STATUS(vpiArrayLock(curFeatures, VPI_LOCK_READ_WRITE, &curFeaturesData));
        CHECK_STATUS(vpiArrayLock(status, VPI_LOCK_READ, &statusData));

        std::cout << "optical flow 8. " << std::endl;
        int totKeypoints = *curFeaturesData.sizePointer;
        const VPIKeypoint* pCurFeatures = (VPIKeypoint*)curFeaturesData.data;
        const uint8_t* pStatus          = (uint8_t*)statusData.data;

        cv::Point2f pt2f;
        for (int i = 0; i < totKeypoints; i++)
        {
            pt2f.x = pCurFeatures[i].x;
            pt2f.y = pCurFeatures[i].y;
            curr_pts.push_back(pt2f);
            if (pStatus[i] == 0)
            {
                valid_ids.push_back(1);
            }
            else
            {
                valid_ids.push_back(0);
            }
        }

        std::cout << "optical flow using 9. " << std::endl;
        CHECK_STATUS(vpiArrayUnlock(curFeatures));
        CHECK_STATUS(vpiArrayUnlock(status));
        std::cout << "optical flow using 10. " << std::endl;
    }

    ~VPILKOPTFLOW()
    {
        vpiStreamDestroy(stream);
        vpiPayloadDestroy(optflow);
        vpiImageDestroy(preImgTempFrame);
        vpiImageDestroy(preImgFrame);
        vpiImageDestroy(curImgTempFrame);
        vpiImageDestroy(curImgFrame);
        vpiPyramidDestroy(pyrPrevFrame);
        vpiPyramidDestroy(pyrCurFrame);
        vpiArrayDestroy(prevFeatures);
        vpiArrayDestroy(curFeatures);
        vpiArrayDestroy(status);
    }
private:
    VPIStream stream         = NULL;
    VPIImage preImgTempFrame = NULL;
    VPIImage preImgFrame     = NULL;
    VPIImage curImgTempFrame = NULL;
    VPIImage curImgFrame     = NULL;
    VPIPyramid pyrPrevFrame = NULL, pyrCurFrame = NULL;
    VPIArray status = NULL;
    VPIArray prevFeatures = NULL, curFeatures = NULL;
    VPIPayload optflow = NULL;

    int pyrLevel{3};
    int MAX_HARRIS_CORNERS{4096};

    VPIOpticalFlowPyrLKParams lkParams;
    VPIBackend backend;
};
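To show how the class is meant to be used, here is a minimal, hypothetical usage sketch of mine (the file names and the use of cv::goodFeaturesToTrack as the source of the input points are placeholders for illustration, not part of the original post):

// Hypothetical example: track points from frame0.png to frame1.png (placeholder file names).
cv::Mat prev = cv::imread("frame0.png", cv::IMREAD_GRAYSCALE);
cv::Mat curr = cv::imread("frame1.png", cv::IMREAD_GRAYSCALE);

// Any source of cv::Point2f works; goodFeaturesToTrack is just one convenient choice.
std::vector<cv::Point2f> prev_pts;
cv::goodFeaturesToTrack(prev, prev_pts, 200, 0.01, 10);

VPILKOPTFLOW lk;
lk.Init(prev.cols, prev.rows);

std::vector<cv::Point2f> curr_pts;
std::vector<uint8_t> valid;   // 1 = tracked successfully, 0 = lost
lk.CalOpticalLK(prev, curr, prev_pts, curr_pts, valid);

// Draw the motion of the points that survived tracking.
for (size_t i = 0; i < curr_pts.size(); ++i)
    if (valid[i])
        cv::line(curr, prev_pts[i], curr_pts[i], cv::Scalar(255));

After the call, curr_pts[i] is the estimated position of prev_pts[i] in the current frame, and valid[i] tells whether that point was tracked.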
I tested the code and it runs. Compared with computing optical flow on the CPU through OpenCV, performance does improve; compared with OpenCV's GPU optical flow, it performs slightly worse. Still, in some cases VPI can tap resources other than the CPU and GPU, so at the very least it is one more option.