第1章 当前灰度膨胀腐蚀算子
图像最大值最小值滤波。效果如下:
1.1. 常规实现
1.1.1. 半径范围遍历
对于一个像素,其膨胀腐蚀结果,查看周围半径范围内的所有像素,取最大最小值。
uint8_t nMax = 0;
for (int j = -nRY; j <= nRY; j++)
{for (int i = -nRX; i <= nRX; i++){int _X = x + i;int _Y = y + j;nMax = max(nMax, pSrc[_Y * nW + _X]);}
}
pDst[y*nW+x] = nMax;
1.1.2. 亚像素
当半径不为整数时,膨胀边缘数据会和周围像素进行插值。这是当前实现半像素套印的方式。
1.1.3. 当前算子存在问题
单像素处理的时间复杂度为O(N^2),当滤波半径较大时,耗时急剧上升。另外,在滤波半径接近整数时,仍然要计算亚像素,增加了耗时。
第2章 灰度膨胀腐蚀算子的优化
2.1. SIMD优化
2.1.1. 原始实现
逐像素,每个像素在半径范围内取最大最小值。
void morph(uint8_t* pSrc, uint8_t* pDst, int nW, int nH, int nRX, int nRY)
{for (int y = nRY; y < nH - nRY; y++){for (int x = nRX; x < nW - nRX; x++){uint8_t nMax = 0;for (int j = -nRY; j <= nRY; j++){for (int i = -nRX; i <= nRX; i++){int _X = x + i;int _Y = y + j;nMax = max(nMax, pSrc[_Y * nW + _X]);}}pDst[y * nW + x] = nMax;}}
}
AMD2700xCPU,2k*1k的灰度图,半径为1,耗时为33mm。
2.1.2. 每次处理32个像素
每个像素在半径范围内取最大最小值。当前VisionLink的实现
void morph2(uint8_t* pSrc, uint8_t* pDst, int nW, int nH, int nRX, int nRY)
{int x, y;for (y = nRY; y < nH - nRY; y++){//一次处理32个像素int w = nW - 2 * nRX;int n = w / 32;int t = w % 32;for (x = 0; x < n; x++){__m256i m0 = _mm256_set1_epi8(0);for (int j = -nRY; j <= nRY; j++){for (int i = -nRX; i <= nRX; i++){__m256i m1 = _mm256_loadu_si256((const __m256i*)(pSrc + (y + j)*nW + x * 32 + i));m0 = _mm256_max_epu8(m1, m0);}}_mm256_storeu_si256((__m256i*)(pDst + y*nW + x * 32), m0);}for (int k = 0; k < t; k++){uint8_t nMax = 0;for (int j = -nRY; j <= nRY; j++){for (int i = -nRX; i <= nRX; i++){int _X = x * 32 + i + k;int _Y = y + j;nMax = max(nMax, pSrc[_Y*nW + _X]);}}pDst[y*nW + x * 32 + k] = nMax;}}
}
耗时统计:2k*1k的灰度图像:
半径 | 耗时 |
---|---|
1 | 0.5ms |
2 | 1.3ms |
3 | 2.4ms |
5 | 5ms |
7 | 10ms |
相比未优化的实现,有60倍的提升。但随着半径的增加,耗时急剧上升。
2.2. O(1)算法原理
针对随机半径,耗时上升的现象,我们期望找到0(1)的灰度膨胀腐蚀算法原理,即耗时不随着半径的增加而增加。
比如下图的原理:
代码实现如下:
void morph3(uint8_t* pSrc, uint8_t* pDst, int nW, int nH, int nRX, int nRY,int bDilation)
{//const int bDilation = 1;int x, y;int i, j, k;int nSizeX = nRX * 2 + 1;int nSizeY = nRY * 2 + 1;uint8_t* G = new uint8_t[nW * nSizeY];uint8_t* H = new uint8_t[nW * nSizeY];__m128i m0 = _mm_setzero_si128(), mraw, m1;auto pSrc1 = pSrc;int n1 = nW / nSizeX;int t1 = nW% nSizeX;int n2 = (nW - nRX * 2) / 16;int t2 = (nW - nRX * 2) % 16;__m128i mmask, mmasktail;if (bDilation){mmask = _mm_set_epi8(nSizeX >= 16 ? 0xff : 0x00, nSizeX >= 15 ? 0xff : 0x00,nSizeX >= 14 ? 0xff : 0x00, nSizeX >= 13 ? 0xff : 0x00,nSizeX >= 12 ? 0xff : 0x00, nSizeX >= 11 ? 0xff : 0x00,nSizeX >= 10 ? 0xff : 0x00, nSizeX >= 9 ? 0xff : 0x00,nSizeX >= 8 ? 0xff : 0x00, nSizeX >= 7 ? 0xff : 0x00,nSizeX >= 6 ? 0xff : 0x00, nSizeX >= 5 ? 0xff : 0x00,nSizeX >= 4 ? 0xff : 0x00, nSizeX >= 3 ? 0xff : 0x00,nSizeX >= 2 ? 0xff : 0x00, nSizeX >= 1 ? 0xff : 0x00);mmasktail = _mm_set_epi8(t1 >= 16 ? 0xff : 0x00, t1 >= 15 ? 0xff : 0x00,t1 >= 14 ? 0xff : 0x00, t1 >= 13 ? 0xff : 0x00,t1 >= 12 ? 0xff : 0x00, t1 >= 11 ? 0xff : 0x00,t1 >= 10 ? 0xff : 0x00, t1 >= 9 ? 0xff : 0x00,t1 >= 8 ? 0xff : 0x00, t1 >= 7 ? 0xff : 0x00,t1 >= 6 ? 0xff : 0x00, t1 >= 5 ? 0xff : 0x00,t1 >= 4 ? 0xff : 0x00, t1 >= 3 ? 0xff : 0x00,t1 >= 2 ? 0xff : 0x00, t1 >= 1 ? 0xff : 0x00);}else{mmask = _mm_set_epi8(nSizeX < 16 ? 0xff : 0x00, nSizeX < 15 ? 0xff : 0x00,nSizeX < 14 ? 0xff : 0x00, nSizeX < 13 ? 0xff : 0x00,nSizeX < 12 ? 0xff : 0x00, nSizeX < 11 ? 0xff : 0x00,nSizeX < 10 ? 0xff : 0x00, nSizeX < 9 ? 0xff : 0x00,nSizeX < 8 ? 0xff : 0x00, nSizeX < 7 ? 0xff : 0x00,nSizeX < 6 ? 0xff : 0x00, nSizeX < 5 ? 0xff : 0x00,nSizeX < 4 ? 0xff : 0x00, nSizeX < 3 ? 0xff : 0x00,nSizeX < 2 ? 0xff : 0x00, nSizeX < 1 ? 0xff : 0x00);mmasktail = _mm_set_epi8(t1 < 16 ? 0xff : 0x00, t1 < 15 ? 0xff : 0x00,t1 < 14 ? 0xff : 0x00, t1 < 13 ? 0xff : 0x00,t1 < 12 ? 0xff : 0x00, t1 < 11 ? 0xff : 0x00,t1 < 10 ? 0xff : 0x00, t1 < 9 ? 0xff : 0x00,t1 < 8 ? 0xff : 0x00, t1 < 7 ? 0xff : 0x00,t1 < 6 ? 0xff : 0x00, t1 < 5 ? 0xff : 0x00,t1 < 4 ? 0xff : 0x00, t1 < 3 ? 0xff : 0x00,t1 < 2 ? 0xff : 0x00, t1 < 1 ? 0xff : 0x00);}//处理得到前nSizeY行的结果for (k = 0; k < nSizeY; k++) //前nSizeY行{auto pSrc1 = pSrc + k * nW;for (i = 0; i < n1; i++) //每行有多少个分段 n1 = nW / nSizeX{mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);for (j = 1; j < nSizeX; j++) //移位比较{mraw = _mm_slli_si128(mraw, 1);if (!bDilation) mraw = _mm_or_si128(mraw, m255);m1 = bDilation?_mm_max_epu8(m1, mraw): _mm_min_epu8(m1, mraw);}memcpy(G + k * nW + i * nSizeX, m1.m128i_u8, nSizeX);mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);for (j = 1; j < nSizeX; j++) //移位比较{mraw = _mm_srli_si128(mraw, 1);m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);}memcpy(H + k * nW + i * nSizeX, m1.m128i_u8, nSizeX);}if (t1 > 0) //边部处理{mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);for (j = 1; j < t1; j++) //移位比较{mraw = _mm_slli_si128(mraw, 1);if (!bDilation) mraw = _mm_or_si128(mraw, m255);m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);}memcpy(G + k * nW + i * nSizeX, m1.m128i_u8, t1);mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));m1 = mraw = bDilation ? _mm_and_si128(mraw, mmasktail) : _mm_or_si128(mraw, mmasktail);for (j = 1; j < t1; j++) //移位比较{mraw = _mm_srli_si128(mraw, 1);m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);}memcpy(H + k * nW + i * nSizeX, m1.m128i_u8, t1);}}int pIndex = 0;for (y = nRY; y < nH - nRY; y++){//计算得到最大值滤波结果//从G,H中得到滤波结果for (i = 0; i < n2; i++) //n2 = (nW / nSizeX * nSizeX - nSizeX) / 16; //每行处理多少次{__m128i mret = bDilation ? _mm_setzero_si128():_mm_set1_epi8(-1);for (j = 0; j < nSizeY; j++) //多行比较大小{__m128i m1 = _mm_loadu_si128((const __m128i*)(G + j * nW + i * 16 + 2 * nRX));__m128i m2 = _mm_loadu_si128((const __m128i*)(H + j * nW + i * 16));m1 = bDilation ? _mm_max_epu8(m1, m2): _mm_min_epu8(m1, m2);mret = bDilation ? _mm_max_epu8(mret, m1): _mm_min_epu8(mret, m1);}_mm_storeu_si128((__m128i*)(pDst + y * nW + i * 16 + nRX), mret);}for (k = 0; k < t2; k++) //边部处理{uint8_t ret = bDilation ? 0 : 255;for (j = 0; j < nSizeY; j++){uint8_t a = (uint8_t)*(G + j * nW + i * 16 + 2 * nRX + j);uint8_t b = (uint8_t) * (H + j * nW + i * 16 + j);a = bDilation ? max(a, b):min(a,b);ret = bDilation ? max(ret, a):min(ret,a);}*(pDst + y * nW + i * 16 + nRX + k) = ret;}//更新下一行结果if (y == nH - nRY - 1)break;auto pSrc1 = pSrc + (y + nRY + 1) * nW;for (i = 0; i < n1; i++) //每行有多少个分段 n1 = nW / nSizeX{mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);for (int j = 1; j < nSizeX; j++) //移位比较{mraw = _mm_slli_si128(mraw, 1);if (!bDilation) mraw = _mm_or_si128(mraw, m255);m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);}//_mm_storeu_si128((__m128i*)(GH + pIndex*2 * nW + i * nSizeX), m1);memcpy(G + pIndex * nW + i * nSizeX, m1.m128i_u8, nSizeX);mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);for (int j = 1; j < nSizeX; j++) //移位比较{mraw = _mm_srli_si128(mraw, 1);m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);}//_mm_storeu_si128((__m128i*)(GH + (pIndex*2+1) * nW + i * nSize), m1);memcpy(H + pIndex * nW + i * nSizeX, m1.m128i_u8, nSizeX);}if (t1 > 0) //边部处理{mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));m1 = mraw = bDilation ? _mm_and_si128(mraw, mmask) : _mm_or_si128(mraw, mmask);for (j = 1; j < t1; j++) //移位比较{mraw = _mm_slli_si128(mraw, 1);if (!bDilation) mraw = _mm_or_si128(mraw, m255);m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);}memcpy(G + pIndex * nW + i * nSizeX, m1.m128i_u8, t1);mraw = _mm_loadu_si128((const __m128i*)(pSrc1 + i * nSizeX));m1 = mraw = bDilation ? _mm_and_si128(mraw, mmasktail) : _mm_or_si128(mraw, mmasktail);for (j = 1; j < t1; j++) //移位比较{mraw = _mm_srli_si128(mraw, 1);m1 = bDilation ? _mm_max_epu8(m1, mraw) : _mm_min_epu8(m1, mraw);}memcpy(H + pIndex * nW + i * nSizeX, m1.m128i_u8, t1);}pIndex = (pIndex + 1) % nSizeY;}//delete[] GH;delete[] G;delete[] H;
}
在如下链接https://www.cnblogs.com/Imageshop/p/7018510.html,有一个另外的实现,本实现占用空间少,且耗时更短。唯一的问题的支持的最大半径为7,可以通过升级为AVX2,最大支持15。
耗时统计如下:
半径 | 耗时 |
---|---|
1 | 2.2ms |
2 | 2.2ms |
3 | 2.3ms |
5 | 2.6ms |
7 | 2.7ms |
基本上做到了O(1),但是,基础的耗时太长。
2.3. halcon,opencv
那么,halcon,opencv可以做到多块呢?
以下是Halcon测试代码:
HObject ho_Image1, ho_Red, ho_Green, ho_Blue;HObject ho_GrayImage, ho_ImageMin;SetSystem("parallelize_operators", "false");ReadImage(&ho_Image1, "D:/1.bmp");//Decompose3(ho_Image1, &ho_Red, &ho_Green, &ho_Blue);Rgb1ToGray(ho_Image1, &ho_GrayImage);auto start = high_resolution_clock::now();for (int i = 0; i < 1000; i++)GrayDilationShape(ho_GrayImage, &ho_ImageMin, 9, 9, "rectangle");auto stop = high_resolution_clock::now();auto duration = duration_cast<microseconds>(stop - start);cout << "t:" << duration.count() / 1000.0 << "ms" << endl;WriteImage(ho_ImageMin, "bmp", 0, "D:/2H.bmp");
通过SetSystem(“parallelize_operators”, “false”);来控制是否开启并行,以防halcon不讲武德。
以及OpenCV的测试代码:
cv::Mat src, dst;src = cv::imread("D:\\1.bmp", cv::IMREAD_GRAYSCALE); // Read the fileif (src.empty()){cout << "Could not open or find the image" << endl;return -1;}cv::Mat element = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(15, 15));auto start = high_resolution_clock::now();for(int i = 0;i<10000;i++)cv::morphologyEx(src, dst, cv::MORPH_ERODE, element);auto stop = high_resolution_clock::now();auto duration = duration_cast<microseconds>(stop - start);cout << "Time taken by function: "<< duration.count()/1000.0<< "ms" << endl;imwrite("D:\\2.bmp", dst);
耗时分别如下:
半径 | halcon | Halcon 并行 | OpenCV |
---|---|---|---|
1 | 0.3 | 0.2 | 0.3 |
2 | 0.6 | 0.3 | 0.4 |
3 | 0.9 | 0.4 | 0.6 |
5 | 1.3 | 0.6 | 0.8 |
7 | 2.3 | 0.8 | 1.1 |
2.4. 行列分开方式
既然halcon也没有使用O(1)的算法原理,说明O(1)是有代价的,基础耗时就比较高。至少在半径小于10的情况下,Halcon的灰度膨胀腐蚀算子耗时都是递增的。
再就是行列分开的方法了,代码实现如下:
void morph4(uint8_t* pSrc, uint8_t* pDst, int nW, int nH, int nRX, int nRY, int bDilation)
{int halfRow = nRY;int halfCol = nRX;uint8_t* pTmp = new uint8_t[nW * nH];int nCol = nW % 32 == 0 ? nW / 32 : nW / 32 + 1;int* nOffsetX = new int[nCol];memset(nOffsetX, 0, sizeof(int) * nCol);nOffsetX[nCol - 1] = nW % 32 == 0 ? 0 : 32 - nW % 32;//处理列for (int i = 0; i < nH; i++){uint8_t* pSrc1 = pSrc + i * nW;uint8_t* pTmp1 = pTmp + i * nW;for (int j = 0; j < nW; j += 32