▶ CPU 图像卷积,共四种方法。分别为基本串行,使用模板,使用局部内存,使用AVX指令优化
● 全部的代码,仅在主函数中选择调用的函数名即可。
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <time.h> 4 #include <opencv2/opencv.hpp> 5 6 const char *inputFile = "R:/1.png"; 7 const char *outputFile = "R:/2.png"; 8 9 bool floatEq(const float a, const float b)// 相等返回 1 10 { 11 if (b == 0) 12 return fabs(a) < 0.001; 13 return fabs(a / b - 1) < 0.001; 14 } 15 16 void convolution01(const float *input, float *output, const int inputRow, const int inputCol, 17 const float *filter, const int filterWidth) 18 { 19 const int halfFilterWidth = filterWidth / 2; 20 int row, col, rr, cc; 21 float sum; 22 23 //memset(output, 0, sizeof(float) * inputRow * inputCol); // 使用 memset 将 output 全部凃成 0 24 25 #pragma omp parallel for num_threads(8) default(none) shared(halfFilterWidth, output, inputRow, inputCol) private(row, col) 26 for (row = 0; row < halfFilterWidth; row++) // 人工将边角涂成 0 27 { 28 for (col = 0; col < inputCol; col++) 29 output[row*inputCol + col] = output[(inputRow - 1 - row)*inputCol + col] = 0; 30 } 31 #pragma omp parallel for num_threads(8) default(none) shared(halfFilterWidth, output, inputRow, inputCol) private(row, col) 32 for (row = halfFilterWidth; row < inputRow - halfFilterWidth; row++) 33 { 34 for (col = 0; col < halfFilterWidth; col++) 35 output[row*inputCol + col] = output[row*inputCol + inputCol - 1 - col] = 0; 36 } 37 38 #pragma omp parallel for num_threads(8) default(none) shared(halfFilterWidth, input, output, inputRow, inputCol, filter) private(row, col, rr, cc, sum) 39 for (row = halfFilterWidth; row < inputRow - halfFilterWidth; row++)// 内部计算部分 40 { 41 for (col = halfFilterWidth; col < inputCol - halfFilterWidth; col++) 42 { 43 for (sum = 0.0f, rr = -halfFilterWidth; rr <= halfFilterWidth; rr++) 44 { 45 for (cc = -halfFilterWidth; cc <= halfFilterWidth; cc++) 46 sum += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + rr)*inputCol + col + cc]; 47 } 48 output[row * inputCol + col] = sum; 49 } 50 } 51 /* 52 for (row = 0; row < inputRow; row++) // 全范围循环,在最里层判断 53 { 54 for (col = 0; col < inputCol; col++) 55 { 56 if (row < halfFilterWidth || row >= inputRow - halfFilterWidth || col < halfFilterWidth || col >= inputCol - halfFilterWidth) 57 { 58 output[row * inputCol + col] = 0; 59 continue; 60 } 61 for (sum = 0.0f, rr = -halfFilterWidth; rr <= halfFilterWidth; rr++) 62 { 63 for (cc = -halfFilterWidth; cc <= halfFilterWidth; cc++) 64 sum += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + rr)*inputCol + col + cc]; 65 } 66 output[row * inputCol + col] = sum; 67 68 } 69 } 70 */ 71 } 72 73 template<int filterWidth> 74 void convolution02(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)// 卷积宽度作为模板 75 { 76 const int halfFilterWidth = filterWidth / 2; 77 int row, col, rr, cc; 78 float sum; 79 80 memset(output, 0, sizeof(float) * inputRow * inputCol); 81 #pragma omp parallel for num_threads(8) default(none) shared(halfFilterWidth, input, output, inputRow, inputCol, filter) private(row, col, rr, cc, sum) 82 for (row = halfFilterWidth; row < inputRow - halfFilterWidth; row++) 83 { 84 for (col = halfFilterWidth; col < inputCol - halfFilterWidth; col++) 85 { 86 for (sum = 0.0f, rr = -halfFilterWidth; rr <= halfFilterWidth; rr++) 87 { 88 for (cc = -halfFilterWidth; cc <= halfFilterWidth; cc++) 89 sum += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + rr)*inputCol + col + cc]; 90 } 91 output[row * inputCol + col] = sum; 92 } 93 } 94 } 95 96 template<int filterWidth, int blockRow, int blockCol> 97 void convolution03(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)// 使用局部内存块 98 { 99 const int halfFilterWidth = filterWidth / 2; 100 int row, col, rr, cc, i, j; 101 float filterElement; 102 103 104 if (inputRow % blockRow || inputCol % blockCol) // 要求图片长宽为局部内存块的整数倍 105 { 106 printf("Failed, outputRow %% blockRow || outputCol %% blockCol "); 107 return; 108 } 109 110 memset(output, 0, sizeof(float) * inputRow * inputCol); 111 #pragma omp parallel for num_threads(8) 112 for (row = halfFilterWidth; row < inputRow - halfFilterWidth; row += blockRow) 113 { 114 for (col = halfFilterWidth; col < inputCol - halfFilterWidth; col += blockCol) 115 { 116 float sum[blockRow * blockCol] = { 0.0f }; 117 for (rr = -halfFilterWidth; rr <= halfFilterWidth; rr++) 118 { 119 for (cc = -halfFilterWidth; cc <= halfFilterWidth; cc++) 120 { 121 filterElement = filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth]; 122 for (i = 0; i < blockRow; i++) 123 { 124 for (j = 0; j < blockCol; j++) 125 { 126 if (row + rr + i >= inputRow || col + cc + j >= inputCol) 127 break; 128 sum[i * blockCol + j] += filterElement * input[(row + rr + i) * inputCol + col + cc + j]; 129 } 130 } 131 } 132 } 133 for (i = 0; i < blockRow; i++) 134 { 135 for (j = 0; j < blockCol; j++) 136 { 137 if (row + i >= inputRow || col + j >= inputCol) 138 continue; 139 output[(row + i) * inputCol + col + j] = sum[i * blockCol + j]; 140 } 141 } 142 } 143 } 144 } 145 146 template<int filterWidth, int blockRow, int blockCol> 147 void convolution04(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)// 使用 AVX 指令扩展 148 { 149 const int halfFilterWidth = filterWidth / 2; 150 int row, col, rr, cc, i, j; 151 152 if (inputRow % blockRow || inputCol % (blockCol * 8)) 153 { 154 printf("Failed, inputRow %% blockRow || inputCol %% blockCol "); 155 return; 156 } 157 158 memset(output, 0, sizeof(float) * inputRow * inputCol); 159 #pragma omp parallel for num_threads(8) 160 for (row = halfFilterWidth; row < inputRow - halfFilterWidth; row += blockRow) 161 { 162 for (col = halfFilterWidth; col < inputCol - halfFilterWidth; col += blockCol * 8) 163 { 164 __m256 sum[blockRow * blockCol] = {_mm256_setzero_ps()}; 165 for (rr = -halfFilterWidth; rr <= halfFilterWidth; rr++) 166 { 167 for (cc = -halfFilterWidth; cc <= halfFilterWidth; cc++) 168 { 169 __m256 filterElement = _mm256_broadcast_ss(filter + (rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth); 170 for (i = 0; i < blockRow; i++) 171 { 172 for (j = 0; j < blockCol; j++) 173 { 174 //if (row + rr + i >= inputRow || col + cc + j * 8 >= inputCol)// 在局部内存块较大时需要越界检查 175 // continue; 176 __m256 imageElement = _mm256_loadu_ps(input + (row + rr + i)*inputCol + col + cc + j * 8); 177 sum[i * blockCol + j] = _mm256_fmadd_ps(filterElement, imageElement, sum[i * blockCol + j]); 178 } 179 } 180 } 181 } 182 for (i = 0; i < blockRow; i++) 183 { 184 for (j = 0; j < blockCol; j++) 185 { 186 //if (row + i >= inputRow || col + j * 8 >= inputCol) 187 // continue; 188 _mm256_storeu_ps(output + (row + i)*inputCol + col + j * 8, sum[i * blockCol + j]); 189 } 190 } 191 } 192 } 193 } 194 195 int main() 196 { 197 int i, k; 198 clock_t time; 199 float filterSum; 200 201 // 卷积窗口相关 202 const int filterWidth = 5, filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / 2; 203 float filter[filterSize] = 204 {// 模糊窗口 205 0, 0, 0, 0, 0, 206 0, 1.f / 9, 1.f / 9, 1.f / 9, 0, 207 0, 1.f / 9, 1.f / 9, 1.f / 9, 0, 208 0, 1.f / 9, 1.f / 9, 1.f / 9, 0, 209 0, 0, 0, 0, 0 210 }; 211 for (filterSum = 0.0f, i = 0; i < filterSize; filterSum += filter[i++]); 212 if (!floatEq(filterSum, 0))// 非归零的卷积窗口(如模糊)需要归一化 213 for (i = 0; i < filterSize; filter[i] /= filterSum, i++); 214 215 // 图片相关 216 cv::Mat input = cv::imread(inputFile), output = input, channel[3]; 217 cv::split(input, channel); 218 const int inputRow = input.rows, inputCol = input.cols, inputDataSize = inputRow * inputCol; 219 float *inputData = (float*)malloc(sizeof(float) * inputDataSize); 220 float *outputData = (float*)malloc(sizeof(float) * inputDataSize); 221 222 for (k = 0; k < 3; k++)// 三个通道,分别为蓝、绿、红 223 { 224 for (i = 0; i < inputRow * inputCol; inputData[i] = (float)channel[k].data[i], i++); 225 time = clock(); 226 convolution01(inputData, outputData, inputRow, inputCol, (const float *)filter, filterWidth); 227 //convolution02<filterWidth>(inputData, outputData, inputRow, inputCol, filter); 228 //convolution03<filterWidth, 4, 4>(inputData, outputData, inputRow, inputCol, filter); 229 //convolution04<filterWidth, 4, 4>(inputData, outputData, inputRow, inputCol, filter); 230 time = clock() - time; 231 printf("Time for channel[%d]:%d ms ", k, time); 232 for (i = 0; i < inputRow * inputCol; channel[k].data[i] = (unsigned char)outputData[i], i++); 233 } 234 235 cv::merge(channel, 3, output); 236 cv::imwrite(outputFile, output); 237 //imshow("merge", output); 238 //cv::waitKey(0); 239 free(inputData); 240 free(outputData); 241 printf(" Finished. "); 242 getchar(); 243 return 0; 244 }
● 输出结果,使用一张 4608 × 6656 的图片(bmp87.7MB)进行测试,使用主函数中那个边长为5、实际窗口长度为 3 的均值窗口。图片太大喘不上来,偷梁换柱成小图看效果
■ 计时结果
// convolution01,memset + 内部计算,无 OpenMP Time for channel[0]:2377 ms Time for channel[1]:2387 ms Time for channel[2]:2367 ms Finished. // convolution01,手动除边 + 内部计算,无 OpenMP Time for channel[0]:2017 ms Time for channel[1]:2014 ms Time for channel[2]:2021 ms Finished. // convolution01,循环内判断,无 OpenMP Time for channel[0]:2078 ms Time for channel[1]:2057 ms Time for channel[2]:2097 ms Finished. // convolution01,手动除边 + 内部计算,有 OpenMP Time for channel[0]:618 ms Time for channel[1]:618 ms Time for channel[2]:615 ms Finished. // convolution02,有 OpenMP Time for channel[0]:441 ms Time for channel[1]:440 ms Time for channel[2]:436 ms Finished. // convolution03<filterWidth, 4, 4>,无 OpenMP Time for channel[0]:2795 ms Time for channel[1]:2785 ms Time for channel[2]:2798 ms Finished. // convolution04<filterWidth, 4, 4>,无 OpenMP Time for channel[0]:386 ms Time for channel[1]:391 ms Time for channel[2]:386 ms Finished.
■ 没法给 convolution03 和 convolution04 加 OpenMP,一加就各种内存冲突,便捷判断都挡不住。
■