zoukankan      html  css  js  c++  java
  • OpenCL 图像卷积 3 使用 CPU

    ▶ CPU 图像卷积,共四种方法。分别为基本串行,使用模板,使用局部内存,使用AVX指令优化

    ● 全部的代码,仅在主函数中选择调用的函数名即可。

      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 #include <time.h>
      4 #include <opencv2/opencv.hpp>
      5 
      6 const char *inputFile = "R:/1.png";
      7 const char *outputFile = "R:/2.png";
      8 
      9 bool floatEq(const float a, const float b)// 相等返回 1
     10 {
     11     if (b == 0)
     12         return fabs(a) < 0.001;
     13     return fabs(a / b - 1) < 0.001;
     14 }
     15 
     16 void convolution01(const float *input, float *output, const int inputRow, const int inputCol, 
     17     const float *filter, const int filterWidth)
     18 {    
     19     const int halfFilterWidth = filterWidth / 2;
     20     int row, col, rr, cc;
     21     float sum;
     22 
     23     //memset(output, 0, sizeof(float) * inputRow * inputCol);             // 使用 memset 将 output 全部凃成 0
     24     
     25 #pragma omp parallel for num_threads(8) default(none) shared(halfFilterWidth, output, inputRow, inputCol) private(row, col)
     26     for (row = 0; row < halfFilterWidth; row++)                         // 人工将边角涂成 0
     27     {
     28         for (col = 0; col < inputCol; col++)
     29             output[row*inputCol + col] = output[(inputRow - 1 - row)*inputCol + col] = 0;
     30     }                              
     31 #pragma omp parallel for num_threads(8) default(none) shared(halfFilterWidth, output, inputRow, inputCol) private(row, col)
     32     for (row = halfFilterWidth; row < inputRow - halfFilterWidth; row++)
     33     {
     34         for (col = 0; col < halfFilterWidth; col++)
     35             output[row*inputCol + col] = output[row*inputCol + inputCol - 1 - col] = 0;
     36     }
     37 
     38 #pragma omp parallel for num_threads(8) default(none) shared(halfFilterWidth, input, output, inputRow, inputCol, filter) private(row, col, rr, cc, sum)
     39     for (row = halfFilterWidth; row < inputRow - halfFilterWidth; row++)// 内部计算部分
     40     {
     41         for (col = halfFilterWidth; col < inputCol - halfFilterWidth; col++)
     42         {
     43             for (sum = 0.0f, rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
     44             {
     45                 for (cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
     46                     sum += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + rr)*inputCol + col + cc];
     47             }
     48             output[row * inputCol + col] = sum;
     49         }
     50     }        
     51     /*
     52     for (row = 0; row < inputRow; row++)                                // 全范围循环,在最里层判断
     53     {
     54         for (col = 0; col < inputCol; col++)
     55         {            
     56             if (row < halfFilterWidth || row >= inputRow - halfFilterWidth || col < halfFilterWidth || col >= inputCol - halfFilterWidth)
     57             {
     58                output[row * inputCol + col] = 0;
     59                 continue;
     60             }
     61             for (sum = 0.0f, rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
     62             {
     63                 for (cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
     64                     sum += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + rr)*inputCol + col + cc];
     65             }
     66             output[row * inputCol + col] = sum;
     67      
     68         }
     69     }
     70     */
     71 }
     72 
     73 template<int filterWidth>
     74 void convolution02(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)// 卷积宽度作为模板
     75 {
     76     const int halfFilterWidth = filterWidth / 2;
     77     int row, col, rr, cc;
     78     float sum;
     79     
     80     memset(output, 0, sizeof(float) * inputRow * inputCol);  
     81 #pragma omp parallel for num_threads(8) default(none) shared(halfFilterWidth, input, output, inputRow, inputCol, filter) private(row, col, rr, cc, sum)
     82     for (row = halfFilterWidth; row < inputRow - halfFilterWidth; row++)
     83     {
     84         for (col = halfFilterWidth; col < inputCol - halfFilterWidth; col++)
     85         {
     86             for (sum = 0.0f, rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
     87             {
     88                 for (cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
     89                     sum += filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth] * input[(row + rr)*inputCol + col + cc];
     90             }
     91             output[row * inputCol + col] = sum;
     92         }
     93     }
     94 }
     95 
     96 template<int filterWidth, int blockRow, int blockCol>
     97 void convolution03(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)// 使用局部内存块
     98 {
     99     const int halfFilterWidth = filterWidth / 2;
    100     int row, col, rr, cc, i, j;
    101     float filterElement;
    102     
    103 
    104     if (inputRow % blockRow || inputCol % blockCol) // 要求图片长宽为局部内存块的整数倍
    105     {
    106         printf("Failed, outputRow %% blockRow || outputCol %% blockCol
    ");
    107         return;
    108     }
    109 
    110     memset(output, 0, sizeof(float) * inputRow * inputCol);
    111 #pragma omp parallel for num_threads(8) 
    112     for (row = halfFilterWidth; row < inputRow - halfFilterWidth; row += blockRow)
    113     {
    114         for (col = halfFilterWidth; col < inputCol - halfFilterWidth; col += blockCol)
    115         {     
    116             float sum[blockRow * blockCol] = { 0.0f };
    117             for (rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
    118             {
    119                 for (cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
    120                 {
    121                     filterElement = filter[(rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth];
    122                     for (i = 0; i < blockRow; i++)
    123                     {
    124                         for (j = 0; j < blockCol; j++)
    125                         {
    126                             if (row + rr + i >= inputRow || col + cc + j >= inputCol)
    127                                 break;
    128                             sum[i * blockCol + j] += filterElement * input[(row + rr + i) * inputCol + col + cc + j];
    129                         }
    130                     }
    131                 }
    132             }
    133             for (i = 0; i < blockRow; i++)
    134             {
    135                 for (j = 0; j < blockCol; j++)
    136                 {
    137                     if (row + i >= inputRow || col + j >= inputCol)
    138                         continue;
    139                     output[(row + i) * inputCol + col + j] = sum[i * blockCol + j];
    140                 }
    141             }
    142         }
    143     }
    144 }
    145 
    146 template<int filterWidth, int blockRow, int blockCol>
    147 void convolution04(const float *input, float *output, const int inputRow, const int inputCol, const float *filter)// 使用 AVX 指令扩展
    148 {
    149     const int halfFilterWidth = filterWidth / 2;
    150     int row, col, rr, cc, i, j;
    151 
    152     if (inputRow % blockRow || inputCol % (blockCol * 8))
    153     {
    154         printf("Failed, inputRow %% blockRow || inputCol %% blockCol
    ");
    155         return;
    156     }
    157 
    158     memset(output, 0, sizeof(float) * inputRow * inputCol);
    159 #pragma omp parallel for num_threads(8)
    160     for (row = halfFilterWidth; row < inputRow - halfFilterWidth; row += blockRow)
    161     {
    162         for (col = halfFilterWidth; col < inputCol - halfFilterWidth; col += blockCol * 8)
    163         {
    164             __m256 sum[blockRow * blockCol] = {_mm256_setzero_ps()};
    165             for (rr = -halfFilterWidth; rr <= halfFilterWidth; rr++)
    166             {
    167                 for (cc = -halfFilterWidth; cc <= halfFilterWidth; cc++)
    168                 {
    169                     __m256 filterElement = _mm256_broadcast_ss(filter + (rr + halfFilterWidth) * filterWidth + cc + halfFilterWidth);
    170                     for (i = 0; i < blockRow; i++)
    171                     {
    172                         for (j = 0; j < blockCol; j++)
    173                         {
    174                             //if (row + rr + i >= inputRow || col + cc + j * 8 >= inputCol)// 在局部内存块较大时需要越界检查
    175                             //    continue;
    176                             __m256 imageElement = _mm256_loadu_ps(input + (row + rr + i)*inputCol + col + cc + j * 8);
    177                             sum[i * blockCol + j] = _mm256_fmadd_ps(filterElement, imageElement, sum[i * blockCol + j]);
    178                         }
    179                     }
    180                 }
    181             }
    182             for (i = 0; i < blockRow; i++)
    183             {
    184                 for (j = 0; j < blockCol; j++)
    185                 {
    186                     //if (row + i >= inputRow || col + j * 8 >= inputCol)
    187                     //    continue;
    188                     _mm256_storeu_ps(output + (row + i)*inputCol + col + j * 8, sum[i * blockCol + j]);
    189                 }
    190             }
    191         }
    192     }
    193 }
    194 
    195 int main()
    196 {
    197     int i, k;
    198     clock_t time;
    199     float filterSum;
    200 
    201     // 卷积窗口相关
    202     const int filterWidth = 5, filterSize = filterWidth * filterWidth, halfFilterWidth = filterWidth / 2;
    203     float filter[filterSize] =        
    204     {// 模糊窗口
    205         0,       0,       0,       0, 0,
    206         0, 1.f / 9, 1.f / 9, 1.f / 9, 0,
    207         0, 1.f / 9, 1.f / 9, 1.f / 9, 0,
    208         0, 1.f / 9, 1.f / 9, 1.f / 9, 0,
    209         0,       0,       0,       0, 0
    210     };    
    211     for (filterSum = 0.0f, i = 0; i < filterSize; filterSum += filter[i++]);
    212     if (!floatEq(filterSum, 0))// 非归零的卷积窗口(如模糊)需要归一化
    213         for (i = 0; i < filterSize; filter[i] /= filterSum, i++);
    214 
    215     // 图片相关        
    216     cv::Mat input = cv::imread(inputFile), output = input, channel[3];
    217     cv::split(input, channel);
    218     const int inputRow = input.rows, inputCol = input.cols, inputDataSize = inputRow * inputCol;
    219     float *inputData = (float*)malloc(sizeof(float) * inputDataSize);
    220     float *outputData = (float*)malloc(sizeof(float) * inputDataSize);
    221 
    222     for (k = 0; k < 3; k++)// 三个通道,分别为蓝、绿、红       
    223     {
    224         for (i = 0; i < inputRow * inputCol; inputData[i] = (float)channel[k].data[i], i++);       
    225         time = clock();
    226         convolution01(inputData, outputData, inputRow, inputCol, (const float *)filter, filterWidth);
    227         //convolution02<filterWidth>(inputData, outputData, inputRow, inputCol, filter);
    228         //convolution03<filterWidth, 4, 4>(inputData, outputData, inputRow, inputCol, filter);
    229         //convolution04<filterWidth, 4, 4>(inputData, outputData, inputRow, inputCol, filter);
    230         time = clock() - time;
    231         printf("Time for channel[%d]:%d ms
    ", k, time);
    232         for (i = 0; i < inputRow * inputCol; channel[k].data[i] = (unsigned char)outputData[i], i++);
    233     }
    234 
    235     cv::merge(channel, 3, output);
    236     cv::imwrite(outputFile, output);
    237     //imshow("merge", output);                                            
    238     //cv::waitKey(0);
    239     free(inputData);
    240     free(outputData);
    241     printf("
    Finished.
    ");
    242     getchar();
    243     return 0;
    244 }

    ● 输出结果,使用一张 4608 × 6656 的图片(bmp87.7MB)进行测试,使用主函数中那个边长为5、实际窗口长度为 3 的均值窗口。图片太大喘不上来,偷梁换柱成小图看效果

         

    ■ 计时结果

    // convolution01,memset + 内部计算,无 OpenMP
    Time for channel[0]:2377 ms
    Time for channel[1]:2387 ms
    Time for channel[2]:2367 ms
    
    Finished.
    
    // convolution01,手动除边 + 内部计算,无 OpenMP
    Time for channel[0]:2017 ms
    Time for channel[1]:2014 ms
    Time for channel[2]:2021 ms
    
    Finished.
    
    // convolution01,循环内判断,无 OpenMP
    Time for channel[0]:2078 ms
    Time for channel[1]:2057 ms
    Time for channel[2]:2097 ms
    
    Finished.
    
    // convolution01,手动除边 + 内部计算,有 OpenMP
    Time for channel[0]:618 ms
    Time for channel[1]:618 ms
    Time for channel[2]:615 ms
    
    Finished.
    
    // convolution02,有 OpenMP
    Time for channel[0]:441 ms
    Time for channel[1]:440 ms
    Time for channel[2]:436 ms
    
    Finished.
    
    // convolution03<filterWidth, 4, 4>,无 OpenMP
    Time for channel[0]:2795 ms
    Time for channel[1]:2785 ms
    Time for channel[2]:2798 ms
    
    Finished.
    
    // convolution04<filterWidth, 4, 4>,无 OpenMP
    Time for channel[0]:386 ms
    Time for channel[1]:391 ms
    Time for channel[2]:386 ms
    
    Finished.

    ■ 没法给 convolution03 和 convolution04 加 OpenMP,一加就各种内存冲突,便捷判断都挡不住。

  • 相关阅读:
    正向代理和反向代理
    Unicode
    utf-8
    ISO 8895-1
    ProtocalBuffers学习记录
    C#基础知识
    MSBuild学习记录
    Linux学习笔记
    Jenkins学习记录
    CruiseControl.Net学习记录
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/9097747.html
Copyright © 2011-2022 走看看