zoukankan      html  css  js  c++  java
  • opencl gauss filter优化(三)

    1.根据前两次的最终结果:

    使用普通buffer,Horizontal 5ms, Vertical 17 ms

    使用image bufferHorizontal 9.4ms, Vertical 6.4 ms

    那么使用 Horizontal普通buffer,Vertical image buffer 组合方式的话,是不是时间最少?只是Intermediate image仍使用image对象,Horizontal kernel中的写操作需要改变。

    结果: Horizontal 的最大local_work_size只能是32, Horizontal 增至8ms, Vertical 6.4ms

     

    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    
    #define r(xc,y) read_imagef( source, sampler,  (int2) (xc, y) ).x
    
    #define w16(x,y,sum) write_imagef( dest, (int2) (x, y), sum.s0 );write_imagef( dest, (int2) (x+1, y), sum.s1 );
            write_imagef( dest, (int2) (x+2, y), sum.s2 );write_imagef( dest, (int2) (x+3, y), sum.s3 );
            write_imagef( dest, (int2) (x+4, y), sum.s4 );write_imagef( dest, (int2) (x+5, y), sum.s5 );
            write_imagef( dest, (int2) (x+6, y), sum.s6 );write_imagef( dest, (int2) (x+7, y), sum.s7 );
            write_imagef( dest, (int2) (x+8, y), sum.s8 );write_imagef( dest, (int2) (x+9, y), sum.s9 );
            write_imagef( dest, (int2) (x+10, y), sum.sa );write_imagef( dest, (int2) (x+11, y), sum.sb );
            write_imagef( dest, (int2) (x+12, y), sum.sc );write_imagef( dest, (int2) (x+13, y), sum.sd );
            write_imagef( dest, (int2) (x+14, y), sum.se );write_imagef( dest, (int2) (x+15, y), sum.sf );
    
    __kernel __attribute__((work_group_size_hint(32,1,1)))
    void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image
                                        __write_only image2d_t   dest,  // Intermediate dest image
                                         const int imgWidth ,                // Image width
                                         const int imgHeight)
    {
        const int y = get_global_id(0);
        if(y>=(imgHeight))
            return;
        const uchar m_nRightShiftNum = 8;
        const uchar Rounding = (1 << (m_nRightShiftNum - 1));
        const uchar  m_nFilter[11] = {1,4,8,16,32,134,32,16,8,4,1};
    
        const int s = 11;
        const int nStart = 5;
        const int nWidth = imgWidth;
    
        __global const uchar* pInLine = source + y*nWidth;
    
        int j;
        for(j = 0; j < nStart; j ++)
        {
            ushort sum = 0;
    
            for (int m = 0; m<s / 2; m++)
            {
                int k1 = (j + m - nStart);
                k1 = k1<0 ? -k1 : k1;
    
                int k2 = (j + nStart - m );
                sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
            }
            sum += pInLine[j] * m_nFilter[s / 2];
            //sum = (sum + Rounding) >> 8;
            write_imagef( dest, (int2) (j, y), convert_float(sum)/(255.0*256) );
        }
    
        ushort16 line0 =  convert_ushort16(vload16(0,pInLine+j-nStart));
        for ( ; (j+16)<= (nWidth - nStart); j+=16)
        {
            ushort16 line1 =  convert_ushort16(vload16(0,pInLine+j-nStart+16));
    
            ushort16 temp0;
            ushort16 temp1;
            temp0 = line0;
            temp1.s0123 = line0.sabcd;
            temp1.s45 = line0.sef;
            temp1.s67 = line1.s01;
            temp1.s89abcdef = line1.s23456789;
            ushort16 sum =  ( temp0 + temp1 ) * m_nFilter[0];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s0;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s9;
            sum += ( temp0 +  temp1 ) * m_nFilter[1];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s1;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s8;
            sum += ( temp0 +  temp1 ) * m_nFilter[2];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s2;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s7;
            sum += ( temp0 +  temp1 ) * m_nFilter[3];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s3;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s6;
            sum += ( temp0 +  temp1 ) * m_nFilter[4];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s4;
            sum += ( temp0 ) * m_nFilter[5];
    
            line0 = line1;
    
            float16 sum2 = (convert_float16(sum))/(255.0*256);
            w16(j,y,sum2 );
        }
    
        for( ; j < nWidth; j ++)
        {
            ushort sum = 0;
    
            for (int m = 0; m<s / 2; m++)
            {
                int k1 = (j + m - nStart);
    
                int k2 = (j + nStart - m );
                k2 = k2 >= nWidth ? 2 * nWidth - 2 - k2 : k2;
                sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
            }
            sum += pInLine[j] * m_nFilter[s / 2];
            //sum = (sum + Rounding) >> m_nRightShiftNum;
            write_imagef( dest, (int2) (j, y), convert_float(sum)/(255.0*256) );
        }
    
    }
    View Code

    2.使用各种办法,最终也只能降到13.7ms,Horizontal 7.5, Vertical 6ms,最终代码如下.

    更新HV都 去掉__attribute__ 属性,local_work_size都设置NULL,opencl自己选择,H 的最大local_work_size又变回了64,总时间13ms.因为在LG G4,adreno 418上运行却需要40ms,adreno 418上的local_work_size最大可以是1024,却被强制设成了32.

    a.使用mad指令做sum乘加,结果有误差,时间也略增.fma 是无限精度,mad 是快速方法,结果是近似值。

    b.使用 pInTemp fisrt 16 bytes,避免重复读取,有0.x ms的优势

    c.边界使用了mirror repeat

    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    
    #define r(xc,y) read_imagef( source, sampler,  (int2) (xc, y) ).x
    
    #define w16(x,y,sum) write_imagef( dest, (int2) (x, y), sum.s0 );write_imagef( dest, (int2) (x+1, y), sum.s1 );
            write_imagef( dest, (int2) (x+2, y), sum.s2 );write_imagef( dest, (int2) (x+3, y), sum.s3 );
            write_imagef( dest, (int2) (x+4, y), sum.s4 );write_imagef( dest, (int2) (x+5, y), sum.s5 );
            write_imagef( dest, (int2) (x+6, y), sum.s6 );write_imagef( dest, (int2) (x+7, y), sum.s7 );
            write_imagef( dest, (int2) (x+8, y), sum.s8 );write_imagef( dest, (int2) (x+9, y), sum.s9 );
            write_imagef( dest, (int2) (x+10, y), sum.sa );write_imagef( dest, (int2) (x+11, y), sum.sb );
            write_imagef( dest, (int2) (x+12, y), sum.sc );write_imagef( dest, (int2) (x+13, y), sum.sd );
            write_imagef( dest, (int2) (x+14, y), sum.se );write_imagef( dest, (int2) (x+15, y), sum.sf );
    
    //line0 start from j-5,line1 from j-5+16
    #define GaussianShift16 {
            temp0 = line0;
            temp1.s0123 = line0.sabcd;
            temp1.s45 = line0.sef;
            temp1.s67 = line1.s01;
            temp1.s89abcdef = line1.s23456789;
            sum =  ( temp0 + temp1 ) * m_nFilter[0];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s0;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s9;
            sum += ( temp0 +  temp1 ) * m_nFilter[1];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s1;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s8;
            sum += ( temp0 +  temp1 ) * m_nFilter[2];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s2;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s7;
            sum += ( temp0 +  temp1 ) * m_nFilter[3];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s3;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s6;
            sum += ( temp0 +  temp1 ) * m_nFilter[4];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s4;
            sum += ( temp0 ) * m_nFilter[5];}
    
    __kernel __attribute__((work_group_size_hint(32,1,1)))
    void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image
                                        __write_only image2d_t   dest,  // Intermediate dest image
                                         const int imgWidth ,                // Image width
                                         const int imgHeight)
    {
        const int y = get_global_id(0);
        if(y>=(imgHeight))
            return;
        const uchar  m_nFilter[11] = {1,4,8,16,32,134,32,16,8,4,1};
    
        const int s = 11;
        const int nStart = 5;
    
        __global const uchar* pInLine = source + y*imgWidth;
    
        int j;
        uchar pInTemp[16];
        *( (uint4*)(pInTemp) ) = *((__global uint4*)(pInLine)) ;//first 16 bytes
        for(j = 0; j < nStart; j ++)
        {
            ushort sum = 0;
            for (int m = 0; m<s / 2; m++)
            {
                int k1 = (j + m - nStart);
                k1 = k1<0 ? -k1 : k1;
    
                int k2 = (j + nStart - m );
                sum += (pInTemp[k1] + pInTemp[k2])*m_nFilter[m];
            }
            sum += pInTemp[j] * m_nFilter[s / 2];
            write_imagef( dest, (int2) (j, y), convert_float(sum)/(255.0*256) );
        }
    
        ushort16 temp0;
        ushort16 temp1;
        ushort16 sum;
        ushort16 line0,line1;
        line0 =  convert_ushort16(*((uchar16*)pInTemp));
        for ( ; j< (imgWidth-16); j+=16)
        {
            line1 = convert_ushort16(vload16(0,pInLine+j-nStart+16));//convert_ushort16( as_uchar16(*((__global uint4*)(pInLine+j-nStart+16))) ) ;
    
            GaussianShift16
            line0 = line1;
    
            float16 sum2 = (convert_float16(sum))/(255.0*256);
            w16(j,y,sum2 );
        }
    
        {
            //last 16 pixel,some pixels may caculate again
            j = imgWidth-16;
            line0 =  convert_ushort16(vload16(0,pInLine+j-nStart));
            //mirror repeat read
            line1.s0123 =  convert_ushort4( vload4(0,pInLine+imgWidth-nStart) );
            line1.s4567 = (ushort4)( pInLine[imgWidth-1],line1.s3,line1.s21 ) ;
            line1.s89 = (ushort2)(line1.s0,line0.sf);
    
            GaussianShift16
            float16 sum2 = (convert_float16(sum))/(255.0*256);
            w16(j,y,sum2 );
        }
    }
    
    __kernel  __attribute__((work_group_size_hint(64,1,1)))
    void ImageGaussianFilterVertical(__read_only image2d_t  source, // Source image
                                    __write_only image2d_t   dest,
                                     const int imgWidth ,
                                     const int imgHeight)
    {
        const int x = get_global_id(0);
        if(x>=(imgWidth))
            return;
        const float m_nFilter[11] = {1/256.0,4/256.0,8/256.0,16/256.0,32/256.0,134/256.0,32/256.0,16/256.0,8/256.0,4/256.0,1/256.0};
    
    #define rv16(x,y) (float16)( r(x,y),r(x,y+1),r(x,y+2),r(x,y+3),r(x,y+4),r(x,y+5),r(x,y+6),r(x,y+7),
                    r(x,y+8),r(x,y+9),r(x,y+10),r(x,y+11),r(x,y+12),r(x,y+13),r(x,y+14),r(x,y+15))
    
    #define wv16(x,y,sum) write_imagef( dest, (int2) (x,y), sum.s0 );write_imagef( dest, (int2) (x,y+1), sum.s1 );
            write_imagef( dest, (int2) (x,y+2), sum.s2 );write_imagef( dest, (int2) (x,y+3), sum.s3 );
            write_imagef( dest, (int2) (x,y+4), sum.s4 );write_imagef( dest, (int2) (x,y+5), sum.s5 );
            write_imagef( dest, (int2) (x,y+6), sum.s6 );write_imagef( dest, (int2) (x,y+7), sum.s7 );
            write_imagef( dest, (int2) (x,y+8), sum.s8 );write_imagef( dest, (int2) (x,y+9), sum.s9 );
            write_imagef( dest, (int2) (x,y+10), sum.sa );write_imagef( dest, (int2) (x,y+11), sum.sb );
            write_imagef( dest, (int2) (x,y+12), sum.sc );write_imagef( dest, (int2) (x,y+13), sum.sd );
            write_imagef( dest, (int2) (x,y+14), sum.se );write_imagef( dest, (int2) (x,y+15), sum.sf );
    
    
        float16 temp0;
        float16 temp1;
        float16 sum;
        float16 line0,line1;
    
        line0 =  rv16(x,-5);
        line0.s0123 = line0.sa987;//mirror repeat
        line0.s4 = line0.s6;
        int j;
        for(j=0;j<imgHeight-16;j+=16){
            line1 =  rv16(x,j-5+16);
    
            GaussianShift16
    
            line0 = line1;
            wv16(x,j,sum );
        }
        //last 16 pixel,some pixels may caculate again if imgHeight not 16 bytes align
        j = imgHeight-16;
        line0 =  rv16(x,j-5);
        //mirror repeat read
        const int y = imgHeight-5;
        line1.s0123 =  (float4)( r(x,y),r(x,y+1),r(x,y+2),r(x,y+3) );
        line1.s4567 = (float4)( r(x,y+4),line1.s3,line1.s21 );
        line1.s89 = (float2)(line1.s0,line0.sf);
    
        GaussianShift16
        wv16(x,j,sum );
    }
    View Code

    总结:1.local_work_size 对时间的影响比较大,有时使用NULL默认的就可以,有时需要一个个去试。

    使用vector 类型,local memory,kernel代码结构 都会对 local_work_size 最大值有影响

    2.profile中的wait time可能是读写memory还有其它的等待时间,rum timeALU计算执行的时间。

    3.避免对global memory的重复读写,预先缓存下来再用

    4.image buffer的读写比普通buffer快,也没有按行按列读写的效率差异.尽量使用image buffer

    5.read/write_imageui 并不比 read/write_imagef 快,一般就使用float

    6.write read 要慢很多,内存未对齐也会慢些

    7.使用vector 读写,计算 都会更快.image buffer虽然是单点读,组合成vector计算也更快.

    8.half类型存在精度问题,会引入误差,在这里也不比float

    9.如果不确定local_work_size,就设置成NULL,opencl自己选择。

    不同的GPU local_work_size最大值不一样,比如这个kernel Adreno 330上最大64,adreno 418上最大1024.


     

     

  • 相关阅读:
    腾讯开放平台 手机QQ登录 错误码:110406 解决办法
    比较酷的安卓软件
    Top 10 Methods for Java Arrays
    Feathers UI 性能优化
    Starling中通过PivotX 和 PivotY 修改原点
    Adobe AIR 中为不同尺寸和分辨率屏幕适配
    Feathers组件的宽度或高度属性,为什么我得到的值是0
    "Type Coercion failed" Error in FlashBuilder 4.7
    正则表达式中的特殊字符
    单体内置对象 Global 和 Math
  • 原文地址:https://www.cnblogs.com/mlj318/p/5039425.html
Copyright © 2011-2022 走看看