zoukankan      html  css  js  c++  java
  • opencl gauss filter优化(一)

    Platform: LG G3, Adreno 330 ,img size 3264x2448


    C code

    neon

    GPU

    300

    60

    29

     单位:ms


    1. 目前按如下行列分解的方式最快29ms,Horizontal kernel globalWorksize[1] = {height+256-height%256};Vertical kernel globalWorksize2[1] = {width+256-width%256};

    localWorksize2[] = {64}; localWorksize2 手动设为64时最快。

    Porfile的结果为:Horizontal kernel wait time 11ms,实际rum time 18ms.

    这个wait time是什么呢?注释掉Horizontal kernel中的 vstore16(convert_uchar16(sum>>(ushort)8),0,pOutLine+j) ; wait time只有0.x ms.并且 localWorksize 越小wait time越长,1时达到200ms,1620ms. 难道是写内存等待时间,没有足够的ALU指令隐藏访存延时?写内存后进入下一个for循环,马上又读内存,所以没有ALU指令隐藏这个延时。然而Horizontal kernelprofile结果实际run time只有0.x ms,所有时间基本都是在wait.(更正:注释掉vstore16,sum的计算被优化掉了,0.x ms是读内存的时间)

     

    __kernel void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image
                                __global uchar* restrict  dest,  // Intermediate dest image
                                                 const int imgWidth ,                // Image width
                                                 const int imgHeight)
    {
        const int y = get_global_id(0);
        if(y>=(imgHeight))
            return;
        const uchar m_nRightShiftNum = 8;
        const uchar Rounding = (1 << (m_nRightShiftNum - 1));
        const uchar  m_nFilter[11] = {1,4,8,16,32,134,32,16,8,4,1};
    
        const int s = 11;
        const int nStart = 5;
        const int nWidth = imgWidth;
    
        __global const uchar* pInLine = source + y*nWidth;
        __global uchar* pOutLine = dest + y*nWidth;
        int j;
        for(j = 0; j < nStart; j ++)
        {
            ushort sum = 0;
    
            for (int m = 0; m<s / 2; m++)
            {
                int k1 = (j + m - nStart);
                k1 = k1<0 ? -k1 : k1;
    
                int k2 = (j + nStart - m );
                sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
            }
            sum += pInLine[j] * m_nFilter[s / 2];
            sum = (sum + Rounding) >> 8;
            pOutLine[j] = (uchar)clamp(sum,(ushort)0,(ushort)255);
        }
    
        for ( ; (j+16)<= (nWidth - nStart); j+=16)
        {        
    #define GAUSSIAN_LINE_NEON(m) 
    sum += ( convert_ushort16(vload16(0,pInLine+j-nStart+m))* m_nFilter[m] );
    
            ushort16 sum =  (convert_ushort16(vload16(0,pInLine+j-nStart)) * m_nFilter[0]);
            GAUSSIAN_LINE_NEON(1);
            GAUSSIAN_LINE_NEON(2);
            GAUSSIAN_LINE_NEON(3);
            GAUSSIAN_LINE_NEON(4);
            GAUSSIAN_LINE_NEON(5);
            GAUSSIAN_LINE_NEON(6);
            GAUSSIAN_LINE_NEON(7);
            GAUSSIAN_LINE_NEON(8);
            GAUSSIAN_LINE_NEON(9);
            GAUSSIAN_LINE_NEON(10);
    
            sum += (ushort)Rounding;
            vstore16(convert_uchar16(sum>>(ushort)8),0,pOutLine+j) ;
        }
    
        for( ; j < nWidth; j ++)
        {
            ushort sum = 0;
    
            for (int m = 0; m<s / 2; m++)
            {
                int k1 = (j + m - nStart);
    
                int k2 = (j + nStart - m );
                k2 = k2 >= nWidth ? 2 * nWidth - 2 - k2 : k2;
                sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
            }
            sum += pInLine[j] * m_nFilter[s / 2];
            sum = (sum + Rounding) >> m_nRightShiftNum;
            pOutLine[j] =  (uchar)clamp(sum,(ushort)0,(ushort)255);
        }
    }
    
    
    __kernel void ImageGaussianFilterVertical( __global uchar* restrict source,   // Intermediate image processed by ImageGaussianFilterHorizontal()
                            __global uchar* restrict dest,  // Final destination image
                            const int imgWidth,
                                             const int imgHeight
                                        )
    {
        const int x = get_global_id(0);
        if(x>=(imgWidth))
            return;
        const int x_offset = x;
    
        const int s = 11;
        const int nStart = s / 2;
        const int m_nRightShiftNum = 8;
        const int Rounding = (1 << (m_nRightShiftNum - 1));
        const uchar  m_nFilter[11] = {1,4,8,16,32,134,32,16,8,4,1};
    
        int y;
    //    mem_fence(CLK_LOCAL_MEM_FENCE);
    
        ushort lines[11];
        lines[nStart] = (ushort)( source[x_offset]  );
        for(y=1;y<=nStart;y++)
        {
            lines[nStart+y] = (ushort)( source[y*imgWidth+x_offset]  );
            lines[nStart-y] = lines[nStart+y];
        }
    
        for(y=0;y<(imgHeight-nStart-1);)
        {
    
            ushort sum = lines[nStart] * m_nFilter[nStart];
    #define    GaussianTwoLines(m) 
        sum += ( (lines[m] + lines[s-1-m])*m_nFilter[m] );
            GaussianTwoLines(0)
            GaussianTwoLines(1)
            GaussianTwoLines(2)
            GaussianTwoLines(3)
            GaussianTwoLines(4)
    
            sum += (ushort)Rounding;
            dest[y*imgWidth+x_offset]  = (uchar)(sum>>(ushort)8);
    
            y++;
            for(int i = 0; i<s-1; i++) lines[i] = lines[i+1];
            
            lines[s-1] =  (ushort)( source[(y+nStart)*imgWidth+x_offset]  );
            
        }
    
        for(y=imgHeight-nStart-1;y<(imgHeight-1);)
        {
            ushort sum = lines[nStart] * m_nFilter[nStart];
            GaussianTwoLines(0)
            GaussianTwoLines(1)
            GaussianTwoLines(2)
            GaussianTwoLines(3)
            GaussianTwoLines(4)
    
            sum += (ushort)Rounding;
            dest[y*imgWidth+x_offset]  = (uchar)(sum>>(ushort)8);
    
            y++;
            for(int i = 0; i<s-1; i++) {
                lines[i] = lines[i+1];
            }
            lines[s-1] = lines[(imgHeight-y)*2-2] ; //
        }
        //last y=imgHeight-1
        ushort sum = lines[nStart] * m_nFilter[nStart];
        GaussianTwoLines(0)
        GaussianTwoLines(1)
        GaussianTwoLines(2)
        GaussianTwoLines(3)
        GaussianTwoLines(4)
    
        sum += (ushort)Rounding;
        dest[y*imgWidth+x_offset]  = (uchar)(sum>>(ushort)8);
    }
    kernel

     

    2.Horizontal kernel改进,预先load 2x16个所需的pixel,计算时从中提取,这样每次循环只需读一次内存。需要26ms,wait time 8ms.

     

        ushort16 line0 =  convert_ushort16(vload16(0,pInLine+j-nStart));
        for ( ; (j+16)<= (nWidth - nStart); j+=16)
        {
            ushort16 line1 =  convert_ushort16(vload16(0,pInLine+j-nStart+16));
    
            ushort16 temp0;
            ushort16 temp1;
            temp0 = line0;
            temp1.s0123 = line0.sabcd;
            temp1.s45 = line0.sef;
            temp1.s67 = line1.s01;
            temp1.s89abcdef = line1.s23456789;
            ushort16 sum =  ( temp0 + temp1 ) * m_nFilter[0];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s0;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s9;
            sum += ( temp0 +  temp1 ) * m_nFilter[1];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s1;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s8;
            sum += ( temp0 +  temp1 ) * m_nFilter[2];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s2;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s7;
            sum += ( temp0 +  temp1 ) * m_nFilter[3];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s3;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s6;
            sum += ( temp0 +  temp1 ) * m_nFilter[4];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s4;
            sum += ( temp0 ) * m_nFilter[5];
    
            sum += (ushort)Rounding;
            line0 = line1;
            vstore16(convert_uchar16(sum>>(ushort)8),0,pOutLine+j) ;
        }
    View Code

     

    3.不计算,只读写内存测试。那么wait time 3.2 ms,run time 18.2 ms.说明Horizontal kernel 耗时的极限也需3.2ms. 但是只是注释掉vstore16,还保留了读和计算,反而wait time还只有0.x ms,这又是为何?是读几乎没有wait,3.2ms都是写的wait time? (更正:注释掉vstore16,sum的计算被优化掉了,0.x ms是读内存的时间)

    a.再次测试,只有读wait time 0.xms ,只有写wait time 3.2ms.写比读的周期长.

    for ( ; (j+16)<= (nWidth - nStart); j+=16)

    {

    ushort16 line1 = convert_ushort16(vload16(0,pInLine+j-nStart+16));

    vstore16(0,0,pOutLine+j) ;

    }

    b.另外发现使用*((__global uint4*)(pOutLine+j)) = as_uint4(result);vstore16快,wait time 2.5ms.高通 80-N8592-1_L_OpenCL_Programming_Guide 中提到:

    Vectorized load/store of a larger data type is more optimal than a small data type; e.g., a load of uint2* is more optimal than uchar8* .

    For optimal SP to L2 bandwidth performance, align read access to a 32-bit address and write access to a 128-bit address.

    c.原来写的内存没有对齐,使用*((__global uint4*)(pOutLine+j-5)) = as_uint4(result);wait time 1.9ms.

    d.最后加上sum计算,采用的Horizontal kernel如下,localWorksize[] = {64};时时间最少,需要23ms,wait time 4.7ms , localWorksize = 128,wait 6ms.

    并且使用__attribute__((work_group_size_hint(64,1,1))) ,耗时22ms.

     

    __kernel __attribute__((work_group_size_hint(64,1,1)))   
    void ImageGaussianFilterHorizontal(__global const uchar* restrict source, // Source image
                            __global uchar* restrict  dest,  // Intermediate dest image
                                                 const int imgWidth ,                // Image width
                                                 const int imgHeight)
    {
        const int y = get_global_id(0);
        if(y>=(imgHeight))
            return;
        const uchar m_nRightShiftNum = 8;
        const uchar Rounding = (1 << (m_nRightShiftNum - 1));
        const uchar  m_nFilter[11] = {1,4,8,16,32,134,32,16,8,4,1};
    
        const int s = 11;
        const int nStart = 5;
        const int nWidth = imgWidth;
    
    
        __global const uchar* pInLine = source + y*nWidth;
        __global uchar* pOutLine = dest + y*nWidth;
    
        int j;
        uchar temp[5];
        for(j = 0; j < nStart; j ++)
        {
            ushort sum = 0;
    
            for (int m = 0; m<s / 2; m++)
            {
                int k1 = (j + m - nStart);
                k1 = k1<0 ? -k1 : k1;
    
                int k2 = (j + nStart - m );
                sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
            }
            sum += pInLine[j] * m_nFilter[s / 2];
            sum = (sum + Rounding) >> 8;
            temp[j] = (uchar)clamp(sum,(ushort)0,(ushort)255);
        }
    
        uchar16 result,pre_result;
        pre_result.sbcde = (uchar4)(temp[0],temp[1],temp[2],temp[3]);
        pre_result.sf = temp[4];
    
        ushort16 line0 =  convert_ushort16(vload16(0,pInLine+j-nStart));
        for ( ; (j+16)<= (nWidth - nStart); j+=16)
        {
            //prefetch(pInLine+j-nStart,32); //无变化
            ushort16 line1 =  convert_ushort16(vload16(0,pInLine+j-nStart+16));
    
            ushort16 temp0;
            ushort16 temp1;
            temp0 = line0;
            temp1.s0123 = line0.sabcd;
            temp1.s45 = line0.sef;
            temp1.s67 = line1.s01;
            temp1.s89abcdef = line1.s23456789;
            ushort16 sum =  ( temp0 + temp1 ) * m_nFilter[0];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s0;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s9;
            sum += ( temp0 +  temp1 ) * m_nFilter[1];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s1;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s8;
            sum += ( temp0 +  temp1 ) * m_nFilter[2];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s2;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s7;
            sum += ( temp0 +  temp1 ) * m_nFilter[3];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s3;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s6;
            sum += ( temp0 +  temp1 ) * m_nFilter[4];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s4;
            sum += ( temp0 ) * m_nFilter[5];
    
            sum += (ushort)Rounding;
            line0 = line1;
    
            result.s0123 = pre_result.sbcde;
            result.s4 = pre_result.sf;
            pre_result = convert_uchar16(sum>>(ushort)8) ;
    
            result.s5 = pre_result.s0;
            result.s67 = pre_result.s12;
            result.s89abcdef = pre_result.s3456789a;
            *( (__global uint4*)(pOutLine+j-5) ) =  (as_uint4)(result) ;
        }
    
        *( (__global uint*)(pOutLine+j-5) ) = (as_uint)(pre_result.sbcde);//last 5 bytes
        pOutLine[j-1] = pre_result.sf;
    
        for( ; j < nWidth; j ++)
        {
            ushort sum = 0;
    
            for (int m = 0; m<s / 2; m++)
            {
                int k1 = (j + m - nStart);
    
                int k2 = (j + nStart - m );
                k2 = k2 >= nWidth ? 2 * nWidth - 2 - k2 : k2;
                sum += (pInLine[k1] + pInLine[k2])*m_nFilter[m];
            }
            sum += pInLine[j] * m_nFilter[s / 2];
            sum = (sum + Rounding) >> m_nRightShiftNum;
            pOutLine[j] =  (uchar)clamp(sum,(ushort)0,(ushort)255);
        }
    }
    View Code

     

     

     

     

  • 相关阅读:
    24张图,九大数据结构安排得明明白白
    mysql中的mvcc解读
    常见电商项目的数据库表设计(MySQL版)
    两万字深度介绍分布式系统原理,一文入魂
    使用消息中间件时,如何保证消息仅仅被消费一次?
    GCC/G++选项 -Wl,-Bstatic和-Wl,-Bdynamic
    sql 练习
    设计模式-单例模式
    设计模式-抽象工厂模式
    设计模式-工厂方法模式
  • 原文地址:https://www.cnblogs.com/mlj318/p/5039348.html
Copyright © 2011-2022 走看看