zoukankan      html  css  js  c++  java
  • opencl gauss filter优化(二)

    1.buffer使用image的方式:Horizontal Vertical 算法一样, 共需30ms,wait time 19ms.

     

    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    __kernel void ImageGaussianFilterHorizontal(__read_only image2d_t  source, // Source image
                                __write_only image2d_t   dest,  // Intermediate dest image
                                                 const int imgWidth ,                // Image width
                                                 const int imgHeight)
    {
        const int y = get_global_id(0);
        if(y>=(imgHeight))
            return;
        const float m_nFilter[11] = {1/256.0,4/256.0,8/256.0,16/256.0,32/256.0,134/256.0,32/256.0,16/256.0,8/256.0,4/256.0,1/256.0};
    
        const int s = 11;
        const int nStart = 5;
        
        float lines[11];
        for(int i=0;i<11;i++)
            lines[i] = read_imagef( source, sampler,  (int2) (i-5, y) ).x;
    
        for(int j=0;j<imgWidth;){
        float sum = lines[nStart] * m_nFilter[nStart];
    #define    GaussianTwoLines(m) 
        sum += ( (lines[m] + lines[s-1-m])*m_nFilter[m] );
            GaussianTwoLines(0)
            GaussianTwoLines(1)
            GaussianTwoLines(2)
            GaussianTwoLines(3)
            GaussianTwoLines(4)    
            
            write_imagef( dest, (int2) (j, y), sum );
            
            for(int i = 0; i<s-1; i++) lines[i] = lines[i+1];
            j++;
            lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;
        }
    }
    
    __kernel void ImageGaussianFilterVertical(__read_only image2d_t  source, // Source image
                            __write_only image2d_t   dest,   
                             const int imgWidth ,               
                            const int imgHeight)
    {
        const int x = get_global_id(0);
        if(x>=(imgWidth))
            return;
        const float m_nFilter[11] = {1/256.0,4/256.0,8/256.0,16/256.0,32/256.0,134/256.0,32/256.0,16/256.0,8/256.0,4/256.0,1/256.0};
    
        const int s = 11;
        const int nStart = 5;
    
        float lines[11];
        for(int i=0;i<11;i++)
            lines[i] = read_imagef( source, sampler,  (int2) (x ,i-5) ).x;
    
        for(int j=0;j<imgHeight;){
        float sum = lines[nStart] * m_nFilter[nStart];
    #define    GaussianTwoLines(m) 
        sum += ( (lines[m] + lines[s-1-m])*m_nFilter[m] );
            GaussianTwoLines(0)
            GaussianTwoLines(1)
            GaussianTwoLines(2)
            GaussianTwoLines(3)
            GaussianTwoLines(4)
    
            write_imagef( dest, (int2) (x, j), sum );
    
            for(int i = 0; i<s-1; i++) lines[i] = lines[i+1];
            j++;
            lines[s-1] = read_imagef( source, sampler, (int2) (x,j+5) ).x;
        }
    }
    View Code

     

    2.只运行 Horizontal 19ms,wait time 19ms. 注释掉 write_imagef 2.4ms(wait time,run time都是0.0xms)(更新:sum计算被优化,0.x ms就是读image的时间).

    a.顺序调整为:

    lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

    write_imagef( dest, (int2) (j-1, y), sum );

    16.9ms,很奇怪sum用固定的0,0.2替代时间只有3.9ms?????把计算部分注释掉,只读写imgage,也是3.9ms, 计算sum的部分被编译器优化掉了?

    b. if(sum>0)

    lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

    write_imagef( dest, (int2) (j-1, y), 0.2 );

    如此测试,17ms,看来是sum的计算被优化掉了.

    c.if(sum>=0)

    j++;

    //lines[s-1] = read_imagef( source, sampler, (int2) (j+5, y) ).x;

    //write_imagef( dest, (int2) (j-1, y), sum );

    只计算,5.7ms,但还是wait time 5.7ms???

     

     

    3.使用float16 vector 计算,总共耗时15.6 ms,wait time 9.3ms,rum time 6.3ms.使用 __attribute__ 能减少1ms以内.其中Horizontal:wait time 9.4ms,rum time 0.008ms ,Vertical:wait time 0.07ms,rum time 6.4ms.

     

    不知道为什么使用fma指令替代sum+= ,需要近2s,而且localWorksize最大只能32.

     

    使用half16 精度,反而还要17ms,而且结果有1-2的误差。

     

    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    
    __kernel __attribute__((work_group_size_hint(64,1,1)))
    void ImageGaussianFilterHorizontal(__read_only image2d_t  source, // Source image
                                        __write_only image2d_t   dest,  // Intermediate dest image
                                         const int imgWidth ,                // Image width
                                         const int imgHeight)
    {
        const int y = get_global_id(0);
        if(y>=(imgHeight))
            return;
        const float m_nFilter[11] = {1/256.0,4/256.0,8/256.0,16/256.0,32/256.0,134/256.0,32/256.0,16/256.0,8/256.0,4/256.0,1/256.0};
    
    
    #define r(xc,y) read_imagef( source, sampler,  (int2) (xc, y) ).x
    #define r16(x,y) (float16)( r(x,y),r(x+1,y),r(x+2,y),r(x+3,y),r(x+4,y),r(x+5,y),r(x+6,y),r(x+7,y),
                    r(x+8,y),r(x+9,y),r(x+10,y),r(x+11,y),r(x+12,y),r(x+13,y),r(x+14,y),r(x+15,y))
    
    #define w16(x,y,sum) write_imagef( dest, (int2) (x, y), sum.s0 );write_imagef( dest, (int2) (x+1, y), sum.s1 );
            write_imagef( dest, (int2) (x+2, y), sum.s2 );write_imagef( dest, (int2) (x+3, y), sum.s3 );
            write_imagef( dest, (int2) (x+4, y), sum.s4 );write_imagef( dest, (int2) (x+5, y), sum.s5 );
            write_imagef( dest, (int2) (x+6, y), sum.s6 );write_imagef( dest, (int2) (x+7, y), sum.s7 );
            write_imagef( dest, (int2) (x+8, y), sum.s8 );write_imagef( dest, (int2) (x+9, y), sum.s9 );
            write_imagef( dest, (int2) (x+10, y), sum.sa );write_imagef( dest, (int2) (x+11, y), sum.sb );
            write_imagef( dest, (int2) (x+12, y), sum.sc );write_imagef( dest, (int2) (x+13, y), sum.sd );
            write_imagef( dest, (int2) (x+14, y), sum.se );write_imagef( dest, (int2) (x+15, y), sum.sf );
    
        float16 line0 =  r16(-5,y);
        for(int j=0;j<imgWidth;){
            float16 line1 =  r16(j-5+16,y);
    
            float16 temp0;
            float16 temp1;
            temp0 = line0;
            temp1.s0123 = line0.sabcd;
            temp1.s45 = line0.sef;
            temp1.s67 = line1.s01;
            temp1.s89abcdef = line1.s23456789;
            float16 sum =  ( temp0 + temp1 ) * m_nFilter[0];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s0;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s9;
            sum += ( temp0 +  temp1 ) * m_nFilter[1];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s1;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s8;
            sum += ( temp0 +  temp1 ) * m_nFilter[2];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s2;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s7;
            sum += ( temp0 +  temp1 ) * m_nFilter[3];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s3;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s6;
            sum += ( temp0 +  temp1 ) * m_nFilter[4];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s4;
            sum += ( temp0 ) * m_nFilter[5];
    
            line0 = line1;
            w16(j,y,sum );
            j+=16;
        }
    
    }
    
    __kernel  __attribute__((work_group_size_hint(64,1,1)))
    void ImageGaussianFilterVertical(__read_only image2d_t  source, // Source image
                                    __write_only image2d_t   dest,
                                     const int imgWidth ,
                                     const int imgHeight)
    {
        const int x = get_global_id(0);
        if(x>=(imgWidth))
            return;
        const float m_nFilter[11] = {1/256.0,4/256.0,8/256.0,16/256.0,32/256.0,134/256.0,32/256.0,16/256.0,8/256.0,4/256.0,1/256.0};
    
    #define rv16(x,y) (float16)( r(x,y),r(x,y+1),r(x,y+2),r(x,y+3),r(x,y+4),r(x,y+5),r(x,y+6),r(x,y+7),
                    r(x,y+8),r(x,y+9),r(x,y+10),r(x,y+11),r(x,y+12),r(x,y+13),r(x,y+14),r(x,y+15))
    
    #define wv16(x,y,sum) write_imagef( dest, (int2) (x,y), sum.s0 );write_imagef( dest, (int2) (x,y+1), sum.s1 );
            write_imagef( dest, (int2) (x,y+2), sum.s2 );write_imagef( dest, (int2) (x,y+3), sum.s3 );
            write_imagef( dest, (int2) (x,y+4), sum.s4 );write_imagef( dest, (int2) (x,y+5), sum.s5 );
            write_imagef( dest, (int2) (x,y+6), sum.s6 );write_imagef( dest, (int2) (x,y+7), sum.s7 );
            write_imagef( dest, (int2) (x,y+8), sum.s8 );write_imagef( dest, (int2) (x,y+9), sum.s9 );
            write_imagef( dest, (int2) (x,y+10), sum.sa );write_imagef( dest, (int2) (x,y+11), sum.sb );
            write_imagef( dest, (int2) (x,y+12), sum.sc );write_imagef( dest, (int2) (x,y+13), sum.sd );
            write_imagef( dest, (int2) (x,y+14), sum.se );write_imagef( dest, (int2) (x,y+15), sum.sf );
    
        float16 line0 =  rv16(x,-5);
        for(int j=0;j<imgHeight;){
            float16 line1 =  rv16(x,j-5+16);
    
            float16 temp0;
            float16 temp1;
            temp0 = line0;
            temp1.s0123 = line0.sabcd;
            temp1.s45 = line0.sef;
            temp1.s67 = line1.s01;
            temp1.s89abcdef = line1.s23456789;
            float16 sum =  ( temp0 + temp1 ) * m_nFilter[0];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s0;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s9;
            sum += ( temp0 +  temp1 ) * m_nFilter[1];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s1;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s8;
            sum += ( temp0 +  temp1 ) * m_nFilter[2];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s2;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s7;
            sum += ( temp0 +  temp1 ) * m_nFilter[3];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s3;
            temp1.s0123456789abcdef = temp1.s00123456789abcde;
            temp1.s0 = line0.s6;
            sum += ( temp0 +  temp1 ) * m_nFilter[4];
            temp0.s0123456789abcdef = temp0.s123456789abcdeff;
            temp0.sf = line1.s4;
            sum += ( temp0 ) * m_nFilter[5];
    
            line0 = line1;
            wv16(x,j,sum );
            j+=16;
        }
    }
    View Code

     

     

     

     

  • 相关阅读:
    matplotlib直方图绘图并且标注数字
    爬虫最基础知识串行和异步进阶代码
    卡方检验
    德国VoIP 系统中发现秘密后门
    python 处理字典键值合并
    python 读取json大文件
    上下文管理器的使用
    服务端高并发分布式架构
    上下文管理器连接数据库示例
    上下文管理器
  • 原文地址:https://www.cnblogs.com/mlj318/p/5039401.html
Copyright © 2011-2022 走看看