zoukankan      html  css  js  c++  java
  • 基于OpenCL的mean filter性能

    1.对于一个标准的3*3 均值滤波,kernel代码如下:

    使用buffer/image缓冲对象

    /*
     * Box (mean) filter over a buffer of uchar4 pixels.
     *
     * Each work-item processes the pixel at (get_global_id(0), get_global_id(1)).
     * The row stride and the image bounds are taken from the global work size,
     * so the kernel has to be launched with exactly one work-item per pixel.
     *
     * inputImage  - source pixels, indexed as x + y * width
     * outputImage - destination pixels, same indexing
     * N           - requested window side. The window that is actually sampled
     *               is (2k+1) x (2k+1) with k = (N-1)/2, i.e. an even N is
     *               treated as N-1. N == 0 degenerates to a plain copy.
     *
     * Pixels closer than k to any edge are copied through unchanged.
     */
    __kernel void filter(__global uchar4* inputImage, __global uchar4* outputImage, uint N)
    {
        const int x = get_global_id(0);
        const int y = get_global_id(1);
        const int width = get_global_size(0);
        const int height = get_global_size(1);

        /* Guard N == 0: (N - 1) would wrap around in unsigned arithmetic. */
        const int k = (N > 0) ? (int)((N - 1) / 2) : 0;

        if (x < k || y < k || x > width - k - 1 || y > height - k - 1)
        {
            /* The window would leave the image: pass the pixel through. */
            outputImage[x + y * width] = inputImage[x + y * width];
            return;
        }

        /* Divide by the number of pixels really summed, not by N*N:
           the two differ whenever N is even. */
        const uint side = 2u * (uint)k + 1u;
        const uint count = side * side;

        /* Accumulate in 32 bits per channel so the sum cannot wrap at 255. */
        uint4 finalcolor = (uint4)(0);

        for (int j = y - k; j <= y + k; j++)
        {
            for (int i = x - k; i <= x + k; i++)
            {
                finalcolor += convert_uint4(inputImage[i + j * width]);
            }
        }

        outputImage[x + y * width] = convert_uchar4(finalcolor / count);
    }

    /*
     * Box (mean) filter reading from and writing to 2D image objects.
     *
     * Each work-item processes the texel at (get_global_id(0), get_global_id(1)).
     * The image bounds are taken from the global work size, so the kernel has
     * to be launched with exactly one work-item per texel.
     *
     * inputImage  - source image, read with read_imageui
     * outputImage - destination image, written with write_imageui
     * N           - requested window side. The window that is actually sampled
     *               is (2k+1) x (2k+1) with k = (N-1)/2, i.e. an even N is
     *               treated as N-1. N == 0 degenerates to a plain copy.
     *
     * Texels closer than k to any edge are copied through unchanged.
     *
     * NOTE(review): imageSampler is not declared inside this kernel; it is
     * expected to be a program-scope sampler defined elsewhere in the file.
     */
    __kernel void filterImg( image2d_t inputImage, __write_only image2d_t outputImage, uint N)
    {
        const int x = get_global_id(0);
        const int y = get_global_id(1);
        const int width = get_global_size(0);
        const int height = get_global_size(1);

        /* Guard N == 0: (N - 1) would wrap around in unsigned arithmetic. */
        const int k = (N > 0) ? (int)((N - 1) / 2) : 0;

        if (x < k || y < k || x > width - k - 1 || y > height - k - 1)
        {
            /* The window would leave the image: pass the texel through.
               The centre read is only needed on this path. */
            write_imageui(outputImage, (int2)(x, y),
                          read_imageui(inputImage, imageSampler, (int2)(x, y)));
            return;
        }

        /* Divide by the number of texels really summed, not by N*N:
           the two differ whenever N is even. */
        const uint side = 2u * (uint)k + 1u;
        const uint count = side * side;

        /* Sum over the (2k+1) x (2k+1) neighbourhood. */
        uint4 finalcolor = (uint4)(0);

        for (int j = y - k; j <= y + k; j++)
        {
            for (int i = x - k; i <= x + k; i++)
            {
                finalcolor += read_imageui(inputImage, imageSampler, (int2)(i, j));
            }
        }

        write_imageui(outputImage, (int2)(x, y), finalcolor / count);
    }

    对一个2048*2048的图像执行filter操作,

    image

    image

    image

    image

    global work size = {2048, 2048, 1},group work size = {16, 16}。一般group work size应该为64的倍数,因为对于AMD显卡,wavefront(64个work-item)是基本的硬件线程调度单位。

    使用了6个GPRs,没有使用ScratchRegs。ScratchRegs是指用video memory来模拟GPR,这会使线程执行的速度大大降低,因此应尽量减少ScratchRegs的数量。

    可以看到,使用image对象kernel执行时间要短,但奇怪的是各项性能参数都是buffer对象领先,除了alu busy和alu指令数目。

    改为下面的kernel代码,性能会有所提高

    /*
     * 3x3 mean filter over a buffer of uchar4 pixels, fully unrolled.
     *
     * Each work-item processes the pixel at (get_global_id(0), get_global_id(1));
     * the row stride and the image bounds come from the global work size.
     * Pixels in the outermost row/column are copied through unchanged.
     *
     * N is accepted for signature compatibility but is not used: the window
     * is fixed at 3x3.
     */
    __kernel void filter(__global uchar4* inputImage, __global uchar4* outputImage, uint N)
    {
        const int col = get_global_id(0);
        const int row = get_global_id(1);
        const int width = get_global_size(0);
        const int height = get_global_size(1);

        /* Linear index of this work-item's own pixel. */
        const int centre = col + row * width;

        const bool interior = col >= 1 && row >= 1 &&
                              col <= width - 2 && row <= height - 2;
        if (!interior)
        {
            outputImage[centre] = inputImage[centre];
            return;
        }

        /* Same column, one row up / one row down. */
        const int above = centre - width;
        const int below = centre + width;

        /* 32-bit accumulation of the nine neighbours, row by row. */
        const uint4 sum =
            convert_uint4(inputImage[above - 1]) +
            convert_uint4(inputImage[above]) +
            convert_uint4(inputImage[above + 1]) +
            convert_uint4(inputImage[centre - 1]) +
            convert_uint4(inputImage[centre]) +
            convert_uint4(inputImage[centre + 1]) +
            convert_uint4(inputImage[below - 1]) +
            convert_uint4(inputImage[below]) +
            convert_uint4(inputImage[below + 1]);

        outputImage[centre] = convert_uchar4(sum / 9);
    }
    /*
     * Loop-based box (mean) filter over a buffer of uchar4 pixels.
     *
     * Each work-item processes the pixel at (get_global_id(0), get_global_id(1)).
     * The row stride and the image bounds are taken from the global work size,
     * so the kernel has to be launched with exactly one work-item per pixel.
     *
     * inputImage  - source pixels, indexed as x + y * width
     * outputImage - destination pixels, same indexing
     * N           - requested window side. The window that is actually sampled
     *               is (2k+1) x (2k+1) with k = (N-1)/2, i.e. an even N is
     *               treated as N-1. N == 0 degenerates to a plain copy.
     *
     * Pixels closer than k to any edge are copied through unchanged.
     */
    __kernel void filter1(__global uchar4* inputImage, __global uchar4* outputImage, uint N)
    {
        const int x = get_global_id(0);
        const int y = get_global_id(1);
        const int width = get_global_size(0);
        const int height = get_global_size(1);

        /* Guard N == 0: (N - 1) would wrap around in unsigned arithmetic. */
        const int k = (N > 0) ? (int)((N - 1) / 2) : 0;

        if (x < k || y < k || x > width - k - 1 || y > height - k - 1)
        {
            /* The window would leave the image: pass the pixel through. */
            outputImage[x + y * width] = inputImage[x + y * width];
            return;
        }

        /* Divide by the number of pixels really summed, not by N*N:
           the two differ whenever N is even. */
        const uint side = 2u * (uint)k + 1u;
        const uint count = side * side;

        /* Accumulate in 32 bits per channel so the sum cannot wrap at 255. */
        uint4 finalcolor = (uint4)(0);

        for (int j = y - k; j <= y + k; j++)
        {
            for (int i = x - k; i <= x + k; i++)
            {
                finalcolor += convert_uint4(inputImage[i + j * width]);
            }
        }

        outputImage[x + y * width] = convert_uchar4(finalcolor / count);
    }
    /*
     * 3x3 mean filter on 2D image objects, fully unrolled.
     *
     * Each work-item processes the texel at (get_global_id(0), get_global_id(1));
     * the image bounds come from the global work size. Texels in the outermost
     * row/column are copied through unchanged.
     *
     * N is accepted for signature compatibility but is not used: the window
     * is fixed at 3x3.
     *
     * NOTE(review): imageSampler is not declared inside this kernel; it is
     * expected to be a program-scope sampler defined elsewhere in the file.
     */
    __kernel void filterImg( image2d_t inputImage, __write_only image2d_t outputImage, uint N)
    {
        const int col = get_global_id(0);
        const int row = get_global_id(1);
        const int width = get_global_size(0);
        const int height = get_global_size(1);
        const int2 here = (int2)(col, row);

        /* The centre texel is needed on both paths, so fetch it once. */
        const uint4 centre = read_imageui(inputImage, imageSampler, here);

        const bool onBorder = col < 1 || row < 1 ||
                              col > width - 2 || row > height - 2;
        if (onBorder)
        {
            write_imageui(outputImage, here, centre);
            return;
        }

        /* Sum of the 3x3 neighbourhood, one image row per line. */
        uint4 sum = read_imageui(inputImage, imageSampler, (int2)(col - 1, row - 1))
                  + read_imageui(inputImage, imageSampler, (int2)(col,     row - 1))
                  + read_imageui(inputImage, imageSampler, (int2)(col + 1, row - 1));

        sum += read_imageui(inputImage, imageSampler, (int2)(col - 1, row))
             + centre
             + read_imageui(inputImage, imageSampler, (int2)(col + 1, row));

        sum += read_imageui(inputImage, imageSampler, (int2)(col - 1, row + 1))
             + read_imageui(inputImage, imageSampler, (int2)(col,     row + 1))
             + read_imageui(inputImage, imageSampler, (int2)(col + 1, row + 1));

        write_imageui(outputImage, here, sum / 9);
    }

    image

    image

    image

    image

  • 相关阅读:
    Caused by: Unable to load bean: type: class:com.opensymphony.xwork2.ObjectFactory
    nable to load bean: type:com.opensymphony.xwork2.util.ValueStackFactory
    一个web项目web.xml的配置中<context-param>配置作用
    js获取form的方法
    HTML <legend> 标签
    Struts2 文件上传 之 文件类型 allowedTypes
    Struts2验证框架的配置及validation.xml常用的验证规则
    struts2学习笔记--使用Validator校验数据
    LeetCode204:Count Primes
    《采访中收集程序猿》学习记录5
  • 原文地址:https://www.cnblogs.com/mikewolf2002/p/2290604.html
Copyright © 2011-2022 走看看