zoukankan      html  css  js  c++  java
  • A tiny program to benchmark image transpose algorithms

    Here is the code:

    #include <stdio.h>
    #include <xmmintrin.h>
    #include <windows.h>
    
    // Element type of the images handled by the transpose routines (one __m128 per pixel).
    typedef __m128 Vec;
    
    // Integer type used for the raw performance-counter readings returned by now().
    typedef unsigned long long value_t;
    
    // Returns the current reading of the Windows performance counter, in raw ticks.
    // Convert a tick difference to time with QueryPerformanceFrequency.
    __forceinline value_t now()
    {
        LARGE_INTEGER ticks;
        QueryPerformanceCounter(&ticks);

        const value_t reading = ticks.QuadPart;
        return reading;
    }
    
    // Straightforward transpose: element (row, col) of the source becomes
    // element (col, row) of the destination.
    //
    //   dst_img  destination, indexed as col * src_h + row
    //   src_img  source, indexed as row * src_w + col
    //   src_w    number of elements per source row
    //   src_h    number of source rows
    //
    // The outer loop over source columns carries an OpenMP parallel-for pragma.
    inline void img_transpose(
        Vec *dst_img,
        Vec *src_img,
        const int src_w,
        const int src_h)
    {
    #pragma omp parallel for
        for (int col = 0; col < src_w; ++col)
        {
            // Start of the destination row that receives source column `col`.
            const int dst_base = col * src_h;

            for (int row = 0; row < src_h; ++row)
            {
                dst_img[dst_base + row] = src_img[row * src_w + col];
            }
        }
    }
    
    // Tiled transpose: walks the source in 8 x 8 tiles and transposes each tile,
    // so element (row, col) of the source becomes element (col, row) of the
    // destination. Tiles on the right and bottom edges are clipped to the image.
    //
    //   dst_img  destination, indexed as col * src_h + row
    //   src_img  source, indexed as row * src_w + col
    //   src_w    number of elements per source row
    //   src_h    number of source rows
    //
    // The outer loop over tile columns carries an OpenMP parallel-for pragma.
    inline void img_transpose_block(
        Vec *dst_img,
        Vec *src_img,
        const int src_w,
        const int src_h)
    {
        const int tile = 8;

    #pragma omp parallel for
        for (int col0 = 0; col0 < src_w; col0 += tile)
        {
            // Exclusive column bound of this tile, clipped to the image width.
            const int col_end = (col0 + tile < src_w) ? (col0 + tile) : src_w;

            for (int row0 = 0; row0 < src_h; row0 += tile)
            {
                // Exclusive row bound of this tile, clipped to the image height.
                const int row_end = (row0 + tile < src_h) ? (row0 + tile) : src_h;

                for (int col = col0; col < col_end; ++col)
                {
                    const int dst_base = col * src_h;

                    for (int row = row0; row < row_end; ++row)
                    {
                        dst_img[dst_base + row] = src_img[row * src_w + col];
                    }
                }
            }
        }
    }
    
    // Benchmarks img_transpose and img_transpose_block, then validates
    // img_transpose_block.
    //
    // Benchmark: each routine performs 50 round trips (w x h -> h x w -> w x h) on a
    // 1280 x 720 image and the elapsed time is printed in milliseconds.
    // Validation: a 1080 x 1920 image is filled with its own (x, y) coordinates,
    // transposed, and every destination element is checked to hold the swapped
    // coordinates. "Algorithm is wrong!!!" is printed on the first mismatch.
    //
    // Always prints "All done" and returns 0.
    int main(int argc, char *argv[])
    {
        //// performance benchmark ////

        const int w = 1280;
        const int h = 720;
        const int rounds = 50;

        // Value-initialized so the first transpose does not copy indeterminate values.
        Vec *a = new Vec [w * h]();
        Vec *b = new Vec [w * h]();
        value_t start_time, end_time;

        // Counter frequency is in ticks per second; convert to milliseconds per tick.
        LARGE_INTEGER freq;
        QueryPerformanceFrequency(&freq);
        const double ms_per_tick = 1000.0 / (double)freq.QuadPart;

        start_time = now();

        for (int t = 0; t < rounds; ++t)
        {
            img_transpose(b, a, w, h);
            img_transpose(a, b, h, w);
        }

        end_time = now();
        printf("img_transpose:          %f ms\n", (double)(end_time - start_time) * ms_per_tick);

        start_time = now();

        for (int t = 0; t < rounds; ++t)
        {
            img_transpose_block(b, a, w, h);
            img_transpose_block(a, b, h, w);
        }

        end_time = now();
        printf("img_transpose_block:   %f ms\n", (double)(end_time - start_time) * ms_per_tick);

        delete [] a;
        delete [] b;

        //// algorithm validation ////
        const int width = 1080;
        const int height = 1920;

        // Value-initialized because only lanes 0 and 1 are assigned below, yet the
        // transpose copies whole elements.
        Vec *src_img = new Vec [width * height]();
        Vec *dst_img = new Vec [height * width];

        // Tag every source element with its own coordinates: lane 0 = x, lane 1 = y.
        for (int j = 0; j < height; ++j)
        {
            for (int i = 0; i < width; ++i)
            {
                src_img[j * width + i].m128_i32[0] = i;
                src_img[j * width + i].m128_i32[1] = j;
            }
        }

        img_transpose_block(dst_img, src_img, width, height);

        // Destination element (row j, column i) must carry source coordinates (x = j, y = i).
        bool transposed_ok = true;

        for (int j = 0; transposed_ok && j < width; ++j)
        {
            for (int i = 0; i < height; ++i)
            {
                const int pi = dst_img[j * height + i].m128_i32[0];
                const int pj = dst_img[j * height + i].m128_i32[1];

                if (pi != j || pj != i)
                {
                    transposed_ok = false;
                    break;
                }
            }
        }

        if (!transposed_ok)
        {
            printf("Algorithm is wrong!!!\n");
        }

        // Release the validation buffers on both the success and the failure path.
        delete [] src_img;
        delete [] dst_img;

        printf("All done\n");

        return 0;
    }
  • 相关阅读:
    关于ThreadLocal
    二月份推荐的书籍
    《编写高质量代码:改善Java程序的151个建议》笔记
    Oracle中定义package以及存储过程的使用
    不同系统平台下Java默认的安装路径
    WebSphere数据源配置
    Dedication(转)
    Upgrading to Java 8——第二章 Method References(方法引用)
    Upgrading to Java 8——第一章 Lambda表达式
    Multi-catch
  • 原文地址:https://www.cnblogs.com/len3d/p/7711639.html
Copyright © 2011-2022 走看看