zoukankan      html  css  js  c++  java
  • 对比使用C# unsafe代码和OpenCV进行图像处理的效率(下)

            经过前面的讨论,我对Image类进行了优化,代码如下:

        //C#灰度图像处理类,作者:wmesci
        //http://blog.csdn.net/wmesci
        /// <summary>
        /// 8-bit single-channel image stored in a native buffer obtained from LocalAlloc.
        /// Every row is padded to a multiple of 4 bytes (<see cref="Stride"/>) so that the
        /// per-pixel operations can process 4 bytes per loop iteration.
        /// </summary>
        /// <remarks>
        /// The arithmetic methods operate on whole rows including the padding bytes and run
        /// one row per work item through Parallel.For. <see cref="Pointer"/> is a raw pointer
        /// into the native buffer and must not be used after the handle has been released.
        /// </remarks>
        unsafe class Image :CriticalHandle,  IDisposable
        {
            // LPTR = LMEM_FIXED | LMEM_ZEROINIT: a fixed block whose contents start out as zero.
            const uint LPTR = 0x0040;

            // The size parameters are SIZE_T in the native API, i.e. pointer-sized, hence UIntPtr.
            [DllImport("kernel32.dll", SetLastError = true)]
            static extern IntPtr LocalAlloc(uint flags, UIntPtr size);

            [DllImport("kernel32.dll", SetLastError = true)]
            static extern IntPtr LocalFree(IntPtr memBlock);

            [DllImport("kernel32.dll", EntryPoint = "RtlMoveMemory")]
            static extern void CopyMemory(void* dst, void* src, UIntPtr count);

            // memset is a C runtime export and therefore uses the cdecl calling convention.
            [DllImport("ntdll.dll", CallingConvention = CallingConvention.Cdecl)]
            static extern void* memset(void* dst, int value, UIntPtr size);

            const byte Max = 255;
            const byte Min = 0;

            /// <summary>Allocates a zero-filled image of the given size.</summary>
            /// <param name="width">Number of pixels per row; must be positive.</param>
            /// <param name="height">Number of rows; must be positive.</param>
            /// <exception cref="ArgumentOutOfRangeException">A dimension is zero or negative.</exception>
            /// <exception cref="OverflowException">The padded buffer size does not fit in an int.</exception>
            /// <exception cref="OutOfMemoryException">The native allocation failed.</exception>
            public Image(int width, int height)
                : base(IntPtr.Zero)
            {
                if (width <= 0)
                    throw new ArgumentOutOfRangeException("width");
                if (height <= 0)
                    throw new ArgumentOutOfRangeException("height");

                Width = width;
                Height = height;
                checked
                {
                    // Round the row length up to the next multiple of 4.
                    Stride = (width + 3) & ~3;
                    Length = Stride * Height;
                }

                IntPtr block = LocalAlloc(LPTR, new UIntPtr((uint)Length));
                if (block == IntPtr.Zero)
                    throw new OutOfMemoryException();
                base.SetHandle(block);

                Pointer = (byte*)handle.ToPointer();
            }

            /// <summary>Allocates an image and fills it from <paramref name="data"/>.</summary>
            /// <param name="data">
            /// Source buffer of at least <see cref="Length"/> bytes, laid out with this
            /// image's <see cref="Stride"/>.
            /// </param>
            public Image(int width, int height, byte* data)
                : this(width, height)
            {
                SetData(data);
            }

            /// <summary>Copies the whole pixel buffer (<see cref="Length"/> bytes) to <paramref name="dst"/>.</summary>
            /// <exception cref="ArgumentNullException"><paramref name="dst"/> is null.</exception>
            public void GetData(void* dst)
            {
                if (dst == null)
                    throw new ArgumentNullException("dst");
                CopyMemory(dst, Pointer, new UIntPtr((uint)Length));
            }

            /// <summary>Overwrites the whole pixel buffer with <see cref="Length"/> bytes read from <paramref name="src"/>.</summary>
            /// <exception cref="ArgumentNullException"><paramref name="src"/> is null.</exception>
            public void SetData(void* src)
            {
                if (src == null)
                    throw new ArgumentNullException("src");
                CopyMemory(Pointer, src, new UIntPtr((uint)Length));
            }

            /// <summary>Number of pixels per row.</summary>
            public readonly int Width;

            /// <summary>Number of rows.</summary>
            public readonly int Height;

            /// <summary>Size of the pixel buffer in bytes (Stride * Height).</summary>
            public readonly int Length;

            /// <summary>Bytes per row: Width rounded up to a multiple of 4.</summary>
            public readonly int Stride;

            /// <summary>Address of the first pixel of the first row.</summary>
            public readonly byte* Pointer;

            /// <summary>
            /// Reads or writes the pixel at column <paramref name="x"/>, row <paramref name="y"/>.
            /// No bounds checking is performed; the caller must pass coordinates inside the image.
            /// </summary>
            public byte this[int x, int y]
            {
                get
                {
                    return *(Pointer + y * Stride + x);
                }
                set
                {
                    *(Pointer + y * Stride + x) = value;
                }
            }

            /// <summary>Creates a new image with the same size and a copy of the pixel buffer.</summary>
            public Image Clone()
            {
                return new Image(Width, Height, Pointer);
            }

            /// <summary>Clamps an integer result to the 0..255 pixel range.</summary>
            private static byte Saturate(int value)
            {
                if (value < Min) return Min;
                if (value > Max) return Max;
                return (byte)value;
            }

            /// <summary>Clamps a floating-point result to the 0..255 pixel range; NaN maps to 0.</summary>
            private static byte Saturate(double value)
            {
                if (value >= Max) return Max;
                if (value > Min) return (byte)value;
                return Min;   // negative values, zero and NaN
            }

            /// <summary>
            /// Verifies that <paramref name="img"/> has at least as many rows and bytes per row
            /// as this image, so the row loops never read past the end of its buffer.
            /// </summary>
            private void EnsureOperandFits(Image img, string paramName)
            {
                if (img == null)
                    throw new ArgumentNullException(paramName);
                if (img.Stride < Stride || img.Height < Height)
                    throw new ArgumentException("Operand image must be at least as large as the target image.", paramName);
            }

            /// <summary>Adds <paramref name="img"/> to this image in place, saturating at 255.</summary>
            /// <exception cref="ArgumentNullException"><paramref name="img"/> is null.</exception>
            /// <exception cref="ArgumentException"><paramref name="img"/> is smaller than this image.</exception>
            public void Add(Image img)
            {
                EnsureOperandFits(img, "img");

                Action<int> act = y =>
                {
                    byte* p1 = Pointer + y * Stride, p2 = img.Pointer + y * img.Stride;
                    // Stride is a multiple of 4, so the loop is unrolled four bytes at a time.
                    for (int x = 0; x < Stride; x += 4, p1 += 4, p2 += 4)
                    {
                        p1[0] = Saturate(p1[0] + p2[0]);
                        p1[1] = Saturate(p1[1] + p2[1]);
                        p1[2] = Saturate(p1[2] + p2[2]);
                        p1[3] = Saturate(p1[3] + p2[3]);
                    }
                };
                Parallel.For(0, Height, act);
            }

            /// <summary>Subtracts <paramref name="img"/> from this image in place, saturating at 0.</summary>
            /// <exception cref="ArgumentNullException"><paramref name="img"/> is null.</exception>
            /// <exception cref="ArgumentException"><paramref name="img"/> is smaller than this image.</exception>
            public void Sub(Image img)
            {
                EnsureOperandFits(img, "img");

                Action<int> act = y =>
                {
                    byte* p1 = Pointer + y * Stride, p2 = img.Pointer + y * img.Stride;
                    for (int x = 0; x < Stride; x += 4, p1 += 4, p2 += 4)
                    {
                        p1[0] = Saturate(p1[0] - p2[0]);
                        p1[1] = Saturate(p1[1] - p2[1]);
                        p1[2] = Saturate(p1[2] - p2[2]);
                        p1[3] = Saturate(p1[3] - p2[3]);
                    }
                };
                Parallel.For(0, Height, act);
            }

            /// <summary>
            /// Multiplies this image by <paramref name="img"/> pixel by pixel, scales the product
            /// by <paramref name="scale"/> and stores the saturated result in place.
            /// </summary>
            /// <exception cref="ArgumentNullException"><paramref name="img"/> is null.</exception>
            /// <exception cref="ArgumentException"><paramref name="img"/> is smaller than this image.</exception>
            public void Mul(Image img, double scale)
            {
                EnsureOperandFits(img, "img");

                Action<int> act = y =>
                {
                    byte* p1 = Pointer + y * Stride, p2 = img.Pointer + y * img.Stride;
                    for (int x = 0; x < Stride; x += 4, p1 += 4, p2 += 4)
                    {
                        // The integer product is formed first so only one floating-point
                        // multiplication is needed per pixel.
                        p1[0] = Saturate(p1[0] * p2[0] * scale);
                        p1[1] = Saturate(p1[1] * p2[1] * scale);
                        p1[2] = Saturate(p1[2] * p2[2] * scale);
                        p1[3] = Saturate(p1[3] * p2[3] * scale);
                    }
                };
                Parallel.For(0, Height, act);
            }

            /// <summary>
            /// Binarizes the image in place: pixels below <paramref name="threshold"/> become 0,
            /// all others become 255.
            /// </summary>
            public void Threshold(byte threshold)
            {
                Action<int> act = y =>
                {
                    byte* p = Pointer + y * Stride;
                    for (int x = 0; x < Stride; x += 4, p += 4)
                    {
                        p[0] = p[0] < threshold ? Min : Max;
                        p[1] = p[1] < threshold ? Min : Max;
                        p[2] = p[2] < threshold ? Min : Max;
                        p[3] = p[3] < threshold ? Min : Max;
                    }
                };
                Parallel.For(0, Height, act);
            }

            /// <summary>
            /// Replaces every pixel with saturate((int)(this * a) + (int)(img * b)).
            /// </summary>
            /// <remarks>
            /// Both products are taken from 256-entry lookup tables, so each term is truncated
            /// to an integer before the two are added.
            /// </remarks>
            /// <param name="img">Second operand; must be at least as large as this image.</param>
            /// <param name="a">Weight applied to this image's pixels.</param>
            /// <param name="b">Weight applied to <paramref name="img"/>'s pixels.</param>
            /// <exception cref="ArgumentNullException"><paramref name="img"/> is null.</exception>
            /// <exception cref="ArgumentException"><paramref name="img"/> is smaller than this image.</exception>
            public void AddWeighted(Image img, double a, double b)
            {
                EnsureOperandFits(img, "img");

                // The tables live on this method's stack frame; that is safe because
                // Parallel.For does not return until every row has been processed.
                int* taba = stackalloc int[256];
                int* tabb = stackalloc int[256];
                for (int i = 0; i < 256; i++)
                {
                    taba[i] = (int)(i * a);
                    tabb[i] = (int)(i * b);
                }

                Action<int> act = y =>
                {
                    byte* p1 = this.Pointer + y * this.Stride, p2 = img.Pointer + y * img.Stride;
                    for (int x = 0; x < this.Stride; x += 4, p1 += 4, p2 += 4)
                    {
                        // The second operand must be weighted by b, i.e. looked up in tabb.
                        p1[0] = Saturate(taba[p1[0]] + tabb[p2[0]]);
                        p1[1] = Saturate(taba[p1[1]] + tabb[p2[1]]);
                        p1[2] = Saturate(taba[p1[2]] + tabb[p2[2]]);
                        p1[3] = Saturate(taba[p1[3]] + tabb[p2[3]]);
                    }
                };
                Parallel.For(0, this.Height, act);
            }

            /// <summary>
            /// Box filter with a (2n+1) x (2n+1) window, written to <paramref name="dst"/>.
            /// Coordinates outside the image are clamped to the nearest edge pixel.
            /// </summary>
            /// <remarks>
            /// The filter is separable: a horizontal running sum per row is stored in a
            /// temporary int buffer, then a vertical running sum per column produces the
            /// output. <paramref name="src"/> is fully read before <paramref name="dst"/> is
            /// written, so both may refer to the same image.
            /// </remarks>
            /// <param name="src">Image to smooth.</param>
            /// <param name="dst">Receives the result; needs at least src's stride and height.</param>
            /// <param name="n">Window radius; 0 copies the image.</param>
            /// <exception cref="ArgumentNullException"><paramref name="src"/> or <paramref name="dst"/> is null.</exception>
            /// <exception cref="ArgumentOutOfRangeException"><paramref name="n"/> is negative.</exception>
            /// <exception cref="ArgumentException"><paramref name="dst"/> is smaller than <paramref name="src"/>.</exception>
            public static void Smooth(Image src, Image dst, int n)
            {
                if (src == null)
                    throw new ArgumentNullException("src");
                if (dst == null)
                    throw new ArgumentNullException("dst");
                if (n < 0)
                    throw new ArgumentOutOfRangeException("n");
                if (dst.Stride < src.Stride || dst.Height < src.Height)
                    throw new ArgumentException("Destination image must be at least as large as the source image.", "dst");

                int width = src.Width;
                int height = src.Height;
                int stride = src.Stride;
                int dstStride = dst.Stride;
                byte* srcBase = src.Pointer;
                byte* dstBase = dst.Pointer;

                // One int per source byte; the size is computed in 64 bits to avoid overflow.
                IntPtr buffer = Marshal.AllocHGlobal(new IntPtr((long)src.Length * sizeof(int)));
                try
                {
                    int* tmp = (int*)buffer.ToPointer();

                    // Horizontal pass: tmp[y, x] = sum of the row's pixels in [x - n, x + n].
                    // Indices are clamped to Width (not Stride) so padding bytes never
                    // contribute to the sums.
                    Action<int> horizontal = y =>
                    {
                        byte* row = srcBase + y * stride;
                        int i = y * stride;

                        int sum = 0;
                        for (int k = -n; k <= n; k++)
                            sum += row[Clamp(k, width)];
                        tmp[i] = sum;

                        // Slide the window: add the pixel entering on the right and
                        // remove the one leaving on the left.
                        for (int x = 1; x < stride; x++)
                        {
                            int leaving = Clamp(x - n - 1, width);
                            int entering = Clamp(x + n, width);
                            sum += row[entering] - row[leaving];
                            tmp[++i] = sum;
                        }
                    };
                    Parallel.For(0, height, horizontal);

                    // Normalization factor 1 / (2n + 1)^2.
                    double f = 1.0 / (2 * n + 1);
                    f *= f;

                    // Vertical pass: running sum of tmp down each column, scaled into dst.
                    Action<int> vertical = x =>
                    {
                        int sum = 0;
                        for (int k = -n; k <= n; k++)
                            sum += tmp[x + Clamp(k, height) * stride];

                        byte* q = dstBase + x;
                        *q = (byte)(sum * f);
                        q += dstStride;

                        // Rows of dst are stepped with dst's own stride.
                        for (int y = 1; y < height; y++, q += dstStride)
                        {
                            int leaving = Clamp(y - n - 1, height);
                            int entering = Clamp(y + n, height);
                            sum += tmp[x + entering * stride] - tmp[x + leaving * stride];
                            *q = (byte)(sum * f);
                        }
                    };
                    Parallel.For(0, stride, vertical);
                }
                finally
                {
                    // Release the temporary buffer even if a pass throws.
                    Marshal.FreeHGlobal(buffer);
                }
            }

            /// <summary>Limits <paramref name="i"/> to the range 0 .. max - 1.</summary>
            private static int Clamp(int i, int max)
            {
                if (i < 0) return 0;
                if (i >= max) return max - 1;
                return i;
            }

            /// <summary>True while no native buffer is attached to the handle.</summary>
            public override bool IsInvalid
            {
                get { return handle == IntPtr.Zero; }
            }

            /// <summary>Frees the native buffer.</summary>
            /// <returns>true when LocalFree succeeded (it returns NULL on success).</returns>
            protected override bool ReleaseHandle()
            {
                return LocalFree(handle) == IntPtr.Zero;
            }
        }

            主要修改的地方如下:

            1、将图像的每一行4字节对齐,增加Stride属性,其值等于Width向上取最近的4的倍数,然后在所有的for循环里,每次操作4个字节。这样一来,减少了循环次数。

            2、减少浮点运算

                    A:Add/Sub方法中的临时变量d改为int型

                    B:Mul方法中,调整运算顺序,由scale * *p1 * *p2改为p1[0] * p2[0] * scale,区别在于,前一种先算scale * *p1,是一个浮点乘法,其结果也是浮点数,然后再算和*p2的乘积,共两次浮点乘法;而后一种先算p1[0] * p2[0],这是一次整数乘法,然后再算和scale的积,共一次整数乘法一次浮点乘法。由于浮点乘法比整数乘法慢,因此效率会有所提高。

        3、AddWeighted改为使用查表法进行运算,首先算出0~255这256个数和a、b的积,放在数组taba、tabb中,其后的循环中只需查表再相加即可,效率大幅提高!


    下面是优化后的测试结果(数值表示Image类方法和对应的OpenCV方法执行时间之比):

    CPU:AMD Athlon(tm) II X4 640 3.00GHz (四核)
    样本:600 X 896
    -------------------------------------
    Add         1.446  1.315
    Sub         1.171  1.109
    Mul         0.651  0.580
    Threshold   1.511  1.432
    Smooth      0.938  0.908
    AddWeighted 0.528  0.474

    CPU:AMD Athlon(tm) II X4 640 3.00GHz (四核)
    样本:1600 X 1200
    -------------------------------------
    Add         1.041  1.052
    Sub         0.910  0.906
    Mul         0.562  0.558
    Threshold   1.277  1.236
    Smooth      1.020  1.024
    AddWeighted 0.462  0.461

    CPU:AMD Athlon(tm) II X2 245 2.91GHz (双核)
    样本:1600 X 1200
    -------------------------------------
    Add         1.514  1.533
    Sub         1.225  1.163
    Mul         1.085  1.095
    Threshold   1.643  1.630
    Smooth      1.847  1.867
    AddWeighted 0.957  0.924

    CPU:AMD Athlon(tm) II X2 245 2.91GHz (双核)
    样本:600 X 896
    -------------------------------------
    Add         2.559  2.073  2.676
    Sub         2.240  1.784  1.856
    Mul         1.261  1.352  1.284
    Threshold   2.453  2.511  3.101
    Smooth      1.660  1.647  1.663
    AddWeighted 0.978  1.017  0.961

    CPU:Intel Core i3 M330 2.13GHz  (双核四线程)
    样本:1600 X 1200
    -------------------------------------
    Add         2.611
    Sub         2.545
    Mul         1.011
    Threshold   2.882
    Smooth      1.891
    AddWeighted 0.525

    CPU:Intel Core i3 M330 2.13GHz  (双核四线程)
    样本:600 X 896
    -------------------------------------
    Add         4.483
    Sub         3.576
    Mul         1.101
    Threshold   5.953
    Smooth      2.029
    AddWeighted 0.581

    CPU:Intel Core i7 2360QM 2.00GHz  (四核八线程)
    样本:600 X 896
    -------------------------------------
    Add         1.080  1.020
    Sub         0.977  1.010
    Mul         0.575  0.558
    Threshold   0.842  0.898
    Smooth      1.447  1.386
    AddWeighted 0.325  0.366

    CPU:Intel Core i7 2360QM 2.00GHz  (四核八线程)
    样本:1600 X 1200
    -------------------------------------
    Add         1.420
    Sub         1.134
    Mul         0.535
    Threshold   0.878
    Smooth      1.379
    AddWeighted 0.325


        分析以上数据,我们不难发现以下几点:

        1、样本大小相同时,CPU核心数越多,Image/OpenCV就越小,这说明了多线程算法在多核CPU下的优势。

        2、CPU相同时,样本大小越大,比值越小。

        3、OpenCV针对Intel CPU使用IPP进行了优化,因此在Intel CPU上跑,比值会比在AMD CPU上大很多。

        4、OpenCV里使用SSE优化过的方法,用C#实现时差距比较明显,怎么才能达到差不多的效率,这个暂时还没想到。但是OpenCV里没使用SSE优化的方法,如Mul、AddWeighted,使用C#完全可以达到相同的性能,甚至超过OpenCV,如AddWeighted,在双核CPU上也比OpenCV要快,在4核以上CPU上远超OpenCV!!

        5、使用自写方法替代OpenCV是完全可行的!!


        以上测试有几点需要说明:

        1、CLR是在第一次运行某个方法时才进行编译,因此第一次执行某个方法时会慢很多,在计算时间时要排除第一次执行的时间。

        2、C#调用OpenCV需要经过封送处理,但封送处理所消耗的时间在这里无法避免。


        各位看官如有其它优化修改意见,还望不吝赐教!!!同时我会不断对这个类进行修改以及添加其它图像处理方法,对图像处理、OpenCV以及C#代码优化感兴趣的同学,请关注本贴!!

  • 相关阅读:
    WPF-触发器
    WPF使用socket实现简单聊天软件
    git常用命令备忘
    (转载)WPF中的动画——(一)基本概念
    WPF中的依赖项属性
    C#中的索引器
    C#中的装箱拆箱
    编程语言的弱类型、强类型、动态类型、静态类型
    WPF中的数据驱动
    WPF中的命令简介
  • 原文地址:https://www.cnblogs.com/wmesci/p/2736006.html
Copyright © 2011-2022 走看看