zoukankan      html  css  js  c++  java
  • 快速遍历OpenCV Mat图像数据的多种方法和性能分析 | opencv mat for loop

    本文首发于个人博客https://kezunlin.me/post/61d55ab4/,欢迎阅读!

    opencv mat for loop

    Series

    Guide

    Mat

    • for gray image, use type <uchar>
    • for RGB color image,use type <Vec3b>

    gray format storage
    gray

    color format storage: BGR
    BGR

    we can use method isContinuous() to judge whether the memory buffer is continuous or not.

    color space reduction

    uchar color_space_reduction(uchar pixel)
    {
    	/*
    	0-9 ===>0
    	10-19===>10
    	20-29===>20
    	...
    	240-249===>24
    	250-255===>25
    
    	map from 256*256*256===>26*26*26
    	*/
    
    	int divideWith = 10;
    	uchar new_pixel = (pixel / divideWith)*divideWith;
    	return new_pixel;
    }
    

    color table

    void get_color_table()
    {
    	// cache color value in table[256]
    	int divideWith = 10;
    	uchar table[256];
    	for (int i = 0; i < 256; ++i)
    		table[i] = divideWith* (i / divideWith);
    }
    

    C++

    ptr []

    // C ptr []: faster but not safe
    Mat& ScanImageAndReduce_Cptr(Mat& I, const uchar* const table)
    {
    	// accept only char type matrices
    	CV_Assert(I.depth() != sizeof(uchar));
    	int channels = I.channels();
    	int nRows = I.rows;
    	int nCols = I.cols* channels;
    	if (I.isContinuous())
    	{
    		nCols *= nRows;
    		nRows = 1;
    	}
    	int i, j;
    	uchar* p;
    	for (i = 0; i < nRows; ++i)
    	{
    		p = I.ptr<uchar>(i);
    		for (j = 0; j < nCols; ++j)
    		{
    			p[j] = table[p[j]];
    		}
    	}
    	return I;
    }
    

    ptr ++

    // C ptr ++: faster but not safe
    Mat& ScanImageAndReduce_Cptr2(Mat& I, const uchar* const table)
    {
    	// accept only char type matrices
    	CV_Assert(I.depth() != sizeof(uchar));
    	int channels = I.channels();
    	int nRows = I.rows;
    	int nCols = I.cols* channels;
    	if (I.isContinuous())
    	{
    		nCols *= nRows;
    		nRows = 1;
    	}
    	uchar* start = I.ptr<uchar>(0); // same as I.ptr<uchar>(0,0)
    	uchar* end = start + nRows * nCols;
    	for (uchar* p=start; p < end; ++p)
    	{
    		*p = table[*p];
    	}
    	return I;
    }
    

    at(i,j)

    // at<uchar>(i,j): random access, slow
    Mat& ScanImageAndReduce_atRandomAccess(Mat& I, const uchar* const table)
    {
       // accept only char type matrices
       CV_Assert(I.depth() != sizeof(uchar));
       const int channels = I.channels();
       switch (channels)
       {
       case 1:
       {
       	for (int i = 0; i < I.rows; ++i)
       		for (int j = 0; j < I.cols; ++j)
       			I.at<uchar>(i, j) = table[I.at<uchar>(i, j)];
       	break;
       }
       case 3:
       {
       	Mat_<Vec3b> _I = I;
    
       	for (int i = 0; i < I.rows; ++i)
       		for (int j = 0; j < I.cols; ++j)
       		{
       			_I(i, j)[0] = table[_I(i, j)[0]];
       			_I(i, j)[1] = table[_I(i, j)[1]];
       			_I(i, j)[2] = table[_I(i, j)[2]];
       		}
       	I = _I;
       	break;
       }
       }
       return I;
    }
    
    

    Iterator

    // MatIterator_<uchar>: safe but slow
    Mat& ScanImageAndReduce_Iterator(Mat& I, const uchar* const table)
    {
       // accept only char type matrices
       CV_Assert(I.depth() != sizeof(uchar));
       const int channels = I.channels();
       switch (channels)
       {
       case 1:
       {
       	MatIterator_<uchar> it, end;
       	for (it = I.begin<uchar>(), end = I.end<uchar>(); it != end; ++it)
       		*it = table[*it];
       	break;
       }
       case 3:
       {
       	MatIterator_<Vec3b> it, end;
       	for (it = I.begin<Vec3b>(), end = I.end<Vec3b>(); it != end; ++it)
       	{
       		(*it)[0] = table[(*it)[0]];
       		(*it)[1] = table[(*it)[1]];
       		(*it)[2] = table[(*it)[2]];
       	}
       }
       }
       return I;
    }
    

    opencv LUT

    // LUT
    Mat& ScanImageAndReduce_LUT(Mat& I, const uchar* const table)
    {
       Mat lookUpTable(1, 256, CV_8U);
       uchar* p = lookUpTable.data;
       for (int i = 0; i < 256; ++i)
       	p[i] = table[i];
    
       cv::LUT(I, lookUpTable, I);
       return I;
    }
    

    forEach

    forEach method of the Mat class that utilizes all the cores on your machine to apply any function at every pixel.

    // Parallel execution with function object.
    struct ForEachOperator
    {
    	uchar m_table[256];
    	ForEachOperator(const uchar* const table)
    	{
    		for (size_t i = 0; i < 256; i++)
    		{
    			m_table[i] = table[i];
    		}
    	}
    
    	void operator ()(uchar& p, const int * position) const
    	{
    		// Perform a simple operation
    		p = m_table[p];
    	}
    };
    
    // forEach use multiple processors, very fast
    Mat& ScanImageAndReduce_forEach(Mat& I, const uchar* const table)
    {
    	I.forEach<uchar>(ForEachOperator(table));
    	return I;
    }
    

    forEach with lambda

    // forEach lambda use multiple processors, very fast (lambda slower than ForEachOperator)
    Mat& ScanImageAndReduce_forEach_with_lambda(Mat& I, const uchar* const table)
    {
    	I.forEach<uchar>
    	(
    		[=](uchar &p, const int * position) -> void
    		{
    			p = table[p];
    		}
    	);
    	return I;
    }
    

    time cost

    no foreach

    [1 Cptr   ] times=5000, total_cost=988 ms, avg_cost=0.1976 ms
    [1 Cptr2  ] times=5000, total_cost=1704 ms, avg_cost=0.3408 ms
    [2 atRandom] times=5000, total_cost=9611 ms, avg_cost=1.9222 ms
    [3 Iterator] times=5000, total_cost=20195 ms, avg_cost=4.039 ms
    [4 LUT    ] times=5000, total_cost=899 ms, avg_cost=0.1798 ms
    
    [1 Cptr   ] times=10000, total_cost=2425 ms, avg_cost=0.2425 ms
    [1 Cptr2  ] times=10000, total_cost=3391 ms, avg_cost=0.3391 ms
    [2 atRandom] times=10000, total_cost=20024 ms, avg_cost=2.0024 ms
    [3 Iterator] times=10000, total_cost=39980 ms, avg_cost=3.998 ms
    [4 LUT    ] times=10000, total_cost=103 ms, avg_cost=0.0103 ms
    

    foreach

    [5 forEach     ] times=200000, total_cost=199 ms, avg_cost=0.000995 ms
    [5 forEach lambda] times=200000, total_cost=521 ms, avg_cost=0.002605 ms
    
    [5 forEach     ] times=20000, total_cost=17 ms, avg_cost=0.00085 ms
    [5 forEach lambda] times=20000, total_cost=23 ms, avg_cost=0.00115 ms
    

    results

    Loop Type | Time Cost (us)
    :----: |
    ptr [] | 242
    ptr ++ | 339
    at | 2002
    iterator | 3998
    LUT | 10
    forEach | 0.85
    forEach lambda | 1.15

    forEach is 10x times faster than LUT, 240~340x times faster than ptr [] and ptr ++, and 2000~4000x times faster than at and iterator.

    code

    code here

    Python

    pure python

    # import the necessary packages
    import matplotlib.pyplot as plt
    import cv2
    print(cv2.__version__)
    
    %matplotlib inline
    
    3.4.2
    
    # load the original image, convert it to grayscale, and display
    # it inline
    image = cv2.imread("cat.jpg")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    print(image.shape)
    #plt.imshow(image, cmap="gray")
    
    (360, 480)
    
    %load_ext cython
    
    The cython extension is already loaded. To reload it, use:
      %reload_ext cython
    
    %%cython -a
     
    def threshold_python(T, image):
        # grab the image dimensions
        h = image.shape[0]
        w = image.shape[1]
        
        # loop over the image, pixel by pixel
        for y in range(0, h):
            for x in range(0, w):
                # threshold the pixel
                image[y, x] = 255 if image[y, x] >= T else 0
                
        # return the thresholded image
        return image
    
    %timeit threshold_python(5, image)
    
    263 ms ± 20.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    

    cython

    %%cython -a
     
    import cython
     
    @cython.boundscheck(False)
    cpdef unsigned char[:, :] threshold_cython(int T, unsigned char [:, :] image):
        # set the variable extension types
        cdef int x, y, w, h
        
        # grab the image dimensions
        h = image.shape[0]
        w = image.shape[1]
        
        # loop over the image
        for y in range(0, h):
            for x in range(0, w):
                # threshold the pixel
                image[y, x] = 255 if image[y, x] >= T else 0
        
        # return the thresholded image
        return image
    

    numba

    %timeit threshold_cython(5, image)
    
    150 µs ± 7.14 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    
    from numba import njit
    
    @njit
    def threshold_njit(T, image):
        # grab the image dimensions
        h = image.shape[0]
        w = image.shape[1]
        
        # loop over the image, pixel by pixel
        for y in range(0, h):
            for x in range(0, w):
                # threshold the pixel
                image[y, x] = 255 if image[y, x] >= T else 0
                
        # return the thresholded image
        return image
    
    %timeit threshold_njit(5, image)
    
    43.5 µs ± 142 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    

    numpy

    def threshold_numpy(T, image):
        image[image > T] = 255
        return image
    
    %timeit threshold_numpy(5, image)
    
    111 µs ± 334 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    

    conclusions

    image = cv2.imread("cat.jpg")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    print(image.shape)
    
    %timeit threshold_python(5, image)
    %timeit threshold_cython(5, image)
    %timeit threshold_njit(5, image)
    %timeit threshold_numpy(5, image)
    
    (360, 480)
    251 ms ± 6.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    143 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    43.8 µs ± 284 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    113 µs ± 957 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    
    image = cv2.imread("big.jpg")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    print(image.shape)
    
    %timeit threshold_python(5, image)
    %timeit threshold_cython(5, image)
    %timeit threshold_njit(5, image)
    %timeit threshold_numpy(5, image)
    
    (2880, 5120)
    21.8 s ± 460 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    12.3 ms ± 231 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    3.91 ms ± 66.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    10.3 ms ± 179 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    

    60,480

    • python: 251 ms
    • cython: 143 us
    • numba: 43 us
    • numpy: 113 us

    2880, 5120

    • python: 21 s
    • cython: 12 ms
    • numba: 4 ms
    • numpy: 10 ms

    Reference

    History

    • 20180823: created.

    Copyright

  • 相关阅读:
    DLUTOJ 1209 字典序和r-子集
    C++ Standard-Library Random Numbers
    后缀数组模板
    UVA 1398 Meteor
    STL Iterators
    hihocoder1187 Divisors
    51nod 子序列的个数(动态规划)
    51nod 最长单增子序列(动态规划)
    51nod 更难的矩阵取数问题(动态规划)
    51nod 多重背包问题(动态规划)
  • 原文地址:https://www.cnblogs.com/kezunlin/p/11880864.html
Copyright © 2011-2022 走看看