zoukankan      html  css  js  c++  java
  • (原)测试intel的并行计算parfor

    转载请注明出处:

    http://www.cnblogs.com/darkknightzh/p/4988264.html

    参考网址:

    关于mt19937:http://www.cnblogs.com/egmkang/archive/2012/09/06/2673253.html

    代码如下:

      1 #include "stdafx.h"
      2 #include <iostream>
      3 #include <random>     // mt19937的头文件
      4 #include <ppl.h>      // parfor的头文件
      5 #include <windows.h>  // QueryPerformanceFrequency等函数的头文件
      6 
      7 using namespace concurrency; // parfor使用
      8 using namespace std;
      9 
     10 
     // Allocate a zero-filled n x n matrix of doubles and store it in *m.
     //
     // m: out-parameter; *m is overwritten with the newly allocated array.
     //    Whatever *m pointed to before is not freed here.
     // n: matrix dimension; n*n elements are allocated.
     //
     // The storage comes from new[], so it has to be released with delete[].
     void AllocMatrix(double** m, size_t n)
     {
         // The trailing "()" value-initializes every element to 0.0, so no
         // separate memset pass (and no <cstring> dependency) is needed.
         *m = new double[n*n]();
     }
     17 
     18 
     // Fill an n x n matrix with values produced by a generator.
     //
     // m:   destination buffer; n*n elements are written.
     // n:   matrix dimension.
     // gen: callable invoked once per element, in storage order; each result is
     //      converted to double before being stored.
     template <class Gen>
     void IniMatrix(double* m, size_t n, Gen& gen)
     {
         double* out = m;
         for (size_t row = 0; row < n; ++row)
         {
             for (size_t col = 0; col < n; ++col)
             {
                 // out walks the buffer in the same order as index row*n + col.
                 *out++ = static_cast<double>(gen());
             }
         }
     }
     31 
     32 
     // Release a matrix that was allocated with new[] and reset the caller's
     // pointer to nullptr.
     //
     // m: address of the pointer to release. m itself is dereferenced
     //    unconditionally; *m may be nullptr, in which case nothing is freed.
     void FreeMatrix(double** m)
     {
         // delete[] on a null pointer is a no-op, so no explicit guard is needed.
         delete[] *m;
         *m = nullptr;
     }
     42 
     43 
     // Matrix multiplication with plain sequential for loops.
     //
     // Only the upper triangle is computed: for every i and every j >= i,
     //   res[i*n + j] = sum over k of m1[i*n + k] * m2[k*n + j].
     // Entries with j < i are not written and keep whatever res held before.
     //
     // res, m1, m2: n x n matrices stored row by row in flat arrays.
     void matrixMultiplyFor(double* res, const double* m1, const double* m2, size_t n)
     {
         for (size_t row = 0; row < n; ++row)
         {
             const double* lhsRow = m1 + row * n;
             double* outRow = res + row * n;

             // Start at the diagonal so the amount of work shrinks row by row.
             for (size_t col = row; col < n; ++col)
             {
                 double acc = 0;
                 for (size_t k = 0; k < n; ++k)
                 {
                     acc += lhsRow[k] * m2[k * n + col];
                 }
                 outRow[col] = acc;
             }
         }
     }
     60 
     61 
     62 // 矩阵相乘,外层使用parfor
     63 void matrixMultiplyParForOuter(double* res, const double* m1, const double* m2, size_t n)
     64 {
     65     parallel_for(size_t(0), n, [&](size_t i)
     66     {
     67         for (size_t j = i; j < n; j++)
     68         {
     69             double temp = 0;
     70             for (size_t k = 0; k < n; k++)
     71             {
     72                 temp += m1[i * n + k] * m2[k * n + j];
     73             }
     74             res[i*n + j] = temp;
     75         }
     76     });
     77 }
     78 
     79 
     80 // 矩阵相乘,内层使用parfor
     81 void matrixMultiplyParForInner(double* res, const double* m1, const double* m2, size_t n)
     82 {
     83     for (size_t i = 0; i < n; i++)
     84     {
     85         parallel_for(size_t(i), n, [&](size_t j)
     86         {
     87             double temp = 0;
     88             for (size_t k = 0; k < n; k++)
     89             {
     90                 temp += m1[i * n + k] * m2[k * n + j];
     91             }
     92             res[i*n + j] = temp;
     93         });
     94     }
     95 }
     96 
     97 
     98 // 测试矩阵相乘,使用for的时间
     99 double testmatrixMultiplyFor(double* res, const double* m1, const double* m2, size_t n)
    100 {
    101     LARGE_INTEGER nFreq, nBeginTime, nEndTime;
    102     QueryPerformanceFrequency(&nFreq);
    103     QueryPerformanceCounter(&nBeginTime);
    104 
    105     matrixMultiplyFor(res, m1, m2, n);
    106 
    107     QueryPerformanceCounter(&nEndTime);
    108     return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart;
    109 }
    110 
    111 
    112 // 测试矩阵相乘,外层使用parfor的时间
    113 double testmatrixMultiplyParForOuter(double* res, const double* m1, const double* m2, size_t n)
    114 {
    115     LARGE_INTEGER nFreq, nBeginTime, nEndTime;
    116     QueryPerformanceFrequency(&nFreq);
    117     QueryPerformanceCounter(&nBeginTime);
    118 
    119     matrixMultiplyParForOuter(res, m1, m2, n);
    120 
    121     QueryPerformanceCounter(&nEndTime);
    122     return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart;
    123 }
    124 
    125 
    126 // 测试矩阵相乘,内层使用parfor的时间
    127 double testmatrixMultiplyParForInner(double* res, const double* m1, const double* m2, size_t n)
    128 {
    129     LARGE_INTEGER nFreq, nBeginTime, nEndTime;
    130     QueryPerformanceFrequency(&nFreq);
    131     QueryPerformanceCounter(&nBeginTime);
    132 
    133     matrixMultiplyParForInner(res, m1, m2, n);
    134 
    135     QueryPerformanceCounter(&nEndTime);
    136     return (double)(nEndTime.QuadPart - nBeginTime.QuadPart) * 1000 / (double)nFreq.QuadPart;
    137 }
    138 
    139 
    140 // 主函数
    141 int _tmain(int argc, _TCHAR* argv[])
    142 {
    143     const size_t n = 1024;
    144     double* dM1 = NULL;
    145     double* dM2 = NULL;
    146     double* dRes1 = NULL;
    147     double* dRes2 = NULL;
    148     double* dRes3 = NULL;
    149 
    150     random_device rd;
    151     mt19937 gen(rd());
    152 
    153     AllocMatrix(&dM1, n);
    154     AllocMatrix(&dM2, n);
    155     IniMatrix(dM1, n, gen);
    156     IniMatrix(dM2, n, gen);
    157 
    158     AllocMatrix(&dRes1, n);
    159     AllocMatrix(&dRes2, n);
    160     AllocMatrix(&dRes3, n);
    161     
    162     double dTimeFor = testmatrixMultiplyFor(dRes1, dM1, dM2, n);
    163     double dTimeParForOuter = testmatrixMultiplyParForOuter(dRes2, dM1, dM2, n);
    164     double dTimeParForInner = testmatrixMultiplyParForInner(dRes3, dM1, dM2, n);
    165 
    166     printf("time(ms)
    for: %f 
    parforOunter: %f 
    parforInner: %f
    ", dTimeFor, dTimeParForOuter, dTimeParForInner);
    167 
    168     FreeMatrix(&dM1);
    169     FreeMatrix(&dM2);
    170     FreeMatrix(&dRes1);
    171     FreeMatrix(&dRes2);
    172     FreeMatrix(&dRes3);
    173 
    174     return 0;
    175 }

    debug

    time(ms)

    for: 7761.769099

    parforOunter: 3416.670736

    parforInner: 3423.701265

    release

    time(ms)

    for: 3884.167485

    parforOunter: 1062.581817

    parforInner: 1083.642302

    说明:此处测试outer和inner是因为,在matlab里面,对outer(外层)循环使用parfor做并行计算时,如果循环的工作量类似这种三角形式,有些核先跑完,有些核后跑完,导致出现一个核累死累活地跑程序、另外N-1个核围观的状态,使最终的计算时间变长(不过在matlab中未测试outer和inner使用parfor的时间对比)。

    但是,在C++里面,不知道是否是优化的原因,outer使用parfor比inner使用parfor要快。此处还测试了n=2048,结果也是outer比inner的形式要快。

  • 相关阅读:
    collections工具类 排序
    API text|lang
    异常处理
    extends继承
    接口
    static修饰符
    多态与find关键词
    Markdown语法
    Hexo | (五)Yilia主题优化
    Hexo | (四)多机同步更新博客
  • 原文地址:https://www.cnblogs.com/darkknightzh/p/4988264.html
Copyright © 2011-2022 走看看