  • Installing and Using Intel MKL on Linux

    1. Download

    https://software.intel.com/en-us/mkl

    The downloaded file will be named something like l_mkl_2017.3.196.tgz.

    2. Install

    1) Extract the archive.

    I extracted it to /opt/.

    2) Run the installer (as root): # ./install.sh

    3) Create a file named intel-mkl.conf under /etc/ld.so.conf.d with the following contents:

    /opt/intel/mkl/lib/intel64
    /opt/intel/lib/intel64

    Then run # ldconfig so the dynamic linker picks up the new library paths.

    4) Run

    $ . /opt/intel/mkl/bin/mklvars.sh intel64 ilp64

    (Note the leading dot: the script must be sourced, not executed, so that the environment variables it sets persist in the current shell.)

    See: https://software.intel.com/en-us/mkl-linux-developer-guide-scripts-to-set-environment-variables
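
    To confirm that the installation and the environment are usable, here is a quick sanity check of my own (not from the Intel docs) that prints the MKL version string; it assumes the single-library mkl_rt link line introduced in section 3 below:

    #include <stdio.h>
    #include "mkl.h"

    int main(void)
    {
        char buf[256];    /* receives the MKL version banner */
        mkl_get_version_string(buf, sizeof(buf));
        printf("%s\n", buf);
        return 0;
    }

    Compile it with $ gcc mkl_version.c -lmkl_rt and run ./a.out; it should print the version banner of the installed MKL.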

    3. Usage

    As an example, let's compile dgemm_example.c from the official documentation:

    #define min(x,y) (((x) < (y)) ? (x) : (y))
    #include <stdio.h>
    #include <stdlib.h>
    #include "mkl.h"

    int main()
    {
        double *A, *B, *C;
        int m, n, p, i, j;
        double alpha, beta;

        printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
                " Intel(R) MKL function dgemm, where A, B, and  C are matrices and \n"
                " alpha and beta are double precision scalars\n\n");

        m = 2000, p = 200, n = 1000;
        printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
                " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
        alpha = 1.0; beta = 0.0;
        printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
                " performance \n\n");
        A = (double *)mkl_malloc( m*p*sizeof( double ), 64 );
        B = (double *)mkl_malloc( p*n*sizeof( double ), 64 );
        C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
        if (A == NULL || B == NULL || C == NULL) {
            printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
            mkl_free(A);
            mkl_free(B);
            mkl_free(C);
            return 1;
        }

        printf (" Initializing matrix data \n\n");
        for (i = 0; i < (m*p); i++) {
            A[i] = (double)(i+1);
        }

        for (i = 0; i < (p*n); i++) {
            B[i] = (double)(-i-1);
        }

        for (i = 0; i < (m*n); i++) {
            C[i] = 0.0;
        }

        printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 
                    m, n, p, alpha, A, p, B, n, beta, C, n);
        printf ("\n Computations completed.\n\n");

        printf (" Top left corner of matrix A: \n");
        for (i=0; i<min(m,6); i++) {
            for (j=0; j<min(p,6); j++) {
                printf ("%12.0f", A[j+i*p]);
            }
            printf ("\n");
        }

        printf ("\n Top left corner of matrix B: \n");
        for (i=0; i<min(p,6); i++) {
            for (j=0; j<min(n,6); j++) {
                printf ("%12.0f", B[j+i*n]);
            }
            printf ("\n");
        }

        printf ("\n Top left corner of matrix C: \n");
        for (i=0; i<min(m,6); i++) {
            for (j=0; j<min(n,6); j++) {
                printf ("%12.5G", C[j+i*n]);
            }
            printf ("\n");
        }

        printf ("\n Deallocating memory \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);

        printf (" Example completed. \n\n");
        return 0;
    }

    (Code download: https://software.intel.com/en-us/product-code-samples)

    The compile command is:

    $ gcc -I/opt/intel/mkl/include dgemm_example.c -lmkl_core -lmkl_intel_lp64 -lmkl_intel_thread -liomp5 -lpthread -lm -L/opt/intel/mkl/lib/intel64 -L/opt/intel/lib/intel64

    (Note that -lmkl_intel_lp64 selects the LP64 interface with 32-bit integers. Step 2 above sourced mklvars.sh with the ilp64 argument; if you actually want ILP64, link -lmkl_intel_ilp64 and compile with -DMKL_ILP64 instead, and keep the two choices consistent.)

    or:

    $ gcc -I/opt/intel/mkl/include dgemm_example.c -lmkl_rt -L/opt/intel/mkl/lib/intel64 -L/opt/intel/lib/intel64

    or, alternatively:

    $ . /opt/intel/bin/compilervars.sh intel64

    $ gcc dgemm_example.c -lmkl_rt

    This last approach works because the first command sets the environment variables CPATH, LD_LIBRARY_PATH, and LIBRARY_PATH, which is how the compiler and linker find the required header and library files. (When compiling C, the compiler searches the directories listed in C_INCLUDE_PATH for headers; for C++ it searches CPLUS_INCLUDE_PATH; both C and C++ search CPATH.)
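
    To see the row-major dgemm conventions from the example above in isolation, here is a small hand-written check of my own (not an Intel sample) that multiplies fixed 2x3 and 3x2 matrices; with CblasRowMajor, the leading dimensions lda, ldb, and ldc are the column counts of A, B, and C:

    #include <stdio.h>
    #include "mkl.h"

    int main(void)
    {
        /* C(2x2) = 1.0 * A(2x3) * B(3x2) + 0.0 * C, all row-major */
        double A[2*3] = { 1, 2, 3,
                          4, 5, 6 };
        double B[3*2] = { 7,  8,
                          9, 10,
                         11, 12 };
        double C[2*2];

        /* m=2, n=2, k=3; lda=3 (cols of A), ldb=2 (cols of B), ldc=2 (cols of C) */
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    2, 2, 3, 1.0, A, 3, B, 2, 0.0, C, 2);

        printf("%6.1f %6.1f\n%6.1f %6.1f\n", C[0], C[1], C[2], C[3]);
        /* expected output: 58.0 64.0 / 139.0 154.0 */
        return 0;
    }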

    For the different ways of linking your application with the MKL libraries, see: https://software.intel.com/en-us/mkl-linux-developer-guide-linking-your-application-with-the-intel-math-kernel-library

    (Further reading: https://software.intel.com/en-us/get-started-with-mkl-for-linux and https://software.intel.com/en-us/mkl-linux-developer-guide)

    Postscript:

    Intel's MPI library is installed the same way as MKL; after sourcing compilervars.sh you can compile files that use the MPI library.
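
    Before moving on to the full solver below, the MPI toolchain itself can be checked with the usual minimal hello-world sketch (my own, independent of MKL):

    #include <stdio.h>
    #include "mpi.h"

    int main(int argc, char *argv[])
    {
        int rank, size;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        printf("Hello from rank %d of %d\n", rank, size);
        MPI_Finalize();
        return 0;
    }

    Compile it with $ mpicc hello_mpi.c -o hello_mpi and run it with $ mpirun -np 4 ./hello_mpi.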

    gmres_test.c

    /* Example to show how to use Intel's FGMRES with preconditioner to solve the linear system Ax=b in MPI.
     * Based on Intel's example: solverc/source/fgmres_full_funct_c.c
     * For CS51501 HW3 Part b 
     * 
     * Please read the Intel Reference Manual, Chapter 6, Sparse Solver Routines, FGMRES Interface Description, for the details.
     */
    #include <stdio.h>
    #include "mkl.h"
    #include "mpi.h"
    
    #define MASTER 0        // taskid of first task
    #define RESTART 500
    #define TOL 0.00000001
    #define MAXIT 1000
    
    void mpi_dgemv(const MKL_INT m, const MKL_INT local_m, const double *A, const double *u, double *v, double *local_u, double *local_v, int taskid, MPI_Comm comm);
    void mpi_preconditioner_solver(const MKL_INT m, const MKL_INT local_m, const double *local_M, const double *u, double *v, double *local_u, int taskid, MPI_Comm comm);
    
    int main(int argc, char *argv[])
    {
        int taskid;        // a task identifier 
        int numtasks;        // number of tasks in partition
        MPI_Comm comm;
        int m;            // size of the matrix
        int local_m;        // columns of matrix A sent to each worker (A is stored column-major)
        double *A, *b, *exact_x, *x;
        double *temp_1, *temp_2;
        double *local_A, *local_v, *local_u;
        double *local_M;    // M is the preconditioner in this example: the diagonal elements of A
        int i, j, k;
    
        MPI_Init(&argc, &argv);
        comm = MPI_COMM_WORLD;
        MPI_Comm_rank(comm, &taskid);
        MPI_Comm_size(comm, &numtasks);
        if (taskid == MASTER) {    // initialization: A and b
            /* start modification 1: read A and b from mtx files in node 0 */
            m = 64;        // size of the matrix
            A = malloc(sizeof(double) * (m * m));
            // !!! A is in col-major
            for (j = 0; j < m; j++)
                for (i = 0; i < m; i++) {
                    if (i == j)
                        *(A + j * m + i) = m * 100.0;
                    else
                        *(A + j * m + i) = i + 1.0;
                }
            exact_x = malloc(sizeof(double) * m);
            for (i = 0; i < m; i++)
                *(exact_x + i) = 1.0;
            b = malloc(sizeof(double) * m);
            // b = A*ones(m,1)
            cblas_dgemv(CblasColMajor, CblasNoTrans, m, m, 1.0, A, m, exact_x, 1, 0.0, b, 1);
            /* end modification 1 */
        }
    
        MPI_Bcast(&m, 1, MPI_INT, MASTER, comm);    // send m from node MASTER to all other nodes.
        local_m = m / numtasks;
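    // NOTE: this partitioning assumes numtasks divides m evenly (m = 64 here);
    // otherwise the trailing columns of A and entries of u are silently dropped
    // by the scatter operations below.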
        local_A = malloc(sizeof(double) * (local_m * m));
        local_u = malloc(sizeof(double) * (local_m));
        local_v = malloc(sizeof(double) * m);
    // partition A by columns and send A_i to local_A on node i
        MPI_Scatter(A, local_m * m, MPI_DOUBLE, local_A, local_m * m, MPI_DOUBLE, MASTER, comm);
    
        if (taskid == MASTER) {
            free(A);
            free(exact_x);
        // do not free b; it will be used for GMRES
        }
    
    
        /* start modification 2: generate preconditioner M
     * In this example, the TA chose the diagonal elements of A as the preconditioner.
         * In HW3 part b, you should generate L and U here.
         */
        local_M = malloc(sizeof(double) * local_m);
        for (i = 0; i < local_m; i++)
            *(local_M + i) = *(local_A + taskid * local_m + i * m + i);
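    // local_A is m x local_m (column-major); global column j = taskid*local_m + i
    // is local column i, so its diagonal entry A(j,j) sits at local_A[i*m + j]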
        /* end  modification 2 */
    
    
        /*---------------------------------------------------------------------------
         * GMRES: Allocate storage for the ?par parameters and the solution vectors
         *---------------------------------------------------------------------------*/
        MKL_INT RCI_request;
        int RCI_flag;
        double dvar;
        int flag = 0;
    
        MKL_INT ipar[128];    //specifies the integer set of data for the RCI FGMRES computations
        double dpar[128];    // specifies the double precision set of data
    double *tmp;        // supplies the double precision temporary space for the RCI FGMRES computations
        double *computed_solution;
        double *residual;
        double *f;
    MKL_INT itercount, ierr = 0;
        MKL_INT ivar;
        double b_2norm;
        char cvar = 'N';
        MKL_INT incx = 1;
        if (taskid == MASTER) {
            ipar[14] = RESTART;    // restart iteration number
            int n_tmp = (2 * ipar[14] + 1) * m + ipar[14] * (ipar[14] + 9) / 2 + 1;
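        // minimum tmp workspace required by dfgmres, per the MKL FGMRES
        // interface description: (2*k+1)*n + k*(k+9)/2 + 1 doubles,
        // where k = ipar[14] is the restart length and n = m here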
            tmp = (double *) malloc(sizeof(double) * n_tmp);
            computed_solution = (double *) malloc(sizeof(double) * m);
            residual = (double *) malloc(sizeof(double) * m);
            f = (double *) malloc(sizeof(double) * m);
    
            ivar = m;
            /*---------------------------------------------------------------------------
             * Initialize the initial guess
             *---------------------------------------------------------------------------*/
            for (i = 0; i < m; i++) {
                computed_solution[i] = 0.5;
            }
    
            b_2norm = cblas_dnrm2(ivar, b, incx);
            //      printf("b_2norm=%f
    ",b_2norm);
            /*---------------------------------------------------------------------------
             * Initialize the solver
             *---------------------------------------------------------------------------*/
            dfgmres_init(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp);
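        // dfgmres_init fills ipar/dpar with default values; they are adjusted
        // below (restart length, stopping tests, preconditioning) before
        // dfgmres_check validates the final configuration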
            RCI_flag = RCI_request;
        }
        MPI_Bcast(&RCI_flag, 1, MPI_INT, MASTER, comm);
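    // every rank receives the solver status so that all ranks abort together if initialization failed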
        if (RCI_flag != 0)
            goto FAILED;
    
        if (taskid == MASTER) {
            /*---------------------------------------------------------------------------
             * GMRES: Set the desired parameters:
             *---------------------------------------------------------------------------*/
            ipar[14] = RESTART;    // restart iteration number
            ipar[7] = 1;    //do the stopping test
            ipar[10] = 1;    // use preconditioner
            dpar[0] = TOL;
            /*---------------------------------------------------------------------------
             * Check the correctness and consistency of the newly set parameters
             *---------------------------------------------------------------------------*/
            dfgmres_check(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp);
            RCI_flag = RCI_request;
        }
    
        MPI_Bcast(&RCI_flag, 1, MPI_INT, MASTER, comm);
        if (RCI_flag != 0)
            goto FAILED;
    
        if (taskid == MASTER) {
            /*---------------------------------------------------------------------------
             * Print the info about the RCI FGMRES method
             *---------------------------------------------------------------------------*/
            printf("Some info about the current run of RCI FGMRES method:
    
    ");
            if (ipar[7]) {
                printf("As ipar[7]=%d, the automatic test for the maximal number of ", ipar[7]);
                printf("iterations will be
    performed
    ");
            } else {
                printf("As ipar[7]=%d, the automatic test for the maximal number of ", ipar[7]);
                printf("iterations will be
    skipped
    ");
            }
            printf("+++
    ");
            if (ipar[8]) {
                printf("As ipar[8]=%d, the automatic residual test will be performed
    ", ipar[8]);
            } else {
                printf("As ipar[8]=%d, the automatic residual test will be skipped
    ", ipar[8]);
            }
            printf("+++
    ");
            if (ipar[9]) {
                printf("As ipar[9]=%d, the user-defined stopping test will be ", ipar[9]);
                printf("requested via
    RCI_request=2
    ");
            } else {
                printf("As ipar[9]=%d, the user-defined stopping test will not be ", ipar[9]);
                printf("requested, thus,
    RCI_request will not take the value 2
    ");
            }
            printf("+++
    ");
            if (ipar[10]) {
                printf("As ipar[10]=%d, the Preconditioned FGMRES iterations will be ", ipar[10]);
                printf("performed, thus,
    the preconditioner action will be requested via ");
                printf("RCI_request=3
    ");
            } else {
                printf("As ipar[10]=%d, the Preconditioned FGMRES iterations will not ", ipar[10]);
                printf("be performed,
    thus, RCI_request will not take the value 3
    ");
            }
            printf("+++
    ");
            if (ipar[11]) {
                printf("As ipar[11]=%d, the automatic test for the norm of the next ", ipar[11]);
                printf("generated vector is
    not equal to zero up to rounding and ");
                printf("computational errors will be performed,
    thus, RCI_request will not ");
                printf("take the value 4
    ");
            } else {
                printf("As ipar[11]=%d, the automatic test for the norm of the next ", ipar[11]);
                printf("generated vector is
    not equal to zero up to rounding and ");
                printf("computational errors will be skipped,
    thus, the user-defined test ");
                printf("will be requested via RCI_request=4
    ");
            }
            printf("+++
    
    ");
        }
        /*---------------------------------------------------------------------------
         * Compute the solution by RCI (P)FGMRES solver with preconditioning
         * Reverse Communication starts here
         *---------------------------------------------------------------------------*/
          ONE:
        if (taskid == MASTER) {
            dfgmres(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp);
            RCI_flag = RCI_request;
        }
        MPI_Bcast(&RCI_flag, 1, MPI_INT, MASTER, comm);    // send RCI_request from node MASTER to all other nodes.
    
        /*---------------------------------------------------------------------------
         * If RCI_request=0, then the solution was found with the required precision
         *---------------------------------------------------------------------------*/
        if (RCI_flag == 0)
            goto COMPLETE;
        /*---------------------------------------------------------------------------
         * If RCI_request=1, then compute the vector A*tmp[ipar[21]-1]
         * and put the result in vector tmp[ipar[22]-1]
         *---------------------------------------------------------------------------
         * NOTE that ipar[21] and ipar[22] contain FORTRAN style addresses,
         * therefore, in C code it is required to subtract 1 from them to get C style
         * addresses
         *---------------------------------------------------------------------------*/
        if (RCI_flag == 1) {
            if (taskid == MASTER) {
                temp_1 = &tmp[ipar[21] - 1];
                temp_2 = &tmp[ipar[22] - 1];
            }
    
            mpi_dgemv(m, local_m, local_A, temp_1, temp_2, local_u, local_v, taskid, comm);
    
            goto ONE;
        }
        /*---------------------------------------------------------------------------
         * If RCI_request=2, then do the user-defined stopping test
         * The residual stopping test for the computed solution is performed here
         *---------------------------------------------------------------------------
         */
        if (RCI_flag == 2) {
            /* Request to the dfgmres_get routine to put the solution into b[N] via ipar[12]
               --------------------------------------------------------------------------------
               WARNING: beware that the call to dfgmres_get routine with ipar[12]=0 at this
               stage may destroy the convergence of the FGMRES method, therefore, only
               advanced users should exploit this option with care */
            if (taskid == MASTER) {
                ipar[12] = 1;
                /* Get the current FGMRES solution in the vector f */
                dfgmres_get(&ivar, computed_solution, f, &RCI_request, ipar, dpar, tmp, &itercount);
                temp_1 = f;
                temp_2 = residual;
            }
            /* Compute the current true residual via mpi mat_vec multiplication */
            mpi_dgemv(m, local_m, local_A, temp_1, temp_2, local_u, local_v, taskid, comm);
    
            if (taskid == MASTER) {
                dvar = -1.0E0;
                cblas_daxpy(ivar, dvar, b, incx, residual, incx);
                dvar = cblas_dnrm2(ivar, residual, incx);
                printf("iteration %d, relative residual:%e
    ", itercount, dvar);
            }
    
            MPI_Bcast(&dvar, 1, MPI_DOUBLE, MASTER, comm);
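        // broadcast the residual norm so that every rank takes the same branch
        // (all ranks must stay in lockstep through the goto loop)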
            if (dvar < TOL) {
                goto COMPLETE;
            } else
                goto ONE;
        }
        /*---------------------------------------------------------------------------
         * If RCI_request=3, then apply the preconditioner on the vector
         * tmp[ipar[21]-1] and put the result in vector tmp[ipar[22]-1]
         *---------------------------------------------------------------------------
         * NOTE that ipar[21] and ipar[22] contain FORTRAN style addresses,
         * therefore, in C code it is required to subtract 1 from them to get C style
         * addresses
         *---------------------------------------------------------------------------*/
        if (RCI_flag == 3) {
            if (taskid == MASTER) {
                temp_1 = &tmp[ipar[21] - 1];
                temp_2 = &tmp[ipar[22] - 1];
            }
            /* start modification 3: solve L U temp_2 = temp_1   */
            mpi_preconditioner_solver(m, local_m, local_M, temp_1, temp_2, local_u, taskid, comm);
            /* end modification 3 */
            goto ONE;
        }
        /*---------------------------------------------------------------------------
         * If RCI_request=4, then check if the norm of the next generated vector is
         * not zero up to rounding and computational errors. The norm is contained
         * in dpar[6] parameter
         *---------------------------------------------------------------------------*/
        if (RCI_flag == 4) {
            if (taskid == MASTER)
                dvar = dpar[6];
            MPI_Bcast(&dvar, 1, MPI_DOUBLE, MASTER, comm);
            if (dvar < 1.0E-12) {
                goto COMPLETE;
            } else
                goto ONE;
        }
        /*---------------------------------------------------------------------------
         * If RCI_request=anything else, then dfgmres subroutine failed
         * to compute the solution vector: computed_solution[N]
         *---------------------------------------------------------------------------*/
        else {
            goto FAILED;
        }
        /*---------------------------------------------------------------------------
         * Reverse Communication ends here
         * Get the current iteration number and the FGMRES solution (DO NOT FORGET to
         * call dfgmres_get routine as computed_solution is still containing
         * the initial guess!). Request to dfgmres_get to put the solution
         * into vector computed_solution[N] via ipar[12]
         *---------------------------------------------------------------------------*/
      COMPLETE:
    if (taskid == MASTER) {
            ipar[12] = 0;
            dfgmres_get(&ivar, computed_solution, b, &RCI_request, ipar, dpar, tmp, &itercount);
                 /*---------------------------------------------------------------------------
                  * Print solution vector: computed_solution[N] and the number of iterations: itercount
                  *---------------------------------------------------------------------------*/
            printf("The system has been solved in %d iterations 
    ", itercount);
            printf("The following solution has been obtained (first 4 elements): 
    ");
            for (i = 0; i < 4; i++) {
                printf("computed_solution[%d]=", i);
                printf("%e
    ", computed_solution[i]);
            }
    
                 /*-------------------------------------------------------------------------*/
            /* Release internal MKL memory that might be used for computations         */
            /* NOTE: It is important to call the routine below to avoid memory leaks   */
            /* unless you disable MKL Memory Manager                                   */
                 /*-------------------------------------------------------------------------*/
            MKL_Free_Buffers();
            temp_1 = computed_solution;
            temp_2 = residual;
        }
        // compute the relative residual
        mpi_dgemv(m, local_m, local_A, temp_1, temp_2, local_u, local_v, taskid, comm);
        if (taskid == MASTER) {
            dvar = -1.0E0;
            cblas_daxpy(ivar, dvar, b, incx, residual, incx);
            dvar = cblas_dnrm2(ivar, residual, incx);
            printf("relative residual:%e
    ", dvar / b_2norm);
    
            if (itercount < MAXIT && dvar < TOL)
                flag = 0;    //success
            else
                flag = 1;    //fail
    
        }
    
        MPI_Bcast(&flag, 1, MPI_INT, MASTER, comm);
    
        free(local_A);
        free(local_M);
        free(local_u);
        free(local_v);
        if (taskid == MASTER) {
            free(tmp);
            free(b);
            free(computed_solution);
            free(residual);
        }
    
        if (flag == 0) {
            MPI_Finalize();
            return 0;
        } else {
            MPI_Finalize();
            return 1;
        }
          FAILED:
        if (taskid == MASTER) {
            printf("
    This example FAILED as the solver has returned the ERROR code %d", RCI_request);
            MKL_Free_Buffers();
        }
        free(local_A);
        free(local_M);
        free(local_u);
        free(local_v);
        if (taskid == MASTER) {
            free(tmp);
            free(b);
            free(computed_solution);
            free(residual);
        }
    
        MPI_Finalize();
        return 1;
    }
    
    void mpi_dgemv(const MKL_INT m, const MKL_INT local_m, const double *local_A, const double *u, double *v, double *local_u, double *local_v, int taskid, MPI_Comm comm)
    {
    // compute v = A*u in MPI: each node holds local_m consecutive columns of A
    // (column-major), so v is the sum of the per-node partial products A_i*u_i
    CBLAS_LAYOUT layout = CblasColMajor;    // column-major storage
    CBLAS_TRANSPOSE trans = CblasNoTrans;    // no transpose

    MPI_Scatter(u, local_m, MPI_DOUBLE, local_u, local_m, MPI_DOUBLE, MASTER, comm);    // send u_i from node MASTER to all other nodes.
    //      printf("scatter finish at taskid=%d\n", taskid);
    // compute the partial product local_v = A_i * u_i (an m-vector)
    cblas_dgemv(layout, trans, m, local_m, 1.0, local_A, m, local_u, 1, 0.0, local_v, 1);

    // Apply a sum reduction over all nodes and place the result v on node MASTER.
    MPI_Reduce(local_v, v, m, MPI_DOUBLE, MPI_SUM, MASTER, comm);
    }
    
    void mpi_preconditioner_solver(const MKL_INT m, const MKL_INT local_m, const double *local_M, const double *u, double *v, double *local_u, int taskid, MPI_Comm comm)
    {
    int i = 0;
    //      printf("begin taskid=%d\n", taskid);
    MPI_Scatter(u, local_m, MPI_DOUBLE, local_u, local_m, MPI_DOUBLE, MASTER, comm);    // send u_i from node MASTER to all other nodes.
    //      printf("taskid=%d\n", taskid);
    // compute M_i^(-1)*u_i at each node; M is diagonal, so this is an elementwise division
    for (i = 0; i < local_m; i++)
        *(local_u + i) /= *(local_M + i);

    // Gather the local results back into v on node MASTER.
    MPI_Gather(local_u, local_m, MPI_DOUBLE, v, local_m, MPI_DOUBLE, MASTER, comm);
    }

    Compile and run it with, for example:

    $ . /opt/intel/bin/compilervars.sh intel64
    $ mpicc gmres_test.c -o gmres_test -lmkl_rt
    $ mpirun -np 4 ./gmres_test

    (Any process count that evenly divides m = 64 will work.)


    This post is adapted from: http://m.blog.csdn.net/chenjun15/article/details/75041932