zoukankan      html  css  js  c++  java
  • 通过OpenCL内核代码猜测设备寄存器个数



    这个方法对于一般的GPU更有用些。由于CPU往往拥有L1 Data Cache,当寄存器不够用的时候,编译器会将不太常用的数据放到栈中,而栈在此时往往能获得高命中率的Cache访问,因此性能不会受到太大影响。而GPU端当寄存器不够用时,编译器往往会将不常用的数据直接存放到VRAM中,而对外部VRAM的访问往往是比较慢的,因此,如果临时变量太多,导致频繁访问外部存储器,就会使整体计算性能大幅下降。当然,现在不少GPU也有了L1 Cache,但是空间也十分有限。因此,这里用“猜”这个词,呵呵~


    /*
     * Register-pressure probe that keeps 4 integer values live per work-item.
     *
     * Each work-item loads 4 ints from pInOut, runs 100000 iterations of a
     * chain of shift-and-add updates in which every statement consumes the
     * result of the previous one, and then stores the 4 values back to the
     * slots they were loaded from.
     *
     * pInOut: global buffer, read and written in place. Work-item `index`
     *         touches elements (index * 4 + k) * 4 for k = 0..3, so the host
     *         buffer must cover those offsets for every work-item.
     */
    __kernel void QueryRegisterCount(__global int *pInOut)
    {
        int index = get_global_id(0);
        int i0 = pInOut[(index * 4 + 0) * 4];
        int i1 = pInOut[(index * 4 + 1) * 4];
        int i2 = pInOut[(index * 4 + 2) * 4];
        int i3 = pInOut[(index * 4 + 3) * 4];
        for(int i = 0; i < 100000; i++)
        {
            /* Pass 1: left shift by 1. */
            i1 += i0 << 1;
            i2 += i1 << 1;
            i3 += i2 << 1;
            i0 += i3 << 1;
            /* Pass 2: right shift by 1. */
            i1 += i0 >> 1;
            i2 += i1 >> 1;
            i3 += i2 >> 1;
            i0 += i3 >> 1;
            /* Pass 3: right shift by 2. */
            i1 += i0 >> 2;
            i2 += i1 >> 2;
            i3 += i2 >> 2;
            i0 += i3 >> 2;
            /* Pass 4: right shift by 3. */
            i1 += i0 >> 3;
            i2 += i1 >> 3;
            i3 += i2 >> 3;
            i0 += i3 >> 3;
        }
        /* Write the results back so the computation has an observable effect. */
        pInOut[(index * 4 + 0) * 4] = i0;
        pInOut[(index * 4 + 1) * 4] = i1;
        pInOut[(index * 4 + 2) * 4] = i2;
        pInOut[(index * 4 + 3) * 4] = i3;
    }


    /*
     * Register-pressure probe that keeps 8 integer values live per work-item.
     *
     * Each work-item loads 8 ints from pInOut, runs 100000 iterations of a
     * chain of shift-and-add updates in which every statement consumes the
     * result of the previous one, and then stores the 8 values back to the
     * slots they were loaded from.
     *
     * pInOut: global buffer, read and written in place. Work-item `index`
     *         touches elements (index * 8 + k) * 4 for k = 0..7, so the host
     *         buffer must cover those offsets for every work-item.
     */
    __kernel void QueryRegisterCount(__global int *pInOut)
    {
        int index = get_global_id(0);
        int i0 = pInOut[(index * 8 + 0) * 4];
        int i1 = pInOut[(index * 8 + 1) * 4];
        int i2 = pInOut[(index * 8 + 2) * 4];
        int i3 = pInOut[(index * 8 + 3) * 4];
        int i4 = pInOut[(index * 8 + 4) * 4];
        int i5 = pInOut[(index * 8 + 5) * 4];
        int i6 = pInOut[(index * 8 + 6) * 4];
        int i7 = pInOut[(index * 8 + 7) * 4];
        for(int i = 0; i < 100000; i++)
        {
            /* Pass 1: left shift by 1. */
            i1 += i0 << 1;
            i2 += i1 << 1;
            i3 += i2 << 1;
            i4 += i3 << 1;
            i5 += i4 << 1;
            i6 += i5 << 1;
            i7 += i6 << 1;
            i0 += i7 << 1;
            /* Pass 2: right shift by 1. */
            i1 += i0 >> 1;
            i2 += i1 >> 1;
            i3 += i2 >> 1;
            i4 += i3 >> 1;
            i5 += i4 >> 1;
            i6 += i5 >> 1;
            i7 += i6 >> 1;
            i0 += i7 >> 1;
            /* Pass 3: right shift by 2. */
            i1 += i0 >> 2;
            i2 += i1 >> 2;
            i3 += i2 >> 2;
            i4 += i3 >> 2;
            i5 += i4 >> 2;
            i6 += i5 >> 2;
            i7 += i6 >> 2;
            i0 += i7 >> 2;
            /* Pass 4: right shift by 3. */
            i1 += i0 >> 3;
            i2 += i1 >> 3;
            i3 += i2 >> 3;
            i4 += i3 >> 3;
            i5 += i4 >> 3;
            i6 += i5 >> 3;
            i7 += i6 >> 3;
            i0 += i7 >> 3;
        }
        /* Write the results back so the computation has an observable effect. */
        pInOut[(index * 8 + 0) * 4] = i0;
        pInOut[(index * 8 + 1) * 4] = i1;
        pInOut[(index * 8 + 2) * 4] = i2;
        pInOut[(index * 8 + 3) * 4] = i3;
        pInOut[(index * 8 + 4) * 4] = i4;
        pInOut[(index * 8 + 5) * 4] = i5;
        pInOut[(index * 8 + 6) * 4] = i6;
        pInOut[(index * 8 + 7) * 4] = i7;
    }



        /** Prepare for running an OpenCL kernel program to get register count */
        /* Step 4: Create a command queue associated with the context. */
        commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);
        /* Step 5: Create the program object. */
        // Read the kernel source text from the bundle resource "reg.ocl".
        kernelPath = [[NSBundle mainBundle] pathForResource:@"reg" ofType:@"ocl"];
        aSource = [[NSString stringWithContentsOfFile:kernelPath encoding:NSUTF8StringEncoding error:nil] UTF8String];
        kernelLength = strlen(aSource);
        program = clCreateProgramWithSource(context, 1, &aSource, &kernelLength, NULL);
        /* Step 6: Build the program for the selected device. */
        status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
        /* Step 7: Initialize the host buffer and create the memory object used by the kernel. */
        const size_t memSize = global_work_size[0] * 1024 * 4 * 4;
        cl_int *orgBufer = (cl_int*)malloc(memSize);
        // memset writes the byte value 1 into every byte of the buffer.
        memset(orgBufer, 1, memSize);
        outputMemObj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, memSize, orgBufer, NULL);
        /* Step 8: Create the kernel object. */
        kernel = clCreateKernel(program, "QueryRegisterCount", NULL);
        /* Step 9: Set the kernel arguments. */
        status |= clSetKernelArg(kernel, 0, sizeof(outputMemObj), &outputMemObj);
        /* Step 10: Run the kernel 5 times and log the wall-clock time of each run. */
        for(int i = 0; i < 5; i++)
        {
            NSTimeInterval beginTime = [[NSProcessInfo processInfo] systemUptime];
            status |= clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);
            // clEnqueueNDRangeKernel only enqueues the command; wait for the queue
            // to drain so the measured interval covers the kernel's execution.
            status |= clFinish(commandQueue);
            NSTimeInterval endTime = [[NSProcessInfo processInfo] systemUptime];
            NSLog(@"Time spent: %f", endTime - beginTime);
        }
        // status accumulates the results of the build, argument setup, enqueue and finish calls.
        if(status != CL_SUCCESS)
        {
            NSLog(@"Program built failed!");
        }

    以上由于是在OS X下开发的,因此直接用Objective-C文件读写更方便些。但是大部分都是C代码,很容易读懂。


    在2013年的MacBook Air中的Intel HD 5000中的测试结果为:





    很显然,我们可以猜得,Intel HD Graphics 5000至少可以为每个work-item分配16个寄存器。


  • 相关阅读:
    elk6.3 centos集群搭建 head插件安装
    9.4 关系的闭包
    9.5 等价关系
    9.3 关系的表示
    9.1 关系及关系性质
  • 原文地址:https://www.cnblogs.com/zenny-chen/p/3439636.html
Copyright © 2011-2022 走看看