zoukankan      html  css  js  c++  java
  • 【DM642学习笔记十】DSP优化记录

    1. 处理的数据先EDMA到片内,具有更高的效率!
    以YUV2RGB为例:
    /*
     * Placement directives for the on-chip staging buffers (TI compiler pragmas).
     * Each buffer is assigned to the ".INTPROCBUFF" section and aligned to a
     * 128-byte boundary. The section name must be mapped in the linker command
     * file; presumably it targets internal RAM - confirm against the .cmd file.
     */

    /* Input staging buffers: Y, U, V */
    #pragma DATA_SECTION(onchipBuf0_y,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf0_y,128);
    #pragma DATA_SECTION(onchipBuf1_y,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf1_y,128);
    #pragma DATA_SECTION(onchipBuf0_u,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf0_u,128);
    #pragma DATA_SECTION(onchipBuf1_u,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf1_u,128);
    #pragma DATA_SECTION(onchipBuf0_v,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf0_v,128);
    #pragma DATA_SECTION(onchipBuf1_v,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf1_v,128);

    /* Output staging buffers: R, G, B */
    #pragma DATA_SECTION(onchipBuf0_r,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf0_r,128);
    #pragma DATA_SECTION(onchipBuf1_r,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf1_r,128);
    #pragma DATA_SECTION(onchipBuf0_g,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf0_g,128);
    #pragma DATA_SECTION(onchipBuf1_g,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf1_g,128);
    #pragma DATA_SECTION(onchipBuf0_b,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf0_b,128);
    #pragma DATA_SECTION(onchipBuf1_b,".INTPROCBUFF");
    #pragma DATA_ALIGN(onchipBuf1_b,128);
    /*
     * On-chip line buffers. Luma and RGB buffers hold PROC_WIDTH bytes; chroma
     * buffers hold PROC_WIDTH_2 bytes. Each plane has a Buf0/Buf1 pair.
     */
    unsigned char onchipBuf0_y[PROC_WIDTH];
    unsigned char onchipBuf1_y[PROC_WIDTH];
    unsigned char onchipBuf0_u[PROC_WIDTH_2];
    unsigned char onchipBuf1_u[PROC_WIDTH_2];
    unsigned char onchipBuf0_v[PROC_WIDTH_2];
    unsigned char onchipBuf1_v[PROC_WIDTH_2];
    unsigned char onchipBuf0_r[PROC_WIDTH];
    unsigned char onchipBuf1_r[PROC_WIDTH];
    unsigned char onchipBuf0_g[PROC_WIDTH];
    unsigned char onchipBuf1_g[PROC_WIDTH];
    unsigned char onchipBuf0_b[PROC_WIDTH];
    unsigned char onchipBuf1_b[PROC_WIDTH];
    /* Source image, planar YUV; chroma planes are half width, full height. */
    extern unsigned char src_Y[IMGWIDTH*IMGHEIGHT];   /* 720*576 */
    extern unsigned char src_U[IMGWIDTH_2*IMGHEIGHT];
    extern unsigned char src_V[IMGWIDTH_2*IMGHEIGHT];
    /* Output image, planar RGB. */
    extern unsigned char src_R[PROC_WIDTH*PROC_HEIGHT];   /* 352*288 */
    extern unsigned char src_G[PROC_WIDTH*PROC_HEIGHT];
    extern unsigned char src_B[PROC_WIDTH*PROC_HEIGHT];
    
    void yuv2rgb888()
    {
    
    int i=0,j=0;
    int y,u,v,r,g,b;
    int v359,v183,u88,u454;
       unsigned char *py,*pu,*pv,*pr,*pg,*pb;
    // const int dif=0x8080;// 128128
    for ( i = 0; i <288; i ++ ) 
       {
           //copy一行到片上  144-432行,180列~180+352列
          DAT_copy(src_Y+(i+144)*IMGWIDTH+180, onchipBuf0_y, PROC_WIDTH);
          DAT_copy(src_U+(i+144)*(IMGWIDTH>>1)+90, onchipBuf0_u,PROC_WIDTH_2);
          DAT_copy(src_V+(i+144)*(IMGWIDTH>>1)+90,   onchipBuf0_v, PROC_WIDTH_2);
    py=onchipBuf0_y;
    pu=onchipBuf0_u;
    pv=onchipBuf0_v;
    pr=onchipBuf0_r;
    pg=onchipBuf0_g;
    pb=onchipBuf0_b;
    #pragmaMUST_ITERATE(0,176, 8);
    for(j=0;j
    {
    
    y=(*py);//u-=128; v-=128;//y-=16;y不减16
    u=(*pu)-128; 
    v=(*pv)-128; 
    v359=359*v>>8;
    u88=88*u>>8;
    v183=183*v>>8;
    u454=454*u>>8;
    r= y+v359;       // r=y+1.402*v;
    r&=~(r>>31);  
    r = (r |((255-r)>>31) ) & 0xFF;
    g= y-u88-v183;    //g=y-0.34414*u-0.71414*v;
    g&=~(g>>31);
    g = (g |((255-g)>>31) ) & 0xFF;
    b= y+u454;          //b=y+1.772*u;
    b&=~(b>>31);
    b = (b |((255-b)>>31) ) & 0xFF;
    *pr++=r; 
    *pg++=g;
    *pb++=b;
    //
    py++; y=(*py);  //y-=16; y减了16之后比原来灰度暗了16左右。∴不减。
    r= y+v359;           //r=y+1.402*v;
    r&=~(r>>31);  
    r = (r |((255-r)>>31) ) & 0xFF;
    g= y-u88-v183;    //g=y-0.34414*u-0.71414*v;
    g&=~(g>>31);
    g = (g |((255-g)>>31) ) & 0xFF;
    b= y+u454;       // b=y+1.772*u;
    b&=~(b>>31);
    b = (b |((255-b)>>31) ) & 0xFF;
    *pr++=r;
    *pg++=g;
    *pb++=b;
    py++; pu++;pv++;
    }   
          //处理完后,copy给片外r、g、b[352*288]
           DAT_copy(onchipBuf0_r,src_R+i*PROC_WIDTH, PROC_WIDTH);
           DAT_copy(onchipBuf0_g,src_G+i*PROC_WIDTH, PROC_WIDTH); 
           DAT_copy(onchipBuf0_b,src_B+i*PROC_WIDTH, PROC_WIDTH); 
       }
    }
    View Code
    耗时clock对比
    放在片外:          18,883,080 clocks
    先EDMA到片上:  1,977,300 clocks!
    约比在片外处理快10倍。
    -------------------------------------------------
    2. 另外,代码中将结果限定(饱和)到0~255时,使用无分支的位运算写法(0):
                    r&=~(r>>31);  
                        r = (r | ((255-r)>>31) ) & 0xFF;
                        g &=~(g>>31);
                        g = (g | ((255-g)>>31) ) & 0xFF;
                        b &=~(b>>31);
                        b = (b | ((255-b)>>31) ) & 0xFF;  

    代替(1),能实现更好的软件流水。

          if ( r>255)    r=255;          else if ( r<0 )  r=0;
              if ( g>255 )   g=255;          else if ( g<0 )  g=0;
              if ( b>255 )   b=255;          else if ( b<0 )  b=0;

    说明:如下图所示,后者(1)不能软件流水~(都在片内处理的情况下)t1=14634366 clocks ≈10*t0!!!

    【DM642】DSP优化记录

    => 优化后

     

  • 相关阅读:
    AD20改变pcb图纸大小方式
    ceph相关概念
    五种IO模型和三种实现方式
    MongoDB入门
    GO通过sqlx库操作MySQL
    Go原生sql操作MySQL
    Traefik工作原理
    Redis主从
    Nginx入门
    Redis入门
  • 原文地址:https://www.cnblogs.com/eaglediao/p/7136500.html
Copyright © 2011-2022 走看看