zoukankan      html  css  js  c++  java
  • Verilog -- 改进的Booth乘法(基4)

    Verilog -- 改进的Booth乘法(基4)

    @(verilog)

    1. 背景

    之前已经介绍过Booth乘法算法的基本原理以及代码,实际上之前的算法是基2的booth算法,每次对乘数编码都只考虑两位。因此在实际实现时往往效率不高,考虑最坏情况,使用基2的booth算法计算两个8位数据的乘法,除了编码复杂,计算时需要累加8个部分积,可见最坏情况跟普通阵列乘法器需要累加的部分积个数一样,因此代价不低。

    改进的Booth乘法为了减少部分积的累加,现在基本很少采用基2的booth算法了,而是采用基4甚至基8的形式,下面主要介绍一下基4的booth算法。

    2. 原理

    跟基2的算法一样,假设A和B是乘数和被乘数,且有:

    [egin{align} A &= color{green}{(a_{2n+1}a_{2n})}a_{2n-1}a_{2n-2}dots a_{1}a_{0}color{green}{(a_{-1})} ag{1}\ B &= b_{2n-1}b_{2n-2}dots b_{1}b_{0} ag{2} end{align} ]

    其中,(a_{-1})是末尾补的0,(a_{2n},a_{2n+1})是扩展的两位符号位。可以将乘数A表示为:

    [A = (-1cdot a_{2n-1})2^{2n-1}+ a_{2n-2}cdot2^{2n-2}+cdots + a_1cdot 2+a_0 ]

    同样可以将两数的积表示为:

    [egin{align} AB &= (a_{-1}+a_0-2a_1) imes B imes 2^0 + (a_1+a_2-2a_3) imes B imes 2^2 ag{}\ &+(a_{3}+a_4-a_5) imes B imes 2^4 +dots ag{}\ &+(a_{2n-1}+a_{2n}-2a_{2n+1}) imes B imes 2^{2n} ag{}\ &=color{red}{B imes [sum_{k=0}^{n}(a_{2k-1}+a_{2k}-2a_{2k+1})cdot 2^{2k}]} ag{3} \ &=B imes Val(A) ag{} end{align} ]

    红色部分即为基4booth的编码方式。

    3. 算法实现

    有了公式就可以比较方便地推导算法步骤了,首先给出基4booth的编码表:

    乘数位 ((a_{2k-1}+a_{2k}-2a_{2k+1})) 编码操作
    000 0
    001 +B
    010 +B
    011 +2B
    100 -2B
    101 -B
    110 -B
    111 0

    所有操作过后都会移位两次

    示例:
    (A = -7,B = -3)
    首先,计算编码需要的操作数:
    (+B = 1111 1101)
    (-B = 0000 0011)
    (+2B = 1111 1010)
    (-2B = 0000 0110)
    下面对(A)进行编码:

    (A => (11) 1001 (0)=> (111) (100) (010)=> (0) (-2X) (+X))

    计算过程:

    + 1111 1101    +B 
    + 0001 10      -2B << <<
    -----------
    = 0001 0101    = 21
    

    可以发现,对于8bit的乘法,基4的booth算法最多只需要计算4个部分积的累加,极大简化了求和逻辑。

    4. Verilog 代码

    verilog代码参考的是fanhu大神写的,链接: https://pan.baidu.com/s/1bR0SK0NeeaenLC73E1kKNg 提取码: 4kat
    下面的代码针对上面的做了部分修改。

    `timescale 1ns/1ps
    
    module booth_radix4 #(
      parameter WIDTH_M = 8,
      parameter WIDTH_R = 8
      )(
      input                             clk,
      input                             rstn,
      input                             vld_in,
      input  [WIDTH_M-1:0]              multiplicand,
      input  [WIDTH_R-1:0]              multiplier,
      output [WIDTH_M+WIDTH_R-1:0]      mul_out,
      output reg done
    );
    parameter   IDLE   = 2'b00,
                ADD    = 2'b01,
                SHIFT  = 2'b11,
                OUTPUT = 2'b10;
    			
    reg  [1:0]  current_state, next_state;  	
    		
    reg [WIDTH_M+WIDTH_R+2:0] add1;
    reg [WIDTH_M+WIDTH_R+2:0] sub1;
    reg [WIDTH_M+WIDTH_R+2:0] add_x2;
    reg [WIDTH_M+WIDTH_R+2:0] sub_x2;
    reg [WIDTH_M+WIDTH_R+2:0] p_dct;
    reg [WIDTH_R-1:0] count;
    
    always @(posedge clk or negedge rstn) 
      if(!rstn) current_state = IDLE;
      else if (!vld_in) current_state = IDLE;
      else current_state <= next_state;
      
    always @* begin
      next_state = 2'bx;
      case (current_state)
        IDLE : if (vld_in) next_state = ADD;
        else               next_state = IDLE;
        ADD  : next_state = SHIFT  ; 
        SHIFT : if (count==WIDTH_R/2) next_state = OUTPUT;
                else                  next_state = ADD;
        OUTPUT : next_state = IDLE;
    	default: next_state = IDLE;
      endcase
    end  
    
    
    always @(posedge clk or negedge rstn) begin
      if(!rstn) begin
        {add1,sub1,add_x2,sub_x2,p_dct,count,done} <= 0;
      end else begin
        case(current_state) 
          IDLE: begin
            add1     <= {{2{multiplicand[WIDTH_R-1]}},multiplicand,{WIDTH_R+1{1'b0}}};
            sub1     <= {-{{2{multiplicand[WIDTH_R-1]}},multiplicand},{WIDTH_R+1{1'b0}}};
            add_x2   <= {{multiplicand[WIDTH_M-1],multiplicand,1'b0},{WIDTH_R+1{1'b0}}};
            sub_x2   <= {-{multiplicand[WIDTH_M-1],multiplicand,1'b0},{WIDTH_R+1{1'b0}}};
            p_dct    <= {{WIDTH_M+1{1'b0}},multiplier,1'b0}  ;
            count    <= 0;
            done     <= 0;
          end
          ADD:begin
            case(p_dct[2:0])
              3'b000,3'b111: p_dct <= p_dct;
              3'b001,3'b010: p_dct <= p_dct+add1;
              3'b101,3'b110: p_dct <= p_dct+sub1;
              3'b100:        p_dct <= p_dct+sub_x2;	   
              3'b011:        p_dct <= p_dct+add_x2;
              default:       p_dct <= p_dct;
            endcase
            count <= count+1;
          end
          SHIFT:
            p_dct <= {p_dct[WIDTH_M+WIDTH_R+2],p_dct[WIDTH_M+WIDTH_R+2],p_dct[WIDTH_M+WIDTH_R+2:2]};
    
          OUTPUT:begin
            done  <= 1;	 
          end		  
        endcase
      end   
    end
    
    assign mul_out = p_dct[WIDTH_M+WIDTH_R:1];
    
    endmodule
    
    

    testbench:

    
    `timescale 1ns/1ps
    
    module booth_radix4_tb();
    `define TEST_WIDTH 4
    
    parameter WIDTH_M = `TEST_WIDTH;
    parameter WIDTH_R = `TEST_WIDTH;
    
    reg                 clk;
    reg                 rstn;
    reg                 vld_in;
    reg   [WIDTH_M-1:0] multiplicand;
    reg   [WIDTH_R-1:0] multiplier;
    
    wire   [WIDTH_M+WIDTH_R-1:0] mul_out;
    wire                         done;
    //输入 :要定义有符号和符号,输出:无要求
    wire signed [`TEST_WIDTH-1:0]                m1_in;
    wire signed [`TEST_WIDTH-1:0]                m2_in;
    
    reg  signed [2*`TEST_WIDTH-1:0] product_ref;
    reg  [2*`TEST_WIDTH-1:0] product_ref_u;
    
    assign m1_in = multiplier[`TEST_WIDTH-1:0];
    assign m2_in = multiplicand[`TEST_WIDTH-1:0];
    
    
    always #1 clk = ~clk;
    integer i,j;
    integer num_good;
    initial begin
      clk = 0;
      vld_in = 0;
      multiplicand = 0;
      multiplier = 0;
      num_good = 0;
      rstn = 1;
      #4 rstn = 0; #2 rstn = 1;
      repeat(2) @(posedge clk);
      for (i = 0; i < (1<<`TEST_WIDTH); i = i + 1) begin
        for (j = 0; j < (1<<`TEST_WIDTH); j = j + 1) begin
          vld_in = 1;
          wait (done == 0);
          wait (done == 1);
          product_ref=m1_in*m2_in;
          product_ref_u=m1_in*m2_in;
          if (product_ref != mul_out) begin
            $display("multiplier = %d multiplicand = %d proudct =%d",m1_in,m2_in,mul_out);
            @(posedge clk);
            $stop;
          end
          else begin
            num_good = num_good + 1;
          end
          multiplicand = multiplicand + 1;
        end
        multiplier = multiplier + 1;
      end
      $display("sim done. num good = %d",num_good);
      $finish;
    
    end
    
    booth_radix4 #(  .WIDTH_M ( WIDTH_M ),
                     .WIDTH_R ( WIDTH_R ))
    U_BOOTH_RADIX4_0
    (  .clk          ( clk          ),
       .rstn         ( rstn         ),
       .vld_in       ( vld_in       ),
       .multiplicand ( multiplicand ),
       .multiplier   ( multiplier   ),
       .mul_out      ( mul_out      ),
       .done         ( done         ));
    
    
    initial begin
      $fsdbDumpvars();
      $fsdbDumpMDA();
      $dumpvars();
    end
    
    endmodule
    
    
    

    仿真波形图:
    首先num_good表示正确的计算数目,因为上面我只测试了4位宽度的所有有符号乘法,因此总的计算个数为16*16=256个,这边显示全部正确。

    下面是波形图:


    PS:跟之前写的基2的算法相比,这里如果位宽改为10,经过仿真得到的计算周期为12周期几乎比基2减少了一半。(之前写的基2在计算10bit时需要21个周期)

  • 相关阅读:
    20210108 android学习
    20210106 android学习
    20210105 android学习
    20210104 android学习
    httpClient请求转发个人理解
    CefSharp如何判断页面是否加载完
    C#/WPF 使用的Task线程程序缺依旧响应缓慢问题
    修改LiveChart的提示显示位置
    Wpf双击放大缩小界面
    WPF设置无边框CefSharp显示不出来问题
  • 原文地址:https://www.cnblogs.com/lyc-seu/p/12890155.html
Copyright © 2011-2022 走看看