矩阵相乘的例子
参考博客:http://blog.csdn.net/kkk584520/article/details/18812321
MatrixMultiply.c
/* Element type shared by the matrix, the input vector and the result vector. */
typedef int data_type;

/* Dimension of the square matrix and length of both vectors. */
#define N 5

/*
 * Matrix-vector product: cc = AA * bb.
 *
 * AA  N*N elements stored row-major, i.e. element (i, j) is AA[i*N + j].
 * bb  N input elements.
 * cc  N output elements; every entry is overwritten.
 *
 * Each dot product is accumulated in a data_type variable, so the caller is
 * responsible for keeping the operands small enough that the sum fits.
 */
void MatrixMultiply(data_type AA[N*N],data_type bb[N],data_type cc[N])
{
    int i, j;

    for (i = 0; i < N; i++) {
        data_type sum = 0;

        /* Dot product of row i of AA with bb. */
        for (j = 0; j < N; j++) {
            sum += AA[i * N + j] * bb[j];
        }
        cc[i] = sum;
    }
}
修改后:
#include <ap_cint.h>

/*
 * Element type for the HLS build. uint15 is provided by ap_cint.h.
 * NOTE(review): presumably a 15-bit unsigned integer, in which case every
 * result must stay below 32768 - confirm against the HLS documentation.
 */
typedef uint15 data_type;

/* Dimension of the square matrix and length of both vectors. */
#define N 5

/*
 * Matrix-vector product: cc = AA * bb.
 *
 * AA  N*N elements stored row-major, i.e. element (i, j) is AA[i*N + j].
 * bb  N input elements.
 * cc  N output elements; every entry is overwritten.
 *
 * The loop labels are kept exactly as written because tool directives may
 * refer to the loops by these names.
 */
void MatrixMultiply(data_type AA[N*N],data_type bb[N],data_type cc[N])
{
    int i, j;

    MatrixMultiply_label2:
    for (i = 0; i < N; i++) {
        data_type sum = 0;

        /* Dot product of row i of AA with bb. */
        MatrixMultiply_label1:
        for (j = 0; j < N; j++) {
            sum += AA[i * N + j] * bb[j];
        }
        cc[i] = sum;
    }
}
测试文件:TestMatrixMultiply.c:
#include <stdio.h> #include <ap_cint.h> typedef uint15 data_type; #define N 5 const data_type MatrixA[] = { #include "a.h" }; const data_type Vector_b[] = { #include "b.h" }; const data_type MatlabResult_c[] = { #include "c.h" }; data_type HLS_Result_c[N] = {0}; void CheckResult(data_type * matlab_result,data_type * your_result); int main(void) { int i; printf("Checking Results: "); MatrixMultiply(MatrixA,Vector_b,HLS_Result_c); CheckResult(MatlabResult_c,HLS_Result_c); return 0; } void CheckResult(data_type * matlab_result,data_type * your_result) { int i; for(i = 0;i<N;i++) { printf("Idx %d: Error = %d ",i,matlab_result[i]-your_result[i]); } }
a.h
{82}, {10}, {16}, {15}, {66}, {91}, {28}, {98}, {43}, {4}, {13}, {55}, {96}, {92}, {85}, {92}, {96}, {49}, {80}, {94}, {64}, {97}, {81}, {96}, {68}
b.h
{76}, {75}, {40}, {66}, {18}
c.h
{9800}, {15846}, {16555}, {23124}, {22939}
ip核顶层:
// Top-level wrapper of the AXI4-Stream IP: it exposes one slave stream
// (S00_AXIS) and one master stream (M00_AXIS) and forwards both to a single
// my_stream_ip instance.
//
// NOTE(review): my_stream_ip is instantiated without parameters and is
// connected with the full-width tdata buses, so the two *_TDATA_WIDTH
// parameters should stay at the width my_stream_ip expects - confirm before
// overriding them. C_M00_AXIS_START_COUNT, s00_axis_tstrb, m00_axis_aclk and
// m00_axis_aresetn are not used by this wrapper; the whole design runs on
// s00_axis_aclk / s00_axis_aresetn.
module test_multiply_v1_0 #
(
    // Users to add parameters here

    // User parameters ends
    // Do not modify the parameters beyond this line

    // Parameters of Axi Slave Bus Interface S00_AXIS
    parameter integer C_S00_AXIS_TDATA_WIDTH = 32,

    // Parameters of Axi Master Bus Interface M00_AXIS
    parameter integer C_M00_AXIS_TDATA_WIDTH = 32,
    parameter integer C_M00_AXIS_START_COUNT = 32
)
(
    // Users to add ports here

    // User ports ends
    // Do not modify the ports beyond this line

    // Ports of Axi Slave Bus Interface S00_AXIS
    input  wire                                    s00_axis_aclk,
    input  wire                                    s00_axis_aresetn,
    output wire                                    s00_axis_tready,
    input  wire [C_S00_AXIS_TDATA_WIDTH-1 : 0]     s00_axis_tdata,
    input  wire [(C_S00_AXIS_TDATA_WIDTH/8)-1 : 0] s00_axis_tstrb,
    input  wire                                    s00_axis_tlast,
    input  wire                                    s00_axis_tvalid,

    // Ports of Axi Master Bus Interface M00_AXIS
    input  wire                                    m00_axis_aclk,
    input  wire                                    m00_axis_aresetn,
    output wire                                    m00_axis_tvalid,
    output wire [C_M00_AXIS_TDATA_WIDTH-1 : 0]     m00_axis_tdata,
    output wire [(C_M00_AXIS_TDATA_WIDTH/8)-1 : 0] m00_axis_tstrb,
    output wire                                    m00_axis_tlast,
    input  wire                                    m00_axis_tready
);

    // my_stream_ip has no TSTRB output, so the master strobe would otherwise
    // be left undriven. Every byte lane of each output word is marked valid.
    assign m00_axis_tstrb = {(C_M00_AXIS_TDATA_WIDTH/8){1'b1}};

    // Add user logic here
    my_stream_ip my_stream_ip_v1_0_S01_AXIS_inst (
        .ACLK          (s00_axis_aclk),
        .ARESETN       (s00_axis_aresetn),
        .S_AXIS_TREADY (s00_axis_tready),
        .S_AXIS_TDATA  (s00_axis_tdata),
        .S_AXIS_TLAST  (s00_axis_tlast),
        .S_AXIS_TVALID (s00_axis_tvalid),
        .M_AXIS_TVALID (m00_axis_tvalid),
        .M_AXIS_TDATA  (m00_axis_tdata),
        .M_AXIS_TLAST  (m00_axis_tlast),
        .M_AXIS_TREADY (m00_axis_tready)
    );
    // User logic ends

endmodule
ip核:(未完成)
`timescale 1ns / 1ps

// AXI4-Stream front end for the HLS-generated MatrixMultiply core.
//
// One transaction:
//   1. Idle           - wait for the first valid input word.
//   2. Read_Inputs    - accept 30 words: the first 25 are stored to
//                       AA[0..24] in arrival order, the last 5 to bb[0..4].
//   3. Wait_Calculate - start the core and wait for it to report done.
//   4. Write_Outputs  - send the 5 captured results cc[0..4], asserting
//                       TLAST on the final word, then return to Idle.
//
// NOTE(review): the start/ready/done handling below assumes the core uses
// the HLS ap_ctrl_hs block-level handshake (ap_start held until ap_ready,
// ap_done pulsed when finished) and that each cc_N output is only guaranteed
// valid while its cc_N_ap_vld is high - confirm against the generated RTL.
// NOTE(review): all data buses here are 32 bits wide; if the core was built
// with a narrower data_type its port widths will differ - confirm and adjust.
module my_stream_ip
(
    ACLK,
    ARESETN,
    S_AXIS_TREADY,
    S_AXIS_TDATA,
    S_AXIS_TLAST,
    S_AXIS_TVALID,
    M_AXIS_TVALID,
    M_AXIS_TDATA,
    M_AXIS_TLAST,
    M_AXIS_TREADY
);

    input          ACLK;
    input          ARESETN;         // synchronous, active low
    output         S_AXIS_TREADY;
    input  [31:0]  S_AXIS_TDATA;
    input          S_AXIS_TLAST;    // not used: the packet length is fixed
    input          S_AXIS_TVALID;
    output         M_AXIS_TVALID;
    output [31:0]  M_AXIS_TDATA;
    output         M_AXIS_TLAST;
    input          M_AXIS_TREADY;

    // Packet layout.
    localparam MATRIX_WORDS           = 25;  // AA_0 .. AA_24 of the core
    localparam VECTOR_WORDS           = 5;   // bb_0 .. bb_4 and cc_0 .. cc_4
    localparam NUMBER_OF_INPUT_WORDS  = MATRIX_WORDS + VECTOR_WORDS;
    localparam NUMBER_OF_OUTPUT_WORDS = VECTOR_WORDS;

    // State encoding.
    localparam Idle           = 3'b100;
    localparam Read_Inputs    = 3'b010;
    localparam Write_Outputs  = 3'b001;
    localparam Wait_Calculate = 3'b000;

    // Control signals driven into the core.
    reg  start2;
    reg  reset2;
    // Status signals coming back from the core.
    wire done2;
    wire idle2;
    wire ready2;

    // Operand storage and result wires.
    reg  [31:0] AA     [0:MATRIX_WORDS-1];
    reg  [31:0] bb     [0:VECTOR_WORDS-1];
    wire [31:0] cc     [0:VECTOR_WORDS-1];
    wire        cc_val [0:VECTOR_WORDS-1];
    // Results captured while their valid strobe is high, so they can be
    // streamed out after the core has finished.
    reg  [31:0] cc_reg [0:VECTOR_WORDS-1];

    reg [31:0] AA_index;      // next AA entry to fill
    reg [31:0] bb_index;      // next bb entry to fill
    reg [31:0] cc_index;      // result currently presented on M_AXIS_TDATA
    reg [2:0]  state;
    reg [31:0] nr_of_reads;   // input words still to accept, minus one
    reg [31:0] nr_of_writes;  // output words still to send, minus one

    assign S_AXIS_TREADY = (state == Read_Inputs);
    assign M_AXIS_TVALID = (state == Write_Outputs);
    assign M_AXIS_TDATA  = cc_reg[cc_index];
    // The final beat is the one sent while nr_of_writes is 0.
    assign M_AXIS_TLAST  = (state == Write_Outputs) && (nr_of_writes == 0);

    // Capture each result whenever the core flags it as valid.
    always @(posedge ACLK)
    begin
        if (cc_val[0]) cc_reg[0] <= cc[0];
        if (cc_val[1]) cc_reg[1] <= cc[1];
        if (cc_val[2]) cc_reg[2] <= cc[2];
        if (cc_val[3]) cc_reg[3] <= cc[3];
        if (cc_val[4]) cc_reg[4] <= cc[4];
    end

    always @(posedge ACLK)
    begin
        if (!ARESETN)
        begin
            state        <= Idle;
            nr_of_reads  <= 0;
            nr_of_writes <= 0;
            AA_index     <= 0;
            bb_index     <= 0;
            cc_index     <= 0;
            reset2       <= 1;   // hold the core in reset until first use
            start2       <= 0;
        end
        else
        begin
            case (state)
                Idle:
                    if (S_AXIS_TVALID == 1)
                    begin
                        state       <= Read_Inputs;
                        nr_of_reads <= NUMBER_OF_INPUT_WORDS - 1;
                        // Rewind the store pointers so that every packet
                        // fills the operand arrays from entry 0.
                        AA_index    <= 0;
                        bb_index    <= 0;
                    end

                Read_Inputs:
                    if (S_AXIS_TVALID == 1)
                    begin
                        // While at least VECTOR_WORDS words are outstanding
                        // the word belongs to the matrix, otherwise to bb.
                        if (nr_of_reads >= VECTOR_WORDS)
                        begin
                            AA[AA_index] <= S_AXIS_TDATA;
                            AA_index     <= AA_index + 1;
                        end
                        else
                        begin
                            bb[bb_index] <= S_AXIS_TDATA;
                            bb_index     <= bb_index + 1;
                        end

                        if (nr_of_reads == 0)
                        begin
                            // All operands stored: release and start the core.
                            state  <= Wait_Calculate;
                            reset2 <= 0;
                            start2 <= 1;
                        end
                        else
                            nr_of_reads <= nr_of_reads - 1;
                    end

                Wait_Calculate:
                    begin
                        // Drop the start request once the core accepted it.
                        if (ready2 == 1)
                            start2 <= 0;

                        if (done2 == 1)
                        begin
                            state        <= Write_Outputs;
                            nr_of_writes <= NUMBER_OF_OUTPUT_WORDS - 1;
                            cc_index     <= 0;
                        end
                    end

                Write_Outputs:
                    if (M_AXIS_TREADY == 1)
                    begin
                        if (nr_of_writes == 0)
                            state <= Idle;
                        else
                        begin
                            nr_of_writes <= nr_of_writes - 1;
                            cc_index     <= cc_index + 1;
                        end
                    end

                default:
                    state <= Idle;
            endcase
        end
    end

    // HLS-generated core. It is clocked from ACLK, the only clock of this
    // module.
    MatrixMultiply U1 (
        .ap_clk      (ACLK),
        .ap_rst      (reset2),
        .ap_start    (start2),
        .ap_done     (done2),
        .ap_idle     (idle2),
        .ap_ready    (ready2),
        .AA_0        (AA[0]),
        .AA_1        (AA[1]),
        .AA_2        (AA[2]),
        .AA_3        (AA[3]),
        .AA_4        (AA[4]),
        .AA_5        (AA[5]),
        .AA_6        (AA[6]),
        .AA_7        (AA[7]),
        .AA_8        (AA[8]),
        .AA_9        (AA[9]),
        .AA_10       (AA[10]),
        .AA_11       (AA[11]),
        .AA_12       (AA[12]),
        .AA_13       (AA[13]),
        .AA_14       (AA[14]),
        .AA_15       (AA[15]),
        .AA_16       (AA[16]),
        .AA_17       (AA[17]),
        .AA_18       (AA[18]),
        .AA_19       (AA[19]),
        .AA_20       (AA[20]),
        .AA_21       (AA[21]),
        .AA_22       (AA[22]),
        .AA_23       (AA[23]),
        .AA_24       (AA[24]),
        .bb_0        (bb[0]),
        .bb_1        (bb[1]),
        .bb_2        (bb[2]),
        .bb_3        (bb[3]),
        .bb_4        (bb[4]),
        .cc_0        (cc[0]),
        .cc_0_ap_vld (cc_val[0]),
        .cc_1        (cc[1]),
        .cc_1_ap_vld (cc_val[1]),
        .cc_2        (cc[2]),
        .cc_2_ap_vld (cc_val[2]),
        .cc_3        (cc[3]),
        .cc_3_ap_vld (cc_val[3]),
        .cc_4        (cc[4]),
        .cc_4_ap_vld (cc_val[4])
    );

endmodule
IP核中例化的MatrixMultiply模块是由HLS从C语言生成的Verilog代码。