原蜂鸟E203 SoC中,乘法指令采用基4 Booth(booth4)编码的循环迭代方式实现,完成一次乘法运算需要17个时钟周期;除法指令采用不恢复余数法实现,完成一次除法运算需要36个时钟周期。如果能够修改乘法和除法指令的实现方式,减少运算所需的时钟周期,将会明显提高跑分程序的结果。本文将分享乘除法指令实现单元的修改方法。
e203_exu_alu_muldiv.v文件中的e203_exu_alu_muldiv模块包含了乘法和除法指令的实现过程,模块的输入和输出接口均由valid和ready握手信号控制。原文件中的乘除法运算复用了ALU中的加减法器和缓存;如果不进行复用,而是采用独立的运算单元,整个运算单元的实现并不复杂。下面给出本文的参考实现:
`include "e203_defines.v"
// -----------------------------------------------------------------------------
// e203_exu_alu_muldiv
//
// Stand-alone multiply/divide unit for the RV32M instructions
// (MUL, MULH, MULHSU, MULHU, DIV, DIVU, REM, REMU).
//
// Both the issue side (muldiv_i_*) and the write-back side (muldiv_o_*) use a
// valid/ready handshake.  A small FSM (IDLE -> MUL/DIV -> DONE -> IDLE)
// sequences each operation; back-to-back and special-case (divide-by-zero,
// signed overflow) requests are answered directly without starting the FSM.
//
// The arithmetic itself is a behavioural reference ("*", "/", "%") intended
// for simulation; replace it with a real multiplier/divider and drive
// mul32_done / div32_done accordingly for a multi-cycle implementation.
//
// Operands are converted to magnitudes, processed unsigned, and the sign is
// re-applied to the result:
//   * product / quotient : negative when the operand signs differ
//   * remainder          : takes the sign of the dividend (RISC-V rule)
// -----------------------------------------------------------------------------
module e203_exu_alu_muldiv(
  input  mdv_nob2b,
  // The Issue Handshake Interface to MULDIV
  input  muldiv_i_valid, // Handshake valid
  output muldiv_i_ready, // Handshake ready
  input  [`E203_XLEN-1:0] muldiv_i_rs1,
  input  [`E203_XLEN-1:0] muldiv_i_rs2,
  input  [`E203_XLEN-1:0] muldiv_i_imm,
  input  [`E203_DECINFO_MULDIV_WIDTH-1:0] muldiv_i_info,
  input  [`E203_ITAG_WIDTH-1:0] muldiv_i_itag,
  output muldiv_i_longpipe,
  input  flush_pulse,
  // The MULDIV Write-Back/Commit Interface
  output muldiv_o_valid, // Handshake valid
  input  muldiv_o_ready, // Handshake ready
  output [`E203_XLEN-1:0] muldiv_o_wbck_wdat,
  output muldiv_o_wbck_err,
  /* When the ALU adder and shared buffers are NOT reused, the signals below
     can stay commented out; e203_exu_alu.v then needs a matching small change.
  // The operands and info to ALU
  output [`E203_MULDIV_ADDER_WIDTH-1:0] muldiv_req_alu_op1,
  output [`E203_MULDIV_ADDER_WIDTH-1:0] muldiv_req_alu_op2,
  output muldiv_req_alu_add ,
  output muldiv_req_alu_sub ,
  input [`E203_MULDIV_ADDER_WIDTH-1:0] muldiv_req_alu_res,
  // The Shared-Buffer interface to ALU-Shared-Buffer
  output muldiv_sbf_0_ena,
  output [33-1:0] muldiv_sbf_0_nxt,
  input [33-1:0] muldiv_sbf_0_r,
  output muldiv_sbf_1_ena,
  output [33-1:0] muldiv_sbf_1_nxt,
  input [33-1:0] muldiv_sbf_1_r,
  */
  input  clk,
  input  rst_n
  );

  wire muldiv_i_hsked = muldiv_i_valid & muldiv_i_ready;
  wire muldiv_o_hsked = muldiv_o_valid & muldiv_o_ready;

  ////////////////////////////////////////////////////////////////////////////
  // Pipeline-flush tracking
  //
  // flushed_r is set by flush_pulse and cleared by the next write-back
  // handshake that does not coincide with a flush.  While it is set, the
  // back-to-back shortcut is disabled (see i_b2b).
  wire flushed_r;
  wire flushed_set = flush_pulse;
  wire flushed_clr = muldiv_o_hsked & (~flush_pulse);
  wire flushed_ena = flushed_set | flushed_clr;
  wire flushed_nxt = flushed_set | (~flushed_clr);
  sirv_gnrl_dfflr #(1) flushed_dfflr (flushed_ena, flushed_nxt, flushed_r, clk, rst_n);

  ////////////////////////////////////////////////////////////////////////////
  // Decode of the instruction info bus
  wire i_mul    = muldiv_i_info[`E203_DECINFO_MULDIV_MUL   ];
  wire i_mulh   = muldiv_i_info[`E203_DECINFO_MULDIV_MULH  ];
  wire i_mulhsu = muldiv_i_info[`E203_DECINFO_MULDIV_MULHSU];
  wire i_mulhu  = muldiv_i_info[`E203_DECINFO_MULDIV_MULHU ];
  wire i_div    = muldiv_i_info[`E203_DECINFO_MULDIV_DIV   ];
  wire i_divu   = muldiv_i_info[`E203_DECINFO_MULDIV_DIVU  ];
  wire i_rem    = muldiv_i_info[`E203_DECINFO_MULDIV_REM   ];
  wire i_remu   = muldiv_i_info[`E203_DECINFO_MULDIV_REMU  ];
  // If it is flushed then it is not a real back-to-back case
  wire i_b2b    = muldiv_i_info[`E203_DECINFO_MULDIV_B2B   ] & (~flushed_r) & (~mdv_nob2b);
  wire back2back_seq = i_b2b;

  wire i_op_mul = i_mul | i_mulh | i_mulhsu | i_mulhu;
  wire i_op_div = i_div | i_divu | i_rem | i_remu;

  ////////////////////////////////////////////////////////////////////////////
  // Multiplier operands: sign/magnitude conversion
  //   MULHU  : rs1 unsigned, rs2 unsigned
  //   MULHSU : rs1 signed,   rs2 unsigned
  //   others : both signed (for MUL the low word is the same either way)
  wire mul_rs1_sign = (i_mulhu) ? 1'b0 : muldiv_i_rs1[`E203_XLEN-1];
  wire mul_rs2_sign = (i_mulhsu | i_mulhu) ? 1'b0 : muldiv_i_rs2[`E203_XLEN-1];
  wire [31:0] muldiv_op1 = mul_rs1_sign ? (~muldiv_i_rs1+1) : muldiv_i_rs1;
  wire [31:0] muldiv_op2 = mul_rs2_sign ? (~muldiv_i_rs2+1) : muldiv_i_rs2;
  wire muldiv_res_sign = mul_rs1_sign ^ mul_rs2_sign;

  ////////////////////////////////////////////////////////////////////////////
  // Control FSM
  localparam [1:0] IDLE = 2'd0;
  localparam [1:0] MUL  = 2'd1;
  localparam [1:0] DIV  = 2'd2;
  localparam [1:0] DONE = 2'd3;

  reg  [1:0] state_r;
  wire special_cases, mul32_done, div32_done;

  // A new operation starts only from IDLE, and only when the request can
  // not be answered directly (back-to-back or special case) and is not
  // being flushed.
  wire op_start = (state_r == IDLE) & muldiv_i_valid
                & (~back2back_seq) & (~special_cases) & (~flush_pulse);

  always @(posedge clk or negedge rst_n) begin
    if (~rst_n) begin
      state_r <= IDLE;
    end
    else begin
      case (state_r)
        IDLE: begin
          if (op_start) begin
            if (i_op_mul) state_r <= MUL;
            if (i_op_div) state_r <= DIV;
          end
        end
        MUL: begin
          if (flush_pulse)     state_r <= IDLE;
          else if (mul32_done) state_r <= DONE;
        end
        DIV: begin
          if (flush_pulse)     state_r <= IDLE;
          else if (div32_done) state_r <= DONE;
        end
        DONE: begin
          if (muldiv_o_hsked | flush_pulse) state_r <= IDLE;
        end
        default: state_r <= IDLE;
      endcase
    end
  end

  ////////////////////////////////////////////////////////////////////////////
  // Multiplier
  wire [63:0] ref_mul_res;
  // Start strobe; only needed by a multi-cycle multiplier implementation.
  wire mul32_start = op_start & i_op_mul;
  // Behavioural reference: replace with a real implementation
  // (shift-add, array, Booth, Wallace tree, ...).
  assign ref_mul_res = muldiv_op1 * muldiv_op2;
  assign mul32_done  = 1'b1;

  // Re-apply the sign to the unsigned 64-bit product.
  wire [63:0] mul_res_true = muldiv_res_sign ? (~ref_mul_res+1) : ref_mul_res;

  reg [63:0] mul_res_r;
  always @(posedge clk or negedge rst_n) begin
    if (~rst_n) mul_res_r <= 64'd0;
    else if ((state_r == MUL) & mul32_done) mul_res_r <= mul_res_true;
  end

  // MUL returns the low word, MULH/MULHSU/MULHU the high word.
  wire [`E203_XLEN-1:0] mul_res = i_mul ? mul_res_r[31:0] : mul_res_r[63:32];

  ////////////////////////////////////////////////////////////////////////////
  // Divider: special cases (divide-by-zero and signed overflow)
  wire div_rs1_sign = (i_divu | i_remu) ? 1'b0 : muldiv_i_rs1[`E203_XLEN-1];
  wire div_rs2_sign = (i_divu | i_remu) ? 1'b0 : muldiv_i_rs2[`E203_XLEN-1];

  wire div_by_0 = ~(|muldiv_i_rs2); // Divisor is all zeros
  wire div_ovf  = (i_div | i_rem) & (&muldiv_i_rs2) // Divisor is all ones, i.e. -1
                  // Dividend is 1000...000, i.e. -2^(XLEN-1)
                  & muldiv_i_rs1[`E203_XLEN-1] & (~(|muldiv_i_rs1[`E203_XLEN-2:0]));

  // Divide by zero: quotient is all ones, remainder is the dividend.
  wire [`E203_XLEN-1:0] div_by_0_res_quot = ~`E203_XLEN'b0;
  wire [`E203_XLEN-1:0] div_by_0_res_remd = muldiv_i_rs1;
  wire [`E203_XLEN-1:0] div_by_0_res = (i_div | i_divu) ? div_by_0_res_quot : div_by_0_res_remd;

  // Signed overflow: quotient is the dividend (1000...000), remainder is 0.
  wire [`E203_XLEN-1:0] div_ovf_res_quot = {1'b1,{`E203_XLEN-1{1'b0}}};
  wire [`E203_XLEN-1:0] div_ovf_res_remd = `E203_XLEN'b0;
  wire [`E203_XLEN-1:0] div_ovf_res = (i_div | i_divu) ? div_ovf_res_quot : div_ovf_res_remd;

  wire div_special_cases = i_op_div & (div_by_0 | div_ovf);
  wire [`E203_XLEN-1:0] div_special_res = div_by_0 ? div_by_0_res : div_ovf_res;

  ////////////////////////////////////////////////////////////////////////////
  // Divider: datapath
  //
  // The divider must use its own sign decode: DIVU/REMU operands are always
  // unsigned, so they must not be negated even when their MSB is set.
  wire [`E203_XLEN-1:0] div_op1 = div_rs1_sign ? (~muldiv_i_rs1+1) : muldiv_i_rs1;
  wire [`E203_XLEN-1:0] div_op2 = div_rs2_sign ? (~muldiv_i_rs2+1) : muldiv_i_rs2;
  // Quotient is negative when the operand signs differ; the remainder
  // always carries the sign of the dividend.
  wire div_quot_sign = div_rs1_sign ^ div_rs2_sign;
  wire div_remd_sign = div_rs1_sign;

  wire [31:0] quotient, reminder;
  // Start strobe; only needed by a multi-cycle divider implementation.
  wire div32_start = op_start & i_op_div;
  // Behavioural reference for simulation only: replace with a real
  // implementation (restoring, non-restoring, SRT, Newton-Raphson,
  // Goldschmidt, ...).  A zero divisor never reaches the result register
  // because divide-by-zero is handled as a special case above.
  assign quotient   = div_op1 / div_op2;
  assign reminder   = div_op1 % div_op2;
  assign div32_done = 1'b1;

  wire [31:0] quotient_true = div_quot_sign ? (~quotient+1) : quotient;
  wire [31:0] reminder_true = div_remd_sign ? (~reminder+1) : reminder;

  // {quotient, remainder} of the most recent division.
  reg [63:0] div_res_r;
  always @(posedge clk or negedge rst_n) begin
    if (~rst_n) div_res_r <= 64'd0;
    else if ((state_r == DIV) & div32_done) div_res_r <= {quotient_true, reminder_true};
  end

  wire [`E203_XLEN-1:0] div_res = (i_div | i_divu) ? div_res_r[63:32] : div_res_r[31:0];

  ////////////////////////////////////////////////////////////////////////////
  // Output generation
  assign special_cases = div_special_cases;
  wire [`E203_XLEN-1:0] special_res = div_special_res;

  // Back-to-back results are served from the registers written by the
  // previous operation.
  wire [`E203_XLEN-1:0] back2back_mul_res = mul_res_r[31:0];
  wire [`E203_XLEN-1:0] back2back_mul_rem = div_res_r[31:0];
  wire [`E203_XLEN-1:0] back2back_mul_div = div_res_r[63:32];
  wire [`E203_XLEN-1:0] back2back_res = (
             ({`E203_XLEN{i_mul         }} & back2back_mul_res)
           | ({`E203_XLEN{i_rem | i_remu}} & back2back_mul_rem)
           | ({`E203_XLEN{i_div | i_divu}} & back2back_mul_div)
     );

  // Back-to-back and special-case requests complete immediately; everything
  // else completes once the FSM reaches DONE.
  wire wbck_condi = (back2back_seq | special_cases) ? 1'b1 : (state_r == DONE);
  assign muldiv_o_valid = wbck_condi & muldiv_i_valid;
  assign muldiv_i_ready = wbck_condi & muldiv_o_ready;

  // One-hot result selection; special cases take priority over back-to-back.
  wire res_sel_spl = special_cases;
  wire res_sel_b2b = back2back_seq & (~special_cases);
  wire res_sel_div = (~back2back_seq) & (~special_cases) & i_op_div;
  wire res_sel_mul = (~back2back_seq) & (~special_cases) & i_op_mul;

  assign muldiv_o_wbck_wdat =
             ({`E203_XLEN{res_sel_b2b}} & back2back_res)
           | ({`E203_XLEN{res_sel_spl}} & special_res)
           | ({`E203_XLEN{res_sel_div}} & div_res)
           | ({`E203_XLEN{res_sel_mul}} & mul_res);

  assign muldiv_o_wbck_err = 1'b0;
  assign muldiv_i_longpipe = 1'b0;

endmodule
|
|