// 111
// Behavioral multiplier: the product is described with the `*` operator
// and the implementation is left entirely to the tool.
//
//   w     - width of each operand in bits.
//   prod  - 2*w-bit product of cand and plier.
//   cand  - multiplicand.
//   plier - multiplier.
module mult_behav_1
  #( int w = 16 )
   ( output uwire [2*w-1:0] prod,
     input uwire [w-1:0] cand,
     input uwire [w-1:0] plier );

   // The operands are widened to the 2*w-bit width of prod before the
   // multiplication, so no product bits are lost.
   assign prod = plier * cand;

endmodule
// Combinational shift-and-add multiplier built as a linear chain of adders,
// one adder per multiplier bit.
//
//   w     - width of each operand in bits.
//   prod  - 2*w-bit product.
//   cand  - multiplicand (shifted and conditionally added).
//   plier - multiplier (each bit selects one partial product).
module mult_linear
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] cand, plier );

   // running[i] is the sum of the partial products for multiplier bits
   // below position i; running[0] is therefore zero.
   uwire [2*w-1:0] running[w:0];
   assign running[0] = 0;

   for ( genvar i = 0; i < w; i++ ) begin
      // Multiplicand moved to bit position i, at full product width.
      uwire [2*w-1:0] shifted_cand = cand << i;
      // Partial product: present only when multiplier bit i is set.
      uwire [2*w-1:0] term = plier[i] ? shifted_cand : 0;
      assign running[i+1] = running[i] + term;
   end

   // The end of the chain holds the complete product.
   assign prod = running[w];

endmodule
// Recursive (tree-structured) combinational multiplier.
//
// Operand a is split into a low part and a high part, each part is
// multiplied by b using a recursive instance of this module, and the two
// partial results are combined with a single adder.
//
//   wa   - width of a in bits (must be at least 1).
//   wb   - width of b in bits.
//   wp   - width of prod; defaults to wa + wb.
//   prod - product a * b.
//
// The low part is wa/2 bits wide and the high part takes the remaining
// wa - wa/2 bits, so wa does not have to be a power of two. (Instantiating
// both halves with wa/2 bits would drop the top bit of a whenever wa is odd.)
module mult_tree
  #( int wa = 16, int wb = wa, int wp = wa + wb )
   ( output uwire [wp-1:0] prod,
     input uwire [wa-1:0] a,
     input uwire [wb-1:0] b );

   if ( wa == 1 ) begin

      // Base case: a one-bit multiplier selects either b or zero.
      assign prod = a ? b : 0;

   end else begin

      localparam int wlo = wa / 2;     // Bits of a handled by mlo.
      localparam int whi = wa - wlo;   // Bits of a handled by mhi.

      uwire [wlo+wb-1:0] prod_lo;
      uwire [whi+wb-1:0] prod_hi;

      mult_tree #(wlo,wb) mlo( prod_lo, a[wlo-1:0], b );
      mult_tree #(whi,wb) mhi( prod_hi, a[wa-1:wlo], b );

      // The high partial product is weighted by 2^wlo. The shift is
      // evaluated at the width of prod, so no bits are lost.
      assign prod = prod_lo + ( prod_hi << wlo );

   end

endmodule
// mult_linear wrapped with registers on its inputs and on its output.
//
// On each positive clock edge the operands are captured and the product
// computed from the previously captured operands is written to prod.
//
//   w     - operand width in bits.
//   prod  - registered 2*w-bit product.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock; all registers update on the positive edge.
module mult_linear_clk
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] cand, plier,
     input uwire clk );

   logic [w-1:0] cand_r, plier_r;   // Registered copies of the operands.
   uwire [2*w-1:0] prod_comb;       // Combinational product of the copies.

   always_ff @( posedge clk ) begin
      prod    <= prod_comb;
      plier_r <= plier;
      cand_r  <= cand;
   end

   mult_linear #(w) ml( prod_comb, cand_r, plier_r );

endmodule
// Sequential shift-and-add multiplier: one multiplier bit is examined per
// clock cycle, so a complete product takes w cycles.
//
//   w     - operand width in bits; checked at simulation start to be a
//           power of two so that pos wraps around exactly at w.
//   prod  - 2*w-bit product, updated each time pos wraps back to zero.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock; state changes on the positive edge.
module mult_seq #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] cand, plier,
     input uwire clk);

   localparam int wlog = $clog2(w);

   // NOTE(review): the two pragma comments below were reconstructed from
   // bare `cadence` tokens, which are not valid SystemVerilog. They are
   // presumably Cadence translate_off / translate_on markers hiding the
   // simulation-only check from synthesis -- confirm against the original.
   // cadence translate_off
   initial if ( w != 1 << wlog ) $fatal(1,"Size must be a power of 2.");
   // cadence translate_on

   bit [wlog-1:0] pos;         // Multiplier bit examined this cycle.
   logic [2*w-1:0] accum;      // Sum of partial products so far.

   always_ff @( posedge clk ) begin

      // Start of a new multiplication: publish the finished product and
      // clear the accumulator. Blocking assignments are used so that the
      // cleared value is the one seen by the addition just below.
      if ( pos == 0 ) begin
         prod = accum;
         accum = 0;
      end

      // Add in the partial product for the current multiplier bit.
      if ( plier[pos] ) accum += cand << pos;

      // Advance; pos is wlog bits wide and wraps back to zero after w-1.
      pos++;

   end

endmodule
// :Def: Pipelining
// :Example:
//
// Minimal two-register pipeline: the value on a appears on x after two
// positive clock edges.
//
//   w   - data width in bits.
//   x   - output; value that a had two clock edges earlier.
//   a   - input data.
//   clk - clock; registers update on the positive edge.
module very_simple_pipe
  #( int w = 16 )
   ( output logic [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r;   // First register; x itself is the second.

   always_ff @( posedge clk ) r <= a;
   always_ff @( posedge clk ) x <= r;

endmodule
// :Example:
//
// Delay line with a parameterized number of registers: the value on a
// appears on x after nstages positive clock edges.
//
//   w       - data width in bits.
//   nstages - number of registers in the chain.
//   x       - output; contents of the last register.
//   a       - input data.
//   clk     - clock; registers update on the positive edge.
module simple_pipe2
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin
      r[0] <= a;
      // Non-blocking assignments: every register receives the value its
      // predecessor held before this clock edge.
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];
   end

   assign x = r[nstages-1];

endmodule
// :Example:
//
// Variant of a register delay line in which the first register is written
// with a blocking assignment.
//
// Because r[0] = a takes effect immediately, the loop iteration for i=1
// reads the new value of r[0]. After each clock edge r[0] and r[1] therefore
// hold the same value, and a reaches x after nstages-1 clock edges rather
// than nstages.
//
// NOTE(review): the `_ba` suffix suggests this is a deliberate
// demonstration of the effect of a blocking assignment, so the assignment
// is left as written -- confirm before "fixing" it.
//
//   w       - data width in bits.
//   nstages - number of registers in the chain.
//   x       - output; contents of the last register.
//   a       - input data.
//   clk     - clock; registers update on the positive edge.
module simple_pipe2_ba
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin
      r[0] = a;   // Blocking: visible to the i=1 iteration below.
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];
   end

   assign x = r[nstages-1];

endmodule
// :Example:
//
// Delay line that also reports the average of the values it holds.
//
// r[0] is connected directly to a (no register), and r[1] .. r[nstages-1]
// are registers, so a reaches x after nstages-1 positive clock edges.
//
//   w       - data width in bits.
//   nstages - number of entries in r (one wire plus nstages-1 registers).
//   x       - output; contents of the last entry.
//   avg     - integer average (sum / nstages) of all entries of r.
//   a       - input data.
//   clk     - clock; registers update on the positive edge.
module simple_pipe_avg
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     output logic [w-1:0] avg,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   assign r[0] = a;

   always_ff @( posedge clk ) begin
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];
   end

   assign x = r[nstages-1];

   // Wide enough to hold the sum of nstages w-bit values without overflow.
   logic [w+$clog2(nstages):0] sum;

   always_comb begin
      sum = 0;
      for ( int i=0; i<nstages; i++ ) sum += r[i];
      avg = sum / nstages;
   end

endmodule
// :Example:
//
// Register chain in which every stage adds one to the value passing
// through, so x is the value a had nstages clock edges earlier plus
// nstages (modulo 2^w).
//
//   w       - data width in bits.
//   nstages - number of registers, and the total amount added.
//   x       - output; contents of the last register.
//   a       - input data.
//   clk     - clock; registers update on the positive edge.
module simple_pipe_add1
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin
      r[0] <= a + 1;
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1] + 1;
   end

   assign x = r[nstages-1];

endmodule
// :Def: Throughput
// :Def: Latency
// :Def: Critical Path
// :Def: Pipeline Latch
// :Example:
//
// Pipelined shift-and-add multiplier with one multiplier bit per stage.
//
// Each stage holds a running sum together with its own copy of the two
// operands. A new pair of operands can be accepted on every clock edge;
// the product of a pair is available on prod after w+1 positive edges.
//
//   w     - operand width in bits; also the number of stages.
//   m     - not used in this module's body.
//   prod  - 2*w-bit product leaving the last stage.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock; pipeline registers update on the positive edge.
module mult_pipe1
  #( int w = 16, int m = 1 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input uwire clk);

   localparam int stages = w;

   // Pipeline registers. Index 0 holds the values entering stage 0;
   // index stages holds the values leaving the last stage.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_cand[0:stages];
   logic [w-1:0] pl_plier[0:stages];

   always_ff @( posedge clk ) begin

      // Load a new multiplication into the front of the pipeline.
      pl_accum[0] <= 0;
      pl_cand[0] <= cand;
      pl_plier[0] <= plier;

      for ( int stage=0; stage<stages; stage++ ) begin

         logic [2*w-1:0] pp, accum;

         // Partial product for multiplier bit number `stage`.
         pp = pl_plier[stage][stage] ? pl_cand[stage] << stage : 0;
         accum = pl_accum[stage] + pp;

         // Pass the sum and the operands on to the next stage.
         pl_accum[stage+1] <= accum;
         pl_cand[stage+1] <= pl_cand[stage];
         pl_plier[stage+1] <= pl_plier[stage];

      end

   end

   assign prod = pl_accum[stages];

endmodule
// :Example:
// Pipelined shift-and-add multiplier that handles m multiplier bits in
// each stage.
//
//   w     - operand width in bits.
//   m     - multiplier bits processed per stage.
//   prod  - 2*w-bit product leaving the last stage.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock; pipeline registers update on the positive edge.
module mult_pipe #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input clk);

   // Enough stages to cover all w multiplier bits, m at a time.
   localparam int stages = ( w + m - 1 ) / m;

   // Pipeline registers: entry s feeds stage s, entry s+1 receives its
   // results.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_cand[0:stages];
   logic [w-1:0] pl_plier[0:stages];

   assign prod = pl_accum[stages];

   always_ff @( posedge clk ) begin

      // Front of the pipeline: new operands and an empty sum.
      pl_plier[0] <= plier;
      pl_cand[0] <= cand;
      pl_accum[0] <= 0;

      for ( int s = 0; s < stages; s++ ) begin

         logic [2*w-1:0] running;
         running = pl_accum[s];

         // Stage s is responsible for multiplier bits s*m .. s*m+m-1.
         // Positions at or beyond w (possible only in the last stage)
         // are skipped.
         for ( int bit_idx = s * m; bit_idx < ( s + 1 ) * m; bit_idx++ )
           if ( bit_idx < w ) begin
              if ( pl_plier[s][bit_idx] )
                running = running + ( pl_cand[s] << bit_idx );
           end

         pl_plier[s+1] <= pl_plier[s];
         pl_cand[s+1] <= pl_cand[s];
         pl_accum[s+1] <= running;

      end

   end

endmodule
// :Example:
// Pipelined multiplier that handles m multiplier bits per stage by
// treating the multiplier as a sequence of m-bit digits and multiplying
// the multiplicand by one digit in each stage.
//
//   w     - operand width in bits.
//   m     - width of a multiplier digit (bits processed per stage).
//   prod  - 2*w-bit product leaving the last stage.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock; pipeline registers update on the positive edge.
module mult_pipe_2 #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input clk);

   localparam int nstages = ( w + m - 1 ) / m;

   logic [2*w-1:0] pl_accum[0:nstages];
   logic [w-1:0] pl_cand[0:nstages];

   // Each pipeline entry stores the multiplier as nstages digits of m bits
   // so that digit number d can be selected with a single index.
   logic [nstages-1:0][m-1:0] pl_plier[0:nstages];

   assign prod = pl_accum[nstages];

   always_ff @( posedge clk ) begin

      // Front of the pipeline: new operands and an empty sum.
      pl_plier[0] <= plier;
      pl_cand[0] <= cand;
      pl_accum[0] <= 0;

      for ( int s = 0; s < nstages; s++ ) begin

         // Digit s of the multiplier times the multiplicand, moved to the
         // bit position at which that digit starts.
         logic [2*w-1:0] partial;
         partial = ( pl_plier[s][s] * pl_cand[s] ) << ( s * m );

         pl_plier[s+1] <= pl_plier[s];
         pl_cand[s+1] <= pl_cand[s];
         pl_accum[s+1] <= pl_accum[s] + partial;

      end

   end

endmodule
// Pipelined multiplier that emits one product bit per stage.
//
// Each stage holds w sum bits and w carry bits. In every stage the sum
// bits move up by one position, are added bit-by-bit to the carry bits and
// (during the first w stages) to one row of partial-product bits, and the
// sum bit produced at the top position is recorded as product bit number
// `stage`. There are 2*w stages, one for each product bit.
//
//   w     - operand width in bits.
//   m     - not used in this module's body.
//   prod  - 2*w-bit product leaving the last stage.
//   cand  - operand whose bit number `stage` gates the row added in `stage`.
//   plier - operand supplying the bits of each row, in reversed bit order.
//   clk   - clock; pipeline registers update on the positive edge.
module mult_pipe_wfront #( int w = 16, int m = 1 )
( output logic [2*w-1:0] prod,
input uwire [w-1:0] cand, plier,
input uwire clk );
localparam int stages = 2*w;
// Pipeline registers; entry 0 feeds stage 0, entry `stages` is the result.
logic [2*w-1:0] pl_prod[0:stages];
logic [w-1:0] pl_cand[0:stages];
logic [w-1:0] pl_plier[0:stages];
logic [w-1:0] pl_sum[0:stages];
logic [w-1:0] pl_carry[0:stages];
always_ff @( posedge clk ) begin
// Load new operands and clear the sum, carry and product bits.
pl_cand[0] <= cand;
pl_plier[0] <= plier;
pl_sum[0] <= 0;
pl_carry[0] <= 0;
pl_prod[0] <= 0;
for ( int stage = 0; stage < stages; stage++ ) begin
logic [2*w-1:0] prod_next;
logic [w-1:0] sum_next, carry_next;
logic [1:0] sc;
prod_next = pl_prod[stage];
// One three-input (full-adder style) addition per bit position.
for ( int i=0; i<w; i++ ) begin
logic a, b, c;
// a: sum bit from the position below (zero at position 0 and in stage 0).
a = stage && i ? pl_sum[stage][i-1] : 0;
// b: carry bit at this position (zero in stage 0).
b = stage ? pl_carry[stage][i] : 0;
// c: partial-product bit; rows are only added during the first w stages.
c = stage < w
? pl_plier[stage][w-1-i] && pl_cand[stage][stage] : 0;
sc = a + b + c;
{ carry_next[i], sum_next[i] } = sc;
end
// sc still holds the result for i = w-1, so this records the sum bit
// of the top position as product bit number `stage`.
prod_next[stage] = sc[0];
pl_cand[stage+1] <= pl_cand[stage];
pl_plier[stage+1] <= pl_plier[stage];
pl_sum[stage+1] <= sum_next;
pl_carry[stage+1] <= carry_next;
pl_prod[stage+1] <= prod_next;
end
end
assign prod = pl_prod[stages];
endmodule
// Combinational logic for one stage of a pipelined shift-and-add
// multiplier: adds the partial products for m consecutive multiplier bits
// to an incoming running sum.
//
//   w         - operand width in bits.
//   m         - multiplier bits handled by this stage.
//   stage     - stage number; this stage covers bits stage*m .. stage*m+m-1.
//   accum_out - accum_in plus this stage's partial products.
//   accum_in  - running sum from the preceding stage.
//   cand      - multiplicand.
//   plier     - multiplier.
module pipe_stage #( int w = 16, int m = 2, int stage = 0 )
   ( output logic [2*w-1:0] accum_out,
     input logic [2*w-1:0] accum_in,
     input logic [w-1:0] cand, plier );

   always_comb begin

      // Start from the incoming sum and add one partial product at a time.
      accum_out = accum_in;

      for ( int k = stage * m; k < stage * m + m; k++ )
        // Bit positions at or beyond w do not exist and are skipped.
        if ( k < w ) begin
           if ( plier[k] ) accum_out = accum_out + ( cand << k );
        end

   end

endmodule
// Pipelined multiplier in which the combinational work of each stage is
// performed by a separate pipe_stage instance and this module supplies
// only the pipeline registers.
//
//   w     - operand width in bits.
//   m     - multiplier bits processed per stage.
//   prod  - 2*w-bit product leaving the last stage.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock; pipeline registers update on the positive edge.
module mult_pipe_c_cpa #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input clk);

   localparam int stages = ( w + m - 1 ) / m;

   // Pipeline registers: entry s feeds stage s.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_cand[0:stages];
   logic [w-1:0] pl_plier[0:stages];

   assign prod = pl_accum[stages];

   // Operand registers for every stage, plus the cleared running sum that
   // enters stage 0.
   always_ff @( posedge clk ) begin
      pl_plier[0] <= plier;
      pl_cand[0] <= cand;
      pl_accum[0] <= 0;
      for ( int i = 1; i <= stages; i++ ) begin
         pl_plier[i] <= pl_plier[i-1];
         pl_cand[i] <= pl_cand[i-1];
      end
   end

   // One combinational stage and one running-sum register per stage.
   for ( genvar gs = 0; gs < stages; gs++ ) begin

      uwire [2*w-1:0] stage_sum;

      pipe_stage #(w, m, gs) our_stage
        ( stage_sum, pl_accum[gs], pl_cand[gs], pl_plier[gs] );

      always_ff @( posedge clk ) pl_accum[gs+1] <= stage_sum;

   end

endmodule
`include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_csa.v"
// Combinational logic for one stage of a pipelined multiplier in which the
// running sum is kept as a pair of vectors (a and b) and partial products
// are added with a chain of m CW_csa instances.
//
//   wid         - operand width in bits.
//   m           - multiplier bits (and CW_csa instances) in this stage.
//   stage       - stage number; covers bits stage*m .. stage*m+m-1.
//   accum_out_a - `carry` output of the last CW_csa in the chain.
//   accum_out_b - `sum` output of the last CW_csa in the chain.
//   accum_in_a, accum_in_b - vector pair from the preceding stage.
//   cand        - multiplicand.
//   plier       - multiplier.
module pipe_stage_csa #( int wid = 16, int m = 2, int stage = 0 )
   ( output uwire [2*wid-1:0] accum_out_a, accum_out_b,
     input uwire [2*wid-1:0] accum_in_a, accum_in_b,
     input uwire [wid-1:0] cand, plier );

   // link_a[k] / link_b[k] are the inputs of CW_csa number k; entry m is
   // the output of the last one.
   uwire [2*wid-1:0] link_a[0:m];
   uwire [2*wid-1:0] link_b[0:m];
   uwire cout[0:m-1];   // Carry-out of each CW_csa; not used further.

   assign link_a[0] = accum_in_a;
   assign link_b[0] = accum_in_b;

   for ( genvar k = 0; k < m; k++ ) begin

      localparam int bit_pos = stage * m + k;

      // Partial product for multiplier bit bit_pos; zero when that bit is
      // clear or when bit_pos lies beyond the operand width.
      uwire [2*wid-1:0] pp =
            bit_pos < wid && plier[bit_pos] ? cand << bit_pos : 0;

      CW_csa #(2*wid) csa
        ( .a(link_a[k]), .b(link_b[k]), .c(pp), .ci(1'b0),
          .carry(link_a[k+1]), .sum(link_b[k+1]), .co(cout[k]) );

   end

   assign accum_out_a = link_a[m];
   assign accum_out_b = link_b[m];

endmodule
// Pipelined multiplier whose stages are pipe_stage_csa instances. The
// running sum travels through the pipeline as two vectors, which are added
// together only once, after the last stage.
//
//   wid   - operand width in bits.
//   m     - multiplier bits processed per stage.
//   prod  - 2*wid-bit product.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock; pipeline registers update on the positive edge.
module mult_pipe_c_csa #( int wid = 16, int m = 2 )
   ( output uwire [2*wid-1:0] prod,
     input uwire [wid-1:0] cand, plier,
     input uwire clk);

   localparam int stages = ( wid + m - 1 ) / m;

   // Pipeline registers: entry s feeds stage s.
   logic [2*wid-1:0] pl_accum_a[0:stages];
   logic [2*wid-1:0] pl_accum_b[0:stages];
   logic [wid-1:0] pl_cand[0:stages];
   logic [wid-1:0] pl_plier[0:stages];

   // Single final addition combining the two vectors into the product.
   assign prod = pl_accum_a[stages] + pl_accum_b[stages];

   // Operand registers for every stage, plus the cleared vector pair that
   // enters stage 0.
   always_ff @( posedge clk ) begin
      pl_plier[0] <= plier;
      pl_cand[0] <= cand;
      pl_accum_b[0] <= 0;
      pl_accum_a[0] <= 0;
      for ( int i = 1; i <= stages; i++ ) begin
         pl_plier[i] <= pl_plier[i-1];
         pl_cand[i] <= pl_cand[i-1];
      end
   end

   // One combinational stage and one pair of registers per stage.
   for ( genvar gs = 0; gs < stages; gs++ ) begin

      uwire [2*wid-1:0] next_a, next_b;

      pipe_stage_csa #(wid, m, gs) our_stage
        ( next_a, next_b, pl_accum_a[gs], pl_accum_b[gs],
          pl_cand[gs], pl_plier[gs] );

      always_ff @( posedge clk ) begin
         pl_accum_b[gs+1] <= next_b;
         pl_accum_a[gs+1] <= next_a;
      end

   end

endmodule
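// NOTE(review): a bare `cadence` token stood here; presumably a stripped
// tool pragma excluding the code below from synthesis -- confirm.
// cadence translate_off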
// Passes a clock and a cycle count through a program block unchanged.
//
// NOTE(review): presumably this exists so that the copies change in the
// Reactive region of the simulation time step, after the design's
// non-blocking updates -- confirm against the intended use.
//
//   clk_reactive   - copy of clk.
//   cycle_reactive - copy of cycle.
//   clk            - clock to copy.
//   cycle          - cycle count to copy.
program reactivate
   ( output uwire clk_reactive,
     output int cycle_reactive,
     input uwire clk,
     input int cycle );

   assign cycle_reactive = cycle;
   assign clk_reactive = clk;

endprogram
// Testbench for the multiplier modules.
//
// Instantiates a behavioral reference multiplier and several pipelined
// multipliers, applies a short list of fixed operand pairs followed by
// random operands, samples each unit's output at the cycle computed from
// its registered properties, and reports the number of mismatches against
// the reference.
module testbench;
// Operand width used for every instantiated multiplier.
localparam int wid = 16;
// Number of operand pairs applied.
localparam int num_tests = 1000;
// Size of the prod array; each multiplier under test owns one index.
localparam int NUM_MULT = 20;
// NOTE(review): err_limit is not referenced in this module; the message
// limit below uses the literal 5 instead -- confirm which is intended.
localparam int err_limit = 7;
// Set by the test loop when new operands have been issued; cleared by the
// operand-driving process once it has consumed them.
bit use_others;
// Operands of the current test (held for the whole test).
logic [wid-1:0] plier, cand;
// Operands presented to the pipelined units (change every cycle).
logic [wid-1:0] plierp, candp;
// Outputs of the multipliers, indexed by the idx registered for each one.
logic [2*wid-1:0] prod[NUM_MULT];
// Per-multiplier bookkeeping:
//   idx       - index of the unit's output in prod.
//   err_count - number of wrong results seen.
//   seq, pipe, wf, deg - properties used to compute the expected latency.
//   sout      - output value sampled for the current test.
//   cyc_tot   - total cycles spent waiting, summed over all tests.
//   latency   - latency most recently computed for the unit.
typedef struct { int idx; int err_count = 0;
bit seq = 0; bit pipe = 0; bit wf = 0; int deg = 1;
logic [2*wid-1:0] sout = 'h111; int cyc_tot = 0;
int latency = 0;
} Info;
// Bookkeeping records, keyed by a descriptive name of the multiplier.
Info pi[string];
// Simulation is ended if the cycle count reaches this value.
localparam int cycle_limit = num_tests * wid * 8;
int cycle;
bit done;
logic clock;
logic clk_reactive;
int cycle_reactive;
// Copies of clock and cycle produced by the reactivate program block.
reactivate ra(clk_reactive,cycle_reactive,clock,cycle);
// Clock generation and simulation termination.
initial begin
clock = 0;
cycle = 0;
fork
// Every 10 time units the one-bit clock is incremented (toggled) and its
// value before the increment is added to cycle, so cycle advances each
// time clock goes from 1 to 0.
forever #10 cycle += clock++;
wait( done );
wait( cycle >= cycle_limit )
$write("*** Cycle limit exceeded, ending.\n");
// Finish as soon as either the tests are done or the limit is reached.
join_any;
$finish();
end
// Operand driver for the pipelined units. One time unit after each
// positive edge of clk_reactive: if a new test was just issued, present
// its operands and clear the flag; otherwise present filler operands
// (the cycle count and 256).
initial begin
while ( !done ) @( posedge clk_reactive ) #1
if ( use_others ) begin
plierp = plier;
candp = cand;
use_others = 0;
end else begin
plierp = cycle;
candp = 256;
end
end
// Register a multiplier with the seq property set.
//   idx  - index of its output in prod.
//   name - base of the descriptive name; the key is "<name> Deg <deg>".
//   deg  - value stored in the record's deg field.
task pi_seq(input int idx, input string name, input int deg);
automatic string m = $sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1;
endtask
// Register a multiplier with the seq and pipe properties set.
task pi_pipe(input int idx, input string name, input int deg);
automatic string m = $sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1;
endtask
// Register a multiplier with the seq, pipe and wf properties set.
task pi_wpipe(input int idx, input string name, input int deg);
automatic string m = $sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].wf = 1;
endtask
// Reference multiplier; driven by the held operands plier and cand. Its
// output, prod[0], is the value every other unit is compared against.
mult_behav_1 #(wid) mb1(prod[0], plier, cand);
initial pi["Behavioral"].idx = 0;
// Units under test; driven by the per-cycle operands plierp and candp.
// Each instantiation is followed by the registration of its prod index.
mult_pipe1 #(wid) ms18(prod[18], plierp, candp, clock);
initial pi_pipe(18,"Pipelined Simple",1);
mult_pipe #(wid,4) ms54(prod[7], plierp, candp, clock);
initial pi_pipe(7,"Pipelined",ms54.m);
mult_pipe #(wid,3) ms53(prod[8], plierp, candp, clock);
initial pi_pipe(8,"Pipelined",ms53.m);
mult_pipe_wfront #(wid,1) ms4(prod[4], plierp, candp, clock);
initial pi_wpipe(4,"Pipelined WF",ms4.m);
mult_pipe_2 #(wid,4) ms17(prod[17], plierp, candp, clock);
initial pi_pipe(17,"Pipelined 2",ms17.m);
mult_pipe_2 #(wid,3) ms16(prod[16], plierp, candp, clock);
initial pi_pipe(16,"Pipelined 2",ms16.m);
mult_pipe_c_cpa #(wid,4) pgam4(prod[12], plierp, candp, clock);
initial pi_pipe(12,"Pipelined Comb CPA",pgam4.m);
mult_pipe_c_cpa #(wid,3) pgam13(prod[13], plierp, candp, clock);
initial pi_pipe(13,"Pipelined Comb CPA",pgam13.m);
mult_pipe_c_csa #(wid,3) pgam2(prod[2], plierp, candp, clock);
initial pi_pipe(2,"Pipelined CSA",pgam2.m);
mult_pipe_c_csa #(wid,4) pgam3(prod[3], plierp, candp, clock);
initial pi_pipe(3,"Pipelined CSA",pgam3.m);
// Fixed operands used for the first tests, consumed two at a time
// (multiplier first, then multiplicand); random values are used afterward.
int tests[$] = {1,1, 1,2, 1,32, 2,1, 32, 1};
// Main test sequence.
initial begin
done = 0;
use_others = 0;
// Check, after a zero delay, that no prod index has been registered by
// more than one multiplier, and list the indices nobody uses.
#0 begin
string index_used[NUM_MULT];
automatic int n_unused = 0, n_reused = 0;
foreach ( pi[ mut ] ) begin
automatic int idx = pi[mut].idx;
// A non-empty entry means an earlier multiplier already claimed idx.
if ( index_used[idx].len() )
begin
$write("*** Index %0d used by %s and %s.\n",
idx, index_used[idx], mut );
n_reused++;
end
index_used[idx] = mut;
end
$write("Unused positions: ");
foreach ( index_used[idx] )
if ( index_used[idx].len() == 0 )
$write("%s%0d", n_unused++ ? ", " : "", idx);
$write("%s.\n",n_unused ? "" : "none -- all used");
if ( n_reused )
$fatal(2, "\nFound %0d re-used indices. Aborting simulation.\n\n",
n_reused);
end
@( posedge clk_reactive );
for ( int i=0; i<num_tests; i++ ) begin
// Cycle at which this test's operands are issued.
automatic int cyc_start = cycle;
// Number of multipliers whose output has not yet been sampled.
automatic int awaiting = pi.num();
plier = tests.size() ? tests.pop_front() : $random();
cand = tests.size() ? tests.pop_front() : $random();
plierp = plier;
candp = cand;
use_others = 1;
// Start one process per multiplier; each waits until the cycle at which
// that multiplier's result is expected and then samples its output.
foreach ( pi[muti] ) begin
// NOTE(review): p is not referenced after this declaration.
automatic string mut = muti; automatic Info p = pi[mut];
fork begin
automatic int arrival_late = 1;
// Number of steps needed to cover wid bits, deg bits at a time.
automatic int steps = ( wid + pi[mut].deg - 1 ) / pi[mut].deg;
// Expected latency in cycles, chosen from the registered properties.
automatic int latency
= !pi[mut].seq ? 1 :
!pi[mut].pipe ? 2 * steps :
pi[mut].wf ? arrival_late + 2 * steps :
arrival_late + steps;
automatic int eta = 1 + cyc_start + latency;
pi[mut].latency = latency;
wait ( cycle_reactive == eta );
awaiting--;
pi[mut].sout = prod[pi[mut].idx];
pi[mut].cyc_tot += cycle - cyc_start;
end join_none;
end
// Continue once every multiplier's output has been sampled.
wait ( awaiting == 0 );
// Compare each sampled output with the reference output. A message is
// printed only while the unit's error count is below 5.
foreach ( pi[ mut ] )
if ( prod[0] !== pi[mut].sout ) begin
pi[mut].err_count++;
if ( pi[mut].err_count < 5 )
$write
("%-25s wrong result: %0d * %0d: 0x%0h != 0x%0h (correct)\n",
mut, plier, cand, pi[mut].sout, prod[0]);
end
@( posedge clk_reactive );
end
// Summary: error count and average cycles per test for each multiplier
// (reported as 1 for units without the seq property).
foreach ( pi[ mut ] )
$write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
num_tests, mut, pi[mut].err_count,
pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);
done = 1;
$finish(2);
end
endmodule
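// NOTE(review): a bare `cadence` token stood here; presumably a stripped
// tool pragma re-enabling synthesis after the testbench code -- confirm.
// cadence translate_on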