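// :Def: Pipelining
// Dividing a computation into stages separated by registers so that a new
// input can be accepted every clock cycle while earlier inputs are still
// being processed by later stages.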
// :Example:
//
// very_simple_pipe: a fixed two-register pipeline.
//
// Parameters:
//   w   - width in bits of the data path.
// Ports:
//   x   - registered output; holds the value a had two positive clock
//         edges earlier.
//   a   - data input.
//   clk - clock; both registers load on the positive edge.
module very_simple_pipe
  #( int w = 16 )
   ( output logic [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   // First pipeline register, between the input and the output register.
   logic [w-1:0] r;

   always_ff @( posedge clk ) r <= a;
   always_ff @( posedge clk ) x <= r;

endmodule
// :Example:
//
// simple_pipe2: a pipeline of nstages registers written with a loop.
//
// Parameters:
//   w       - width in bits of the data path.
//   nstages - number of registers between a and x.
// Ports:
//   x   - contents of the last register, r[nstages-1].
//   a   - data input, loaded into r[0] on each positive clock edge.
//   clk - clock.
module simple_pipe2
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   // One register per stage.
   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin
      r[0] <= a;
      // Non-blocking assignments: every r[i] receives the value r[i-1]
      // held before this clock edge.
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];
   end

   assign x = r[nstages-1];

endmodule
// :Example:
//
// simple_pipe2_ba: same structure as a loop-built register pipeline, but
// r[0] is written with a BLOCKING assignment.
//
// Because r[0] = a takes effect immediately, the r[1] <= r[0] executed in
// the same clock edge already sees the new value of a. For nstages >= 2
// a value therefore reaches x after nstages-1 positive clock edges rather
// than nstages.
//
// Parameters:
//   w       - width in bits of the data path.
//   nstages - number of elements in r.
// Ports:
//   x   - contents of r[nstages-1].
//   a   - data input.
//   clk - clock.
module simple_pipe2_ba
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin
      // Blocking assignment (intentional in this example).
      r[0] = a;
      // For i == 1 the right-hand side is the value of a just stored above.
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];
   end

   assign x = r[nstages-1];

endmodule
// :Example:
//
// simple_pipe_avg: register pipeline that also outputs the average of the
// values currently held in all of its stages.
//
// r[0] is driven directly by a (no register), so a reaches x after
// nstages-1 positive clock edges and the current input takes part in the
// average.
//
// Parameters:
//   w       - width in bits of the data path.
//   nstages - number of elements in r (r[0] is the unregistered input).
// Ports:
//   x   - contents of r[nstages-1].
//   avg - sum of r[0] .. r[nstages-1] divided by nstages (integer division).
//   a   - data input.
//   clk - clock.
module simple_pipe_avg
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     output logic [w-1:0] avg,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   // Stage 0 is the input itself.
   assign r[0] = a;

   always_ff @( posedge clk ) begin
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];
   end

   assign x = r[nstages-1];

   // Wide enough to hold the sum of nstages w-bit values without overflow.
   logic [w+$clog2(nstages):0] sum;

   always_comb begin
      sum = 0;
      for ( int i=0; i<nstages; i++ ) sum += r[i];
      avg = sum / nstages;
   end

endmodule
// :Example:
//
// simple_pipe_add1: register pipeline in which every stage adds 1 to the
// value passing through it.
//
// A value applied on a appears on x nstages positive clock edges later,
// increased by nstages (modulo 2**w).
//
// Parameters:
//   w       - width in bits of the data path.
//   nstages - number of stages (registers).
// Ports:
//   x   - contents of the last register, r[nstages-1].
//   a   - data input.
//   clk - clock.
module simple_pipe_add1
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin
      r[0] <= a + 1;
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1] + 1;
   end

   assign x = r[nstages-1];

endmodule
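// :Def: Throughput
// The rate at which results are produced, e.g., results per clock cycle.
// :Def: Latency
// The time, e.g., in clock cycles, from when an input is applied until the
// corresponding result is available.
// :Def: Pipeline Latch
// The registers that hold the values passed from one pipeline stage to the
// next.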
// :Example:
//
// mult_pipe1: pipelined unsigned multiplier that handles one multiplier
// (cand) bit per stage.
//
// Stage s adds (plier << s) to the running sum when bit s of cand is set.
// The operands travel down the pipeline alongside the running sum so that
// each stage works on the operands belonging to its own running sum.
// A product appears on prod w positive clock edges after its operands are
// present at a clock edge.
//
// Parameters:
//   w - operand width in bits; the product is 2*w bits.
//   m - not used in this module's body.
// Ports:
//   prod  - product, taken from the last pipeline register.
//   plier - multiplicand.
//   cand  - multiplier; its bits select the partial products.
//   clk   - clock.
module mult_pipe1
  #( int w = 16, int m = 1 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input uwire clk);

   localparam int stages = w;

   // Pipeline registers. Index 0 holds the values entering the first stage.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_plier[0:stages];
   logic [w-1:0] pl_cand[0:stages];

   always_ff @( posedge clk ) begin

      // Blocking assignments: stage 0 below reads these values in the
      // same clock edge.
      pl_accum[0] = 0;
      pl_plier[0] = plier;
      pl_cand[0] = cand;

      for ( int stage=0; stage<stages; stage++ ) begin

         // The loop index is a run-time variable, not an elaboration-time
         // constant, so the bit position is an ordinary int and not a
         // localparam.
         int pos;
         logic [2*w-1:0] pp, accum;

         pos = stage;

         // Partial product for this stage's multiplier bit.
         pp = pl_cand[stage][pos] ? pl_plier[stage] << pos : 0;
         accum = pl_accum[stage] + pp;

         pl_accum[stage+1] <= accum;
         pl_cand[stage+1] <= pl_cand[stage];
         pl_plier[stage+1] <= pl_plier[stage];

      end
   end

   assign prod = pl_accum[stages];

endmodule
// :Example:
// mult_pipe: pipelined unsigned multiplier that consumes m bits of cand
// in every stage.
//
// Parameters:
//   w - operand width in bits; the product is 2*w bits.
//   m - number of cand bits handled per stage.
// Ports:
//   prod  - product, read from the final pipeline register.
//   plier - multiplicand.
//   cand  - multiplier.
//   clk   - clock; all pipeline registers load on the positive edge.
module mult_pipe
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   // ceil( w / m ) stages are needed to cover every bit of cand.
   localparam int stages = ( w + m - 1 ) / m;

   // Running sum and operand copies; slot 0 feeds the first stage.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_plier[0:stages];
   logic [w-1:0] pl_cand[0:stages];

   always_ff @( posedge clk ) begin

      // Slot 0 is written with blocking assignments so that the first
      // stage sees the current inputs during this same clock edge.
      pl_accum[0] = 0;
      pl_plier[0] = plier;
      pl_cand[0] = cand;

      for ( int s = 0; s < stages; s++ ) begin

         logic [2*w-1:0] running;
         running = pl_accum[s];

         // Add one shifted copy of the multiplicand for each set bit of
         // cand that belongs to this stage.
         for ( int k = 0; k < m; k++ ) begin
            int bit_idx;
            bit_idx = s * m + k;
            if ( bit_idx < w && pl_cand[s][bit_idx] )
              running = running + ( pl_plier[s] << bit_idx );
         end

         pl_plier[s+1] <= pl_plier[s];
         pl_cand[s+1] <= pl_cand[s];
         pl_accum[s+1] <= running;

      end
   end

   assign prod = pl_accum[stages];

endmodule
// :Example:
// mult_pipe_2: pipelined unsigned multiplier that treats cand as a vector
// of m-bit digits and multiplies plier by one digit per stage.
//
// Parameters:
//   w - operand width in bits; the product is 2*w bits.
//   m - digit size in bits, i.e. cand bits handled per stage.
// Ports:
//   prod  - product, read from the final pipeline register.
//   plier - multiplicand.
//   cand  - multiplier.
//   clk   - clock.
module mult_pipe_2
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int nstages = ( w + m - 1 ) / m;

   logic [2*w-1:0] pl_accum[0:nstages];
   logic [w-1:0] pl_plier[0:nstages];

   // Each pipeline slot stores cand as nstages digits of m bits, so that
   // digit number `stage` can be picked out with a simple index.
   logic [nstages-1:0][m-1:0] pl_cand[0:nstages];

   always_ff @( posedge clk ) begin

      // Blocking assignments: the first stage uses these values in this
      // same clock edge.
      pl_accum[0] = 0;
      pl_plier[0] = plier;
      pl_cand[0] = cand;

      for ( int s = 0; s < nstages; s++ ) begin

         // Multiplicand times digit s, moved to that digit's weight.
         logic [2*w-1:0] digit_term;
         digit_term = pl_plier[s] * pl_cand[s][s];
         digit_term = digit_term << ( s * m );

         pl_accum[s+1] <= pl_accum[s] + digit_term;
         pl_plier[s+1] <= pl_plier[s];
         pl_cand[s+1] <= pl_cand[s];

      end
   end

   assign prod = pl_accum[nstages];

endmodule
// mult_pipe_wfront: pipelined multiplier built from a row of w one-bit
// adder cells per stage and 2*w stages. Each stage writes one bit of the
// product (stage s writes bit s).
//
// Parameters:
//   w - operand width in bits; the product is 2*w bits.
//   m - not used in the module body (the testbench reads it hierarchically).
// Ports:
//   prod  - product, read from the final pipeline register pl_prod[stages].
//   plier - operand whose bit `stage` is used in stage `stage`.
//   cand  - operand whose bits are spread across the cells of a stage.
//   clk   - clock; pipeline registers load on the positive edge.
module mult_pipe_wfront #( int w = 16, int m = 1 )
( output logic [2*w-1:0] prod,
input uwire [w-1:0] plier, cand,
input uwire clk );
localparam int stages = 2*w;
// Pipeline registers. Index 0 holds the values entering the first stage.
logic [2*w-1:0] pl_prod[0:stages];
logic [w-1:0] pl_plier[0:stages];
logic [w-1:0] pl_cand[0:stages];
// Per-cell sum and carry bits passed from one stage to the next.
logic [w-1:0] pl_sum[0:stages];
logic [w-1:0] pl_carry[0:stages];
always_ff @( posedge clk ) begin
// Blocking assignments: stage 0 below reads these in the same clock edge.
pl_plier[0] = plier;
pl_cand[0] = cand;
pl_sum[0] = 0;
pl_carry[0] = 0;
pl_prod[0] = 0;
for ( int stage = 0; stage < stages; stage++ ) begin
logic [2*w-1:0] prod_next;
logic [w-1:0] sum_next, carry_next;
// Two-bit result of the one-bit addition in the current cell.
logic [1:0] sc;
prod_next = pl_prod[stage];
for ( int i=0; i<w; i++ ) begin
logic a, b, c;
// a: sum bit of cell i-1 from the incoming stage registers
// (0 in stage 0 and for cell 0).
a = stage && i ? pl_sum[stage][i-1] : 0;
// b: carry bit of this same cell from the incoming stage registers
// (0 in stage 0).
b = stage ? pl_carry[stage][i] : 0;
// c: AND of cand bit w-1-i and plier bit `stage`; only the first w
// stages contribute such a bit.
c = stage < w
? pl_cand[stage][w-1-i] && pl_plier[stage][stage] : 0;
sc = a + b + c;
{ carry_next[i], sum_next[i] } = sc;
end
// sc still holds the result of the last cell (i = w-1); its sum bit
// becomes product bit `stage`.
prod_next[stage] = sc[0];
pl_cand[stage+1] <= pl_cand[stage];
pl_plier[stage+1] <= pl_plier[stage];
pl_sum[stage+1] <= sum_next;
pl_carry[stage+1] <= carry_next;
pl_prod[stage+1] <= prod_next;
end
end
assign prod = pl_prod[stages];
endmodule
// pipe_stage: combinational logic of one multiplier pipeline stage.
//
// Adds to accum_in one shifted copy of plier for every set bit of cand
// among bit positions stage*m .. stage*m + m - 1 (positions at or beyond
// w are ignored).
//
// Parameters:
//   w     - operand width in bits; the running sum is 2*w bits.
//   m     - number of cand bits handled by this stage.
//   stage - stage number; selects which bits of cand are examined.
// Ports:
//   accum_out - running sum leaving the stage.
//   accum_in  - running sum entering the stage.
//   plier     - multiplicand.
//   cand      - multiplier.
module pipe_stage
  #( int w = 16, int m = 2, int stage = 0 )
   ( output logic [2*w-1:0] accum_out,
     input logic [2*w-1:0] accum_in,
     input logic [w-1:0] plier, cand );

   always_comb begin

      // Start from the incoming sum and build the result in place.
      accum_out = accum_in;

      for ( int k = 0; k < m; k++ ) begin
         int bit_idx;
         bit_idx = stage * m + k;
         if ( bit_idx < w && cand[bit_idx] )
           accum_out = accum_out + ( plier << bit_idx );
      end

   end

endmodule
// mult_pipe_c_cpa: pipelined unsigned multiplier whose per-stage logic is
// a separately instantiated combinational module (pipe_stage).
//
// Parameters:
//   w - operand width in bits; the product is 2*w bits.
//   m - number of cand bits handled per stage.
// Ports:
//   prod  - product, read from the final pipeline register.
//   plier - multiplicand.
//   cand  - multiplier.
//   clk   - clock; pipeline registers load on the positive edge.
module mult_pipe_c_cpa
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int stages = ( w + m - 1 ) / m;

   // Index 0 holds the values entering the first stage; indices 1..stages
   // are pipeline registers.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_plier[0:stages];
   logic [w-1:0] pl_cand[0:stages];

   for ( genvar stage = 0; stage < stages; stage++ ) begin

      // Output of this stage's combinational logic.
      uwire [2*w-1:0] accum;

      pipe_stage #(w, m, stage) our_stage
        (accum, pl_accum[stage], pl_plier[stage], pl_cand[stage]);

      always_ff @( posedge clk )
        pl_accum[stage+1] <= accum;

   end

   // Values entering stage 0 are not registered. The constant starting
   // sum is driven here, together with the operands, so that it is valid
   // from time zero. Assigning it inside a clocked block would leave it
   // unknown until the first clock edge and would race with the stage
   // registers above on that edge.
   always_comb begin
      pl_accum[0] = 0;
      pl_plier[0] = plier;
      pl_cand[0] = cand;
   end

   // Move the operands down the pipeline alongside the running sum.
   always_ff @( posedge clk ) begin
      for ( int stage=0; stage<stages; stage++ ) begin
         pl_cand[stage+1] <= pl_cand[stage];
         pl_plier[stage+1] <= pl_plier[stage];
      end
   end

   assign prod = pl_accum[stages];

endmodule
`include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_csa.v"
// pipe_stage_csa: combinational logic of one multiplier pipeline stage
// using a chain of m carry-save adders (CW_csa). The running sum is kept
// as a pair of vectors (a, b).
//
// Parameters:
//   wid   - operand width in bits; the running-sum vectors are 2*wid bits.
//   m     - number of cand bits (and carry-save adders) in this stage.
//   stage - stage number; selects which bits of cand are examined.
// Ports:
//   accum_out_a, accum_out_b - running-sum pair leaving the stage.
//   accum_in_a,  accum_in_b  - running-sum pair entering the stage.
//   plier                    - multiplicand.
//   cand                     - multiplier.
module pipe_stage_csa
  #( int wid = 16, int m = 2, int stage = 0 )
   ( output uwire [2*wid-1:0] accum_out_a, accum_out_b,
     input uwire [2*wid-1:0] accum_in_a, accum_in_b,
     input uwire [wid-1:0] plier,
     input uwire [wid-1:0] cand );

   // Element -1 is the stage input; element i is the output of adder i.
   uwire [2*wid-1:0] accum_a[m-1:-1];
   uwire [2*wid-1:0] accum_b[m-1:-1];

   // Carry-out of each adder; not used further in this module.
   uwire co[-1:m-1];

   assign accum_a[-1] = accum_in_a;
   assign accum_b[-1] = accum_in_b;
   assign accum_out_a = accum_a[m-1];
   assign accum_out_b = accum_b[m-1];

   for ( genvar i = 0; i < m; i++ ) begin

      // Bit position of cand handled by adder i of this stage.
      localparam int pos = stage * m + i;

      // Partial product for that bit position.
      uwire [2*wid-1:0] pp;

      // pos is an elaboration-time constant. When wid is not a multiple
      // of m the last stage has positions at or beyond wid; choose the
      // zero partial product at elaboration time for those so that
      // cand is never indexed out of range.
      if ( pos < wid ) begin
         assign pp = cand[pos] ? plier << pos : '0;
      end else begin
         assign pp = '0;
      end

      CW_csa #(2*wid) csa
        ( .carry(accum_a[i]), .sum(accum_b[i]), .co(co[i]),
          .a(accum_a[i-1]), .b(accum_b[i-1]), .c(pp), .ci(1'b0) );

   end

endmodule
// mult_pipe_c_csa: pipelined unsigned multiplier whose stages use
// carry-save adders (pipe_stage_csa). The running sum travels down the
// pipeline as a pair of vectors, which are added together only at the
// output.
//
// Parameters:
//   wid - operand width in bits; the product is 2*wid bits.
//   m   - number of cand bits handled per stage.
// Ports:
//   prod  - product: sum of the two vectors in the final pipeline
//           registers.
//   plier - multiplicand.
//   cand  - multiplier.
//   clk   - clock; pipeline registers load on the positive edge.
module mult_pipe_c_csa
  #( int wid = 16, int m = 2 )
   ( output uwire [2*wid-1:0] prod,
     input uwire [wid-1:0] plier,
     input uwire [wid-1:0] cand,
     input uwire clk);

   localparam int stages = ( wid + m - 1 ) / m;

   // Index 0 holds the values entering the first stage; indices 1..stages
   // are pipeline registers.
   logic [2*wid-1:0] pl_accum_a[0:stages];
   logic [2*wid-1:0] pl_accum_b[0:stages];
   logic [wid-1:0] pl_plier[0:stages];
   logic [wid-1:0] pl_cand[0:stages];

   for ( genvar stage = 0; stage < stages; stage++ ) begin

      // Outputs of this stage's combinational logic.
      uwire [2*wid-1:0] accum_a, accum_b;

      pipe_stage_csa #(wid, m, stage) our_stage
        (accum_a, accum_b, pl_accum_a[stage], pl_accum_b[stage],
         pl_plier[stage], pl_cand[stage]);

      always_ff @( posedge clk ) begin
         pl_accum_a[stage+1] <= accum_a;
         pl_accum_b[stage+1] <= accum_b;
      end

   end

   // Values entering stage 0 are not registered. The constant starting
   // sums are driven here, together with the operands, so that they are
   // valid from time zero. Assigning them inside a clocked block would
   // leave them unknown until the first clock edge and would race with
   // the stage registers above on that edge.
   always_comb begin
      pl_accum_a[0] = 0;
      pl_accum_b[0] = 0;
      pl_plier[0] = plier;
      pl_cand[0] = cand;
   end

   // Move the operands down the pipeline alongside the running sum.
   always_ff @( posedge clk ) begin
      for ( int stage=0; stage<stages; stage++ ) begin
         pl_cand[stage+1] <= pl_cand[stage];
         pl_plier[stage+1] <= pl_plier[stage];
      end
   end

   // The only carry-propagate addition in the design.
   assign prod = pl_accum_a[stages] + pl_accum_b[stages];

endmodule
// mult_behav_1: behavioral (single-expression) unsigned multiplier.
//
// Parameters:
//   wid - operand width in bits; the product is 2*wid bits.
// Ports:
//   prod  - plier * cand.
//   plier - first operand.
//   cand  - second operand.
module mult_behav_1
  #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier, cand );

   // The operands are widened to the width of prod before multiplying,
   // so the full product is kept.
   always_comb prod = plier * cand;

endmodule
// cadence translate_off
// reactivate: program block that passes clk and cycle through unchanged
// to clk_reactive and cycle_reactive using continuous assignments.
// NOTE(review): presumably used so that testbench code waiting on the
// *_reactive copies runs after the design has settled for the current
// time step -- confirm against the simulator's scheduling of program
// blocks.
//
// Ports:
//   clk_reactive   - copy of clk.
//   cycle_reactive - copy of cycle.
//   clk            - clock from the testbench.
//   cycle          - cycle counter from the testbench.
program reactivate
(output uwire clk_reactive, output int cycle_reactive,
input uwire clk, input int cycle);
assign clk_reactive = clk;
assign cycle_reactive = cycle;
endprogram
// testbench: instantiates one behavioral reference multiplier and several
// pipelined multipliers, applies num_tests operand pairs, and compares
// the output each multiplier shows after its expected latency with the
// output of the reference multiplier.
module testbench;
// Operand width in bits used for every multiplier instance.
localparam int wid = 16;
// Number of operand pairs applied.
localparam int num_tests = 1000;
// Number of elements in prod[]; each multiplier instance drives one.
localparam int NUM_MULT = 20;
// NOTE(review): err_limit is not referenced anywhere in this module; the
// limit on printed error messages further below is the literal 5.
localparam int err_limit = 7;
// Set to 1 when a new operand pair has just been applied; cleared by the
// plierp/candp driver below.
bit use_others;
// Operands seen by the behavioral reference multiplier.
logic [wid-1:0] plier, cand;
// Operands seen by the pipelined multipliers.
logic [wid-1:0] plierp, candp;
// One product per multiplier instance; element 0 is the reference.
logic [2*wid-1:0] prod[NUM_MULT];
// Per-multiplier bookkeeping:
//   idx       - index of this multiplier's output in prod[].
//   err_count - number of tests with a wrong result.
//   seq, pipe, wf - flags used to choose the expected latency.
//   deg       - value passed as `deg` when the multiplier was registered
//               (the instance's m parameter or a literal).
//   sout      - output captured for the current test.
//   cyc_tot   - total cycles waited over all tests.
//   latency   - expected latency computed for the most recent test.
typedef struct { int idx; int err_count = 0;
bit seq = 0; bit pipe = 0; bit wf = 0; int deg = 1;
logic [2*wid-1:0] sout = 'h111; int cyc_tot = 0;
int latency = 0;
} Info;
// Multiplier records, keyed by a descriptive name.
Info pi[string];
// Simulation is stopped if the cycle counter reaches this value.
localparam int cycle_limit = num_tests * wid * 8;
int cycle;
bit done;
logic clock;
// Copies of clock and cycle provided by the reactivate program block.
logic clk_reactive;
int cycle_reactive;
reactivate ra(clk_reactive,cycle_reactive,clock,cycle);
// Clock generator and simulation watchdog.
initial begin
clock = 0;
cycle = 0;
fork
// Every 10 time units clock (1 bit wide) is incremented, which toggles
// it, and cycle is increased by the value clock had before the toggle.
forever #10 cycle += clock++;
wait( done );
wait( cycle >= cycle_limit )
$write("*** Cycle limit exceeded, ending.\n");
// Whichever of the last two threads completes first ends the simulation.
join_any;
$finish();
end
// Driver for the pipelined multipliers' inputs. One time unit after each
// positive edge of clk_reactive: if a new test was just applied, keep the
// test operands on plierp/candp and clear use_others; otherwise replace
// them with unrelated values (the cycle count and 256), so the test
// operands are held only until the next check of this block.
initial begin
while ( !done ) @( posedge clk_reactive ) #1
if ( use_others ) begin
plierp = plier;
candp = cand;
use_others = 0;
end else begin
plierp = cycle;
candp = 256;
end
end
// Register a multiplier with seq set. The record is stored under the key
// "<name> Deg <deg>".
task pi_seq(input int idx, input string name, input int deg);
automatic string m = $sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1;
endtask
// Register a multiplier with seq and pipe set.
task pi_pipe(input int idx, input string name, input int deg);
automatic string m = $sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1;
endtask
// Register a multiplier with seq, pipe and wf set.
task pi_wpipe(input int idx, input string name, input int deg);
automatic string m = $sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].wf = 1;
endtask
// Reference multiplier: driven by plier/cand, result in prod[0].
mult_behav_1 #(wid) mb1(prod[0], plier, cand);
initial pi["Behavioral"].idx = 0;
// Multipliers under test: each is driven by plierp/candp, writes its own
// element of prod[], and is registered under the same index.
mult_pipe1 #(wid) ms18(prod[18], plierp, candp, clock);
initial pi_pipe(18,"Pipelined Simple",1);
mult_pipe #(wid,4) ms54(prod[7], plierp, candp, clock);
initial pi_pipe(7,"Pipelined",ms54.m);
mult_pipe #(wid,3) ms53(prod[8], plierp, candp, clock);
initial pi_pipe(8,"Pipelined",ms53.m);
mult_pipe_wfront #(wid,1) ms4(prod[4], plierp, candp, clock);
initial pi_wpipe(4,"Pipelined WF",ms4.m);
mult_pipe_2 #(wid,4) ms17(prod[17], plierp, candp, clock);
initial pi_pipe(17,"Pipelined 2",ms17.m);
mult_pipe_2 #(wid,3) ms16(prod[16], plierp, candp, clock);
initial pi_pipe(16,"Pipelined 2",ms16.m);
mult_pipe_c_cpa #(wid,4) pgam4(prod[12], plierp, candp, clock);
initial pi_pipe(12,"Pipelined Comb CPA",pgam4.m);
mult_pipe_c_cpa #(wid,3) pgam13(prod[13], plierp, candp, clock);
initial pi_pipe(13,"Pipelined Comb CPA",pgam13.m);
mult_pipe_c_csa #(wid,3) pgam2(prod[2], plierp, candp, clock);
initial pi_pipe(2,"Pipelined CSA",pgam2.m);
mult_pipe_c_csa #(wid,4) pgam3(prod[3], plierp, candp, clock);
initial pi_pipe(3,"Pipelined CSA",pgam3.m);
// Operand values used, two per test, before switching to $random values.
int tests[$] = {1,1, 1,2, 1,32, 2,1, 32, 1};
// Main test sequence.
initial begin
done = 0;
use_others = 0;
// Check that no two registered multipliers share an index into prod[].
// NOTE(review): the #0 presumably lets the registration initial blocks
// above run first -- confirm.
#0 begin
string index_used[NUM_MULT];
automatic int n_unused = 0, n_reused = 0;
foreach ( pi[ mut ] ) begin
automatic int idx = pi[mut].idx;
if ( index_used[idx].len() )
begin
$write("*** Index %0d used by %s and %s.\n",
idx, index_used[idx], mut );
n_reused++;
end
index_used[idx] = mut;
end
$write("Unused positions: ");
foreach ( index_used[idx] )
if ( index_used[idx].len() == 0 )
$write("%s%0d", n_unused++ ? ", " : "", idx);
$write("%s.\n",n_unused ? "" : "none -- all used");
if ( n_reused )
$fatal(2, "\nFound %0d re-used indices. Aborting simulation.\n\n",
n_reused);
end
@( posedge clk_reactive );
for ( int i=0; i<num_tests; i++ ) begin
automatic int cyc_start = cycle;
// Number of multipliers whose output has not yet been captured.
automatic int awaiting = pi.num();
// Take operands from the tests queue while it has entries, then use
// random values.
plier = tests.size() ? tests.pop_front() : $random();
cand = tests.size() ? tests.pop_front() : $random();
plierp = plier;
candp = cand;
use_others = 1;
// Start one thread per multiplier; each waits for that multiplier's
// expected latency and then captures its output.
foreach ( pi[muti] ) begin
automatic string mut = muti; automatic Info p = pi[mut];
fork begin
// Number of steps: ceil( wid / deg ).
automatic int steps = ( wid + pi[mut].deg - 1 ) / pi[mut].deg;
// Expected latency in cycles: 1 if seq is clear, 2*steps if pipe is
// clear or wf is set, otherwise steps.
automatic int latency
= !pi[mut].seq ? 1 :
!pi[mut].pipe ? 2 * steps :
pi[mut].wf ? 2 * steps :
steps;
// Cycle number at which the output is captured.
automatic int eta = 1 + cyc_start + latency;
pi[mut].latency = latency;
wait ( cycle_reactive == eta );
awaiting--;
pi[mut].sout = prod[pi[mut].idx];
pi[mut].cyc_tot += cycle - cyc_start;
end join_none;
end
// Continue once every thread has captured its multiplier's output.
wait ( awaiting == 0 );
// Compare every captured output with the reference output; print at
// most the first four mismatches per multiplier.
foreach ( pi[ mut ] )
if ( prod[0] !== pi[mut].sout ) begin
pi[mut].err_count++;
if ( pi[mut].err_count < 5 )
$write
("%-25s wrong result: %0d * %0d: 0x%0h != 0x%0h (correct)\n",
mut, plier, cand, pi[mut].sout, prod[0]);
end
@( posedge clk_reactive );
end
// Summary: one line per multiplier with its error count and average
// number of cycles waited per test.
foreach ( pi[ mut ] )
$write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
num_tests, mut, pi[mut].err_count,
pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);
done = 1;
$finish(2);
end
endmodule
// cadence translate_on