// 111
// :Def: Critical Path   :Def: Latency [of an action in a sequential circuit]
`default_nettype none
// Behavioral combinational multiplier: prod is the full 2w-bit product of
// the two w-bit operands, written with the built-in multiply operator.
module mult_behav_1
  #( int w = 16 )
   ( output uwire [2*w-1:0] prod,
     input  uwire [w-1:0]   plier,
     input  uwire [w-1:0]   cand );

   // The 2w-bit left-hand side sets the expression width, so the
   // product is not truncated to w bits.
   assign prod = cand * plier;

endmodule
// w-bit adder described behaviorally. The sum is w bits wide, so any
// carry out of the most-significant position is discarded.
// Note that the vectors are indexed w down to 1.
module carry_prop_adder
  #( int w = 16 )
   ( output uwire [w:1] s,
     input  uwire [w:1] a,
     input  uwire [w:1] b );

   assign s = b + a;

endmodule
// Combinational multiplier built from a linear chain of w adders.
// Stage k adds the partial product for multiplicand bit k to the
// running sum produced by stage k-1.
module mult_linear
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand );

   // partial_sum[k] holds the sum of partial products 0..k.
   // Element -1 is the chain's initial value.
   logic [2*w-1:0] partial_sum [w-1:-1];

   assign partial_sum[-1] = 0;

   for ( genvar k=0; k<w; k++ ) begin

      // Multiplier shifted to bit position k, or zero when cand[k] is 0.
      uwire [2*w-1:0] pprod;
      assign pprod = cand[k] ? plier << k : 0;

      carry_prop_adder #(2*w) adder
        ( .s( partial_sum[k] ), .a( partial_sum[k-1] ), .b( pprod ) );

   end

   // The output of the last adder in the chain is the product.
   assign prod = partial_sum[w-1];

endmodule
// mult_linear wrapped in registers: both operands are captured on the
// positive clock edge, and the combinational product of the captured
// operands is captured into prod on a positive clock edge as well.
module mult_linear_clk
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk );

   // Registered copies of the operands.
   logic [w-1:0] plier_q, cand_q;

   // Combinational product of the registered operands.
   uwire [2*w-1:0] prod_d;

   mult_linear #(w) ml( prod_d, plier_q, cand_q );

   always_ff @( posedge clk ) begin
      prod    <= prod_d;
      plier_q <= plier;
      cand_q  <= cand;
   end

endmodule
// Combinational multiplier that sums the w partial products with a tree
// of w-1 adders.
//
// rsum[0] .. rsum[w-1]     hold the partial products (tree leaves).
// rsum[w] .. rsum[2*w-2]   hold adder outputs; rsum[2*w-2] is the root.
//
// Adder i (w <= i <= 2w-2) sums elements 2*(i-w) and 2*(i-w)+1. Both
// indices are smaller than i, and every element below the root is used
// as an input exactly once, so the connections form a tree for any
// w >= 1. When w is a power of two this is the same wiring as masking
// 2*i and 2*i+1 with 2*w-1.
module mult_tree
  #( int w = 16 )
   ( output logic [2*w-1:0] prod, input uwire [w-1:0] plier, cand );

   logic [2*w-1:0] rsum [2*w-1:0];

   // Leaves: multiplicand shifted to bit position i when plier[i] is set.
   for ( genvar i=0; i<w; i++ )
     assign rsum[i] = plier[i] ? cand << i : 0;

   // Interior nodes: one adder per pair of lower-numbered elements.
   for ( genvar i=w; i<2*w-1; i++ )
     carry_prop_adder #( 2*w ) adder
       ( rsum[i],
         rsum[ 2*(i-w) ], rsum[ 2*(i-w) + 1 ] );

   assign prod = rsum[2*w-2];

endmodule
// Sequential multiplier that examines one multiplicand bit per clock
// cycle. Each cycle, if cand[pos] is set, the multiplier shifted by pos
// is added to the accumulator. When pos is 0 the accumulated value is
// copied to prod and the accumulator is cleared before the bit-0 partial
// product is added. The operands are read directly from the ports every
// cycle, so they must be held steady by the caller while a product is
// being accumulated.
module mult_seq #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier,
     input uwire [w-1:0] cand,
     input uwire clk);

   localparam int wlog = $clog2(w);

   // Simulation-only parameter check, hidden from synthesis.
   // pos is wlog bits wide and wraps at 2**wlog, so w must equal 2**wlog.
   // cadence translate_off
   initial if ( w != 1 << wlog ) $fatal(1,"Size must be a power of 2.");
   // cadence translate_on

   bit [wlog-1:0] pos;        // Bit position handled in the current cycle.
   logic [2*w-1:0] accum;     // Running sum of partial products.

   always_ff @( posedge clk ) begin

      // Blocking assignments are used so that the partial product below
      // is added to the freshly cleared accumulator in the same cycle.
      if ( pos == 0 ) begin
         prod = accum;
         accum = 0;
      end

      if ( cand[pos] == 1 ) accum += plier << pos;

      pos++;

   end

endmodule
// Sequential multiplier, one multiplicand bit per cycle, with the adder
// instantiated explicitly outside the clocked block. The adder always
// computes accum plus the partial product for the current bit position.
// In the cycle where pos is 0 that sum is written to prod and accum is
// cleared; in every other cycle the sum is written back to accum.
module mult_seq_ga #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk );

   localparam int wlog = $clog2(w);

   bit [wlog-1:0] pos;       // Bit position handled in the current cycle.
   bit [2*w-1:0] accum;      // Running sum of partial products.

   // Partial product for the current bit position.
   uwire [2*w-1:0] pp;
   assign pp = cand[pos] ? plier << pos : 0;

   // Accumulator plus the current partial product.
   uwire [2*w-1:0] sum;
   carry_prop_adder #(2*w) ga( sum, accum, pp );

   always_ff @( posedge clk ) pos <= pos + 1;

   always_ff @( posedge clk )
     if ( pos != 0 ) begin
        accum = sum;
     end else begin
        prod = sum;
        accum = 0;
     end

endmodule
// Sequential shift-and-add multiplier using a single 2w-bit register.
// When pos is 0 the register contents are copied to prod and the
// register is reloaded with the multiplicand in its low bits. Every
// cycle the low bit of the register selects whether the multiplier is
// added to the upper half, and the (w+1)-bit result together with the
// remaining low bits is shifted right by one position.
module mult_seq_stream #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk);

   localparam int wlog = $clog2(w);

   bit [wlog-1:0] pos;       // Counts down; 0 marks the start of a multiply.
   logic [2*w-1:0] accum;    // Upper bits: partial sum. Lower bits: cand.

   always_ff @( posedge clk ) begin

      logic [w:0] addend;    // Multiplier or zero, one bit wider than w.
      logic [w:0] upper;     // New upper half, including the carry out.

      if ( pos != 0 ) begin
         pos = pos - 1;
      end else begin
         prod = accum;
         accum = cand;
         pos = w - 1;
      end

      // Conditional operator (rather than an if statement) so that an
      // unknown accum[0] is handled the same way as before.
      addend = accum[0] ? { 1'b0, plier } : '0;

      upper = { 1'b0, accum[2*w-1:w] } + addend;

      // Drop the multiplicand bit just used; shift everything right.
      accum = { upper, accum[w-1:1] };

   end

endmodule
`ifdef XXX
Module Name Area Period Period Total Init.
Target Actual Latency Interv
mult_behav_1_w8 53168 1000 6062 6062 6062
mult_behav_1_w16 215672 1000 13551 13551 13551
mult_behav_1_w32 764479 1000 26485 26485 26485
mult_behav_1_w64 2891332 1000 52332 52332 52332
mult_seq_w8 56081 1000 8120 64960 64960
mult_seq_w16 122475 1000 15916 254656 254656
mult_seq_w32 258750 1000 31385 1004320 1004320
mult_seq_w64 544285 1000 58476 3742464 3742464
mult_seq_stream_w8 44320 1000 4518 36144 36144
mult_seq_stream_w16 78395 1000 8868 141888 141888
mult_seq_stream_w32 153863 1000 16361 523552 523552
mult_seq_stream_w64 304047 1000 30276 1937664 1937664
endmodule
`endif
// Sequential multiplier that handles m multiplicand bits per clock cycle.
//
// Parameters:
//   w  Operand width in bits.
//   m  Number of multiplicand bits (partial products) handled per cycle.
//
// A multiply takes iterations = ceil(w/m) accumulate cycles. When iter
// matches the (truncated) iteration count the accumulated value is
// copied to prod, and the accumulator and iter are cleared before the
// first group of partial products is added in that same cycle.
module mult_seq_m #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk);

   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   bit [iter_lg:1] iter;     // Index of the current group of m bits.
   logic [2*w-1:0] accum;    // Running sum of partial products.

   always_ff @( posedge clk ) begin

      // The cast truncates iterations to iter_lg bits. When iterations
      // is a power of two the truncated value is 0, which iter reaches
      // by wrapping around.
      if ( iter == iter_lg'(iterations) ) begin
         prod = accum;
         accum = 0;
         iter = 0;
      end

      for ( int i=0; i<m; i++ )
        begin
           int pos;
           pos = iter * m + i;
           // When m does not divide w the last group extends past bit
           // w-1. Skip those positions explicitly rather than relying
           // on the value returned by an out-of-range bit select.
           if ( pos < w && cand[pos] ) accum += plier << pos;
        end

      iter++;

   end

endmodule
// Sequential multiplier that consumes an m-bit digit of the multiplicand
// per clock cycle, using the multiply operator to form each partial
// product.
//
// Parameters:
//   w  Operand width in bits.
//   m  Digit width: multiplicand bits consumed per cycle.
//
// When iter matches the (truncated) iteration count the accumulated
// value is copied to prod, and the accumulator and iter are cleared
// before digit 0 is processed in that same cycle.
module mult_seq_dm
  #( int w = 16,
     int m = 2 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk);

   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   // The multiplicand viewed as an array of m-bit digits.
   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   bit [iter_lg:1] iter;     // Index of the digit handled this cycle.
   logic [2*w-1:0] accum;    // Running sum of partial products.

   always_ff @( posedge clk ) begin

      logic [2*w-1:0] term;  // Partial product for the current digit.

      if ( iter == iter_lg'(iterations) ) begin
         prod = accum;
         accum = 0;
         iter = 0;
      end

      // Multiplication binds tighter than the shift: the w-by-m product
      // is formed first, then moved to the digit's bit position.
      term = plier * cand_2d[iter] << ( iter * m );
      accum = accum + term;

      iter = iter + 1;

   end

endmodule
// Bit-serial multiplier built from a row of w one-bit adder cells.
// In the cycle numbered pos, cell i adds three bits:
//   a  plier[pos] gated by cand[w-1-i]   (only while pos < w),
//   b  the sum bit saved by cell i-1 in the previous cycle,
//   c  the carry bit saved by cell i itself in the previous cycle.
// When pos is 0 the saved sum and carry bits are ignored. The sum bit
// produced by the last cell (i = w-1) is written to prod[pos], so pos
// counts through all 2w bit positions of the product.
module mult_seq_wfront
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk );

   localparam int wlog = $clog2(2*w);

   // Elaboration-time parameter check, hidden from synthesis.
   // pos is wlog bits wide and must wrap exactly at 2*w.
   // cadence translate_off
   if ( 2**wlog != 2*w )
     $fatal(2,"Size, parameter w=%0d, must be a power of 2.\n",w);
   // cadence translate_on

   bit [wlog-1:0] pos;       // Product bit position produced this cycle.
   always_ff @( posedge clk ) pos <= pos + 1;

   logic [w-1:0] sum, carry; // Per-cell sum and carry saved for next cycle.
   logic [1:0] sc;           // {carry, sum} of the cell evaluated last.

   always_ff @( posedge clk ) begin

      for ( int i=0; i<w; i++ ) begin
         logic a, b, c;
         a = pos < w && cand[w-1-i] ? plier[pos] : 0;
         b = pos && i ? sum[i-1] : 0;
         c = pos ? carry[i] : 0;
         sc = a + b + c;
         sum[i] <= sc[0];
         carry[i] <= sc[1];
      end

      // After the loop sc holds the result of cell w-1.
      prod[pos] = sc[0];

   end

endmodule
// Variant of the bit-serial wavefront multiplier in which the
// per-cycle control values are taken from registers or single bits:
//   pos_eq_0   registered carry-out of the pos increment, so it is set
//              in the cycle after pos wraps around to zero;
//   pos_lt_w   the complement of the top bit of pos;
//   plier_pos  a registered multiplier bit, selected one cycle ahead
//              from a rotated copy of plier.
// As in the cell row of a wavefront multiplier, cell i adds its saved
// carry, the saved sum of cell i-1, and a gated multiplier bit; the sum
// bit of the last cell is written to prod[pos].
module mult_seq_wfront_opt
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk );

   localparam int wlog = $clog2(w);

   // Elaboration-time parameter check, hidden from synthesis.
   // cadence translate_off
   if ( 2**wlog != w )
     $fatal(2,"Size, parameter w=%0d, must be a power of 2.\n",w);
   // cadence translate_on

   bit pos_eq_0;
   bit [wlog:0] pos;         // Product bit position; wlog+1 bits wide.

   // pos_eq_0 receives the bit carried out of the incremented pos.
   always_ff @( posedge clk ) { pos_eq_0, pos } <= pos + 1;

   uwire pos_lt_w = !pos[wlog];

   logic [w-1:0] sum, carry; // Per-cell sum and carry saved for next cycle.
   logic [1:0] sc;           // {carry, sum} of the cell evaluated last.

   // plier rotated right by one: element k holds plier[(k+1) mod w].
   uwire [w-1:0] plier_rot = {plier[0],plier[w-1:1]};
   logic plier_pos;

   // Indexed with the low wlog bits of the current pos and registered,
   // so the value is available for use in the following cycle.
   always_ff @( posedge clk ) plier_pos <= plier_rot[wlog'(pos)];

   always_ff @( posedge clk ) begin

      for ( int i=0; i<w; i++ ) begin
         logic a, b, c;
         a = !pos_eq_0 && carry[i];
         b = !pos_eq_0 && i && sum[i-1];
         c = pos_lt_w && plier_pos && cand[w-1-i];
         sc = a + b + c;
         sum[i] <= sc[0];
         carry[i] <= sc[1];
      end

      // After the loop sc holds the result of cell w-1.
      prod[pos] = sc[0];

   end

endmodule
`ifdef xxx
Module Name Area Period Period Total Init.
Target Actual Latency Interv
mult_seq_wfront_w8 45390 1000 3132 50112 50112
mult_seq_wfront_w16 89668 1000 3260 104320 104320
mult_seq_wfront_w32 178367 1000 4202 268928 268928
mult_seq_wfront_w64 345415 1000 4716 603648 603648
mult_seq_wfront_opt_w8 47575 1000 2428 38848 38848
mult_seq_wfront_opt_w16 94652 1000 2275 72800 72800
mult_seq_wfront_opt_w32 177706 1000 2546 162944 162944
mult_seq_wfront_opt_w64 345301 1000 2724 348672 348672
mult_seq_stream_w8 44320 1000 4518 36144 36144
mult_seq_stream_w16 78395 1000 8868 141888 141888
mult_seq_stream_w32 153863 1000 16361 523552 523552
mult_seq_stream_w64 304047 1000 30276 1937664 1937664
`endif
// Wavefront multiplier that produces up to m product bits per clock
// cycle by evaluating the row of w one-bit adder cells m times in
// sequence within a cycle.
//
// Parameters:
//   w  Operand width in bits.
//   m  Number of product bit positions evaluated per cycle.
//
// Stage j of a cycle works on product bit position m*iter + j. Its
// cells read the sum and carry vectors left by stage j-1 (or, for
// stage 0, the values registered at the end of the previous cycle,
// which are replaced by zero when iter is 0). The vectors written by
// the last stage are registered for the next cycle.
module mult_seq_wfront_m #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk );

   localparam int iterations = ( 2*w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);
   localparam int wlog = $clog2(m * iterations);

   bit [iter_lg-1:0] iter;   // Cycle number within a multiply.
   always_ff @( posedge clk ) iter <= iter + 1;

   // Element -1 of sum supplies the "previous cell" input of cell 0.
   logic [w-1:-1] sum, carry;

   always_ff @( posedge clk ) begin

      // stage_sum[j] / stage_carry[j] are the inputs to stage j;
      // element m holds the outputs of the last stage.
      logic [w-1:-1] stage_sum[m+1], stage_carry[m+1];
      logic [1:0] sc;

      if ( iter == 0 ) begin
         stage_sum[0] = 0;
         stage_carry[0] = 0;
      end else begin
         stage_sum[0] = sum;
         stage_carry[0] = carry;
      end

      for ( int j=0; j<m; j++ ) begin

         logic [wlog-1:0] bit_pos;
         bit_pos = m*iter + j;

         stage_sum[j+1][-1] = 0;

         for ( int i=0; i<w; i++ ) begin
            logic carry_in, sum_in, pp_bit;
            carry_in = stage_carry[j][i];
            sum_in = stage_sum[j][i-1];
            pp_bit = bit_pos < w && cand[w-1-i] && plier[bit_pos];
            sc = carry_in + sum_in + pp_bit;
            stage_carry[j+1][i] = sc[1];
            stage_sum[j+1][i] = sc[0];
         end

         // sc now holds the result of cell w-1. Positions at or past
         // 2*w arise when m does not divide 2*w and are not written.
         if ( bit_pos < 2*w ) prod[bit_pos] = sc[0];

      end

      sum <= stage_sum[m];
      carry <= stage_carry[m];

   end

endmodule
`ifdef DONT_DEFINE_ME
Module Name Area Period Period Total Init.
Target Actual Latency Interv
mult_seq_wfront_w16 89668 1000 3260 104320 104320
mult_seq_wfront_opt_w16 94652 1000 2275 72800 72800
mult_seq_wfront_m_w16_m1 97631 1000 3500 112000 112000
mult_seq_wfront_m_w16_m2 104787 1000 3658 58528 58528
mult_seq_wfront_m_w16_m4 130378 1000 3942 31536 31536
mult_seq_wfront_m_w16_m8 171258 1000 6417 25668 25668
mult_seq_dm_w16_m1 121611 1000 16360 261760 261760
mult_seq_dm_w16_m2 131048 1000 16730 133840 133840
mult_seq_dm_w16_m4 145285 1000 16704 66816 66816
mult_seq_dm_w16_m8 196118 1000 15161 30322 30322
mult_seq_wfront_w32 178367 1000 4202 268928 268928
mult_seq_wfront_opt_w32 177706 1000 2546 162944 162944
mult_seq_wfront_m_w32_m1 191334 1000 3766 241024 241024
mult_seq_wfront_m_w32_m2 205303 1000 3857 123424 123424
mult_seq_wfront_m_w32_m4 260182 1000 5266 84256 84256
mult_seq_wfront_m_w32_m8 351910 1000 7031 56248 56248
mult_seq_dm_w32_m1 246818 1000 31113 995616 995616
mult_seq_dm_w32_m2 279486 1000 30994 495904 495904
mult_seq_dm_w32_m4 314724 1000 32127 257016 257016
mult_seq_dm_w32_m8 408659 1000 31251 125004 125004
`endif
`ifdef DONT_DEFINE_ME
Module Name Area Period Period Total Init.
Target Actual Latency Interv
mult_behav_1_w8 53168 1000 6062 6062 6062
mult_behav_1_w16 215672 1000 13551 13551 13551
mult_behav_1_w32 764479 1000 26485 26485 26485
mult_behav_1_w64 2891332 1000 52332 52332 52332
mult_seq_stream_w8 44320 1000 4518 36144 36144
mult_seq_stream_w16 78395 1000 8868 141888 141888
mult_seq_stream_w32 153863 1000 16361 523552 523552
mult_seq_stream_w64 304047 1000 30276 1937664 1937664
mult_seq_wfront_w8 45390 1000 3132 50112 50112
mult_seq_wfront_w16 89668 1000 3260 104320 104320
mult_seq_wfront_w32 178367 1000 4202 268928 268928
mult_seq_wfront_w64 345415 1000 4716 603648 603648
mult_seq_wfront_opt_w8 47575 1000 2428 38848 38848
mult_seq_wfront_opt_w16 94652 1000 2275 72800 72800
mult_seq_wfront_opt_w32 177706 1000 2546 162944 162944
mult_seq_wfront_opt_w64 345301 1000 2724 348672 348672
mult_pipe_wfront_w8 239827 1000 993 7944 7944
mult_pipe_wfront_w16 1012675 1000 1173 18768 18768
mult_pipe_wfront_w32 4158007 1000 1512 48384 48384
mult_pipe_wfront_w64 16865186 1000 2256 144384 144384
mult_seq_csa_w8 80488 1000 9266 74128 74128
mult_seq_csa_w16 162743 1000 16580 265280 265280
mult_seq_csa_w32 343497 1000 31074 994368 994368
mult_seq_csa_w64 715059 1000 60431 3867584 3867584
mult_pipe1_w8 137822 1000 3859 30872 3859
mult_pipe1_w16 571541 1000 7499 119984 7499
mult_pipe1_w32 2325284 1000 14746 471872 14746
mult_pipe1_w64 9397076 1000 28722 1838208 28722
mult_seq_dm_w16_m1 121611 1000 16360 261760 261760
mult_seq_dm_w16_m2 131048 1000 16730 133840 133840
mult_seq_dm_w16_m4 145285 1000 16704 66816 66816
mult_seq_dm_w16_m8 196118 1000 15161 30322 30322
mult_seq_dm_w32_m1 246818 1000 31113 995616 995616
mult_seq_dm_w32_m2 279486 1000 30994 495904 495904
mult_seq_dm_w32_m4 314724 1000 32127 257016 257016
mult_seq_dm_w32_m8 408659 1000 31251 125004 125004
mult_seq_wfront_m_w16_m1 97631 1000 3500 112000 112000
mult_seq_wfront_m_w16_m2 104787 1000 3658 58528 58528
mult_seq_wfront_m_w16_m4 130378 1000 3942 31536 31536
mult_seq_wfront_m_w16_m8 171258 1000 6417 25668 25668
mult_seq_wfront_m_w32_m1 191334 1000 3766 241024 241024
mult_seq_wfront_m_w32_m2 205303 1000 3857 123424 123424
mult_seq_wfront_m_w32_m4 260182 1000 5266 84256 84256
mult_seq_wfront_m_w32_m8 351910 1000 7031 56248 56248
`endif
// cadence translate_off
// Program block that passes the clock and the cycle count through
// unchanged. Being a program, it gives the testbench copies of these
// signals that are driven from program-block code (which SystemVerilog
// schedules separately from module code -- see the language standard).
program reactivate
  ( output uwire clk_reactive,
    output int cycle_reactive,
    input uwire clk,
    input var int cycle );

   assign cycle_reactive = cycle;
   assign clk_reactive = clk;

endprogram
// Testbench for the multiplier modules.
//
// Every multiplier under test is connected to the same plier/cand
// operands and writes its product to its own element of prod[]. An
// entry in the associative array pi, keyed by a descriptive name,
// records which prod[] element belongs to the multiplier and how long
// the testbench must wait before sampling it. For each test the
// sampled product is compared against prod[0], the output of the
// behavioral multiplier.
module testbench;

   localparam int w = 16;             // Operand width.
   localparam int num_tests = 1000;   // Number of operand pairs to try.
   localparam int NUM_MULT = 20;      // Number of elements in prod[].
   // NOTE(review): err_limit is not referenced in this module; the
   // cutoff for printing error messages below is a literal 5.
   localparam int err_limit = 7;

   bit use_others;

   logic [w-1:0] plier, cand;
   logic [w-1:0] plierp, candp;
   logic [2*w-1:0] prod[NUM_MULT];

   // Per-multiplier bookkeeping.
   //   idx       Element of prod[] driven by the multiplier.
   //   err_count Number of wrong results seen so far.
   //   seq, pipe, wf, deg  Select the latency formula used below.
   //   sout      Product sampled for the current test.
   //   cyc_tot   Total cycles waited over all tests.
   //   latency   Latency, in cycles, most recently computed.
   typedef struct { int idx; int err_count = 0;
                    bit seq = 0; bit pipe = 0; bit wf = 0; int deg = 1;
                    logic [2*w-1:0] sout = 'h111; int cyc_tot = 0;
                    int latency = 0;
                    } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * w * 8;
   int cycle;
   bit done;
   logic clock;

   // Copies of clock and cycle passed through a program block.
   logic clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Clock generator. The simulation ends when done is set or when the
   // cycle limit is reached, whichever happens first.
   initial begin
      clock = 0;
      cycle = 0;
      fork
         // clock++ toggles the one-bit clock and yields its previous
         // value, which is what gets added to cycle.
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;
      $finish();
   end

   // Shortly after each clock edge set plierp/candp: to the current
   // operands when a new test has just started (use_others set),
   // otherwise to filler values.
   initial begin
      while ( !done ) @( posedge clk_reactive ) #1
        if ( use_others ) begin
           plierp = plier;
           candp = cand;
           use_others = 0;
        end else begin
           plierp = cycle;
           candp = 256;
        end
   end

   // Register a sequential multiplier of degree deg at prod[idx].
   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1;
   endtask

   // Register a sequential wavefront multiplier of degree deg at prod[idx].
   task pi_seqw(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].wf = 1;
   endtask

   // Register a pipelined multiplier of degree deg at prod[idx].
   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1;
   endtask

   // Multipliers under test. prod[0] is the reference result.
   mult_behav_1 #(w) mb1(prod[0], plier, cand);
   initial pi["Behavioral"].idx = 0;

   mult_linear #(w) ms1(prod[1], plier, cand);
   initial pi["Linear"].idx = 1;

   mult_tree #(w) ms2(prod[2], plier, cand);
   initial pi["Tree"].idx = 2;

   mult_seq #(w) ms3(prod[3], plier, cand, clock);
   initial begin
      automatic string m = "Sequential";
      pi[m].idx = 3; pi[m].seq = 1;
   end

   mult_seq_ga #(w) msga1(prod[11], plier, cand, clock);
   initial begin
      automatic string m = "Sequential GA";
      pi[m].idx = 11; pi[m].seq = 1;
   end

   mult_seq_stream #(w) mss1(prod[4], plier, cand, clock);
   initial begin
      automatic string m = "Sequential Streamlined";
      pi[m].idx = 4; pi[m].seq = 1;
   end

   mult_seq_wfront #(w) ms7(prod[7], plier, cand, clock);
   initial pi_seqw(7,"WFront", 1);

   mult_seq_wfront_opt #(w) ms8(prod[8], plier, cand, clock);
   initial pi_seqw(8,"WFront-Opt", 1);

   mult_seq_wfront_m #(w,2) ms12(prod[12], plier, cand, clock);
   initial pi_seqw(12,"WFront", ms12.m);

   mult_seq_wfront_m #(w,5) ms13(prod[13], plier, cand, clock);
   initial pi_seqw(13,"WFront", ms13.m);

   mult_seq_m #(w,4) ms44(prod[5], plier, cand, clock);
   initial pi_seq(5,"Seq", ms44.m);

   mult_seq_m #(w,3) ms43(prod[6], plier, cand, clock);
   initial pi_seq(6,"Seq", ms43.m);

   mult_seq_dm #(w,1) msd14(prod[14], plier, cand, clock);
   initial pi_seq(14,"Seq Rad", msd14.m);

   mult_seq_dm #(w,2) msd16(prod[16], plier, cand, clock);
   initial pi_seq(16,"Seq Rad", msd16.m);

   // The degree is read from the instance being registered (msd44 and
   // msd43), not from the like-numbered mult_seq_m instances.
   mult_seq_dm #(w,4) msd44(prod[9], plier, cand, clock);
   initial pi_seq(9,"Seq Rad", msd44.m);

   mult_seq_dm #(w,3) msd43(prod[10], plier, cand, clock);
   initial pi_seq(10,"Seq Rad", msd43.m);

   mult_linear_clk #(w) mlc1(prod[15], plier, cand, clock);
   initial begin
      automatic string m = $sformatf("Linear Clock");
      pi[m].idx = 15; pi[m].seq = 1;
   end

   // Operand values used, two per test, before switching to $random.
   int tests[$] = {1,32, 1,1, 1,2, 1,32, 32, 1};

   initial begin

      done = 0;
      use_others = 0;

      // The #0 lets the initial blocks above fill in pi first.
      // Check that no prod[] element is claimed by two entries of pi,
      // and report the elements that are not claimed at all.
      #0 begin
         string index_used[NUM_MULT];
         automatic int n_unused = 0, n_reused = 0;
         foreach ( pi[ mut ] ) begin
            automatic int idx = pi[mut].idx;
            if ( index_used[idx].len() )
              begin
                 $write("*** Index %0d used by %s and %s.\n",
                        idx, index_used[idx], mut );
                 n_reused++;
              end
            index_used[idx] = mut;
         end
         $write("Unused positions: ");
         foreach ( index_used[idx] )
           if ( index_used[idx].len() == 0 )
             $write("%s%0d", n_unused++ ? ", " : "", idx);
         $write("%s.\n",n_unused ? "" : "none -- all used");
         if ( n_reused )
           $fatal(2, "\nFound %0d re-used indices. Aborting simulation.\n\n",
                  n_reused);
      end

      @( posedge clk_reactive );

      for ( int i=0; i<num_tests; i++ ) begin
         automatic int cyc_start = cycle;
         automatic int awaiting = pi.num();

         // Take operands from the tests queue until it is empty.
         plier = tests.size() ? tests.pop_front() : $random();
         cand = tests.size() ? tests.pop_front() : $random();

         plierp = plier;
         candp = cand;
         use_others = 1;

         // Start one waiting thread per multiplier. Each thread samples
         // its multiplier's output after the expected latency.
         foreach ( pi[muti] ) begin
            automatic string mut = muti;
            fork begin
               automatic int steps = ( w + pi[mut].deg - 1 ) / pi[mut].deg;
               // Cycles to wait: 1 for combinational multipliers,
               // 5*steps for wavefront, 2*steps for other sequential,
               // steps for pipelined.
               automatic int latency
                 = !pi[mut].seq ? 1 :
                 pi[mut].wf ? 5 * steps :
                 !pi[mut].pipe ? 2 * steps : steps;
               automatic int eta = 1 + cyc_start + latency;
               pi[mut].latency = latency;
               wait ( cycle_reactive == eta );
               awaiting--;
               pi[mut].sout = prod[pi[mut].idx];
               pi[mut].cyc_tot += cycle - cyc_start;
            end join_none;
         end

         wait ( awaiting == 0 );

         // Compare every sampled product against the behavioral result.
         foreach ( pi[ mut ] )
           if ( prod[0] !== pi[mut].sout ) begin
              pi[mut].err_count++;
              if ( pi[mut].err_count < 5 )
                $write
                  ("%-25s wrong result: %0d * %0d: 0x%0h != 0x%0h (correct)\n",
                   mut, plier, cand, pi[mut].sout, prod[0]);
           end

         @( posedge clk_reactive );

      end

      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);

      done = 1;

      $finish(2);

   end

endmodule
// cadence translate_on