`default_nettype none
// Accumulator built around a pipelined adder (add_pipe).
//
//  sum        Accumulated value; meaningful when sum_valid is 1.
//  sum_valid  1 when no bit of the occupancy shift register is set.
//  ai         Value to add to the running total.
//  ai_valid   1 when ai carries a value this cycle.
//  reset      Synchronous reset: clears sum and all bookkeeping state.
//  clk        Clock; all state updates on the positive edge.
//
// Each cycle up to three values can be available: the incoming ai, the
// value held in sum, and the value emerging from the adder.  If two or
// more are available a new addition is started; if exactly one is
// available and sum is not holding a value, that one is stored in sum.
module add_accum
  #( int w = 21, n_stages = 3 )
   ( output logic [w-1:0] sum,
     output logic sum_valid,
     input uwire [w-1:0] ai,
     input uwire ai_valid, reset, clk );

   // Shift register recording which adder positions hold an addition.
   // Bit n_stages-1 marks the cycle in which the adder output is used;
   // bit n_stages keeps sum_valid low for one further clock.
   logic [n_stages:0] st_occ;

   // 1 when the sum register holds a value not yet sent to the adder.
   logic sum_occupied;

   uwire [w-1:0] aout;
   uwire aout_valid = st_occ[n_stages-1];

   // Nothing in flight: the sum register is the final result.
   assign sum_valid = ~|st_occ;

   // Adder operands. Each side falls back to sum when its preferred
   // source is absent.
   uwire [w-1:0] opnd_in = ai_valid ? ai : sum;
   uwire [w-1:0] opnd_fb = aout_valid ? aout : sum;

   add_pipe #(w,n_stages) add_p0( aout, opnd_in, opnd_fb, clk );

   // Number of values available this cycle (0 to 3).
   uwire [1:0] n_avail = aout_valid + sum_occupied + ai_valid;

   // Two or three values available: start an addition.
   uwire start_add = n_avail[1];

   // Exactly one value available and it is not the one already in sum.
   uwire store_single = n_avail == 2'd1 && !sum_occupied;

   always_ff @( posedge clk ) begin
      if ( reset ) begin
         st_occ <= '0;
         sum_occupied <= 1'b0;
         sum <= '0;
      end else begin
         st_occ <= { st_occ[n_stages-1:0], start_add };
         // An odd count leaves one value behind in the sum register.
         sum_occupied <= n_avail[0];
         if ( store_single ) sum <= aout_valid ? aout : ai;
      end
   end

endmodule
`ifdef xxx
Synthesizing at effort level "high"
Module Name Area Delay Delay
Actual Target
add_pipe_w24_n_stages1 29928 10.174 90.000 ns
add_pipe_w24_n_stages2 47043 5.428 90.000 ns
add_pipe_w24_n_stages3 64159 3.701 90.000 ns
add_pipe_w24_n_stages4 81275 2.837 90.000 ns
add_pipe_w24_n_stages6 115506 1.973 90.000 ns
add_accum_w24_n_stages1 87556 11.449 90.000 ns
add_accum_w24_n_stages2 105305 6.349 90.000 ns
add_accum_w24_n_stages3 123530 4.560 90.000 ns
add_accum_w24_n_stages4 141598 3.696 90.000 ns
add_accum_w24_n_stages6 177545 3.061 90.000 ns
add_pipe_w24_n_stages1 84351 1.114 0.100 ns
add_pipe_w24_n_stages2 83959 1.249 0.100 ns
add_pipe_w24_n_stages3 103383 1.105 0.100 ns
add_pipe_w24_n_stages4 117358 1.001 0.100 ns
add_pipe_w24_n_stages6 150854 0.896 0.100 ns
add_accum_w24_n_stages1 150738 2.023 0.100 ns
add_accum_w24_n_stages2 149544 1.757 0.100 ns
add_accum_w24_n_stages3 183994 1.514 0.100 ns
add_accum_w24_n_stages4 191611 1.444 0.100 ns
add_accum_w24_n_stages6 224175 1.332 0.100 ns
Normal exit.
`endif
// Pipelined integer adder.
//
//  sum   Low w bits of a + b, available after n_stages positive clock edges.
//  a, b  Operands, w bits each.
//  clk   Clock.
//
// The operands are split into n_stages equal slices. Each stage adds
// one slice (least-significant first) together with the carry from the
// previous stage, then passes the remaining operand bits along.
module add_pipe
  #( int w = 21, n_stages = 3 )
   ( output uwire [w-1:0] sum,
     input uwire [w-1:0] a, b,
     input uwire clk );

   // Slice width handled by one stage: ceiling of w / n_stages.
   localparam int seg_w = ( w + n_stages - 1 ) / n_stages;

   // Operand width rounded up to a whole number of slices.
   localparam int pad_w = n_stages * seg_w;

   // Index 0 holds the module inputs; index s+1 is the register after stage s.
   logic [pad_w-1:0] opa[n_stages+1], opb[n_stages+1], acc[n_stages+1];
   logic cy[n_stages+1];

   always_ff @( posedge clk ) begin

      // Entry 0 is written with blocking assignments so that stage 0
      // below reads the current inputs in this same clock edge.
      cy[0] = 0;
      opa[0] = a;
      opb[0] = b;

      for ( int i=0; i<n_stages; i++ ) begin

         // Slice sum, one bit wider than the slice to capture the carry.
         automatic logic [seg_w:0] part =
           opa[i][seg_w-1:0] + opb[i][seg_w-1:0] + cy[i];

         // Drop the slice just consumed from each operand.
         opa[i+1] <= opa[i] >> seg_w;
         opb[i+1] <= opb[i] >> seg_w;

         // Insert the new slice at the top of the partial result and
         // shift earlier slices down by one slice position.
         acc[i+1] <= { part[seg_w-1:0], acc[i] } >> seg_w;

         cy[i+1] <= part[seg_w];
      end
   end

   assign sum = acc[ n_stages ][w-1:0];

endmodule
// cadence translate_off
// Pass-through program block: copies clk and cycle to outputs that are
// driven from within a program.
//
//  clk_reactive    Copy of clk.
//  cycle_reactive  Copy of cycle.
//  clk, cycle      Values to be copied.
program reactivate
  ( output uwire clk_reactive,
    output int cycle_reactive,
    input uwire clk,
    input var int cycle );

   assign cycle_reactive = cycle;
   assign clk_reactive = clk;

endprogram
// Top-level testbench: runs one testbench_n instance per pipeline depth,
// one after another, and prints the combined error count at the end.
module testbench;

   // Pipeline depths to test.
   localparam int n_stages[] = { 1, 2, 3, 5, 6 };

   // Number of depths. Must be kept equal to the size of n_stages.
   localparam int nw = 5;

   initial
     if ( n_stages.size() != nw )
       $fatal(1,"Constant nw should be %0d.\n",n_stages.size() );

   // Error total; each testbench_n instance adds its own count when done.
   int t_errs;
   initial t_errs = 0;

   final $write("Total number of errors: %0d\n",t_errs);

   // Chain of start/done signals. d[k-1] starts instance k and d[k] is
   // its done output, so each instance starts when the previous finishes.
   uwire d[nw:-1];
   assign d[-1] = 1'b1;

   for ( genvar k=0; k<nw; k++ )
     testbench_n #(n_stages[k]) t2( .done(d[k]), .tstart(d[k-1]) );

endmodule
// Testbench for one add_accum instance with an n_stages-deep adder.
//
//  done    Set to 1 when all series have been applied or the cycle
//          limit is exceeded.
//  tstart  Testing begins when this becomes 1.
//
// The test applies n_tests series of random values. Each series starts
// with a reset, after which values are presented with a per-series
// sparsity. A monitor checks that sum_valid appears neither too early
// nor too late and that sum matches a running reference total.
module testbench_n
  #( int n_stages = 3 )
   ( output logic done, input uwire tstart );

   // Number of series to apply.
   localparam int n_tests = 10000;
   // Operand width of the module under test.
   localparam int w = 30;
   // Largest random input value.
   localparam int a_in_max = 42;
   // Initial cycle limit; also used as the "no sum pending" marker.
   localparam int cyc_max = 1 << 30;
   // Latest allowed latency when the accumulator was idle at input time.
   localparam int lat_limit_empty = n_stages + 2;
   // Earliest allowed latency for any value after the first in a series.
   localparam int lat_min_empty = n_stages;
   // Latest allowed latency when additions were already in flight.
   localparam int lat_limit_full = 2 + (1+$clog2(n_stages)) * ( n_stages + 1 );

   bit clk;
   int cycle, cycle_limit;

   // Copies of clk and cycle driven from a program block. The monitor
   // below is clocked by clk_reactive (presumably so that it samples
   // after the design's nonblocking updates; confirm).
   logic clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clk,cycle);

   // Human-readable record of the current series: R(cycle) for a reset,
   // +value(cycle) for each input.
   string event_trace;

   // Clock generator and watchdog.
   initial begin
      clk = 0;
      cycle = 0;
      done = 0;
      cycle_limit = cyc_max;
      wait( tstart );
      fork
         // clk toggles every time unit; cycle advances on each falling
         // edge (clk++ yields the value before the toggle).
         while ( !done ) #1 cycle += clk++;
         wait( cycle >= cycle_limit )
           $write("Exit from clock loop at cycle %0d, limit %0d. %s\n %s\n",
                  cycle, cycle_limit, "** CYCLE LIMIT EXCEEDED **",
                  event_trace);
      join_any;
      done = 1;
   end

   // Module under test.
   uwire [w-1:0] sum;
   uwire sum_valid;
   logic [w-1:0] a;
   logic a_valid, reset;
   add_accum #(w,n_stages) fpa(sum, sum_valid, a, a_valid, reset, clk);

   // Reference total for the current series.
   int rsum;
   bit tests_start;
   int series_idx, value_idx, series_n_vals;
   int n_errs, n_underdue_errs, n_overdue_errs, n_tests_done;
   // Window within which sum_valid is expected; sum_due_cyc == cyc_max
   // means no sum is pending.
   int sum_due_cyc_earliest, sum_due_cyc, n_correct;
   // Cycle in which the most recent valid input was applied.
   int last_a_cyc;
   int latency_sum, latency_sum_n;
   // Limit reporting to one value error and one late error per event.
   bit error_val_issued, error_late_issued;

   // Final report; adds this instance's errors to the top-level total.
   initial wait ( done ) begin
      automatic int not_done = n_tests - series_idx;
      $write("Done with %0d-stage tests, %0d series.\n Correct, %0d. Errors: %0d not done, %0d val, %0d/%0d early/late.\n",
             n_stages, series_idx,
             n_correct, not_done, n_errs, n_underdue_errs, n_overdue_errs );
      $write("For %0d stages average latency %.2f cycles.\n",
             n_stages,
             real'(latency_sum) / ( latency_sum_n ? latency_sum_n : 1 ) );
      testbench.t_errs += n_errs + n_underdue_errs + n_overdue_errs + not_done;
   end

   // Monitor: checks timing and value of the accumulator output.
   initial begin
      wait( tests_start );
      while ( !done ) @( posedge clk_reactive ) begin
         if ( sum_valid ) begin
            automatic bit pending = sum_due_cyc < cyc_max;
            if ( pending ) begin
               n_tests_done++;
               sum_due_cyc = cyc_max;
               if ( sum === rsum ) n_correct++;
               latency_sum += cycle - last_a_cyc;
               latency_sum_n++;
               if ( cycle < sum_due_cyc_earliest ) begin
                  n_underdue_errs++;
                  if ( n_underdue_errs < 5 ) begin
                     // Report the observed latency (a positive cycle
                     // count) and the minimum actually enforced through
                     // sum_due_cyc_earliest, which is lat_min_empty.
                     $write
                       ("At cyc %0d, value ready too soon, %0d, cyc. (Min cyc %0d.)\n",
                        cycle, cycle - last_a_cyc, lat_min_empty
                        );
                     if ( event_trace != "" ) $write(" %s\n",event_trace);
                  end
               end
            end
            if ( !error_val_issued && sum !== rsum ) begin
               error_val_issued = 1;
               n_errs++;
               if ( n_errs < 5 ) begin
                  // rsum is an int, so it is printed with an integer format.
                  $write("At cyc %0d, wrong sum, %0d != %0d (correct)\n",
                         cycle, sum, rsum);
                  if ( event_trace != "" ) $write(" %s\n",event_trace);
               end
            end
         end else if ( sum_due_cyc <= cycle ) begin
            if ( !error_late_issued ) begin
               error_late_issued = 1;
               n_overdue_errs++;
               sum_due_cyc = cyc_max;
               if ( n_overdue_errs < 5 ) begin
                  $write("At cycle %0d, sum overdue.\n",cycle);
                  if ( event_trace != "" ) $write(" %s\n",event_trace);
               end
            end
         end
      end
   end

   // Stimulus: applies reset and the random input series.
   initial begin
      automatic int seed = 4755;
      automatic int series_sparsity = 0;
      rsum = 0;
      n_errs = 0;
      latency_sum_n = 0;
      latency_sum = 0;
      error_val_issued = 0;
      error_late_issued = 1;
      series_idx = 0;
      value_idx = 0;
      series_n_vals = 0;
      n_overdue_errs = 0;
      n_underdue_errs = 0;
      sum_due_cyc = cyc_max;
      sum_due_cyc_earliest = 0;
      n_tests_done = 0;
      n_correct = 0;
      event_trace = "";
      wait( tstart );
      $write("Starting tests for %0d-stage pipeline.\n",n_stages);

      // Initial reset, held for one cycle.
      @( negedge clk );
      reset = 1;
      event_trace = $sformatf("R(%0d)",cycle);
      a_valid = 0;
      a = 0;
      @( negedge clk );
      cycle_limit = cycle + n_stages * 2;
      tests_start = 1;
      reset = 0;
      @( negedge clk );
      wait( sum_valid );

      // Inputs change on the falling edge of clk.
      while ( series_idx < n_tests ) begin
         @( negedge clk );
         a = $dist_uniform( seed, 1, a_in_max );
         if ( value_idx >= series_n_vals ) begin
            // All values of this series have been applied. Once the sum
            // is valid, reset and set up the next series.
            a_valid = 0;
            if ( sum_valid ) begin
               series_idx++;
               value_idx = 0;
               event_trace = $sformatf("R(%0d)",cycle);
               reset = 1;
               a_valid = 0;
               rsum = 0;
               series_n_vals = $dist_uniform( seed, 1, 10 );
               series_sparsity = series_idx % 6;
               // After reset a valid sum (of zero) is expected at once.
               sum_due_cyc = cycle + 1;
               sum_due_cyc_earliest = cycle;
               error_val_issued = 0;
               error_late_issued = 0;
               // NOTE(review): this limit is reached at the very next
               // falling edge, where the else branch below raises it
               // again. Confirm the timeout wait in the clock block
               // cannot fire first on the target simulator.
               cycle_limit = cycle + 1;
            end
         end else begin
            reset = 0;
            // Sparsity 0: a value every cycle. Otherwise a value with
            // probability 1 / ( series_sparsity + 1 ).
            a_valid = series_sparsity == 0
              || $dist_uniform( seed, 0, series_sparsity ) == 0;
            cycle_limit = cycle + lat_limit_full;
         end
         if ( a_valid ) begin
            value_idx++;
            event_trace = {event_trace,$sformatf("+%0d(%0d)",a,cycle)};
            error_val_issued = 0;
            error_late_issued = 0;
            rsum += a;
            last_a_cyc = cycle;
            // The deadline depends on whether the accumulator was idle
            // when this value was applied.
            sum_due_cyc = cycle +
              ( sum_valid ? lat_limit_empty : lat_limit_full );
            // The first value of a series may appear in sum immediately.
            sum_due_cyc_earliest =
              cycle + ( value_idx > 1 ? lat_min_empty : 0 );
         end
      end
      done = 1;
   end

endmodule
// cadence translate_on