////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2019 Homework 6
//
/// Assignment  https://www.ece.lsu.edu/koppel/v/2019/hw06.pdf
//
/// This file has the versions of the problem that were solved live in Fall 2020.

`default_nettype none

//////////////////////////////////////////////////////////////////////////////
///  Problem 1
//
///    Complete add_accum so that it accumulates a sum.
//
//     [✔] Only add_accum may be modified. (Except the testbench, to help debug.)
//
//     [✔] add_accum must use an add_pipe module to compute the sum.
//
//     [✔] Module must be synthesizable. Use command: genus -files syn.tcl
//
//     [✔] As always, avoid costly, slow, and confusing code.
//     [✔] As always, don't assume parameters will be at their default values.

/// add_accum -- Accumulate a running sum using a pipelined adder.
//
//  Parameters
//    w         Width, in bits, of the input values and of the sum.
//    n_stages  Number of stages in the pipelined adder, add_p0.
//
//  Ports
//    sum        Accumulated value. Set to zero by reset.
//    sum_valid  Registered flag. Set by reset; afterward set at a clock
//               edge when either no value has arrived since reset or
//               when no adder stage is occupied and exactly one item
//               remains to be combined.
//    ai         Value to add to the sum.
//    ai_valid   When 1, ai holds a value to be accumulated this cycle.
//    reset      Synchronous reset.
//    clk        Clock; all state changes at the positive edge.
//
module add_accum
  #( int w = 20, n_stages = 3 )
   ( output logic [w-1:0] sum,
     output logic sum_valid,
     input uwire [w-1:0] ai,
     input uwire ai_valid, reset, clk );

`ifdef xxxxx
   // Non-pipelined starting point, kept only for reference. Not compiled
   // unless xxxxx is defined.
   always_ff @ ( posedge clk )
     if ( reset ) sum = 0; else if ( ai_valid ) sum += ai;

   always_comb sum_valid = 1;
`endif

   /// The code above must be removed and the pipelined adder used instead.

   // st_occ[i] is 1 when stage i of add_p0 holds a pair that is being added.
   // (Bit n_stages is written by the shift loop below but never read.)
   logic [n_stages:0] st_occ;

   // 1 when sum holds a partial sum that still must be combined with
   // other items.
   logic sum_occ;

   // A pair that entered add_p0 appears at aout after n_stages clock
   // edges, which is when st_occ[n_stages-1] is set.
   uwire aout_valid = st_occ[n_stages-1];

   // Adder output and adder operands. Each operand falls back to sum
   // when its preferred source (ai for a0, aout for a1) is not available.
   uwire [w-1:0] aout;
   uwire [w-1:0] a0 = ai_valid ? ai : sum;
   uwire [w-1:0] a1 = aout_valid ? aout : sum;

   // The pipelined adder. Without this instance aout would be undriven.
   add_pipe #( .w(w), .n_stages(n_stages) ) add_p0
     ( .sum(aout), .a(a0), .b(a1), .clk(clk) );

   // Number of items available to be combined this cycle: a new input,
   // a result leaving the adder, and a partial sum held in sum.
   uwire [1:0] n_items = ai_valid + aout_valid + sum_occ;

   // 1 from reset until the first valid input is seen.
   logic sum_zero;

   always_ff @( posedge clk ) if ( reset ) begin

      st_occ <= 0;
      sum <= 0;
      sum_valid <= 1;
      sum_occ <= 0;
      sum_zero <= 1;

   end else begin

      // Keep track of which stage of add_p0 is occupied.
      //
      // A pair enters the adder only when at least two items are available.
      st_occ[0] <= n_items > 1;
      for ( int i=1; i<=n_stages; i++ ) st_occ[i] <= st_occ[i-1];

      if ( ai_valid ) sum_zero <= 0;

      // With an odd number of items one of them is left in sum: either
      // the single item is stored there (n_items == 1) or ai and aout
      // enter the adder and sum keeps its value (n_items == 3).
      sum_occ <= n_items[0];

      // Uses the pre-edge value of st_occ, including bit n_stages-1.
      sum_valid <= sum_zero || !st_occ[n_stages-1:0] && n_items == 1;

      // A lone item is stored in sum. An adder result has priority over
      // a new input, though with n_items == 1 at most one can be present.
      if ( n_items == 1 ) sum <= aout_valid ? aout : ai_valid ? ai : sum;

   end

endmodule

// Alternative draft of the accumulator module, compiled only when the
// macro YYYYY is defined. It has no sum_zero register; sum_valid is
// simply set whenever exactly one item is available (n_items == 1).
//
// NOTE(review): As it appears here there is no line with the module
// keyword and name ahead of the parameter list, and no adder is
// instantiated, so aout has no driver. Both would need to be supplied
// before defining YYYYY.
`ifdef YYYYY
#( int w = 20, n_stages = 3 )
( output logic [w-1:0] sum,
output logic sum_valid,
input uwire [w-1:0] ai,
input uwire ai_valid, reset, clk );

// Non-pipelined starting point; compiled only if xxxxx is also defined.
`ifdef xxxxx
always_ff @ ( posedge clk )
if ( reset ) sum = 0; else if ( ai_valid ) sum += ai;

always_comb sum_valid = 1;
`endif

/// The code above must be removed and the pipelined adder used instead.

logic [n_stages:0] st_occ; // Indicate which stage of add_p0 is occupied.
// Set when n_items is odd, see the assignment in the always_ff block.
logic sum_occ;

// Taken from stage n_stages-1 of the occupancy shift register.
uwire aout_valid = st_occ[n_stages-1];

// aout is presumably meant to be the adder output -- TODO confirm, no
// driver is visible here.
uwire [w-1:0] aout;
// Candidate adder operands: each one uses sum when its preferred source
// (ai for a0, aout for a1) is not valid.
uwire [w-1:0] a0 = ai_valid ? ai : sum;  // May need other connections.
uwire [w-1:0] a1 = aout_valid ? aout : sum;

// Count of items available this cycle: new input, adder result, and a
// value held in sum.
uwire [1:0] n_items = ai_valid + aout_valid + sum_occ;

// Synchronous reset clears the occupancy bits and sum and sets sum_valid.
always_ff @( posedge clk ) if ( reset ) begin

st_occ <= 0;
sum <= 0;
sum_valid <= 1;
sum_occ <= 0;

end else begin

// Keep track of which stage of add_p0 is occupied.
//
//  st_occ[0] <= ai_valid;  // Lets initially assume all values enter pipe.
// Stage 0 is marked occupied only when two or more items are available.
st_occ[0] <= n_items > 1;
//
//
// Shift the occupancy bits by one stage each clock edge.
for ( int i=1; i<=n_stages; i++ ) st_occ[i] <= st_occ[i-1];
//
// Simpler way: st_occ <= { st_occ, ai_valid };

// Low bit of the count: set when one or three items are available.
sum_occ <= n_items[0];
sum_valid <= n_items == 1;

// With exactly one item, store it in sum; aout is checked before ai.
if ( n_items == 1 ) sum <= aout_valid ? aout : ai_valid ? ai : sum;

end

endmodule
`endif

// Early draft of the accumulator module, compiled only when the macro
// xxxx is defined. Here the adder operands are always ai and sum.
//
// NOTE(review): As it appears here there is no line with the module
// keyword and name ahead of the parameter list, and no adder is
// instantiated, so aout has no driver.
// NOTE(review): sum_valid is driven both by a continuous assignment and
// by a nonblocking assignment in the reset branch below; that conflict
// would need to be resolved before defining xxxx.
`ifdef xxxx
#( int w = 20, n_stages = 3 )
( output logic [w-1:0] sum,
output logic sum_valid,
input uwire [w-1:0] ai,
input uwire ai_valid, reset, clk );

// Non-pipelined starting point; compiled only if xxxxx is also defined.
`ifdef xxxxx
always_ff @ ( posedge clk )
if ( reset ) sum = 0; else if ( ai_valid ) sum += ai;

always_comb sum_valid = 1;
`endif

/// The code above must be removed and the pipelined adder used instead.

// aout is presumably meant to be the adder output -- TODO confirm, no
// driver is visible here.
uwire [w-1:0] aout;
uwire [w-1:0] a0 = ai;  // May need other connections.
uwire [w-1:0] a1 = sum;

logic [n_stages:0] st_occ; // Indicate which stage of add_p0 is occupied.

// Taken from stage n_stages-1 of the occupancy shift register.
uwire aout_valid = st_occ[n_stages-1];

// sum_valid is 1 when none of the bits st_occ[n_stages-1:0] is set.
assign sum_valid = !st_occ[n_stages-1:0];

// Count of items available this cycle. Unlike the later versions, the
// third term is sum_valid rather than a separate occupancy flag.
uwire [1:0] n_items = ai_valid + aout_valid + sum_valid;

always_ff @( posedge clk ) if ( reset ) begin

st_occ <= 0;
sum <= 0;
// NOTE(review): second driver of sum_valid, see the assign above.
sum_valid <= 0;

end else begin

// Keep track of which stage of add_p0 is occupied.
//
//  st_occ[0] <= ai_valid;  // Lets initially assume all values enter pipe.
// Stage 0 is marked occupied only when two or more items are available.
st_occ[0] <= n_items > 1;
//
//
// Shift the occupancy bits by one stage each clock edge.
for ( int i=1; i<=n_stages; i++ ) st_occ[i] <= st_occ[i-1];
//
// Simpler way: st_occ <= { st_occ, ai_valid };

// Default assignment. It takes effect only when the if statement below
// assigns nothing, that is, when aout_valid is 0 and sum_valid is 1.
sum <= aout;

// These later nonblocking assignments override the default above.
if ( aout_valid ) sum <= aout; else if ( !sum_valid ) sum <= ai;

end

endmodule
`endif

//
//  Do not modify this.

/// add_pipe -- Pipelined adder.
//
//  Parameters
//    w         Width, in bits, of the operands and of the sum.
//    n_stages  Number of pipeline stages.
//
//  Ports
//    sum   Sum of the a and b values that were sampled n_stages
//          positive clock edges earlier (counting the sampling edge).
//    a, b  Operands. A new pair is sampled at every positive edge of clk.
//    clk   Clock.
//
//  Each stage adds bits_per_stage bits of the operands together with
//  the carry out of the previous stage.
//
module add_pipe
  #( int w = 21, n_stages = 3 )
   ( output uwire [w-1:0] sum,
     input uwire [w-1:0] a, b,
     input uwire clk );

   // Operand bits handled by each stage (w / n_stages, rounded up) and
   // the resulting internal width.
   localparam int bits_per_stage = ( w + n_stages - 1 ) / n_stages;
   localparam int wr = n_stages * bits_per_stage; // w rounded.

   // Pipeline latches. Index s holds the values at the input of stage s.
   logic [wr-1:0] pl_a[n_stages+1], pl_b[n_stages+1], pl_sum[n_stages+1];
   logic pl_carry[n_stages+1];

   always_ff @( posedge clk ) begin

      // Blocking assignments: stage 0 below uses the operands sampled
      // at this clock edge.
      pl_a[0] = a;
      pl_b[0] = b;
      pl_carry[0] = 0;

      for ( int s=0; s<n_stages; s++ ) begin

         // Sum of the low bits_per_stage bits plus the incoming carry.
         // The extra high bit is the carry out.
         automatic logic [bits_per_stage:0] sumi =
           pl_a[s][bits_per_stage-1:0] +
           pl_b[s][bits_per_stage-1:0] + pl_carry[s];

         pl_carry[s+1] <= sumi[bits_per_stage];

         // Insert the new sum bits at the top and move earlier sum bits
         // down by bits_per_stage positions.
         pl_sum[s+1] <=
           { sumi[bits_per_stage-1:0], pl_sum[s] } >> bits_per_stage;

         // Discard the operand bits just used so that the next stage
         // finds its bits at the low end.
         pl_a[s+1] <= pl_a[s] >> bits_per_stage;
         pl_b[s+1] <= pl_b[s] >> bits_per_stage;

      end

   end

   assign sum = pl_sum[ n_stages ][w-1:0];

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Testbench
//
//  May be modified to facilitate debugging.

// Copy the clock and the cycle count through a program block.
// The outputs are plain continuous-assignment copies of the inputs.
// NOTE(review): presumably used so that testbench code waiting on
// clk_reactive resumes in the reactive region -- confirm.
program reactivate
  ( output uwire clk_reactive,
    output int cycle_reactive,
    input uwire clk,
    input var int cycle );

   assign cycle_reactive = cycle;
   assign clk_reactive = clk;

endprogram

// Top-level testbench: instantiates one testbench_n for each pipeline
// depth in n_stages and runs them one after another.
module testbench;

   // Number of pipeline depths tested. Must match the number of
   // elements in n_stages; this is checked at the start of simulation.
   localparam int nw = 4;
   localparam int n_stages[] = { 2, 3, 5, 6 };

   initial if ( nw != n_stages.size() )
     $fatal(1,"Constant nw should be %0d.\n",n_stages.size() );

   int t_errs;     // Total number of errors.
   initial t_errs = 0;
   final $write("Total number of errors: %0d\n",t_errs);

   uwire d[nw:-1];    // Start / Done signals.
   assign d[-1] = 1;  // Initialize first at true.

   // Instantiate a testbench at each size.
   //
   // The done output of instance i is the tstart input of instance i+1,
   // so each instance starts when the one before it finishes.
   //
   for ( genvar i=0; i<nw; i++ )
     testbench_n #(n_stages[i]) t2( .done(d[i]), .tstart(d[i-1]) );

endmodule

// Testbench for add_accum at one pipeline depth.
//
//  Parameter
//    n_stages  Number of adder stages given to the add_accum instance.
//
//  Ports
//    done    Set to 1 when all series have been run or when the cycle
//            limit is exceeded.
//    tstart  Testing begins when this input becomes 1.
//
//  The tests are organized into series. Each series starts with a reset
//  and is followed by a random number of input values. The sum output
//  is checked for its value, for being ready too early, and for being
//  ready too late.
//
module testbench_n
  #( int n_stages = 3 )
   ( output logic done, input uwire tstart );

   localparam int n_tests = 10000;  // Number of series to run.
   localparam int w = 30;           // Width given to add_accum.

   localparam int a_in_max = 42;      // Largest input value generated.
   localparam int cyc_max = 1 << 30;  // Used to mean "no deadline".

   // Latency bounds, in cycles, measured from the most recent input.
   localparam int lat_limit_empty = n_stages + 2;
   localparam int lat_min_empty = n_stages;
   localparam int lat_limit_full = 2 + (1+$clog2(n_stages)) * ( n_stages + 1 );

   bit clk;
   int cycle, cycle_limit;
   logic clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clk,cycle);

   // Text describing the resets and inputs of the current series.
   // Printed with error messages.
   string event_trace;

   // Clock generator and watchdog.
   initial begin
      clk = 0;
      cycle = 0;

      done = 0;
      cycle_limit = cyc_max;
      wait( tstart );

      fork
         // clk toggles each time unit; cycle advances on the 1 -> 0 change.
         while ( !done ) #1 cycle += clk++;
         wait( cycle >= cycle_limit )
           $write("Exit from clock loop at cycle %0d, limit %0d.  %s\n %s\n",
                  cycle, cycle_limit, "** CYCLE LIMIT EXCEEDED **",
                  event_trace);
      join_any;

      done = 1;
   end

   uwire [w-1:0] sum;
   uwire sum_valid;
   logic [w-1:0] a;
   logic a_valid, reset;

   add_accum #(w,n_stages) fpa(sum, sum_valid, a, a_valid, reset, clk);

   int rsum;  // Expected sum for the current series.
   bit tests_start;
   int series_idx, value_idx, series_n_vals;
   int n_errs, n_underdue_errs, n_overdue_errs, n_tests_done;
   int sum_due_cyc_earliest, sum_due_cyc, n_correct;
   int last_a_cyc;
   int latency_sum, latency_sum_n;
   bit error_val_issued, error_late_issued;

   // Print a summary when done and add this instance's errors to the
   // total kept by the top-level testbench.
   initial wait ( done ) begin
      automatic int not_done = n_tests - series_idx;
      $write("Done with %0d-stage tests, %0d series.\n Correct, %0d; errors : %0d not done, %0d val, %0d/%0d early/late.\n",
             n_stages, series_idx,
             n_correct, not_done, n_errs, n_underdue_errs, n_overdue_errs );
      $write("For %0d stages average latency %.2f cycles.\n",
             n_stages,
             real'(latency_sum) / ( latency_sum_n ? latency_sum_n : 1 ) );
      testbench.t_errs += n_errs + n_underdue_errs + n_overdue_errs + not_done;
   end

   // Checker: examine the module outputs at each positive clock edge.
   initial begin

      wait( tests_start );

      while ( !done ) @( posedge clk_reactive ) begin

         if ( sum_valid ) begin

            // A result is pending when a deadline has been set for it.
            automatic bit pending = sum_due_cyc < cyc_max;

            if ( pending ) begin
               n_tests_done++;
               sum_due_cyc = cyc_max;
               if ( sum === rsum ) n_correct++;
               latency_sum += cycle - last_a_cyc;
               latency_sum_n++;
               if ( cycle < sum_due_cyc_earliest ) begin

                  n_underdue_errs++;
                  if ( n_underdue_errs < 5 ) begin
                     // Show the observed latency and the earliest cycle
                     // at which the sum was allowed to be ready.
                     $write
                       ("At cyc %0d, value ready too soon, %0d, cyc. (Min cyc %0d.)\n",
                        cycle, cycle - last_a_cyc, sum_due_cyc_earliest
                        );
                     if ( event_trace != "" ) $write(" %s\n",event_trace);
                  end

               end
            end

            // Report a wrong value at most once per input or reset.
            if ( !error_val_issued && sum !== rsum ) begin
               error_val_issued = 1;
               n_errs++;
               if ( n_errs < 5 ) begin
                  $write("At cyc %0d, wrong sum, %0d != %0d (correct)\n",
                         cycle, sum, rsum);
                  if ( event_trace != "" ) $write(" %s\n",event_trace);
               end
            end

         end else if ( sum_due_cyc <= cycle ) begin

            // The sum is not valid and its deadline has passed.
            if ( !error_late_issued ) begin
               error_late_issued = 1;
               n_overdue_errs++;
               sum_due_cyc = cyc_max;
               if ( n_overdue_errs < 5 ) begin
                  $write("At cycle %0d, sum overdue.\n",cycle);
                  if ( event_trace != "" ) $write(" %s\n",event_trace);
               end
            end
         end

      end

   end

   // Stimulus: reset the module, then apply the series of input values.
   initial begin

      automatic int seed = 4755;
      automatic int series_sparsity = 0;
      rsum = 0;
      n_errs = 0;
      latency_sum_n = 0;
      latency_sum = 0;
      error_val_issued = 0;
      error_late_issued = 1;
      series_idx = 0;
      value_idx = 0;
      series_n_vals = 0;
      n_overdue_errs = 0;
      n_underdue_errs = 0;
      sum_due_cyc = cyc_max;
      sum_due_cyc_earliest = 0;
      n_tests_done = 0;
      n_correct = 0;
      event_trace = "";

      wait( tstart );
      $write("Starting tests for %0d-stage pipeline.\n",n_stages);

      // Initial reset. Inputs are changed at negative clock edges.
      @( negedge clk );
      reset = 1;
      event_trace = $sformatf("R(%0d)",cycle);
      a_valid = 0;
      a = 0;
      @( negedge clk );
      cycle_limit = cycle + n_stages * 2;
      tests_start = 1;
      reset = 0;
      @( negedge clk );
      wait( sum_valid );

      while ( series_idx < n_tests ) begin

         @( negedge clk );

         a = $dist_uniform( seed, 1, a_in_max );

         if ( value_idx >= series_n_vals ) begin

            // All values of the current series have been applied.
            a_valid = 0;

            // Start the next series once the final sum is valid.
            if ( sum_valid ) begin

               series_idx++;
               value_idx = 0;
               event_trace = $sformatf("R(%0d)",cycle);
               reset = 1;
               a_valid = 0;
               rsum = 0;
               series_n_vals = $dist_uniform( seed, 1, 10 );
               series_sparsity = series_idx % 6;
               sum_due_cyc = cycle + 1;
               sum_due_cyc_earliest = cycle;
               error_val_issued = 0;
               error_late_issued = 0;
               cycle_limit = cycle + 1;
            end

         end else begin

            // With sparsity 0 a value is applied every cycle, otherwise
            // one is applied with probability 1/(series_sparsity+1).
            reset = 0;
            a_valid = series_sparsity == 0
              || $dist_uniform( seed, 0, series_sparsity ) == 0;
            cycle_limit = cycle + lat_limit_full;

         end

         if ( a_valid ) begin
            value_idx++;
            event_trace = {event_trace,$sformatf("+%0d(%0d)",a,cycle)};
            error_val_issued = 0;
            error_late_issued = 0;
            rsum += a;
            last_a_cyc = cycle;
            // The deadline is shorter when the sum was valid, which is
            // taken to mean that the module had no work in progress.
            sum_due_cyc = cycle +
              ( sum_valid ? lat_limit_empty : lat_limit_full );
            // The first value of a series needs no addition, so only
            // later values have a minimum latency.
            sum_due_cyc_earliest =
              cycle + ( value_idx > 1 ? lat_min_empty : 0 );
         end

      end

      done = 1;

   end

endmodule