////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2018 Homework 7 -- SOLUTION
//

/// Assignment  https://www.ece.lsu.edu/koppel/v/2018/hw07.pdf

`default_nettype none

//////////////////////////////////////////////////////////////////////////////
///  Problem 1
//
/// Complete mult_seq_ds_prob_1 as described in the handout and below.
//
//     [✔] Start multiplying when in_valid is 1 at a positive clock edge ..
//     [✔] .. even if that means abandoning a multiplication in progress.
//     [✔] Set out_avail to 1 when prod holds the result for
//         most recent plier*cand.
//
//     [✔] The module must pass the testbench.
//         Average cycles should be w/m+1
//     [✔] The module must be synthesizable.
//     [✔] Make sure that synthesized hardware is reasonably fast.
//
//     [✔] Code must be reasonably efficient.
//     [✔] Do not change module parameters.
//     [✔] Do not change ports, EXCEPT changing between var and net kinds.
//     [✔] Don't assume that parameter values will match those used here.
//     [✔] USE DEBUGGING TOOLS LIKE SimVision.
//

module mult_seq_ds_prob_1
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     // SOLUTION: Change kind of out_avail from net (uwire) to var.
     output var logic  out_avail,
     input uwire clk, in_valid,
     input uwire [w-1:0] plier, cand );

   /// Sequential multiplier with handshaking, m multiplicand bits per clock.
   //
   //  Ports
   //    prod       Loaded from accum on the clock edge that sets out_avail.
   //    out_avail  Cleared on an edge where in_valid is 1; set to 1 once
   //               prod holds the product for that request.
   //    in_valid   When 1 at a positive edge a new multiplication starts,
   //               replacing any multiplication already in progress.
   //    plier      Read directly at every clock edge (it is not registered
   //               in this module).
   //    cand       Copied into accum on the edge where in_valid is 1.
   //
   //  Timing
   //    The edge that samples in_valid also performs the first step, so
   //    out_avail is set on positive edge number iterations+1, counting
   //    the in_valid edge as number 1.
   //
   //  NOTE(review): each step takes its digit from accum[m-1:0] after
   //  accum has been shifted right by m bits. That appears to give the
   //  intended digit only when m divides w evenly -- confirm before
   //  instantiating with other parameter combinations.

   // Number of m-bit multiplicand digits, rounded up.
   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);
   localparam logic [w+m-1:0] zero = 0;  // Used to set precision to w+m bits.

   // Multiplicand viewed as an array of m-bit digits.
   // Not referenced by the logic in this module.
   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   // One bit wider than iter_lg so that iter can hold the value iterations.
   bit [iter_lg:0] iter;

   // Upper bits hold the running sum; lower bits hold the multiplicand
   // digits that have not been used yet.
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      /// SOLUTION, Problem 1
      //
      //  - Start a new multiplication whenever in_valid is 1.
      //  - When multiplication is finished set out_avail to 1.
      //
      //  Blocking assignments are used, so the partial-product step at
      //  the bottom of this block sees the values written here.
      //
      if ( in_valid ) begin

         // If in_valid is 1 start a multiplication.

         accum = cand;
         iter = 0;
         out_avail = 0;

      end else if ( !out_avail && iter == iterations ) begin

         // If a multiplication is in progress (!out_avail) ..
         // .. and we just finished the last iteration of a multiplication ..
         // .. make the result available.

         out_avail = 1;
         prod = accum;

      end

      // Add on a partial product.
      // Do this whether or not a multiplication is in progress.
      //
      // The w+m bit sum goes in the upper bits of accum and the remaining
      // lower bits are shifted right by one digit.

      accum = { zero + plier * accum[m-1:0] + accum[2*w-1:w], accum[w-1:m] };
      iter++;

   end

endmodule

//////////////////////////////////////////////////////////////////////////////
///  Problem 2
//
/// Complete mult_seq_d_prob_2 as described in the handout and below.
//
//     [✔] Skip over multiplicand digits that are zero.
//     [✔] Start multiplying when in_valid is 1 at a positive clock edge ..
//     [✔] .. even if that means abandoning a multiplication in progress.
//     [✔] Set out_avail to 1 when prod holds the result for
//         most recent plier*cand.
//
//     [✔] The module must pass the testbench.
//         Average cycles should be less than w/m+1
//     [✔] The module must be synthesizable.
//         The period should not be too much longer than the original module.
//     [✔] Make sure that synthesized hardware is reasonably fast.
//
//     [✔] The module must be synthesizable.
//     [✔] Code must be reasonably efficient.
//     [✔] Do not change module parameters.
//     [✔] Do not change ports, EXCEPT changing between var and net kinds.
//     [✔] Don't assume that parameter values will match those used here.
//     [✔] USE DEBUGGING TOOLS LIKE SimVision.

module mult_seq_d_prob_2
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     // SOLUTION: Change kind of out_avail from net (uwire) to var.
     output logic out_avail,
     input uwire clk, in_valid,
     input uwire [w-1:0] plier, cand );

   /// Sequential multiplier with handshaking that skips zero digits.
   //
   //  Ports
   //    prod       Loaded from accum on the clock edge that sets out_avail.
   //    out_avail  Cleared on an edge where in_valid is 1; set to 1 once
   //               prod holds the product for that request.
   //    in_valid   When 1 at a positive edge a new multiplication starts,
   //               replacing any multiplication already in progress.
   //    plier      Read directly at every clock edge (not registered here).
   //    cand       Read directly at every clock edge through cand_2d
   //               (not registered here).
   //
   //  Timing
   //    Digit 0 is always processed on the in_valid edge. Each non-zero
   //    digit above digit 0 takes one further edge, and one more edge
   //    sets out_avail.

   // Number of m-bit multiplicand digits, rounded up.
   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   // Multiplicand viewed as an array of m-bit digits.
   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   // Index of the digit to use at the next clock edge.
   // After the in_valid edge, a value of zero means no digits remain.
   bit [iter_lg-1:0] iter;
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      logic [iter_lg-1:0] next_iter;

      /// SOLUTION -- Problem 2
      //
      //  Implement handshaking.
      //  Computation is completed when iter is zero. (See below.)
      //
      if ( in_valid ) begin

         iter = 0;
         accum = 0;
         out_avail = 0;

      end else if ( !out_avail && iter == 0 ) begin

         prod = accum;
         out_avail = 1;

      end

      // Add the partial product for digit iter at its bit position.
      // This runs at every edge, including the edge that copies accum to
      // prod; prod is written first, so it is not affected by this add.
      accum += plier * cand_2d[iter] << ( iter * m );

      /// SOLUTION -- Problem 2
      //
      //  Set iter to ..
      //  .. index of next non-zero multiplicand digit ..
      //  .. or to zero if multiplication is complete.
      //
      //  Scan multiplicand digits starting at most significant digit.
      //  Update next_iter whenever ..
      //   i > iter   ( meaning that partial product has not yet been used ) ..
      //   and digit, cand_2d[i], is non-zero.
      //
      //  The scan runs from high index to low, so the last assignment,
      //  the smallest qualifying index, is the one that is kept.
      //
      next_iter = 0;
      for ( int i=iterations-1;  i>0;  i-- )
        if ( i>iter && cand_2d[i] ) next_iter = i;
      iter = next_iter;

   end

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Comparison Modules
///

/// The modules below are for reference.

module mult_seq_ds_prob_1_orig
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     output uwire out_avail,
     input uwire clk, in_valid,
     input uwire [w-1:0] plier, cand );

   /// DO NOT MODIFY THIS MODULE.
   //  It is to be used for comparison when performing synthesis.

   //  Free-running sequential multiplier without handshaking.
   //
   //  - in_valid is not read and out_avail is not driven in this module.
   //  - Whenever iter reaches iterations, accum is copied to prod and a
   //    new multiplication is started from the current value of cand.
   //    After the first pass this happens once every iterations edges.
   //  - plier is read directly at every clock edge.

   // Number of m-bit multiplicand digits, rounded up.
   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);
   localparam logic [w+m-1:0] zero = 0;  // Used to set precision to w+m bits.

   // Multiplicand viewed as an array of m-bit digits.
   // Not referenced by the logic in this module (see note below).
   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   // One bit wider than iter_lg so that iter can hold the value iterations.
   bit [iter_lg:0] iter;
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      if ( iter == iterations ) begin

         // Last step done: publish the result and reload for the next one.
         prod = accum;
         accum = cand;
         iter = 0;

      end

      // Note: accum[m-1:0] is the same as cand_2d[iter];

      // Add a partial product into the upper bits of accum and shift the
      // remaining lower bits right by one digit.
      accum = { zero + plier * accum[m-1:0] + accum[2*w-1:w], accum[w-1:m] };
      iter++;

   end

endmodule

module mult_seq_d_prob_2_orig
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     output uwire out_avail,
     input uwire clk, in_valid,
     input uwire [w-1:0] plier, cand );

   /// DO NOT MODIFY THIS MODULE.
   //  It is to be used for comparison when performing synthesis.

   //  Free-running sequential multiplier without handshaking.
   //
   //  - in_valid is not read and out_avail is not driven in this module.
   //  - Every digit is processed in order, whether or not it is zero.
   //  - Whenever iter reaches iterations, accum is copied to prod and
   //    accum is cleared for the next multiplication. After the first
   //    pass this happens once every iterations edges.
   //  - plier and cand are read directly at every clock edge.

   // Number of m-bit multiplicand digits, rounded up.
   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   // Multiplicand viewed as an array of m-bit digits.
   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   // One bit wider than iter_lg so that iter can hold the value iterations.
   bit [iter_lg:0] iter;
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      if ( iter == iterations ) begin

         // Last digit done: publish the result and clear the accumulator.
         prod = accum;
         accum = 0;
         iter = 0;

      end

      // Add the partial product for digit iter at its bit position.
      accum += plier * cand_2d[iter] << ( iter * m );

      iter++;

   end

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

program reactivate
  ( output uwire clk_reactive,
    output int cycle_reactive,
    input uwire clk,
    input var int cycle );

   // Forward the clock and the cycle count, unchanged, to the outputs.
   //
   // The assignments live in a program block, which the language schedules
   // in the reactive region set. Presumably this is so that testbench code
   // waiting on clk_reactive runs after design code triggered by the same
   // clk edge -- confirm against the testbench's use of clk_reactive.

   assign cycle_reactive = cycle;
   assign clk_reactive = clk;

endprogram

module testbench;

   /// Testbench for the multiplier modules.
   //
   //  Each instance is registered in pi[] under a name built from a label
   //  and its m parameter. The same list of operand pairs is applied to
   //  every registered instance, each result is compared with
   //  pliers[i] * cands[i], and a summary line is printed per instance.

   localparam int w = 20;           // Operand width used for every instance.
   localparam int num_tests = 400;  // Operand pairs applied to each instance.
   localparam int NUM_MULT = 20;    // Size of the per-instance signal arrays.
   localparam int err_limit = 7;    // Bounds the number of messages printed.

   bit use_others;
   logic [w-1:0] plier, cand;

   // Per-instance connections, indexed by the idx value stored in pi[].
   logic [w-1:0] plierp[NUM_MULT], candp[NUM_MULT];
   logic [2*w-1:0] prod[NUM_MULT];
   uwire availn[NUM_MULT];   // out_avail as connected to the instances.
   logic avail[NUM_MULT];    // Copy of availn; entries that are z are skipped.
   logic in_valid[NUM_MULT];

   // One outstanding test: index into pliers/cands and its starting cycle.
   typedef struct { int tidx; int cycle_start; } Test_Vector;

   // Bookkeeping for one module under test.
   typedef struct { int idx;
                    int err_count = 0;
                    int err_timing = 0;
                    Test_Vector tests_active[$];
                    bit all_tests_started = 0;
                    bit seq = 0; bit pipe = 0;
                    bit bpipe = 0;
                    int deg = 1;
                    int ncompleted = 0;
                    int cyc_tot = 0;
                    int latency = 0;
                    } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * w * 4;
   int cycle;
   bit done;
   logic clock;

   logic clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Clock generator and end-of-simulation control.
   initial begin
      clock = 0;
      cycle = 0;

      fork
         // Toggle clock every 10 time units. The value added to cycle is
         // the value of clock before the toggle, so cycle advances once
         // per falling edge.
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   // Registration tasks. Each stores an entry in pi[] keyed by
   // "<name> Deg <deg>". They differ only in the flags that are set.
   //
   //   bpipe  The checker waits for avail[idx] rather than for a fixed
   //          cycle count.
   //   pipe   Selects the shorter latency estimate and randomized gaps
   //          between tests.

   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].bpipe = 0;
   endtask

   task pi_bseq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].bpipe = 1;
   endtask

   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 0;
   endtask

   task pi_bpipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 1;
   endtask

   // Problem 1 solution modules.

   mult_seq_ds_prob_1 #(w,1) prob1_m1(prod[6], availn[6], clock,
                                      in_valid[6], plierp[6], candp[6]);
   initial pi_bseq(6,"Prob 1",prob1_m1.m);

   mult_seq_ds_prob_1 #(w,2) prob1_m2(prod[7], availn[7], clock,
                                      in_valid[7], plierp[7], candp[7]);
   initial pi_bseq(7,"Prob 1",prob1_m2.m);

   mult_seq_ds_prob_1 #(w,4) prob1_m4(prod[9], availn[9], clock,
                                      in_valid[9], plierp[9], candp[9]);
   initial pi_bseq(9,"Prob 1",prob1_m4.m);

   // Reference modules without handshaking.

   mult_seq_ds_prob_1_orig #(w,1) ms14(prod[14], availn[14], clock,
                                       in_valid[14], plierp[14], candp[14]);
   initial pi_seq(14,"Seq",ms14.m);

   mult_seq_ds_prob_1_orig #(w,2) ms4(prod[4], availn[4], clock,
                                      in_valid[4], plierp[4], candp[4]);
   initial pi_seq(4,"Seq",ms4.m);

   mult_seq_ds_prob_1_orig #(w,4) ms5(prod[5], availn[5], clock,
                                      in_valid[5], plierp[5], candp[5]);
   initial pi_seq(5,"Seq",ms5.m);

   // Problem 2 solution modules.

   mult_seq_d_prob_2 #(w,1) prob2_m1(prod[17], availn[17], clock,
                                     in_valid[17], plierp[17], candp[17]);
   initial pi_bseq(17,"Prob 2",prob2_m1.m);

   mult_seq_d_prob_2 #(w,2) prob2_m2(prod[16], availn[16], clock,
                                     in_valid[16], plierp[16], candp[16]);
   initial pi_bseq(16,"Prob 2",prob2_m2.m);

   mult_seq_d_prob_2 #(w,4) prob2_m4(prod[15], availn[15], clock,
                                     in_valid[15], plierp[15], candp[15]);
   initial pi_bseq(15,"Prob 2",prob2_m4.m);

   // Copy availn to avail, skipping entries that are z. This leaves the
   // value written by the initial block below in place for instances
   // that do not drive out_avail.
   always @* begin

      foreach ( availn[i] ) begin
         if ( availn[i] !== 1'bz ) avail[i] = availn[i];
      end

   end

   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   //
   int tests[$] = {1,1, 1,2, 1,3, 1,4, 1,5,  1,32,  32, 1};

   initial begin

      // Number of checker threads that have not yet finished.
      // NOTE(review): this is read from pi.size() when the block starts,
      // so it relies on the "initial pi_*" registrations above having run
      // first at time 0. The language does not guarantee an order among
      // initial blocks -- confirm with the simulator in use.
      automatic int awaiting = pi.size();

      logic [w-1:0] pliers[num_tests], cands[num_tests];

      done = 0;

      // Compute the expected latency of each module and set idle values.
      foreach ( pi[mut] ) begin
         automatic int midx = pi[mut].idx;
         automatic int steps = ( w + pi[mut].deg - 1 ) / pi[mut].deg;
         automatic int latency =
           !pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps;
         pi[mut].latency = latency;
         if ( pi[mut].bpipe == 0 ) begin
            avail[midx] = 1;
         end
         in_valid[midx] = 0;
      end

      // Prepare operands: first the values in tests, then random values
      // masked to a random number of bits.
      for ( int i=0; i<num_tests; i++ ) begin

         automatic int num_bits_c = {$random()}%w + 1;
         automatic logic [w-1:0] mask_c = ( (w+1)'(1) << num_bits_c ) - 1;
         automatic int num_bits_p = {$random()}%w + 1;
         automatic logic [w-1:0] mask_p = ( (w+1)'(1) << num_bits_p ) - 1;

         pliers[i] = tests.size() ? tests.pop_front() : {$random()}&mask_p;
         cands[i] = tests.size() ? tests.pop_front() : {$random()}&mask_c;

      end

      // For modules registered with pipe set, drive filler operands on
      // cycles in which no test is being started.
      fork begin
         forever @( negedge clk_reactive ) begin
            foreach ( pi[mut] ) begin
               automatic int midx = pi[mut].idx;
               if ( !in_valid[midx] && pi[mut].pipe ) begin
                  plierp[midx] = cycle;
                  candp[midx] = 1;
               end
            end
         end
      end join_none;

      repeat ( 2 * w ) @( negedge clock );

      // Start a driver thread and a checker thread for each module.
      foreach ( pi[mutii] ) begin
         automatic string muti = mutii;

         // Driver: apply each operand pair with in_valid high for one cycle.
         fork begin
            automatic string mut = muti;
            automatic int midx = pi[mut].idx;
            for ( int i=0; i<num_tests; i++ ) begin
               automatic int gap_cyc =
                 !pi[mut].pipe ? w * 2 :
                 ( {$random} % 2 ) ? {$random} % ( w + 2 ) : 0;
               automatic Test_Vector tv;
               repeat ( gap_cyc ) @( negedge clock );
               plierp[midx] = pliers[i];
               candp[midx] = cands[i];
               in_valid[midx] = 1;
               tv.tidx = i;
               tv.cycle_start = cycle;
               pi[mut].tests_active.push_back( tv );
               @( negedge clock );
               in_valid[midx] = 0;
            end
            pi[mut].all_tests_started = 1;
         end join_none;

         // Checker: take outstanding tests in order and verify each one.
         fork begin
            automatic string mut = muti;
            automatic int midx = pi[mut].idx;
            while ( 1 ) begin
               @( negedge clock );
               while ( pi[mut].tests_active.size() == 0
                       && !pi[mut].all_tests_started )
                 @( negedge clock );
               if ( pi[mut].tests_active.size() == 0 ) break;
               begin
                  automatic Test_Vector tv = pi[mut].tests_active.pop_front();
                  automatic int i = tv.tidx;
                  automatic logic [2*w-1:0] shadow_prod = pliers[i] * cands[i];
                  automatic int eta = tv.cycle_start + pi[mut].latency;
                  automatic bit timing_err = 0;
                  automatic int delta_t;
                  if ( pi[mut].bpipe ) begin

                     // Handshaking module: wait for avail, but no later
                     // than the expected completion cycle.

                     if ( !pi[mut].pipe && cycle == tv.cycle_start )
                       @( negedge clock );

                     while ( !avail[midx] && cycle < eta ) @( negedge clock );
                     if ( !avail[midx] || cycle > eta ) begin
                        timing_err = 1;
                        if ( pi[mut].err_timing++ < err_limit )
                          $write("At cyc %4d (eta %0d) avail not set for %s (idx %0d) after %0d cycles for 0x%0h*0x%0h.\n",
                                 cycle, eta, mut, midx, cycle - tv.cycle_start,
                                 pliers[i], cands[i]);
                     end
                  end else begin
                     // No handshaking: wait for the expected latency.
                     wait ( cycle >= eta );
                  end
                  delta_t = cycle - tv.cycle_start;
                  if ( !timing_err ) begin
                     pi[mut].ncompleted++;
                     pi[mut].cyc_tot += delta_t;
                  end
                  if ( !timing_err && shadow_prod !== prod[midx] ) begin
                     pi[mut].err_count++;
                     if ( pi[mut].err_count < err_limit ) begin
                        // Last two values: module output, then expected.
                        $write
                          ("%-15s test %5d  cyc %0d+%0d (%0d) wrong: 0x%0h * 0x%0h:  0x%0h != 0x%0h (correct)\n",
                           mut, i, tv.cycle_start, delta_t, pi[mut].latency,
                           pliers[i], cands[i],
                           prod[midx], shadow_prod);
                     end
                  end
               end
            end
            awaiting--;
         end join_none;

      end

      wait( awaiting == 0 || cycle > cycle_limit );

      $write("At cycle %0d.  Error types:  couldn't test / wrong result / timing\n",cycle);

      foreach ( pi[ mut ] )
        $write("For %-18s ran %4d tests, %4d/%4d/%4d errors found. Avg cyc %.1f\n",
               mut, num_tests,
               num_tests - pi[mut].ncompleted,
               pi[mut].err_count, pi[mut].err_timing,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / pi[mut].ncompleted : 1);

      done = 1;
      $write("Modules instantiated with w = %0d.\n",w);

      $finish(2);

   end

endmodule