```////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2017 Homework 7 -- SOLUTION
//

/// Assignment  http://www.ece.lsu.edu/koppel/v/2017/hw07.pdf

//////////////////////////////////////////////////////////////////////////////
///  Problem 1
//
/// Complete so that mult_fast sets out_avail as described in the handout.
//
//     [✔] The module must be synthesizable.
//     [✔] Code must be reasonably efficient.
//     [✔] Do not change module parameters.
//     [✔] Do not change ports, EXCEPT changing between var and net kinds.
//     [✔] Don't assume that parameter values will match those used here.
//     [✔] USE DEBUGGING TOOLS LIKE SimVision.
//     [✔] Make sure that Avg cyc shown in testbench for Fast is lower
//         than Pipelined module of same degree (when all tests pass).

/// SOLUTION - Problem 1a
//
// A new pipeline latch will be created, pl_occ, which will carry the
// value of in_valid through the pipeline. The pl_occ register for the
// last stage will connect to the output port out_avail.

// The timing diagram below is for a w=12, m=4 multiplier in which the
// multiplier is always set to 2 (not shown). Three different
// multiplicands arrive, 'h123, 'h4, and 'h56, their products, 2 *
// 'h123 = 'h246, 2 * 'h4 = 'h8, and 2 * 'h56 = 'hac, appear at the
// outputs three cycles later. The progress of 'h123 through the
// pipeline is highlighted in blue.
//
// The timing diagram does not show the plier and cand inputs. As
// stated above, the plier is always 2 (for the sample execution shown
// in the diagram). The value of input cand appearing at a positive
// edge can be seen in pl_cand[0] just after the positive edge. For
// example, at the positive edge between cycle 0 and 1 input cand must
// be 'h123.
//

/// SOLUTION -- Problem 1a
//
module mult_fast_1a
#( int w = 16,
int m = 4 )
( output uwire [2*w-1:0] prod,
output uwire out_avail,
input uwire clk, in_valid,
input uwire [w-1:0] plier, cand );

localparam int nstages = ( w + m - 1 ) / m;

logic [2*w-1:0] pl_accum[0:nstages];
logic [w-1:0] pl_plier[0:nstages];
logic [w-1:0] pl_cand[0:nstages];

assign prod = pl_accum[nstages];

/// SOLUTION -- Problem 1a
//
logic pl_occ[0:nstages];
assign out_avail = pl_occ[nstages];
//
// Provide a pipeline latch for the in_valid signal to pass through
// the pipeline and connect the last stage's latch to the out_avail
// port.

always_ff @( posedge clk ) begin

/// SOLUTION -- Problem 1a
//
pl_occ[0] = in_valid;
//
// Connect in_valid port to the first stage.

pl_accum[0] = 0;
pl_plier[0] = plier;
pl_cand[0] = cand;

for ( int stage=0; stage<nstages; stage++ ) begin

pl_accum[stage+1] <=
pl_accum[stage] +
( pl_plier[stage] * pl_cand[stage][m-1:0] << stage*m );

pl_cand[stage+1] <= pl_cand[stage] >> m;
pl_plier[stage+1] <= pl_plier[stage];

/// SOLUTION -- Problem 1a
//
pl_occ[stage+1] <= pl_occ[stage];
//
// Pass the in_valid signal through the pipeline.

end

end

endmodule

/// SOLUTION -- Problem 1b (and also 1a)
//
// The fast multiplier is supposed to provide a shortcut connection
// from each stage to the multiplier output, prod. Stage x can use the
// shortcut connection if the multiplication that the stage is
// carrying is complete and if no higher-numbered stages are occupied.
// A multiplication (meaning the result of multiplying a plier and
// cand) must appear at the output exactly once, and the arriving
// multiplications must appear at the outputs in the same order in
// which they arrived.

/// The Plan
//
//  - Find the highest-numbered occupied stage. Call it oldest_idx.
//  - Connect the result at that stage, pl_accum[oldest_idx], to prod.
//  - Set out_avail to true if stage oldest_idx is finished.
//  - Set pl_occ[oldest_idx] to zero if out_avail is true, to avoid duplicates.
//
// For the discussion below refer to module mult_fast_1b and to the
// timing diagram below. The timing diagram is for a module
// instantiated with w=12, m=4, and for which the multiplier is always
// 2. The arriving values in the timing diagram below are the same as
// the diagram appearing in the solution to Problem 1a.

/// Computing oldest_idx
//
// Combinational logic will be added that computes oldest_idx, the
// highest-numbered occupied stage. (Stage x is occupied if pl_occ[x]
// is true.) If none of the stages are occupied oldest_idx is set to
// zero.
//
// See the always_comb block in mult_fast_1b below.

/// Connect Stage's Result to Prod
//
// mult_fast_1b. The connection is made whether or not stage
// oldest_idx is finished or contains a valid value.

// In the timing diagram notice that at cycles 2 and 3 prod holds
// incomplete multiplications. That's fine because out_avail is zero.
// Setting prod to something like zero at cycles like 2 and 3 would
// require extra hardware and provide no benefit (based on the problem
// statement in the homework handout).

/// Set out_avail
//
// Output out_avail is set to 1 if stage oldest_idx is occupied and if
// the multiplicand at that stage is zero, meaning that the
// multiplication is complete. Whether it is occupied can be
// determined by examining pl_occ[oldest_idx], whether it is finished
// can be determined by examining pl_cand[oldest_idx]. See the "assign
// out_avail" in mult_fast_1b.

/// Set pl_occ[oldest_idx] When Done
//
// Since the result of a multiplication cannot appear at the output
// more than once pl_occ[x] must be set to zero at the end of cycle c
// if x was chosen in cycle c. (If this were not done the same
// multiplication would appear at the outputs again in the next clock
// cycle [c+1], when it is in stage x+1.) For example, stage 2 is
// chosen in cycle 5, its product 'h8 appears at the output. In cycle
// 6 the calculation is in stage 3, but now its value of pl_occ is
// zero and so it won't be chosen a second time.

//

module mult_fast_1b
#( int w = 16,
int m = 4 )
( output uwire [2*w-1:0] prod,
output uwire out_avail,
input uwire clk, in_valid,
input uwire [w-1:0] plier, cand );

localparam int nstages = ( w + m - 1 ) / m;

logic [2*w-1:0] pl_accum[0:nstages];
logic [w-1:0] pl_plier[0:nstages], pl_cand[0:nstages];
logic pl_occ[0:nstages];

/// SOLUTION -- Problem 1b
//
//  Determine the idx of the last (highest-numbered) occupied
//  stage, in which resides the oldest multiplication in the
//  pipeline.
//
logic [\$clog2(nstages):0] oldest_idx;
//
always_comb begin
oldest_idx = 0;
for ( int i=1; i<=nstages; i++ ) if ( pl_occ[i] ) oldest_idx = i;
end

/// SOLUTION -- Problem 1b
//
//  Connect the last occupied stage to the output ..
//
assign prod = pl_accum[oldest_idx];
//
//  .. and set out_avail to true if that stage is occupied and finished.
//
assign out_avail = pl_occ[oldest_idx] && pl_cand[oldest_idx] == 0;

always_ff @( posedge clk ) begin

pl_occ[0] = in_valid;

pl_accum[0] = 0;
pl_plier[0] = plier;
pl_cand[0] = cand;

for ( int stage=0; stage<nstages; stage++ ) begin

pl_accum[stage+1] <=
pl_accum[stage] +
( pl_plier[stage] * pl_cand[stage][m-1:0] << stage*m );

pl_cand[stage+1] <= pl_cand[stage][w-1:m];
pl_plier[stage+1] <= pl_plier[stage];

/// SOLUTION -- Problem 1b
//
//  Pass 0 to next stage if this stage is providing the
//  result, otherwise pass this stage's value of occupied to
//  the next stage.
//
pl_occ[stage+1] <=
oldest_idx == stage && out_avail ? 0 : pl_occ[stage];

end

end

endmodule

module mult_behav_1
#(int w = 16)
(output logic [2*w-1:0] prod, input logic [w-1:0] plier, cand);

assign prod = plier * cand;
endmodule

/// :Example: Basic Pipelined Multiplier -- mult_pipe
//
//   Computes m partial products per stage.
//

module mult_pipe #( int w = 16, int m = 4 )
( output logic [2*w-1:0] prod,
input logic [w-1:0] plier,
input logic [w-1:0] cand,
input clk);

localparam int nstages = ( w + m - 1 ) / m;

// Note: pl is for pipeline latch.
logic [2*w-1:0] pl_accum[0:nstages];
logic [w-1:0] pl_plier[0:nstages];
logic [w-1:0] pl_cand[0:nstages];

always_ff @( posedge clk ) begin

pl_accum[0] = 0;
pl_plier[0] = plier;
pl_cand[0] = cand;

for ( int stage=0; stage<nstages; stage++ ) begin

logic [w-1:0] cand_next;
cand_next = pl_cand[stage][w-1:m];

pl_accum[stage+1] <=
pl_accum[stage] +
( pl_plier[stage] * pl_cand[stage][m-1:0] << stage*m );

pl_cand[stage+1] <= cand_next;
pl_plier[stage+1] <= pl_plier[stage];

end

end

assign prod = pl_accum[nstages];

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

program reactivate
(output uwire clk_reactive, output int cycle_reactive,
input uwire clk, input int cycle);
assign clk_reactive = clk;
assign cycle_reactive = cycle;
endprogram

module testbench;

localparam int w = 16;
localparam int num_tests = 400;
localparam int NUM_MULT = 20;
localparam int err_limit = 7;

bit use_others;
logic [w-1:0] plier, cand;
logic [w-1:0] plierp[NUM_MULT], candp[NUM_MULT];
logic [2*w-1:0] prod[NUM_MULT];
uwire availn[NUM_MULT];
logic avail[NUM_MULT];
logic in_valid[NUM_MULT];

typedef struct { int tidx; int cycle_start; } Test_Vector;

typedef struct { int idx;
int err_count = 0;
int err_timing = 0;
Test_Vector tests_active[\$];
bit all_tests_started = 0;
bit seq = 0; bit pipe = 0;
bit bpipe = 0;
int deg = 1;
int ncompleted = 0;
int cyc_tot = 0;
int latency = 0;
} Info;
Info pi[string];

localparam int cycle_limit = num_tests * w * 4;
int cycle;
bit done;
logic clock;

logic clk_reactive;
int cycle_reactive;
reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

initial begin
clock = 0;
cycle = 0;

fork
forever #10 cycle += clock++;
wait( done );
wait( cycle >= cycle_limit )
\$write("*** Cycle limit exceeded, ending.\n");
join_any;

\$finish();
end

task pi_seq(input int idx, input string name, input int deg);
automatic string m = \$sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1; pi[m].bpipe = 0;

task pi_pipe(input int idx, input string name, input int deg);
automatic string m = \$sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 0;
task pi_bpipe(input int idx, input string name, input int deg);
automatic string m = \$sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 1;

mult_behav_1 #(w) mb1(prod[0], plierp[0], candp[0]);
initial pi["Behavioral"].idx = 0;

mult_pipe    #(w,4) ms2(prod[2], plierp[2], candp[2], clock);
initial pi_pipe(2,"Pipelined",ms2.m);

mult_pipe    #(w,2) ms3(prod[3], plierp[3], candp[3], clock);
initial pi_pipe(3,"Pipelined",ms3.m);

mult_fast_1a    #(w,4) ms7(prod[7], availn[7], clock,
in_valid[7], plierp[7], candp[7]);
initial pi_bpipe(7,"Fast 1a",ms7.m);

mult_fast_1a    #(w,2) ms8(prod[8], availn[8], clock,
in_valid[8], plierp[8], candp[8]);
initial pi_bpipe(8,"Fast 1a",ms8.m);

mult_fast_1a    #(w,1) ms9(prod[9], availn[9], clock,
in_valid[9], plierp[9], candp[9]);
initial pi_bpipe(9,"Fast 1a",ms9.m);

mult_fast_1b    #(w,4) ms17(prod[17], availn[17], clock,
in_valid[17], plierp[17], candp[17]);
initial pi_bpipe(17,"Fast 1b",ms17.m);

mult_fast_1b    #(w,2) ms16(prod[16], availn[16], clock,
in_valid[16], plierp[16], candp[16]);
initial pi_bpipe(16,"Fast 1b",ms16.m);

mult_fast_1b    #(w,1) ms15(prod[15], availn[15], clock,
in_valid[15], plierp[15], candp[15]);
initial pi_bpipe(15,"Fast 1b",ms15.m);

always @*
foreach ( availn[i] ) if ( availn[i] !== 1'bz ) avail[i] = availn[i];

// Array of multiplier/multiplicand values to try out.
// After these values are used a random number generator will be used.
//
int tests[\$] = {1,1, 1,2, 1,3, 1,4, 1,5,  1,32,  32, 1};

initial begin

automatic int awaiting = pi.size();

logic [w-1:0] pliers[num_tests], cands[num_tests];

done = 0;

foreach ( pi[mut] ) begin
automatic int midx = pi[mut].idx;
automatic int steps = ( w + pi[mut].deg - 1 ) / pi[mut].deg;
automatic int latency =
!pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps;
pi[mut].latency = latency;
if ( pi[mut].bpipe == 0 ) begin
avail[midx] = 1;
end
in_valid[midx] = 0;
end

for ( int i=0; i<num_tests; i++ ) begin

automatic int num_bits_c = {\$random()}%w + 1;
automatic logic [w-1:0] mask_c = ( (w+1)'(1) << num_bits_c ) - 1;
automatic int num_bits_p = {\$random()}%w + 1;
automatic logic [w-1:0] mask_p = ( (w+1)'(1) << num_bits_p ) - 1;

pliers[i] = tests.size() ? tests.pop_front() : {\$random()}&mask_p;
cands[i] = tests.size() ? tests.pop_front() : {\$random()}&mask_c;

end

fork forever @( negedge clk_reactive ) foreach ( pi[mut] ) begin
automatic int midx = pi[mut].idx;
if ( !in_valid[midx] && pi[mut].pipe ) begin
plierp[midx] = cycle;
candp[midx] = 1;
end
end join_none;

repeat ( 2 * w ) @( negedge clock );

foreach ( pi[mutii] ) begin
automatic string muti = mutii;

fork begin
automatic string mut = muti;
automatic int midx = pi[mut].idx;
for ( int i=0; i<num_tests; i++ ) begin
automatic int gap_cyc =
( {\$random} % 2 ) ? {\$random} % ( w + 2 ) : 0;
automatic Test_Vector tv;
repeat ( gap_cyc ) @( negedge clock );
plierp[midx] = pliers[i];
candp[midx] = cands[i];
in_valid[midx] = 1;
tv.tidx = i;
tv.cycle_start = cycle;
pi[mut].tests_active.push_back( tv );
@( negedge clock );
in_valid[midx] = 0;
end
pi[mut].all_tests_started = 1;
end join_none;

fork begin
automatic string mut = muti;
automatic int midx = pi[mut].idx;
while ( 1 ) begin
@( negedge clock );
while ( pi[mut].tests_active.size() == 0
&& !pi[mut].all_tests_started )
@( negedge clock );
if ( pi[mut].tests_active.size() == 0 ) break;
begin
automatic Test_Vector tv = pi[mut].tests_active.pop_front();
automatic int i = tv.tidx;
automatic logic [2*w-1:0] shadow_prod = pliers[i] * cands[i];
automatic int eta = tv.cycle_start + pi[mut].latency;
automatic bit timing_err = 0;
automatic int delta_t;
if ( pi[mut].bpipe ) begin
while ( !avail[midx] && cycle < eta ) @( negedge clock );
if ( !avail[midx] || cycle > eta ) begin
timing_err = 1;
if ( pi[mut].err_timing++ < err_limit )
\$write("At cyc %4d (eta %0d) avail not set for %s (idx %0d) after %0d cycles for 0x%0h*0x%0h.\n",
cycle, eta, mut, midx, cycle - tv.cycle_start,
pliers[i], cands[i]);
end
end else begin
wait ( cycle >= eta );
end
delta_t = cycle - tv.cycle_start;
if ( !timing_err ) begin
pi[mut].ncompleted++;
pi[mut].cyc_tot += delta_t;
end
if ( !timing_err && shadow_prod !== prod[midx] ) begin
pi[mut].err_count++;
if ( pi[mut].err_count < err_limit ) begin
\$write
("%-15s test %5d  cyc %0d+%0d (%0d) wrong: 0x%0h * 0x%0h:  0x%0h != 0x%0h (correct)\n",
mut, i, tv.cycle_start, delta_t, pi[mut].latency,
pliers[i], cands[i],
end
end
end
end
awaiting--;
end join_none;

end

wait( awaiting == 0 || cycle > cycle_limit );

\$write("At cycle %0d.  Error types:  couldn't test / wrong result / timing\n",cycle);

foreach ( pi[ mut ] )
\$write("For %-18s ran %4d tests, %4d/%4d/%4d errors found. Avg cyc %.1f\n",
mut, num_tests,
num_tests - pi[mut].ncompleted,
pi[mut].err_count, pi[mut].err_timing,
pi[mut].seq ? real'(pi[mut].cyc_tot) / pi[mut].ncompleted : 1);

done = 1;

\$finish(2);

end

endmodule