////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2017 Homework 7 -- SOLUTION
//

 /// Assignment  http://www.ece.lsu.edu/koppel/v/2017/hw07.pdf


//////////////////////////////////////////////////////////////////////////////
///  Problem 1
//
 /// Complete so that mult_fast sets out_avail as described in the handout.
//
//     [✔] The module must be synthesizable.
//     [✔] Code must be reasonably efficient.
//     [✔] Do not change module parameters.
//     [✔] Do not change ports, EXCEPT changing between var and net kinds.
//     [✔] Don't assume that parameter values will match those used here.
//     [✔] USE DEBUGGING TOOLS LIKE SimVision.
//     [✔] Make sure that Avg cyc shown in testbench for Fast is lower 
//         than Pipelined module of same degree (when all tests pass).

 /// SOLUTION - Problem 1a
//
// A new pipeline latch will be created, pl_occ, which will carry the
// value of in_valid through the pipeline. The pl_occ register for the
// last stage will connect to the output port out_avail. 

// The timing diagram below is for a w=12, m=4 multiplier in which the
// multiplier is always set to 2 (not shown). Three different
// multiplicands arrive, 'h123, 'h4, and 'h56, their products, 2 *
// 'h123 = 'h246, 2 * 'h4 = 'h8, and 2 * 'h56 = 'hac, appear at the
// outputs three cycles later. The progress of 'h123 through the
// pipeline is highlighted in blue.
//
// The timing diagram does not show the plier and cand inputs. As
// stated above, the plier is always 2 (for the sample execution shown
// in the diagram). The value of input cand appearing at a positive
// edge can be seen in pl_cand[0] just after the positive edge. For
// example, at the positive edge between cycle 0 and 1 input cand must
// be 'h123.
// 

 /// SOLUTION -- Problem 1a
//
module mult_fast_1a
  #( int w = 16,
     int m = 4 )
   ( output uwire [2*w-1:0] prod,
     output uwire out_avail,
     input uwire clk, in_valid,
     input uwire [w-1:0] plier, cand );

   localparam int nstages = ( w + m - 1 ) / m;

   logic [2*w-1:0] pl_accum[0:nstages];
   logic [w-1:0] pl_plier[0:nstages];
   logic [w-1:0] pl_cand[0:nstages];


   assign prod = pl_accum[nstages];

   /// SOLUTION -- Problem 1a
   //
   logic pl_occ[0:nstages];
   assign out_avail = pl_occ[nstages];
   //
   // Provide a pipeline latch for the in_valid signal to pass through
   // the pipeline and connect the last stage's latch to the out_avail
   // port.

   always_ff @( posedge clk ) begin

      /// SOLUTION -- Problem 1a
      //
      pl_occ[0] = in_valid;
      //
      // Connect in_valid port to the first stage.

      pl_accum[0] = 0;
      pl_plier[0] = plier;
      pl_cand[0] = cand;

      for ( int stage=0; stage<nstages; stage++ ) begin

         pl_accum[stage+1] <=
           pl_accum[stage] +
             ( pl_plier[stage] * pl_cand[stage][m-1:0] << stage*m );

         pl_cand[stage+1] <= pl_cand[stage] >> m;
         pl_plier[stage+1] <= pl_plier[stage];

         /// SOLUTION -- Problem 1a
         //
         pl_occ[stage+1] <= pl_occ[stage];
         //
         // Pass the in_valid signal through the pipeline.

      end

   end

endmodule

 /// SOLUTION -- Problem 1b (and also 1a)
//
// The fast multiplier is supposed to provide a shortcut connection
// from each stage to the multiplier output, prod. Stage x can use the
// shortcut connection if the multiplication that the stage is
// carrying is complete and if no higher-numbered stages are occupied.
// A multiplication (meaning the result of multiplying a plier and
// cand) must appear at the output exactly once, and the arriving
// multiplications must appear at the outputs in the same order in
// which they arrived.

 /// The Plan
//
//  - Find the highest-numbered occupied stage. Call it oldest_idx.
//  - Connect the result at that stage, pl_accum[oldest_idx], to prod.
//  - Set out_avail to true if stage oldest_idx is finished.
//  - Set pl_occ[oldest_idx] to zero if out_avail is true, to avoid duplicates.
//
// For the discussion below refer to module mult_fast_1b and to the
// timing diagram below. The timing diagram is for a module
// instantiated with w=12, m=4, and for which the multiplier is always
// 2. The arriving values in the timing diagram below are the same as
// the diagram appearing in the solution to Problem 1a.

 /// Computing oldest_idx
//
// Combinational logic will be added that computes oldest_idx, the
// highest-numbered occupied stage. (Stage x is occupied if pl_occ[x]
// is true.) If none of the stages are occupied oldest_idx is set to
// zero.
//
// See the always_comb block in mult_fast_1b below.

 /// Connect Stage's Result to Prod
//
// Output prod is set to pl_accum[oldest_idx], see the assign in
// mult_fast_1b. The connection is made whether or not stage
// oldest_idx is finished or contains a valid value.

// In the timing diagram notice that at cycles 2 and 3 prod holds
// incomplete multiplications. That's fine because out_avail is zero.
// Setting prod to something like zero at cycles like 2 and 3 would
// require extra hardware and provide no benefit (based on the problem
// statement in the homework handout).

 /// Set out_avail
//
// Output out_avail is set to 1 if stage oldest_idx is occupied and if
// the multiplicand at that stage is zero, meaning that the
// multiplication is complete. Whether it is occupied can be
// determined by examining pl_occ[oldest_idx], whether it is finished
// can be determined by examining pl_cand[oldest_idx]. See the "assign
// out_avail" in mult_fast_1b.

 /// Set pl_occ[oldest_idx] When Done
//
// Since the result of a multiplication cannot appear at the output
// more than once pl_occ[x] must be set to zero at the end of cycle c
// if x was chosen in cycle c. (If this were not done the same
// multiplication would appear at the outputs again in the next clock
// cycle [c+1], when it is in stage x+1.) For example, stage 2 is
// chosen in cycle 5, its product 'h8 appears at the output. In cycle
// 6 the calculation is in stage 3, but now its value of pl_occ is
// zero and so it won't be chosen a second time.

 // 

module mult_fast_1b
  #( int w = 16,
     int m = 4 )
   ( output uwire [2*w-1:0] prod,
     output uwire out_avail,
     input uwire clk, in_valid,
     input uwire [w-1:0] plier, cand );

   localparam int nstages = ( w + m - 1 ) / m;

   logic [2*w-1:0] pl_accum[0:nstages];
   logic [w-1:0] pl_plier[0:nstages], pl_cand[0:nstages];
   logic pl_occ[0:nstages];

   /// SOLUTION -- Problem 1b
   //
   //  Determine the idx of the last (highest-numbered) occupied
   //  stage, in which resides the oldest multiplication in the
   //  pipeline.
   //
   logic [$clog2(nstages):0] oldest_idx;
   //
   always_comb begin
      oldest_idx = 0;
      for ( int i=1; i<=nstages; i++ ) if ( pl_occ[i] ) oldest_idx = i;
   end

   /// SOLUTION -- Problem 1b
   //
   //  Connect the last occupied stage to the output ..
   //
   assign prod = pl_accum[oldest_idx];
   //
   //  .. and set out_avail to true if that stage is occupied and finished.
   //
   assign out_avail = pl_occ[oldest_idx] && pl_cand[oldest_idx] == 0;


   always_ff @( posedge clk ) begin

      pl_occ[0] = in_valid;

      pl_accum[0] = 0;
      pl_plier[0] = plier;
      pl_cand[0] = cand;

      for ( int stage=0; stage<nstages; stage++ ) begin

         pl_accum[stage+1] <=
           pl_accum[stage] +
             ( pl_plier[stage] * pl_cand[stage][m-1:0] << stage*m );

         pl_cand[stage+1] <= pl_cand[stage][w-1:m];
         pl_plier[stage+1] <= pl_plier[stage];

         /// SOLUTION -- Problem 1b
         //
         //  Pass 0 to next stage if this stage is providing the
         //  result, otherwise pass this stage's value of occupied to
         //  the next stage.
         //
         pl_occ[stage+1] <=
           oldest_idx == stage && out_avail ? 0 : pl_occ[stage];

      end

   end

endmodule




module mult_behav_1
  #(int w = 16)
   (output logic [2*w-1:0] prod, input logic [w-1:0] plier, cand);

   assign prod = plier * cand;
endmodule

 /// :Example: Basic Pipelined Multiplier -- mult_pipe
//
//   Computes m partial products per stage.
//

module mult_pipe #( int w = 16, int m = 4 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int nstages = ( w + m - 1 ) / m;

   // Note: pl is for pipeline latch.
   logic [2*w-1:0] pl_accum[0:nstages];
   logic [w-1:0] pl_plier[0:nstages];
   logic [w-1:0] pl_cand[0:nstages];

   always_ff @( posedge clk ) begin

      pl_accum[0] = 0;
      pl_plier[0] = plier;
      pl_cand[0] = cand;

      for ( int stage=0; stage<nstages; stage++ ) begin

         logic [w-1:0] cand_next;
         cand_next = pl_cand[stage][w-1:m];

         pl_accum[stage+1] <=
           pl_accum[stage] +
             ( pl_plier[stage] * pl_cand[stage][m-1:0] << stage*m );

         pl_cand[stage+1] <= cand_next;
         pl_plier[stage+1] <= pl_plier[stage];


      end

   end

   assign prod = pl_accum[nstages];

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

// cadence translate_off

program reactivate
   (output uwire clk_reactive, output int cycle_reactive,
    input uwire clk, input int cycle);
   assign clk_reactive = clk;
   assign cycle_reactive = cycle;
endprogram

module testbench;

   localparam int w = 16;
   localparam int num_tests = 400;
   localparam int NUM_MULT = 20;
   localparam int err_limit = 7;

   bit use_others;
   logic [w-1:0] plier, cand;
   logic [w-1:0] plierp[NUM_MULT], candp[NUM_MULT];
   logic [2*w-1:0] prod[NUM_MULT];
   uwire availn[NUM_MULT];
   logic avail[NUM_MULT];
   logic in_valid[NUM_MULT];

   typedef struct { int tidx; int cycle_start; } Test_Vector;

   typedef struct { int idx;
                    int err_count = 0;
                    int err_timing = 0;
                    Test_Vector tests_active[$];
                    bit all_tests_started = 0;
                    bit seq = 0; bit pipe = 0;
                    bit bpipe = 0;
                    int deg = 1;
                    int ncompleted = 0;
                    int cyc_tot = 0;
                    int latency = 0;
                    } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * w * 4;
   int cycle;
   bit done;
   logic clock;

   logic clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);


   initial begin
      clock = 0;
      cycle = 0;

      fork
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].bpipe = 0;
   endtask

   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 0;
   endtask
   task pi_bpipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 1;
   endtask

   mult_behav_1 #(w) mb1(prod[0], plierp[0], candp[0]);
   initial pi["Behavioral"].idx = 0;

   mult_pipe    #(w,4) ms2(prod[2], plierp[2], candp[2], clock);
   initial pi_pipe(2,"Pipelined",ms2.m);

   mult_pipe    #(w,2) ms3(prod[3], plierp[3], candp[3], clock);
   initial pi_pipe(3,"Pipelined",ms3.m);

   mult_fast_1a    #(w,4) ms7(prod[7], availn[7], clock,
                            in_valid[7], plierp[7], candp[7]);
   initial pi_bpipe(7,"Fast 1a",ms7.m);

   mult_fast_1a    #(w,2) ms8(prod[8], availn[8], clock,
                            in_valid[8], plierp[8], candp[8]);
   initial pi_bpipe(8,"Fast 1a",ms8.m);

   mult_fast_1a    #(w,1) ms9(prod[9], availn[9], clock,
                            in_valid[9], plierp[9], candp[9]);
   initial pi_bpipe(9,"Fast 1a",ms9.m);

   mult_fast_1b    #(w,4) ms17(prod[17], availn[17], clock,
                            in_valid[17], plierp[17], candp[17]);
   initial pi_bpipe(17,"Fast 1b",ms17.m);

   mult_fast_1b    #(w,2) ms16(prod[16], availn[16], clock,
                            in_valid[16], plierp[16], candp[16]);
   initial pi_bpipe(16,"Fast 1b",ms16.m);

   mult_fast_1b    #(w,1) ms15(prod[15], availn[15], clock,
                            in_valid[15], plierp[15], candp[15]);
   initial pi_bpipe(15,"Fast 1b",ms15.m);

   always @*
     foreach ( availn[i] ) if ( availn[i] !== 1'bz ) avail[i] = availn[i];

   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   //
   int tests[$] = {1,1, 1,2, 1,3, 1,4, 1,5,  1,32,  32, 1};

   initial begin

      automatic int awaiting = pi.size();

      logic [w-1:0] pliers[num_tests], cands[num_tests];

      done = 0;

      foreach ( pi[mut] ) begin
         automatic int midx = pi[mut].idx;
         automatic int steps = ( w + pi[mut].deg - 1 ) / pi[mut].deg;
         automatic int latency =
           !pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps;
         pi[mut].latency = latency;
         if ( pi[mut].bpipe == 0 ) begin
            avail[midx] = 1;
         end
         in_valid[midx] = 0;
      end

      for ( int i=0; i<num_tests; i++ ) begin

         automatic int num_bits_c = {$random()}%w + 1;
         automatic logic [w-1:0] mask_c = ( (w+1)'(1) << num_bits_c ) - 1;
         automatic int num_bits_p = {$random()}%w + 1;
         automatic logic [w-1:0] mask_p = ( (w+1)'(1) << num_bits_p ) - 1;

         pliers[i] = tests.size() ? tests.pop_front() : {$random()}&mask_p;
         cands[i] = tests.size() ? tests.pop_front() : {$random()}&mask_c;

      end

      fork forever @( negedge clk_reactive ) foreach ( pi[mut] ) begin
         automatic int midx = pi[mut].idx;
         if ( !in_valid[midx] && pi[mut].pipe ) begin
            plierp[midx] = cycle;
            candp[midx] = 1;
         end
      end join_none;

      repeat ( 2 * w ) @( negedge clock );

      foreach ( pi[mutii] ) begin
         automatic string muti = mutii;

         fork begin
            automatic string mut = muti;
            automatic int midx = pi[mut].idx;
            for ( int i=0; i<num_tests; i++ ) begin
               automatic int gap_cyc =
                 ( {$random} % 2 ) ? {$random} % ( w + 2 ) : 0;
               automatic Test_Vector tv;
               repeat ( gap_cyc ) @( negedge clock );
               plierp[midx] = pliers[i];
               candp[midx] = cands[i];
               in_valid[midx] = 1;
               tv.tidx = i;
               tv.cycle_start = cycle;
               pi[mut].tests_active.push_back( tv );
               @( negedge clock );
               in_valid[midx] = 0;
            end
            pi[mut].all_tests_started = 1;
         end join_none;

         fork begin
            automatic string mut = muti;
            automatic int midx = pi[mut].idx;
            while ( 1 ) begin
               @( negedge clock );
               while ( pi[mut].tests_active.size() == 0
                       && !pi[mut].all_tests_started )
                 @( negedge clock );
               if ( pi[mut].tests_active.size() == 0 ) break;
               begin
                  automatic Test_Vector tv = pi[mut].tests_active.pop_front();
                  automatic int i = tv.tidx;
                  automatic logic [2*w-1:0] shadow_prod = pliers[i] * cands[i];
                  automatic int eta = tv.cycle_start + pi[mut].latency;
                  automatic bit timing_err = 0;
                  automatic int delta_t;
                  if ( pi[mut].bpipe ) begin
                     while ( !avail[midx] && cycle < eta ) @( negedge clock );
                     if ( !avail[midx] || cycle > eta ) begin
                        timing_err = 1;
                        if ( pi[mut].err_timing++ < err_limit )
                          $write("At cyc %4d (eta %0d) avail not set for %s (idx %0d) after %0d cycles for 0x%0h*0x%0h.\n",
                                 cycle, eta, mut, midx, cycle - tv.cycle_start,
                                 pliers[i], cands[i]);
                     end
                  end else begin
                     wait ( cycle >= eta );
                  end
                  delta_t = cycle - tv.cycle_start;
                  if ( !timing_err ) begin
                     pi[mut].ncompleted++;
                     pi[mut].cyc_tot += delta_t;
                  end
                  if ( !timing_err && shadow_prod !== prod[midx] ) begin
                     pi[mut].err_count++;
                     if ( pi[mut].err_count < err_limit ) begin
                        $write
                          ("%-15s test %5d  cyc %0d+%0d (%0d) wrong: 0x%0h * 0x%0h:  0x%0h != 0x%0h (correct)\n",
                           mut, i, tv.cycle_start, delta_t, pi[mut].latency,
                           pliers[i], cands[i],
                           prod[midx], shadow_prod);
                     end
                  end
               end
            end
            awaiting--;
         end join_none;

      end

      wait( awaiting == 0 || cycle > cycle_limit );

      $write("At cycle %0d.  Error types:  couldn't test / wrong result / timing\n",cycle);

      foreach ( pi[ mut ] )
        $write("For %-18s ran %4d tests, %4d/%4d/%4d errors found. Avg cyc %.1f\n",
               mut, num_tests,
               num_tests - pi[mut].ncompleted,
               pi[mut].err_count, pi[mut].err_timing,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / pi[mut].ncompleted : 1);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on