////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2018 Homework 7 -- SOLUTION
//

 /// Assignment  https://www.ece.lsu.edu/koppel/v/2018/hw07.pdf

`default_nettype none

//////////////////////////////////////////////////////////////////////////////
///  Problem 1
//
 /// Complete mult_seq_ds_prob_1 as described in the handout and below.
//
//     [✔] Start multiplying when in_valid is 1 at a positive clock edge ..
//     [✔] .. even if that means abandoning a multiplication in progress.
//     [✔] Set out_avail to 1 when prod holds the result for the
//         most recent plier*cand.
//
//     [✔] The module must pass the testbench.
//         Average cycles should be w/m+1
//     [✔] The module must be synthesizable.
//     [✔] Make sure that synthesized hardware is reasonably fast.
//
//     [✔] Code must be reasonably efficient.
//     [✔] Do not change module parameters.
//     [✔] Do not change ports, EXCEPT changing between var and net kinds.
//     [✔] Don't assume that parameter values will match those used here.
//     [✔] USE DEBUGGING TOOLS LIKE SimVision.
//

// Sequential unsigned multiplier that consumes m multiplicand bits per
// clock and uses in_valid / out_avail handshaking.
//
//  Parameters:
//    w  Width in bits of each operand; prod is 2*w bits.
//    m  Number of multiplicand bits (one digit) consumed per clock.
//
//  Ports:
//    prod       Product.  Written only on the edge that sets out_avail.
//    out_avail  Cleared when a multiplication starts, set when prod is written.
//    clk        All state is updated on the positive edge.
//    in_valid   When 1 at a positive edge a new multiplication starts,
//               replacing any multiplication in progress.
//    plier      Multiplier.  Read at every positive edge, not just the one
//               at which in_valid is 1, so presumably it must be held
//               steady until out_avail is 1 -- confirm against the handout.
//    cand       Multiplicand.  Read only on the edge at which in_valid is 1.
//
//  NOTE(review): accum is shifted right by m bits on each of the
//  `iterations` steps, a total of iterations*m bits.  That total equals w
//  only when m divides w (true for the m = 1, 2, 4 and w = 20 instances in
//  the testbench below) -- confirm behavior for other parameter values.
//
module mult_seq_ds_prob_1
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     // SOLUTION: Change kind of out_avail from net (uwire) to var.
     output var logic  out_avail,
     input uwire clk, in_valid,
     input uwire [w-1:0] plier, cand );

   // Number of m-bit digits needed to cover w bits: ceiling( w / m ).
   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);
   localparam logic [w+m-1:0] zero = 0;  // Used to set precision to w+m bits.

   // Multiplicand viewed as an array of m-bit digits.
   // NOTE(review): cand_2d is not referenced anywhere else in this module.
   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   // Count of partial products added so far.  It has iter_lg+1 bits so
   // that it can hold the value `iterations` itself.
   bit [iter_lg:0] iter;

   // Working register.  At the start the low w bits hold the multiplicand.
   // Each step consumes the low m bits as the current digit and shifts the
   // whole register right by m bits, with the new partial sum entering at
   // the top.
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      // Blocking assignments are used throughout, so the unconditional
      // update at the bottom of this block sees the values of accum and
      // iter assigned by the if/else just above it on the same clock edge.

      /// SOLUTION, Problem 1
      //
      //  - Start a new multiplication whenever in_valid is 1.
      //  - When multiplication is finished set out_avail to 1.
      //
      if ( in_valid ) begin

         // If in_valid is 1 start a multiplication.
         // The first partial product is added further below on this same
         // edge, so iter is 1 when this edge completes.

         accum = cand;
         iter = 0;
         out_avail = 0;

      end else if ( !out_avail && iter == iterations ) begin

         // If a multiplication is in progress (!out_avail) ..
         // .. and we just finished the last iteration of a multiplication ..
         // .. make the result available.
         //
         // This is the first edge after `iterations` partial products
         // have been added, that is, iterations+1 edges counting the
         // edge at which in_valid was 1.

         out_avail = 1;
         prod = accum;

      end

      // Add on a partial product.
      // Do this whether or not a multiplication is in progress.
      //
      // Upper w+m bits: plier * (low digit of accum) + (upper w bits of accum).
      // Lower w-m bits: the remaining low-half bits of accum, moved down by m.
      //
      // Because this runs unconditionally, accum and iter keep changing
      // after prod has been written; prod itself is not touched here.

      accum = { zero + plier * accum[m-1:0] + accum[2*w-1:w], accum[w-1:m] };
      iter++;

   end

endmodule


//////////////////////////////////////////////////////////////////////////////
///  Problem 2
//
 /// Complete mult_seq_d_prob_2 as described in the handout and below.
//
//     [✔] Skip over multiplicand digits that are zero.
//     [✔] Start multiplying when in_valid is 1 at a positive clock edge ..
//     [✔] .. even if that means abandoning a multiplication in progress.
//     [✔] Set out_avail to 1 when prod holds the result for the
//         most recent plier*cand.
//
//     [✔] The module must pass the testbench.
//         Average cycles should be less than w/m+1
//     [✔] The module must be synthesizable.
//         The period should not be too much longer than the original module.
//     [✔] Make sure that synthesized hardware is reasonably fast.
//
//     [✔] The module must be synthesizable.
//     [✔] Code must be reasonably efficient.
//     [✔] Do not change module parameters.
//     [✔] Do not change ports, EXCEPT changing between var and net kinds.
//     [✔] Don't assume that parameter values will match those used here.
//     [✔] USE DEBUGGING TOOLS LIKE SimVision.

// Sequential unsigned multiplier that adds one shifted partial product
// per clock, skips multiplicand digits that are zero, and uses
// in_valid / out_avail handshaking.
//
//  Parameters:
//    w  Width in bits of each operand; prod is 2*w bits.
//    m  Number of multiplicand bits per digit.
//
//  Ports:
//    prod       Product.  Written only on the edge that sets out_avail.
//    out_avail  Cleared when a multiplication starts, set when prod is written.
//    clk        All state is updated on the positive edge.
//    in_valid   When 1 at a positive edge a new multiplication starts,
//               replacing any multiplication in progress.
//    plier      Multiplier.
//    cand       Multiplicand.
//
//  Both plier and cand (through cand_2d) are read at every positive edge,
//  so presumably the caller must hold them steady until out_avail is 1 --
//  confirm against the handout.
//
module mult_seq_d_prob_2
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     // SOLUTION: Change kind of out_avail from net (uwire) to var.
     output logic out_avail,
     input uwire clk, in_valid,
     input uwire [w-1:0] plier, cand );

   // Number of m-bit digits needed to cover w bits: ceiling( w / m ).
   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   // Multiplicand viewed as an array of m-bit digits; cand_2d[0] is the
   // least-significant digit.
   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   // Index of the digit whose partial product is added on the next edge.
   // Zero also serves as the "finished" marker; see below.
   bit [iter_lg-1:0] iter;

   // Running sum of the partial products added so far.
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      logic [iter_lg-1:0] next_iter;

      // Blocking assignments are used throughout, so the statements after
      // the if/else see the values of iter and accum assigned by it on
      // the same clock edge.

      /// SOLUTION -- Problem 2
      //
      //  Implement handshaking.
      //  Computation is completed when iter is zero. (See below.)
      //
      if ( in_valid ) begin

         // Start a multiplication.  Digit 0 is always processed on this
         // edge (by the accumulate statement below), even if it is zero.

         iter = 0;
         accum = 0;
         out_avail = 0;

      end else if ( !out_avail && iter == 0 ) begin

         // A multiplication is in progress and the scan below found no
         // further non-zero digit on the previous edge: accum is the product.

         prod = accum;
         out_avail = 1;

      end

      // Add the partial product for digit iter, shifted to its position.
      // This runs on every edge, so accum continues to change after prod
      // has been written; it is cleared again when the next in_valid arrives.

      accum += plier * cand_2d[iter] << ( iter * m );

      /// SOLUTION -- Problem 2
      //
      //  Set iter to ..
      //  .. index of next non-zero multiplicand digit ..
      //  .. or to zero if multiplication is complete.
      //
      //  Scan multiplicand digits starting at most significant digit.
      //  Update next_iter whenever ..
      //   i > iter   ( meaning that partial product has not yet been used ) ..
      //   and digit, cand_2d[i], is non-zero.
      //
      //  Because the scan runs from high index to low, the last assignment
      //  made is for the smallest qualifying i, which is the value kept.
      //
      next_iter = 0;
      for ( int i=iterations-1;  i>0;  i-- )
        if ( i>iter && cand_2d[i] ) next_iter = i;
      iter = next_iter;

   end

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Comparison Modules
///

 /// The modules below are for reference.

// Reference sequential multiplier without handshaking.
//
//  Runs continuously: every `iterations` positive clock edges it copies
//  the finished product to prod and loads a new multiplicand from cand.
//  In this module out_avail is never driven and in_valid is never read.
//  plier is read at every positive edge.
//
module mult_seq_ds_prob_1_orig
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     output uwire out_avail,
     input uwire clk, in_valid,
     input uwire [w-1:0] plier, cand );

   /// DO NOT MODIFY THIS MODULE.
   //  It is to be used for comparison when performing synthesis.

   // Number of m-bit digits needed to cover w bits: ceiling( w / m ).
   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);
   localparam logic [w+m-1:0] zero = 0;  // Used to set precision to w+m bits.

   // Multiplicand viewed as an array of m-bit digits.  Not referenced by
   // the code below; see the note inside the always_ff block.
   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   // Count of partial products added; wide enough to hold `iterations`.
   bit [iter_lg:0] iter;

   // Working register: multiplicand digits in the low bits are consumed
   // and shifted out as the partial sum is shifted in from the top.
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      // Blocking assignments: when the branch below is taken, the first
      // partial product of the new multiplication is added on the same edge.

      if ( iter == iterations ) begin

         prod = accum;
         accum = cand;
         iter = 0;

      end

      // Note: accum[m-1:0] is the same as cand_2d[iter];

      accum = { zero + plier * accum[m-1:0] + accum[2*w-1:w], accum[w-1:m] };
      iter++;

   end

endmodule

// Reference sequential multiplier without handshaking or digit skipping.
//
//  Runs continuously: every `iterations` positive clock edges it copies
//  the finished sum to prod and clears accum to begin again.  In this
//  module out_avail is never driven and in_valid is never read.  Both
//  plier and cand are read at every positive edge.
//
module mult_seq_d_prob_2_orig
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     output uwire out_avail,
     input uwire clk, in_valid,
     input uwire [w-1:0] plier, cand );

   /// DO NOT MODIFY THIS MODULE.
   //  It is to be used for comparison when performing synthesis.

   // Number of m-bit digits needed to cover w bits: ceiling( w / m ).
   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   // Multiplicand viewed as an array of m-bit digits; cand_2d[0] is the
   // least-significant digit.
   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   // Index of the digit to use next; wide enough to hold `iterations`.
   bit [iter_lg:0] iter;

   // Running sum of the partial products added so far.
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      // Blocking assignments: when the branch below is taken, the partial
      // product for digit 0 is added to the cleared accum on the same edge.

      if ( iter == iterations ) begin

         prod = accum;
         accum = 0;
         iter = 0;

      end

      // Add the partial product for digit iter, shifted to its position.
      accum += plier * cand_2d[iter] << ( iter * m );

      iter++;

   end

endmodule




//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

// cadence translate_off

// Relay the clock and cycle count through a program block.
//
//  Each output is a plain copy of the corresponding input.
//  NOTE(review): presumably a program is used so that the copies change in
//  the simulator's Reactive region, letting code sensitive to clk_reactive
//  run after design-module activity for the same clock event -- confirm
//  against the IEEE 1800 scheduling semantics.
//
program reactivate
   ( output uwire clk_reactive,
     output int cycle_reactive,
     input uwire clk,
     input var int cycle );

   assign cycle_reactive = cycle;
   assign clk_reactive = clk;

endprogram

// Testbench for the sequential multiplier modules in this file.
//
//  Each instantiated multiplier is registered in pi[] under a descriptive
//  name.  For every registered instance one thread applies num_tests
//  operand pairs and a second thread compares the instance's prod output
//  with a product computed by the simulator.  For instances registered
//  with bpipe = 1 the second thread also checks that out_avail is set
//  within the expected number of cycles.  A per-instance summary is
//  printed at the end.
//
module testbench;

   localparam int w = 20;           // Operand width of every instance.
   localparam int num_tests = 400;  // Operand pairs applied to each instance.
   localparam int NUM_MULT = 20;    // Size of the per-instance signal arrays.
   localparam int err_limit = 7;    // Cap on error messages per instance.

   // NOTE(review): use_others, plier, and cand are not referenced
   // anywhere else in this module.
   bit use_others;
   logic [w-1:0] plier, cand;

   // Per-instance signals, indexed by the idx given when registering.
   logic [w-1:0] plierp[NUM_MULT], candp[NUM_MULT];
   logic [2*w-1:0] prod[NUM_MULT];
   uwire availn[NUM_MULT];      // Nets connected to the out_avail ports.
   logic avail[NUM_MULT];       // Copy of availn, skipping undriven (z) nets.
   logic in_valid[NUM_MULT];

   // A test that has been started: its index in pliers[]/cands[] and
   // the cycle at which its operands were applied.
   typedef struct { int tidx; int cycle_start; } Test_Vector;

   // Per-instance bookkeeping.
   //   idx                Index into the per-instance signal arrays.
   //   err_count          Number of wrong products.
   //   err_timing         Number of tests for which avail was late or unset.
   //   tests_active       Tests started but not yet checked.
   //   all_tests_started  Set after the last test has been applied.
   //   seq, pipe, bpipe   Instance kind flags; they select the expected
   //                      latency and whether out_avail is checked.
   //   deg                The instance's m parameter.
   //   ncompleted         Tests finished without a timing error.
   //   cyc_tot            Sum of cycle counts over the completed tests.
   //   latency            Expected latency in cycles.
   typedef struct { int idx;
                    int err_count = 0;
                    int err_timing = 0;
                    Test_Vector tests_active[$];
                    bit all_tests_started = 0;
                    bit seq = 0; bit pipe = 0;
                    bit bpipe = 0;
                    int deg = 1;
                    int ncompleted = 0;
                    int cyc_tot = 0;
                    int latency = 0;
                    } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * w * 4;
   int cycle;
   bit done;
   logic clock;

   // Copies of clock and cycle produced by the reactivate program.
   logic clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);


   // Clock generator and simulation terminator.
   initial begin
      clock = 0;
      cycle = 0;

      fork
         // Toggle clock every 10 time units.  The value added to cycle is
         // the clock value before the toggle, so cycle advances by one at
         // each falling edge.
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   // Registration tasks.  Each records an instance in pi[] under the key
   // "<name> Deg <deg>" and sets the flags describing its kind.

   // Sequential instance; bpipe = 0.
   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].bpipe = 0;
   endtask

   // Sequential instance; bpipe = 1, so out_avail is checked.
   task pi_bseq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].bpipe = 1;
   endtask

   // Instance with pipe = 1 and bpipe = 0.  Not called in this file.
   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 0;
   endtask
   // Instance with pipe = 1 and bpipe = 1.  Not called in this file.
   task pi_bpipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 1;
   endtask

   // Problem 1 solution instances, m = 1, 2, 4.

   mult_seq_ds_prob_1 #(w,1) prob1_m1(prod[6], availn[6], clock,
                            in_valid[6], plierp[6], candp[6]);
   initial pi_bseq(6,"Prob 1",prob1_m1.m);

   mult_seq_ds_prob_1 #(w,2) prob1_m2(prod[7], availn[7], clock,
                            in_valid[7], plierp[7], candp[7]);
   initial pi_bseq(7,"Prob 1",prob1_m2.m);

   mult_seq_ds_prob_1 #(w,4) prob1_m4(prod[9], availn[9], clock,
                            in_valid[9], plierp[9], candp[9]);
   initial pi_bseq(9,"Prob 1",prob1_m4.m);

   // Reference (no handshaking) instances, m = 1, 2, 4.

   mult_seq_ds_prob_1_orig #(w,1) ms14(prod[14], availn[14], clock,
                            in_valid[14], plierp[14], candp[14]);
   initial pi_seq(14,"Seq",ms14.m);

   mult_seq_ds_prob_1_orig #(w,2) ms4(prod[4], availn[4], clock,
                            in_valid[4], plierp[4], candp[4]);
   initial pi_seq(4,"Seq",ms4.m);

   mult_seq_ds_prob_1_orig #(w,4) ms5(prod[5], availn[5], clock,
                            in_valid[5], plierp[5], candp[5]);
   initial pi_seq(5,"Seq",ms5.m);


   // Problem 2 solution instances, m = 1, 2, 4.

   mult_seq_d_prob_2 #(w,1) prob2_m1(prod[17], availn[17], clock,
                            in_valid[17], plierp[17], candp[17]);
   initial pi_bseq(17,"Prob 2",prob2_m1.m);

   mult_seq_d_prob_2 #(w,2) prob2_m2(prod[16], availn[16], clock,
                            in_valid[16], plierp[16], candp[16]);
   initial pi_bseq(16,"Prob 2",prob2_m2.m);

   mult_seq_d_prob_2 #(w,4) prob2_m4(prod[15], availn[15], clock,
                            in_valid[15], plierp[15], candp[15]);
   initial pi_bseq(15,"Prob 2",prob2_m4.m);

   // Copy each driven out_avail net into avail[].  Nets that are z (for
   // example those of modules that never drive out_avail) are skipped, so
   // avail[] keeps whatever value was last written for them.
   always @* begin

      foreach ( availn[i] ) begin
         if ( availn[i] !== 1'bz ) avail[i] = availn[i];
      end

   end

   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   // Values are consumed in pairs: multiplier first, then multiplicand.
   //
   int tests[$] = {1,1, 1,2, 1,3, 1,4, 1,5,  1,32,  32, 1};

   // Main test sequence.
   initial begin

      // Number of instances whose checking thread has not yet finished.
      // NOTE(review): pi[] is filled by the separate initial blocks that
      // call pi_bseq / pi_seq above.  This block reads pi at time 0 too,
      // so it relies on those blocks having already run -- confirm that
      // the simulator's ordering guarantees this.
      automatic int awaiting = pi.size();

      logic [w-1:0] pliers[num_tests], cands[num_tests];

      done = 0;

      // Compute each instance's expected latency and initialize its inputs.
      foreach ( pi[mut] ) begin
         automatic int midx = pi[mut].idx;
         automatic int steps = ( w + pi[mut].deg - 1 ) / pi[mut].deg;
         automatic int latency =
           !pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps;
         pi[mut].latency = latency;
         // Instances registered with bpipe = 0 are treated as always available.
         if ( pi[mut].bpipe == 0 ) begin
            avail[midx] = 1;
         end
         in_valid[midx] = 0;
      end

      // Prepare the operand pairs: first the values from tests[], then
      // random values masked to a random number of bits between 1 and w.
      for ( int i=0; i<num_tests; i++ ) begin

         automatic int num_bits_c = {$random()}%w + 1;
         automatic logic [w-1:0] mask_c = ( (w+1)'(1) << num_bits_c ) - 1;
         automatic int num_bits_p = {$random()}%w + 1;
         automatic logic [w-1:0] mask_p = ( (w+1)'(1) << num_bits_p ) - 1;

         pliers[i] = tests.size() ? tests.pop_front() : {$random()}&mask_p;
         cands[i] = tests.size() ? tests.pop_front() : {$random()}&mask_c;

      end

      // For instances with pipe = 1, overwrite the operands with filler
      // values whenever in_valid is 0.  None of the instances registered
      // in this file set pipe, so this thread does not change their inputs.
      fork begin
         forever @( negedge clk_reactive ) begin
            foreach ( pi[mut] ) begin
               automatic int midx = pi[mut].idx;
               if ( !in_valid[midx] && pi[mut].pipe ) begin
                  plierp[midx] = cycle;
                  candp[midx] = 1;
               end
            end
         end
      end join_none;

      // Let 2*w cycles pass before applying the first test.
      repeat ( 2 * w ) @( negedge clock );

      foreach ( pi[mutii] ) begin
         automatic string muti = mutii;

         // Stimulus thread: apply each operand pair with in_valid high
         // for one cycle and record the test in tests_active.
         fork begin
            automatic string mut = muti;
            automatic int midx = pi[mut].idx;
            for ( int i=0; i<num_tests; i++ ) begin
               // Cycles to wait before this test: 2*w for instances with
               // pipe = 0, otherwise zero or a random count below w+2.
               automatic int gap_cyc =
                 !pi[mut].pipe ? w * 2 :
                 ( {$random} % 2 ) ? {$random} % ( w + 2 ) : 0;
               automatic Test_Vector tv;
               repeat ( gap_cyc ) @( negedge clock );
               plierp[midx] = pliers[i];
               candp[midx] = cands[i];
               in_valid[midx] = 1;
               tv.tidx = i;
               tv.cycle_start = cycle;
               pi[mut].tests_active.push_back( tv );
               @( negedge clock );
               in_valid[midx] = 0;
            end
            pi[mut].all_tests_started = 1;
         end join_none;

         // Checking thread: take each started test in order, wait for
         // its result, and check timing and product.
         fork begin
            automatic string mut = muti;
            automatic int midx = pi[mut].idx;
            while ( 1 ) begin
               @( negedge clock );
               // Wait for a test to be started, or for there to be no more.
               while ( pi[mut].tests_active.size() == 0
                       && !pi[mut].all_tests_started )
                 @( negedge clock );
               if ( pi[mut].tests_active.size() == 0 ) break;
               begin
                  automatic Test_Vector tv = pi[mut].tests_active.pop_front();
                  automatic int i = tv.tidx;
                  automatic logic [2*w-1:0] shadow_prod = pliers[i] * cands[i];
                  // Latest cycle at which the result is expected.
                  automatic int eta = tv.cycle_start + pi[mut].latency;
                  automatic bit timing_err = 0;
                  automatic int delta_t;
                  if ( pi[mut].bpipe ) begin

                     // NOTE(review): presumably this extra cycle keeps the
                     // check from seeing avail left over from the previous
                     // multiplication -- confirm.
                     if ( !pi[mut].pipe && cycle == tv.cycle_start )
                       @( negedge clock );

                     // Wait for avail, but no later than eta.
                     while ( !avail[midx] && cycle < eta ) @( negedge clock );
                     if ( !avail[midx] || cycle > eta ) begin
                        timing_err = 1;
                        // The post-increment compares the old count, so up
                        // to err_limit of these messages are printed.
                        if ( pi[mut].err_timing++ < err_limit )
                          $write("At cyc %4d (eta %0d) avail not set for %s (idx %0d) after %0d cycles for 0x%0h*0x%0h.\n",
                                 cycle, eta, mut, midx, cycle - tv.cycle_start,
                                 pliers[i], cands[i]);
                     end
                  end else begin
                     // No avail checking: just wait for the expected latency.
                     wait ( cycle >= eta );
                  end
                  delta_t = cycle - tv.cycle_start;
                  if ( !timing_err ) begin
                     pi[mut].ncompleted++;
                     pi[mut].cyc_tot += delta_t;
                  end
                  // The product is checked only when there was no timing error.
                  if ( !timing_err && shadow_prod !== prod[midx] ) begin
                     pi[mut].err_count++;
                     // NOTE(review): this compares the count after the
                     // increment, so at most err_limit-1 messages are
                     // printed, one fewer than for timing errors above.
                     if ( pi[mut].err_count < err_limit ) begin
                        $write
                          ("%-15s test %5d  cyc %0d+%0d (%0d) wrong: 0x%0h * 0x%0h:  0x%0h != 0x%0h (correct)\n",
                           mut, i, tv.cycle_start, delta_t, pi[mut].latency,
                           pliers[i], cands[i],
                           prod[midx], shadow_prod);
                     end
                  end
               end
            end
            awaiting--;
         end join_none;

      end

      // Wait for every checking thread to finish.
      // NOTE(review): the clock-generating initial block calls $finish as
      // soon as cycle >= cycle_limit, so the cycle > cycle_limit alternative
      // here looks unreachable and no summary would be printed on a
      // timeout -- confirm whether that is intended.
      wait( awaiting == 0 || cycle > cycle_limit );

      $write("At cycle %0d.  Error types:  couldn't test / wrong result / timing\n",cycle);

      // Per-instance summary.  "Couldn't test" is the number of tests not
      // counted in ncompleted; the average is over completed tests.
      foreach ( pi[ mut ] )
        $write("For %-18s ran %4d tests, %4d/%4d/%4d errors found. Avg cyc %.1f\n",
               mut, num_tests,
               num_tests - pi[mut].ncompleted,
               pi[mut].err_count, pi[mut].err_timing,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / pi[mut].ncompleted : 1);

      done = 1;
      $write("Modules instantiated with w = %0d.\n",w);

      $finish(2);

   end

endmodule

// cadence translate_on