////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2017 Homework 7
//

 /// Assignment  http://www.ece.lsu.edu/koppel/v/2017/hw07.pdf

 /// Instructions:
  //
  // (1) Find the undergraduate workstation laboratory, room 126 EE
  //     Building.
  //
  // (2) Locate your account.  If you did not get an account please
  //     E-mail: koppel@ece.lsu.edu
  //
  // (3) Log in to a Linux workstation.
  //
  // (4) If you haven't already, follow the account setup instructions here:
  //     http://www.ece.lsu.edu/koppel/v/proc.html
  //
  // (5) Copy this assignment, local path name
  //     /home/faculty/koppel/pub/ee4755/hw/2017/hw07
  //     to a directory ~/hw07 in your class account. (~ is your home
  //     directory.) Use this file for your solution.
  ///      BE SURE THAT YOUR FILE IS CORRECTLY NAMED AND IN THE RIGHT PLACE.
  //
  // (6) Find the problems in this file and solve them.
  //
  //     Your entire solution should be in this file.
  //
  //     Do not change module names.
  //
  // (7) Your solution will automatically be copied from your account by
  //     the TA-bot.


 /// Additional Resources
  //
  // Verilog Documentation
  //    The Verilog Standard
  //      http://standards.ieee.org/getieee/1800/download/1800-2012.pdf
  //    Introductory Treatment (Warning: Does not include SystemVerilog)
  //      Brown & Vranesic, Fundamentals of Digital Logic with Verilog, 3rd Ed.
  //
  // Account Setup and Emacs (Text Editor) Instructions
  //      http://www.ece.lsu.edu/koppel/v/proc.html
  //      To learn Emacs look for Emacs tutorial.


//////////////////////////////////////////////////////////////////////////////
///  Problems 1 and 2
//
 /// Complete so that mult_fast sets out_avail as described in the handout.
//
//     [ ] Module must be synthesizable.
//     [ ] Code must be reasonably efficient.
//     [ ] Do not change module parameters.
//     [ ] Do not change ports, EXCEPT changing between var and net kinds.
//     [ ] The module must be synthesizable.
//     [ ] Don't assume that parameter values will match those used here.
//     [ ] USE DEBUGGING TOOLS LIKE SimVision.
//     [ ] Make sure that Avg cyc shown in testbench for Fast is lower 
//         than Pipelined module of same degree (when all tests pass).

module mult_fast
  #( int w = 16,
     int m = 4 )
   ( output uwire [2*w-1:0] prod,
     output uwire out_avail,
     input uwire clk, in_valid,
     input uwire [w-1:0] plier, cand );

   /// Pipelined unsigned multiplier that retires m multiplicand bits per stage.
   //
   //  Parameters
   //    w:  Width of each operand in bits; the product is 2*w bits.
   //    m:  Number of multiplicand bits consumed by each pipeline stage.
   //
   //  Ports
   //    prod:       Value held in the last pipeline latch.
   //    out_avail:  1 when prod is the product of operands that were
   //                sampled with in_valid = 1 exactly nstages positive
   //                clock edges earlier, 0 otherwise.
   //    clk:        Clock, all state is updated on the positive edge.
   //    in_valid:   1 when plier and cand hold operands to be multiplied.
   //    plier,cand: Multiplier and multiplicand.
   //
   //  There is no reset input, so out_avail is unknown until nstages
   //  positive clock edges have elapsed after start-up.
   //
   //  NOTE(review): out_avail is asserted at a fixed latency of nstages
   //  cycles. The assignment header also asks for a lower average cycle
   //  count than mult_pipe; that early-completion behavior is specified
   //  in a handout that is not part of this file and is NOT implemented
   //  here -- confirm against the handout.

   localparam int nstages = ( w + m - 1 ) / m;

   // Number of multiplicand bits examined per stage. Clamped to w so
   // that the part select below stays in range when m > w.
   localparam int mb = m < w ? m : w;

   // Note: pl is for pipeline latch. Index 0 holds the module inputs,
   // index nstages holds the values visible at the module outputs.
   logic [2*w-1:0] pl_accum[0:nstages];
   logic [w-1:0] pl_plier[0:nstages];
   logic [w-1:0] pl_cand[0:nstages];
   // Valid bit traveling alongside each set of operands.
   logic pl_valid[0:nstages];

   assign prod = pl_accum[nstages];
   assign out_avail = pl_valid[nstages];

   always_ff @( posedge clk ) begin

      // Blocking assignments: element 0 is just a name for the inputs
      // so that the loop below can treat every stage the same way.
      pl_accum[0] = 0;
      pl_plier[0] = plier;
      pl_cand[0] = cand;
      pl_valid[0] = in_valid;

      for ( int stage=0; stage<nstages; stage++ ) begin

         // Add the partial product of the multiplier and the low mb
         // bits of the (already shifted) multiplicand, positioned at
         // the bit offset handled by this stage.
         pl_accum[stage+1] <=
           pl_accum[stage] +
             ( ( pl_plier[stage] * pl_cand[stage][mb-1:0] ) << stage*m );

         // Discard the multiplicand bits that were just used.
         pl_cand[stage+1] <= pl_cand[stage] >> m;
         pl_plier[stage+1] <= pl_plier[stage];

         // The valid bit advances in lock step with the operands.
         pl_valid[stage+1] <= pl_valid[stage];

      end

   end

endmodule



//////////////////////////////////////////////////////////////////////////////
/// Comparison Modules
///

 /// The modules below are for reference.


module mult_behav_1
  #(int w = 16)
   (output logic [2*w-1:0] prod, input logic [w-1:0] plier, cand);

   /// Combinational reference multiplier.
   //
   //  prod is the 2*w-bit product of the two w-bit operands, computed
   //  with the multiplication operator.
   always_comb prod = plier * cand;

endmodule

 /// :Example: Basic Pipelined Multiplier -- mult_pipe
//
//   Computes m partial products per stage.
//

module mult_pipe #( int w = 16, int m = 4 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   /// Pipelined multiplier in which each stage examines m multiplicand
   //  bits and conditionally adds the correspondingly shifted multiplier.
   //
   //  The operands travel unmodified down the pipeline; a stage selects
   //  its multiplicand bits by absolute bit position.

   localparam int stages = ( w + m - 1 ) / m;

   // Pipeline latches. Element 0 mirrors the inputs, element "stages"
   // feeds the output.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_plier[0:stages];
   logic [w-1:0] pl_cand[0:stages];

   assign prod = pl_accum[stages];

   always_ff @( posedge clk ) begin

      // Stage-0 values are taken directly from the ports (blocking, so
      // they are visible to the loop below in this same clock edge).
      pl_cand[0] = cand;
      pl_plier[0] = plier;
      pl_accum[0] = 0;

      for ( int s=0; s<stages; s++ ) begin

         logic [2*w-1:0] sum;
         int base;

         // First multiplicand bit position handled by this stage.
         base = s * m;
         sum = pl_accum[s];

         for ( int k=0; k<m; k++ ) begin

            int bit_pos;
            bit_pos = base + k;

            // The last stage may extend past bit w-1; skip those positions.
            if ( bit_pos < w && pl_cand[s][bit_pos] )
              sum += pl_plier[s] << bit_pos;

         end

         /// Latched for use in the next clock cycle, not the next iteration.
         pl_plier[s+1] <= pl_plier[s];
         pl_cand[s+1] <= pl_cand[s];
         pl_accum[s+1] <= sum;

      end

   end

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

// cadence translate_off

program reactivate
   ( output uwire clk_reactive,
     output int   cycle_reactive,
     input  uwire clk,
     input  int   cycle );

   /// Pass the clock and the cycle count through a program block.
   //
   //  Each output is a continuously assigned copy of the like-named
   //  input. NOTE(review): presumably used so that testbench threads
   //  sensitive to clk_reactive are scheduled after the design has
   //  reacted to clk -- confirm against the simulator's program-block
   //  scheduling rules.
   assign cycle_reactive = cycle;
   assign clk_reactive = clk;

endprogram

module testbench;

   /// Testbench for the multiplier modules in this file.
   //
   //  Instantiates several multipliers, applies the same list of operand
   //  pairs to each one (with random gaps between operands), compares
   //  each product against plier * cand, and prints a summary line for
   //  each multiplier.

   // Operand width used for every instantiated multiplier.
   localparam int w = 16;
   // Number of operand pairs applied to each multiplier.
   localparam int num_tests = 400;
   // Size of the per-multiplier connection arrays below (indexed by
   // the idx given to each instance).
   localparam int NUM_MULT = 20;
   // Bounds the number of error messages printed per multiplier.
   localparam int err_limit = 7;

   bit use_others;
   logic [w-1:0] plier, cand;
   // Per-multiplier operand inputs and product outputs.
   logic [w-1:0] plierp[NUM_MULT], candp[NUM_MULT];
   logic [2*w-1:0] prod[NUM_MULT];
   // availn: nets connected to the out_avail ports.
   // avail:  variable copy used by the checking code (see always @* below).
   uwire availn[NUM_MULT];
   logic avail[NUM_MULT];
   logic in_valid[NUM_MULT];

   // A test that has been applied but not yet checked: index into the
   // operand arrays and the cycle in which the operands were applied.
   typedef struct { int tidx; int cycle_start; } Test_Vector;

   // Per-multiplier bookkeeping.
   //   idx:               Index into plierp, candp, prod, avail, in_valid.
   //   err_count:         Number of wrong products seen.
   //   err_timing:        Number of times avail was not set by the ETA.
   //   tests_active:      Tests applied but not yet checked.
   //   all_tests_started: Set once the last test has been applied.
   //   seq, pipe, bpipe:  Kind flags used to select latency and checking.
   //   deg:               Multiplicand bits handled per step (parameter m).
   //   prod_history:      Recent prod values sampled on the reactive clock.
   //   ncompleted:        Number of tests checked without a timing error.
   //   cyc_tot:           Sum of cycles-to-result over completed tests.
   //   latency:           Expected cycles from operands to product.
   typedef struct { int idx;
                    int err_count = 0;
                    int err_timing = 0;
                    Test_Vector tests_active[$];
                    bit all_tests_started = 0;
                    bit seq = 0; bit pipe = 0;
                    bit bpipe = 0;
                    int deg = 1;
                    logic [2*w-1:0] sout = 'h111;
                    logic [2*w-1:0] prod_history[$];
                    int ncompleted = 0;
                    int cyc_tot = 0;
                    int latency = 0;
                    } Info;
   // Keyed by a descriptive name such as "Fast Deg 4".
   Info pi[string];

   localparam int cycle_limit = num_tests * w * 4;
   int cycle;
   bit done;
   logic clock;

   logic clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);


   // Clock generator and simulation watchdog.
   initial begin
      clock = 0;
      cycle = 0;

      fork
         // Toggle clock every 10 time units. The post-increment yields
         // the old clock value, so cycle advances on each 1 -> 0 change.
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   // Register a multiplier with seq set and pipe clear.
   // Not called in the code visible here.
   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].bpipe = 0;
   endtask

   // Register a pipelined multiplier that has no avail output.
   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 0;
   endtask
   // Register a pipelined multiplier whose avail output is checked.
   task pi_bpipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 1;
   endtask

   // Multipliers under test. Each instance is followed by an initial
   // statement that creates its entry in pi.
   mult_behav_1 #(w) mb1(prod[0], plierp[0], candp[0]);
   initial pi["Behavioral"].idx = 0;

   mult_pipe    #(w,4) ms54(prod[7], plierp[7], candp[7], clock);
   initial pi_pipe(7,"Pipelined",ms54.m);

   mult_pipe    #(w,2) ms53(prod[8], plierp[8], candp[8], clock);
   initial pi_pipe(8,"Pipelined",ms53.m);

   mult_fast    #(w,4) ms17(prod[17], availn[17], clock,
                            in_valid[17], plierp[17], candp[17]);
   initial pi_bpipe(17,"Fast",ms17.m);

   mult_fast    #(w,2) ms16(prod[16], availn[16], clock,
                            in_valid[16], plierp[16], candp[16]);
   initial pi_bpipe(16,"Fast",ms16.m);

   mult_fast    #(w,1) ms15(prod[15], availn[15], clock,
                            in_valid[15], plierp[15], candp[15]);
   initial pi_bpipe(15,"Fast",ms15.m);

   // Copy each driven availn net into avail. Elements that read as z
   // (nothing driving the net) are skipped, leaving avail[i] as is.
   always @* begin

      foreach ( availn[i] ) begin
         if ( availn[i] !== 1'bz ) avail[i] = availn[i];
      end

   end

   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   //
   int tests[$] = {1,1, 1,2, 1,3, 1,4, 1,5,  1,32,  32, 1};

   // Main test sequence.
   initial begin

      // Number of checker threads that have not yet finished.
      // NOTE(review): pi is populated by other initial statements that
      // also run at time 0, so this count (and the foreach below)
      // appears to rely on those statements having executed first --
      // confirm that the simulator's ordering guarantees this.
      automatic int awaiting = pi.size();

      logic [w-1:0] pliers[num_tests], cands[num_tests];

      done = 0;

      // Compute the expected latency of each multiplier and initialize
      // its control signals.
      foreach ( pi[mut] ) begin
         automatic int midx = pi[mut].idx;
         automatic int steps = ( w + pi[mut].deg - 1 ) / pi[mut].deg;
         // 1 cycle when seq is clear, 2*steps when seq is set but pipe
         // is clear, steps when both are set.
         automatic int latency =
           !pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps;
         pi[mut].latency = latency;
         // Multipliers without an avail output are treated as always
         // available.
         if ( pi[mut].bpipe == 0 ) begin
            avail[midx] = 1;
         end
         in_valid[midx] = 0;
      end

      // Prepare operands: first those listed in tests, then random
      // values masked to a randomly chosen number of bits (1 to w).
      for ( int i=0; i<num_tests; i++ ) begin

         automatic int num_bits_c = {$random()}%w + 1;
         automatic logic [w-1:0] mask_c = ( (w+1)'(1) << num_bits_c ) - 1;
         automatic int num_bits_p = {$random()}%w + 1;
         automatic logic [w-1:0] mask_p = ( (w+1)'(1) << num_bits_p ) - 1;

         pliers[i] = tests.size() ? tests.pop_front() : {$random()}&mask_p;
         cands[i] = tests.size() ? tests.pop_front() : {$random()}&mask_c;

      end

      // While in_valid is 0, drive the pipelined multipliers with
      // operands that are not part of any test (the cycle number and 1).
      fork begin
         forever @( negedge clk_reactive ) begin
            foreach ( pi[mut] ) begin
               automatic int midx = pi[mut].idx;
               if ( !in_valid[midx] && pi[mut].pipe ) begin
                  plierp[midx] = cycle;
                  candp[midx] = 1;
               end
            end
         end
      end join_none;

      // Record the most recent 2*w product values of each pipelined
      // multiplier. The history is not read by the code visible here.
      fork begin
         forever @( posedge clk_reactive ) begin
            foreach ( pi[mut] ) begin
               if ( pi[mut].pipe ) begin
                  automatic int midx = pi[mut].idx;
                  pi[mut].prod_history.push_back(prod[midx]);
                  if ( pi[mut].prod_history.size() > 2 * w ) begin
                     // pad only receives the discarded oldest entry.
                     automatic bit pad = pi[mut].prod_history.pop_front();
                  end
               end
            end
         end
      end join_none;

      // Let the pipelines run for 2*w cycles before applying tests.
      repeat ( 2 * w ) @( negedge clock );

      // Start one driver thread and one checker thread per multiplier.
      foreach ( pi[mutii] ) begin
         automatic string muti = mutii;

         // Driver: apply each operand pair for one cycle with in_valid
         // set, after a random gap of 0 to w+1 cycles.
         fork begin
            automatic string mut = muti;
            automatic int midx = pi[mut].idx;
            for ( int i=0; i<num_tests; i++ ) begin
               automatic int gap_cyc =
                 ( {$random} % 2 ) ? {$random} % ( w + 2 ) : 0;
               automatic int wait_limit = cycle + 10 * w + gap_cyc;
               automatic Test_Vector tv;
               repeat ( gap_cyc ) @( negedge clock );
               // NOTE(review): only gap_cyc cycles elapse between
               // computing wait_limit and this comparison, so this
               // condition does not appear reachable -- confirm intent.
               if ( cycle > wait_limit ) begin
                  $write("Wait limit exceeded for %s.\n", mut);
                  break;
               end
               plierp[midx] = pliers[i];
               candp[midx] = cands[i];
               in_valid[midx] = 1;
               tv.tidx = i;
               tv.cycle_start = cycle;
               pi[mut].tests_active.push_back( tv );
               @( negedge clock );
               in_valid[midx] = 0;
            end
            pi[mut].all_tests_started = 1;
         end join_none;

         // Checker: take applied tests in order, wait for the result,
         // and compare it against the expected product.
         fork begin
            automatic string mut = muti;
            automatic int midx = pi[mut].idx;
            while ( 1 ) begin
               @( negedge clock );
               // Wait until a test is pending or the driver has finished.
               while ( pi[mut].tests_active.size() == 0
                       && !pi[mut].all_tests_started )
                 @( negedge clock );
               if ( pi[mut].tests_active.size() == 0 ) break;
               begin
                  automatic Test_Vector tv = pi[mut].tests_active.pop_front();
                  automatic int i = tv.tidx;
                  automatic logic [2*w-1:0] shadow_prod = pliers[i] * cands[i];
                  // Latest cycle at which the product is expected.
                  automatic int eta = tv.cycle_start + pi[mut].latency;
                  automatic bit timing_err = 0;
                  automatic int delta_t;
                  if ( pi[mut].bpipe ) begin
                     // Stop waiting as soon as avail is set or the ETA
                     // is reached, whichever comes first.
                     while ( !avail[midx] && cycle < eta ) @( negedge clock );
                     if ( !avail[midx] || cycle > eta ) begin
                        timing_err = 1;
                        if ( pi[mut].err_timing++ < err_limit )
                          $write("At cyc %4d (eta %0d) avail not set for %s (idx %0d) after %0d cycles for 0x%0h*0x%0h.\n",
                                 cycle, eta, mut, midx, cycle - tv.cycle_start,
                                 pliers[i], cands[i]);
                     end
                  end else begin
                     wait ( cycle >= eta );
                  end
                  delta_t = cycle - tv.cycle_start;
                  // Tests with a timing error are not counted as
                  // completed and their product is not checked.
                  if ( !timing_err ) begin
                     pi[mut].ncompleted++;
                     pi[mut].cyc_tot += delta_t;
                  end
                  if ( !timing_err && shadow_prod !== prod[midx] ) begin
                     pi[mut].err_count++;
                     // NOTE(review): the count is incremented before
                     // the comparison, so one fewer message is printed
                     // here than for timing errors above -- confirm
                     // whether that difference is intended.
                     if ( pi[mut].err_count < err_limit ) begin
                        $write
                          ("%-15s test %5d  cyc %0d+%0d (%0d) wrong: 0x%0h * 0x%0h:  0x%0h != 0x%0h (correct)\n",
                           mut, i, tv.cycle_start, delta_t, pi[mut].latency,
                           pliers[i], cands[i],
                           prod[midx], shadow_prod);
                     end
                  end
               end
            end
            awaiting--;
         end join_none;

      end

      // Wait for every checker thread to finish (or for the cycle limit).
      wait( awaiting == 0 || cycle > cycle_limit );

      $write("At cycle %0d.  Error types:  couldn't test / wrong result / timing\n",cycle);

      // Summary. The first error number is the count of tests that were
      // not completed. NOTE(review): the average divides by ncompleted,
      // which is zero if no test completed -- confirm the printed value
      // is acceptable in that case.
      foreach ( pi[ mut ] )
        $write("For %-18s ran %4d tests, %4d/%4d/%4d errors found. Avg cyc %.1f\n",
               mut, num_tests,
               num_tests - pi[mut].ncompleted,
               pi[mut].err_count, pi[mut].err_timing,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / pi[mut].ncompleted : 1);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on