////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2015 Homework 4
//
 /// SOLUTION

 /// Assignment  http://www.ece.lsu.edu/koppel/v/2015/hw04.pdf
 /// Solution discussion http://www.ece.lsu.edu/koppel/v/2015/hw04_sol.pdf

 /// Instructions:
  //
  // (1) Find the undergraduate workstation laboratory, room 126 EE
  //     Building.
  //
  // (2) Locate your account.  If you did not get an account please
  //     E-mail: koppel@ece.lsu.edu
  //
  // (3) Log in to a Linux workstation.
  //     The account should start up with a WIMP interface (windows, icons,
  //     mouse, pull-down menus)  ( :-) ) but one or two things need
  //     to be done from a command-line shell.  If you need to brush up
  //     on Unix commands follow http://www.ece.lsu.edu/koppel/v/4ltrwrd/.
  //
  // (4) If you haven't already, follow the account setup instructions here:
  //     http://www.ece.lsu.edu/koppel/v/proc.html
  //
  // (5) Copy this assignment, local path name
  //     /home/faculty/koppel/pub/ee4755/hw/2015f/hw04
  //     to a directory ~/hw04 in your class account. (~ is your home
  //     directory.) Use this file for your solution.
  //
  // (6) Find the problems in this file and solve them.
  //
  //     Your entire solution should be in this file.
  //
  //     Do not change module names.
  //
  // (7) Your solution will automatically be copied from your account by
  //     the TA-bot.


 /// Additional Resources
  //
  // Verilog Documentation
  //    The Verilog Standard
  //      http://standards.ieee.org/getieee/1800/download/1800-2012.pdf
  //    Introductory Treatment (Warning: Does not include SystemVerilog)
  //      Brown & Vranesic, Fundamentals of Digital Logic with Verilog, 3rd Ed.
  //
  // Account Setup and Emacs (Text Editor) Instructions
  //      http://www.ece.lsu.edu/koppel/v/proc.html
  //      To learn Emacs look for Emacs tutorial.
  //
  // Unix Help
  //      http://www.ece.lsu.edu/koppel/v/4ltrwrd/



//////////////////////////////////////////////////////////////////////////////
///  Problem 0
//
 /// Shift Left Modules
//
//   Look over the code below.
//   There is nothing to turn in for this problem.
//

`default_nettype none

// Conditional fixed-distance left shifter.
//
// When shift is 1 the output is unshifted moved left by amt bit
// positions (vacated low bits are zero, bits moved past the top are
// dropped); when shift is 0 the input is passed through unchanged.
//
//   wid_lg  Base-2 logarithm used for the default data width.
//   amt     Shift distance, a constant fixed at elaboration time.
//   wid     Data width in bits; defaults to 2**wid_lg.
//
module shift_fixed
  #( int wid_lg = 4,
     int amt = 1,
     int wid = 1 << wid_lg )
   ( output uwire [wid-1:0] shifted,
     input uwire [wid-1:0] unshifted,
     input uwire shift );

   // The input after the constant shift has been applied.
   uwire [wid-1:0] moved = unshifted << amt;

   // Two-input mux: pick the moved or the untouched value.
   assign shifted = shift ? moved : unshifted;

endmodule

// Behavioral variable-distance left shifter.
//
// The whole shift is described with a single << operator and is left
// to the synthesis program to implement.
//
//   wid_lg  Number of bits in the shift amount.
//   wid     Data width in bits; defaults to 2**wid_lg.
//
module shift_lt_behav
  #( int wid_lg = 4,
     int wid = 1 << wid_lg )
   ( output uwire [wid-1:0] shifted,
     input uwire [wid-1:0] unshifted,
     input uwire [wid_lg-1:0] amt );

   // unshifted moved left by amt positions, zero filled on the right.
   uwire [wid-1:0] result = unshifted << amt;

   assign shifted = result;

endmodule

// Combinational variable-distance left shifter built from a chain of
// shift_fixed stages (a logarithmic shifter).
//
// Stage b shifts by 2**b positions when bit b of amt is 1, so the
// stages together shift by amt positions.
//
//   wid_lg  Number of bits in the shift amount, and number of stages.
//   wid     Data width in bits; defaults to 2**wid_lg.
//
module shift_lt_comb
  #( int wid_lg = 4,
     int wid = 1 << wid_lg )
   ( output uwire [wid-1:0] shifted,
     input uwire [wid-1:0] unshifted,
     input uwire [wid_lg-1:0] amt );

   // stage[b] is the value leaving stage b; stage[-1] is the module input.
   uwire [wid-1:0]       stage[wid_lg-1:-1];

   assign stage[-1] = unshifted;

   // One fixed shifter per bit of amt, each fed by the previous stage.
   for ( genvar b = 0; b < wid_lg; b++ )
     shift_fixed #( .wid_lg(wid_lg), .amt(1 << b) ) sf
       ( .shifted(stage[b]), .unshifted(stage[b-1]), .shift(amt[b]) );

   // The last stage provides the module output.
   assign shifted = stage[wid_lg-1];

endmodule

// Sequential left shifter that moves the value one bit per clock cycle.
//
// Operation:
//   - On a positive clock edge with start == 1, unshifted is loaded into
//     shifted and amt is loaded into the cycle counter cnt.
//   - On each later edge with cnt != 0, shifted is replaced by shifted << 1
//     (computed by a shift_fixed instance) and cnt is decremented.
//   - ready is 1 whenever cnt is 0.
//
// There is no reset input, so cnt, and therefore ready, is undefined
// until the first start.
//
//   wid_lg  Number of bits in the shift amount.
//   wid     Data width in bits; defaults to 2**wid_lg.
//
module shift_lt_seq
  #( int wid_lg = 4,
     int wid = 1 << wid_lg )
   ( output logic [wid-1:0] shifted,
     output uwire ready,
     input uwire [wid-1:0] unshifted,
     input uwire [wid_lg-1:0] amt,
     input uwire start,
     input uwire clk );

   // Number of single-bit shifts still to be performed.
   logic [wid_lg-1:0]   cnt;

   // The current value of shifted moved left by one position.
   uwire [wid-1:0]       sf_out;

   shift_fixed #(wid_lg,1) sf( sf_out, shifted, 1'b1 );

   // Nonblocking assignments are used for the registers so that other
   // logic clocked by the same edge sees the pre-edge values of shifted
   // and ready regardless of the order in which processes are run.
   always_ff @( posedge clk ) begin

      if ( start == 1 ) begin

         shifted <= unshifted;
         cnt <= amt;

      end else if ( cnt > 0 ) begin

         shifted <= sf_out;
         cnt <= cnt - 1'b1;

      end

   end

   assign ready = cnt == 0;

endmodule



// Sequential left shifter using a chain of num_shifters fixed shifters.
//
// The shift amount is split into num_shifters fields of cnt_bits bits
// each (field 0 is least significant). Shifter i shifts by
// 2**(i*cnt_bits) positions while field i of cnt is nonzero, and each
// nonzero field is decremented once per clock cycle, so the number of
// cycles needed is the largest field value rather than amt itself.
//
// Operation:
//   - On a positive clock edge with start == 1, unshifted is loaded into
//     shifted and amt is loaded (zero extended) into cnt.
//   - On each later edge with cnt != 0, shifted is replaced by the output
//     of the shifter chain and every nonzero field is decremented.
//   - ready is 1 whenever cnt is 0.
//
// There is no reset input, so cnt, and therefore ready, is undefined
// until the first start.
//
//   wid_lg        Number of bits in the shift amount.
//   num_shifters  Number of fixed shifters (and count fields).
//   wid           Data width in bits; defaults to 2**wid_lg.
//
module shift_lt_seq_d
  #( int wid_lg = 4,
     int num_shifters = 2,
     int wid = 1 << wid_lg )
   ( output logic [wid-1:0] shifted,
     output uwire ready,
     input uwire [wid-1:0] unshifted,
     input uwire [wid_lg-1:0] amt,
     input uwire start,
     input uwire clk );

   // Bits per count field, rounded up so that all of amt fits in cnt.
   localparam int cnt_bits = ( wid_lg + num_shifters - 1 ) / num_shifters;

   // One count field per shifter.
   logic [num_shifters-1:0][cnt_bits-1:0] cnt;

   // inter_sh[i] is the output of shifter i; inter_sh[-1] is the chain input.
   uwire [wid-1:0] inter_sh[num_shifters-1:-1];
   assign inter_sh[-1] = shifted;

   for ( genvar i = 0; i < num_shifters; i++ ) begin

      // Distance covered by one step of shifter i.
      localparam int shift_amt = 1 << i * cnt_bits;

      // Shifter i is active while its count field is nonzero.
      uwire       shift = cnt[i] != 0;

      shift_fixed #(wid_lg,shift_amt) sf( inter_sh[i], inter_sh[i-1], shift );

   end

   // Nonblocking assignments are used for the registers so that other
   // logic clocked by the same edge sees the pre-edge values of shifted
   // and ready regardless of the order in which processes are run.
   always_ff @( posedge clk )

      if ( start == 1 ) begin

         shifted <= unshifted;
         cnt <= amt;

      end else if ( cnt > 0 ) begin

         shifted <= inter_sh[num_shifters-1];
         for ( int i=0; i<num_shifters; i++ )
           if ( cnt[i] != 0 ) cnt[i] <= cnt[i] - 1'b1;

      end

   assign ready = cnt == 0;

endmodule


//////////////////////////////////////////////////////////////////////////////
///  Problem 1
//
//   Modify shift_lt_seq_d_sol so that it synthesizes to the same
//   hardware as shift_lt_seq_d_live (further below).
//
//     [✔] Be sure that all code that you add synthesizes to
//         combinational logic.
//
//     [✔] Make sure that the module runs correctly.
//


// Sequential left shifter with the next-state logic separated from the
// registers.
//
// All next-state values (next_*) are computed by combinational code (an
// always_comb block and continuous assignments); the always_ff block
// only copies them into the registers.
//
// Operation:
//   - On a positive clock edge with start == 1: ready and shift are
//     cleared, amt is loaded into cnt, and unshifted is loaded into
//     shifted.
//   - On other edges: ready is set once cnt is 0; for each segment i,
//     shift[i] is set when cnt[i] is nonzero and cnt[i] is decremented;
//     shifted is loaded from the end of the shifter chain, which is
//     controlled by the registered shift bits.
//
//   wid_lg        Number of bits in the shift amount.
//   num_shifters  Number of fixed shifters (and count segments).
//   wid           Data width in bits; defaults to 2**wid_lg.
//
module shift_lt_seq_d_sol
  #( int wid_lg = 4,
     int num_shifters = 2,
     int wid = 1 << wid_lg )
   ( output logic [wid-1:0] shifted,
     output logic ready,
     input uwire [wid-1:0] unshifted,
     input uwire [wid_lg-1:0] amt,
     input uwire start,
     input uwire clk );

   // Registered shift-enable bit for each fixed shifter.
   logic [num_shifters-1:0] shift;

   // shin[i] is the output of shifter i; shin[-1] is the chain input.
   uwire [wid-1:0]           shin[num_shifters-1:-1];

   // Bits per count segment. Rounded up so that every bit of amt fits
   // in cnt even when wid_lg is not a multiple of num_shifters; with
   // truncating division the upper bits of amt would be dropped. The
   // value is unchanged whenever wid_lg is a multiple of num_shifters.
   localparam int bits_per_seg = ( wid_lg + num_shifters - 1 ) / num_shifters;

   for ( genvar i=0; i<num_shifters; i++ ) begin

      // Distance covered by one step of shifter i.
      localparam int fs_amt = 2 ** ( i * bits_per_seg );

      shift_fixed #( wid_lg, fs_amt ) sf( shin[i], shin[i-1], shift[i] );

   end

   assign shin[-1] = shifted;

   // Remaining number of steps for each shifter.
   logic [num_shifters-1:0][bits_per_seg-1:0] cnt;

   // Values to be loaded into the registers at the next positive edge.
   logic [wid-1:0] next_shifted;
   logic next_ready;
   logic [num_shifters-1:0] next_shift;
   logic [num_shifters-1:0][bits_per_seg-1:0] next_cnt;

   /// Problem 1: Modify this module, especially around here.

   /// SOLUTION
   //
   //  Some of the next-state logic is in the always_comb block and some
   //  is in continuous assignments. It would be equally correct to put
   //  all of it in always_comb blocks or all of it in assign
   //  statements; the deciding factor is which is easier to read.

   always_comb begin

      if ( start == 1 ) begin

         next_cnt = amt;
         next_shift = 0;

      end else begin

         for ( int i=0; i<num_shifters; i++ ) begin
            next_shift[i] = cnt[i] > 0;

            // next_cnt is assigned on every path, which avoids latches.
            next_cnt[i] = next_shift[i] ? cnt[i] - 1 : cnt[i];
         end

      end

   end

   // Continuous assignments for next_ready and next_shifted.
   assign next_ready = start ? 0 : cnt == 0 ? 1 : ready;
   assign next_shifted = start ? unshifted : shin[num_shifters-1];

   // Registers. Nonblocking assignments are used so that other logic
   // clocked by the same edge sees the pre-edge register values
   // regardless of the order in which processes are run.
   always_ff @( posedge clk ) begin

      shifted <= next_shifted;
      ready <= next_ready;
      shift <= next_shift;
      cnt <= next_cnt;

   end

endmodule

// Sequential left shifter with registered shift-enable bits; the
// reference design for Problem 1.
//
// Operation:
//   - On a positive clock edge with start == 1: ready and shift are
//     cleared, amt is loaded into cnt, and unshifted is loaded into
//     shifted.
//   - On other edges: ready is set if cnt is 0; for each segment i,
//     shift[i] is set when cnt[i] is nonzero and cnt[i] is then
//     decremented; shifted is loaded from the end of the shifter chain.
//
//   wid_lg        Number of bits in the shift amount.
//   num_shifters  Number of fixed shifters (and count segments).
//   wid           Data width in bits; defaults to 2**wid_lg.
//
module shift_lt_seq_d_live
  #( int wid_lg = 6,
     int num_shifters = 1,
     int wid = 1 << wid_lg )
   ( output logic [wid-1:0] shifted,
     output logic ready,
     input uwire [wid-1:0] unshifted,
     input uwire [wid_lg-1:0] amt,
     input uwire start,
     input uwire clk );

   /// DO NOT modify this module.

   // Bits per count segment.
   // NOTE(review): this division truncates. If wid_lg is not a multiple
   // of num_shifters then cnt has fewer bits than amt and "cnt = amt"
   // drops the upper bits of amt -- confirm that only evenly divisible
   // configurations are intended.
   localparam int bits_per_seg = wid_lg / num_shifters;

   // Shift-enable bit for each fixed shifter; assigned only in the
   // always_ff block below.
   logic [num_shifters-1:0] shift;

   // shin[i] is the output of shifter i; shin[-1] is the chain input.
   uwire [wid-1:0] shin[num_shifters-1:-1];
   assign shin[-1] = shifted;

   for ( genvar i=0; i<num_shifters; i++ ) begin

      // Distance covered by one step of shifter i.
      localparam int fs_amt = 2 ** ( i * bits_per_seg );

      shift_fixed #( wid_lg, fs_amt ) sf( shin[i], shin[i-1], shift[i] );

   end

   // Remaining number of steps for each shifter.
   logic [num_shifters-1:0][bits_per_seg-1:0] cnt;

   // The assignments below are blocking, so statement order matters:
   // each test of cnt sees the value left by the statements before it.
   always_ff @( posedge clk ) begin

      if ( start == 1 ) begin

         // Begin a new shift. No shifter is enabled yet.
         ready = 0;
         cnt = amt;
         shift = 0;
         shifted = unshifted;

      end else begin

         // Tested before the decrements below.
         if ( cnt == 0 ) ready = 1;

         // Enable shifter i if it has steps remaining, then count the step.
         for ( int i=0; i<num_shifters; i++ ) begin
            shift[i] = cnt[i] > 0;
            if ( cnt[i] != 0 ) cnt[i]--;
         end

         // Capture the output of the shifter chain.
         shifted = shin[num_shifters-1];

      end

   end

endmodule


//////////////////////////////////////////////////////////////////////////////
///  Problem 2
//
//   Modify shift_lt_seq_d_p2 so that it uses one less cycle.
//
//     [✔] Make sure that the module runs correctly.
//     [✔] Don't change the number of shifters per stage.


// Sequential left shifter with registered shift-enable bits in which
// the shift-enable and count bookkeeping is also performed on the
// start edge (the Problem 2 solution).
//
// Operation:
//   - On a positive clock edge with start == 1: ready is cleared, amt
//     is loaded into cnt, and unshifted is loaded into shifted.
//     On other edges shifted is loaded from the end of the shifter chain.
//   - On every edge, after the step above: ready is set if cnt is 0;
//     for each segment i, shift[i] is set when cnt[i] is nonzero and
//     cnt[i] is then decremented.
//
//   wid_lg        Number of bits in the shift amount.
//   num_shifters  Number of fixed shifters (and count segments).
//   wid           Data width in bits; defaults to 2**wid_lg.
//
module shift_lt_seq_d_p2
  #( int wid_lg = 6,
     int num_shifters = 1,
     int wid = 1 << wid_lg )
   ( output logic [wid-1:0] shifted,
     output logic ready,
     input uwire [wid-1:0] unshifted,
     input uwire [wid_lg-1:0] amt,
     input uwire start,
     input uwire clk );

   // Bits per count segment. Rounded up so that every bit of amt fits
   // in cnt even when wid_lg is not a multiple of num_shifters; with
   // truncating division the upper bits of amt would be dropped. The
   // value is unchanged whenever wid_lg is a multiple of num_shifters.
   localparam int bits_per_seg = ( wid_lg + num_shifters - 1 ) / num_shifters;

   // Shift-enable bit for each fixed shifter; assigned only in the
   // always_ff block below.
   logic [num_shifters-1:0] shift;

   // shin[i] is the output of shifter i; shin[-1] is the chain input.
   uwire [wid-1:0] shin[num_shifters-1:-1];
   assign shin[-1] = shifted;

   for ( genvar i=0; i<num_shifters; i++ ) begin

      // Distance covered by one step of shifter i.
      localparam int fs_amt = 2 ** ( i * bits_per_seg );

      shift_fixed #( wid_lg, fs_amt ) sf( shin[i], shin[i-1], shift[i] );

   end

   // Remaining number of steps for each shifter.
   logic [num_shifters-1:0][bits_per_seg-1:0] cnt;

   // The assignments below are intentionally blocking: on a start edge
   // the code after the if/else must see the value of cnt that was just
   // loaded from amt. Do not reorder these statements.
   always_ff @( posedge clk ) begin

      if ( start == 1 ) begin

         ready = 0;
         cnt = amt;

         shifted = unshifted;

      end else begin

         // Capture the output of the shifter chain.
         shifted = shin[num_shifters-1];

      end

      /// SOLUTION
      //
      //  Set shift and update cnt whether or not start==1.
      //

      // Tested before the decrements below; on a start edge this sees
      // the newly loaded cnt.
      if ( cnt == 0 ) ready = 1;

      // Enable shifter i if it has steps remaining, then count the step.
      for ( int i=0; i<num_shifters; i++ ) begin
         shift[i] = cnt[i] > 0;
         if ( cnt[i] != 0 ) cnt[i]--;
      end

   end

endmodule





//////////////////////////////////////////////////////////////////////////////
/// Testbench Code
//
//  The code below instantiates some of the modules above,
//  provides test inputs, and verifies the outputs.
//
//  The testbench may be modified to facilitate your solution. Of
//  course, the removal of tests which your module fails is not a
//  method of fixing a broken module. (The idea is to put in tests
//  which make it easier to determine what the problem is, for
//  example, test inputs that are all 0's or all 1's.)


// cadence translate_off


// Produce a copy of clk, clk_reactive, that is driven from within a
// program block. Under the SystemVerilog scheduling rules program code
// runs in the reactive region set, so clk_reactive changes after the
// design's clocked code for the same clk edge has run. The testbench
// waits on clk_reactive rather than on clk.
program reactivate(output uwire clk_reactive, input uwire clk);
   assign clk_reactive = clk;
endprogram

// Testbench for the sequential shifters.
//
// Instantiates several shifter configurations, applies the same random
// value and shift amount to all of them, waits for each to assert ready
// (or for a timeout), and compares each output against sin << amt.
// A per-module error count and average cycle count are printed at the end.
module testbench;

   // Shift-amount width and data width used for every module under test.
   localparam int wid_lg = 6;
   localparam int wid = 1 << wid_lg;

   // Size of the sout and ready arrays (upper bound on idx values).
   localparam int max_units = 20;

   logic      clk;
   bit        done;   // Set to 1 when all tests have been applied.
   int cycle;         // Count of clock cycles since the start of simulation.

   // Outputs of the modules under test, indexed by Info.idx.
   uwire [wid-1:0] sout[max_units];
   uwire ready[max_units];

   // Inputs shared by all modules under test.
   logic [wid-1:0] sin;
   logic [wid_lg-1:0] amt;
   logic              start;

   // Per-module bookkeeping:
   //   idx        index into sout and ready.
   //   err_count  number of incorrect outputs seen.
   //   seq        1 if the testbench must wait for ready.
   //   sout       output captured for the current test.
   //   cyc_tot    total cycles spent waiting over all tests.
   typedef struct { int idx; int err_count = 0; bit seq = 0;
                    logic [wid-1:0] sout = 'h111; int cyc_tot = 0; } Info;

   // Modules under test, keyed by the name used in messages.
   Info pi[string];

   shift_lt_seq_d #(wid_lg,1) my_sld4(sout[4], ready[4], sin, amt, start, clk);
   initial begin
      automatic string m = "Degree 1";
      pi[m].idx = 4; pi[m].seq = 1;
   end

   shift_lt_seq_d #(wid_lg,3) my_sld5(sout[5], ready[5], sin, amt, start, clk);
   initial begin
      automatic string m = "Degree 3";
      pi[m].idx = 5; pi[m].seq = 1;
   end

   shift_lt_seq_d_live #(wid_lg,1) my_sld9(sout[9], ready[9], sin, amt, start, clk);
   initial begin
      automatic string m = "Degree 1 live";
      pi[m].idx = 9; pi[m].seq = 1;
   end

   shift_lt_seq_d_live #(wid_lg,3) my_sld2(sout[2], ready[2], sin, amt, start, clk);
   initial begin
      automatic string m = "Degree 3 live";
      pi[m].idx = 2; pi[m].seq = 1;
   end

   shift_lt_seq_d_sol #(wid_lg,1) my_sld1(sout[1], ready[1], sin, amt, start, clk);
   initial begin
      automatic string m = "Degree 1 sol";
      pi[m].idx = 1; pi[m].seq = 1;
   end

   shift_lt_seq_d_sol #(wid_lg,3) my_sld10(sout[10], ready[10], sin, amt, start, clk);
   initial begin
      automatic string m = "Degree 3 sol";
      pi[m].idx = 10; pi[m].seq = 1;
   end

   shift_lt_seq_d_p2 #(wid_lg,1) my_sld3(sout[3], ready[3], sin, amt, start, clk);
   initial begin
      automatic string m = "Degree 1 P2";
      pi[m].idx = 3; pi[m].seq = 1;
   end

   shift_lt_seq_d_p2 #(wid_lg,3) my_sld6(sout[6], ready[6], sin, amt, start, clk);
   initial begin
      automatic string m = "Degree 3 P2";
      pi[m].idx = 6; pi[m].seq = 1;
   end

   // tests_per_sa tests are run for each of the wid possible shift amounts.
   localparam int tests_per_sa = 50;
   localparam int num_tests = wid * tests_per_sa;

   // Simulation is ended if this many cycles elapse.
   localparam int cycle_limit = num_tests * wid * 2;

   // Copy of clk driven from a program block; the stimulus code below
   // waits on this signal rather than on clk.
   uwire clk_reactive;
   reactivate ra(clk_reactive,clk);

   // Clock generation and end-of-simulation control.
   initial begin
      clk = 0;
      cycle = 0;

      // Finish when the first of these completes: the tests are done or
      // the cycle limit is reached. (The clock loop never completes.)
      fork
         // Toggle clk every 10 time units. The post-increment yields the
         // value of clk before the toggle, so cycle advances each time
         // clk goes from 1 to 0.
         forever #10 cycle += clk++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   // Stimulus generation and output checking.
   initial begin

      // Number of test inputs (stimuli).
      //
      automatic int test_count = 0;

      done = 0;
      start = 1;

      // Hold start at 1 for two clock edges before the first test.
      @( posedge clk_reactive ); @( posedge clk_reactive );


      // Provide tests_per_sa test patterns for each shift amount.
      //
      for ( int i=0; i<num_tests; i++ ) begin

         // Cycle at which this test began and cycle at which to give up.
         automatic int cyc_start = cycle;
         automatic int cyc_timeout = cycle + wid * 2;

         // Expected output.
         logic [wid-1:0] shadow_sout;

         // Number of modules whose output has not yet been collected.
         int awaiting;

         // Advances by one every tests_per_sa tests.
         automatic logic [wid_lg-1:0] amt_1 = i / tests_per_sa;

         // Rotate amt_1 right by two bits to obtain the shift amount.
         amt = { amt_1[1:0], amt_1[wid_lg-1:2] };

         test_count++;

         // Fill sin with random data, 32 bits at a time.
         for ( int p=0; p<wid; p+=32 ) sin[p+:32] = $random;


         shadow_sout = sin << amt;

         // Assert start for one clock edge.
         start = 1;
         @( posedge clk_reactive );
         start = 0;

         // Collect output as ready signals go to 1, or immediately
         // for non-sequential modules.
         //
         // One process is forked per module. If ready is not seen by
         // cyc_timeout the output is collected anyway.
         //
         awaiting = pi.num();
         foreach ( pi[muti] ) begin
            automatic string mut = muti; // Bug workaround?
            fork begin
               while ( pi[mut].seq
                       && ready[pi[mut].idx] !== 1
                       && cycle < cyc_timeout )
                 @( posedge clk_reactive );
               awaiting--;
               pi[mut].sout = sout[pi[mut].idx];
               pi[mut].cyc_tot += cycle - cyc_start;
            end join_none;
         end

         // Continue once every forked process has collected its output.
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         // Only the first four errors for each module are printed.
         //
         foreach ( pi[ mut ] )
           if ( shadow_sout !== pi[mut].sout ) begin
              pi[mut].err_count++;
              if ( pi[mut].err_count < 5 )
                $write
                  ("%-20s wrong result for 0x%0h << %0d:  0x%0h != 0x%0h (correct)\n",
                   mut, sin, amt, pi[mut].sout, shadow_sout);
           end

      end

      // Signal the clock-control block that testing is complete.
      done = 1;

      // Print a summary line for each module under test.
      foreach ( pi[ mut ] )
         $write("Ran %4d tests for %-15s, %4d errors found. Avg cyc %.1f\n",
                  test_count, mut, pi[mut].err_count,
                pi[mut].seq ? real'(pi[mut].cyc_tot) / test_count : 1
                );
   end

endmodule

// cadence translate_on