////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2016 Homework 6
/// SOLUTION
//

 /// Assignment  https://www.ece.lsu.edu/koppel/v/2016/hw06.pdf


`default_nettype none

//////////////////////////////////////////////////////////////////////////////
///  Problem 0
//
 //  Look over but don't modify these modules.


// cadence translate_off

 /// Non-Synthesizable Mag Module --- Complete, Don't Edit
//
module mag_functional
  ( output shortreal mag,
    input shortreal v [3] );

   // Reference model used by the testbench.
   //
   // Drives mag with v[0]*v[0] + v[1]*v[1] + v[2]*v[2], accumulated
   // in index order starting from zero. Note that no square root is
   // taken: the value is the sum of squares of the components.
   //
   always_comb begin
      shortreal acc;
      acc = 0;
      foreach ( v[k] ) acc += v[k] * v[k];
      mag = acc;
   end

endmodule
// cadence translate_on

 /// Combinational Module  --- Complete, Don't Edit
//
module mag_comb
  ( output uwire [31:0] mag,
    input uwire [31:0] v [3] );

   // Purely combinational sum of squares:
   //
   //   mag = ( v[0]*v[0] + v[1]*v[1] ) + v[2]*v[2]
   //
   // Inputs and output are 32-bit values handed directly to the
   // ChipWare floating-point units.

   uwire [31:0]   vsq[3];     // vsq[i] = v[i] * v[i].
   uwire [7:0]    status[5];  // Status outputs of the five FP units; not read here.
   uwire [31:0]   sum01;      // vsq[0] + vsq[1].
   localparam    logic [2:0] rnd = 0; // 0 is round toward even.

   // All FP units are connected by port name. The ordered port lists
   // of CW_fp_mult and CW_fp_add do not match each other, so ordered
   // connections are easy to get wrong; named connections make the
   // wiring independent of the library's port order.

   // Three multipliers, one per vector component.
   //
   for ( genvar i=0; i<3; i++ )
     CW_fp_mult m1( .a(v[i]), .b(v[i]), .rnd(rnd),
                    .z(vsq[i]), .status(status[i]) );

   // Two chained adders.
   //
   CW_fp_add a1( .a(vsq[0]), .b(vsq[1]), .rnd(rnd),
                 .z(sum01), .status(status[3]) );
   CW_fp_add a2( .a(sum01),  .b(vsq[2]), .rnd(rnd),
                 .z(mag),   .status(status[4]) );

endmodule


//////////////////////////////////////////////////////////////////////////////
///  Problem 1
//
 /// Complete mag_seq so that it computes mag sequentially, using one
 /// fp add and one fp multiply module.
//
//     [x] Learn to use SimVision *before* wasting hours on simple problems.
//     [x] The code must be synthesizable.
//     [x] Make sure that the testbench does not report errors.
//     [x] Can use behavioral or implicit structural code.
//     [x] Do not rename modules or change ports.
//     [x] Must use exactly one CW_fp_add and one CW_fp_mult.
//     [x] Assume that data arrives at module inputs late in the clock cycle.

// cadence translate_off
class Debug;

   // Simulation-only record that the testbench fills in and shares
   // with the modules under test (through their db handles) so that
   // the current test's context can be inspected while debugging.

   // Clock-cycle bookkeeping.
   //
   int cycle,      // Testbench cycle counter.
       test_cyc,   // Number of cycles since test began.
       test_num;   // Index of the test currently running.

   // Current test vector, as shortreals and as raw 32-bit patterns.
   //
   shortreal vr[3];
   logic [31:0] v[3];

   // Correct result, as a shortreal and as a raw 32-bit pattern.
   //
   shortreal magr;
   logic [31:0] mag;

endclass
// cadence translate_on

module mag_seq
  ( output uwire [31:0] mag,
    output uwire ready,
    input uwire [31:0] v [3],
    input uwire start,
    input uwire clk );

   // Sequential sum of squares using one FP multiplier and one FP adder.
   //
   // Asserting start at a positive clock edge resets the step counter.
   // The counter then advances once per cycle until it reaches
   // last_step, where it holds and ready is asserted.
   //
   //   step   multiplier input   registers loaded at end of step
   //   ----   ----------------   ------------------------------------
   //    0     v[0]               accum[0] <= v[0]*v[0]
   //    1     v[1]               accum[1] <= v[1]*v[1]
   //    2     v[2]               accum[0] <= v[2]*v[2]
   //                             accum[1] <= accum[0] + accum[1]
   //    3     --                 accum[1] <= accum[0] + accum[1]
   //    4     --                 (none; ready = 1, mag holds the result)

   // cadence translate_off
   Debug db;
   // cadence translate_on

   // Rounding mode for both FP units.
   // 0 is round toward even (1 would be round towards zero).
   localparam logic [2:0] rnd = 0;

   // Step at which the result is complete.
   localparam int last_step = 4;

   logic [2:0]  step;        // Current position in the schedule above.
   logic [31:0] accum[2];    // The two operands of the adder.

   uwire [31:0] mul_in;      // Component being squared in this step.
   uwire [31:0] prod, sum;   // Multiplier and adder outputs.
   uwire [7:0]  mul_status, add_status;  // FP status outputs; not read.

   // Select the component to square. The selection is only consumed
   // in steps 0 to 2.
   //
   assign mul_in = v[ step ];

   CW_fp_mult m1( .a(mul_in),   .b(mul_in),   .rnd(rnd),
                  .z(prod), .status(mul_status) );
   CW_fp_add  a1( .a(accum[0]), .b(accum[1]), .rnd(rnd),
                  .z(sum),  .status(add_status) );

   always_ff @( posedge clk ) begin

      // Step counter: restart on start, otherwise count up and then
      // hold at last_step.
      //
      if ( start )
        step <= 0;
      else if ( step < last_step )
        step <= step + 1;

      // Operand registers, loaded according to the schedule above.
      //
      if ( step == 0 )
        accum[0] <= prod;
      else if ( step == 1 )
        accum[1] <= prod;
      else if ( step == 2 ) begin
         accum[0] <= prod;
         accum[1] <= sum;
      end
      else if ( step == 3 )
        accum[1] <= sum;

   end

   assign ready = step == last_step;
   assign mag   = accum[1];

endmodule


//////////////////////////////////////////////////////////////////////////////
///  Problem 2
//
 /// Complete mag_pipe so that it computes mag in pipelined fashion and
 /// has at most one fp operation delay per cycle.
//
//     [x] Learn to use SimVision *before* wasting hours on simple problems.
//     [x] The code must be synthesizable.
//     [x] Make sure that the testbench does not report errors.
//     [x] Can use behavioral or implicit structural code.
//     [x] Do not rename modules or change ports.
//     [x] Choose number of stages to maximize throughput (minimize delay).
//     [x] Use as many CW_fp_add and CW_fp_mult modules as needed, but no more.
//     [x] Assume that data arrives at module inputs late in the clock cycle.


module mag_pipe
  ( output uwire [31:0] mag,
    input uwire [31:0] v [3],
    input uwire clk );

   // Pipelined sum of squares with at most one FP operation between
   // any two registers:
   //
   //   v -> [pl_v] -> 3 x multiply -> [pl_vsq[1]] -> add -> [pl_sos[2]]
   //                                                 -> add -> [pl_sos[3]] -> mag
   //
   // The third product is carried alongside the first addition in
   // pl_vsq[2][2] so that it meets the partial sum at the second adder.

   // cadence translate_off
   Debug db;
   // cadence translate_on

   /// Do not rename nstages. The testbench examines its value and it must be set
   ///  correctly.
   //  A vector arriving at cycle t has its result available at
   //  cycle t + nstages.
   localparam int nstages = 4;

   // Rounding mode for all FP units.
   // 0 is round toward even (1 would be round towards zero).
   localparam logic [2:0] rnd = 0;

   ///
   /// Pipeline Registers
   ///
   //  For pl_vsq and pl_sos the first index is the number of the
   //  stage that reads the register.
   //
   logic [31:0] pl_v[3];          // Module inputs, read by stage 0.
   logic [31:0] pl_vsq[1:2][3];   // Squared components. Only [2][2] of row 2 is loaded.
   logic [31:0] pl_sos[2:3];      // Partial sum (2) and complete sum (3).

   ///
   /// Combinational Results
   ///
   uwire [31:0]  vsq[3], sum01, sum012;
   uwire [7:0]   s[5];            // FP status outputs; not read.

   // Stage 0: square each registered component.
   //
   for ( genvar i=0; i<3; i++ )
     CW_fp_mult m1( .a(pl_v[i]), .b(pl_v[i]), .rnd(rnd),
                    .z(vsq[i]), .status(s[i]) );

   // Stage 1: add the first two squares.
   //
   CW_fp_add a1( .a(pl_vsq[1][0]), .b(pl_vsq[1][1]), .rnd(rnd),
                 .z(sum01), .status(s[3]) );

   // Stage 2: add the third square to the partial sum.
   //
   CW_fp_add a2( .a(pl_sos[2]), .b(pl_vsq[2][2]), .rnd(rnd),
                 .z(sum012), .status(s[4]) );

   // All pipeline registers load on every positive clock edge. The
   // assignments are non-blocking, so their textual order is
   // immaterial; they are listed from the output end backwards.
   //
   always_ff @( posedge clk ) begin

      // Stage 2 -> 3: complete sum.
      pl_sos[3] <= sum012;

      // Stage 1 -> 2: partial sum, plus the third product (computed
      // in stage 0) passed along unchanged.
      pl_sos[2]    <= sum01;
      pl_vsq[2][2] <= pl_vsq[1][2];

      // Stage 0 -> 1 (products) and module input -> stage 0.
      for ( int i=0; i<3; i++ ) begin
         pl_vsq[1][i] <= vsq[i];
         pl_v[i]      <= v[i];
      end

   end

   assign mag = pl_sos[3];

endmodule

 // Synthesized hardware after optimization:
 // :


//////////////////////////////////////////////////////////////////////////////
/// Testbench Code
//
//  The code below instantiates some of the modules above,
//  provides test inputs, and verifies the outputs.
//
//  The testbench may be modified to facilitate your solution. Of
//  course, the removal of tests which your module fails is not a
//  method of fixing a broken module. (One might modify the testbench
//  so that the first tests it performs are those which make it easier
//  to determine what the problem is, for example, test inputs that
//  are all 0's or all 1's.)




// cadence translate_off

// Return a pseudo-random real in the range [minv, maxv).
//
// An unsigned 32-bit value drawn from $random is scaled by 2**32 to
// pick a point within the requested span.
//
function automatic real rand_real(real minv, real maxv);
      real span = maxv - minv;
      real draw = real'({$random});   // Concatenation makes the draw unsigned.
      return minv + span * draw / 2.0**32;
endfunction

// Return the absolute value of val.
//
function automatic shortreal fabs(shortreal val);
      if ( val < 0 ) return -val;
      return val;
endfunction

// Re-drive the clock and the cycle counter from inside a program
// block. The testbench waits on these copies rather than on the
// originals; per the testbench's own note, this lets it be sure the
// modules under test have finished the work triggered by the positive
// edge of the regular clock.
//
program reactivate
   (output uwire clk_reactive, output int cycle_reactive,
    input uwire clk, input var int cycle);
   assign clk_reactive   = clk,
          cycle_reactive = cycle;
endprogram


// Self-checking testbench for mag_comb, mag_seq and mag_pipe.
//
// Each test applies one three-component vector to every module under
// test, collects each module's output at the time it is expected to
// be valid, and compares it against the output of mag_functional.
//
module testbench();

   // Kind of module under test. Determines how long to wait for a result.
   typedef enum { MT_comb, MT_seq, MT_pipe } Module_Type;

   localparam int wid = 32;          // Width of the saved module output.
   localparam int max_latency = 10;  // Used only to size cycle_limit.
   localparam int num_tests = 16;
   localparam int nmuts = 10;        // Size of the per-module output arrays.
   int err[nmuts];                   // Not referenced in this module.

   uwire [31:0]  mag[nmuts];    // Module outputs, indexed by Info.idx.
   uwire         ready[nmuts];  // READY outputs, indexed by Info.idx.
   shortreal   magr;            // Reference result from mag_functional.
   shortreal vr[3];             // Current test vector as shortreals.
   logic [31:0] v[3];           // Current test vector as raw bits.
   logic [31:0] vp[3];          // Input stream presented to the pipelined module.
   logic        start;          // Start signal for the sequential module.

   // Per-module bookkeeping, stored in pi[] keyed by an informal module name.
   typedef struct
     {
      int idx;                        // Index into mag[] and ready[].
      int err_count = 0;              // Number of failing tests.
      int ncyc = 0;                   // Stage count (pipelined modules only).
      Module_Type mt = MT_comb;
      logic [wid-1:0] sout = 'h111;   // Output captured for the current test.
      int cyc_tot = 0;                // Sum over tests of cycles until capture.
      int latency = 0;                // Wait, in cycles, used for the current test.
      } Info;
   Info pi[string];

   // Simulation is abandoned if the cycle count reaches this limit.
   localparam int cycle_limit = num_tests * max_latency * 4;
   int cycle, cyc_start;   // Current cycle; cycle at which the current test began.
   bit done;               // Set when all tests have run.
   logic clock;
   bit   use_others;       // Set by the test loop to request a real vector on vp.

   // Copies of clock and cycle driven from within a program block.
   logic      clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Register a sequential module (one with a READY output) under NAME.
   task pi_seq(input int idx, input string name);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_seq;
   endtask

   // Register a pipelined module with NCYC stages under NAME.
   task pi_pipe(input int idx, input string name, input int ncyc);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_pipe;
      pi[m].ncyc = ncyc;
   endtask

   // Debug record shared with the modules under test.
   Debug db;
   initial db = new;

   // Clock generation and simulation termination.
   //
   initial begin
      clock = 0;
      cycle = 0;

      fork
         // Toggle clock every 10 time units. The post-increment adds
         // the old clock value to cycle, so cycle advances each time
         // clock goes from 1 to 0.
         forever #10 begin
            cycle += clock++;
            db.cycle = cycle;
            db.test_cyc = cycle - cyc_start;
         end
         // Finish when either the tests complete ...
         wait( done );
         // ... or the cycle limit is reached.
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   // Reference model and modules under test. Each module under test
   // is registered in pi[] with the index of its slot in mag[].
   //
   mag_functional mf( magr, vr );
   mag_comb m1( mag[0], v );
   initial pi["Comb."].idx = 0;   // Info.mt defaults to MT_comb.
   mag_seq m2( mag[1], ready[1], v, start, clock );
   initial begin pi_seq(1,"Seq."); m2.db = db; end
   mag_pipe m4( mag[3], vp, clock );
   initial begin pi_pipe(3,"Pipe",m4.nstages); m4.db = db; end

   // Input driver for vp and start.
   //
   // Two time units after each reactive positive clock edge, either
   // present the real test vector (when the test loop has requested
   // it through use_others) or present filler values.
   //
   initial begin

      while ( !done ) @( posedge clk_reactive ) #2

         if ( use_others ) begin

            // Present the test vector for one cycle and pulse start.
            vp = v;
            use_others = 0;
            start = 1;

         end else begin

            // Filler values derived from the cycle count, so that the
            // pipelined module sees other data between test vectors.
            vp[0] = $shortrealtobits(shortreal'(cycle-cyc_start));
            vp[1] = cycle - cyc_start;
            vp[2] = 0;
            start = 0;

         end
   end

   // Main test loop.
   //
   initial begin

      automatic int tot_errors = 0;   // Not updated or read below.

      done = 0;
      use_others = 0;
      start = 0;

      @( posedge clk_reactive );


      for ( int i=0; i<num_tests; i++ ) begin

         // Number of modules whose results have not yet been collected.
         automatic int awaiting = pi.num();

         db.test_num = i;
         cyc_start = cycle;
         db.test_cyc = 0;

         if ( i < 8 ) begin

            // In the first eight tests vector components are zero or one.
            //
            // Note: << binds tighter than &, so the condition below
            // is i & ( 1 << j ): component j is 1.0 when bit j of i is set.
            //
            for ( int j=0; j<3; j++ ) vr[j] = i & 1 << j ? 1.0 : 0.0;

         end else begin

            // In other tests vector components are randomly chosen.
            //
            for ( int j=0; j<3; j++ ) vr[j] = rand_real(-10,+10);

         end

         for ( int j=0; j<3; j++ ) v[j] = $shortrealtobits(vr[j]);
         db.vr = vr;
         db.v = v;

         // Record the reference result in the debug object. The #0
         // defers the copy so that it is made after mag_functional
         // has had a chance to respond to the new vr.
         //
         fork
            #0 begin
               db.magr = magr;
               db.mag = $shortrealtobits(magr);
            end
         join_none

         vp = v;
         use_others = 1;

         /// Collect Result (mag) From Each Module Under Test (mut)
         //
         foreach ( pi[muti] ) begin

            automatic string mut = muti;  // Informal name of module.
            automatic Info p = pi[mut];   // Not referenced below.

            // Create a child thread to get response from current mut.
            // The parent thread, without delay, proceeds to join_none.
            //
            fork begin

               // Cycles to wait before sampling: 1 for combinational
               // modules, 2 for sequential modules (which are then
               // polled for READY), and the stage count for pipelined
               // modules.
               //
               automatic int steps = pi[mut].ncyc;
               automatic int latency =
                pi[mut].mt == MT_comb ? 1 :
                pi[mut].mt == MT_seq ? 2 : steps;

               // Compute time at which result should be ready or
               // when to start examining a READY output.
               //
               automatic int eta = 1 + cyc_start + latency;

               pi[mut].latency = latency;

               // Wait (just this thread waits) until result should be ready.
               //
               wait ( cycle_reactive == eta );

               // If this module has a READY output, wait for it.
               //
               if ( pi[mut].mt == MT_seq ) wait( ready[pi[mut].idx] );

               // Decrement count of the number of modules we are waiting for.
               //
               awaiting--;

               // Store the module MAG output, it will be checked later
               // for correctness.
               //
               pi[mut].sout = mag[pi[mut].idx];

               // Accumulate cycles taken, for the average reported at the end.
               //
               pi[mut].cyc_tot += cycle - cyc_start;

               // This thread ends execution here.
            end join_none;

         end

         // Wait until data collected from all modules under test.
         //
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] ) begin

            // Assign module output to a shortreal.
            //
            automatic shortreal mmagr = $bitstoshortreal(pi[mut].sout);
            //
            // Note: pi[mut].sout is type logic which is assumed to be
            // an unsigned integer. However, the contents is really an
            // IEEE 754 single-precision float (shortreal in
            // SystemVerilog) and so $bitstoshortreal is used so that
            // pi[mut].sout is copied bit-for-bit unchanged to mmagr.

            // Compute difference between module output and expected
            // output.  With FP small differences can be okay, they might
            // occur, for example, due to differences in the order of
            // operations.
            //
            automatic shortreal err_mag = fabs( mmagr - magr );
            automatic bit okay = err_mag < 1e-4;

            // Count every failure, but print details only for the
            // first four failures of each module.
            //
            // NOTE(review): the vector is printed highest index
            // first, that is, as (vr[2],vr[1],vr[0]).
            //
            if ( !okay ) begin
              pi[mut].err_count++;
              if ( pi[mut].err_count < 5 )
                $write("%s test #%0d vec (%.1f,%.1f,%.1f) error: h'%8h  %7.4f != %7.4f (correct)\n",
                   mut, i, vr[2], vr[1], vr[0],
                       pi[mut].sout, mmagr, magr);
           end
         end

         // Note: == binds tighter than &, so the condition below is
         // {$random} & ( 1 == 1 ), which is the low bit of the random
         // value: keep waiting while that bit is 1.
         //
         while ( {$random} & 1 == 1 ) @( posedge clk_reactive );
         //
         // Note: By waiting for reactive clock we can be sure that
         // modules under test have completed all work due to the
         // positive edge of the regular clk. Wait a random amount of
         // time in case any modules are only correct at some stride.

      end

      // Per-module summary. The average cycle count is reported as 1
      // for combinational modules.
      //
      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].mt == MT_comb ? 1 : real'(pi[mut].cyc_tot) / num_tests);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on

`default_nettype uwire

// Load Verilog for ChipWare floating-point multiply and add modules.
//
`include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_fp_mult.v"
`include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_fp_add.v"