/// LSU EE 4755
//
 /// Multi-Step Sequential Module Examples.
//
 /// Based on the Following Assignments
//
//   2016 Final Exam Problem 1 -- Complete Verilog to match diagram.
//   Exam:     https://www.ece.lsu.edu/v/2016/fe.pdf
//   Solution: https://www.ece.lsu.edu/v/2016/fe_sol.pdf
//   Also written up in Sequential Problem Set,
//     https://www.ece.lsu.edu/v/guides/pset-syn-seq-main.pdf
//
 /// Multi-Step Overview
//
//   All modules have two inputs v0 and v1,
//     which are floating-point values.
//     
//   All modules set their output to v0*v0 + v0*v1 + v1*v1.
//
//   Here are the differences between them:
//
//   - ms_functional
//     Functional Verilog code.
//     Not intended for synthesis so there is no point describing the
//     type of hardware (combinational, sequential, etc).
//     This module is the easiest to understand and might be used
//       to check the results of the others.
//
//   - ms_comb
//     Combinational Logic
//     Uses 3 FP multipliers and 2 FP adders.
//     As combinational logic it has minimum latency, but might
//       require a lower clock frequency in the design in which
//       it is used.
//
//   - ms_seq
//     Sequential Implementation (Four Cycles)
//     Uses 1 FP multiplier and 1 FP adder.
//     As sequential logic:
//       - It has lower cost than the combinational design since it
//         only needs one adder and one mult.
//       - But it can only work on one calculation at a time, and each
//         calculation occupies it for several clock cycles.
//
//   - ms_seq_live
//     An unfinished version of ms_seq. Intended for use during lectures.
//
//   - ms_pipe
//     Pipelined Implementation (Three Stages)
//     Uses 3 FP multipliers and 2 FP adders.
//     As a (fully) pipelined 3-stage implementation:
//       - It uses as many FP units as the combinational version.
//       - It has a throughput of 1 (completes one calculation per cycle).
//       - It has a latency of 3 cycles.
//
//   - ms_pipe_live
//     An unfinished version of ms_pipe. Intended for use during lectures.



`default_nettype none


 /// Non-Synthesizable Mag Module --- Complete, Don't Edit
// cadence translate_off
module ms_functional
  ( output shortreal mag,
    input shortreal v0, v1 );

   // Reference (simulation-only) model: mag = v0*v0 + v0*v1 + v1*v1.
   // The parentheses only spell out the left-to-right association that
   // the expression already has, so the order of the FP operations is
   // unchanged.
   //
   always_comb
     begin
        mag = ( v0 * v0 + v0 * v1 ) + v1 * v1;
     end

endmodule
// cadence translate_on

module ms_comb
  #( int wsig = 23, wexp = 8, ieee = 1, wf = 1 + wexp + wsig )
   ( output uwire [wf-1:0] result,
     output uwire ready,
     input uwire [wf-1:0] v0, v1,
     input uwire start, clk);

   // Purely combinational computation of v0*v0 + v0*v1 + v1*v1 using
   // three FP multipliers and two FP adders.
   //
   //   wsig, wexp : significand and exponent widths passed to the FP units.
   //   ieee       : passed to the FP units as ieee_compliance.
   //   wf         : total width of an FP value (sign + exponent + significand).
   //
   // Inputs start and clk are not used by this module.

   localparam logic [2:0] rm = 0; // Rounding Mode
   localparam int nstages = 1;

   // Result is always valid (no sequencing), so ready is tied high.
   //
   assign ready = 1'b1;

   // Status outputs of the FP units. They are connected but not examined.
   //
   uwire [7:0] st_m00, st_m01, st_m11, st_a1, st_a2;

   /// Products: v00 = v0*v0,  v01 = v0*v1,  v11 = v1*v1.
   //
   uwire [wf-1:0] v00, v01, v11;

   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m00( .a(v0), .b(v0), .rnd(rm), .z(v00), .status(st_m00) );

   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m01( .a(v0), .b(v1), .rnd(rm), .z(v01), .status(st_m01) );

   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m11( .a(v1), .b(v1), .rnd(rm), .z(v11), .status(st_m11) );

   /// Sums: s1 = v00 + v11,  result = s1 + v01.
   //
   uwire [wf-1:0] s1;

   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a1( .a(v00), .b(v11), .rnd(rm), .z(s1), .status(st_a1) );

   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a2( .a(s1), .b(v01), .rnd(rm), .z(result), .status(st_a2) );

endmodule
// 



 /// Sequential Modules Below


 // 

 // 

module ms_seq_live
  ( output uwire [31:0] result,
    output uwire ready,
    input uwire [31:0] v0, v1,
    input uwire start, clk);

   /// Note: This module is not complete. It will be solved in class.
   //  See ms_seq, further below, for the solution.
   //
   //  As it stands, the FP unit inputs (mul_a, mul_b, add_a, add_b) and
   //  the result output are not driven, and ac0/ac1 are never assigned.

   localparam logic [2:0] rm = 0; // Rounding Mode
   localparam   int last_step = 1;  /// NEED TO SET CORRECTLY

   // Sequencing state and accumulators.
   //
   logic [2:0]  step;
   logic [31:0] ac0, ac1;

   // Connections to the single multiplier and single adder.
   //
   uwire [31:0] mul_a, mul_b, prod;
   uwire [31:0] add_a, add_b, sum;
   uwire [7:0] mul_s, add_s;

   // Step counter: cleared by start, then counts up and holds at last_step.
   //
   always_ff @( posedge clk )
     begin
        if ( start )
          step <= 0;
        else if ( step < last_step )
          step <= step + 1;
     end

   assign       ready = step == last_step; /// THIS MUST BE CHANGED.

   CW_fp_mult m1( .a(mul_a), .b(mul_b), .rnd(rm), .z(prod), .status(mul_s) );
   CW_fp_add  a1( .a(add_a), .b(add_b), .rnd(rm), .z(sum),  .status(add_s) );

endmodule



module ms_pipe_live
  ( output logic [31:0] result,   output uwire ready,
    input uwire [31:0] v0, v1,    input uwire start, clk);

   /// Note: This module is not complete. It will be solved in class.
   //  See ms_pipe, further below, for the solution.
   //
   //  As it stands, result and ready are not driven and the FP unit
   //  inputs (mul_a, mul_b, add_a, add_b) are unconnected nets.

   localparam logic [2:0] rm = 0; // Rounding Mode
   localparam int nstages = 3;

   // Connections to the multiplier.
   //
   uwire [31:0] mul_a, mul_b, prod;
   uwire [7:0] mul_s;

   CW_fp_mult m1( .a(mul_a), .b(mul_b), .rnd(rm), .z(prod), .status(mul_s) );

   // Connections to the adder.
   //
   uwire [31:0] add_a, add_b, sum;
   uwire [7:0] add_s;

   CW_fp_add  a1( .a(add_a), .b(add_b), .rnd(rm), .z(sum),  .status(add_s) );

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Spoiler Alert: Solutions Below
//












 // The surrounding lines intentionally left blank.  Solution Further Down


















 // The surrounding lines intentionally left blank.  Solution Further Down















 // The surrounding lines intentionally left blank.  Solution Further Down



















 // 

module ms_seq
  ( output logic [31:0] result,
    output logic ready,
    input uwire [31:0] v0, v1,
    input uwire start,
    input uwire clk );

   // Sequential computation of v0*v0 + v0*v1 + v1*v1 using one FP
   // multiplier and one FP adder.
   //
   // A calculation begins when start is sampled high at a positive clock
   // edge, which sets step to 0. Inputs v0 and v1 are read directly in
   // steps 0 to 2 (and v1 continues to feed the multiplier afterwards),
   // so they must be held stable by the user of this module.
   //
   //   step   multiplier     ac0 after edge   ac1 after edge
   //    0     v0 * v0        v0*v0            0
   //    1     v0 * v1        v0*v1            ac0 + ac1 (previous values)
   //    2     v1 * v1        v1*v1            ac0 + ac1 (previous values)
   //    3+    v1 * v1        v1*v1            unchanged
   //
   // The result output is the adder output, ac0 + ac1. Output ready is
   // cleared by start and set at the clock edge at which step == 3.

   localparam   int last_step = 4;
   localparam logic [2:0] rnd = 0; // 1 is round towards zero.

   logic [2:0]  step;
   logic [31:0] ac0, ac1;

   uwire [31:0] prod, sum;
   uwire [31:0] mul_a, mul_b, add_a, add_b;
   uwire [7:0] mul_s, add_s;

   // Multiplier operand selection based on the current step.
   //
   assign mul_a = step < 2  ? v0 : v1;
   assign mul_b = step == 0 ? v0 : v1;

   // The adder always adds the two accumulators.
   //
   assign add_a = ac0;
   assign add_b = ac1;

   CW_fp_mult m1( .a(mul_a), .b(mul_b), .rnd(rnd), .z(prod), .status(mul_s) );
   CW_fp_add  a1( .a(add_a), .b(add_b), .rnd(rnd), .z(sum),  .status(add_s) );

   assign result = sum;

   always_ff @( posedge clk )
     begin

        // Step counter: cleared by start, then counts up and holds
        // at last_step.
        //
        if ( start )
          step <= 0;
        else if ( step < last_step )
          step <= step + 1;

        // ac0 captures the product every cycle.
        //
        ac0 <= prod;

        // ac1 is cleared in step 0, accumulates in steps 1 and 2,
        // and otherwise holds its value.
        //
        case ( step )
           0:    ac1 <= 0;
           1, 2: ac1 <= sum;
        endcase

        // Uses the value of step from before this clock edge.
        //
        if ( start )
          ready <= 0;
        else if ( step == last_step-1 )
          ready <= 1;

     end

endmodule

module ms_pipe
  ( output uwire [31:0] result,
    output uwire ready,
    input uwire [31:0] v0, v1,
    input uwire start, clk);

   // Three-stage pipelined computation of v0*v0 + v0*v1 + v1*v1.
   // A new pair v0, v1 can be presented every cycle. The start input
   // travels alongside the data as an "occupied" bit and emerges as ready.

   localparam int nstages = 3;
   localparam logic [2:0] rm = 0; // Rounding Mode

   /// Stage 1: Compute the three products and latch them.
   //
   uwire [31:0] v00, v01, v11;
   uwire [7:0] mul_s1, mul_s2, mul_s3;
   logic [31:0] pl_1_v00, pl_1_v01, pl_1_v11;
   logic pl_1_occ;

   CW_fp_mult m00( .a(v0), .b(v0), .rnd(rm), .z(v00), .status(mul_s1) );
   CW_fp_mult m01( .a(v0), .b(v1), .rnd(rm), .z(v01), .status(mul_s2) );
   CW_fp_mult m11( .a(v1), .b(v1), .rnd(rm), .z(v11), .status(mul_s3) );

   always_ff @( posedge clk )
     begin
        pl_1_occ <= start;
        pl_1_v00 <= v00;
        pl_1_v01 <= v01;
        pl_1_v11 <= v11;
     end

   /// Stage 2: Add v0*v0 + v0*v1, carry v1*v1 along unchanged.
   //
   uwire [31:0] s1;
   uwire [7:0] a_s1;
   logic [31:0] pl_2_v0001, pl_2_v11;
   logic pl_2_occ;

   CW_fp_add  a1( .a(pl_1_v00), .b(pl_1_v01), .rnd(rm), .z(s1), .status(a_s1) );

   always_ff @( posedge clk )
     begin
        pl_2_occ   <= pl_1_occ;
        pl_2_v0001 <= s1;
        pl_2_v11   <= pl_1_v11;
     end

   /// Stage 3: Add the partial sum and v1*v1, latch the final sum.
   //
   uwire [31:0] s2;
   uwire [7:0] a_s2;
   logic [31:0] pl_3_sum;
   logic pl_3_occ;

   CW_fp_add  a2( .a(pl_2_v0001), .b(pl_2_v11), .rnd(rm), .z(s2), .status(a_s2) );

   always_ff @( posedge clk )
     begin
        pl_3_occ <= pl_2_occ;
        pl_3_sum <= s2;
     end

   /// Outputs come directly from the stage-3 registers.
   //
   assign result = pl_3_sum;
   assign ready  = pl_3_occ;

endmodule



// cadence translate_off

function automatic real rand_real(real minv, real maxv);
      // Return a pseudo-random real in the range [minv, maxv).
      //
      // {$random} is the 32-bit random value treated as unsigned, so
      // dividing by 2**32 scales it to the range [0,1).
      //
      automatic real span = maxv - minv;
      automatic real draw = real'({$random});
      return minv + span * draw / 2.0**32;
endfunction

function automatic shortreal fabs(shortreal val);
      // Return the absolute value of val.
      //
      if ( val < 0 ) return -val;
      return val;
endfunction

// Program block that forwards the testbench clock and cycle count.
//
// clk_reactive and cycle_reactive are copies of clk and cycle made by
// continuous assignments inside a program block. The testbench waits on
// these copies rather than on the originals so that, by the time it
// resumes, modules under test have finished their work for that clock
// edge (see the note near the end of the testbench).
//
program reactivate
   (output uwire clk_reactive, output int cycle_reactive,
    input uwire clk, input var int cycle);
   assign clk_reactive = clk;
   assign cycle_reactive = cycle;
endprogram


// Testbench for the multi-step modules.
//
// Instantiates the functional reference model (ms_functional) and four
// modules under test (ms_seq, ms_seq_live, ms_pipe, ms_pipe_live),
// applies num_tests input pairs, collects each module's output at the
// appropriate time, and compares it against the reference value.
// ms_comb is not instantiated here.
//
module testbench();

   // How a module under test is timed. See the latency computation in
   // the collection threads below.
   //
   typedef enum { MT_comb, MT_seq, MT_pipe } Module_Type;

   localparam int wid = 32;
   localparam int max_latency = 10;
   localparam int num_tests = 16;
   localparam int nmuts = 10;   // Size of the mag and ready arrays.
   int err[nmuts];              // NOTE(review): not referenced below.

   // Outputs of the modules under test, indexed by the idx given at
   // registration (pi_seq / pi_pipe).
   //
   uwire [31:0]  mag[nmuts];
   uwire         ready[nmuts];
   shortreal   magr;            // Reference result from ms_functional.
   shortreal vr[2];             // Current test inputs as shortreals.
   logic [31:0] v[2], vp[2];    // Inputs as bits: v held per test, vp per cycle.
   logic        start;

   // Per-module bookkeeping, stored in pi keyed by the module's
   // informal name.
   //
   typedef struct
     {
      int idx;                        // Index into mag[] and ready[].
      int err_count = 0;              // Number of failed tests.
      int ncyc = 0;                   // Number of stages (set by pi_pipe).
      Module_Type mt = MT_comb;
      logic [wid-1:0] sout = 'h111;   // Most recently collected output.
      int cyc_tot = 0;                // Sum of cycles-to-result over tests.
      int latency = 0;
      } Info;
   Info pi[string];

   // Simulation is abandoned if this many cycles elapse.
   //
   localparam int cycle_limit = num_tests * max_latency * 4;
   int cycle, cyc_start;
   bit done;
   logic clock;
   bit   use_others;   // Set by the test loop to request a start pulse.

   logic      clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Register a sequential module under test: output index and name.
   //
   task pi_seq(input int idx, input string name);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_seq;
   endtask

   // Register a pipelined module under test: output index, name, and
   // number of stages.
   //
   task pi_pipe(input int idx, input string name, input int ncyc);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_pipe;
      pi[m].ncyc = ncyc;
   endtask

   // Clock and cycle counter generation, and simulation termination.
   //
   initial begin
      clock = 0;
      cycle = 0;

      fork
         // Toggle clock every 10 time units. The post-increment yields
         // the old value of clock, so cycle increases by one each time
         // clock goes from 1 to 0.
         //
         forever #10 begin
            cycle += clock++;
         end
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      // Reached when done is set or the cycle limit is exceeded.
      //
      $finish();
   end

   // Reference model and modules under test. The sequential modules
   // are driven by v, the pipelined modules by vp.
   //
   ms_functional mf( magr, vr[0], vr[1] );
   ms_seq m2( mag[1], ready[1], v[0],v[1], start, clock );
   initial begin pi_seq(1,"Seq."); end
   ms_seq_live m2r( mag[2], ready[2], v[0],v[1], start, clock );
   initial begin pi_seq(2,"Seq. Live"); end
   ms_pipe m3( mag[3], ready[3], vp[0],vp[1], start, clock );
   initial begin pi_pipe(3,"Pipe",m3.nstages); end
   ms_pipe_live m4( mag[4], ready[4], vp[0],vp[1], start, clock );
   initial begin pi_pipe(4,"Pipe Live",m4.nstages); end

   // Stimulus driver. Two time units after each positive edge of the
   // reactive clock: if the test loop has requested it (use_others),
   // apply the test inputs to vp and assert start; otherwise deassert
   // start and put filler values derived from the cycle count on vp.
   //
   initial begin

      while ( !done ) @( posedge clk_reactive ) #2

         if ( use_others ) begin

            vp = v;
            use_others = 0;
            start = 1;

         end else begin

            vp[0] = $shortrealtobits(shortreal'(cycle-cyc_start));
            vp[1] = cycle - cyc_start;
            start = 0;

         end
   end

   // Main test loop.
   //
   initial begin

      automatic int tot_errors = 0;

      done = 0;
      use_others = 0;
      start = 0;

      @( posedge clk_reactive );


      for ( int i=0; i<num_tests; i++ ) begin

         // Number of modules that have not yet delivered a result.
         //
         automatic int awaiting = pi.num();

         cyc_start = cycle;

         if ( i < 4 ) begin

            // In the first four tests vector components are zero or one.
            // Since << binds tighter than &, the condition is
            // i & (1 << j), meaning that bit j of i selects the value.
            //
            for ( int j=0; j<2; j++ ) vr[j] = i & 1 << j ? 1.0 : 0.0;

         end else begin

            // In other tests vector components are randomly chosen.
            //
            for ( int j=0; j<2; j++ ) vr[j] = rand_real(-10,+10);

         end

         for ( int j=0; j<2; j++ ) v[j] = $shortrealtobits(vr[j]);

         // Apply inputs and ask the stimulus driver to pulse start.
         //
         vp = v;
         use_others = 1;

         /// Collect Result (mag) From Each Module Under Test (mut)
         //
         foreach ( pi[muti] ) begin

            automatic string mut = muti;  // Informal name of module.
            // NOTE(review): p is not referenced in the code below.
            automatic Info p = pi[mut];

            // Create a child thread to get response from current mut.
            // The parent thread, without delay, proceeds to join_none.
            //
            fork begin

               // Cycles to wait before sampling: 1 for MT_comb, 2 for
               // MT_seq (after which READY is awaited), otherwise the
               // number of stages given at registration.
               //
               automatic int steps = pi[mut].ncyc;
               automatic int latency =
                pi[mut].mt == MT_comb ? 1 :
                pi[mut].mt == MT_seq ? 2 : steps;

               // Compute time at which result should be ready or
               // when to start examining a READY output.
               //
               automatic int eta = 1 + cyc_start + latency;

               pi[mut].latency = latency;

               // Wait (just this thread waits) until result should be ready.
               //
               wait ( cycle_reactive == eta );

               // If this module has a READY output, wait for it.
               //
               if ( pi[mut].mt == MT_seq ) wait( ready[pi[mut].idx] );

               // Decrement count of the number of modules we are waiting for.
               //
               awaiting--;

               // Store the module MAG output, it will be checked later
               // for correctness.
               //
               pi[mut].sout = mag[pi[mut].idx];

               // Accumulate cycles taken, used for the average reported
               // at the end.
               //
               pi[mut].cyc_tot += cycle - cyc_start;

               // This thread ends execution here.
            end join_none;

         end

         // Wait until data collected from all modules under test.
         //
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] ) begin

            // Assign module output to a shortreal.
            //
            automatic shortreal mmagr = $bitstoshortreal(pi[mut].sout);
            //
            // Note: pi[mut].sout is type logic which is assumed to be
            // an unsigned integer. However, the contents is really an
            // IEEE 754 single-precision float (shortreal in
            // SystemVerilog) and so $bitstoshortreal is used so that
            // pi[mut].sout is copied bit-for-bit unchanged to mmagr.

            // Compute difference between module output and expected
            // output.  With FP small differences can be okay, they might
            // occur, for example, due to differences in the order of
            // operations.
            //
            automatic shortreal err_mag = fabs( mmagr - magr );
            automatic bit okay = err_mag < 1e-4;

            // Count every error but print only the first four per module.
            // Note that the message shows vr[1] before vr[0].
            //
            if ( !okay ) begin
              pi[mut].err_count++;
              if ( pi[mut].err_count < 5 )
                $write("%s test #%0d vec (%.1f,%.1f) error: h'%8h  %7.4f != %7.4f (correct)\n",
                   mut, i, vr[1], vr[0],
                       pi[mut].sout, mmagr, magr);
           end
         end

         // Since == binds tighter than &, the condition below is
         // {$random} & (1 == 1), which tests the low bit of the random
         // value, the same outcome as ( {$random} & 1 ) == 1.
         //
         while ( {$random} & 1 == 1 ) @( posedge clk_reactive );
         //
         // Note: By waiting for reactive clock we can be sure that
         // modules under test have completed all work due to the
         // positive edge of the regular clk. Wait a random amount of
         // time in case any modules are only correct at some stride.

      end

      // Summary: one line per module under test.
      //
      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].mt == MT_comb ? 1 : real'(pi[mut].cyc_tot) / num_tests);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on

`default_nettype uwire

`include "/apps/linux/cadence/GENUS211/share/synth/lib/chipware/sim/verilog/CW/CW_fp_mult.v"
`include "/apps/linux/cadence/GENUS211/share/synth/lib/chipware/sim/verilog/CW/CW_fp_add.v"