/// LSU EE 4755
//
// Code based on 2016 Final Exam Problem 1.
// This file includes code written in class on 8 November 2021, 15:52:44 CST.

`default_nettype none

// cadence translate_off
 /// Non-Synthesizable Mag Module --- Complete, Don't Edit

// Behavioral (non-synthesizable) reference model.
//
// Ports:
//   mag     - output, shortreal: v0*v0 + v0*v1 + v1*v1.
//   v0, v1  - inputs, shortreal operands.
//
// The testbench in this file instantiates this module to produce the
// expected value that the modules under test are compared against.
//
module prob1_functional
  ( output shortreal mag,
    input shortreal v0, v1 );

   // Purely combinational: mag is re-evaluated whenever v0 or v1 changes.
   // The expression is left exactly as written so that the order of the
   // floating-point operations (and hence rounding) is unchanged.
   always_comb mag = v0 * v0 + v0 * v1 + v1 * v1;

endmodule
// cadence translate_on


 // 


 /// Version of module written in class  8 November 2021, 15:34:37 CST
 //
// Sequential computation of  v0*v0 + v0*v1 + v1*v1  using one FP
// multiplier and one FP adder.
//
// Ports:
//   result - output, bit pattern produced by the FP adder (ac0 + ac1).
//   ready  - output, set once the complete sum is available; cleared
//            by start.
//   v0, v1 - inputs, operand bit patterns fed to the FP multiplier.
//            They must be held stable at least through step 2.
//   start  - input, sampled at the positive clock edge; restarts the
//            computation (step <= 0, ready <= 0).
//   clk    - input, clock.
//
module prob1_seq
  ( output uwire [31:0] result,
    output logic ready,
    input uwire [31:0] v0, v1,
    input uwire start, clk);

   uwire [7:0] mul_s, add_s;   // Status outputs of the FP units, unused.
   uwire [31:0] prod;          // Multiplier output for the current step.

   logic [31:0] ac0, ac1;      // Adder operands: latest product, running sum.
   logic [2:0]  step;

   // Step at which the step counter stops advancing.  ready is set on
   // the clock edge that ends step last_step-1.
   localparam   int last_step = 4;

   always_ff @( posedge clk )
     if ( start ) step <= 0; else if ( step < last_step ) step <= step + 1;

   logic [31:0] mul_a, mul_b;

   // Select the multiplier operands for the current step.
   always_comb begin

      case ( step )

        0: begin mul_a = v0;  mul_b = v0; end
        1: begin mul_a = v0;  mul_b = v1; end
        2: begin mul_a = v1;  mul_b = v1; end
        // From step 3 on the product is no longer captured (see below),
        // so the operands chosen here do not affect result.
        default begin mul_a = v0;  mul_b = v1; end

      endcase

   end

   // Schedule (register values as seen DURING each step):
   //
   // Step  multiplier   ac0      ac1              result = ac0 + ac1
   //  0    v0*v0        -        -                -
   //  1    v0*v1        v0*v0    0                v0*v0
   //  2    v1*v1        v0*v1    v0*v0            v0*v0 + v0*v1
   //  3    (unused)     v1*v1    v0*v0 + v0*v1    complete sum
   //  4    (unused)     v1*v1    v0*v0 + v0*v1    complete sum, ready = 1

   always_ff @( posedge clk ) begin

      // Capture a product only while one is still needed.  Previously
      // ac0 was loaded on every clock, so on the edge that set ready it
      // was overwritten with the default-case product (v0*v1) and
      // result was wrong for as long as ready was asserted.
      if ( step < last_step - 1 ) ac0 <= prod;

      // Running sum: cleared in step 0, accumulates the adder output in
      // steps 1 and 2, then holds.
      ac1 <= step == 0 ? 32'h0 : step < last_step - 1 ? result : ac1;

      // ready is cleared by start and set at the end of step last_step-1.
      ready <= start ? 1'b0 : step == last_step - 1 ? 1'b1 : ready;

   end


   CW_fp_mult m1( .a(mul_a), .b(mul_b), .rnd(3'd0), .z(prod), .status(mul_s));
   CW_fp_add  a1( .a(ac0), .b(ac1), .rnd(3'd0), .z(result),  .status(add_s));

endmodule

 // 

// Solution version of the sequential v0*v0 + v0*v1 + v1*v1 unit.
//
// Ports:
//   result - output, bit pattern of the FP adder output (ac0 + ac1).
//   ready  - output, cleared by start, set on the clock edge that ends
//            step last_step-1.
//   v0, v1 - inputs, operand bit patterns for the FP multiplier.
//   start  - input, sampled at the positive clock edge; restarts the
//            step counter.
//   clk    - input, clock.
//
module prob1_seq_sol
  ( output logic [31:0] result,
    output logic ready,
    input uwire [31:0] v0, v1,
    input uwire start,
    input uwire clk );

   // Rounding mode passed to both FP units (1 is round towards zero).
   localparam logic [2:0] rnd = 0;

   // Step at which the step counter saturates.
   localparam int last_step = 4;

   logic [2:0]  step;
   logic [31:0] ac0, ac1;   // Latest product and running sum.

   uwire [7:0]  mul_status, add_status;
   uwire [31:0] prod, sum;

   // Multiplier operands: v0*v0 in step 0, v0*v1 in step 1, and v1*v1
   // from step 2 onward.
   uwire [31:0] mul_a = step < 2  ? v0 : v1;
   uwire [31:0] mul_b = step == 0 ? v0 : v1;

   CW_fp_mult m1
     ( .a(mul_a), .b(mul_b), .rnd(rnd), .z(prod), .status(mul_status) );

   CW_fp_add a1
     ( .a(ac0), .b(ac1), .rnd(rnd), .z(sum), .status(add_status) );

   assign result = sum;

   always_ff @( posedge clk )
     begin

        // Step counter: restarted by start, otherwise counts up to
        // last_step and stays there.
        if ( start )
          step <= 0;
        else if ( step < last_step )
          step <= step + 1;

        // The product register is loaded on every clock.
        ac0 <= prod;

        // Running sum: cleared in step 0, accumulated in steps 1 and 2,
        // held in every other step.
        case ( step )
           0:    ac1 <= 0;
           1, 2: ac1 <= sum;
        endcase

        if ( start )
          ready <= 0;
        else if ( step == last_step-1 )
          ready <= 1;

     end

endmodule




// cadence translate_off

// Return a pseudo-random real in the range [minv, maxv).
//
// One call to $random supplies an unsigned 32-bit draw, which is scaled
// by 2**32 and mapped onto the requested interval.
//
function automatic real rand_real(real minv, real maxv);
      real span = maxv - minv;
      real draw = real'({$random});
      return minv + span * draw / 2.0**32;
endfunction

// Absolute value of a shortreal: negates val when it compares less
// than zero, otherwise returns val unchanged.
//
function automatic shortreal fabs(shortreal val);
      if ( val < 0 ) return -val;
      return val;
endfunction

// Program block that forwards the testbench clock and cycle count
// unchanged.  Because the forwarding assignments live in a program,
// the testbench can wait on the forwarded copies instead of on the
// originals.
//
//   clk_reactive   - output, copy of clk.
//   cycle_reactive - output, copy of cycle.
//   clk            - input, testbench clock.
//   cycle          - input, testbench cycle counter.
//
program reactivate
   ( output uwire clk_reactive,
     output int cycle_reactive,
     input uwire clk,
     input var int cycle );

   assign clk_reactive   = clk,
          cycle_reactive = cycle;

endprogram


// Testbench for the sequential modules in this file.
//
// For each of num_tests input vectors it starts every registered
// module under test (mut), waits for the mut's ready output, captures
// its result, and compares the result against the value produced by
// prob1_functional.  A per-module summary is printed at the end.
//
module testbench();

   // Kind of module under test; selects how the result is collected.
   typedef enum { MT_comb, MT_seq, MT_pipe } Module_Type;

   localparam wid = 32;
   localparam max_latency = 10;
   localparam int num_tests = 16;
   localparam   int nmuts = 10;   // Size of the per-mut arrays below.
   int err[nmuts];

   uwire [31:0]  mag[nmuts];      // Result output of each mut, by index.
   uwire         ready[nmuts];    // Ready output of each mut, by index.
   shortreal   magr;              // Expected value from prob1_functional.
   shortreal vr[2];               // Test vector as shortreals.
   logic [31:0] v[2];             // Test vector as bit patterns.
   logic [31:0] vp[2];            // Inputs for a pipelined mut (none is
                                  // instantiated at present).
   logic        start;

   // Per-module bookkeeping, keyed by the module's informal name.
   typedef struct
     {
      int idx;                      // Index into mag[] and ready[].
      int err_count = 0;            // Number of failed tests.
      int ncyc = 0;                 // Stage count, set only by pi_pipe.
      Module_Type mt = MT_comb;
      logic [wid-1:0] sout = 'h111; // Most recently captured output.
      int cyc_tot = 0;              // Sum of cycles taken over all tests.
      int latency = 0;
      } Info;
   Info pi[string];

   // Simulation is abandoned if this many cycles elapse.
   localparam int cycle_limit = num_tests * max_latency * 4;
   int cycle, cyc_start;
   bit done;
   logic clock;
   bit   use_others;

   // Copies of clock and cycle forwarded through the reactivate program.
   logic      clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Register a sequential module: name is its informal name, idx its
   // position in mag[] and ready[].
   task pi_seq(input int idx, input string name);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_seq;
   endtask

   // Register a pipelined module with ncyc stages.
   task pi_pipe(input int idx, input string name, input int ncyc);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_pipe;
      pi[m].ncyc = ncyc;
   endtask

   // Clock generator and watchdog.
   initial begin
      clock = 0;
      cycle = 0;

      fork
         // Every 10 time units: add the pre-increment value of clock to
         // cycle, then increment (toggle) the one-bit clock.  cycle
         // therefore advances once per clock period.
         forever #10 begin
            cycle += clock++;
         end
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;
      // Reached when either done is set or the cycle limit is hit.

      $finish();
   end

   // Reference model and modules under test.
   prob1_functional mf( magr, vr[0], vr[1] );
   prob1_seq_sol m2( mag[1], ready[1], v[0],v[1], start, clock );
   initial begin pi_seq(1,"Seq. Sol"); end
   prob1_seq m2r( mag[2], ready[2], v[0],v[1], start, clock );
   initial begin pi_seq(2,"Seq."); end
   //  prob1_pipe m4( mag[3], vp, clock );
   //  initial begin pi_pipe(3,"Pipe",m4.nstages); m4.db = db; end

   // Input driver.  Two time units after each reactive clock edge it
   // either launches a new test (start = 1, vp = v) or de-asserts start
   // and puts cycle-derived filler values on vp.
   initial begin

      while ( !done ) @( posedge clk_reactive ) #2

         if ( use_others ) begin

            vp = v;
            use_others = 0;
            start = 1;

         end else begin

            vp[0] = $shortrealtobits(shortreal'(cycle-cyc_start));
            vp[1] = cycle - cyc_start;
            start = 0;

         end
   end

   // Main test sequence.
   initial begin

      automatic int tot_errors = 0;

      done = 0;
      use_others = 0;
      start = 0;

      @( posedge clk_reactive );


      for ( int i=0; i<num_tests; i++ ) begin

         // Number of muts that have not yet delivered a result.
         automatic int awaiting = pi.num();

         cyc_start = cycle;

         if ( i < 4 ) begin

            // In the first four tests vector components are zero or one:
            // component j is one when bit j of i is set.  (<< binds
            // tighter than &, so the test is i & (1 << j).)
            //
            for ( int j=0; j<2; j++ ) vr[j] = i & 1 << j ? 1.0 : 0.0;

         end else begin

            // In other tests vector components are randomly chosen.
            //
            for ( int j=0; j<2; j++ ) vr[j] = rand_real(-10,+10);

         end

         for ( int j=0; j<2; j++ ) v[j] = $shortrealtobits(vr[j]);

         vp = v;
         use_others = 1;  // Ask the input driver to assert start.

         /// Collect Result (mag) From Each Module Under Test (mut)
         //
         foreach ( pi[muti] ) begin

            automatic string mut = muti;  // Informal name of module.
            automatic Info p = pi[mut];

            // Create a child thread to get response from current mut.
            // The parent thread, without delay, proceeds to join_none.
            //
            fork begin

               automatic int steps = pi[mut].ncyc;
               automatic int latency =
                pi[mut].mt == MT_comb ? 1 :
                pi[mut].mt == MT_seq ? 2 : steps;

               // Compute time at which result should be ready or
               // when to start examining a READY output.
               //
               automatic int eta = 1 + cyc_start + latency;

               pi[mut].latency = latency;

               // Wait (just this thread waits) until result should be ready.
               //
               wait ( cycle_reactive == eta );

               // If this module has a READY output, wait for it.
               //
               if ( pi[mut].mt == MT_seq ) wait( ready[pi[mut].idx] );

               // Decrement count of the number of modules we are waiting for.
               //
               awaiting--;

               // Store the module MAG output, it will be checked later
               // for correctness.
               //
               pi[mut].sout = mag[pi[mut].idx];

               pi[mut].cyc_tot += cycle - cyc_start;

               // This thread ends execution here.
            end join_none;

         end

         // Wait until data collected from all modules under test.
         //
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] ) begin

            // Assign module output to a shortreal.
            //
            automatic shortreal mmagr = $bitstoshortreal(pi[mut].sout);
            //
            // Note: pi[mut].sout is type logic which is assumed to be
            // an unsigned integer. However, the contents is really an
            // IEEE 754 single-precision float (shortreal in
            // SystemVerilog) and so $bitstoshortreal is used so that
            // pi[mut].sout is copied bit-for-bit unchanged to mmagr.

            // Compute difference between module output and expected
            // output.  With FP small differences can be okay, they might
            // occur, for example, due to differences in the order of
            // operations.
            //
            automatic shortreal err_mag = fabs( mmagr - magr );
            automatic bit okay = err_mag < 1e-4;

            // Count every failure but print only the first four per
            // module.  The vector is printed as (vr[1],vr[0]).
            if ( !okay ) begin
              pi[mut].err_count++;
              if ( pi[mut].err_count < 5 )
                $write("%s test #%0d vec (%.1f,%.1f) error: h'%8h  %7.4f != %7.4f (correct)\n",
                   mut, i, vr[1], vr[0],
                       pi[mut].sout, mmagr, magr);
           end
         end

         // Note on precedence: == binds tighter than &, so the condition
         // below is {$random} & (1 == 1), i.e. the low bit of the draw.
         while ( {$random} & 1 == 1 ) @( posedge clk_reactive );
         //
         // Note: By waiting for reactive clock we can be sure that
         // modules under test have completed all work due to the
         // positive edge of the regular clk. Wait a random amount of
         // time in case any modules are only correct at some stride.

      end

      // Per-module summary.
      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].mt == MT_comb ? 1 : real'(pi[mut].cyc_tot) / num_tests);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on

`default_nettype wire
`include "/apps/linux/cadence/GENUS211/share/synth/lib/chipware/sim/verilog/CW/CW_fp_mult.v"
`include "/apps/linux/cadence/GENUS211/share/synth/lib/chipware/sim/verilog/CW/CW_fp_add.v"