////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2024 Homework 2 -- SOLUTION
//

 /// Assignment  https://www.ece.lsu.edu/koppel/v/2024/hw02.pdf
 /// Solution writeup  https://www.ece.lsu.edu/koppel/v/2024/hw02_sol.pdf

`default_nettype none


//////////////////////////////////////////////////////////////////////////////
///  Problem 1
//
  ///  Complete comp_fp (below) so the comparison is done in FP.
//
//     [✔] Use Chipware modules for floating-point operations and conversions.
//
//     [✔] Make sure that the testbench does not report errors.
//     [✔] Module must be synthesizable. Use command: genus -files syn.tcl
//
//     [✔] Don't assume any particular parameter values.
//
//     [✔] Code must be written clearly.

// Rounding-mode codes. In this file they are passed to the rnd ports
// of the Chipware floating-point modules.
typedef enum logic [2:0] {
   Rnd_to_even      = 0,
   Rnd_to_0         = 1,
   Rnd_to_plus_inf  = 2,
   Rnd_to_minus_inf = 3,
   Rnd_to_near_up   = 4,
   Rnd_from_0       = 5
} Rnd;

module comp_fp
  #( int w_c = 5, w_s = 2, w_exp = 5, w_sig = 5, w_sig2 = 6,
     int w_fp = 1 + w_exp + w_sig, w_fp2 = 1 + w_exp + w_sig2 )
   ( output uwire gt,
     output uwire [w_fp2-1:0] ssum,
     input uwire [w_fp-1:0] a, b,
     input uwire [w_s-1:0] s,
     input uwire [w_c-1:0] c );

   /// SOLUTION -- Problem 1
   //
   // Forms ssum = a * 2^s + b in a format whose significand is w_sig2
   // bits wide, and sets gt when ssum compares greater than c.  Inputs
   // a and b use a w_sig-bit significand; c is an unsigned integer.

   // Number of zero bits appended when widening a significand from
   // w_sig to w_sig2 bits.
   //
   localparam int w_pad = w_sig2 - w_sig;

   // Scale a by 2^s. Shifting s left by w_sig lines it up with the
   // exponent field, so adding it to a increments the exponent by s.
   // An all-zero a is passed through unchanged.
   //
   uwire [w_fp-1:0] exp_incr = s << w_sig;
   uwire [w_fp-1:0] a_scaled = ( |a ) ? a + exp_incr : a;

   // Widen both addends to the w_sig2-bit significand format. The
   // assignment extends each value to w_fp2 bits and the left shift
   // then fills the new low-order significand bits with zeros.
   //
   uwire [w_fp2-1:0] a_wide = a_scaled << w_pad;
   uwire [w_fp2-1:0] b_wide = b << w_pad;

   // ssum = a_wide + b_wide. The status output is left open.
   //
   CW_fp_add #( .exp_width(w_exp), .sig_width(w_sig2) )
   d2( .a(a_wide), .b(b_wide), .rnd(Rnd_to_near_up),
       .z(ssum), .status() );

   // Convert the integer c (isign = 0) to the widened FP format.
   //
   uwire [w_fp2-1:0] c_fp;
   CW_fp_i2flt #( .exp_width(w_exp), .sig_width(w_sig2), .isize(w_c), .isign(0) )
   coa( .a(c), .rnd(Rnd_to_even), .z(c_fp), .status() );

   // gt = ssum > c. Only the a-greater-than-b output of the
   // comparison unit is used; its other outputs are left open.
   //
   CW_fp_cmp #( .exp_width(w_exp), .sig_width(w_sig2) )
   cmp( .a(ssum), .b(c_fp), .zctr(1'b0),
        .agtb(gt), .altb(), .aeqb(), .unordered(),
        .z0(), .z1(), .status0(), .status1() );

endmodule


//////////////////////////////////////////////////////////////////////////////
///  Problem 2
//
  ///  Complete comp_int (below) so the comparison is done as an int.
//
//     [✔] Use Chipware modules for floating-point operations and conversions.
//
//     [✔] Make sure that the testbench does not report errors.
//     [✔] Module must be synthesizable. Use command: genus -files syn.tcl
//
//     [✔] Don't assume any particular parameter values.
//
//     [✔] Code must be written clearly.

module comp_int
  #( int w_c = 5, w_s = 2, w_exp = 5, w_sig = 5, w_sig2 = 6,
     int w_fp = 1 + w_exp + w_sig, w_fp2 = 1 + w_exp + w_sig2 )
   ( output uwire gt,
     output uwire [w_fp2-1:0] ssum,
     input uwire [w_fp-1:0] a, b,
     input uwire [w_s-1:0] s,
     input uwire [w_c-1:0] c );

   /// SOLUTION -- Problem 2
   //
   // Same ports as comp_fp, but gt is found by converting ssum to an
   // integer and comparing that integer with c.

   // Reuse comp_fp to obtain ssum. Its gt output is wired to a signal
   // that nothing reads; per the original author's note, the logic
   // feeding that output is then not synthesized.
   //
   uwire gt_unused;  // Intentionally left unread.
   comp_fp #( .w_c(w_c), .w_s(w_s), .w_exp(w_exp), .w_sig(w_sig), .w_sig2(w_sig2) )
   fp( .gt(gt_unused), .ssum(ssum), .a(a), .b(b), .s(s), .c(c) );

   // Convert ssum to an integer that is one bit wider than c; the
   // extra bit is there for the sign.
   // NOTE(review): Rnd_to_plus_inf presumably makes any fractional
   // part of ssum push the integer above c -- confirm against the
   // solution writeup.
   //
   localparam int w_int = w_c + 1;
   uwire [w_int-1:0] sum_int;
   uwire [7:0] cvt_status;
   CW_fp_flt2i #( .sig_width(w_sig2), .exp_width(w_exp), .isize(w_int) ) ftoi
     ( .a(ssum), .rnd(Rnd_to_plus_inf), .z(sum_int), .status(cvt_status) );

   // Conditions that decide gt.
   //
   uwire sum_negative  = ssum[w_fp2-1];   // Sign bit of ssum.
   uwire int_overflow  = cvt_status[6];   // Used as: ssum too big for sum_int.
   uwire int_exceeds_c = sum_int > c;

   // A negative ssum is never greater than the unsigned c. Otherwise
   // ssum > c when the conversion overflowed (ssum does not fit, so it
   // exceeds every w_c-bit value) or when the integer exceeds c.
   //
   assign gt = !sum_negative && ( int_overflow || int_exceeds_c );

endmodule

`ifdef DONT_DEFINE_ME

Module Name                           Area   Delay   Delay     Synth
                                            Actual  Target      Time
comp_int_w_c6_w_s6_w_exp7_w_sig6_w_sig27
                                    148989   23.33   900.0 ns     17 s
comp_fp_w_c6_w_s6_w_exp7_w_sig6_w_sig27
                                    140689   20.42   900.0 ns     13 s
comp_int_w_c12_w_s6_w_exp7_w_sig6_w_sig213
                                    244127   30.05   900.0 ns     19 s
comp_fp_w_c12_w_s6_w_exp7_w_sig6_w_sig213
                                    210213   26.19   900.0 ns     17 s


`endif



//////////////////////////////////////////////////////////////////////////////
/// Testbench Code


// cadence translate_off

virtual class conv #(int wexp=6, wsig=10);
   // Convert between real and fp types using parameter-provided
   // exponent and significand sizes.
   //
   // The FP layout handled here is { sign, wexp-bit biased exponent,
   // wsig-bit significand fraction }, w bits in total. Every function
   // is static and is called through the class scope, for example
   // conv#(7,6)::rtof(1.5).

   localparam int w = 1 + wexp + wsig;              // Total width.
   localparam int bias_r = ( 1 << 11 - 1 ) - 1;     // Exponent bias of a real, 1023.
   localparam int w_sig_r = 52;                     // Fraction bits of a real.
   localparam int w_exp_r = 11;                     // Exponent bits of a real.
   localparam int bias_h = ( 1 << wexp - 1 ) - 1;   // Exponent bias of this format.

   // Convert real r to the w-bit format.
   //
   // A zero r yields all-zero bits. Otherwise the sign is copied, the
   // exponent is re-biased and cast to wexp bits (no range check), and
   // the top wsig fraction bits are kept; the remaining fraction bits
   // are discarded, so the value is truncated rather than rounded.
   //
   static function logic [w-1:0] rtof( real r );
      logic [wsig-1:0] sig_f;
      logic [w_sig_r-wsig-2:0] sig_x;   // Discarded low fraction bits.
      logic sig_x_msb;                  // Discarded bit just below sig_f.
      logic [w_exp_r-1:0] exp_r;
      logic sign_r;
      { sign_r, exp_r, sig_f, sig_x_msb, sig_x } = $realtobits(r);
      // So, what about a rounding mode? Not now!
      rtof = !r ? 0 : { sign_r, wexp'( exp_r + bias_h - bias_r ), sig_f };
   endfunction

   // Convert w-bit value f to a real.
   //
   // All-zero bits yield 0.0. Any other pattern is re-biased and its
   // fraction padded with zeros, so an exponent field of zero is not
   // given special treatment here.
   //
   static function real ftor( logic [w-1:0] f );
      ftor = !f ? 0.0
        : $bitstoreal
          ( { f[w-1],
              w_exp_r'( bias_r + f[w-2:wsig] - bias_h ),
              f[wsig-1:0], (w_sig_r-wsig)'(0) } );
   endfunction

   // Return a measure, in bits, of how much a and b differ.
   //
   //   - 1 << wexp  if either argument has unknown bits.
   //   - 0          if the arguments are identical.
   //   - If either exponent field is zero, a value computed from the
   //     significand bits alone.
   //   - delta_e + wsig  if the exponents differ by more than one.
   //   - Otherwise $clog2( d + 1 ), where d is the magnitude of the
   //     difference of the significands (hidden one included) after
   //     aligning them to the smaller exponent.
   //
   static function int err_bits( logic [w-1:0] a, b );

      logic [wsig-1:0] sig_a, sig_b;
      logic [wsig+2:0] frac_a, frac_b, frac_diff;
      logic [wexp-1:0] exp_a, exp_b;
      logic s_a, s_b;
      int delta_e;

      if ( $isunknown(a) || $isunknown(b) ) return 1 << wexp;
      if ( a == b ) return 0;

      { s_a, exp_a, sig_a } = a;
      { s_b, exp_b, sig_b } = b;

      if ( exp_a == 0 || exp_b == 0 ) begin
         logic [wsig-1:0] sig = ~ ( sig_a | sig_b );
         return 1 + wsig - $clog2( sig + 1 );
      end

      // Magnitude of the exponent difference. Subtract the smaller
      // from the larger: the exponents are unsigned, so subtracting in
      // the other order would wrap around to a large value.
      //
      delta_e = exp_a > exp_b ? exp_a - exp_b : exp_b - exp_a;
      if ( delta_e > 1 ) return delta_e + wsig;

      // Prepend the hidden one. The value with the larger exponent is
      // shifted left one place so both are on the same scale.
      //
      frac_a = exp_a > exp_b ? { 2'b1, sig_a, 1'b0 } : { 3'b1, sig_a };
      frac_b = exp_b > exp_a ? { 2'b1, sig_b, 1'b0 } : { 3'b1, sig_b };

      // Opposite signs: magnitudes add. Same sign: larger minus smaller.
      //
      frac_diff =
        s_a != s_b ? frac_a + frac_b :
        frac_a > frac_b ? frac_a - frac_b : frac_b - frac_a;
      return $clog2( frac_diff + 1 );

   endfunction

endclass

module scaled_comp_1_behav
  #( int w_c = 5, w_s = 2, w_exp = 5, w_sig = 5, wfp = 1 + w_exp + w_sig )
   ( output logic gt,
     input uwire [wfp-1:0] a, b,
     input uwire [w_s-1:0] s,
     input uwire [w_c-1:0] c );

   // Behavioral (simulation-only) model: gt = ( a * 2^s + b ) > c,
   // evaluated with real arithmetic.

   // The scaled sum converted back to the module's FP format. It is
   // not a port.
   logic [wfp-1:0] ssum;

   always_comb begin
      real a_val, b_val, a_scaled, sum_val;

      // Bring both FP inputs into the real domain.
      a_val = conv#(w_exp,w_sig)::ftor( a );
      b_val = conv#(w_exp,w_sig)::ftor( b );

      // Scale a, then add b.
      a_scaled = a_val * 2.0 ** s;
      sum_val = a_scaled + b_val;

      ssum = conv#(w_exp,w_sig)::rtof( sum_val );

      // The comparison uses the real sum, not the converted ssum.
      gt = sum_val > c;
   end

endmodule

// cadence translate_on


//////////////////////////////////////////////////////////////////////////////
/// Testbench Code


// cadence translate_off

function automatic int unsigned rand_wid(int max_wid);
      // Choose a bit count in the range 1 to max_wid, then return a
      // random value masked to that many low-order bits.
      automatic int n_bits = {$random()} % max_wid + 1;
      automatic int unsigned keep_mask = ( 1 << n_bits ) - 1;
      return {$random()} & keep_mask;
endfunction

function automatic real rand_fp(real min, real max);
      // Return a random real between min and max.
      //
      // Reciprocal of the largest unsigned 32-bit value; it maps the
      // 32-bit random number below onto the interval from 0 to 1.
      localparam real inv_u32_max = real'(1) / ( ( longint'(1) << 32 ) - 1 );
      automatic real unit_sample = {$random()} * inv_u32_max;
      return min + unit_sample * ( max - min );
endfunction

function automatic real fabs(real val);
      // Absolute value of a real.
      if ( val < 0 ) return -val;
      return val;
endfunction

function int min( int a, b );
      // Smaller of the two arguments.
      if ( a <= b ) return a;
      return b;
endfunction
function int max( int a, b );
      // Larger of the two arguments.
      if ( b > a ) return b;
      return a;
endfunction


virtual class conv2 #(int wexp=6, wsig=10);
   // Convert between real and fp types using parameter-provided
   // exponent and significand sizes.
   //
   // The FP layout handled here is { sign, wexp-bit biased exponent,
   // wsig-bit significand fraction }, w bits in total. Every function
   // is static and is called through the class scope, for example
   // conv2#(7,6)::rtof(1.5).

   localparam int w = 1 + wexp + wsig;              // Total width.
   localparam int bias_r = ( 1 << 11 - 1 ) - 1;     // Exponent bias of a real, 1023.
   localparam int w_sig_r = 52;                     // Fraction bits of a real.
   localparam int w_exp_r = 11;                     // Exponent bits of a real.
   localparam int bias_h = ( 1 << wexp - 1 ) - 1;   // Exponent bias of this format.

   // Convert real r to the w-bit format.
   //
   // A zero r yields all-zero bits. Otherwise the sign is copied, the
   // exponent is re-biased and cast to wexp bits (no range check), and
   // the top wsig fraction bits are kept; the remaining fraction bits
   // are discarded, so the value is truncated rather than rounded.
   //
   static function logic [w-1:0] rtof( real r );
      logic [wsig-1:0] sig_f;
      logic [w_sig_r-wsig-2:0] sig_x;   // Discarded low fraction bits.
      logic sig_x_msb;                  // Discarded bit just below sig_f.
      logic [w_exp_r-1:0] exp_r;
      logic sign_r;
      { sign_r, exp_r, sig_f, sig_x_msb, sig_x } = $realtobits(r);
      // So, what about a rounding mode? Not now!
      rtof = !r ? 0 : { sign_r, wexp'( exp_r + bias_h - bias_r ), sig_f };
   endfunction

   // Convert w-bit value f to a real.
   //
   // All-zero bits yield 0.0. Any other pattern is re-biased and its
   // fraction padded with zeros, so an exponent field of zero is not
   // given special treatment here.
   //
   static function real ftor( logic [w-1:0] f );
      ftor = !f ? 0.0
        : $bitstoreal
          ( { f[w-1],
              w_exp_r'( bias_r + f[w-2:wsig] - bias_h ),
              f[wsig-1:0], (w_sig_r-wsig)'(0) } );
   endfunction

   // Return a measure, in bits, of how much a and b differ.
   //
   //   - 1 << wexp  if either argument has unknown bits.
   //   - 0          if the arguments are identical.
   //   - If either exponent field is zero, a value computed from the
   //     significand bits alone.
   //   - delta_e + wsig  if the exponents differ by more than one.
   //   - Otherwise $clog2( d + 1 ), where d is the magnitude of the
   //     difference of the significands (hidden one included) after
   //     aligning them to the smaller exponent.
   //
   static function int err_bits( logic [w-1:0] a, b );

      logic [wsig-1:0] sig_a, sig_b;
      logic [wsig+2:0] frac_a, frac_b, frac_diff;
      logic [wexp-1:0] exp_a, exp_b;
      logic s_a, s_b;
      int delta_e;

      if ( $isunknown(a) || $isunknown(b) ) return 1 << wexp;
      if ( a == b ) return 0;

      { s_a, exp_a, sig_a } = a;
      { s_b, exp_b, sig_b } = b;

      if ( exp_a == 0 || exp_b == 0 ) begin
         logic [wsig-1:0] sig = ~ ( sig_a | sig_b );
         return 1 + wsig - $clog2( sig + 1 );
      end

      // Magnitude of the exponent difference. Subtract the smaller
      // from the larger: the exponents are unsigned, so subtracting in
      // the other order would wrap around to a large value.
      //
      delta_e = exp_a > exp_b ? exp_a - exp_b : exp_b - exp_a;
      if ( delta_e > 1 ) return delta_e + wsig;

      // Prepend the hidden one. The value with the larger exponent is
      // shifted left one place so both are on the same scale.
      //
      frac_a = exp_a > exp_b ? { 2'b1, sig_a, 1'b0 } : { 3'b1, sig_a };
      frac_b = exp_b > exp_a ? { 2'b1, sig_b, 1'b0 } : { 3'b1, sig_b };

      // Opposite signs: magnitudes add. Same sign: larger minus smaller.
      //
      frac_diff =
        s_a != s_b ? frac_a + frac_b :
        frac_a > frac_b ? frac_a - frac_b : frac_b - frac_a;
      return $clog2( frac_diff + 1 );

   endfunction

endclass

// cadence translate_on

// cadence translate_off

// Module names. (Used by the testbench.)
//
typedef enum {
   M_int,   // Selects comp_int as the module under test.
   M_fp     // Selects comp_fp as the module under test.
} M_Type;

module testbench;

   // Top-level testbench. Instantiates one testbench_n for each
   // combination of module type (mset) and parameter set (pset),
   // chains them through the d[] start/done signals so that they run
   // one after another, and prints per-combination error totals when
   // simulation ends.

   // Number of tests run by each testbench_n instance (each instance
   // reads this through the hierarchical name testbench.n_tests).
   localparam int n_tests = 100000;

   localparam int npsets = 3; // This MUST be set to the size of pset.
   // { w_exp, w_sig, wc_int }
   // The third element is passed to testbench_n as w_c.
   localparam int pset[npsets][3] =
              '{
                { 7,  6,  6 },
                { 7,  7,  10 },
                { 8,  5,  12 }
                };

   // Module types to test. nmsets must match the size of mset.
   localparam int nmsets = 2;
   localparam M_Type mset[2] = '{ M_int, M_fp };

   // Names used in messages: full name and two-letter abbreviation.
   string mtype_str[M_Type] = '{ M_int: "comp_int", M_fp: "comp_fp " };
   string mtype_abbr[M_Type] = '{ M_int: "in", M_fp: "fp" };

   // Error counters. Where the last dimension is [2], index 1 holds
   // the count for the s=0 tests and index 0 the count for the s>0
   // tests (see the messages printed in the final block).
   //
   // t_errs_each_gt and t_errs_each_ss are indexed by module type and
   // parameter-set number; testbench_n writes them when it finishes.
   // In the code visible here, t_errs_mod and t_errs_size_* are read
   // only inside the `ifdef xxx section below and t_errs_each is not
   // referenced at all.
   int t_errs_mod[M_Type];
   int t_errs_each_gt[M_Type][int][2];
   int t_errs_each_ss[M_Type][int][2];
   int t_errs_size_gt[int][2];
   int t_errs_size_ss[int][2];
   int t_errs_each[M_Type][int];

   localparam int nsets = npsets * nmsets;

   // d[idx] is driven by the done output of instance idx and is the
   // tstart input of instance idx+1. d[-1], set in the initial block
   // below, starts the first instance.
   logic d[nsets:-1]; // Start / Done signals.

   int t_errs_gt[2], t_errs_ss[2];     // Total number of errors.
   initial begin
      t_errs_gt = '{0,0};
      t_errs_ss = '{0,0};
      // Preset the per-combination counts to -1 so that a combination
      // whose testbench_n never finished is distinguishable from one
      // that finished with zero errors.
      for ( int m=0; m<nmsets; m++ )
        for ( int i=0; i<npsets; i++ ) begin
           t_errs_each_gt[mset[m]][i] = '{-1,-1};
           t_errs_each_ss[mset[m]][i] = '{-1,-1};
        end

      // Start the first testbench_n instance.
      d[-1] = 1;
   end

   // Print the summary at the end of simulation.
   final begin
   `ifdef xxx
      $write("\nNumber of tests: %0d.\n", n_tests);
      for ( int i=0; i<npsets; i++ )
        $write("Total for exp=%0d, sig=%0d, wc=%2d: Err (s=0): %0d ss, %0d gt.  Err (s>0): %0d ss, %0d gt.\n",
               pset[i][0], pset[i][1], pset[i][2],
               t_errs_size_ss[i][1], t_errs_size_gt[i][1],
               t_errs_size_ss[i][0], t_errs_size_gt[i][0]);
      for ( int i=0; i<nmsets; i++ )
        $write("Total for mod %4s: %5d errors.\n",
               mtype_str[mset[i]],
               t_errs_mod[mset[i]]
               );
      `endif
      // One line per module type, parameter set, and s category. The
      // s=0 line (j=1) is printed before the s>0 line (j=0).
      for ( int mi=0; mi<nmsets; mi++ )
        for ( int i=0; i<npsets; i++ ) begin
           automatic M_Type m = mset[mi];
           for ( int j=1; j>=0; j-- )
             $write("Total %s exp=%0d, sig=%0d, wc=%2d, %s: Errors: %0d ss, %0d gt.\n",
                    mtype_str[m],
                    pset[i][0], pset[i][1], pset[i][2], j == 0 ? "s>0" : "s=0",
                    t_errs_each_ss[m][i][j], t_errs_each_gt[m][i][j] );
        end

      //  $write("Total number of errors (s=0): ss %0d, gt %0d.  (s>0): ss %0d, gt %0d.\n",
             //  t_errs_ss[1], t_errs_gt[1], t_errs_ss[0], t_errs_gt[0]);
   end

   // Instantiate a testbench_n for every (module type, parameter set)
   // pair. Instance idx starts when instance idx-1 signals done.
   for ( genvar m=0; m<nmsets; m++ )
     for ( genvar i=0; i<npsets; i++ ) begin
        localparam int idx = m * npsets + i;
        testbench_n
          #( .w_exp(pset[i][0]), .w_sig(pset[i][1]), .w_c(pset[i][2]),
             .pset(i), .mtype(mset[m]) )
        t2( .done(d[idx]), .tstart(d[idx-1]) );
     end

endmodule


module testbench_n
  #( int w_exp = 5, w_sig = 8, w_c = 12,
     pset = 0, M_Type mtype = M_fp )
   ( output logic done, input uwire tstart );

   // Tests one module type (mtype) at one parameter set.
   //
   //   w_exp, w_sig, w_c  Sizes passed to the module under test.
   //   pset               Parameter-set number, used to index the
   //                      per-set error tables in module testbench.
   //   mtype              Selects comp_fp or comp_int.
   //   tstart             Testing begins once this is true.
   //   done               Set to 1 after all tests have run.
   //
   // For each test the module's ssum and gt outputs are checked
   // against values computed using real arithmetic.

   // Number of sample outputs to print (whether correct or not).
   localparam int n_samples = 1;

   localparam int w_s = w_exp-1;
   localparam int w_fp = 1 + w_sig + w_exp;
   localparam int bias = ( 1 << w_exp-1 ) - 1;
   localparam int w_sig2 = w_c + 1;
   localparam int w_fp2 = 1 + w_sig2 + w_exp;
   logic [w_c-1:0] c;
   logic [w_s-1:0] sl;
   logic [w_fp-1:0] a, b;
   uwire [w_fp2-1:0] ssum;
   uwire gt;

   // Largest value that fits in c.
   localparam int c_max = ( 1 << w_c ) - 1;

   // Instantiate the module under test.
   case ( mtype )
     M_fp:
       comp_fp #( w_c, w_s, w_exp, w_sig, w_sig2 )
     c1(gt, ssum, a, b, sl, c);
     M_int:
       comp_int #( w_c, w_s, w_exp, w_sig, w_sig2 )
     c1(gt, ssum, a, b, sl, c);
   endcase

   initial begin

      automatic int n_tests = testbench.n_tests;
      // Error counts. Index 1 is for tests with s=0, index 0 for s>0.
      automatic int n_err_gt[2] = '{0,0}, n_err_ss[2] = '{0,0};
      automatic int n_gt = 0;

      wait( tstart );

      $write("\nStarting %4s tests with exp=%0d, sig=%0d, wc=%0d\n",
             testbench.mtype_str[mtype], w_exp, w_sig, w_c);

      for (int i=0; i<n_tests; i++ ) begin

         // When choose_close is set, b is chosen so that a*2^s + b
         // lands within about 1 of c.
         automatic bit choose_close = $random() & 1'b1;
         // The first half of the tests use s=0.
         automatic bit sl_zero = i < n_tests / 2;
         automatic bit ab_positive = 0;
         logic [w_fp2-1:0] shadow_ssumf;
         int eb_ssum, c_pre;
         real ar, br, shadow_ar, shadow_br, shadow_ssumr, delta, a_scr, ssumr;
         real tol;
         logic shadow_gt;
         bit err_ss, err_gt, err;

         // Use the maximum c whenever the low three random bits are
         // all zero, otherwise a random-width value.
         c =  $random() & 7 ? rand_wid(w_c) : c_max;
         sl = sl_zero ? 0 : 1 + rand_wid(w_exp-2);

         ar = ( ab_positive ? rand_fp(0,1):rand_fp(-0.5,0.5) )*rand_wid(w_c+2);

         // Use a real base, matching the computation of shadow_ssumr
         // below, so the power is not evaluated as an integer
         // expression of limited width.
         a_scr = ar * 2.0 ** sl;

         br = choose_close
           ? c - a_scr + rand_fp(-1,1)
           : ( ab_positive ? rand_fp(0,1):rand_fp(-0.5,0.5) ) * rand_wid(w_c+2);

         if ( ab_positive && br < 0 ) br = -br;

         // Apply the inputs.
         a = conv#(w_exp,w_sig)::rtof( ar );
         b = conv#(w_exp,w_sig)::rtof( br );

         // Compute the expected outputs from the values that a and b
         // actually hold after conversion, not from ar and br.
         shadow_ar = conv#(w_exp,w_sig)::ftor( a );
         shadow_br = conv#(w_exp,w_sig)::ftor( b );
         shadow_ssumr = shadow_ar * 2.0**sl + shadow_br;
         shadow_ssumf = conv#(w_exp,w_sig2)::rtof(shadow_ssumr);
         delta = shadow_ssumr - c;
         tol = c / real'( 1 << w_sig2 + 1 );
         shadow_gt = shadow_ssumr > c;
         n_gt += shadow_gt;

         // Let the module outputs settle.
         #1;

         ssumr = conv#(w_exp,w_sig2)::ftor(ssum);
         // A gt mismatch counts as an error only if the expected sum
         // is farther than tol from c.
         err_gt = fabs(delta) > tol && gt !== shadow_gt;
         // An ssum mismatch counts as an error only if err_bits
         // reports more than 1.
         eb_ssum = conv#(w_exp,w_sig2)::err_bits(ssum,shadow_ssumf);
         err_ss = eb_ssum > 1;
         err = err_gt || err_ss;

         if ( i < n_samples || err ) begin
            if ( err_gt ) n_err_gt[sl_zero]++;
            if ( err_ss ) n_err_ss[sl_zero]++;

            // Print the first n_samples tests, and the first four
            // errors of each kind in each s category.
            if ( i < n_samples || err_ss && n_err_ss[sl_zero] < 5 )
              $write( "%s %s #(%0d,%0d,%0d) a=%.2f, s=%0d, b=%.2f, c=%0d. ss %.4e %s %.4e (correct)\n",
                      err_ss ? "Error " : "Sample",
                      testbench.mtype_abbr[mtype],
                      w_exp, w_sig, w_c,
                      shadow_ar, sl, shadow_br, c,
                      ssumr, err_ss ? "!=" : "==", shadow_ssumr );

            if ( i < n_samples || err_gt && n_err_gt[sl_zero] < 5 )
              $write( "%s %s #(%0d,%0d,%0d) a=%.2f, s=%0d, b=%.2f, c=%0d. gt %h %s %h (correct) %g\n",
                      err_gt ? "Error " : "Sample",
                      testbench.mtype_abbr[mtype],
                      w_exp, w_sig, w_c,
                      shadow_ar, sl, shadow_br, c,
                      gt, err_gt ? "!=" : "==", shadow_gt, delta );
         end

      end

      // Report the s=0 counts (i=1) first, then the s>0 counts (i=0).
      for ( int i=1; i>=0; i-- )
        $write("Finished  %4s tests exp=%0d, sig=%0d, wc=%0d, %s. Errors: %0d ss, %0d gt\n",
             testbench.mtype_str[mtype], w_exp, w_sig, w_c,
               i == 0 ? "s>0" : "s=0",
               n_err_ss[i], n_err_gt[i]);
      //  $write("Frac gt %.4f\n", real'(n_gt)/n_tests);

      // Record the results in the tables kept by module testbench.
      for ( int i=0; i<2; i++ ) begin
         testbench.t_errs_gt[i] += n_err_gt[i];
         testbench.t_errs_ss[i] += n_err_ss[i];
         testbench.t_errs_each_gt[mtype][pset][i] = n_err_gt[i];
         testbench.t_errs_each_ss[mtype][pset][i] = n_err_ss[i];
      end

      // Lets the next testbench_n instance, if any, start.
      done = 1;
   end

endmodule

// Define SIMULATION_ON in a translate_off region. Used to control
// whether simulation or synthesis versions of Chipware modules are
// included.
//
`define SIMULATION_ON
//
// cadence translate_on


`default_nettype wire

`ifdef SIMULATION_ON

`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/sim/verilog/CW/CW_fp_mult.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/sim/verilog/CW/CW_fp_add.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/sim/verilog/CW/CW_fp_sub.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/sim/verilog/CW/CW_fp_div.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/sim/verilog/CW/CW_fp_i2flt.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/sim/verilog/CW/CW_fp_flt2i.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/sim/verilog/CW/CW_fp_cmp.v"

`else

`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/syn/CW/CW_fp_mult.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/syn/CW/CW_fp_add.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/syn/CW/CW_fp_sub.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/syn/CW/CW_fp_i2flt.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/syn/CW/CW_fp_flt2i.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/syn/CW/CW_fp_div.v"
`include "/apps/linux/cadence/DDIEXPORT23/GENUS231/share/synth/lib/chipware/syn/CW/CW_fp_cmp.v"

`endif