// hw05.v

////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2022 Homework 5
//

 /// Assignment  https://www.ece.lsu.edu/koppel/v/2022/hw05.pdf

 /// Instructions:
  //
  // (1) Find the undergraduate workstation laboratory, room 2241 Patrick
  //     F. Taylor Hall. Machines to use are in the back.
  //
  // (2) Locate your account.  If you did not get an account please
  //     E-mail: koppel@ece.lsu.edu
  //
  // (3) Log in to a Linux workstation.
  //
  // (4) If you haven't already, follow the account setup instructions here:
  //     https://www.ece.lsu.edu/koppel/v/proc.html
  //
  // (5) Copy this assignment, local path name
  //     /home/faculty/koppel/pub/ee4755/hw/2022/hw05
  //     to a directory ~/hw05 in your class account. (~ is your home
  //     directory.) Use this file for your solution.
  ///      BE SURE THAT YOUR FILE IS CORRECTLY NAMED AND IN THE RIGHT PLACE.
  //
  // (6) Find the problems in this file and solve them.
  //
  //     Your entire solution should be in this file.
  //
  //     Do not change module names.
  //
  // (7) The Verilog portion of your solution will automatically be
  //     copied from your account by the TA-bot.
  //
  // (8) Submit the writeup on paper or E-mail a PDF file.


 /// Additional Resources
  //
  // Verilog Documentation
  //    The Verilog Standard
  //      https://ieeexplore.ieee.org/document/8299595/
  //    Introductory Treatment (Warning: Does not include SystemVerilog)
  //      Brown & Vranesic, Fundamentals of Digital Logic with Verilog, 3rd Ed.
  //
  // Account Setup and Emacs (Text Editor) Instructions
  //      https://www.ece.lsu.edu/koppel/v/proc.html
  //      To learn Emacs look for Emacs tutorial.
  //

`default_nettype none

//////////////////////////////////////////////////////////////////////////////
///  All Problems
//
 ///   Arithmetic-Unit-Only Modules
//
//    These modules have a single arithmetic module.
//
//    Use these to estimate the cost of the multi-step complex modules.
//
//    The ports and parameters match the multi-step for convenience.

module try_mult
  #( int wsig = 23, wexp = 8, ieee = 1, wf = 1 + wexp + wsig )
   ( output uwire [wf-1:0] result,
     output uwire ready,
     input uwire [wf-1:0] v0, v1,
     input uwire start, clk);

   // Cost-estimation wrapper around a single floating-point multiplier:
   // result = v0 * v1.
   //
   //   wsig, wexp : significand and exponent widths given to the multiplier.
   //   ieee       : forwarded as the multiplier's ieee_compliance setting.
   //   wf         : total width of each floating-point value.
   //
   // start and clk are not used inside this module.

   // Rounding mode handed to the multiplier.
   localparam logic [2:0] rm = 3'd0;

   // Multiplier status output. Nothing reads it.
   uwire [7:0] mul_status;

   // No registers in this wrapper, so ready is tied high.
   assign ready = 1'b1;

   CW_fp_mult
     #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
   m00
     ( .z(result), .status(mul_status), .a(v0), .b(v1), .rnd(rm) );

endmodule

module try_sq
  #( int wsig = 23, wexp = 8, ieee = 1, wf = 1 + wexp + wsig )
   ( output uwire [wf-1:0] result,
     output uwire ready,
     input uwire [wf-1:0] v0,
     input uwire start, clk);

   // Cost-estimation wrapper for a squaring unit: result = v0 * v0.
   // Implemented by driving both operands of try_mult with v0.

   try_mult
     #( .wsig(wsig), .wexp(wexp), .ieee(ieee) )
   tm
     ( .result(result),
       .ready(ready),
       .v0(v0),
       .v1(v0),
       .start(start),
       .clk(clk) );

endmodule

module try_add
  #( int wsig = 23, wexp = 8, ieee = 1, wf = 1 + wexp + wsig )
   ( output uwire [wf-1:0] result,
     output uwire ready,
     input uwire [wf-1:0] v0, v1,
     input uwire start, clk );

   // Cost-estimation wrapper around a single floating-point adder:
   // result = v0 + v1.
   //
   //   wsig, wexp : significand and exponent widths given to the adder.
   //   ieee       : forwarded as the adder's ieee_compliance setting.
   //   wf         : total width of each floating-point value.
   //
   // start and clk are not used inside this module.

   // Rounding mode handed to the adder.
   localparam logic [2:0] rm = 3'd0;

   // Adder status output. Nothing reads it.
   uwire [7:0] add_status;

   // No registers in this wrapper, so ready is tied high.
   assign ready = 1'b1;

   CW_fp_add
     #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
   a00
     ( .z(result), .status(add_status), .a(v0), .b(v1), .rnd(rm) );

endmodule


 ///   Multi-Step Modules
//
//    These compute the function in three different ways.
//
//    Do not modify these modules.
//    Modify the m1 modules instead.


// cadence translate_off
module ms_functional
  ( output real mag, input real v0, v1 );

   // Reference model using the simulator's real arithmetic:
   //   mag = v0*v0 + v0*v1 + v1*v1
   // The testbench compares the ms_* hardware modules against this value.

   localparam string name = "Func";

   // Grouping written out explicitly; it is the same left-to-right
   // evaluation order the un-parenthesized expression has.
   always_comb begin
      mag = ( ( v0 * v0 ) + ( v0 * v1 ) ) + ( v1 * v1 );
   end

endmodule
// cadence translate_on


module ms_comb
  #( int wsig = 23, wexp = 8, ieee = 1, wf = 1 + wexp + wsig )
   ( output uwire [wf-1:0] result,
     output uwire ready,
     input uwire [wf-1:0] v0, v1,
     input uwire start, clk);

   // Single-step version: three multipliers feed a chain of two adders.
   //
   //   result = ( v0*v0 + v1*v1 ) + v0*v1
   //
   // There are no registers here; start and clk are unused and ready is
   // tied high.

   // cadence translate_off
   localparam string name = "Comb";
   // cadence translate_on

   localparam int nstages = 1;
   localparam logic [2:0] rm = 3'd0; // Rounding mode for every FP unit.

   // Status outputs of the FP units. Nothing reads them.
   uwire [7:0] st_m00, st_m01, st_m11, st_a1, st_a2;

   // Products.
   uwire [wf-1:0] sq0;     // v0 * v0
   uwire [wf-1:0] cross;   // v0 * v1
   uwire [wf-1:0] sq1;     // v1 * v1

   // Sum of the two squares.
   uwire [wf-1:0] sum_sq;

   assign ready = 1'b1;

   /// Multipliers
   //
   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m00( .z(sq0),   .status(st_m00), .a(v0), .b(v0), .rnd(rm) );
   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m01( .z(cross), .status(st_m01), .a(v0), .b(v1), .rnd(rm) );
   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m11( .z(sq1),   .status(st_m11), .a(v1), .b(v1), .rnd(rm) );

   /// Adders
   //
   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a1( .z(sum_sq), .status(st_a1), .a(sq0),    .b(sq1),   .rnd(rm) );
   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a2( .z(result), .status(st_a2), .a(sum_sq), .b(cross), .rnd(rm) );

endmodule


module ms_seq
  #( int wsig = 23, wexp = 8, ieee = 1, wf = 1 + wexp + wsig )
   ( output logic [wf-1:0] result,  output logic ready,
     input uwire [wf-1:0] v0, v1,   input uwire start, clk );

   // Multi-step version: computes v0*v0 + v0*v1 + v1*v1 with one
   // multiplier and one adder that are reused over several clock cycles.
   //
   // Schedule (step is the value of the step register during the cycle):
   //
   //   step  multiplier    adder (ac0 + ac1)          loaded at end of cycle
   //    0    v0 * v0       -                          ac0 = v0*v0, ac1 = 0
   //    1    v0 * v1       v0*v0 + 0                  ac0 = v0*v1, ac1 = v0*v0
   //    2    v1 * v1       v0*v1 + v0*v0              ac0 = v1*v1, ac1 = sum
   //    3    v1 * v1       v1*v1 + (v0*v1 + v0*v0)    ac0 = v1*v1, ready = 1
   //    4    v1 * v1       same as step 3             ac0 = v1*v1
   //
   // The multiplier operands are taken directly from the v0 and v1
   // inputs at every step, so the inputs must be held stable from the
   // cycle after start until the result has been read.

   // cadence translate_off
   localparam string name = "Seq";
   // cadence translate_on

   uwire [7:0] mul_s, add_s; // Operation status. Ignored.
   uwire [wf-1:0] mul_a, mul_b, add_a, add_b, prod, sum;

   logic [2:0]  step;          // Current position in the schedule above.
   logic [wf-1:0] ac0, ac1;    // Adder operand registers.

   localparam int last_step = 4; // step stops counting at this value.

   // Step counter: start returns it to 0; otherwise it counts up and
   // then holds at last_step.
   always_ff @( posedge clk )
     if ( start ) step <= 0;
     else if ( step < last_step ) step <= step + 1;

   localparam logic [2:0] rm = 0; // Rounding Mode
   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m1( .z(prod), .a(mul_a), .b(mul_b), .rnd(rm), .status(mul_s));
   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a1( .z(sum),  .a(add_a), .b(add_b), .rnd(rm), .status(add_s));

   // Multiplier operand selection:
   //   step 0: v0*v0,  step 1: v0*v1,  step 2 and later: v1*v1.
   assign mul_a = step < 2  ? v0 : v1;
   assign mul_b = step == 0 ? v0 : v1;
   // The adder always adds the two accumulator registers.
   assign add_a = ac0,  add_b = ac1;

   always_ff @( posedge clk )
     begin
        // ac0 captures the multiplier output on every clock edge.
        ac0 <= prod;
        // ac1 is cleared at the end of step 0, takes the running sum at
        // the end of steps 1 and 2, and holds its value from step 3 on.
        if ( step < 3 ) ac1 <= step ? sum : 0;
        // start clears ready (and has priority); ready is set at the
        // clock edge that ends step last_step-1.
        if ( start ) ready <= 0; else if ( step == last_step-1 ) ready <= 1;
     end

   // result is the adder output itself; it is not registered.
   assign result = sum;

endmodule

module ms_pipe
  #( int wsig = 23, wexp = 8, ieee = 1, wf = 1 + wexp + wsig )
   ( output uwire [wf-1:0] result,
     output uwire ready,
     input uwire [wf-1:0] v0, v1,
     input uwire start, clk);

   // Pipelined version:  result = ( v0*v0 + v0*v1 ) + v1*v1
   //
   // Three sets of pipeline registers separate the multipliers, the
   // first adder, and the second adder.  The start input travels down
   // the pipeline in the occ ("occupied") bits and comes out as ready,
   // so ready is high in the cycle in which result holds the value for
   // the v0,v1 pair that accompanied that start.

   // cadence translate_off
   localparam string name = "Pipe";
   // cadence translate_on

   // The testbench reads nstages as the number of cycles to wait
   // before sampling result.
   localparam int nstages = 4;
   localparam logic [2:0] rm = 3'd0; // Rounding mode for every FP unit.

   // Status outputs of the FP units. Nothing reads them.
   uwire [7:0] mul_s1, mul_s2, mul_s3, a_s1, a_s2;

   // Combinational outputs of the multipliers and adders.
   uwire [wf-1:0] v00, v01, v11;
   uwire [wf-1:0] s1, s2;

   // Pipeline registers, named pl_<stage>_<contents>.
   logic [wf-1:0] pl_1_v00, pl_1_v01, pl_1_v11;
   logic          pl_1_occ;
   logic [wf-1:0] pl_2_v0001, pl_2_v11;
   logic          pl_2_occ;
   logic [wf-1:0] pl_3_sum;
   logic          pl_3_occ;

   /// Before first register: the three products.
   //
   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m00( .a(v0), .b(v0), .rnd(rm), .z(v00), .status(mul_s1) );
   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m01( .a(v0), .b(v1), .rnd(rm), .z(v01), .status(mul_s2) );
   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m11( .a(v1), .b(v1), .rnd(rm), .z(v11), .status(mul_s3) );

   always_ff @( posedge clk ) begin
      pl_1_occ <= start;
      pl_1_v00 <= v00;
      pl_1_v01 <= v01;
      pl_1_v11 <= v11;
   end

   /// Between first and second registers: v0*v0 + v0*v1.
   //  v1*v1 is carried along unchanged.
   //
   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a1( .a(pl_1_v00), .b(pl_1_v01), .rnd(rm), .z(s1), .status(a_s1) );

   always_ff @( posedge clk ) begin
      pl_2_occ   <= pl_1_occ;
      pl_2_v0001 <= s1;
      pl_2_v11   <= pl_1_v11;
   end

   /// Between second and third registers: add in v1*v1.
   //
   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a2( .a(pl_2_v0001), .b(pl_2_v11), .rnd(rm), .z(s2), .status(a_s2) );

   always_ff @( posedge clk ) begin
      pl_3_occ <= pl_2_occ;
      pl_3_sum <= s2;
   end

   /// Outputs come straight from the last set of registers.
   //
   assign result = pl_3_sum;
   assign ready  = pl_3_occ;

endmodule


 ///   Experimentation Modules
//
//    These compute a different function in three different ways.
//
//    Modify these modules.
//



// cadence translate_off
module m1_functional
  ( output real mag,
    input real v0, v1 );

   // Reference model using the simulator's real arithmetic:
   //   mag = v0 + v0*v1 + v1*v1
   //
   // The testbench checks the other m1_* modules against this output,
   // so the expression here has to describe the same computation they
   // perform.

   localparam string name = "One Func";

   // Grouping written out explicitly; it is the same evaluation order
   // the un-parenthesized expression has.
   always_comb begin
      mag = ( v0 + ( v0 * v1 ) ) + ( v1 * v1 );
   end

endmodule
// cadence translate_on


module m1_comb
  #( int wsig = 23, wexp = 8, ieee = 1, wf = 1 + wexp + wsig )
   ( output uwire [wf-1:0] result,
     output uwire ready,
     input uwire [wf-1:0] v0, v1,
     input uwire start, clk);

   // Single-step version of the m1 function: two multipliers feed a
   // chain of two adders.
   //
   //   result = ( v0 + v1*v1 ) + v0*v1
   //
   // There are no registers here; start and clk are unused and ready is
   // tied high.

   // cadence translate_off
   localparam string name = "One Comb";
   // cadence translate_on

   localparam int nstages = 1;
   localparam logic [2:0] rm = 0; // Rounding Mode

   // Status outputs of the FP units. Nothing reads them.
   uwire [7:0] mul_s2, mul_s3, a_s1, a_s2;

   // v01 = v0*v1,  v11 = v1*v1,  s1 = v0 + v1*v1.
   // (There is no v0*v0 product in this function, so no v00 wire and
   // no status wire for it.)
   uwire [wf-1:0] v01, v11, s1;

   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m01( .a(v0), .b(v1), .rnd(rm), .z(v01), .status(mul_s2));
   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m11( .a(v1), .b(v1), .rnd(rm), .z(v11), .status(mul_s3));

   // v0 goes to the first adder directly; it needs no multiplier.
   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a1(.a(v0), .b(v11), .rnd(rm), .z(s1), .status(a_s1));
   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a2(.a(s1), .b(v01), .rnd(rm), .z(result), .status(a_s2));

   assign ready = 1;

endmodule


module m1_seq
  #( int wsig = 23, wexp = 8, ieee = 1, wf = 1 + wexp + wsig )
   ( output logic [wf-1:0] result,  output logic ready,
     input uwire [wf-1:0] v0, v1,   input uwire start, clk );

   // Multi-step version of the m1 function: computes v0 + v0*v1 + v1*v1
   // with one multiplier and one adder reused over several clock cycles.
   // The v0 term is obtained by multiplying v0 by the constant 1.0.
   //
   // Schedule (step is the value of the step register during the cycle):
   //
   //   step  multiplier    adder (ac0 + ac1)        loaded at end of cycle
   //    0    v0 * 1.0      -                        ac0 = v0,    ac1 = 0
   //    1    v0 * v1       v0 + 0                   ac0 = v0*v1, ac1 = v0
   //    2    v1 * v1       v0*v1 + v0               ac0 = v1*v1, ac1 = sum
   //    3    v1 * v1       v1*v1 + (v0*v1 + v0)     ac0 = v1*v1, ready = 1
   //    4    v1 * v1       same as step 3           ac0 = v1*v1
   //
   // The multiplier operands are taken directly from the v0 and v1
   // inputs at every step, so the inputs must be held stable from the
   // cycle after start until the result has been read.

   // cadence translate_off
   localparam string name = "One Seq";
   // cadence translate_on

   uwire [7:0] mul_s, add_s; // Operation status. Ignored.
   uwire [wf-1:0] mul_a, mul_b, add_a, add_b, prod, sum;

   logic [2:0]  step;          // Current position in the schedule above.
   logic [wf-1:0] ac0, ac1;    // Adder operand registers.

   localparam int last_step = 4; // step stops counting at this value.

   // Step counter: start returns it to 0; otherwise it counts up and
   // then holds at last_step.
   always_ff @( posedge clk )
     if ( start ) step <= 0;
     else if ( step < last_step ) step <= step + 1;

   localparam logic [2:0] rm = 0; // Rounding Mode
   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m1( .z(prod), .a(mul_a), .b(mul_b), .rnd(rm), .status(mul_s));
   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a1( .z(sum),  .a(add_a), .b(add_b), .rnd(rm), .status(add_s));

   // Floating-point 1.0 in the module's format: sign 0, exponent equal
   // to the bias, significand 0.  Each field is explicitly sized so the
   // concatenation is exactly 1 + wexp + wsig bits wide, rather than
   // relying on a 32-bit integer expression being truncated on
   // assignment.
   localparam int exp_bias = ( 1 << ( wexp - 1 ) ) - 1;
   localparam logic [wf-1:0] one = { 1'b0, wexp'( exp_bias ), wsig'(0) };

   // Multiplier operand selection:
   //   step 0: v0*1.0,  step 1: v0*v1,  step 2 and later: v1*v1.
   assign mul_a = step < 2  ? v0 : v1;
   assign mul_b = step == 0 ? one : v1;
   // The adder always adds the two accumulator registers.
   assign add_a = ac0,  add_b = ac1;

   always_ff @( posedge clk )
     begin
        // ac0 captures the multiplier output on every clock edge.
        ac0 <= prod;
        // ac1 is cleared at the end of step 0, takes the running sum at
        // the end of steps 1 and 2, and holds its value from step 3 on.
        if ( step < 3 ) ac1 <= step ? sum : 0;
        // start clears ready (and has priority); ready is set at the
        // clock edge that ends step last_step-1.
        if ( start ) ready <= 0; else if ( step == last_step-1 ) ready <= 1;
     end

   // result is the adder output itself; it is not registered.
   assign result = sum;

endmodule

module m1_pipe
  #( int wsig = 23, wexp = 8, ieee = 1, wf = 1 + wexp + wsig )
   ( output uwire [wf-1:0] result,
     output uwire ready,
     input uwire [wf-1:0] v0, v1,
     input uwire start, clk);

   // Pipelined version of the m1 function:
   //
   //   result = ( v0 + v0*v1 ) + v1*v1
   //
   // Three sets of pipeline registers separate the multipliers, the
   // first adder, and the second adder.  The start input travels down
   // the pipeline in the occ ("occupied") bits and comes out as ready,
   // so ready is high in the cycle in which result holds the value for
   // the v0,v1 pair that accompanied that start.

   // cadence translate_off
   localparam string name = "One Pipe";
   // cadence translate_on

   // The testbench reads nstages as the number of cycles to wait
   // before sampling result.
   localparam int nstages = 4;
   localparam logic [2:0] rm = 0; // Rounding Mode

   // Status outputs of the FP units. Nothing reads them.
   uwire [7:0] mul_s2, mul_s3, a_s1, a_s2;

   // Combinational outputs of the multipliers and adders.
   // (There is no v0*v0 product in this function, so no v00 wire and
   // no status wire for it.)
   uwire [wf-1:0] v01, v11, s1, s2;

   // Pipeline registers, named pl_<stage>_<contents>.
   // Stage 1 carries v0 itself next to the two products.
   logic [wf-1:0] pl_1_v0, pl_1_v01, pl_1_v11;
   logic [wf-1:0] pl_2_v0_01, pl_2_v11;
   logic [wf-1:0] pl_3_sum;
   logic pl_1_occ, pl_2_occ, pl_3_occ;

   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m01( .z(v01), .a(v0), .b(v1), .rnd(rm), .status(mul_s2));
   CW_fp_mult #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     m11( .z(v11), .a(v1), .b(v1), .rnd(rm), .status(mul_s3));

   // s1 = v0 + v0*v1
   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a1(.z(s1), .a(pl_1_v0),    .b(pl_1_v01), .rnd(rm), .status(a_s1));
   // s2 = s1 + v1*v1
   CW_fp_add #( .sig_width(wsig), .exp_width(wexp), .ieee_compliance(ieee) )
     a2(.z(s2), .a(pl_2_v0_01), .b(pl_2_v11), .rnd(rm), .status(a_s2));

   assign ready = pl_3_occ;
   assign result = pl_3_sum;

   always_ff @( posedge clk ) begin

      // Stage 1: latch v0 and the two products.
      pl_1_v0  <= v0;
      pl_1_v01 <= v01;
      pl_1_v11 <= v11;
      pl_1_occ <= start;

      // Stage 2: latch the first sum; carry v1*v1 along unchanged.
      pl_2_v0_01 <= s1;
      pl_2_v11 <= pl_1_v11;
      pl_2_occ <= pl_1_occ;

      // Stage 3: latch the final sum.
      pl_3_sum <= s2;
      pl_3_occ <= pl_2_occ;

   end

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Testbench Code
//
// It is okay to modify the testbench code to facilitate the coding
// and debugging of your modules.


// cadence translate_off

function automatic real rand_real(real minv, real maxv);
      // Return a pseudo-random real in the range [minv, maxv).
      //
      // {$random} is the 32-bit $random value taken as unsigned, so
      // dividing by 2**32 scales it to a fraction in [0,1).
      automatic real span = maxv - minv;
      automatic real draw = real'({$random});
      return minv + span * draw / 2.0**32;
endfunction

function automatic real fabs(real val);
      // Absolute value of a real: negate when below zero, otherwise
      // return the argument unchanged.
      if ( val < 0 ) return -val;
      return val;
endfunction

virtual class conv #(int wexp=6, wsig=10);
   // Conversions between the simulator's real type and a packed
   // floating-point value with a wexp-bit exponent and a wsig-bit
   // significand (plus one sign bit).
   //
   // Only the sign, exponent (re-biased) and leading significand bits
   // are copied between the two formats; a value of zero is handled
   // separately.  No rounding or range checking is done.

   // Layout of the 64-bit pattern used by $realtobits / $bitstoreal.
   localparam int w_exp_r = 11;
   localparam int w_sig_r = 52;
   localparam int bias_r = ( 1 << 11 - 1 ) - 1;

   // Layout of the narrow format.
   localparam int w = 1 + wexp + wsig;
   localparam int bias_h = ( 1 << wexp - 1 ) - 1;

   // real -> narrow format.
   static function logic [w-1:0] rtof( real r );
      logic [63:0] bits_r;
      logic sign_r;
      logic [w_exp_r-1:0] exp_r;
      logic [wsig-1:0] sig_f;

      // Zero has its own all-zero encoding.
      if ( !r ) return '0;

      bits_r = $realtobits(r);
      sign_r = bits_r[ w_sig_r + w_exp_r ];
      exp_r  = bits_r[ w_sig_r +: w_exp_r ];
      // Keep the most-significant wsig bits of the significand; the
      // remaining low bits are dropped.
      sig_f  = bits_r[ w_sig_r - 1 -: wsig ];

      return { sign_r, wexp'( exp_r + bias_h - bias_r ), sig_f };
   endfunction

   // narrow format -> real.
   static function real ftor( logic [w-1:0] f );
      logic [63:0] bits_r;

      // Re-bias the exponent and pad the significand with zeros on the
      // right to fill the wider field.
      bits_r = { f[w-1],
                 w_exp_r'( bias_r + f[w-2:wsig] - bias_h ),
                 f[wsig-1:0],
                 (w_sig_r-wsig)'(0) };

      return !f ? 0.0 : $bitstoreal( bits_r );
   endfunction

endclass


program reactivate
   (output uwire clk_reactive, output int cycle_reactive,
    input uwire clk, input var int cycle);

   // Pass clk and cycle through a program block.  The testbench waits
   // on these copies rather than on the originals.
   assign clk_reactive   = clk,
          cycle_reactive = cycle;

endprogram

module testbench;

   // Top-level testbench.  Instantiates one testbench_n per parameter
   // set and chains them through the d[] signals so that each starts
   // only after the previous one reports done.

   localparam int npsets = 4; // Number of instantiations.
   localparam int pset[npsets][2] =
              '{ { 7, 0 }, { 23, 0}, {7, 1 }, {23, 1} };
   //
   // Above: First number in each pair is passed to testbench_n as
   // w_sig, the second as use_one.

   // Counters for how often error details have been printed.
   int n_err_shown;
   int n_err_sh_nc, n_err_sh_nw, n_err_sh_avg, n_err_sh_state;

   // Total number of errors.  Each testbench_n adds its own count to
   // this when it finishes.
   int t_errs;

   initial begin
      n_err_shown = 0;
      n_err_sh_nc = 0;
      n_err_sh_nw = 0;
      n_err_sh_avg = 0;
      n_err_sh_state = 0;
      t_errs = 0;
   end

   final $write("Total number of errors: %0d\n",t_errs);

   // Start / Done chain.  d[i] is the done output of instance i and the
   // tstart input of instance i+1.
   uwire d[npsets:-1];
   assign d[-1] = 1'b1;  // The first instance may start immediately.

   // Instantiate a testbench for each parameter set.
   //
   for ( genvar ps=0; ps<npsets; ps++ )
     testbench_n #( .w_sig(pset[ps][0]), .use_one(pset[ps][1]) )
       t2( .tstart(d[ps-1]), .done(d[ps]) );

endmodule



module testbench_n
  #( int w_sig = 7, use_one = 0 )
   ( output logic done, input uwire tstart );

   // Testbench for one significand width.
   //
   //   w_sig   : significand width given to the modules under test.
   //   use_one : when non-zero the m1_* modules are tested, otherwise
   //             the ms_* modules.
   //   tstart  : testing begins once this input is true.
   //   done    : set when this testbench has finished.

   // How the testbench decides that a module's output is ready.
   typedef enum { MT_comb, MT_seq, MT_pipe } Module_Type;

   localparam int w_exp = 8;
   localparam int wid = w_sig + w_exp + 1;
   localparam int max_latency = 10;
   localparam int num_tests = 16;
   localparam int nmuts = 10;
   int err[nmuts];

   // Outputs of the modules under test, indexed by the idx value that
   // each module is registered with below.
   uwire [wid-1:0]  mag[nmuts];
   uwire         ready[nmuts];
   // Expected result (from the functional model) and its two inputs.
   real   magr;
   real vr[2];
   // v: the current test vector in module format.  vp: what the
   // pipelined module sees; it carries v for one cycle and filler
   // values otherwise.
   logic [wid-1:0] v[2], vp[2];
   logic        start;

   // Per-module bookkeeping.
   typedef struct
     {
      int idx;               // Index into mag[] and ready[].
      int err_count = 0;     // Number of failed tests.
      int ncyc = 0;          // Cycles to wait (used for MT_pipe).
      Module_Type mt = MT_comb;
      logic [wid-1:0] sout = 'h111;  // Sampled module output.
      int cyc_tot = 0;       // Sum over tests of cycles until sampled.
      } Info;
   // One entry per module under test, keyed by the module's name string.
   Info pi[string];

   localparam int cycle_limit = num_tests * max_latency * 4;
   int cycle, cyc_start;
   logic clock;
   // Set by the test loop to ask the driver loop to present v and
   // assert start after the next clock edge.
   bit use_others;

   logic clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // The three tasks below register a module under test in pi.

   task pi_seq(input int idx, input string name);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_seq;
   endtask

   task pi_comb(input int idx, input string name);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_comb;
   endtask

   task pi_pipe(input int idx, input string name, input int ncyc);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_pipe;
      pi[m].ncyc = ncyc;
   endtask

   /// Clock and cycle counter
   //
   initial begin
      clock = 0;
      cycle = 0;

      done = 0;
      wait( tstart );

      // Run the clock until done, or until the cycle limit is reached.
      // Every 10 time units clock is toggled and the value it had
      // before the toggle is added to cycle, so cycle advances once
      // per clock period, on the 1-to-0 transition.
      fork
         while ( !done ) #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      done = 1;
   end

   /// Modules under test and the matching functional model
   //
   if ( use_one ) begin

      m1_functional mf( magr, vr[0], vr[1] );
      m1_seq #( .wsig(w_sig), .wexp(w_exp), .ieee(0) )
        m2( mag[1], ready[1], v[0],v[1], start, clock );
      initial begin pi_seq(1,m2.name); end

      m1_comb #( .wsig(w_sig), .wexp(w_exp), .ieee(0) )
        m5r( mag[5], ready[5], v[0],v[1], start, clock );
      initial begin pi_comb(5,m5r.name); end

      // The pipelined module is fed from vp rather than v.
      m1_pipe #( .wsig(w_sig), .wexp(w_exp), .ieee(0) )
        m3( mag[3], ready[3], vp[0],vp[1], start, clock );
      initial begin pi_pipe(3,m3.name,m3.nstages); end

   end else begin

      ms_functional mf( magr, vr[0], vr[1] );
      ms_seq #( .wsig(w_sig), .wexp(w_exp), .ieee(0) )
        m2( mag[1], ready[1], v[0],v[1], start, clock );
      initial begin pi_seq(1,m2.name); end

      ms_comb #( .wsig(w_sig), .wexp(w_exp), .ieee(0) )
        m5r( mag[5], ready[5], v[0],v[1], start, clock );
      initial begin pi_comb(5,m5r.name); end

      // The pipelined module is fed from vp rather than v.
      ms_pipe #( .wsig(w_sig), .wexp(w_exp), .ieee(0) )
        m3( mag[3], ready[3], vp[0],vp[1], start, clock );
      initial begin pi_pipe(3,m3.name,m3.nstages); end

   end

   /// Driver loop
   //
   // Two time units after each clock edge either present the test
   // vector on vp with start asserted (when the test loop has set
   // use_others), or drive vp with values derived from the cycle count
   // and deassert start.
   //
   initial begin

      while ( !done ) @( posedge clk_reactive ) #2

         if ( use_others ) begin

            vp = v;
            use_others = 0;
            start = 1;

         end else begin

            vp[0] = conv#(w_exp,w_sig)::rtof( real'(cycle-cyc_start) );
            vp[1] = cycle - cyc_start;
            start = 0;

         end
   end

   /// Test loop
   //
   initial begin

      automatic int n_err = 0;

      use_others = 0;
      start = 0;

      @( posedge clk_reactive );


      for ( int i=0; i<num_tests; i++ ) begin

         // Number of modules that have not yet delivered a result.
         automatic int awaiting = pi.num();

         cyc_start = cycle;

         if ( i < 4 ) begin

            // In the first four tests the vector components are zero
            // or one: bit j of i selects the value of component j.
            //
            for ( int j=0; j<2; j++ ) vr[j] = i & 1 << j ? 1.0 : 0.0;

         end else begin

            // In other tests vector components are randomly chosen.
            //
            for ( int j=0; j<2; j++ ) vr[j] = rand_real(-10,+10);

         end

         // Convert the test vector to the module format.
         for ( int j=0; j<2; j++ ) v[j] = conv#(w_exp,w_sig)::rtof(vr[j]);

         vp = v;
         use_others = 1;

         /// Collect Result (mag) From Each Module Under Test (mut)
         //
         foreach ( pi[muti] ) begin

            // Note: need to make a local copy of muti because of the
            // fork below.
            automatic string mut = muti;

            // Create a child thread to get response from current mut.
            // The parent thread, without delay, proceeds to join_none.
            //
            fork begin

               if ( pi[mut].mt == MT_seq ) begin

                  // Wait for ready to be seen low and then high.
                  wait ( !ready[pi[mut].idx] );
                  wait ( ready[pi[mut].idx] );

               end else begin

                  // Compute time at which result should be ready or
                  // when to start examining a READY output.
                  //
                  automatic int latency =
                    pi[mut].mt == MT_comb ? 1 : pi[mut].ncyc;
                  automatic int eta = cyc_start + latency;

                  wait ( cycle_reactive == eta );

               end

               // Decrement count of the number of modules we are waiting for.
               //
               awaiting--;

               // Store the module MAG output, it will be checked later
               // for correctness.
               //
               pi[mut].sout = mag[pi[mut].idx];

               pi[mut].cyc_tot += cycle - cyc_start;

               // This thread ends execution here.
            end join_none;

         end

         // Wait until data collected from all modules under test.
         //
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] ) begin

            // Assign module output to a real.
            //
            automatic real mmagr = conv#(w_exp,w_sig)::ftor(pi[mut].sout);
            //
            // Note: pi[mut].sout is type logic which is assumed to be
            // an unsigned integer. However, the contents are really a
            // floating-point value.

            // Compute difference between module output and expected
            // output.  With FP small differences can be okay, they might
            // occur, for example, due to differences in the order of
            // operations.
            //
            // The difference is taken relative to the expected value,
            // or to 1 when the expected value is zero.
            //
            automatic real err_mag =
              fabs( mmagr - magr ) / fabs( magr ? magr : 1 );
            // Tolerance: 4 / 2**w_sig.
            localparam real tol = real'(4) / ( 1 << w_sig );
            automatic bit okay = err_mag < tol;

            if ( !okay ) begin
              pi[mut].err_count++;
               n_err++;
              // Print details only for the first few errors per module.
              // NOTE(review): the vector is printed with vr[1] first
              // and vr[0] second -- confirm that order is intended.
              if ( pi[mut].err_count < 5 )
                $write("%s (%0d) test #%0d vec (%.1f,%.1f) error: h'%8h  %7.4f != %7.4f (correct)\n",
                   mut, w_sig, i, vr[1], vr[0],
                       pi[mut].sout, mmagr, magr);
           end
         end

         // Since == binds more tightly than &, the condition below is
         // {$random} & ( 1 == 1 ), which tests the low bit of the
         // random value.
         while ( {$random} & 1 == 1 ) @( posedge clk_reactive );
         //
         // Note: By waiting for reactive clock we can be sure that
         // modules under test have completed all work due to the
         // positive edge of the regular clk. Wait a random amount of
         // time in case any modules are only correct at some stride.

      end

      // Per-module summary.  For MT_comb modules the average cycle
      // count is reported as 1.
      foreach ( pi[ mut ] )
        $write("Ran %4d tests for (%0d) %-0s, %4d errors found. Avg cyc %.1f\n",
               num_tests, w_sig, mut,
               pi[mut].err_count,
               pi[mut].mt == MT_comb ? 1 : real'(pi[mut].cyc_tot) / num_tests);

      done = 1;
      testbench.t_errs += n_err;

   end

endmodule

`define SIMULATION_ON

// cadence translate_on

`default_nettype wire

`ifdef SIMULATION_ON

`include "/apps/linux/cadence/GENUS211/share/synth/lib/chipware/sim/verilog/CW/CW_fp_mult.v"
`include "/apps/linux/cadence/GENUS211/share/synth/lib/chipware/sim/verilog/CW/CW_fp_add.v"

`else

`include "/apps/linux/cadence/GENUS211/share/synth/lib/chipware/syn/CW/CW_fp_mult.v"
`include "/apps/linux/cadence/GENUS211/share/synth/lib/chipware/syn/CW/CW_fp_add.v"

`endif