/// EE 4755 - Digital Design Using HDLs
//
//  Pipelining and Pipelined Multipliers

//////////////////////////////////////////////////////////////////////////////
/// Multiplication Background
//
//  This material is covered in more detail in mult-seq.v

 /// Long Hand Procedure Review
//
//  Multiply 5 times 12 in binary:
//
//     0101  cand  -- Multiplicand
//     1100  plier -- Multiplier
//     """"
//     0000  Partial Product
//    0000
//   0101
//  0101
//  """""""
//  0111100  prod  -- Product
//
//  OED Citation
//    The multiplycand, which must alwaies stand aboue.
//    1594 T. Blundeville Exercises i. iv. f. 6 


 /// Behavioral Multiplier
//
//   The synthesis program picks a good multiplier from its own
//   library of generic multipliers or from target technology library.
//
//   - Good, if nothing special is being done.
//
// mult_behav_1: multiplier described with the `*` operator.
//
//   w:            Width, in bits, of each operand.
//   prod:         2w-bit product.
//   cand, plier:  w-bit operands (multiplicand and multiplier).
//
module mult_behav_1
  #( int w = 16 )
   ( output uwire [2*w-1:0] prod,
     input uwire [w-1:0] cand,
     input uwire [w-1:0] plier );

   // The product is formed at the full 2w-bit width of the net it drives.
   uwire [2*w-1:0] product = cand * plier;

   assign prod = product;

endmodule


 /// Linear Combinational Multiplier
//
//   This uses a linear structure, which has an unnecessarily long critical
//   path. But it's easy to understand.
//
// mult_linear: combinational multiplier that adds the w partial products
// one after another, starting from zero.
//
//   w:            Width, in bits, of each operand.
//   prod:         2w-bit product.
//   cand, plier:  w-bit operands.
//
module mult_linear
  #(int w = 16)
   (output logic [2*w-1:0] prod, input uwire [w-1:0] cand, plier);

   always_comb begin

      // Running sum of partial products, least-significant bit first.
      prod = 0;

      for ( int bit_pos = 0;  bit_pos < w;  bit_pos++ ) begin

         // Partial product: shifted multiplicand if this multiplier
         // bit is set, otherwise zero.
         logic [2*w-1:0] part;
         part = plier[bit_pos] ? cand << bit_pos : 0;

         prod = part + prod;

      end

   end

endmodule
//
 /// Cost Analysis
//
//   Lightly optimized Ripple Implementation:
//    See mult_bfas for basis of cost analysis.
//    pprod = plier[i] ? cand << i : 0;
//      Notes: i is a constant, and so plier[i] and cand << i are constants.
//             Because else side is zero and only w bits can be nonzero ..
//             .. consists of just w AND gates.
//      w * w = w^2
//    Adders: w * 9 * w = 9 w^2
//      Note: only w BFA units per pp because lower i bits are unchanged.
//    Total cost: 10 w^2.
//
 /// Timing Analysis
//
//   Grid of w x w  BFAs: 8w + 2
//   Optimize using BHAs: 8w - 12


 /// Tree Combinational Multiplier
//
//   Has a slightly shorter critical path than mult_linear.
//   See 2019 Homework 3 for cost and delay analysis.
//   https://www.ece.lsu.edu/v/2019/hw03_sol.pdf
//
// mult_tree: recursively described combinational multiplier.
//
//   wa:    Width of operand a.  Any width >= 1 (odd widths included).
//   wb:    Width of operand b.
//   wp:    Width of the product, wa + wb by default.
//   prod:  Product of a and b.
//
// Operand a is split into a low part of wa/2 bits and a high part of the
// remaining wa - wa/2 bits.  Each part is multiplied by b in a recursive
// instance and the two results are added, with the high result shifted
// left by the width of the low part.
//
module mult_tree
  #( int wa = 16, int wb = wa, int wp = wa + wb )
   ( output uwire [wp-1:0] prod,
     input uwire [wa-1:0] a,
     input uwire [wb-1:0] b );

   if ( wa == 1 ) begin

      // Base case: a one-bit a selects either b or zero.
      assign prod = a ? b : 0;

   end else begin

      // Split a into two parts and recursively instantiate a module for
      // each part.  When wa is odd the high part is one bit wider than
      // the low part, so every bit of a is covered.
      localparam int wn = wa / 2;     // Width of low part of a.
      localparam int wh = wa - wn;    // Width of high part of a.

      uwire [wb+wn-1:0] prod_lo;
      uwire [wb+wh-1:0] prod_hi;

      mult_tree #(wn,wb) mlo( prod_lo, a[wn-1:0],  b );
      mult_tree #(wh,wb) mhi( prod_hi, a[wa-1:wn], b );

      // Combine the partial products.
      assign prod = prod_lo + ( prod_hi << wn );

   end
endmodule



// mult_linear_clk: mult_linear with registers on its inputs and output.
//
//   The operands are captured on a positive clock edge, multiplied
//   combinationally by mult_linear, and the product is captured on a
//   following positive edge.
//
module mult_linear_clk
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] cand, plier,
     input uwire clk);

   logic [w-1:0] cand_r, plier_r;   // Registered copies of the operands.
   uwire [2*w-1:0] prod_comb;       // Combinational product of the copies.

   // Input registers.
   always_ff @( posedge clk ) { cand_r, plier_r } <= { cand, plier };

   mult_linear #(w) ml( prod_comb, cand_r, plier_r );

   // Output register.
   always_ff @( posedge clk ) prod <= prod_comb;

endmodule


 /// Simple Sequential Multiplier
//
// mult_seq: sequential multiplier that handles one multiplier bit per
// clock cycle.
//
//   w:            Operand width; must be a power of 2 (checked in simulation).
//   prod:         2w-bit product, updated each time pos wraps around to 0.
//   cand, plier:  w-bit operands.
//   clk:          Clock; all state changes on the positive edge.
//
// State is updated with non-blocking assignments so that other logic
// clocked by the same edge always sees the pre-edge value of prod. The
// value within the cycle is computed in the local variable accum_next.
//
module mult_seq #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] cand, plier,
     input uwire clk);

   localparam int wlog = $clog2(w);

   // cadence translate_off
   initial if ( w != 1 << wlog ) $fatal(1,"Size must be a power of 2.");
   // cadence translate_on

   bit [wlog-1:0] pos;       // Multiplier bit examined in the current cycle.
   logic [2*w-1:0] accum;    // Sum of the partial products added so far.

   always_ff @( posedge clk ) begin

      logic [2*w-1:0] accum_next;
      accum_next = accum;

      if ( pos == 0 ) begin
         // Start of a new multiplication: publish the finished product
         // and clear the accumulator.
         prod <= accum;
         accum_next = 0;
      end

      // Add the partial product for bit position pos.
      if ( plier[pos] ) accum_next += cand << pos;

      accum <= accum_next;

      // pos is wlog bits wide, so it wraps to 0 after w cycles.
      pos <= pos + 1;

   end

endmodule

// 

//
 /// Cost Analysis  :  Delay Analysis
//
// Regs: prod, accum: 2 * 7 * 2w = 28 w
// Regs: pos: 7 lg w

// pos == 0 : lg w   :  lg lg w
// if ( ) accum = 0; : 2w  : 1
// if ( ) prod = accum: 2 * 3 * w = 6w  : 2
// plier[pos]: 3(w - 1) ≅ 3 w  : 2 lg w
// cand << pos: 3 * 2w * lg w = 6 w lg w  : 2 lg w
// accum +=: 9 * 2w = 18 w  : 4w
// pos++: 9 lg w  : 4 + 2 lg w
//
// Total cost: 28w + 7lg w + lgw + 2w + 6w + 3 w + 6 w lgw + 18 w + 9 lgw
// =           57 w + 17 lg w + 6 w lg w
//
// Paths
//  pos == 0 -> accum=0 -> : lg lg w + 1
//  cand << pos : 2 lg w
// Critical Path
//  cand << pos -> accum += -> accum = : 2 lg w + 4w + 2
// Register Delay: 6
//
 /// Latency
//  w ( 4w + 2lg w + 2 + 6 )  =  4w^2 + 2 w lg w + 8w
//
//  Because of the w^2 term the latency of mult_seq is very high. Much
//  better latency, O( w lg w ), is achieved by the wavefront
//  sequential multipliers, which are described in mult-seq.v.



//////////////////////////////////////////////////////////////////////////////
/// Pipelining Concept

 /// Pipelining Concept    <-- Very Important but Tricky Concept, Pay Attention!
 //
 //
 //  :Def: Pipelining
 //   Performing an operation in *stages* on multiple data items.


// :Example:
//
// Output x has the value that input a had two cycles in the past.
//
// very_simple_pipe: two registers in series.
//
//   x:    Value that a had two positive clock edges earlier.
//   a:    w-bit input.
//   clk:  Clock.
//
module very_simple_pipe
  #( int w = 16 )
   ( output logic [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] mid;   // Value of a captured at the previous clock edge.

   always_ff @( posedge clk ) begin
      // Non-blocking assignments: x receives the value mid held before
      // this edge, regardless of statement order.
      x   <= mid;
      mid <= a;
   end

endmodule

// :


// :Example:
//
// Pipeline that passes data through unchanged.
// Output x has the value that input a had nstages cycles in the past.
//
// simple_pipe2: nstages registers in series passing data through unchanged.
//
//   x:    Contents of the last register, r[nstages-1].
//   a:    w-bit input, captured into r[0] on each positive clock edge.
//   clk:  Clock.
//
module simple_pipe2
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   assign x = r[nstages-1];

   always_ff @( posedge clk ) begin

      // Every assignment is non-blocking, so each register receives the
      // value its predecessor held before the clock edge.
      for ( int s = nstages - 1;  s > 0;  s-- ) r[s] <= r[s-1];

      // Non-blocking here too; simple_pipe2_ba uses a blocking assignment.
      r[0] <= a;

   end

endmodule
//
 /// Important thing to notice:
 //
 //  At any moment the module holds the most recent nstages values of a.


// :



// :Example:
//
// The pipeline below, simple_pipe2_ba, is almost identical to the one above,
// simple_pipe2. The only difference is that here, r[0] is just a wire
// whereas in simple_pipe2 r[0] is a register. The difference is due
// to the way in which r[0] is assigned.
//
// Output x has the value that input a had nstages-1 cycles in the past.
//
// simple_pipe2_ba: variant of simple_pipe2 in which r[0] is written with a
// blocking assignment.
//
//   x:    Contents of r[nstages-1].
//   a:    w-bit input.
//   clk:  Clock.
//
module simple_pipe2_ba
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin

      // Because this assignment is blocking and comes before the loop,
      // the read of r[0] in the i=1 iteration below sees the value of a
      // at this clock edge, not the value r[0] held before the edge.
      r[0] = a; // Blocking assignment here, non-blocking in simple_pipe2.
      // The remaining elements are assigned non-blocking, so each receives
      // the value its predecessor held before the edge.
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];

   end

   assign x = r[nstages-1];

endmodule

// :



// :Example:
//
// Output x has the value that input a had nstages-1 cycles in the past.
// Output avg is the average of these values.
//
// simple_pipe_avg: delay line that also reports the average of the values
// it holds.
//
//   x:    Contents of r[nstages-1].
//   avg:  Sum of r[0] .. r[nstages-1] divided by nstages (integer division).
//   a:    w-bit input; r[0] is connected directly to a.
//   clk:  Clock.
//
// Each register r[i], i >= 1, is written by its own always_ff in a generate
// loop. The index is then an elaboration-time constant, so no procedure
// writes the element r[0] that the continuous assignment drives.
//
module simple_pipe_avg
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     output logic [w-1:0] avg,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   // Element 0 is not a register, it just follows the input.
   assign r[0] = a;

   // One register per remaining stage.
   for ( genvar i = 1;  i < nstages;  i++ ) begin

      always_ff @( posedge clk ) r[i] <= r[i-1];

   end

   assign x = r[nstages-1];

   // Wide enough to hold the sum of nstages w-bit values.
   logic [w+$clog2(nstages):0] sum;

   always_comb begin

      sum = 0;
      for ( int i=0; i<nstages; i++ ) sum += r[i];
      avg = sum / nstages;

   end

endmodule

 /// Inferred Hardware for simple_pipe_avg

// 

// 


// :Example:
//
// Output x has value of input a nstages cycles in the past, but incremented
// once per stage.
//
// simple_pipe_add1: nstages registers in series, with an increment in
// front of every register.
//
//   x:    Contents of the last register, r[nstages-1].
//   a:    w-bit input.
//   clk:  Clock.
//
module simple_pipe_add1
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   assign x = r[nstages-1];

   always_ff @( posedge clk ) begin

      // Later stages: predecessor's pre-edge value plus one.
      for ( int s = nstages - 1;  s >= 1;  s-- ) r[s] <= r[s-1] + 1;

      // First stage: input plus one.
      r[0] <= a + 1;

   end

endmodule





//////////////////////////////////////////////////////////////////////////////
/// Pipelined Multiplier

 /// Pipelining Idea    <-- Very Important Concept, Pay Attention!
//
//   Consider a four-bit multiply.
//
//     Module mult_seq computes the product ..
//     .. using the same logic ..
//     .. on four consecutive cycles.
//
//        Time:    0  1  2  3  4  5  6  7
//        Hardw:   S  S  S  S               <-  16 * 52
//        Hardw:               S  S  S  S   <-  14 * 37
//
//
//     A pipelined module might compute the product ..
//     .. using four different pieces of logic ..
//     .. each for one cycle.
//
//        Time:    0  1  2  3
//        Hardw:   P0 P1 P2 P3           <-  16 * 52
//
//        Time:    0  1  2  3  4  5  6
//        Hardw:   P0 P1 P2 P3           <-  16 * 52
//        Hardw:      P0 P1 P2 P3        <-  14 * 37
//        Hardw:         P0 P1 P2 P3     <-   7 * 2
//        Hardw:            P0 P1 P2 P3  <-  12 * 9
//
//
//   Benefits of Pipelining
//
//     A new operation can start *each* clock cycle.
//     Simpler hardware (though more of it).


 /// Throughput v. Latency
//
// :Def: Throughput
//  The amount of work per unit time.
//  For example, 10 multiplies per clock cycle.
//               Quadrillion floating-point operations per second.
//
// :Def: Latency
//  The amount of time it takes to do something.
//  For example, the latency of a multiply is 20 ns.
//
//
 /// Which is better, high throughput or low latency?:
//
//   Impatient people prefer low latency.
//   Productive people prefer high throughput.
//
//   Jokes aside, in reality it depends on the situation.


 /// Coding of Pipelined Units
//
//   :Def: Pipeline Latch
//   A set of registers that divides stages.
//   Input to pipeline latch is in one stage ...
//   ... output is in the next stage.
//
//   Note: 
//     "Pipeline Latch" indicates how a register is used ...
//     ... but it is no different than other registers.
//
//


 /// :Example: Basic Pipelined Multiplier -- mult_pipe1
//
//   Computes 1 partial product per stage.
//
// mult_pipe1: pipelined multiplier that adds one partial product per stage.
//
//   w:            Operand width; also the number of stages.
//   m:            Not used by this module.
//   prod:         Contents of the last accumulator pipeline latch.
//   cand, plier:  w-bit operands, captured into the stage-0 latches.
//   clk:          Clock.
//
module mult_pipe1
  #( int w = 16, int m = 1 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input uwire clk);

   localparam int stages = w;

   // Pipeline latches (pl). Index s holds the values entering stage s.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_cand[0:stages];
   logic [w-1:0] pl_plier[0:stages];

   assign prod = pl_accum[stages];

   always_ff @( posedge clk ) begin

      // A new multiplication enters the pipeline with a zero accumulator.
      pl_plier[0] <= plier;
      pl_cand[0] <= cand;
      pl_accum[0] <= 0;

      for ( int s = 0;  s < stages;  s++ ) begin

         // Stage s handles bit position s of the multiplier.
         logic [2*w-1:0] part, total;

         part = pl_plier[s][s] ? pl_cand[s] << s : 0;
         total = pl_accum[s] + part;

         // Everything moves to the next stage's latches, to be used in
         // the next clock cycle.
         pl_plier[s+1] <= pl_plier[s];
         pl_cand[s+1]  <= pl_cand[s];
         pl_accum[s+1] <= total;

      end

   end

endmodule

 /// Inferred Hardware for mult_pipe1 Without and With Labels.
// 
// 


 /// mult_pipe1
 /// Cost Analysis : Delay Analysis
//
//   Note: Cost is entire module, delay is one stage.
//
// Registers: (Cost per bit: 7;  Delay per bit: 6 )
//  pl_accum:         ( w + 1 ) * 2w * 7 = 14w^2 + 14w
//  plier, cand:  2 * ( w + 1 ) *  w * 7 = 14w^2 + 14w
//  Total: 28w^2 + 28w
//
//  pl_cand[stage] << stage :  No hardware since stage is a constant.
//  pp = pl_plier[stage][stage] ? pl_cand[stage] << stage : 0;
//    Per stage: a w-bit mux with one input of 0, synthesizes into w AND gates.
//      w^2 : 1
//  accum = pl_accum[stage] + pp;
//    Per stage: a 2w-bit ripple adder:
//     w * 9 * 2w = 18w^2 : 4w
//
//  Total cost:
//    28w^2 + 28w + w^2 + 18w^2  =  47 w^2 + 28w
//
//  Critical Path
//    pp mux -> adder:  1 + 4w
//  Register Delay: 6
//
//  Clock Period
//   1 + 4w + 6  =  4w + 7
//  Latency
//   (w+1) (4w+7) = 4w^2 + 11w + 7



 /// :Example: Basic Pipelined Multiplier -- mult_pipe
//
//   Computes m partial products per stage.
//

// mult_pipe: pipelined multiplier that adds up to m partial products per
// stage.
//
//   w:            Operand width.
//   m:            Number of multiplier bit positions handled by each stage.
//   prod:         Contents of the last accumulator pipeline latch.
//   cand, plier:  w-bit operands, captured into the stage-0 latches.
//   clk:          Clock.
//
module mult_pipe #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input clk);

   // Enough stages to cover all w bit positions, m at a time.
   localparam int stages = ( w + m - 1 ) / m;

   // Pipeline latches (pl). Index s holds the values entering stage s.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_cand[0:stages];
   logic [w-1:0] pl_plier[0:stages];

   assign prod = pl_accum[stages];

   always_ff @( posedge clk ) begin

      // A new multiplication enters the pipeline with a zero accumulator.
      pl_plier[0] <= plier;
      pl_cand[0] <= cand;
      pl_accum[0] <= 0;

      for ( int s = 0;  s < stages;  s++ ) begin

         logic [2*w-1:0] running;
         running = pl_accum[s];

         // Stage s covers bit positions s*m .. s*m+m-1; positions at or
         // beyond w do not exist and are skipped.
         for ( int pos = s * m;  pos < s * m + m  &&  pos < w;  pos++ )
           if ( pl_plier[s][pos] ) running += pl_cand[s] << pos;

         // Latch values for use in the next clock cycle, *not* the next
         // loop iteration.
         pl_plier[s+1] <= pl_plier[s];
         pl_cand[s+1] <= pl_cand[s];
         pl_accum[s+1] <= running;

      end

   end

endmodule


// Areas for improvement of mult_pipe
//
// -- Use fewer bits:
//
//    accum: Earlier stages need fewer bits.
//    plier: Later stages need fewer bits.
//
// -- Use a carry-save adder until the last stage.


// :Example: Alternative computation of partial products.
//
// Like mult_pipe, except compute set of m partial products using a
// multiplier.
//
// We (humans) know that the two are equivalent. Will the synthesis
// program's optimization code see the two as equivalent?

// mult_pipe_2: like mult_pipe, but each stage forms its contribution by
// multiplying an m-bit group of the multiplier by the multiplicand.
//
//   w:            Operand width.
//   m:            Number of multiplier bits handled by each stage.
//   prod:         Contents of the last accumulator pipeline latch.
//   cand, plier:  w-bit operands, captured into the stage-0 latches.
//   clk:          Clock.
//
module mult_pipe_2 #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input clk);

   localparam int nstages = ( w + m - 1 ) / m;

   logic [2*w-1:0] pl_accum[0:nstages];
   logic [w-1:0] pl_cand[0:nstages];

   // The multiplier is stored as nstages groups of m bits so that
   // group s can be selected with a single index.
   logic [nstages-1:0][m-1:0] pl_plier[0:nstages];

   assign prod = pl_accum[nstages];

   always_ff @( posedge clk ) begin

      // A new multiplication enters the pipeline with a zero accumulator.
      pl_plier[0] <= plier;
      pl_cand[0] <= cand;
      pl_accum[0] <= 0;

      for ( int s = 0;  s < nstages;  s++ ) begin

         // Group s of the multiplier times the multiplicand, moved to
         // the bit position at which the group starts.
         logic [2*w-1:0] group_prod;
         group_prod = pl_plier[s][s] * pl_cand[s] << s*m;

         pl_plier[s+1] <= pl_plier[s];
         pl_cand[s+1] <= pl_cand[s];
         pl_accum[s+1] <= pl_accum[s] + group_prod;

      end

   end

endmodule

 /// Inferred Hardware for mult_pipe_2
// 


//////////////////////////////////////////////////////////////////////////////
/// Pipelined Wavefront Multiplier
//  mult_pipe_wfront
//

// mult_pipe_wfront: pipelined multiplier that produces one product bit per
// stage using a row of one-bit adders between each pair of pipeline latches.
//
//   w:            Operand width. The pipeline has 2*w stages.
//   m:            Not used by this module.
//   prod:         Contents of the last product pipeline latch.
//   cand, plier:  w-bit operands, captured into the stage-0 latches.
//   clk:          Clock.
//
module mult_pipe_wfront #( int w = 16, int m = 1 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] cand, plier,
     input uwire clk );

   localparam int stages = 2*w;

   // Note: pl is for pipeline latch.
   logic [2*w-1:0] pl_prod[0:stages];
   logic [w-1:0] pl_cand[0:stages];
   logic [w-1:0] pl_plier[0:stages];
   logic [w-1:0] pl_sum[0:stages];
   logic [w-1:0] pl_carry[0:stages];

   always_ff @( posedge clk ) begin

      // Values entering stage 0: the operands, with sum, carry, and the
      // product bits collected so far all cleared.
      pl_cand[0] <= cand;
      pl_plier[0] <= plier;
      pl_sum[0] <= 0;
      pl_carry[0] <= 0;
      pl_prod[0] <= 0;

      for ( int stage = 0; stage < stages; stage++ ) begin
         logic [2*w-1:0] prod_next;
         logic [w-1:0] sum_next, carry_next;
         logic [1:0] sc;

         // Start from the product bits already written by earlier stages.
         prod_next = pl_prod[stage];

         // One three-input, one-bit addition per position i.
         for ( int i=0; i<w; i++ ) begin
            logic a, b, c;
            // a: sum bit of position i-1 from the stage-input latch
            //    (zero in stage 0 and for position 0).
            a = stage && i ? pl_sum[stage][i-1] : 0;
            // b: carry bit of position i from the stage-input latch
            //    (zero in stage 0).
            b = stage      ? pl_carry[stage][i] : 0;
            // c: AND of multiplier bit w-1-i with multiplicand bit number
            //    `stage`; zero once stage reaches w, where the
            //    multiplicand has no such bit.
            c = stage < w
              ? pl_plier[stage][w-1-i] && pl_cand[stage][stage] : 0;
            sc = a + b + c;
            { carry_next[i], sum_next[i] } = sc;
         end

         // sc still holds the result for i = w-1 here, so product bit
         // number `stage` receives the sum bit of the last position.
         prod_next[stage] = sc[0];

         /// Move *everything* to next stage.
         //
         pl_cand[stage+1]  <= pl_cand[stage];
         pl_plier[stage+1] <= pl_plier[stage];
         pl_sum[stage+1] <= sum_next;
         pl_carry[stage+1] <= carry_next;
         pl_prod[stage+1] <= prod_next;

      end

   end

   assign prod = pl_prod[stages];

endmodule
//
 /// mult_pipe_wfront
 /// Cost Analysis : Delay Analysis
//
//   Note: Cost is entire module, delay is one stage.
//
// Registers: (Cost per bit: 7;  Delay per bit: 6 )
//  pl_prod :         ( w + 1 ) * 2w * 7  =  14w^2 + 14w
//  pl_plier, pl_cand, pl_sum, pl_carry:
//    4 * ( w + 1 ) *  w * 7  =  28w^2 + 28w
//  Total: 42w^2 + 42w
//







//////////////////////////////////////////////////////////////////////////////
/// Pipelined Multiplier, Instantiated Stages
//  mult_pipe_comb_stage
//
//  Very similar to mult_pipe, except each stage is a module.


// pipe_stage: combinational logic of one multiplier pipeline stage.
//
//   w:            Operand width.
//   m:            Number of multiplier bit positions handled by the stage.
//   stage:        Stage number; the stage covers bit positions
//                 stage*m .. stage*m+m-1.
//   accum_out:    accum_in plus the partial products of this stage.
//   accum_in:     Sum of the partial products of earlier stages.
//   cand, plier:  w-bit operands.
//
module pipe_stage #( int w = 16, int m = 2, int stage = 0 )
   ( output logic [2*w-1:0] accum_out,
     input logic [2*w-1:0] accum_in,
     input logic [w-1:0] cand, plier );

   always_comb begin

      accum_out = accum_in;

      // Positions at or beyond w do not exist and contribute nothing.
      for ( int pos = stage * m;  pos < stage * m + m  &&  pos < w;  pos++ )
        if ( plier[pos] ) accum_out += cand << pos;

   end

endmodule



// mult_pipe_c_cpa: pipelined multiplier in which the logic of each stage
// is an instance of pipe_stage.
//
//   w:            Operand width.
//   m:            Number of multiplier bit positions handled by each stage.
//   prod:         Contents of the last accumulator pipeline latch.
//   cand, plier:  w-bit operands, captured into the stage-0 latches.
//   clk:          Clock.
//
module mult_pipe_c_cpa #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input clk);

   localparam int stages = ( w + m - 1 ) / m;

   // Pipeline latches. Index s holds the values entering stage s.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_cand[0:stages];
   logic [w-1:0] pl_plier[0:stages];

   assign prod = pl_accum[stages];

   // Latches feeding stage 0: a new multiplication starts with a zero
   // accumulator.
   always_ff @( posedge clk ) begin
      pl_plier[0] <= plier;
      pl_cand[0] <= cand;
      pl_accum[0] <= 0;
   end

   for ( genvar stage = 0;  stage < stages; stage++ ) begin

      uwire [2*w-1:0] accum;

      pipe_stage #(w, m, stage) our_stage
        ( .accum_out(accum), .accum_in(pl_accum[stage]),
          .cand(pl_cand[stage]), .plier(pl_plier[stage]) );

      // Latches between this stage and the next.
      always_ff @( posedge clk ) begin
         pl_accum[stage+1] <= accum;
         pl_cand[stage+1] <= pl_cand[stage];
         pl_plier[stage+1] <= pl_plier[stage];
      end

   end

endmodule



`include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_csa.v"



// pipe_stage_csa: combinational logic of one multiplier pipeline stage,
// built from a chain of m CW_csa instances.
//
//   wid:          Operand width.
//   m:            Number of multiplier bit positions handled by the stage.
//   stage:        Stage number; the stage covers bit positions
//                 stage*m .. stage*m+m-1.
//   accum_out_a, accum_out_b:
//                 The two outputs (carry and sum ports, respectively) of
//                 the last CW_csa in the chain.
//   accum_in_a, accum_in_b:
//                 The pair of values fed to the first CW_csa in the chain.
//   cand, plier:  wid-bit operands.
//
module pipe_stage_csa #( int wid = 16, int m = 2, int stage = 0 )
   ( output uwire [2*wid-1:0] accum_out_a, accum_out_b,
     input uwire [2*wid-1:0] accum_in_a, accum_in_b,
     input uwire [wid-1:0] cand, plier );

   // Element i holds the outputs of CSA i; element -1 holds the stage inputs.
   uwire [2*wid-1:0] accum_a[m-1:-1];
   uwire [2*wid-1:0] accum_b[m-1:-1];

   // Carry-out of each CSA; not used.
   uwire co[m-1:0];

   assign accum_a[-1] = accum_in_a;
   assign accum_b[-1] = accum_in_b;
   assign accum_out_a = accum_a[m-1];
   assign accum_out_b = accum_b[m-1];

   for ( genvar i = 0; i < m; i++ ) begin

      localparam int pos = stage * m + i;

      // Partial product for bit position pos.
      uwire [2*wid-1:0] pp;

      // When wid is not a multiple of m the last stage has positions
      // beyond the end of plier. Choose at elaboration time so that
      // plier is never indexed out of range.
      if ( pos < wid ) begin
         assign pp = plier[pos] ? cand << pos : 0;
      end else begin
         assign pp = 0;
      end

      CW_csa #(2*wid) csa
        ( .carry(accum_a[i]), .sum(accum_b[i]), .co(co[i]),
          .a(accum_a[i-1]), .b(accum_b[i-1]), .c(pp), .ci(1'b0) );

   end

endmodule

// mult_pipe_c_csa: pipelined multiplier in which the logic of each stage
// is an instance of pipe_stage_csa.
//
//   wid:          Operand width.
//   m:            Number of multiplier bit positions handled by each stage.
//   prod:         Sum of the two accumulator latches leaving the last stage.
//   cand, plier:  wid-bit operands, captured into the stage-0 latches.
//   clk:          Clock.
//
module mult_pipe_c_csa #( int wid = 16, int m = 2 )
   ( output uwire [2*wid-1:0] prod,
     input uwire [wid-1:0] cand, plier,
     input uwire clk);

   localparam int stages = ( wid + m - 1 ) / m;

   // Pipeline latches. Index s holds the values entering stage s.
   // The accumulated value is carried as a pair, pl_accum_a and pl_accum_b.
   logic [2*wid-1:0] pl_accum_a[0:stages];
   logic [2*wid-1:0] pl_accum_b[0:stages];
   logic [wid-1:0] pl_cand[0:stages];
   logic [wid-1:0] pl_plier[0:stages];

   // The pair is added together only after the last stage.
   assign prod = pl_accum_a[stages] + pl_accum_b[stages];

   // Latches feeding stage 0: a new multiplication starts with both
   // halves of the accumulator pair at zero.
   always_ff @( posedge clk ) begin
      pl_plier[0] <= plier;
      pl_cand[0] <= cand;
      pl_accum_b[0] <= 0;
      pl_accum_a[0] <= 0;
   end

   for ( genvar stage = 0;  stage < stages;  stage++ ) begin

      uwire [2*wid-1:0] accum_a, accum_b;

      pipe_stage_csa #(wid, m, stage) our_stage
        ( .accum_out_a(accum_a), .accum_out_b(accum_b),
          .accum_in_a(pl_accum_a[stage]), .accum_in_b(pl_accum_b[stage]),
          .cand(pl_cand[stage]), .plier(pl_plier[stage]) );

      // Latches between this stage and the next.
      always_ff @( posedge clk ) begin
         pl_accum_a[stage+1] <= accum_a;
         pl_accum_b[stage+1] <= accum_b;
         pl_cand[stage+1] <= pl_cand[stage];
         pl_plier[stage+1] <= pl_plier[stage];
      end

   end

endmodule



//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

// cadence translate_off

// reactivate: program block that forwards clk and cycle, unchanged, to
// clk_reactive and cycle_reactive through continuous assignments.
//
// NOTE(review): presumably a program block is used so that testbench code
// waiting on the *_reactive copies runs after the design has settled for
// the time step -- confirm against the simulator's scheduling behavior.
//
program reactivate
   (output uwire clk_reactive, output int cycle_reactive,
    input uwire clk, input int cycle);
   assign clk_reactive = clk;
   assign cycle_reactive = cycle;
endprogram

// testbench: instantiates the multipliers, applies num_tests operand
// pairs, and compares each multiplier's output against prod[0], the
// output of mult_behav_1.
//
module testbench;

   localparam int wid = 16;          // Operand width used for every multiplier.
   localparam int num_tests = 1000;  // Number of operand pairs applied.
   localparam int NUM_MULT = 20;     // Number of elements in prod.
   localparam int err_limit = 7;

   bit use_others;
   // plier, cand: operands for the current test.
   // plierp, candp: operands presented to the pipelined units.
   logic [wid-1:0] plier, cand;
   logic [wid-1:0] plierp, candp;
   logic [2*wid-1:0] prod[NUM_MULT];

   // Per-multiplier bookkeeping, looked up by a descriptive name.
   //   idx:       Element of prod connected to the multiplier.
   //   err_count: Number of wrong results seen.
   //   seq, pipe, wf, deg: Used below to compute the expected latency.
   //   sout:      Output value sampled for the current test.
   //   cyc_tot:   Running total of cycles from test start to sample.
   //   latency:   Latency computed for the most recent test.
   typedef struct { int idx; int err_count = 0;
                    bit seq = 0; bit pipe = 0; bit wf = 0; int deg = 1;
                    logic [2*wid-1:0] sout = 'h111; int cyc_tot = 0;
                    int latency = 0;
                    } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * wid * 8;
   int cycle;
   bit done;
   logic clock;

   logic      clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Clock generation and simulation time-out.
   initial begin
      clock = 0;
      cycle = 0;

      fork
         // Every 10 time units: add the current value of clock to cycle,
         // then toggle clock. So cycle advances once per clock period.
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   // Operands for the pipelined units, updated one time unit after each
   // positive edge of clk_reactive.
   initial begin

      while ( !done ) @( posedge clk_reactive ) #1

         if ( use_others ) begin

            // A test was just started: present its operands and clear
            // the flag.
            plierp = plier;
            candp = cand;
            use_others = 0;

         end else begin

            // Otherwise present filler values.
            plierp = cycle;
            candp = 256;

         end
   end

   // The pi_* tasks record a multiplier under the name "<name> Deg <deg>"
   // together with its prod index and the flags used for latency.
   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1;
   endtask

   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1;
   endtask

   task pi_wpipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].wf = 1;
   endtask

   // Reference multiplier. Its output, prod[0], is treated as correct.
   mult_behav_1 #(wid) mb1(prod[0], plier, cand);
   initial pi["Behavioral"].idx = 0;

   // Modules under test. Each drives its own element of prod.
   mult_pipe1    #(wid) ms18(prod[18], plierp, candp, clock);
   initial pi_pipe(18,"Pipelined Simple",1);

   mult_pipe    #(wid,4) ms54(prod[7], plierp, candp, clock);
   initial pi_pipe(7,"Pipelined",ms54.m);

   mult_pipe    #(wid,3) ms53(prod[8], plierp, candp, clock);
   initial pi_pipe(8,"Pipelined",ms53.m);

   mult_pipe_wfront #(wid,1) ms4(prod[4], plierp, candp, clock);
   initial pi_wpipe(4,"Pipelined WF",ms4.m);

   mult_pipe_2    #(wid,4) ms17(prod[17], plierp, candp, clock);
   initial pi_pipe(17,"Pipelined 2",ms17.m);

   mult_pipe_2    #(wid,3) ms16(prod[16], plierp, candp, clock);
   initial pi_pipe(16,"Pipelined 2",ms16.m);

   mult_pipe_c_cpa #(wid,4) pgam4(prod[12], plierp, candp, clock);
   initial pi_pipe(12,"Pipelined Comb CPA",pgam4.m);

   mult_pipe_c_cpa    #(wid,3) pgam13(prod[13], plierp, candp, clock);
   initial pi_pipe(13,"Pipelined Comb CPA",pgam13.m);

   mult_pipe_c_csa    #(wid,3) pgam2(prod[2], plierp, candp, clock);
   initial pi_pipe(2,"Pipelined CSA",pgam2.m);

   mult_pipe_c_csa    #(wid,4) pgam3(prod[3], plierp, candp, clock);
   initial pi_pipe(3,"Pipelined CSA",pgam3.m);


   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   //
   int tests[$] = {1,1, 1,2,  1,32,  2,1,  32, 1};

   // Main test sequence.
   initial begin

      done = 0;
      use_others = 0;

      // Make sure that no two multipliers are assigned the same index ..
      // .. and show unused indices.
      #0 begin
         string index_used[NUM_MULT];
         automatic int n_unused = 0, n_reused = 0;
         foreach ( pi[ mut ] ) begin
            automatic int idx = pi[mut].idx;
            if ( index_used[idx].len() )
              begin
                 $write("*** Index %0d used by %s and %s.\n",
                        idx, index_used[idx], mut );
                 n_reused++;
              end
            index_used[idx] = mut;
         end
         $write("Unused positions: ");
         foreach ( index_used[idx] )
           if ( index_used[idx].len() == 0 )
             $write("%s%0d", n_unused++ ? ", " : "", idx);
         $write("%s.\n",n_unused ? "" : "none -- all used");
         if ( n_reused )
           $fatal(2, "\nFound %0d re-used indices. Aborting simulation.\n\n",
                  n_reused);
      end

      @( posedge clk_reactive );

      for ( int i=0; i<num_tests; i++ ) begin
         automatic int cyc_start = cycle;
         // Number of multipliers whose output has not yet been sampled.
         automatic int awaiting = pi.num();

         // Set multiplier and multiplicand values for non-piped units.
         //
         plier = tests.size() ? tests.pop_front() : $random();
         cand = tests.size() ? tests.pop_front() : $random();

         // Set multiplier and multiplicand values for piped units.
         //
         plierp = plier;
         candp = cand;
         use_others = 1;

         // Start one thread per multiplier. Each thread waits until the
         // cycle at which that multiplier's result is expected and then
         // samples its output.
         foreach ( pi[muti] ) begin
            automatic string mut = muti; // Bug workaround?
            automatic Info p = pi[mut];

            fork begin
               automatic int arrival_late = 1;
               // Number of steps needed to cover wid bits, deg at a time.
               automatic int steps = ( wid + pi[mut].deg - 1 ) / pi[mut].deg;
               // Expected latency in cycles, chosen from the seq, pipe,
               // and wf flags recorded for this multiplier.
               automatic int latency
                 = !pi[mut].seq ? 1 :
                   !pi[mut].pipe ? 2 * steps :
                    pi[mut].wf ? arrival_late + 2 * steps :
                    arrival_late + steps;
               automatic int eta = 1 + cyc_start + latency;
               pi[mut].latency = latency;
               wait ( cycle_reactive == eta );
               awaiting--;
               pi[mut].sout = prod[pi[mut].idx];
               pi[mut].cyc_tot += cycle - cyc_start;
            end join_none;
         end
         // Continue once every thread has sampled its output.
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] )
           if ( prod[0] !== pi[mut].sout ) begin
              pi[mut].err_count++;
              // Only the first few errors of each multiplier are printed.
              if ( pi[mut].err_count < 5 )
                $write
                  ("%-25s wrong result: %0d * %0d:  0x%0h != 0x%0h (correct)\n",
                   mut, plier, cand, pi[mut].sout, prod[0]);
           end

         @( posedge clk_reactive );

      end

      // Summary line for each multiplier.
      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on