mult-seq.v

/// EE 4755 - Digital Design Using HDLs
//
//  Classroom demo code.


////////////////////////////////////////////////////////////////////////////////
/// Binary Multiplication Algorithm

 /// Long Hand Procedure Review
//
//  Multiply 5 times 12 in binary:
//
//     0101  cand  -- Multiplicand
//     1100  plier -- Multiplier
//     """"
//     0000  Partial Product
//    0000
//   0101
//  0101
//  """""""
//  0111100  prod  -- Product


////////////////////////////////////////////////////////////////////////////////
/// Delay, Critical Path, and Latency

 /// :Def: Critical Path
 //        The longest path starting at a launch point and ending at
 //        a capture point.
 //
 //  In a combinational circuit the launch points are usually the
 //  module inputs and the capture points are usually the module
 //  outputs.
 //
 //  In a sequential circuit typically:
 //
 //     - All register outputs are launch points.
 //     - Module inputs may or may not be launch points.
 //     - All register inputs are capture points.
 //     - Module outputs may or may not be capture points.
 //
 /// :Def: Latency [of an action in a sequential circuit]
 //        Product of number of cycles needed to complete the action ..
 //        .. and the clock period.



`default_nettype none


//////////////////////////////////////////////////////////////////////////////
/// Behavioral Multiplier


module mult_behav_1
  #(int w = 16)
   (output uwire [2*w-1:0] prod, input uwire [w-1:0] plier, cand);

   assign prod = plier * cand;
endmodule


//////////////////////////////////////////////////////////////////////////////
/// Linear Multiplier

 /// Simple Adder, Don't Modify
module carry_prop_adder #(int w=16)
   (output uwire [w:1] s, input uwire [w:1] a,b);
   assign s = a + b;
endmodule


module mult_linear
  #(int w = 16)
   (output logic [2*w-1:0] prod, input uwire [w-1:0] plier, cand);

   logic [2*w-1:0] rsum [w-1:-1];

   assign rsum[-1] = 0;

   for ( genvar i=0; i<w; i++ ) begin
      uwire [2*w-1:0] pprod = cand[i] ? plier << i : 0;
      carry_prop_adder #(2*w) adder(rsum[i], rsum[i-1], pprod );
   end

   assign    prod = rsum[w-1];

endmodule
//
 /// Cost Analysis
//
//   Lightly optimized Ripple Implementation:
//    pprod = plier[i] ? cand << i : 0;
//      Notes: i is a constant, and so plier[i] and cand << i are constants.
//             Because else side is zero and only w bits can be nonzero ..
//             .. consists of just w and gates.
//      w w = w^2
//    Adders: w 9 w = 9 w^2
//      Note: only w BFA units per pp because lower i bits are unchanged.
//    Total cost: 10 w^2.
//
 /// Timing Analysis
//
//   Grid of w x w  BFAs: 8w + 2
//   Optimize using BHAs: 8w - 12




module mult_linear_clk
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk);

   uwire [2*w-1:0] p;
   logic [w-1:0] pliercpy, candcpy;

   mult_linear #(w) ml(p, pliercpy, candcpy);

   always_ff @( posedge clk ) begin
      pliercpy <= plier;
      candcpy <= cand;
      prod <= p;
   end

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Non-Recursive Tree Multiplier


module mult_tree
  #(int w = 16)
   (output logic [2*w-1:0] prod, input uwire [w-1:0] plier, cand);

   localparam int widp2 = 1 << $clog2(w);

   logic [2*w-1:0] rsum [2*w-1:0];
   localparam int mask = 2*w-1;

   // Compute partial products.
   //
   for ( genvar i=0; i<w; i++ )
     assign rsum[i] = plier[i] ? cand << i : 0;

   // Add partial products together.
   //
   for ( genvar i=w; i<2*w-1; i++ )
     carry_prop_adder #( 2*w ) adder
                 ( rsum[i],
                   rsum[ mask &   (i<<1)       ],   // Left child.
                   rsum[ mask & ( (i<<1) + 1 ) ]    // Right child.
                   );

   assign    prod = rsum[2*w-2];

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Simple Sequential Multiplier

module mult_seq #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier,
     input uwire [w-1:0] cand,
     input uwire clk);

   localparam int wlog = $clog2(w);

   // cadence translate_off
   initial if ( w != 1 << wlog ) $fatal(1,"Size must be a power of 2.");
   // cadence translate_on

   bit [wlog-1:0] pos;
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      if ( pos == 0 ) begin
         prod = accum;
         accum = 0;
      end
      if ( cand[pos] == 1 ) accum += plier << pos;
      pos++;
   end

endmodule

// 

//
 /// Cost Analysis  :  Delay Analysis
//
// Regs: prod, accum: 2 2 7 w = 28 w
// Regs: pos: 7 lg w

// pos == 0 : lg w   :  lg lg w
// if ( ) accum = 0; : 2w  : 1
// if ( ) prod = accum: 2 3 w  : 2
// cand[pos]: 3(w - 1) ≅ 3 w  : 2 lg w
// plier << pos: 3 2 w lg w = 6 w lg w  : 2 lg w
// accum += 9 2 w = 18 w  : 4w
// pos++: 9 lg w  : 4 + 2 lg w
//
// Total cost: 28w + 7lg w + lgw + 2w + 6w + 3 w + 6 w lgw + 18 w + 9 lgw
// =           57 w + 17 lg w + 6 w lg w
//
// Paths
//  pos == 0 -> accum=0 -> : lg lg w + 1
//  plier << pos : 2 lg w
// Critical Path
//  plier << pos -> accum += -> accum = : 2 lg w + 4w + 2
// Register Delay: 6
//
// Latency
//  w ( 4w + 2lg w + 2 + 6 )  =  4w^2 + 2 w lg w + 8w


//////////////////////////////////////////////////////////////////////////////
/// Sequential Multiplier, Using Instantiated Adder
//
//  Simple multiplier, no handshaking.
//
//  The cost of mult_seq_ga is about the same as mult_seq.

module mult_seq_ga #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk );

   localparam int wlog = $clog2(w);

   bit [wlog-1:0] pos;
   bit [2*w-1:0] accum;
   uwire [2*w-1:0] sum;
   uwire [2*w-1:0] pp = cand[pos] ? plier << pos : 0;

   carry_prop_adder #(2*w) ga( sum, accum, pp );

   always_ff @( posedge clk ) pos <= pos + 1;

   always_ff @( posedge clk )
     if ( pos == 0 ) begin
        prod = sum;
        accum = 0;
     end else begin
        accum = sum;
     end

endmodule



//////////////////////////////////////////////////////////////////////////////
/// Streamlined Sequential Multiplier

 /// Techniques For Lowering Cost
//
//   Instead of shifting the multiplier, shift the accumulator.
//   Use part of the accumulator to store the multiplicand.

module mult_seq_stream #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk);

   localparam int wlog = $clog2(w);

   bit [wlog-1:0] pos;
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      logic [w:0] pp;

      if ( pos == 0 ) begin

         prod = accum;
         accum = cand;
         pos = w - 1;

      end else begin

         pos--;

      end

      // Note: the multiplicand is in the lower bits of the accumulator.
      //
      pp = accum[0] ? { 1'b0, plier } : 0;

      // Add on the partial product and shift the accumulator.
      //
      accum = { { 1'b0, accum[2*w-1:w] } + pp, accum[w-1:1] };

   end

endmodule

// 

 /// Cost Analysis  :  Timing Analysis
//
// Regs: prod, accum:  2 2 7 w = 28 w
// Regs: pos: 7 lg w

// pos == 0 : lg w   :  lg lg w
// if ( ) prod = accum: 2 3 w  : 2
// if ( ) accum = cand; : w + 3w = 4w  : 2
// if ( ) pos = w-1 / pos-- : 3 lg w  : 2
// pos--: 9 lg w  : 4 + 2 lg w
//

// pp = accum[0] ? { 1'b0, plier } : 0;: w  : 1
// accum = { 1'b0, accum[2*w-1:w] } + pp:  9w  : 2 + 2w

//
// Total cost: 28w + 7lg w + lg w + 6w + 4w + 3 lg w + 9 lg w + w + 9 w
// =           48 w + 20 lg w
//
// Critical Path
//  pos == 0 -> accum=cand ->  accum[0] ? : 0 -> + pp :
//   lg lg w + 2 + 1 + 2 + 2w = 2w + lg lg w + 5 ≅ 2w + 5
// Register Delay: 6
//
// Latency
//  w ( 6 + 2w + 5 ) = 2w^2 + 11

 /// Synthesis Data
//
`ifdef XXX
Module Name                             Area  Period  Period    Total    Init.
                                              Target  Actual  Latency   Interv
mult_behav_1_w8                        53168    1000    6062     6062     6062
mult_behav_1_w16                      215672    1000   13551    13551    13551
mult_behav_1_w32                      764479    1000   26485    26485    26485
mult_behav_1_w64                     2891332    1000   52332    52332    52332

mult_seq_w8                            56081    1000    8120    64960    64960
mult_seq_w16                          122475    1000   15916   254656   254656
mult_seq_w32                          258750    1000   31385  1004320  1004320
mult_seq_w64                          544285    1000   58476  3742464  3742464

mult_seq_stream_w8                     44320    1000    4518    36144    36144
mult_seq_stream_w16                    78395    1000    8868   141888   141888
mult_seq_stream_w32                   153863    1000   16361   523552   523552
mult_seq_stream_w64                   304047    1000   30276  1937664  1937664
endmodule
`endif
//
// As expected, cost grows linearly with w, as does the clock period.
// Cost of streamlined much lower then mult_seq, as is the clock period.
// But, latency is still O(w^2) (it quadruples when w is doubled).


//////////////////////////////////////////////////////////////////////////////
/// Degree-m Sequential Multipliers

// Compute m partial products in each iteration.
//
// Will the synthesis program figure it out?


module mult_seq_m #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk);

   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   bit [iter_lg:1] iter;
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      if ( iter == iter_lg'(iterations) ) begin

         prod = accum;
         accum = 0;
         iter = 0;

      end

      for ( int i=0; i<m; i++ )
        begin
           int pos;
           pos = iter * m + i;
           if ( cand[pos] ) accum += plier << pos;
        end

      iter++;

   end

endmodule


module mult_seq_dm
  #( int w = 16,
     int m = 2 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk);

   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   bit [iter_lg:1] iter;
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      if ( iter == iter_lg'(iterations) ) begin

         prod = accum;
         accum = 0;
         iter = 0;

      end

      accum += plier * cand_2d[iter] << ( iter * m );

      iter++;

   end

endmodule

 /// Cost Analysis  :  Timing Analysis
//
// Regs: accum, prod: 2 2 7 w = 28 w : 6
// Regs: iter: 7 lg w/m : 6
//
// iter == iter_lg'(iterations): 3 lg w/m + lg w/m - 1 ≅ 4 lg w/m  : 2 + lg w/m
// prod = accum: 3 2 w = 6 w  : 2
// accum = 0: 2w  : 1
// iter = 0: lg w/m  : 1
//
// cand_2d[iter]: 3 (lg w/m-1) \approx 3 lg w/m  : 2 lg w/m
// * : m 9 w  : 6m + 2w 
// << iter*m : 3 w lg w/m  : 2 lg w/m
// accum += : 9 2 w = 18 w  : 4w
// iter++: 9 lg w/m  : 2 + 2 lg w/m
//
// Total Cost: 28w + 7 lg[w/m] + 4 lg[w/m] + 6w + 2w + lg[w/m] 
//    + 3lg[w/m] + 9m w + 3w lg[w/m] + 18w + 9 lg[w/m]
//  = 54 w + 9 m w + 24 lg[w/m] + 3 w lg[w/m]
//
// Critical Path -- Not overlapping plier * cand with accum +
// iter == iterations -> accum=0;iter=0; -> cand_2d[iter] -> * -> << -> +
//  (2 + lg[w/m]) + 1 + 2 lg[ w/m] + ( 6m + 2w ) + 2 lg[w/m] + 4w
//  = 6 w + 6 m + 5 lg[w/m] + 3
//
// Latency
//  w/m ( 6 w + 6 m + 5 lg[w/m] + 3 + 6 )
//  = 6 w^2/m + 9 w/m + 6w + 5w/m lg[w/m]


//////////////////////////////////////////////////////////////////////////////
/// Workfront Sequential Multipliers
//
 /// Goal: Avoid O(w) Clock Period Imposed by Ripple Adder
//
//   Idea:
//     Consider the grid of binary full adders (BFAs) that might be
//     synthesized for module mult_linear.
//
//     Let B[i][j] denote the BFA handling ..
//     .. bit j of ..
//     .. the partial product for multiplicand bit i.
//
//     Call the input ports:       B[i][j].a,   B[i][j].b,  B[i][j].ci
//     .. and the output ports:    B[i][j].sum, B[i][j].co
//
//     We know that the carry output B[i][j].co ..
//     .. connects to B[i][j+1].ci.
//
//     We also know that output B[i][j].sum ..
//     .. connects to B[i+1][j].a.
//
//     So, if B[i][j], B[i+1][j-1], and B[i-1][j+1] ..
//     .. are all computed at, say, cycle x, ..
//     .. then B[i+1][j] and B[i][j+1] can be computed in cycle x+1.
//
//     Input B[i][j].b = cand[i] && plier[i+j]
//        does not depend on any BFA ..
//        .. and so does not restrict at which cycle B[i][j] can execute.
//
//     A possible plan is, in cycle c, to execute ..
//       { B[i][j] : i + j = c,  i in [0,w-1],  j in [0,2w-1] }
//     For example,
//       Cycle 0: B[0][0]
//       Cycle 1: B[0][1], B[1][0]
//       Cycle 2: B[0][2], B[1][1], B[2][0]
//       Cycle 3: B[0][3], B[1][2], B[2][1], B[3][0]
//
//     Note that B[0][0], B[0][1], etc. would all be the same BFA.
//
//     The arrangement above has the disadvantage of using just one
//     BFA in cycle 0, one in cycle 1, etc. The set of active BFAs is
//     called the workfront.  The workfront modules described here
//     connect the BFAs differently, keeping the size of the workfront at
//     w throughout the multiplication. That's achieved by setting:
//
//       B[i][j].a  = B[i-1][j-1].sum
//       B[i][j].ci = B[i][j-1].sum
//       B[i][j].b  = plier[j] && cand[w-1-i] for j <= w;
//
//     Here are the active BFAs with this scheme:
//       Cycle 0: B[0][0], B[1][0], B[2][0], ... B[w-1][0]
//       Cycle 1: B[0][1], B[1][1], B[2][1], ... B[w-1][1]
//
//     Also, note that with this scheme ..
//     .. prod[cyc] = B[w-1][cyc].sum
//
//    Module mult_seq_wfront implements this scheme. Note that only w
//    BFAs are needed, and that multiplication takes 2w cycles.
//
module mult_seq_wfront
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk );

   localparam int wlog = $clog2(2*w);
   // cadence translate_off
   if ( 2**wlog != 2*w )
     $fatal(2,"Size, parameter w=%0d, must be a power of 2.\n",w);
   // cadence translate_on

   bit [wlog-1:0] pos;

   always_ff @( posedge clk ) pos <= pos + 1;

   logic [w-1:0] sum, carry;
   logic [1:0] sc;

   always_ff @( posedge clk ) begin

      for ( int i=0; i<w; i++ ) begin
         logic a, b, c;
         a = pos < w && cand[w-1-i] ? plier[pos] : 0;
         b = pos && i               ? sum[i-1]   : 0;
         c = pos                    ? carry[i]   : 0;
         sc = a + b + c;
         sum[i] <= sc[0];
         carry[i] <= sc[1];
      end

      prod[pos] = sc[0];

   end

endmodule


 /// Optimized Workfront Multiplier
//
module mult_seq_wfront_opt
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk );

   localparam int wlog = $clog2(w);
   // cadence translate_off
   if ( 2**wlog != w )
     $fatal(2,"Size, parameter w=%0d, must be a power of 2.\n",w);
   // cadence translate_on

   bit pos_eq_0;
   bit [wlog:0] pos;

   always_ff @( posedge clk ) { pos_eq_0, pos } <= pos + 1;

   uwire pos_lt_w = !pos[wlog];    // Efficiently compute  pos < w.

   logic [w-1:0] sum, carry;
   logic [1:0] sc;

   uwire [w-1:0] plier_rot = {plier[0],plier[w-1:1]};
   logic plier_pos;
   always_ff @( posedge clk ) plier_pos <= plier_rot[wlog'(pos)];

   always_ff @( posedge clk ) begin

      for ( int i=0; i<w; i++ ) begin
         logic a, b, c;
         a = !pos_eq_0              && carry[i];
         b = !pos_eq_0 && i         && sum[i-1];
         c =  pos_lt_w && plier_pos && cand[w-1-i];
         sc = a + b + c;
         sum[i]   <= sc[0];
         carry[i] <= sc[1];
      end

      prod[pos] = sc[0];

   end

endmodule

 // :

 /// Cost Analysis  :  Timing Analysis
//
// Registers:
//  prod: 2w * 7 = 14w.
//  sum,carry: 2 * w * 7 = 14w
//  pos, pos_eq_0:  7 ( 2 + lg w ) \approx  7 lg w
//  plier_pos: 7
//
// always_ff @( posedge clk ) { pos_eq_0, pos } <= pos + 1;
//  Note: One input is a constant, so use BHA's.
//  Ripple adder: 3 lg 2w : 1 + lg w.
//
// always_ff @( posedge clk ) plier_pos <= plier_rot[wlog'(pos)];
//   Note: {plier[0],plier[w-1:1]} is free, it's just renaming bits.
//   Same hardware used by all i iterations: a w-bit mux to extract plier bit.
//   mux:  3(w-1) : 2 lg w
//
//
// a = !pos_eq_0              && carry[i];
// b = !pos_eq_0 && i         && sum[i-1];
// c =  pos_lt_w && plier_pos && cand[w-1-i];
//   Notes: 
//     i and w are constants, so carry[i], etc, ..
//       .. just refer to a bit, no mux needed.
//     pos_lt_w && plier_pos same for all i, so computed once.
//     Delay is for c.
// Once: 1
// Per iter: 3
// All iters: 1 + 3w : 2 
//
// sc = a + b + c;
//   Note: Just a binary full adder.
//   Per iteration:  9  : 4
//   All iters:      9w : 4
//
// prod[pos] = sc[0]
//  Decoder for bit-addressable register: lg 2w bits to 2w bits
//   4w - 4 : 1 + lg w
//  Muxen to select sc[0] or prior bit:
//   2w * 3 = 6w : 2
//  Total: 10w - 4 : 2 + lg 2w = 3 + lg w
//
// Total Cost:
//  14w+14w+7lg[w]+7 + 3 (1+lg[w]) + 3(w-1) + 1+3w + 9w + 10w-4
//  = 53 w + 10 lg w + 4
//
// Paths
//  plier mux: 2 lg w
//  pos+1:     1+ lg w
//  pos_lt_w && cand[w-1-i] && plier_pos -> BFA(cin) ->
//   : 1 + 1 + 2 = 4
//  !pos_eq_0 && carry[i] -> BFA -| prod[pos] internal mux
//   : 1 + 4 = 5
//  prod[pos] decode network -| prod[pos] internal mux
//   : 1 + lg w
//
// Critical Path (the winner is, for w > 4).
//   plier mux: 2 lg w
//
// Clock Period: (Register delay: 6)
//   6 + 2 lg w 
// Latency: Based on 2w cycles.
//   6w + 2w lg w
//
 /// Comparison with Synthesis
`ifdef xxx
Module Name                             Area  Period  Period    Total    Init.
                                              Target  Actual  Latency   Interv
mult_seq_wfront_w8                     45390    1000    3132    50112    50112
mult_seq_wfront_w16                    89668    1000    3260   104320   104320
mult_seq_wfront_w32                   178367    1000    4202   268928   268928
mult_seq_wfront_w64                   345415    1000    4716   603648   603648

mult_seq_wfront_opt_w8                 47575    1000    2428    38848    38848
mult_seq_wfront_opt_w16                94652    1000    2275    72800    72800
mult_seq_wfront_opt_w32               177706    1000    2546   162944   162944
mult_seq_wfront_opt_w64               345301    1000    2724   348672   348672

mult_seq_stream_w8                     44320    1000    4518    36144    36144
mult_seq_stream_w16                    78395    1000    8868   141888   141888
mult_seq_stream_w32                   153863    1000   16361   523552   523552
mult_seq_stream_w64                   304047    1000   30276  1937664  1937664

`endif
//
// Optimizations reduce period without significant cost impact.
//  
// Cost growth with w agrees with analysis, but period does not grow
// as fast as modeled.
//
// Workfront multiplier much faster than streamlined for w>8



 /// Degree-m Workfront Multiplier
//
module mult_seq_wfront_m #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand,
     input uwire clk );

   localparam int iterations = ( 2*w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   localparam int wlog = $clog2(m * iterations);

   bit [iter_lg-1:0] iter;

   always_ff @( posedge clk ) iter <= iter + 1;

   logic [w-1:-1] sum, carry;

   always_ff @( posedge clk ) begin

      logic [w-1:-1] j_sum[m+1], j_carry[m+1];
      logic [1:0] sc;

      j_sum[0] = iter ? sum : 0;
      j_carry[0] = iter ? carry : 0;

      for ( int j=0; j<m; j++ ) begin

         logic [wlog-1:0] pos;
         pos = m*iter + j;

         j_sum[j+1][-1] = 0;

         for ( int i=0; i<w; i++ ) begin
            logic a, b, c;
            a = j_carry[j][i];
            b = j_sum[j][i-1];
            c = pos < w && cand[w-1-i] && plier[pos];
            sc = a + b + c;
            { j_carry[j+1][i], j_sum[j+1][i] } = sc;
         end
         if ( pos < 2*w ) prod[pos] = sc[0];

      end

      sum <= j_sum[m];
      carry <= j_carry[m];

   end

endmodule

 /// Synthesis Data
//
`ifdef DONT_DEFINE_ME
Module Name                             Area  Period  Period    Total    Init.
                                              Target  Actual  Latency   Interv
mult_seq_wfront_w16                    89668    1000    3260   104320   104320
mult_seq_wfront_opt_w16                94652    1000    2275    72800    72800

mult_seq_wfront_m_w16_m1               97631    1000    3500   112000   112000
mult_seq_wfront_m_w16_m2              104787    1000    3658    58528    58528
mult_seq_wfront_m_w16_m4              130378    1000    3942    31536    31536
mult_seq_wfront_m_w16_m8              171258    1000    6417    25668    25668

mult_seq_dm_w16_m1                    121611    1000   16360   261760   261760
mult_seq_dm_w16_m2                    131048    1000   16730   133840   133840
mult_seq_dm_w16_m4                    145285    1000   16704    66816    66816
mult_seq_dm_w16_m8                    196118    1000   15161    30322    30322


mult_seq_wfront_w32                   178367    1000    4202   268928   268928
mult_seq_wfront_opt_w32               177706    1000    2546   162944   162944

mult_seq_wfront_m_w32_m1              191334    1000    3766   241024   241024
mult_seq_wfront_m_w32_m2              205303    1000    3857   123424   123424
mult_seq_wfront_m_w32_m4              260182    1000    5266    84256    84256
mult_seq_wfront_m_w32_m8              351910    1000    7031    56248    56248

mult_seq_dm_w32_m1                    246818    1000   31113   995616   995616
mult_seq_dm_w32_m2                    279486    1000   30994   495904   495904
mult_seq_dm_w32_m4                    314724    1000   32127   257016   257016
mult_seq_dm_w32_m8                    408659    1000   31251   125004   125004


`endif


`ifdef DONT_DEFINE_ME

// Data collected  9 November 2018,  8:53:10 CST

Module Name                             Area  Period  Period    Total    Init.
                                              Target  Actual  Latency   Interv
mult_behav_1_w8                        53168    1000    6062     6062     6062
mult_behav_1_w16                      215672    1000   13551    13551    13551
mult_behav_1_w32                      764479    1000   26485    26485    26485
mult_behav_1_w64                     2891332    1000   52332    52332    52332

mult_seq_stream_w8                     44320    1000    4518    36144    36144
mult_seq_stream_w16                    78395    1000    8868   141888   141888
mult_seq_stream_w32                   153863    1000   16361   523552   523552
mult_seq_stream_w64                   304047    1000   30276  1937664  1937664

mult_seq_wfront_w8                     45390    1000    3132    50112    50112
mult_seq_wfront_w16                    89668    1000    3260   104320   104320
mult_seq_wfront_w32                   178367    1000    4202   268928   268928
mult_seq_wfront_w64                   345415    1000    4716   603648   603648

mult_seq_wfront_opt_w8                 47575    1000    2428    38848    38848
mult_seq_wfront_opt_w16                94652    1000    2275    72800    72800
mult_seq_wfront_opt_w32               177706    1000    2546   162944   162944
mult_seq_wfront_opt_w64               345301    1000    2724   348672   348672

mult_pipe_wfront_w8                   239827    1000     993     7944     7944
mult_pipe_wfront_w16                 1012675    1000    1173    18768    18768
mult_pipe_wfront_w32                 4158007    1000    1512    48384    48384
mult_pipe_wfront_w64                16865186    1000    2256   144384   144384

mult_seq_csa_w8                        80488    1000    9266    74128    74128
mult_seq_csa_w16                      162743    1000   16580   265280   265280
mult_seq_csa_w32                      343497    1000   31074   994368   994368
mult_seq_csa_w64                      715059    1000   60431  3867584  3867584

mult_pipe1_w8                         137822    1000    3859    30872     3859
mult_pipe1_w16                        571541    1000    7499   119984     7499
mult_pipe1_w32                       2325284    1000   14746   471872    14746
mult_pipe1_w64                       9397076    1000   28722  1838208    28722


mult_seq_dm_w16_m1                    121611    1000   16360   261760   261760
mult_seq_dm_w16_m2                    131048    1000   16730   133840   133840
mult_seq_dm_w16_m4                    145285    1000   16704    66816    66816
mult_seq_dm_w16_m8                    196118    1000   15161    30322    30322

mult_seq_dm_w32_m1                    246818    1000   31113   995616   995616
mult_seq_dm_w32_m2                    279486    1000   30994   495904   495904
mult_seq_dm_w32_m4                    314724    1000   32127   257016   257016
mult_seq_dm_w32_m8                    408659    1000   31251   125004   125004

mult_seq_wfront_m_w16_m1               97631    1000    3500   112000   112000
mult_seq_wfront_m_w16_m2              104787    1000    3658    58528    58528
mult_seq_wfront_m_w16_m4              130378    1000    3942    31536    31536
mult_seq_wfront_m_w16_m8              171258    1000    6417    25668    25668

mult_seq_wfront_m_w32_m1              191334    1000    3766   241024   241024
mult_seq_wfront_m_w32_m2              205303    1000    3857   123424   123424
mult_seq_wfront_m_w32_m4              260182    1000    5266    84256    84256
mult_seq_wfront_m_w32_m8              351910    1000    7031    56248    56248


`endif



//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

// cadence translate_off

program reactivate
   (output uwire clk_reactive, output int cycle_reactive,
    input uwire clk, input var int cycle);
   assign clk_reactive = clk;
   assign cycle_reactive = cycle;
endprogram

module testbench;

   localparam int w = 16;
   localparam int num_tests = 1000;
   localparam int NUM_MULT = 20;
   localparam int err_limit = 7;

   bit use_others;
   logic [w-1:0] plier, cand;
   logic [w-1:0] plierp, candp;
   logic [2*w-1:0] prod[NUM_MULT];

   typedef struct { int idx; int err_count = 0;
                    bit seq = 0; bit pipe = 0; bit wf = 0; int deg = 1;
                    logic [2*w-1:0] sout = 'h111; int cyc_tot = 0;
                    int latency = 0;
                    } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * w * 8;
   int cycle;
   bit done;
   logic clock;

   logic      clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   initial begin
      clock = 0;
      cycle = 0;

      fork
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   initial begin

      while ( !done ) @( posedge clk_reactive ) #1

         if ( use_others ) begin

            plierp = plier;
            candp = cand;
            use_others = 0;

         end else begin

            plierp = cycle;
            candp = 256;

         end
   end

   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1;
   endtask

   task pi_seqw(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].wf = 1;
   endtask

   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1;
   endtask

   mult_behav_1 #(w) mb1(prod[0], plier, cand);
   initial pi["Behavioral"].idx = 0;

   mult_linear  #(w) ms1(prod[1], plier, cand);
   initial pi["Linear"].idx = 1;

   mult_tree    #(w) ms2(prod[2], plier, cand);
   initial pi["Tree"].idx = 2;

   mult_seq #(w) ms3(prod[3], plier, cand, clock);
   initial begin
      automatic string m = "Sequential";
      pi[m].idx = 3; pi[m].seq = 1;
   end

   mult_seq_ga  #(w) msga1(prod[11], plier, cand, clock);
   initial begin
      automatic string m = "Sequential GA";
      pi[m].idx = 11; pi[m].seq = 1;
   end

   mult_seq_stream #(w) mss1(prod[4], plier, cand, clock);
   initial begin
      automatic string m = "Sequential Streamlined";
      pi[m].idx = 4; pi[m].seq = 1;
   end

   mult_seq_wfront   #(w) ms7(prod[7], plier, cand, clock);
   initial pi_seqw(7,"WFront", 1);
   mult_seq_wfront_opt   #(w) ms8(prod[8], plier, cand, clock);
   initial pi_seqw(8,"WFront-Opt", 1);

   mult_seq_wfront_m   #(w,2) ms12(prod[12], plier, cand, clock);
   initial pi_seqw(12,"WFront", ms12.m);

   mult_seq_wfront_m   #(w,5) ms13(prod[13], plier, cand, clock);
   initial pi_seqw(13,"WFront", ms13.m);

   mult_seq_m   #(w,4) ms44(prod[5], plier, cand, clock);
   initial pi_seq(5,"Seq", ms44.m);

   mult_seq_m   #(w,3) ms43(prod[6], plier, cand, clock);
   initial pi_seq(6,"Seq", ms43.m);

   mult_seq_dm  #(w,1) msd14(prod[14], plier, cand, clock);
   initial pi_seq(14,"Seq Rad", msd14.m);
   mult_seq_dm  #(w,2) msd16(prod[16], plier, cand, clock);
   initial pi_seq(16,"Seq Rad", msd16.m);

   mult_seq_dm  #(w,4) msd44(prod[9], plier, cand, clock);
   initial pi_seq(9,"Seq Rad", ms44.m);

   mult_seq_dm  #(w,3) msd43(prod[10], plier, cand, clock);
   initial pi_seq(10,"Seq Rad", ms43.m);

   mult_linear_clk  #(w) mlc1(prod[15], plier, cand, clock);
   initial begin
      automatic string m = $sformatf("Linear Clock");
      pi[m].idx = 15; pi[m].seq = 1;
   end


   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   //
   int tests[$] = {1,32,  1,1, 1,2,  1,32,  32, 1};

   initial begin

      done = 0;
      use_others = 0;

      // Make sure that no two multipliers are assigned the same index ..
      // .. and show unused indices.
      #0 begin
         string index_used[NUM_MULT];
         automatic int n_unused = 0, n_reused = 0;
         foreach ( pi[ mut ] ) begin
            automatic int idx = pi[mut].idx;
            if ( index_used[idx].len() )
              begin
                 $write("*** Index %0d used by %s and %s.\n",
                        idx, index_used[idx], mut );
                 n_reused++;
              end
            index_used[idx] = mut;
         end
         $write("Unused positions: ");
         foreach ( index_used[idx] )
           if ( index_used[idx].len() == 0 )
             $write("%s%0d", n_unused++ ? ", " : "", idx);
         $write("%s.\n",n_unused ? "" : "none -- all used");
         if ( n_reused )
           $fatal(2, "\nFound %0d re-used indices. Aborting simulation.\n\n",
                  n_reused);
      end

      @( posedge clk_reactive );

      for ( int i=0; i<num_tests; i++ ) begin
         automatic int cyc_start = cycle;
         automatic int awaiting = pi.num();

         // Set multiplier and multiplicand values for non-piped units.
         //
         plier = tests.size() ? tests.pop_front() : $random();
         cand = tests.size() ? tests.pop_front() : $random();

         // Set multiplier and multiplicand values for piped units.
         //
         plierp = plier;
         candp = cand;
         use_others = 1;

         foreach ( pi[muti] ) begin
            automatic string mut = muti; // Bug workaround?
            automatic Info p = pi[mut];
            fork begin
               automatic int steps = ( w + pi[mut].deg - 1 ) / pi[mut].deg;
               automatic int latency
                 = !pi[mut].seq ? 1 :
                   pi[mut].wf ? 5 * steps :
                     !pi[mut].pipe ? 2 * steps : steps;
               automatic int eta = 1 + cyc_start + latency;
               pi[mut].latency = latency;
               wait ( cycle_reactive == eta );
               awaiting--;
               pi[mut].sout = prod[pi[mut].idx];
               pi[mut].cyc_tot += cycle - cyc_start;
            end join_none;
         end
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] )
           if ( prod[0] !== pi[mut].sout ) begin
              pi[mut].err_count++;
              if ( pi[mut].err_count < 5 )
                $write
                  ("%-25s wrong result: %0d * %0d:  0x%0h != 0x%0h (correct)\n",
                   mut, plier, cand, pi[mut].sout, prod[0]);
           end

         @( posedge clk_reactive );

      end

      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on