```/// EE 4755 - Digital Design Using HDLs
//
//  Classroom demo code.

////////////////////////////////////////////////////////////////////////////////
/// Binary Multiplication Algorithm

/// Long Hand Procedure Review
//
//  Multiply 5 times 12 in binary:
//
//     0101  cand  -- Multiplicand
//     1100  plier -- Multiplier
//     """"
//     0000  Partial Product
//    0000
//   0101
//  0101
//  """""""
//  0111100  prod  -- Product
//
//  OED Citation
//    The multiplycand, which must alwaies stand aboue.
//    1594 T. Blundeville Exercises i. iv. f. 6

////////////////////////////////////////////////////////////////////////////////
/// Delay, Critical Path, and Latency

/// :Def: Critical Path
//        The longest path starting at a launch point and ending at
//        a capture point.
//
//  In a combinational circuit the launch points are usually the
//  module inputs and the capture points are usually the module
//  outputs.
//
//  In a sequential circuit typically:
//
//     - All register outputs are launch points.
//     - Module inputs may or may not be launch points.
//     - All register inputs are capture points.
//     - Module outputs may or may not be capture points.
//
/// :Def: Latency [of an action in a sequential circuit]
//        Product of number of cycles needed to complete the action ..
//        .. and the clock period.

`default_nettype none

//////////////////////////////////////////////////////////////////////////////
/// Behavioral Multiplier

module mult_behav_1
#(int w = 16)
(output uwire [2*w-1:0] prod, input uwire [w-1:0] cand, plier);

assign prod = cand * plier;
endmodule

//////////////////////////////////////////////////////////////////////////////
/// Combinational Multiplier BFAs

module bfa(output uwire c, s, input uwire a, b, cin);
assign {c,s} = { a & b | a & cin | b & cin, a ^ b ^ cin };
endmodule
module bha(output uwire c, s, input uwire a, cin);
assign {c,s} = { a & cin, a ^ cin };
endmodule

module mult_bfas
#( int wa = 5, wb = wa )
( output uwire [wa+wb-1:0] prod,
input uwire [wa-1:0] a,
input uwire [wb-1:0] b );

uwire [wb:0][wa-1:0] s;
assign s[wb] = wa'(0);
assign prod[wa-1:0] = s[0];
uwire [wb:-1] c2;
assign c2[-1] = 0;

for ( genvar ib = 0;  ib < wb;  ib++ ) begin

uwire [wa-1:0] c, ab = b[ib] ? a : 0;
assign s[ib][0] = ab[0];
assign c[0] = 0;

for ( genvar ia = 1;  ia < wa;  ia++ )
bfa B( c[ia], s[ib][ia], s[ib+1][ia-1], ab[ia], c[ia-1] );

bfa my_bfa( c2[ib], prod[wa+ib], s[ib+1][wa-1], c[wa-1], c2[ib-1] );

end

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Linear Multiplier

(output uwire [w:1] s, input uwire [w:1] a,b);
assign s = a + b;
endmodule

module mult_linear
#(int w = 16)
(output logic [2*w-1:0] prod, input uwire [w-1:0] plier, cand);

uwire [2*w-1:0] b[w:0];

assign b[0] = 0;
assign prod = b[w];

for ( genvar row = 0;  row < w;  row++ ) begin

uwire [2*w-1:0] pp = plier[row] ? cand << row : 0;

carry_prop_adder #(2*w) a( b[row+1], pp, b[row] );

end

endmodule
//
/// Cost Analysis
//
//   Lightly optimized Ripple Implementation:
//    See mult_bfas for basis of cost analysis.
//    pprod = plier[i] ? cand << i : 0;
//      Notes: i is a constant, and so plier[i] and cand << i are constants.
//             Because else side is zero and only w bits can be nonzero ..
//             .. consists of just w AND gates.
//      w w = w^2
//    Adders: w 9 w = 9 w^2
//      Note: only w BFA units per pp because lower i bits are unchanged.
//    Total cost: 10 w^2.
//
/// Timing Analysis
//
//   Grid of w x w  BFAs: 8w + 2
//   Optimize using BHAs: 8w - 12

module mult_linear_clk
#( int w = 16 )
( output logic [2*w-1:0] prod,
input uwire [w-1:0] cand, plier,
input uwire clk);

uwire [2*w-1:0] p;
logic [w-1:0] candcpy, pliercpy;

mult_linear #(w) ml(p, candcpy, pliercpy);

always_ff @( posedge clk ) begin
candcpy <= cand;
pliercpy <= plier;
prod <= p;
end

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Tree Multipliers

// See 2019 Homework 3 for cost and delay analysis.
// https://www.ece.lsu.edu/v/2019/hw03_sol.pdf

module mult_tree
#( int wa = 16, int wb = wa, int wp = wa + wb )
( output uwire [wp-1:0] prod,
input uwire [wa-1:0] a,
input uwire [wb-1:0] b );

if ( wa == 1 ) begin

assign prod = a ? b : 0;

end else begin

// Split a in half and recursively instantiate a module for each half.
localparam int wn = wa / 2;
localparam int wx = wb + wn;

uwire [wx-1:0] prod_lo, prod_hi;

mult_tree #(wn,wb) mlo( prod_lo, a[wn-1:0],  b );
mult_tree #(wn,wb) mhi( prod_hi, a[wa-1:wn], b );

// Combine the partial products.
assign prod = prod_lo + ( prod_hi << wn );

end
endmodule

function int min(int a, int b);
min = a <= b ? a : b;
endfunction

module mult_tree_bfas
#( int wa = 16, int wb = wa, int wp = wa + wb )
( output uwire [wp-1:0] prod,
input uwire [wa-1:0] a,
input uwire [wb-1:0] b );

if ( wa == 1 ) begin

assign prod = a ? b : 0;

end else begin

// Split a in half and recursively instantiate a module for each half.
localparam int wa_lo = wa / 2;
localparam int wp_lo_full = wb + wa_lo;
localparam int wa_hi = wa - wa_lo;
localparam int wp_hi_full = wb + wa_hi;

// Compute the width of the product needed by the lo and hi modules.
localparam int wp_lo = min( wp_lo_full, wp );
localparam int wp_hi = min( wp_hi_full, wp - wa_lo );

uwire [wp_lo-1:0] prod_lo;
uwire [wp_hi-1:0] prod_hi;

localparam int wb_hi = min( wb, wp_hi );

mult_tree_bfas #(wa_lo,wb,   wp_lo)
mlo( prod_lo, a[wa_lo-1:0],  b );
mult_tree_bfas #(wa_hi,wb_hi,wp_hi)
mhi( prod_hi, a[wa-1:wa_lo], b[wb_hi-1:0] );

assign prod[wa_lo-1:0] = prod_lo[wa_lo-1:0];

uwire c[wp-1:wa_lo-1];
assign c[wa_lo-1] = 0;

// Use wp_lo and wp_hi in the loop bounds so that there are only
// as many BFA and BHA modules as needed.
for ( genvar i=wa_lo; i<wp_lo; i++ )
bfa b(c[i], prod[i], prod_lo[i], prod_hi[i-wa_lo], c[i-1] );
for ( genvar i=wp_lo; i<wp_hi+wa_lo; i++ )
bha b(c[i], prod[i], prod_hi[i-wa_lo], c[i-1] );

localparam int wz = wp - wp_hi - wa_lo;
if ( wz > 0 ) assign prod[wp-1 : wp_hi+wa_lo] = 0;

end
endmodule

//

`ifdef XXX
Module Name                             Area  Period  Period    Total    Init.
Target  Actual  Latency   Interv

mult_bfas_wa8                          54394  200000    6116     6116     6116
mult_bfas_wa16                        225365  200000   13812    13812    13812
mult_bfas_wa32                        920712  200000   29204    29204    29204
mult_bfas_wa64                       3616401  200000   60949    60949    60949

mult_tori_bfas_wa8                     64685  200000    5786     5786     5786
mult_tori_bfas_wa16                   279167  200000   11750    11750    11750
mult_tori_bfas_wa32                  1148780  200000   23256    23256    23256
mult_tori_bfas_wa64                  3699504  200000   50712    50712    50712

mult_tree_wa8                          53168  200000    6131     6131     6131
mult_tree_wa16                        224139  200000   13046    13046    13046
mult_tree_wa32                        919486  200000   26875    26875    26875
mult_tree_wa64                       3615079  200000   55494    55494    55494

mult_behav_1_w8                        53168  200000    6062     6062     6062
mult_behav_1_w16                      215672  200000   13551    13551    13551
mult_behav_1_w32                      764479  200000   26485    26485    26485
mult_behav_1_w64                     2891012  200000   52637    52637    52637

mult_bfas_wa8                         145104       1    2878     2878     2878
mult_bfas_wa16                        510180       1    6319     6319     6319
mult_bfas_wa32                       1968611       1   12733    12733    12733
mult_bfas_wa64                       7601789       1   25830    25830    25830

mult_tori_bfas_wa8                    148649       1    2443     2443     2443
mult_tori_bfas_wa16                   503217       1    4908     4908     4908
mult_tori_bfas_wa32                  2502944       1    9204     9204     9204
mult_tori_bfas_wa64                  8862465       1   14525    14525    14525

mult_tree_wa8                         130322       1    2184     2184     2184
mult_tree_wa16                        534195       1    3129     3129     3129
mult_tree_wa32                       1951056       1    4166     4166     4166
mult_tree_wa64                       7283978       1    5320     5320     5320

mult_behav_1_w8                       131694       1    2182     2182     2182
mult_behav_1_w16                      501798       1    3106     3106     3106
mult_behav_1_w32                     1582738       1    4310     4310     4310
mult_behav_1_w64                     5488252       1    5272     5272     5272
`endif

//////////////////////////////////////////////////////////////////////////////
/// Simple Sequential Multiplier

module mult_seq #( int w = 16 )
( output logic [2*w-1:0] prod,
input uwire [w-1:0] cand, plier,
input uwire clk);

localparam int wlog = \$clog2(w);

initial if ( w != 1 << wlog ) \$fatal(1,"Size must be a power of 2.");

bit [wlog-1:0] pos;
logic [2*w-1:0] accum;

always_ff @( posedge clk ) begin

if ( pos == 0 ) begin
prod = accum;
accum = 0;
end
if ( plier[pos] ) accum += cand << pos;
pos++;
end

endmodule

//

//
/// Cost Analysis  :  Delay Analysis
//
// Regs: prod, accum: 2 7 2 w = 28 w
// Regs: pos: 7 lg w

// pos == 0 : lg w   :  lg lg w
// if ( ) accum = 0; : 2w  : 1
// if ( ) prod = accum: 2 3 w  : 2
// plier[pos]: 3(w - 1) ≅ 3 w  : 2 lg w
// cand << pos: 3 2 w lg w = 6 w lg w  : 2 lg w
// accum += 9 2 w = 18 w  : 4w
// pos++: 9 lg w  : 4 + 2 lg w
//
// Total cost: 28w + 7lg w + lgw + 2w + 6w + 3 w + 6 w lgw + 18 w + 9 lgw
// =           57 w + 17 lg w + 6 w lg w
//
// Paths
//  pos == 0 -> accum=0 -> : lg lg w + 1
//  cand << pos : 2 lg w
// Critical Path
//  cand << pos -> accum += -> accum = : 2 lg w + 4w + 2
// Register Delay: 6
//
// Latency
//  w ( 4w + 2lg w + 2 + 6 )  =  4w^2 + 2 w lg w + 8w

//////////////////////////////////////////////////////////////////////////////
/// Sequential Multiplier, Using Instantiated Adder
//
//  Simple multiplier, no handshaking.
//
//  The cost of mult_seq_ga is about the same as mult_seq.

module mult_seq_ga #( int w = 16 )
( output logic [2*w-1:0] prod,
input uwire [w-1:0] cand, plier,
input uwire clk );

localparam int wlog = \$clog2(w);

bit [wlog-1:0] pos;
bit [2*w-1:0] accum;
uwire [2*w-1:0] sum;
uwire [2*w-1:0] pp = plier[pos] ? cand << pos : 0;

carry_prop_adder #(2*w) ga( sum, accum, pp );

always_ff @( posedge clk ) pos <= pos + 1;

always_ff @( posedge clk )
if ( pos == 0 ) begin
prod = sum;
accum = 0;
end else begin
accum = sum;
end

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Streamlined Sequential Multiplier

/// Techniques For Lowering Cost
//
//   Instead of shifting the multiplier, shift the accumulator.
//   Use part of the accumulator to store the multiplicand.

module mult_seq_stream #( int w = 16 )
( output logic [2*w-1:0] prod,
input uwire [w-1:0] cand, plier,
input uwire clk);

localparam int wlog = \$clog2(w);

bit [wlog-1:0] pos;
logic [2*w-1:0] accum;

always_ff @( posedge clk ) begin

logic [w:0] pp;

if ( pos == 0 ) begin

prod = accum;
accum = plier;
pos = w - 1;

end else begin

pos--;

end

// Note: the multiplicand is in the lower bits of the accumulator.
//
pp = accum[0] ? { 1'b0, cand } : 0;

// Add on the partial product and shift the accumulator.
//
accum = { { 1'b0, accum[2*w-1:w] } + pp, accum[w-1:1] };

end

endmodule

//        0101  cand  -- Multiplicand
//        1100  plier -- Multiplier
//        """"
//        0000  <- Shift multiplicand by 0
//  =  0000000
//       0000   <- Shift multiplicand by 1
//  =  0000000
//      0101    <- Shift multiplicand by 2
//  =  0010100
//     0101     <- Shift multiplicand by 3
//  =  0111100
//     """""""
//     0111100  prod  -- Product

//        0101  cand  -- Multiplicand
//        1100  Plier -- Multiplier
//        """"
//        0000
//  =     0000oooo
//  =     00000ooo  <- Shift right by 1
//        0000
//  =     00000ooo
//  =     000000oo
//        0101
//  =     010100oo
//  =     0010100o
//        0101
//  =     0111100o
//  =     00111100
//     """""""
//         0111100  prod  -- Product

//

/// Cost Analysis  :  Timing Analysis
//
// Regs: prod, accum:  2 2 7 w = 28 w
// Regs: pos: 7 lg w

// pos == 0 : lg w   :  lg lg w
// if ( ) prod = accum: 2 3 w  : 2
// if ( ) accum = plier; : w + 3w = 4w  : 2
// if ( ) pos = w-1 / pos-- : 3 lg w  : 2
// pos--: 9 lg w  : 4 + 2 lg w
//

// pp = accum[0] ? { 1'b0, cand } : 0;   : w  : 1
// accum = { 1'b0, accum[2*w-1:w] } + pp  :  9w  : 2 + 2w

//
// Total cost: 28w + 7lg w + lg w + 6w + 4w + 3 lg w + 9 lg w + w + 9 w
// =           48 w + 20 lg w
//
// Critical Path
//  pos == 0 -> accum=plier ->  accum[0] ? : 0 -> + pp :
//   lg lg w + 2 + 1 + 2 + 2w = 2w + lg lg w + 5 ≅ 2w + 5
// Register Delay: 6
//
// Latency
//  w ( 6 + 2w + 5 ) = 2w^2 + 11w

// Latency, order 2
//  w/2 ( 6 + 2w + 5 + 4 ) = w^2 + 7.5w

// Latency, order d
//  w/d ( 6 + 2w + 5 + 4(d-1) ) = 2/d w^2 + 15w/d

/// Synthesis Data
//
`ifdef XXX
Module Name                             Area  Period  Period    Total    Init.
Target  Actual  Latency   Interv
mult_behav_1_w8                        53168    1000    6062     6062     6062
mult_behav_1_w16                      215672    1000   13551    13551    13551
mult_behav_1_w32                      764479    1000   26485    26485    26485
mult_behav_1_w64                     2891332    1000   52332    52332    52332

mult_seq_w8                            56081    1000    8120    64960    64960
mult_seq_w16                          122475    1000   15916   254656   254656
mult_seq_w32                          258750    1000   31385  1004320  1004320
mult_seq_w64                          544285    1000   58476  3742464  3742464

mult_seq_stream_w8                     44320    1000    4518    36144    36144
mult_seq_stream_w16                    78395    1000    8868   141888   141888
mult_seq_stream_w32                   153863    1000   16361   523552   523552
mult_seq_stream_w64                   304047    1000   30276  1937664  1937664
endmodule
`endif
//
// As expected, cost grows linearly with w, as does the clock period.
// Cost of streamlined much lower then mult_seq, as is the clock period.
// But, latency is still O(w^2) (it quadruples when w is doubled).

//////////////////////////////////////////////////////////////////////////////
/// Degree-m Sequential Multipliers

// Compute m partial products in each iteration.
//
// Will the synthesis program figure it out?

module mult_seq_m #( int w = 16, int m = 2 )
( output logic [2*w-1:0] prod,
input uwire [w-1:0] cand, plier,
input uwire clk);

localparam int iterations = ( w + m - 1 ) / m;
localparam int iter_lg = \$clog2(iterations);

bit [iter_lg:1] iter;
logic [2*w-1:0] accum;

always_ff @( posedge clk ) begin

if ( iter == iter_lg'(iterations) ) begin

prod = accum;
accum = 0;
iter = 0;

end

for ( int i=0; i<m; i++ )
begin
int pos;
pos = iter * m + i;
if ( plier[pos] ) accum += cand << pos;
end

iter++;

end

endmodule

module mult_seq_dm
#( int w = 16,
int m = 2 )
( output logic [2*w-1:0] prod,
input uwire [w-1:0] cand, plier,
input uwire clk);

localparam int iterations = ( w + m - 1 ) / m;
localparam int iter_lg = \$clog2(iterations);

uwire [iterations-1:0][m-1:0] plier_2d = plier;

bit [iter_lg:1] iter;
logic [2*w-1:0] accum;

always_ff @( posedge clk ) begin

if ( iter == iter_lg'(iterations) ) begin

prod = accum;
accum = 0;
iter = 0;

end

accum += cand * plier_2d[iter] << ( iter * m );

iter++;

end

endmodule

/// Cost Analysis  :  Timing Analysis
//
// Regs: accum, prod: 2 2 7 w = 28 w : 6
// Regs: iter: 7 lg w/m : 6
//
// iter == iter_lg'(iterations): 3 lg w/m + lg w/m - 1 ≅ 4 lg w/m  : 2 + lg w/m
// prod = accum: 3 2 w = 6 w  : 2
// accum = 0: 2w  : 1
// iter = 0: lg w/m  : 1
//
// plier_2d[iter]: 3 (lg w/m-1) \approx 3 lg w/m  : 2 lg w/m
// * : m 9 w  : 6m + 2w
// << iter*m : 3 w lg w/m  : 2 lg w/m
// accum += : 9 2 w = 18 w  : 4w
// iter++: 9 lg w/m  : 2 + 2 lg w/m
//
// Total Cost: 28w + 7 lg[w/m] + 4 lg[w/m] + 6w + 2w + lg[w/m]
//    + 3lg[w/m] + 9m w + 3w lg[w/m] + 18w + 9 lg[w/m]
//  = 54 w + 9 m w + 24 lg[w/m] + 3 w lg[w/m]
//
// Critical Path -- Not overlapping cand * plier with accum +
// iter == iterations -> accum=0;iter=0; -> piler_2d[iter] -> * -> << -> +
//  (2 + lg[w/m]) + 1 + 2 lg[ w/m] + ( 6m + 2w ) + 2 lg[w/m] + 4w
//  = 6 w + 6 m + 5 lg[w/m] + 3
//
// Latency
//  w/m ( 6 w + 6 m + 5 lg[w/m] + 3 + 6 )
//  = 6 w^2/m + 9 w/m + 6w + 5w/m lg[w/m]

//////////////////////////////////////////////////////////////////////////////
/// Workfront Sequential Multipliers
//
/// Goal: Avoid O(w) Clock Period Imposed by Ripple Adder
//
//   Idea:
//     Consider the grid of binary full adders (BFAs) that might be
//     synthesized for module mult_linear.
//
//     Let B[i][j] denote the BFA handling ..
//     .. bit j of ..
//     .. the partial product for multiplicand bit i.
//
//     Call the input ports:       B[i][j].a,   B[i][j].b,  B[i][j].ci
//     .. and the output ports:    B[i][j].sum, B[i][j].co
//
//     We know that the carry output B[i][j].co ..
//     .. connects to B[i][j+1].ci.
//
//     We also know that output B[i][j].sum ..
//     .. connects to B[i+1][j].a.
//
//     So, if B[i][j], B[i+1][j-1], and B[i-1][j+1] ..
//     .. are all computed at, say, cycle x, ..
//     .. then B[i+1][j] and B[i][j+1] can be computed in cycle x+1.
//
//     Input B[i][j].b = cand[i] && plier[i+j]
//        does not depend on any BFA ..
//        .. and so does not restrict at which cycle B[i][j] can execute.
//
//     A possible plan is, in cycle c, to execute ..
//       { B[i][j] : i + j = c,  i in [0,w-1],  j in [0,2w-1] }
//     For example,
//       Cycle 0: B[0][0]
//       Cycle 1: B[0][1], B[1][0]
//       Cycle 2: B[0][2], B[1][1], B[2][0]
//       Cycle 3: B[0][3], B[1][2], B[2][1], B[3][0]
//
//     Note that B[0][0], B[0][1], etc. would all be the same BFA.
//
//     The arrangement above has the disadvantage of using just one
//     BFA in cycle 0, one in cycle 1, etc. The set of active BFAs is
//     called the workfront.  The workfront modules described here
//     connect the BFAs differently, keeping the size of the workfront at
//     w throughout the multiplication. That's achieved by setting:
//
//       B[i][j].a  = B[i-1][j-1].sum
//       B[i][j].ci = B[i][j-1].sum
//       B[i][j].b  = plier[j] && cand[w-1-i] for j <= w;
//
//     Here are the active BFAs with this scheme:
//       Cycle 0: B[0][0], B[1][0], B[2][0], ... B[w-1][0]
//       Cycle 1: B[0][1], B[1][1], B[2][1], ... B[w-1][1]
//
//     Also, note that with this scheme ..
//     .. prod[cyc] = B[w-1][cyc].sum
//
//    Module mult_seq_wfront implements this scheme. Note that only w
//    BFAs are needed, and that multiplication takes 2w cycles.
//
module mult_seq_wfront
#( int w = 16 )
( output logic [2*w-1:0] prod,
input uwire [w-1:0] plier, cand,
input uwire clk );

localparam int wlog = \$clog2(2*w);
if ( 2**wlog != 2*w )
\$fatal(2,"Size, parameter w=%0d, must be a power of 2.\n",w);

bit [wlog-1:0] pos;

always_ff @( posedge clk ) pos <= pos + 1;

logic [w-1:0] sum, carry;
logic [1:0] sc;

always_ff @( posedge clk ) begin

for ( int i=0; i<w; i++ ) begin
logic a, b, c;
a = pos < w && cand[w-1-i] ? plier[pos] : 0;
b = pos && i               ? sum[i-1]   : 0;
c = pos                    ? carry[i]   : 0;
sc = a + b + c;
sum[i] <= sc[0];
carry[i] <= sc[1];
end

prod[pos] = sc[0];

end

endmodule

/// Optimized Workfront Multiplier
//
module mult_seq_wfront_opt
#( int w = 16 )
( output logic [2*w-1:0] prod,
input uwire [w-1:0] plier, cand,
input uwire clk );

localparam int wlog = \$clog2(w);
if ( 2**wlog != w )
\$fatal(2,"Size, parameter w=%0d, must be a power of 2.\n",w);

bit pos_eq_0;
bit [wlog:0] pos;

always_ff @( posedge clk ) { pos_eq_0, pos } <= pos + 1;

uwire pos_lt_w = !pos[wlog];    // Efficiently compute  pos < w.

logic [w-1:0] sum, carry;
logic [1:0] sc;

uwire [w-1:0] plier_rot = {plier[0],plier[w-1:1]};
logic plier_pos;
always_ff @( posedge clk ) plier_pos <= plier_rot[wlog'(pos)];

always_ff @( posedge clk ) begin

for ( int i=0; i<w; i++ ) begin
logic a, b, c;
a = !pos_eq_0              && carry[i];
b = !pos_eq_0 && i         && sum[i-1];
c =  pos_lt_w && plier_pos && cand[w-1-i];
sc = a + b + c;
sum[i]   <= sc[0];
carry[i] <= sc[1];
end

prod[pos] = sc[0];

end

endmodule

// :

/// Cost Analysis  :  Timing Analysis
//
// Registers:
//  prod: 2w * 7 = 14w.
//  sum,carry: 2 * w * 7 = 14w
//  pos, pos_eq_0:  7 ( 2 + lg w ) \approx  7 lg w
//  plier_pos: 7
//
// always_ff @( posedge clk ) { pos_eq_0, pos } <= pos + 1;
//  Note: One input is a constant, so use BHA's.
//  Ripple adder: 3 lg 2w : 1 + lg w.
//
// always_ff @( posedge clk ) plier_pos <= plier_rot[wlog'(pos)];
//   Note: {plier[0],plier[w-1:1]} is free, it's just renaming bits.
//   Same hardware used by all i iterations: a w-bit mux to extract plier bit.
//   mux:  3(w-1) : 2 lg w
//
//
// a = !pos_eq_0              && carry[i];
// b = !pos_eq_0 && i         && sum[i-1];
// c =  pos_lt_w && plier_pos && cand[w-1-i];
//   Notes:
//     i and w are constants, so carry[i], etc, ..
//       .. just refer to a bit, no mux needed.
//     pos_lt_w && plier_pos same for all i, so computed once.
//     Delay is for c.
// Once: 1
// Per iter: 3
// All iters: 1 + 3w : 2
//
// sc = a + b + c;
//   Note: Just a binary full adder.
//   Per iteration:  9  : 4
//   All iters:      9w : 4
//
// prod[pos] = sc[0]
//  Decoder for bit-addressable register: lg 2w bits to 2w bits
//   4w - 4 : 1 + lg w
//  Muxen to select sc[0] or prior bit:
//   2w * 3 = 6w : 2
//  Total: 10w - 4 : 2 + lg 2w = 3 + lg w
//
// Total Cost:
//  14w+14w+7lg[w]+7 + 3 (1+lg[w]) + 3(w-1) + 1+3w + 9w + 10w-4
//  = 53 w + 10 lg w + 4
//
// Paths
//  plier mux: 2 lg w
//  pos+1:     1+ lg w
//  pos_lt_w && cand[w-1-i] && plier_pos -> BFA(cin) ->
//   : 1 + 1 + 2 = 4
//  !pos_eq_0 && carry[i] -> BFA -| prod[pos] internal mux
//   : 1 + 4 = 5
//  prod[pos] decode network -| prod[pos] internal mux
//   : 1 + lg w
//
// Critical Path (the winner is, for w > 4).
//   plier mux: 2 lg w
//
// Clock Period: (Register delay: 6)
//   6 + 2 lg w
// Latency: Based on 2w cycles.
//   6w + 2w lg w
//
/// Comparison with Synthesis
`ifdef xxx
Module Name                             Area  Period  Period    Total    Init.
Target  Actual  Latency   Interv
mult_seq_wfront_w8                     45390    1000    3132    50112    50112
mult_seq_wfront_w16                    89668    1000    3260   104320   104320
mult_seq_wfront_w32                   178367    1000    4202   268928   268928
mult_seq_wfront_w64                   345415    1000    4716   603648   603648

mult_seq_wfront_opt_w8                 47575    1000    2428    38848    38848
mult_seq_wfront_opt_w16                94652    1000    2275    72800    72800
mult_seq_wfront_opt_w32               177706    1000    2546   162944   162944
mult_seq_wfront_opt_w64               345301    1000    2724   348672   348672

mult_seq_stream_w8                     44320    1000    4518    36144    36144
mult_seq_stream_w16                    78395    1000    8868   141888   141888
mult_seq_stream_w32                   153863    1000   16361   523552   523552
mult_seq_stream_w64                   304047    1000   30276  1937664  1937664

`endif
//
// Optimizations reduce period without significant cost impact.
//
// Cost growth with w agrees with analysis, but period does not grow
// as fast as modeled.
//
// Workfront multiplier much faster than streamlined for w>8

/// Degree-m Workfront Multiplier
//
module mult_seq_wfront_m #( int w = 16, int m = 2 )
( output logic [2*w-1:0] prod,
input uwire [w-1:0] plier, cand,
input uwire clk );

localparam int iterations = ( 2*w + m - 1 ) / m;
localparam int iter_lg = \$clog2(iterations);

localparam int wlog = \$clog2(m * iterations);

bit [iter_lg-1:0] iter;

always_ff @( posedge clk ) iter <= iter + 1;

logic [w-1:-1] sum, carry;

always_ff @( posedge clk ) begin

logic [w-1:-1] j_sum[m+1], j_carry[m+1];
logic [1:0] sc;

j_sum[0] = iter ? sum : 0;
j_carry[0] = iter ? carry : 0;

for ( int j=0; j<m; j++ ) begin

logic [wlog-1:0] pos;
pos = m*iter + j;

j_sum[j+1][-1] = 0;

for ( int i=0; i<w; i++ ) begin
logic a, b, c;
a = j_carry[j][i];
b = j_sum[j][i-1];
c = pos < w && cand[w-1-i] && plier[pos];
sc = a + b + c;
{ j_carry[j+1][i], j_sum[j+1][i] } = sc;
end
if ( pos < 2*w ) prod[pos] = sc[0];

end

sum <= j_sum[m];
carry <= j_carry[m];

end

endmodule

/// Synthesis Data
//
`ifdef DONT_DEFINE_ME
Module Name                             Area  Period  Period    Total    Init.
Target  Actual  Latency   Interv
mult_seq_wfront_w16                    89668    1000    3260   104320   104320
mult_seq_wfront_opt_w16                94652    1000    2275    72800    72800

mult_seq_wfront_m_w16_m1               97631    1000    3500   112000   112000
mult_seq_wfront_m_w16_m2              104787    1000    3658    58528    58528
mult_seq_wfront_m_w16_m4              130378    1000    3942    31536    31536
mult_seq_wfront_m_w16_m8              171258    1000    6417    25668    25668

mult_seq_dm_w16_m1                    121611    1000   16360   261760   261760
mult_seq_dm_w16_m2                    131048    1000   16730   133840   133840
mult_seq_dm_w16_m4                    145285    1000   16704    66816    66816
mult_seq_dm_w16_m8                    196118    1000   15161    30322    30322

mult_seq_wfront_w32                   178367    1000    4202   268928   268928
mult_seq_wfront_opt_w32               177706    1000    2546   162944   162944

mult_seq_wfront_m_w32_m1              191334    1000    3766   241024   241024
mult_seq_wfront_m_w32_m2              205303    1000    3857   123424   123424
mult_seq_wfront_m_w32_m4              260182    1000    5266    84256    84256
mult_seq_wfront_m_w32_m8              351910    1000    7031    56248    56248

mult_seq_dm_w32_m1                    246818    1000   31113   995616   995616
mult_seq_dm_w32_m2                    279486    1000   30994   495904   495904
mult_seq_dm_w32_m4                    314724    1000   32127   257016   257016
mult_seq_dm_w32_m8                    408659    1000   31251   125004   125004

`endif

`ifdef DONT_DEFINE_ME

// Data collected  9 November 2018,  8:53:10 CST

Module Name                             Area  Period  Period    Total    Init.
Target  Actual  Latency   Interv
mult_behav_1_w8                        53168    1000    6062     6062     6062
mult_behav_1_w16                      215672    1000   13551    13551    13551
mult_behav_1_w32                      764479    1000   26485    26485    26485
mult_behav_1_w64                     2891332    1000   52332    52332    52332

mult_seq_stream_w8                     44320    1000    4518    36144    36144
mult_seq_stream_w16                    78395    1000    8868   141888   141888
mult_seq_stream_w32                   153863    1000   16361   523552   523552
mult_seq_stream_w64                   304047    1000   30276  1937664  1937664

mult_seq_wfront_w8                     45390    1000    3132    50112    50112
mult_seq_wfront_w16                    89668    1000    3260   104320   104320
mult_seq_wfront_w32                   178367    1000    4202   268928   268928
mult_seq_wfront_w64                   345415    1000    4716   603648   603648

mult_seq_wfront_opt_w8                 47575    1000    2428    38848    38848
mult_seq_wfront_opt_w16                94652    1000    2275    72800    72800
mult_seq_wfront_opt_w32               177706    1000    2546   162944   162944
mult_seq_wfront_opt_w64               345301    1000    2724   348672   348672

mult_pipe_wfront_w8                   239827    1000     993     7944     7944
mult_pipe_wfront_w16                 1012675    1000    1173    18768    18768
mult_pipe_wfront_w32                 4158007    1000    1512    48384    48384
mult_pipe_wfront_w64                16865186    1000    2256   144384   144384

mult_seq_csa_w8                        80488    1000    9266    74128    74128
mult_seq_csa_w16                      162743    1000   16580   265280   265280
mult_seq_csa_w32                      343497    1000   31074   994368   994368
mult_seq_csa_w64                      715059    1000   60431  3867584  3867584

mult_pipe1_w8                         137822    1000    3859    30872     3859
mult_pipe1_w16                        571541    1000    7499   119984     7499
mult_pipe1_w32                       2325284    1000   14746   471872    14746
mult_pipe1_w64                       9397076    1000   28722  1838208    28722

mult_seq_dm_w16_m1                    121611    1000   16360   261760   261760
mult_seq_dm_w16_m2                    131048    1000   16730   133840   133840
mult_seq_dm_w16_m4                    145285    1000   16704    66816    66816
mult_seq_dm_w16_m8                    196118    1000   15161    30322    30322

mult_seq_dm_w32_m1                    246818    1000   31113   995616   995616
mult_seq_dm_w32_m2                    279486    1000   30994   495904   495904
mult_seq_dm_w32_m4                    314724    1000   32127   257016   257016
mult_seq_dm_w32_m8                    408659    1000   31251   125004   125004

mult_seq_wfront_m_w16_m1               97631    1000    3500   112000   112000
mult_seq_wfront_m_w16_m2              104787    1000    3658    58528    58528
mult_seq_wfront_m_w16_m4              130378    1000    3942    31536    31536
mult_seq_wfront_m_w16_m8              171258    1000    6417    25668    25668

mult_seq_wfront_m_w32_m1              191334    1000    3766   241024   241024
mult_seq_wfront_m_w32_m2              205303    1000    3857   123424   123424
mult_seq_wfront_m_w32_m4              260182    1000    5266    84256    84256
mult_seq_wfront_m_w32_m8              351910    1000    7031    56248    56248

`endif

//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

program reactivate
(output uwire clk_reactive, output int cycle_reactive,
input uwire clk, input var int cycle);
assign clk_reactive = clk;
assign cycle_reactive = cycle;
endprogram

module testbench;

localparam int w = 16;
localparam int num_tests = 1000;
localparam int NUM_MULT = 20;
localparam int err_limit = 7;

bit use_others;
logic [w-1:0] plier, cand;
logic [w-1:0] plierp, candp;
logic [2*w-1:0] prod[NUM_MULT];

typedef struct { int idx; int err_count = 0;
bit seq = 0; bit pipe = 0; bit wf = 0; int deg = 1;
logic [2*w-1:0] sout = 'h111; int cyc_tot = 0;
int latency = 0;
} Info;
Info pi[string];

localparam int cycle_limit = num_tests * w * 8;
int cycle;
bit done;
logic clock;

logic      clk_reactive;
int cycle_reactive;
reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

initial begin
clock = 0;
cycle = 0;

fork
forever #10 cycle += clock++;
wait( done );
wait( cycle >= cycle_limit )
\$write("*** Cycle limit exceeded, ending.\n");
join_any;

\$finish();
end

initial begin

while ( !done ) @( posedge clk_reactive ) #1

if ( use_others ) begin

plierp = plier;
candp = cand;
use_others = 0;

end else begin

plierp = cycle;
candp = 256;

end
end

task pi_seq(input int idx, input string name, input int deg);
automatic string m = \$sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1;

task pi_seqw(input int idx, input string name, input int deg);
automatic string m = \$sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1; pi[m].wf = 1;

task pi_pipe(input int idx, input string name, input int deg);
automatic string m = \$sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1;

mult_behav_1 #(w) mb1(prod[0], plier, cand);
initial pi["Behavioral"].idx = 0;

mult_bfas #(w) mb17(prod[17], plier, cand);
initial pi["Flat BFAs"].idx = 17;

mult_linear  #(w) ms1(prod[1], plier, cand);
initial pi["Linear"].idx = 1;

mult_tree    #(w) ms2(prod[2], plier, cand);
initial pi["Tree"].idx = 2;

mult_tree_bfas #(w) ms18(prod[18], plier, cand);
initial pi["Tree BFAs"].idx = 18;

mult_seq #(w) ms3(prod[3], plier, cand, clock);
initial begin
automatic string m = "Sequential";
pi[m].idx = 3; pi[m].seq = 1;
end

mult_seq_ga  #(w) msga1(prod[11], plier, cand, clock);
initial begin
automatic string m = "Sequential GA";
pi[m].idx = 11; pi[m].seq = 1;
end

mult_seq_stream #(w) mss1(prod[4], plier, cand, clock);
initial begin
automatic string m = "Sequential Streamlined";
pi[m].idx = 4; pi[m].seq = 1;
end

mult_seq_wfront   #(w) ms7(prod[7], plier, cand, clock);
initial pi_seqw(7,"WFront", 1);
mult_seq_wfront_opt   #(w) ms8(prod[8], plier, cand, clock);
initial pi_seqw(8,"WFront-Opt", 1);

mult_seq_wfront_m   #(w,2) ms12(prod[12], plier, cand, clock);
initial pi_seqw(12,"WFront", ms12.m);

mult_seq_wfront_m   #(w,5) ms13(prod[13], plier, cand, clock);
initial pi_seqw(13,"WFront", ms13.m);

mult_seq_m   #(w,4) ms44(prod[5], plier, cand, clock);
initial pi_seq(5,"Seq", ms44.m);

mult_seq_m   #(w,3) ms43(prod[6], plier, cand, clock);
initial pi_seq(6,"Seq", ms43.m);

mult_seq_dm  #(w,1) msd14(prod[14], plier, cand, clock);
mult_seq_dm  #(w,2) msd16(prod[16], plier, cand, clock);

mult_seq_dm  #(w,4) msd44(prod[9], plier, cand, clock);

mult_seq_dm  #(w,3) msd43(prod[10], plier, cand, clock);

mult_linear_clk  #(w) mlc1(prod[15], plier, cand, clock);
initial begin
automatic string m = \$sformatf("Linear Clock");
pi[m].idx = 15; pi[m].seq = 1;
end

// Array of multiplier/multiplicand values to try out.
// After these values are used a random number generator will be used.
//
int tests[\$] = {1,32,  1,1, 1,2,  1,32,  32, 1};

initial begin

done = 0;
use_others = 0;

// Make sure that no two multipliers are assigned the same index ..
// .. and show unused indices.
#0 begin
string index_used[NUM_MULT];
automatic int n_unused = 0, n_reused = 0;
foreach ( pi[ mut ] ) begin
automatic int idx = pi[mut].idx;
if ( index_used[idx].len() )
begin
\$write("*** Index %0d used by %s and %s.\n",
idx, index_used[idx], mut );
n_reused++;
end
index_used[idx] = mut;
end
\$write("Unused positions: ");
foreach ( index_used[idx] )
if ( index_used[idx].len() == 0 )
\$write("%s%0d", n_unused++ ? ", " : "", idx);
\$write("%s.\n",n_unused ? "" : "none -- all used");
if ( n_reused )
\$fatal(2, "\nFound %0d re-used indices. Aborting simulation.\n\n",
n_reused);
end

@( posedge clk_reactive );

for ( int i=0; i<num_tests; i++ ) begin
automatic int cyc_start = cycle;
automatic int awaiting = pi.num();

// Set multiplier and multiplicand values for non-piped units.
//
plier = tests.size() ? tests.pop_front() : \$random();
cand = tests.size() ? tests.pop_front() : \$random();

// Set multiplier and multiplicand values for piped units.
//
plierp = plier;
candp = cand;
use_others = 1;

foreach ( pi[muti] ) begin
automatic string mut = muti; // Bug workaround?
automatic Info p = pi[mut];
fork begin
automatic int steps = ( w + pi[mut].deg - 1 ) / pi[mut].deg;
automatic int latency
= !pi[mut].seq ? 1 :
pi[mut].wf ? 5 * steps :
!pi[mut].pipe ? 2 * steps : steps;
automatic int eta = 1 + cyc_start + latency;
pi[mut].latency = latency;
wait ( cycle_reactive == eta );
awaiting--;
pi[mut].sout = prod[pi[mut].idx];
pi[mut].cyc_tot += cycle - cyc_start;
end join_none;
end
wait ( awaiting == 0 );

// Check the output of each Module Under Test.
//
foreach ( pi[ mut ] )
if ( prod[0] !== pi[mut].sout ) begin
pi[mut].err_count++;
if ( pi[mut].err_count < 5 )
\$write
("%-25s wrong result: %0d * %0d:  0x%0h != 0x%0h (correct)\n",
mut, plier, cand, pi[mut].sout, prod[0]);
end

@( posedge clk_reactive );

end

foreach ( pi[ mut ] )
\$write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
num_tests, mut, pi[mut].err_count,
pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);

done = 1;

\$finish(2);

end

endmodule