/// EE 4755 - Digital Design Using HDLs
//
//  Classroom demo code.

////////////////////////////////////////////////////////////////////////////////
/// Binary Multiplication Algorithm

/// Long Hand Procedure Review
//
//  Multiply 5 times 12 in binary:
//
//     0101  cand  -- Multiplicand
//     1100  plier -- Multiplier
//     """"
//     0000  Partial Product
//    0000
//   0101
//  0101
//  """""""
//  0111100  prod  -- Product

//////////////////////////////////////////////////////////////////////////////
/// Behavioral Multiplier

// Behavioral multiplier: the product is described with the * operator.
//
//   w      Width of each operand, in bits.
//   prod   2w-bit product of plier and cand.
//   plier  Multiplier.
//   cand   Multiplicand.
//
module mult_behav_1
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input  logic [w-1:0]   plier,
     input  logic [w-1:0]   cand );

   // Both operands are extended to the 2w-bit width of prod before the
   // multiplication, so no product bits are lost.
   always_comb prod = plier * cand;

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Linear Multiplier

// Adder described with the + operator.
//
//   w     Width of the operands and of the sum, in bits.
//   s     Low w bits of a + b; any carry out of bit w is discarded.
//   a, b  Operands.
//
module carry_prop_adder
  #( int w = 16 )
   ( output [w:1] s,
     input  [w:1] a,
     input  [w:1] b );

   assign s = a + b;

endmodule

// Combinational multiplier built from a linear chain of w adders.
//
//   w      Width of each operand, in bits.
//   prod   2w-bit product of plier and cand.
//   plier  Multiplier; bit i selects whether partial product i is added.
//   cand   Multiplicand; shifted left by i to form partial product i.
//
module mult_linear
  #(int w = 16)
   (output logic[2*w-1:0] prod, input logic[w-1:0] plier, cand);

   // rsum[i] is the running sum of partial products 0 through i.
   // rsum[-1] is the zero that seeds the chain.
   logic [2*w-1:0] rsum [w-1:-1];

   assign rsum[-1] = 0;

   for ( genvar i=0; i<w; i++ ) begin

      // Partial product i. The shift is done at the full 2w-bit width
      // because pprod sets the width of the expression.
      uwire [2*w-1:0] pprod = plier[i] ? cand << i : 0;

      // Add partial product i to the running sum of the previous stage.
      // Without this adder rsum[i], and therefore prod, is never driven.
      carry_prop_adder #(2*w) adder( rsum[i], rsum[i-1], pprod );

   end

   assign    prod = rsum[w-1];

endmodule

// Linear multiplier with registered inputs and a registered output.
//
//   w      Width of each operand, in bits.
//   prod   Registered 2w-bit product.
//   plier  Multiplier, captured at each positive clock edge.
//   cand   Multiplicand, captured at each positive clock edge.
//   clk    Clock.
//
// The product of operands captured at one positive edge appears on prod
// after the following positive edge (two edges after they are applied).
//
module mult_linear_clk #(int w = 16)
   (output logic[2*w-1:0] prod, input logic[w-1:0] plier, cand, input clk);

   uwire [2*w-1:0] p;
   logic [w-1:0] pliercpy, candcpy;

   // The multiplier operates on the registered copies, so the whole
   // combinational path lies between the input and output registers.
   // (Connecting plier and cand directly would leave pliercpy and
   // candcpy unused.)
   mult_linear #(w) ml(p, pliercpy, candcpy);

   always_ff @( posedge clk ) begin
      pliercpy <= plier;
      candcpy <= cand;
      prod <= p;
   end

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Tree Multiplier

// Combinational multiplier that sums the partial products with a tree
// of adders.
//
//   w      Width of each operand, in bits.
//   prod   2w-bit product of plier and cand.
//   plier  Multiplier; bit i selects whether partial product i is added.
//   cand   Multiplicand; shifted left by i to form partial product i.
//
module mult_tree
  #(int w = 16)
   (output logic[2*w-1:0] prod, input logic[w-1:0] plier, cand);

   // Tree nodes. Elements 0 to w-1 are the leaves (partial products),
   // elements w to 2w-2 are adder outputs, element 2w-2 is the root.
   // Element 2w-1 is unused.
   logic [2*w-1:0] rsum [2*w-1:0];

   // Compute partial products.
   //
   for ( genvar i=0; i<w; i++ )
     assign rsum[i] = plier[i] ? cand << i : 0;

   // Sum the partial products.
   //
   // Adder i consumes nodes 2(i-w) and 2(i-w)+1. Both indices are always
   // less than i, and every node below the root is consumed exactly once,
   // so this works for any w >= 1. When w is a power of two these are the
   // same nodes selected by masking (i<<1) and (i<<1)+1 with 2w-1.
   //
   for ( genvar i=w; i<2*w-1; i++ ) begin : node

      localparam int lchild = 2 * ( i - w );   // Left child.
      localparam int rchild = lchild + 1;      // Right child.

      carry_prop_adder #(2*w) adder
        ( rsum[i], rsum[lchild], rsum[rchild] );

   end

   assign    prod = rsum[2*w-2];

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Simple Sequential Multiplier

// Simple sequential multiplier: examines one bit of cand per clock cycle.
//
//   w      Width of each operand, in bits.
//   prod   2w-bit product, updated each time pos returns to zero.
//   plier  Operand that is shifted and added to the accumulator.
//   cand   Operand whose bits are examined one per cycle.
//   clk    Clock.
//
// There is no handshaking: the operands must be held steady for a complete
// pass over the bit positions for prod to be their product.
//
module mult_seq #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int wlog = $clog2(w);

   logic [wlog-1:0] pos;    // Bit position examined in this cycle.
   logic [2*w-1:0] accum;   // Sum of partial products added so far.

   initial pos = 0;

   always @( posedge clk ) begin

      // Start of a pass: publish the finished product and start over.
      if ( pos == 0 ) begin
         prod = accum;
         accum = 0;
      end

      // When pos is beyond the last bit of cand (possible when w is not
      // a power of two) the select yields x, the comparison is not true,
      // and nothing is added.
      if ( cand[pos] == 1 ) accum += plier << pos;

      pos++;   // Wraps to zero after 2**wlog - 1.

   end

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Sequential Multiplier, Using Instantiated Adder
//
//  Simple multiplier, no handshaking.

// Sequential multiplier that uses a single instantiated adder.
//
//   w      Width of each operand, in bits.
//   prod   2w-bit product, updated each time pos is zero.
//   plier  Operand that is shifted to form each partial product.
//   cand   Operand whose bits are examined one per cycle.
//   clk    Clock.
//
// There is no handshaking: the operands must be held steady for a complete
// pass over the bit positions for prod to be their product.
//
module mult_seq_ga #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk );

   localparam int wlog = $clog2(w);

   logic [wlog-1:0] pos;    // Bit position examined in this cycle.
   logic [2*w-1:0] accum;   // Sum of partial products 1 through pos-1.
   uwire [2*w-1:0] sum;     // accum plus the current partial product.

   initial begin pos = 0; accum = 0; end

   // Partial product for the current bit position.
   uwire [2*w-1:0] pp = cand[pos] ? plier << pos : 0;

   carry_prop_adder #(2*w) ga( sum, accum, pp );

   always @( posedge clk ) begin

      // Return to zero after bit w-1 so that cand[pos] is never out of
      // range. For a power-of-two w this is the same as letting pos wrap.
      pos <= ( pos == w-1 ) ? '0 : pos + 1'b1;

      if ( pos == 0 ) begin

         // Partial product 0 is added last: sum now holds the complete
         // product, so publish it and clear the accumulator.
         prod <= sum;
         accum <= 0;

      end else begin

         accum <= sum;

      end

   end

endmodule

// Sequential multiplier that accumulates in carry-save form.
//
//   w      Width of each operand, in bits.
//   prod   2w-bit product, updated in the cycle in which pos is w-1.
//   plier  Operand that is shifted to form each partial product.
//   cand   Operand whose bits are examined one per cycle.
//   clk    Clock.
//
// The running total is kept as two values whose sum is the total; the two
// are added together only when the product is written. There is no
// handshaking: the operands must be held steady for a complete pass.
//
module mult_seq_csa #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int wlog = $clog2(w);

   logic [wlog-1:0] pos;    // Bit position examined in this cycle.

   logic [2*w-1:0] accum_sum_a_reg, accum_sum_b_reg;
   uwire             co;    // Unconnected.

   initial begin pos = 0; accum_sum_a_reg = 0; accum_sum_b_reg = 0;  end

   uwire [2*w-1:0] accum_sum_a, accum_sum_b;

   // Partial product for the current bit position.
   uwire [2*w-1:0] pp = cand[pos] ? plier << pos : 0;

   // Instantiate a carry save adder from the ChipWare library.
   //
   CW_csa #(2*w) csa
     ( .carry(accum_sum_a), .sum(accum_sum_b), .co(co),
       .a(accum_sum_a_reg), .b(accum_sum_b_reg), .c(pp), .ci(1'b0) );

   always @( posedge clk ) begin

      // Return to zero after bit w-1 so that cand[pos] is never out of
      // range. For a power-of-two w this is the same as letting pos wrap.
      pos <= ( pos == w-1 ) ? '0 : pos + 1'b1;

      if ( pos == w-1 ) begin

         // The adder outputs include the last partial product: combine
         // them into the product and clear the accumulator.
         prod <= accum_sum_a + accum_sum_b;
         accum_sum_a_reg <= 0;
         accum_sum_b_reg <= 0;

      end else begin

         accum_sum_a_reg <= accum_sum_a;
         accum_sum_b_reg <= accum_sum_b;

      end

   end

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Streamlined Sequential Multiplier

/// Techniques For Lowering Cost
//
//   Instead of shifting the multiplier, shift the accumulator.
//   Use part of the accumulator to store the multiplicand.

// Streamlined sequential multiplier: a w-bit add and a one-bit right shift
// of the accumulator per cycle.
//
//   w      Width of each operand, in bits.
//   prod   2w-bit product, updated each time pos is zero.
//   plier  Operand added to the upper half of the accumulator.
//   cand   Operand loaded into the lower half of the accumulator.
//   clk    Clock.
//
// There is no handshaking: plier must be held steady for the w cycles that
// follow the cycle in which cand is loaded.
//
module mult_seq_stream #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int wlog = $clog2(w);

   logic [wlog-1:0] pos;    // Add/shift steps remaining after this one.
   logic [2*w-1:0] accum;

   initial pos = 0;

   always @( posedge clk ) begin

      logic [w:0] pp;

      if ( pos == 0 ) begin

         // Publish the finished product and load the next cand. The
         // first add/shift step for the new operands is done below in
         // this same cycle.
         prod = accum;
         accum = cand;
         pos = w - 1;

      end else begin

         pos--;

      end

      // Note: the multiplicand is in the lower bits of the accumulator.
      //
      pp = accum[0] ? { 1'b0, plier } : 0;

      // Add on the partial product and shift the accumulator.
      //
      // The sum is w+1 bits wide so the carry out is kept; dropping
      // accum[0] performs the right shift.
      //
      accum = { { 1'b0, accum[2*w-1:w] } + pp, accum[w-1:1] };

   end

endmodule

//

//////////////////////////////////////////////////////////////////////////////
/// Degree-m Sequential Multipliers

// Compute m partial products in each iteration.
//
// Will the synthesis program figure it out?

// Degree-m sequential multiplier: adds up to m partial products per cycle,
// described with a loop.
//
//   w      Width of each operand, in bits.
//   m      Number of bits of cand examined per cycle.
//   prod   2w-bit product, updated once per pass.
//   plier  Operand that is shifted and added to the accumulator.
//   cand   Operand whose bits are examined m per cycle.
//   clk    Clock.
//
// There is no handshaking: the operands must be held steady for a complete
// pass for prod to be their product.
//
// NOTE: when m >= w, iterations is 1 and iter_lg is 0, which makes the
// size cast below zero-width; use m < w.
//
module mult_seq_m #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   logic [iter_lg:1] iter;   // iter_lg bits wide.
   logic [2*w-1:0] accum;

   initial iter = 0;

   always @( posedge clk ) begin

      // End of a pass. The cast truncates iterations to the width of
      // iter: when iterations is a power of two it becomes zero, so the
      // test fires when iter wraps around to zero.
      if ( iter == iter_lg'(iterations) ) begin

         prod = accum;
         accum = 0;
         iter = 0;

      end

      for ( int i=0; i<m; i++ )
        begin
           int pos;
           pos = iter * m + i;
           // For pos >= w the select yields x and nothing is added.
           if ( cand[pos] ) accum += plier << pos;
        end

      iter++;

   end

endmodule

// Degree-m sequential multiplier: multiplies plier by an m-bit digit of
// cand in each cycle, described with the * operator.
//
//   w      Width of each operand, in bits.
//   m      Number of bits of cand consumed per cycle.
//   prod   2w-bit product, updated once per pass.
//   plier  Operand multiplied by each digit.
//   cand   Operand split into m-bit digits.
//   clk    Clock.
//
// There is no handshaking: the operands must be held steady for a complete
// pass for prod to be their product.
//
// NOTE: when m >= w, iterations is 1 and iter_lg is 0, which makes the
// size cast below zero-width; use m < w.
//
module mult_seq_dm
  #( int w = 16,
     int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   // cand viewed as an array of m-bit digits, digit 0 least significant.
   // Zero-extended when m does not divide w.
   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   logic [iter_lg:1] iter;   // iter_lg bits wide.
   logic [2*w-1:0] accum;

   initial iter = 0;

   always @( posedge clk ) begin

      // End of a pass. The cast truncates iterations to the width of
      // iter: when iterations is a power of two it becomes zero, so the
      // test fires when iter wraps around to zero.
      if ( iter == iter_lg'(iterations) ) begin

         prod = accum;
         accum = 0;
         iter = 0;

      end

      // The multiplication binds tighter than the shift: the digit
      // product is formed first and then moved to its position.
      accum += plier * cand_2d[iter] << ( iter * m );

      iter++;

   end

endmodule

// Degree-m sequential multiplier using a chain of m carry-save adders.
//
//   w      Width of each operand, in bits.
//   m      Number of partial products added per cycle.
//   prod   2w-bit product, updated once per pass.
//   plier  Operand that is shifted to form each partial product.
//   cand   Operand whose bits are examined m per cycle.
//   clk    Clock.
//
// A pass takes iterations + 1 cycles: iterations cycles of accumulation
// followed by one cycle in which the two accumulator halves are added and
// written to prod. There is no handshaking: the operands must be held
// steady for a complete pass.
//
module mult_seq_csa_m
  #( int w = 16,
     int m = 2 // Number of partial products per cycle.
     )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   // Width needed to hold the largest bit position computed below,
   // iterations * m + m - 1. If pos were too narrow, positions of w and
   // above would wrap around to small values, the pos < w test would not
   // catch them, and low partial products would be added a second time
   // whenever m does not divide w.
   localparam int pos_wid = $clog2( ( iterations + 1 ) * m );

   logic [iter_lg:0] iter;   // Wide enough to hold the value iterations.

   initial iter = 0;

   // Element i of each array is the input to adder i; element m is the
   // output of the last adder.
   uwire [2*w-1:0] accum_sum_a[0:m], accum_sum_b[0:m];
   logic [2*w-1:0] accum_sum_a_reg, accum_sum_b_reg;

   assign           accum_sum_a[0] = accum_sum_a_reg;
   assign           accum_sum_b[0] = accum_sum_b_reg;

   for ( genvar i=0; i<m; i++ ) begin

      uwire [pos_wid-1:0] pos = iter * m + i;
      uwire          co; // Unconnected.

      // Positions beyond the last bit of cand contribute nothing.
      uwire [2*w-1:0] pp = pos < w && cand[pos] ? plier << pos : 0;

      CW_csa #(2*w) csa
        ( .sum(accum_sum_a[i+1]), .carry(accum_sum_b[i+1]), .co(co),
          .a(accum_sum_a[i]), .b(accum_sum_b[i]), .c(pp), .ci(1'b0) );

   end

   always @( posedge clk ) begin

      if ( iter == iterations ) begin

         // All partial products have been accumulated: combine the two
         // halves into the product and start the next pass.
         prod <= accum_sum_a_reg + accum_sum_b_reg;

         accum_sum_a_reg <= 0;
         accum_sum_b_reg <= 0;
         iter <= 0;

      end else begin

         // prod is not assigned here and so holds its value.
         accum_sum_a_reg <= accum_sum_a[m];
         accum_sum_b_reg <= accum_sum_b[m];
         iter <= iter + 1;

      end

   end

endmodule

// Inferred Hardware for m = 2:
//

// Optimization Plan for m = 2:
//

`ifdef DONT_DEFINE_ME
Module Name                             Area   Clock    Total    Init.
Period    Delay   Interv
mult_pipe_2_wid16_pp_per_stage1       652540    1988    -1988     1988
mult_pipe_wid16_pp_per_stage1         747364    1717    27472     1717

mult_pipe_2_wid16_pp_per_stage2       390304    2530    -2530     2530
mult_pipe_wid16_pp_per_stage2         459860    2425    19400     2425

mult_pipe_2_wid16_pp_per_stage4       330368    2913    -2913     2913
mult_pipe_wid16_pp_per_stage4         357580    2983    11932     2983

mult_pipe_2_wid16_pp_per_stage8       256392    3515    -3515     3515
mult_pipe_wid16_pp_per_stage8         264352    3498     6996     3498
Normal exit.
`endif

//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

// Copies the clock and the cycle count through a program block.
//
//   clk_reactive    Copy of clk.
//   cycle_reactive  Copy of cycle.
//   clk             Clock to copy.
//   cycle           Cycle count to copy.
//
program reactivate
   ( output uwire clk_reactive,
     output int   cycle_reactive,
     input  uwire clk,
     input  int   cycle );

   assign cycle_reactive = cycle;
   assign clk_reactive   = clk;

endprogram

// Testbench: applies the same operands to every instantiated multiplier
// and compares each registered unit's output with the behavioral
// multiplier's output.
//
// Units are registered in the associative array pi, keyed by a display
// name. For each test the testbench waits a per-unit number of cycles,
// samples that unit's output, and then checks the samples against prod[0].
//
module testbench;

   localparam int w = 16;
   localparam int num_tests = 1000;
   localparam int NUM_MULT = 20;
   localparam int err_limit = 7;

   bit use_others;
   logic [w-1:0] plier, cand;
   logic [w-1:0] plierp, candp;
   logic [2*w-1:0] prod[NUM_MULT];

   // Per-unit information.
   //   idx        Index of the unit's output in prod.
   //   err_count  Number of wrong results seen.
   //   seq        Set for units that take a clock.
   //   pipe       Set by pi_pipe.
   //   deg        Partial products per cycle; used to compute the wait.
   //   sout       Output sampled for the current test.
   //   cyc_tot    Total cycles waited over all tests.
   //   latency    Cycles waited per test.
   typedef struct { int idx; int err_count = 0;
                    bit seq = 0; bit pipe = 0; int deg = 1;
                    logic [2*w-1:0] sout = 'h111; int cyc_tot = 0;
                    int latency = 0;
                    } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * w * 4;
   int cycle;
   bit done;
   logic clock;

   logic      clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Generate the clock, count cycles, and end the simulation when the
   // tests finish or the cycle limit is reached.
   initial begin
      clock = 0;
      cycle = 0;

      fork
         // Toggle the clock every 10 time units. The old value of clock
         // is added, so cycle is incremented on each falling edge.
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   // Drive the operands for piped units: the test values for one cycle
   // after a new test starts, filler values otherwise.
   initial begin

      while ( !done ) @( posedge clk_reactive ) #1

        if ( use_others ) begin

           plierp = plier;
           candp = cand;
           use_others = 0;

        end else begin

           plierp = cycle;
           candp = 256;

        end
   end

   // Register a sequential unit of the given degree under "<name> Deg <deg>".
   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1;
   endtask

   // Register a pipelined unit of the given degree under "<name> Deg <deg>".
   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1;
   endtask

   // The behavioral multiplier's output, prod[0], is the reference.
   mult_behav_1 #(w) mb1(prod[0], plier, cand);
   initial pi["Behavioral"].idx = 0;

   mult_linear  #(w) ms1(prod[1], plier, cand);
   initial pi["Linear"].idx = 1;

   mult_tree    #(w) ms2(prod[2], plier, cand);
   initial pi["Tree"].idx = 2;

   mult_seq #(w) ms3(prod[3], plier, cand, clock);
   initial begin
      automatic string m = "Sequential";
      pi[m].idx = 3; pi[m].seq = 1;
   end

   mult_seq_ga  #(w) msga1(prod[11], plier, cand, clock);
   initial begin
      automatic string m = "Sequential GA";
      pi[m].idx = 11; pi[m].seq = 1;
   end

   mult_seq_stream #(w) mss1(prod[4], plier, cand, clock);
   initial begin
      automatic string m = "Sequential Streamlined";
      pi[m].idx = 4; pi[m].seq = 1;
   end

   mult_seq_m   #(w,4) ms44(prod[5], plier, cand, clock);
   initial pi_seq(5,"Seq", ms44.m);

   mult_seq_m   #(w,3) ms43(prod[6], plier, cand, clock);
   initial pi_seq(6,"Seq", ms43.m);

   // The next two units are instantiated but not registered in pi, so
   // their outputs are not checked.
   mult_seq_dm  #(w,4) msd44(prod[9], plier, cand, clock);

   mult_seq_dm  #(w,3) msd43(prod[10], plier, cand, clock);

   mult_seq_csa  #(w) mcsa(prod[14], plier, cand, clock);
   initial begin
      automatic string m = $sformatf("Mult Seq CSA");
      pi[m].idx = 14; pi[m].seq = 1;
   end

   mult_linear_clk  #(w) mlc1(prod[15], plier, cand, clock);
   initial begin
      automatic string m = $sformatf("Linear Clock");
      pi[m].idx = 15; pi[m].seq = 1;
   end

   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   //
   int tests[$] = {1,1, 1,2,  1,32,  32, 1};

   initial begin

      done = 0;
      use_others = 0;

      @( posedge clk_reactive );

      for ( int i=0; i<num_tests; i++ ) begin
         automatic int cyc_start = cycle;
         automatic int awaiting = pi.num();

         // Set multiplier and multiplicand values for non-piped units.
         //
         plier = tests.size() ? tests.pop_front() : $random();
         cand = tests.size() ? tests.pop_front() : $random();

         // Set multiplier and multiplicand values for piped units.
         //
         plierp = plier;
         candp = cand;
         use_others = 1;

         // Start one process per unit. Each waits for that unit's
         // latency to elapse and then samples the unit's output.
         //
         foreach ( pi[muti] ) begin
            automatic string mut = muti; // Bug workaround?
            automatic Info p = pi[mut];
            fork begin
               automatic int steps = ( w + pi[mut].deg - 1 ) / pi[mut].deg;
               // One cycle for combinational units, steps cycles for
               // piped units, and twice that for other sequential units.
               automatic int latency
                 = !pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps;
               automatic int eta = 1 + cyc_start + latency;
               pi[mut].latency = latency;
               wait ( cycle_reactive == eta );
               awaiting--;
               pi[mut].sout = prod[pi[mut].idx];
               pi[mut].cyc_tot += cycle - cyc_start;
            end join_none;
         end
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] )
           if ( prod[0] !== pi[mut].sout ) begin
              pi[mut].err_count++;
              // Only the first few errors of each unit are reported.
              if ( pi[mut].err_count < 5 )
                $write
                  ("%-25s wrong result: %0d * %0d:  0x%0h != 0x%0h (correct)\n",
                   mut, plier, cand, pi[mut].sout, prod[0]);
           end

         @( posedge clk_reactive );

      end

      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);

      done = 1;

      $finish(2);

   end

endmodule