```/// EE 4755 - Digital Design Using HDLs
//
//  Classroom demo code.

// Time-stamp: <21 November 2016, 13:03:18 CST, koppel@cyc.ece.lsu.edu>

//////////////////////////////////////////////////////////////////////////////
/// Pipelining Concept

/// Pipelining Concept    <-- Very Important but Tricky Concept, Pay Attention!
//
//
//  :Def: Pipelining
//   Performing an operation in *stages* on multiple data items.

// :Example:
//
// Output x has the value that input a had two cycles in the past.
//
module very_simple_pipe
#( int w = 16 )
( output logic [w-1:0] x,
input wire [w-1:0] a,
input wire clk );

logic [w-1:0] r;

always_ff @( posedge clk ) r <= a;
always_ff @( posedge clk ) x <= r;

endmodule

// :Example:
//
// Output x has the value that input a had nstages cycles in the past.
//
module simple_pipe2
#( int w = 16, int nstages = 4 )
( output wire [w-1:0] x,
input wire [w-1:0] a,
input wire clk );

logic [w-1:0] r[nstages];

always_ff @( posedge clk ) begin

r[0] <= a;
for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];

end

assign x = r[nstages-1];

endmodule
//
/// Important thing to notice:
//
//  At any moment the module holds the most recent nstages values of a.

// :Example:
//
// Output x has the value that input a had nstages cycles in the past.//
// Output avg is the average of these values.
//
module simple_pipe_avg
#( int w = 16, int nstages = 4 )
( output wire [w-1:0] x,
output logic [w-1:0] avg,
input wire [w-1:0] a,
input wire clk );

logic [w-1:0] r[nstages];

always_ff @( posedge clk ) begin

r[0] <= a;
for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];

end

assign x = r[nstages-1];

logic [w+\$clog2(nstages):0] sum;

always_comb begin

sum = 0;
for ( int i=0; i<nstages; i++ ) sum += r[i];
avg = sum / nstages;

end

endmodule

// :Example:
//
// Output x has value of input a nstages cycles in the past, but incremented.
//
#( int w = 16, int nstages = 4 )
( output wire [w-1:0] x,
input wire [w-1:0] a,
input wire clk );

logic [w-1:0] r[nstages];

always_ff @( posedge clk ) begin

r[0] <= a;
for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1] + 1;

end

assign x = r[nstages-1];

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Pipelined Multiplier

/// Pipelining Idea    <-- Very Important Concept, Pay Attention!
//
//   Consider a four-bit multiply.
//
//     Module mult_seq computes the product ..
//     .. using the same logic ..
//     .. on four consecutive cycles.
//
//        Time:    0  1  2  3
//        Hardw:   S  S  S  S
//
//
//     A pipelined module might compute the product ..
//     .. using four different pieces of logic ..
//     .. each for one cycle.
//
//        Time:    0  1  2  3
//        Hardw:   P0 P1 P2 P3           <-  16 * 52
//
//        Time:    0  1  2  3  4  5  6
//        Hardw:   P0 P1 P2 P3           <-  16 * 52
//        Hardw:      P0 P1 P2 P3        <-  14 * 37
//        Hardw:         P0 P1 P2 P3     <-   7 * 2
//        Hardw:            P0 P1 P2 P3  <-  12 * 9
//
//
//   Benefits of Pipelining
//
//     A new operation can start *each* clock cycle.
//     Simpler hardware (though more of it).

/// Throughput v. Latency
//
// :Def: Throughput
//  The amount of work per unit time.
//  For example, 10 multiplies per clock cycle.
//               Quadrillion floating-point operations per second.
//
// :Def: Latency
//  The amount of time it takes to do something.
//  For example, the latency of a multiply is 20 ns.
//
//
/// Which is better, high throughput or low latency:
//
//   Impatient people low latency.
//   Productive people prefer high throughput.
//
//   Of course, it depends on the situation.

/// Coding of Pipelined Units
//
//   :Def: Pipeline Latch
//   A set of registers that divides stages.
//   Input to pipeline latch is in one stage ...
//   ... output is in the next stage.
//
//   Note:
//     "Pipeline Latch" indicates how a register is used ...
//     ... but it no different than other registers.
//
//

/// :Example: Basic Pipelined Multiplier -- mult_pipe1
//
//   Computes 1 partial product per stage.
//
module mult_pipe1
#( int wid = 16, int pp_per_stage = 1 )
( output logic [2*wid-1:0] prod,
input logic [wid-1:0] plier,
input logic [wid-1:0] cand,
input clk);

localparam int stages = wid;

// Note: pl is for pipeline latch.
logic [2*wid-1:0] pl_accum[0:stages];
logic [wid-1:0] pl_plier[0:stages];
logic [wid-1:0] pl_cand[0:stages];

always_ff @( posedge clk ) begin

pl_accum[0] = 0;
pl_plier[0] = plier;
pl_cand[0] = cand;

for ( int stage=0; stage<stages; stage++ ) begin

/// Compute some stuff within this stage.
//
logic [2*wid-1:0] pp, accum;

pp = pl_cand[stage][stage] ? pl_plier[stage] << stage : 0;
//                   POS                         POS

accum = pl_accum[stage] + pp;

/// Move *everything* to next stage.
//
pl_accum[stage+1] <= accum;
pl_cand[stage+1]  <= pl_cand[stage];
pl_plier[stage+1] <= pl_plier[stage];

end

end

assign prod = pl_accum[stages];

endmodule

/// :Example: Basic Pipelined Multiplier -- mult_pipe
//
//   Computes pp_per_stage partial products per stage.
//

module mult_pipe #( int wid = 16, int pp_per_stage = 2 )
( output logic [2*wid-1:0] prod,
input logic [wid-1:0] plier,
input logic [wid-1:0] cand,
input clk);

localparam int stages = ( wid + pp_per_stage - 1 ) / pp_per_stage;

// Note: pl is for pipeline latch.
logic [2*wid-1:0] pl_accum[0:stages];
logic [wid-1:0] pl_plier[0:stages];
logic [wid-1:0] pl_cand[0:stages];

always_ff @( posedge clk ) begin

pl_accum[0] = 0;
pl_plier[0] = plier;
pl_cand[0] = cand;

for ( int stage=0; stage<stages; stage++ ) begin

logic [2*wid-1:0] accum;
accum = pl_accum[stage];

for ( int j=0; j<pp_per_stage; j++ ) begin

int pos;
pos = stage * pp_per_stage + j;

if ( pos < wid && pl_cand[stage][pos] )
accum += pl_plier[stage] << pos;

accum += pl_cond[stage][pos] * pl_plier[stage] << pos;

end

/// Values to use in the next clock cycle, *not* the next iteration.
pl_accum[stage+1] <= accum;
pl_cand[stage+1] <= pl_cand[stage];
pl_plier[stage+1] <= pl_plier[stage];

end

end

assign prod = pl_accum[stages];

endmodule

// Areas for improvement of mult_pipe
//
// -- Use fewer bits:
//
//    accum: Earlier stages need fewer bits.
//    cand:  Later stages need fewer bits.
//
// -- Use a carry-save adder until the last stage.

// :Example: Alternative computation of partial products.
//
// Like mult_pipe, except compute set of pp_per_stage partial
// products using a multiplier.
//
// We (humans) know that the two are equivalent. Will the synthesis
// program's optimization code see the two as equivalent?

module mult_pipe_2 #( int wid = 16, int pp_per_stage = 2 )
( output logic [2*wid-1:0] prod,
input logic [wid-1:0] plier,
input logic [wid-1:0] cand,
input clk);

localparam int stages = ( wid + pp_per_stage - 1 ) / pp_per_stage;

logic [2*wid-1:0] pl_accum[0:stages];
logic [wid-1:0] pl_plier[0:stages];
logic [stages-1:0][pp_per_stage-1:0] pl_cand[0:stages];

always_ff @( posedge clk ) begin

pl_accum[0] = 0;
pl_plier[0] = plier;
pl_cand[0] = cand;

for ( int stage=0; stage<stages; stage++ ) begin

pl_accum[stage+1] <=
pl_accum[stage] +
( pl_plier[stage] * pl_cand[stage][stage] << stage*pp_per_stage );

pl_cand[stage+1] <= pl_cand[stage];

pl_plier[stage+1] <= pl_plier[stage];

end

end

assign prod = pl_accum[stages];

endmodule

// Module Name                             Area   Clock    Total    Init.
//                                               Period    Delay   Interv
// mult_pipe_2_wid16_pp_per_stage1       652540    1988    -1988     1988
// mult_pipe_wid16_pp_per_stage1         747364    1717    27472     1717

// mult_pipe_2_wid16_pp_per_stage2       390304    2530    -2530     2530
// mult_pipe_wid16_pp_per_stage2         459860    2425    19400     2425

// mult_pipe_2_wid16_pp_per_stage4       330368    2913    -2913     2913
// mult_pipe_wid16_pp_per_stage4         357580    2983    11932     2983

// mult_pipe_2_wid16_pp_per_stage8       256392    3515    -3515     3515
// mult_pipe_wid16_pp_per_stage8         264352    3498     6996     3498

//////////////////////////////////////////////////////////////////////////////
/// Pipelined Multiplier, Instantiated Stages
//  mult_pipe_comb_stage
//
//  Very similar to mult_pipe, except each stage is a module.

module pipe_stage #( int wid = 16, int pp_per_stage = 2, int stage = 0 )
( output logic [2*wid-1:0] accum_out,
input logic [2*wid-1:0] accum_in,
input logic [wid-1:0] plier,
input logic [wid-1:0] cand );

always_comb begin

logic [2*wid-1:0] accum;  accum = accum_in;

for ( int j=0; j<pp_per_stage; j++ ) begin

int pos;  pos = stage * pp_per_stage + j;

if ( pos < wid && cand[pos] )
accum += plier << pos;

end

accum_out = accum;

end

endmodule

module mult_pipe_c_cpa #( int wid = 16, int pp_per_stage = 2 )
( output logic [2*wid-1:0] prod,
input logic [wid-1:0] plier,
input logic [wid-1:0] cand,
input clk);

localparam int stages = ( wid + pp_per_stage - 1 ) / pp_per_stage;

logic [2*wid-1:0] pl_accum[0:stages];
logic [wid-1:0] pl_plier[0:stages];
logic [wid-1:0] pl_cand[0:stages];

for ( genvar stage = 0;  stage < stages; stage++ ) begin

wire [2*wid-1:0] accum;

pipe_stage #(wid, pp_per_stage, stage) our_stage
(accum, pl_accum[stage], pl_plier[stage], pl_cand[stage]);

always_ff @( posedge clk )
pl_accum[stage+1] <= accum;

end

always_comb begin
pl_plier[0] = plier;
pl_cand[0] = cand;
end

always_ff @( posedge clk ) begin

pl_accum[0] = 0;

for ( int stage=0; stage<stages; stage++ ) begin

pl_cand[stage+1] <= pl_cand[stage];
pl_plier[stage+1] <= pl_plier[stage];

end

end

assign prod = pl_accum[stages];

endmodule

module pipe_stage_csa #( int wid = 16, int pp_per_stage = 2, int stage = 0 )
( output wire [2*wid-1:0] accum_out_a, accum_out_b,
input wire [2*wid-1:0] accum_in_a, accum_in_b,
input wire [wid-1:0] plier,
input wire [wid-1:0] cand );

wire [2*wid-1:0] accum_a[pp_per_stage-1:-1];
wire [2*wid-1:0] accum_b[pp_per_stage-1:-1];
wire co[-1:pp_per_stage-1];

assign accum_a[-1] = accum_in_a;
assign accum_b[-1] = accum_in_b;
assign accum_out_a = accum_a[pp_per_stage-1];
assign accum_out_b = accum_b[pp_per_stage-1];

for ( genvar i = 0; i < pp_per_stage; i++ ) begin

localparam int pos = stage * pp_per_stage + i;

wire [2*wid-1:0] pp = pos < wid && cand[pos] ? plier << pos : 0;

CW_csa #(2*wid) csa
( .carry(accum_a[i]), .sum(accum_b[i]), .co(co[i]),
.a(accum_a[i-1]), .b(accum_b[i-1]), .c(pp), .ci(1'b0) );

end

endmodule

module mult_pipe_c_csa #( int wid = 16, int pp_per_stage = 2 )
( output wire [2*wid-1:0] prod,
input wire [wid-1:0] plier,
input wire [wid-1:0] cand,
input wire clk);

localparam int stages = ( wid + pp_per_stage - 1 ) / pp_per_stage;

logic [2*wid-1:0] pl_accum_a[0:stages];
logic [2*wid-1:0] pl_accum_b[0:stages];
logic [wid-1:0] pl_plier[0:stages];
logic [wid-1:0] pl_cand[0:stages];

for ( genvar stage = 0;  stage < stages;  stage++ ) begin

wire [2*wid-1:0] accum_a, accum_b;

pipe_stage_csa #(wid, pp_per_stage, stage) our_stage
(accum_a, accum_b, pl_accum_a[stage], pl_accum_b[stage],
pl_plier[stage], pl_cand[stage]);

always_ff @( posedge clk ) begin
pl_accum_a[stage+1] <= accum_a;
pl_accum_b[stage+1] <= accum_b;
end

end

always_comb begin
pl_plier[0] = plier;
pl_cand[0] = cand;
end

always_ff @( posedge clk ) begin

pl_accum_a[0] = 0;
pl_accum_b[0] = 0;

for ( int stage=0; stage<stages; stage++ ) begin

pl_cand[stage+1] <= pl_cand[stage];
pl_plier[stage+1] <= pl_plier[stage];

end

end

assign prod = pl_accum_a[stages] + pl_accum_b[stages];

endmodule

module mult_behav_1
#(int wid = 16)
(output logic[2*wid-1:0] prod, input logic[wid-1:0] plier, cand);

assign prod = plier * cand;
endmodule

//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

program reactivate
(output wire clk_reactive, output int cycle_reactive,
input wire clk, input int cycle);
assign clk_reactive = clk;
assign cycle_reactive = cycle;
endprogram

module testbench;

localparam int wid = 16;
localparam int num_tests = 1000;
localparam int NUM_MULT = 20;
localparam int err_limit = 7;

bit use_others;
logic [wid-1:0] plier, cand;
logic [wid-1:0] plierp, candp;
logic [2*wid-1:0] prod[NUM_MULT];

typedef struct { int idx; int err_count = 0;
bit seq = 0; bit pipe = 0; int deg = 1;
logic [2*wid-1:0] sout = 'h111; int cyc_tot = 0;
int latency = 0;
} Info;
Info pi[string];

localparam int cycle_limit = num_tests * wid * 4;
int cycle;
bit done;
logic clock;

logic      clk_reactive;
int cycle_reactive;
reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

initial begin
clock = 0;
cycle = 0;

fork
forever #10 cycle += clock++;
wait( done );
wait( cycle >= cycle_limit )
\$write("*** Cycle limit exceeded, ending.\n");
join_any;

\$finish();
end

initial begin

while ( !done ) @( posedge clk_reactive ) #1

if ( use_others ) begin

plierp = plier;
candp = cand;
use_others = 0;

end else begin

plierp = cycle;
candp = 256;

end
end

task pi_seq(input int idx, input string name, input int deg);
automatic string m = \$sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1;

task pi_pipe(input int idx, input string name, input int deg);
automatic string m = \$sformatf("%s Deg %0d", name, deg);
pi[m].deg = deg;
pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1;

mult_behav_1 #(wid) mb1(prod[0], plier, cand);
initial pi["Behavioral"].idx = 0;

mult_pipe1    #(wid) ms18(prod[18], plierp, candp, clock);
initial pi_pipe(18,"Pipelined Simple",1);

mult_pipe    #(wid,4) ms54(prod[7], plierp, candp, clock);
initial pi_pipe(7,"Pipelined",ms54.pp_per_stage);

mult_pipe    #(wid,3) ms53(prod[8], plierp, candp, clock);
initial pi_pipe(8,"Pipelined",ms53.pp_per_stage);

mult_pipe_2    #(wid,4) ms17(prod[17], plierp, candp, clock);
initial pi_pipe(17,"Pipelined 2",ms17.pp_per_stage);

mult_pipe_2    #(wid,3) ms16(prod[16], plierp, candp, clock);
initial pi_pipe(16,"Pipelined 2",ms16.pp_per_stage);

mult_pipe_c_cpa #(wid,4) pgam4(prod[12], plierp, candp, clock);
initial pi_pipe(12,"Pipelined Comb CPA",pgam4.pp_per_stage);

mult_pipe_c_cpa    #(wid,3) pgam13(prod[13], plierp, candp, clock);
initial pi_pipe(13,"Pipelined Comb CPA",pgam13.pp_per_stage);

mult_pipe_c_csa    #(wid,3) pgam2(prod[2], plierp, candp, clock);
initial pi_pipe(2,"Pipelined CSA",pgam2.pp_per_stage);

mult_pipe_c_csa    #(wid,4) pgam3(prod[3], plierp, candp, clock);
initial pi_pipe(3,"Pipelined CSA",pgam3.pp_per_stage);

// Array of multiplier/multiplicand values to try out.
// After these values are used a random number generator will be used.
//
int tests[\$] = {1,1, 1,2,  1,32,  32, 1};

initial begin

done = 0;
use_others = 0;

@( posedge clk_reactive );

for ( int i=0; i<num_tests; i++ ) begin
automatic int cyc_start = cycle;
automatic int awaiting = pi.num();

// Set multiplier and multiplicand values for non-piped units.
//
plier = tests.size() ? tests.pop_front() : \$random();
cand = tests.size() ? tests.pop_front() : \$random();

// Set multiplier and multiplicand values for piped units.
//
plierp = plier;
candp = cand;
use_others = 1;

foreach ( pi[muti] ) begin
automatic string mut = muti; // Bug workaround?
automatic Info p = pi[mut];

fork begin
automatic int steps = ( wid + pi[mut].deg - 1 ) / pi[mut].deg;
automatic int latency
= !pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps;
automatic int eta = 1 + cyc_start + latency;
pi[mut].latency = latency;
wait ( cycle_reactive == eta );
awaiting--;
pi[mut].sout = prod[pi[mut].idx];
pi[mut].cyc_tot += cycle - cyc_start;
end join_none;
end
wait ( awaiting == 0 );

// Check the output of each Module Under Test.
//
foreach ( pi[ mut ] )
if ( prod[0] !== pi[mut].sout ) begin
pi[mut].err_count++;
if ( pi[mut].err_count < 5 )
\$write
("%-25s wrong result: %0d * %0d:  0x%0h != 0x%0h (correct)\n",
mut, plier, cand, pi[mut].sout, prod[0]);
end

@( posedge clk_reactive );

end

foreach ( pi[ mut ] )
\$write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
num_tests, mut, pi[mut].err_count,
pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);

done = 1;

\$finish(2);

end

endmodule