/// EE 4755 - Digital Design Using HDLs // // Classroom demo code. // Time-stamp: <21 November 2016, 13:03:18 CST, koppel@cyc.ece.lsu.edu> ////////////////////////////////////////////////////////////////////////////// /// Pipelining Concept /// Pipelining Concept <-- Very Important but Tricky Concept, Pay Attention! // // // :Def: Pipelining // Performing an operation in *stages* on multiple data items. // :Example: // // Output x has the value that input a had two cycles in the past. // module very_simple_pipe #( int w = 16 ) ( output logic [w-1:0] x, input wire [w-1:0] a, input wire clk ); logic [w-1:0] r; always_ff @( posedge clk ) r <= a; always_ff @( posedge clk ) x <= r; endmodule // :Example: // // Output x has the value that input a had nstages cycles in the past. // module simple_pipe2 #( int w = 16, int nstages = 4 ) ( output wire [w-1:0] x, input wire [w-1:0] a, input wire clk ); logic [w-1:0] r[nstages]; always_ff @( posedge clk ) begin r[0] <= a; for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1]; end assign x = r[nstages-1]; endmodule // /// Important thing to notice: // // At any moment the module holds the most recent nstages values of a. // :Example: // // Output x has the value that input a had nstages cycles in the past.// // Output avg is the average of these values. // module simple_pipe_avg #( int w = 16, int nstages = 4 ) ( output wire [w-1:0] x, output logic [w-1:0] avg, input wire [w-1:0] a, input wire clk ); logic [w-1:0] r[nstages]; always_ff @( posedge clk ) begin r[0] <= a; for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1]; end assign x = r[nstages-1]; logic [w+$clog2(nstages):0] sum; always_comb begin sum = 0; for ( int i=0; i<nstages; i++ ) sum += r[i]; avg = sum / nstages; end endmodule // :Example: // // Output x has value of input a nstages cycles in the past, but incremented. // module simple_pipe_add1 #( int w = 16, int nstages = 4 ) ( output wire [w-1:0] x, input wire [w-1:0] a, input wire clk ); logic [w-1:0] r[nstages]; always_ff @( posedge clk ) begin r[0] <= a; for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1] + 1; end assign x = r[nstages-1]; endmodule ////////////////////////////////////////////////////////////////////////////// /// Pipelined Multiplier /// Pipelining Idea <-- Very Important Concept, Pay Attention! // // Consider a four-bit multiply. // // Module mult_seq computes the product .. // .. using the same logic .. // .. on four consecutive cycles. // // Time: 0 1 2 3 // Hardw: S S S S // // // A pipelined module might compute the product .. // .. using four different pieces of logic .. // .. each for one cycle. // // Time: 0 1 2 3 // Hardw: P0 P1 P2 P3 <- 16 * 52 // // Time: 0 1 2 3 4 5 6 // Hardw: P0 P1 P2 P3 <- 16 * 52 // Hardw: P0 P1 P2 P3 <- 14 * 37 // Hardw: P0 P1 P2 P3 <- 7 * 2 // Hardw: P0 P1 P2 P3 <- 12 * 9 // // // Benefits of Pipelining // // A new operation can start *each* clock cycle. // Simpler hardware (though more of it). /// Throughput v. Latency // // :Def: Throughput // The amount of work per unit time. // For example, 10 multiplies per clock cycle. // Quadrillion floating-point operations per second. // // :Def: Latency // The amount of time it takes to do something. // For example, the latency of a multiply is 20 ns. // // /// Which is better, high throughput or low latency: // // Impatient people low latency. // Productive people prefer high throughput. // // Of course, it depends on the situation. /// Coding of Pipelined Units // // :Def: Pipeline Latch // A set of registers that divides stages. // Input to pipeline latch is in one stage ... // ... output is in the next stage. // // Note: // "Pipeline Latch" indicates how a register is used ... // ... but it no different than other registers. // // /// :Example: Basic Pipelined Multiplier -- mult_pipe1 // // Computes 1 partial product per stage. // module mult_pipe1 #( int wid = 16, int pp_per_stage = 1 ) ( output logic [2*wid-1:0] prod, input logic [wid-1:0] plier, input logic [wid-1:0] cand, input clk); localparam int stages = wid; // Note: pl is for pipeline latch. logic [2*wid-1:0] pl_accum[0:stages]; logic [wid-1:0] pl_plier[0:stages]; logic [wid-1:0] pl_cand[0:stages]; always_ff @( posedge clk ) begin pl_accum[0] = 0; pl_plier[0] = plier; pl_cand[0] = cand; for ( int stage=0; stage<stages; stage++ ) begin /// Compute some stuff within this stage. // logic [2*wid-1:0] pp, accum; pp = pl_cand[stage][stage] ? pl_plier[stage] << stage : 0; // POS POS accum = pl_accum[stage] + pp; /// Move *everything* to next stage. // pl_accum[stage+1] <= accum; pl_cand[stage+1] <= pl_cand[stage]; pl_plier[stage+1] <= pl_plier[stage]; end end assign prod = pl_accum[stages]; endmodule /// :Example: Basic Pipelined Multiplier -- mult_pipe // // Computes pp_per_stage partial products per stage. // module mult_pipe #( int wid = 16, int pp_per_stage = 2 ) ( output logic [2*wid-1:0] prod, input logic [wid-1:0] plier, input logic [wid-1:0] cand, input clk); localparam int stages = ( wid + pp_per_stage - 1 ) / pp_per_stage; // Note: pl is for pipeline latch. logic [2*wid-1:0] pl_accum[0:stages]; logic [wid-1:0] pl_plier[0:stages]; logic [wid-1:0] pl_cand[0:stages]; always_ff @( posedge clk ) begin pl_accum[0] = 0; pl_plier[0] = plier; pl_cand[0] = cand; for ( int stage=0; stage<stages; stage++ ) begin logic [2*wid-1:0] accum; accum = pl_accum[stage]; for ( int j=0; j<pp_per_stage; j++ ) begin int pos; pos = stage * pp_per_stage + j; if ( pos < wid && pl_cand[stage][pos] ) accum += pl_plier[stage] << pos; accum += pl_cond[stage][pos] * pl_plier[stage] << pos; end /// Values to use in the next clock cycle, *not* the next iteration. pl_accum[stage+1] <= accum; pl_cand[stage+1] <= pl_cand[stage]; pl_plier[stage+1] <= pl_plier[stage]; end end assign prod = pl_accum[stages]; endmodule // Areas for improvement of mult_pipe // // -- Use fewer bits: // // accum: Earlier stages need fewer bits. // cand: Later stages need fewer bits. // // -- Use a carry-save adder until the last stage. // :Example: Alternative computation of partial products. // // Like mult_pipe, except compute set of pp_per_stage partial // products using a multiplier. // // We (humans) know that the two are equivalent. Will the synthesis // program's optimization code see the two as equivalent? module mult_pipe_2 #( int wid = 16, int pp_per_stage = 2 ) ( output logic [2*wid-1:0] prod, input logic [wid-1:0] plier, input logic [wid-1:0] cand, input clk); localparam int stages = ( wid + pp_per_stage - 1 ) / pp_per_stage; logic [2*wid-1:0] pl_accum[0:stages]; logic [wid-1:0] pl_plier[0:stages]; logic [stages-1:0][pp_per_stage-1:0] pl_cand[0:stages]; always_ff @( posedge clk ) begin pl_accum[0] = 0; pl_plier[0] = plier; pl_cand[0] = cand; for ( int stage=0; stage<stages; stage++ ) begin pl_accum[stage+1] <= pl_accum[stage] + ( pl_plier[stage] * pl_cand[stage][stage] << stage*pp_per_stage ); pl_cand[stage+1] <= pl_cand[stage]; pl_plier[stage+1] <= pl_plier[stage]; end end assign prod = pl_accum[stages]; endmodule // Module Name Area Clock Total Init. // Period Delay Interv // mult_pipe_2_wid16_pp_per_stage1 652540 1988 -1988 1988 // mult_pipe_wid16_pp_per_stage1 747364 1717 27472 1717 // mult_pipe_2_wid16_pp_per_stage2 390304 2530 -2530 2530 // mult_pipe_wid16_pp_per_stage2 459860 2425 19400 2425 // mult_pipe_2_wid16_pp_per_stage4 330368 2913 -2913 2913 // mult_pipe_wid16_pp_per_stage4 357580 2983 11932 2983 // mult_pipe_2_wid16_pp_per_stage8 256392 3515 -3515 3515 // mult_pipe_wid16_pp_per_stage8 264352 3498 6996 3498 ////////////////////////////////////////////////////////////////////////////// /// Pipelined Multiplier, Instantiated Stages // mult_pipe_comb_stage // // Very similar to mult_pipe, except each stage is a module. module pipe_stage #( int wid = 16, int pp_per_stage = 2, int stage = 0 ) ( output logic [2*wid-1:0] accum_out, input logic [2*wid-1:0] accum_in, input logic [wid-1:0] plier, input logic [wid-1:0] cand ); always_comb begin logic [2*wid-1:0] accum; accum = accum_in; for ( int j=0; j<pp_per_stage; j++ ) begin int pos; pos = stage * pp_per_stage + j; if ( pos < wid && cand[pos] ) accum += plier << pos; end accum_out = accum; end endmodule module mult_pipe_c_cpa #( int wid = 16, int pp_per_stage = 2 ) ( output logic [2*wid-1:0] prod, input logic [wid-1:0] plier, input logic [wid-1:0] cand, input clk); localparam int stages = ( wid + pp_per_stage - 1 ) / pp_per_stage; logic [2*wid-1:0] pl_accum[0:stages]; logic [wid-1:0] pl_plier[0:stages]; logic [wid-1:0] pl_cand[0:stages]; for ( genvar stage = 0; stage < stages; stage++ ) begin wire [2*wid-1:0] accum; pipe_stage #(wid, pp_per_stage, stage) our_stage (accum, pl_accum[stage], pl_plier[stage], pl_cand[stage]); always_ff @( posedge clk ) pl_accum[stage+1] <= accum; end always_comb begin pl_plier[0] = plier; pl_cand[0] = cand; end always_ff @( posedge clk ) begin pl_accum[0] = 0; for ( int stage=0; stage<stages; stage++ ) begin pl_cand[stage+1] <= pl_cand[stage]; pl_plier[stage+1] <= pl_plier[stage]; end end assign prod = pl_accum[stages]; endmodule `include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_csa.v" module pipe_stage_csa #( int wid = 16, int pp_per_stage = 2, int stage = 0 ) ( output wire [2*wid-1:0] accum_out_a, accum_out_b, input wire [2*wid-1:0] accum_in_a, accum_in_b, input wire [wid-1:0] plier, input wire [wid-1:0] cand ); wire [2*wid-1:0] accum_a[pp_per_stage-1:-1]; wire [2*wid-1:0] accum_b[pp_per_stage-1:-1]; wire co[-1:pp_per_stage-1]; assign accum_a[-1] = accum_in_a; assign accum_b[-1] = accum_in_b; assign accum_out_a = accum_a[pp_per_stage-1]; assign accum_out_b = accum_b[pp_per_stage-1]; for ( genvar i = 0; i < pp_per_stage; i++ ) begin localparam int pos = stage * pp_per_stage + i; wire [2*wid-1:0] pp = pos < wid && cand[pos] ? plier << pos : 0; CW_csa #(2*wid) csa ( .carry(accum_a[i]), .sum(accum_b[i]), .co(co[i]), .a(accum_a[i-1]), .b(accum_b[i-1]), .c(pp), .ci(1'b0) ); end endmodule module mult_pipe_c_csa #( int wid = 16, int pp_per_stage = 2 ) ( output wire [2*wid-1:0] prod, input wire [wid-1:0] plier, input wire [wid-1:0] cand, input wire clk); localparam int stages = ( wid + pp_per_stage - 1 ) / pp_per_stage; logic [2*wid-1:0] pl_accum_a[0:stages]; logic [2*wid-1:0] pl_accum_b[0:stages]; logic [wid-1:0] pl_plier[0:stages]; logic [wid-1:0] pl_cand[0:stages]; for ( genvar stage = 0; stage < stages; stage++ ) begin wire [2*wid-1:0] accum_a, accum_b; pipe_stage_csa #(wid, pp_per_stage, stage) our_stage (accum_a, accum_b, pl_accum_a[stage], pl_accum_b[stage], pl_plier[stage], pl_cand[stage]); always_ff @( posedge clk ) begin pl_accum_a[stage+1] <= accum_a; pl_accum_b[stage+1] <= accum_b; end end always_comb begin pl_plier[0] = plier; pl_cand[0] = cand; end always_ff @( posedge clk ) begin pl_accum_a[0] = 0; pl_accum_b[0] = 0; for ( int stage=0; stage<stages; stage++ ) begin pl_cand[stage+1] <= pl_cand[stage]; pl_plier[stage+1] <= pl_plier[stage]; end end assign prod = pl_accum_a[stages] + pl_accum_b[stages]; endmodule module mult_behav_1 #(int wid = 16) (output logic[2*wid-1:0] prod, input logic[wid-1:0] plier, cand); assign prod = plier * cand; endmodule ////////////////////////////////////////////////////////////////////////////// /// Testbench Code // cadence translate_off program reactivate (output wire clk_reactive, output int cycle_reactive, input wire clk, input int cycle); assign clk_reactive = clk; assign cycle_reactive = cycle; endprogram module testbench; localparam int wid = 16; localparam int num_tests = 1000; localparam int NUM_MULT = 20; localparam int err_limit = 7; bit use_others; logic [wid-1:0] plier, cand; logic [wid-1:0] plierp, candp; logic [2*wid-1:0] prod[NUM_MULT]; typedef struct { int idx; int err_count = 0; bit seq = 0; bit pipe = 0; int deg = 1; logic [2*wid-1:0] sout = 'h111; int cyc_tot = 0; int latency = 0; } Info; Info pi[string]; localparam int cycle_limit = num_tests * wid * 4; int cycle; bit done; logic clock; logic clk_reactive; int cycle_reactive; reactivate ra(clk_reactive,cycle_reactive,clock,cycle); initial begin clock = 0; cycle = 0; fork forever #10 cycle += clock++; wait( done ); wait( cycle >= cycle_limit ) $write("*** Cycle limit exceeded, ending.\n"); join_any; $finish(); end initial begin while ( !done ) @( posedge clk_reactive ) #1 if ( use_others ) begin plierp = plier; candp = cand; use_others = 0; end else begin plierp = cycle; candp = 256; end end task pi_seq(input int idx, input string name, input int deg); automatic string m = $sformatf("%s Deg %0d", name, deg); pi[m].deg = deg; pi[m].idx = idx; pi[m].seq = 1; endtask task pi_pipe(input int idx, input string name, input int deg); automatic string m = $sformatf("%s Deg %0d", name, deg); pi[m].deg = deg; pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; endtask mult_behav_1 #(wid) mb1(prod[0], plier, cand); initial pi["Behavioral"].idx = 0; mult_pipe1 #(wid) ms18(prod[18], plierp, candp, clock); initial pi_pipe(18,"Pipelined Simple",1); mult_pipe #(wid,4) ms54(prod[7], plierp, candp, clock); initial pi_pipe(7,"Pipelined",ms54.pp_per_stage); mult_pipe #(wid,3) ms53(prod[8], plierp, candp, clock); initial pi_pipe(8,"Pipelined",ms53.pp_per_stage); mult_pipe_2 #(wid,4) ms17(prod[17], plierp, candp, clock); initial pi_pipe(17,"Pipelined 2",ms17.pp_per_stage); mult_pipe_2 #(wid,3) ms16(prod[16], plierp, candp, clock); initial pi_pipe(16,"Pipelined 2",ms16.pp_per_stage); mult_pipe_c_cpa #(wid,4) pgam4(prod[12], plierp, candp, clock); initial pi_pipe(12,"Pipelined Comb CPA",pgam4.pp_per_stage); mult_pipe_c_cpa #(wid,3) pgam13(prod[13], plierp, candp, clock); initial pi_pipe(13,"Pipelined Comb CPA",pgam13.pp_per_stage); mult_pipe_c_csa #(wid,3) pgam2(prod[2], plierp, candp, clock); initial pi_pipe(2,"Pipelined CSA",pgam2.pp_per_stage); mult_pipe_c_csa #(wid,4) pgam3(prod[3], plierp, candp, clock); initial pi_pipe(3,"Pipelined CSA",pgam3.pp_per_stage); // Array of multiplier/multiplicand values to try out. // After these values are used a random number generator will be used. // int tests[$] = {1,1, 1,2, 1,32, 32, 1}; initial begin done = 0; use_others = 0; @( posedge clk_reactive ); for ( int i=0; i<num_tests; i++ ) begin automatic int cyc_start = cycle; automatic int awaiting = pi.num(); // Set multiplier and multiplicand values for non-piped units. // plier = tests.size() ? tests.pop_front() : $random(); cand = tests.size() ? tests.pop_front() : $random(); // Set multiplier and multiplicand values for piped units. // plierp = plier; candp = cand; use_others = 1; foreach ( pi[muti] ) begin automatic string mut = muti; // Bug workaround? automatic Info p = pi[mut]; fork begin automatic int steps = ( wid + pi[mut].deg - 1 ) / pi[mut].deg; automatic int latency = !pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps; automatic int eta = 1 + cyc_start + latency; pi[mut].latency = latency; wait ( cycle_reactive == eta ); awaiting--; pi[mut].sout = prod[pi[mut].idx]; pi[mut].cyc_tot += cycle - cyc_start; end join_none; end wait ( awaiting == 0 ); // Check the output of each Module Under Test. // foreach ( pi[ mut ] ) if ( prod[0] !== pi[mut].sout ) begin pi[mut].err_count++; if ( pi[mut].err_count < 5 ) $write ("%-25s wrong result: %0d * %0d: 0x%0h != 0x%0h (correct)\n", mut, plier, cand, pi[mut].sout, prod[0]); end @( posedge clk_reactive ); end foreach ( pi[ mut ] ) $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n", num_tests, mut, pi[mut].err_count, pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1); done = 1; $finish(2); end endmodule // cadence translate_on