/// EE 4755 - Digital Design Using HDLs
//
//  Classroom demo code.
//  Time-stamp: <14 November 2016, 11:52:48 CST, koppel@cyc.ece.lsu.edu>

//////////////////////////////////////////////////////////////////////////////
/// Behavioral Multiplier

// Combinational multiplier described with the * operator.
//
//   wid:    Width of each operand in bits.
//   prod:   2*wid-bit product of plier and cand.
//
module mult_behav_1
  #(int wid = 16)
   (output logic[2*wid-1:0] prod,
    input logic[wid-1:0] plier, cand);

   assign prod = plier * cand;
endmodule


//////////////////////////////////////////////////////////////////////////////
/// Linear Multiplier

/// Simple Adder, Don't Modify
//
// w-bit adder: s = a + b, no carry out.
//
module carry_prop_adder#(int w=16)(output [w:1] s, input [w:1] a,b);
   assign s = a + b;
endmodule

// Combinational multiplier built from a chain of wid adders.
//
// rsum[i] holds the sum of partial products 0..i; rsum[-1] is the
// zero that starts the chain, and rsum[wid-1] is the product.
//
module mult_linear
  #(int wid = 16)
   (output logic[2*wid-1:0] prod,
    input logic[wid-1:0] plier, cand);

   logic [2*wid-1:0] rsum [wid-1:-1];

   assign rsum[-1] = 0;

   for ( genvar i=0; i<wid; i++ ) begin
      // Partial product i: multiplicand shifted by i if multiplier bit i set.
      wire [2*wid-1:0] pprod = plier[i] ? cand << i : 0;
      carry_prop_adder #(2*wid) adder(rsum[i], rsum[i-1], pprod );
   end

   assign prod = rsum[wid-1];
endmodule

// mult_linear with its output captured in a register on each positive
// clock edge.
//
// NOTE(review): pliercpy and candcpy are loaded on every clock edge but
// are never read; the mult_linear instance is connected to plier and
// cand directly. Confirm whether the instance was meant to be driven by
// the registered copies.
//
module mult_linear_clk
  #(int wid = 16)
   (output logic[2*wid-1:0] prod,
    input logic[wid-1:0] plier, cand,
    input clk);

   wire [2*wid-1:0] p;
   logic [wid-1:0] pliercpy, candcpy;

   mult_linear #(wid) ml(p, plier, cand);

   always_ff @( posedge clk ) begin
      pliercpy <= plier;
      candcpy <= cand;
      prod <= p;
   end
endmodule


//////////////////////////////////////////////////////////////////////////////
/// Tree Multiplier

// Combinational multiplier that sums the partial products with a
// binary tree of adders.
//
// The tree is stored heap-style in rsum: elements 0..widp2-1 are the
// leaves (partial products) and elements widp2..2*widp2-2 are adder
// outputs. Node i adds the elements at indices (2i) and (2i+1), taken
// modulo 2*widp2. The root is element 2*widp2-2.
//
// widp2 is wid rounded up to a power of two. Leaves at positions
// wid..widp2-1 are tied to zero, so the tree is complete for any wid.
// When wid is already a power of two, widp2 == wid and no padding
// leaves are generated.
//
module mult_tree
  #(int wid = 16)
   (output logic[2*wid-1:0] prod,
    input logic[wid-1:0] plier, cand);

   localparam int widp2 = 1 << $clog2(wid);
   localparam int mask = 2*widp2-1;

   logic [2*wid-1:0] rsum [2*widp2-1:0];

   // Compute partial products. Padding leaves contribute zero.
   //
   for ( genvar i=0; i<widp2; i++ ) begin
      if ( i < wid ) begin
         assign rsum[i] = plier[i] ? cand << i : 0;
      end else begin
         assign rsum[i] = 0;
      end
   end

   // Add partial products together.
   //
   for ( genvar i=widp2; i<2*widp2-1; i++ )
     carry_prop_adder #( 2*wid ) adder
       ( rsum[i],
         rsum[ mask & (i<<1) ],          // Left child.
         rsum[ mask & ( (i<<1) + 1 ) ]   // Right child.
         );

   assign prod = rsum[2*widp2-2];
endmodule


//////////////////////////////////////////////////////////////////////////////
/// Simple Sequential Multiplier

// This multiplier works correctly, but is unnecessarily slow.
//
// One multiplicand bit is examined per clock cycle. pos selects the
// bit and wraps around modulo 2**wlog. At the edge where pos is 0 the
// accumulated value is copied to prod and the accumulator is cleared
// before the bit-0 partial product is added.
//
// Blocking assignments are used deliberately: accum is cleared and then
// updated within the same clock edge.
//
// NOTE(review): pos wraps at 2**wlog rather than at wid, so this
// presumably expects wid to be a power of two -- confirm.
//
module mult_seq
  #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   localparam int wlog = $clog2(wid);

   logic [wlog-1:0] pos;
   logic [2*wid-1:0] accum;

   // cadence translate_off
   initial pos = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      if ( pos == 0 ) begin
         prod = accum;
         accum = 0;
      end

      if ( cand[pos] == 1 ) accum += plier << pos;

      pos++;

   end
endmodule


//////////////////////////////////////////////////////////////////////////////
/// Sequential Multiplier, Using Instantiated Adder
//
// Simple multiplier, no handshaking.

// The partial product for bit pos is added to accum by an explicitly
// instantiated carry_prop_adder. At the edge where pos is 0 the adder
// output is captured in prod and accum is cleared; at every other edge
// accum takes the adder output.
//
module mult_seq_ga
  #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk );

   localparam int wlog = $clog2(wid);

   logic [wlog-1:0] pos;
   logic [2*wid-1:0] accum;
   wire [2*wid-1:0] sum;

   // cadence translate_off
   initial begin
      pos = 0;
      accum = 0;
   end
   // cadence translate_on

   // Partial product selected by the current bit position.
   wire [2*wid-1:0] pp = cand[pos] ? plier << pos : 0;

   carry_prop_adder #(2*wid) ga( sum, accum, pp );

   always @( posedge clk ) pos <= pos + 1;

   always @( posedge clk ) begin

      if ( pos == 0 ) begin
         prod = sum;
         accum = 0;
      end else begin
         accum = sum;
      end

   end
endmodule

`include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_csa.v"

// Sequential multiplier that keeps the running sum in two registers
// fed by a CW_csa instance. The two registers are added together only
// when the product is produced, at the edge where pos == wid-1; both
// registers are cleared at that edge.
//
module mult_seq_csa
  #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   localparam int wlog = $clog2(wid);

   logic [wlog-1:0] pos;
   logic [2*wid-1:0] accum_sum_a_reg, accum_sum_b_reg;
   wire co;   // Connected to the csa co port, otherwise unused.

   // cadence translate_off
   initial begin
      pos = 0;
      accum_sum_a_reg = 0;
      accum_sum_b_reg = 0;
   end
   // cadence translate_on

   wire [2*wid-1:0] accum_sum_a, accum_sum_b;
   wire [2*wid-1:0] pp = cand[pos] ? plier << pos : 0;

   // Instantiate a carry save adder from the ChipWare library.
   //
   CW_csa #(2*wid) csa
     ( .carry(accum_sum_a), .sum(accum_sum_b), .co(co),
       .a(accum_sum_a_reg), .b(accum_sum_b_reg), .c(pp), .ci(1'b0) );

   always @( posedge clk ) pos <= pos + 1;

   always @( posedge clk ) begin

      if ( pos == wid-1 ) begin
         prod = accum_sum_a + accum_sum_b;
         accum_sum_a_reg = 0;
         accum_sum_b_reg = 0;
      end else begin
         accum_sum_a_reg = accum_sum_a;
         accum_sum_b_reg = accum_sum_b;
      end

   end
endmodule


//////////////////////////////////////////////////////////////////////////////
/// Streamlined Sequential Multiplier

/// Techniques For Lowering Cost
//
// Instead of shifting the multiplier, shift the accumulator.
// Use part of the accumulator to store the multiplicand.

// At the edge where pos == wid-1 the accumulator is copied to prod and
// reloaded with cand (in its low bits); otherwise pos is incremented.
// On every edge the accumulator is then shifted right one bit, with
// plier added into the upper half when the bit being shifted out is 1.
//
module mult_seq_stream
  #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   localparam int wlog = $clog2(wid);

   logic [wlog-1:0] pos;
   logic [2*wid-1:0] accum;

   // cadence translate_off
   initial pos = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      logic [wid:0] pp;

      if ( pos == wid - 1 ) begin
         prod = accum;
         accum = cand;
         pos = 0;
      end else begin
         pos++;
      end

      // Note: the multiplicand is in the lower bits of the accumulator.
      //
      pp = accum[0] ? { 1'b0, plier } : 0;

      // Add on the partial product and shift the accumulator.
      //
      accum = { { 1'b0, accum[2*wid-1:wid] } + pp, accum[wid-1:1] };

   end
endmodule


//////////////////////////////////////////////////////////////////////////////
/// Degree-m Sequential Multipliers

// Compute m partial products in each iteration.
//
// Will the synthesis program figure it out?

// Sequential multiplier that handles m multiplicand bits per clock
// cycle using a behavioral loop.
//
//   iterations: Number of m-bit groups needed to cover wid bits.
//
// The comparison value iterations is cast to iter's width (iter_lg
// bits). When iterations is a power of two the cast value is 0, so the
// "finished" test fires when iter has wrapped around to 0.
//
// In the final iteration pos can exceed wid-1 when m does not divide
// wid; cand[pos] is then an out-of-range select.
//
module mult_seq_m
  #( int wid = 16, int m = 2 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   localparam int iterations = ( wid + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   logic [iter_lg:1] iter;
   logic [2*wid-1:0] accum;

   // cadence translate_off
   initial iter = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      // Product finished: emit it and start over at iteration 0.
      if ( iter == iter_lg'(iterations) ) begin
         prod = accum;
         accum = 0;
         iter = 0;
      end

      // Add the m partial products belonging to this iteration.
      for ( int i=0; i<m; i++ ) begin
         int pos;
         pos = iter * m + i;
         if ( cand[pos] ) accum += plier << pos;
      end

      iter++;

   end
endmodule

// Sequential multiplier that handles m multiplicand bits per clock
// cycle by multiplying plier by an m-bit digit of cand.
//
// cand_2d views cand as an array of m-bit digits; digit iter is
// multiplied by plier and shifted into place.
//
module mult_seq_dm
  #( int wid = 16, int m = 2 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   localparam int iterations = ( wid + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   wire [iterations-1:0][m-1:0] cand_2d = cand;

   logic [iter_lg:1] iter;
   logic [2*wid-1:0] accum;

   // cadence translate_off
   initial iter = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      // Product finished: emit it and start over at iteration 0.
      if ( iter == iter_lg'(iterations) ) begin
         prod = accum;
         accum = 0;
         iter = 0;
      end

      accum += plier * cand_2d[iter] << ( iter * m );

      iter++;

   end
endmodule

// Sequential multiplier that adds pp_per_cycle partial products per
// clock cycle using a chain of CW_csa instances. The running sum is
// held in two registers which are added together only when the product
// is emitted, at the edge where iter == iterations.
//
//   accum_sum_a[i], accum_sum_b[i]:  Inputs to chain stage i;
//     element 0 comes from the registers and element pp_per_cycle is
//     written back to them.
//
module mult_seq_csa_m
  #( int wid = 16, int pp_per_cycle = 2 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   localparam int iterations = ( wid + pp_per_cycle - 1 ) / pp_per_cycle;
   localparam int iter_lg = $clog2(iterations);

   // Bits needed to hold the largest bit position that can be formed,
   // ( iterations + 1 ) * pp_per_cycle - 1, without wrapping around.
   // A narrower pos would alias positions >= wid back onto low bit
   // positions and defeat the pos < wid test below.
   localparam int pos_bits = $clog2( ( iterations + 1 ) * pp_per_cycle );

   logic [iter_lg:0] iter;

   // cadence translate_off
   initial iter = 0;
   // cadence translate_on

   wire [2*wid-1:0] accum_sum_a[0:pp_per_cycle], accum_sum_b[0:pp_per_cycle];
   logic [2*wid-1:0] accum_sum_a_reg, accum_sum_b_reg;

   assign accum_sum_a[0] = accum_sum_a_reg;
   assign accum_sum_b[0] = accum_sum_b_reg;

   for ( genvar i=0; i<pp_per_cycle; i++ ) begin

      wire [pos_bits-1:0] pos = iter * pp_per_cycle + i;
      wire co; // Unconnected.

      // Positions past the end of cand contribute nothing.
      wire [2*wid-1:0] pp = pos < wid && cand[pos] ? plier << pos : 0;

      CW_csa #(2*wid) csa
        ( .sum(accum_sum_a[i+1]), .carry(accum_sum_b[i+1]), .co(co),
          .a(accum_sum_a[i]), .b(accum_sum_b[i]), .c(pp), .ci(1'b0) );

   end

   always @( posedge clk ) begin

      if ( iter == iterations ) begin

         prod <= accum_sum_a_reg + accum_sum_b_reg;
         accum_sum_a_reg <= 0;
         accum_sum_b_reg <= 0;
         iter <= 0;

      end else begin

         prod <= prod;
         accum_sum_a_reg <= accum_sum_a[pp_per_cycle];
         accum_sum_b_reg <= accum_sum_b[pp_per_cycle];
         iter <= iter + 1;

      end

   end
endmodule

`ifdef DONT_DEFINE_ME

Module Name                            Area   Clock    Total    Init.
                                              Period   Delay    Interv
mult_pipe_2_wid16_pp_per_stage1      652540    1988    -1988     1988
mult_pipe_wid16_pp_per_stage1        747364    1717    27472     1717
mult_pipe_2_wid16_pp_per_stage2      390304    2530    -2530     2530
mult_pipe_wid16_pp_per_stage2        459860    2425    19400     2425
mult_pipe_2_wid16_pp_per_stage4      330368    2913    -2913     2913
mult_pipe_wid16_pp_per_stage4        357580    2983    11932     2983
mult_pipe_2_wid16_pp_per_stage8      256392    3515    -3515     3515
mult_pipe_wid16_pp_per_stage8        264352    3498     6996     3498
Normal exit.

`endif


//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

// cadence translate_off

// Passes the testbench clock and cycle count through a program block;
// the testbench waits on these copies rather than on the originals.
//
program reactivate
   (output wire clk_reactive, output int cycle_reactive,
    input wire clk, input int cycle);
   assign clk_reactive = clk;
   assign cycle_reactive = cycle;
endprogram

// Applies num_tests operand pairs to every registered multiplier and
// compares each result against the behavioral multiplier (prod[0]).
//
module testbench;

   localparam int wid = 16;
   localparam int num_tests = 1000;
   localparam int NUM_MULT = 20;
   localparam int err_limit = 7;

   bit use_others;
   logic [wid-1:0] plier, cand;
   logic [wid-1:0] plierp, candp;
   logic [2*wid-1:0] prod[NUM_MULT];

   // Per-multiplier bookkeeping.
   //   idx:       Index into prod[] of this multiplier's output.
   //   err_count: Number of wrong results seen.
   //   seq, pipe: Flags used to choose the expected latency.
   //   deg:       Partial products per cycle, used to compute latency.
   //   sout:      Output sampled at the expected completion cycle.
   //   cyc_tot:   Sum of cycles spent over all tests.
   //   latency:   Latency, in cycles, used for the most recent test.
   typedef struct { int idx; int err_count = 0;
                    bit seq = 0; bit pipe = 0; int deg = 1;
                    logic [2*wid-1:0] sout = 'h111;
                    int cyc_tot = 0;
                    int latency = 0;
                    } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * wid * 4;
   int cycle;
   bit done;
   logic clock;

   logic clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Clock and cycle counter. Simulation ends when the tests are done
   // or the cycle limit is reached, whichever comes first.
   initial begin
      clock = 0;
      cycle = 0;
      fork
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;
      $finish();
   end

   // Drive plierp/candp: the test operands for one cycle after a new
   // test is posted (use_others set), filler values otherwise.
   initial begin

      while ( !done ) @( posedge clk_reactive ) #1

         if ( use_others ) begin

            plierp = plier;
            candp = cand;
            use_others = 0;

         end else begin

            plierp = cycle;
            candp = 256;

         end
   end

   // Register a sequential multiplier that handles deg bits per cycle.
   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx;
      pi[m].seq = 1;
   endtask

   // Register a pipelined multiplier with deg bits per stage.
   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx;
      pi[m].seq = 1;
      pi[m].pipe = 1;
   endtask

   mult_behav_1 #(wid) mb1(prod[0], plier, cand);
   initial pi["Behavioral"].idx = 0;

   mult_linear #(wid) ms1(prod[1], plier, cand);
   initial pi["Linear"].idx = 1;

   mult_tree #(wid) ms2(prod[2], plier, cand);
   initial pi["Tree"].idx = 2;

   mult_seq #(wid) ms3(prod[3], plier, cand, clock);
   initial begin
      automatic string m = "Sequential";
      pi[m].idx = 3;
      pi[m].seq = 1;
   end

   mult_seq_ga #(wid) msga1(prod[11], plier, cand, clock);
   initial begin
      automatic string m = "Sequential GA";
      pi[m].idx = 11;
      pi[m].seq = 1;
   end

   mult_seq_stream #(wid) mss1(prod[4], plier, cand, clock);
   initial begin
      automatic string m = "Sequential Streamlined";
      pi[m].idx = 4;
      pi[m].seq = 1;
   end

   mult_seq_m #(wid,4) ms44(prod[5], plier, cand, clock);
   initial pi_seq(5,"Seq", ms44.m);

   mult_seq_m #(wid,3) ms43(prod[6], plier, cand, clock);
   initial pi_seq(6,"Seq", ms43.m);

   // Each mult_seq_dm instance is registered with its own m parameter.
   mult_seq_dm #(wid,4) msd44(prod[9], plier, cand, clock);
   initial pi_seq(9,"Seq Rad", msd44.m);

   mult_seq_dm #(wid,3) msd43(prod[10], plier, cand, clock);
   initial pi_seq(10,"Seq Rad", msd43.m);

   mult_seq_csa #(wid) mcsa(prod[14], plier, cand, clock);
   initial begin
      automatic string m = $sformatf("Mult Seq CSA");
      pi[m].idx = 14;
      pi[m].seq = 1;
   end

   mult_linear_clk #(wid) mlc1(prod[15], plier, cand, clock);
   initial begin
      automatic string m = $sformatf("Linear Clock");
      pi[m].idx = 15;
      pi[m].seq = 1;
   end

   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   //
   int tests[$] = {1,1, 1,2, 1,32, 32, 1};

   initial begin

      done = 0;
      use_others = 0;

      @( posedge clk_reactive );

      for ( int i=0; i<num_tests; i++ ) begin
         automatic int cyc_start = cycle;
         automatic int awaiting = pi.num();

         // Set multiplier and multiplicand values for non-piped units.
         //
         plier = tests.size() ? tests.pop_front() : $random();
         cand = tests.size() ? tests.pop_front() : $random();

         // Set multiplier and multiplicand values for piped units.
         //
         plierp = plier;
         candp = cand;
         use_others = 1;

         // Start one thread per multiplier; each samples its unit's
         // output at the cycle where the result is expected.
         foreach ( pi[muti] ) begin
            automatic string mut = muti; // Bug workaround?
            fork begin
               automatic int steps = ( wid + pi[mut].deg - 1 ) / pi[mut].deg;
               automatic int latency
                 = !pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps;
               automatic int eta = 1 + cyc_start + latency;
               pi[mut].latency = latency;
               wait ( cycle_reactive == eta );
               awaiting--;
               pi[mut].sout = prod[pi[mut].idx];
               pi[mut].cyc_tot += cycle - cyc_start;
            end join_none;
         end

         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] )
           if ( prod[0] !== pi[mut].sout ) begin

              pi[mut].err_count++;

              if ( pi[mut].err_count < 5 )
                $write
                  ("%-25s wrong result: %0d * %0d: 0x%0h != 0x%0h (correct)\n",
                   mut, plier, cand, pi[mut].sout, prod[0]);
           end

         @( posedge clk_reactive );

      end

      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on