/// EE 4755 - Digital Design Using HDLs
//
// Classroom demo code.

////////////////////////////////////////////////////////////////////////////////
/// Binary Multiplication Algorithm

/// Long Hand Procedure Review
//
//  Multiply 5 times 12 in binary:
//
//     0101  cand  -- Multiplicand
//     1100  plier -- Multiplier
//     """"
//     0000  Partial Product
//    0000
//   0101
//  0101
//  """""""
//  0111100  prod  -- Product


//////////////////////////////////////////////////////////////////////////////
/// Behavioral Multiplier

// Combinational multiplier described with the * operator.
//
//   w:      Width of each operand in bits.
//   prod:   2w-bit product of plier and cand.
//
module mult_behav_1
  #(int w = 16)
   (output logic[2*w-1:0] prod, input logic[w-1:0] plier, cand);

   assign prod = plier * cand;

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Linear Multiplier

/// Simple Adder, Don't Modify
module carry_prop_adder
  #(int w=16)(output [w:1] s, input [w:1] a,b);
   assign s = a + b;
endmodule

// Combinational multiplier built from a chain of w adders.
//
// rsum[i] holds the sum of partial products 0 through i; rsum[-1] is the
// zero that starts the chain.
//
module mult_linear
  #(int w = 16)
   (output logic[2*w-1:0] prod, input logic[w-1:0] plier, cand);

   logic [2*w-1:0] rsum [w-1:-1];

   assign rsum[-1] = 0;

   for ( genvar i=0; i<w; i++ ) begin

      // Partial product for multiplier bit i.
      uwire [2*w-1:0] pprod = plier[i] ? cand << i : 0;

      carry_prop_adder #(2*w) adder(rsum[i], rsum[i-1], pprod );

   end

   assign prod = rsum[w-1];

endmodule

// mult_linear with its output captured in a register on each positive
// clock edge.
//
// NOTE(review): pliercpy and candcpy are written every clock but never
// read in this module; the multiplier is fed directly from the input
// ports. Confirm whether they were meant to drive the multiplier.
//
module mult_linear_clk
  #(int w = 16)
   (output logic[2*w-1:0] prod,
    input logic[w-1:0] plier, cand, input clk);

   uwire [2*w-1:0] p;
   logic [w-1:0] pliercpy, candcpy;

   mult_linear #(w) ml(p, plier, cand);

   always_ff @( posedge clk ) begin
      pliercpy <= plier;
      candcpy <= cand;
      prod <= p;
   end

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Tree Multiplier

// Combinational multiplier that sums the partial products with a
// binary tree of adders.
//
// rsum is indexed like an array-stored binary tree: the leaves (partial
// products) occupy elements 0 to widp2-1, and internal node i adds the
// two elements at ( mask & 2i ) and ( mask & (2i+1) ).  The root is
// element 2*widp2-2.
//
// The number of leaves, widp2, is w rounded up to a power of two so that
// the index mask is a true bit mask for any w.  Leaves at positions w
// and above are tied to zero.  When w is already a power of two this is
// the same tree as one sized directly from w.
//
module mult_tree
  #(int w = 16)
   (output logic[2*w-1:0] prod, input logic[w-1:0] plier, cand);

   localparam int widp2 = 1 << $clog2(w);
   localparam int mask = 2*widp2-1;

   logic [2*w-1:0] rsum [2*widp2-1:0];

   // Compute partial products.
   //
   for ( genvar i=0; i<widp2; i++ ) begin
      if ( i < w ) begin
         assign rsum[i] = plier[i] ? cand << i : 0;
      end else begin
         // Padding leaf: there is no multiplier bit at this position.
         assign rsum[i] = 0;
      end
   end

   // Add partial products together.
   //
   for ( genvar i=widp2; i<2*widp2-1; i++ )
     carry_prop_adder #( 2*w ) adder
       ( rsum[i],
         rsum[ mask & (i<<1) ],           // Left child.
         rsum[ mask & ( (i<<1) + 1 ) ]    // Right child.
         );

   assign prod = rsum[2*widp2-2];

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Simple Sequential Multiplier

// Free-running sequential multiplier, one multiplicand bit per clock.
//
// pos counts up on every positive clock edge.  When pos is 0 the value
// accumulated so far is copied to prod and the accumulator is cleared
// before the partial product for bit 0 is added.  There are no start or
// done signals.
//
// Only pos is given an initial value, so the first value copied to prod
// comes from an uninitialized accumulator.
//
module mult_seq
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int wlog = $clog2(w);

   logic [wlog-1:0] pos;
   logic [2*w-1:0] accum;

   // cadence translate_off
   initial pos = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      if ( pos == 0 ) begin

         prod = accum;
         accum = 0;

      end

      if ( cand[pos] == 1 ) accum += plier << pos;

      pos++;

   end

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Sequential Multiplier, Using Instantiated Adder
//
// Simple multiplier, no handshaking.

// The adder output, sum, is always accum plus the partial product for
// the current bit position.  On the clock edge where pos is 0, sum is
// copied to prod and accum is cleared; on every other edge accum takes
// sum.  The partial product for position 0 is therefore the last one
// added before prod is written.
//
module mult_seq_ga
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk );

   localparam int wlog = $clog2(w);

   logic [wlog-1:0] pos;
   logic [2*w-1:0] accum;
   uwire [2*w-1:0] sum;

   // cadence translate_off
   initial begin pos = 0; accum = 0; end
   // cadence translate_on

   // Partial product for the current bit position.
   uwire [2*w-1:0] pp = cand[pos] ? plier << pos : 0;

   carry_prop_adder #(2*w) ga( sum, accum, pp );

   always @( posedge clk ) pos <= pos + 1;

   always @( posedge clk ) begin

      if ( pos == 0 ) begin

         prod = sum;
         accum = 0;

      end else begin

         accum = sum;

      end

   end

endmodule

`include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_csa.v"

// Sequential multiplier that keeps the running total as two words
// produced by a carry save adder.
//
// Each clock the CW_csa instance combines the two stored words with the
// partial product for bit position pos.  On the clock edge where pos is
// w-1 the two adder outputs are added together to form prod and the
// stored words are cleared.
//
module mult_seq_csa
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int wlog = $clog2(w);

   logic [wlog-1:0] pos;
   logic [2*w-1:0] accum_sum_a_reg, accum_sum_b_reg;
   uwire co;  // Connected to the adder's co port, otherwise unused.

   // cadence translate_off
   initial begin pos = 0; accum_sum_a_reg = 0; accum_sum_b_reg = 0; end
   // cadence translate_on

   uwire [2*w-1:0] accum_sum_a, accum_sum_b;
   uwire [2*w-1:0] pp = cand[pos] ? plier << pos : 0;

   // Instantiate a carry save adder from the ChipWare library.
   //
   CW_csa #(2*w) csa
     ( .carry(accum_sum_a), .sum(accum_sum_b), .co(co),
       .a(accum_sum_a_reg), .b(accum_sum_b_reg), .c(pp), .ci(1'b0) );

   always @( posedge clk ) pos <= pos + 1;

   always @( posedge clk ) begin

      if ( pos == w-1 ) begin

         prod = accum_sum_a + accum_sum_b;
         accum_sum_a_reg = 0;
         accum_sum_b_reg = 0;

      end else begin

         accum_sum_a_reg = accum_sum_a;
         accum_sum_b_reg = accum_sum_b;

      end

   end

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Streamlined Sequential Multiplier

/// Techniques For Lowering Cost
//
// Instead of shifting the multiplier, shift the accumulator.
// Use part of the accumulator to store the multiplicand.

// pos counts down.  When it is 0 the accumulator is copied to prod,
// reloaded with cand in its low bits, and pos is set to w-1.  Every
// clock, including the reload clock, performs one add-and-shift step,
// so w steps are performed between reloads.
//
module mult_seq_stream
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int wlog = $clog2(w);

   logic [wlog-1:0] pos;
   logic [2*w-1:0] accum;

   // cadence translate_off
   initial pos = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      // One bit wider than plier to match the width of the sum below.
      logic [w:0] pp;

      if ( pos == 0 ) begin

         prod = accum;
         accum = cand;
         pos = w - 1;

      end else begin

         pos--;

      end

      // Note: the multiplicand is in the lower bits of the accumulator.
      //
      pp = accum[0] ? { 1'b0, plier } : 0;

      // Add on the partial product and shift the accumulator.
      //
      accum = { { 1'b0, accum[2*w-1:w] } + pp, accum[w-1:1] };

   end

endmodule

//
//////////////////////////////////////////////////////////////////////////////
/// Degree-m Sequential Multipliers

// Compute m partial products in each iteration.
//
// Will the synthesis program figure it out?

// Adds up to m shifted copies of plier to the accumulator on each clock.
//
//   w:  Operand width in bits.
//   m:  Number of multiplicand bits examined per clock.
//
// The comparison value iter_lg'(iterations) is iterations truncated to
// the width of iter, so when iterations is a power of two the comparison
// is against zero and fires as iter wraps around.
//
// NOTE(review): when m does not divide w, pos exceeds w-1 on the last
// iteration and cand[pos] is an out-of-range select; there is no
// explicit guard here.
//
module mult_seq_m
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   logic [iter_lg:1] iter;
   logic [2*w-1:0] accum;

   // cadence translate_off
   initial iter = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      if ( iter == iter_lg'(iterations) ) begin

         prod = accum;
         accum = 0;
         iter = 0;

      end

      for ( int i=0; i<m; i++ ) begin

         int pos;
         pos = iter * m + i;

         if ( cand[pos] ) accum += plier << pos;

      end

      iter++;

   end

endmodule

// Like mult_seq_m, but each clock multiplies plier by one m-bit group of
// cand using the * operator rather than testing the bits one at a time.
//
// cand_2d presents cand as `iterations` groups of m bits each; group
// iter is weighted by a left shift of iter * m.
//
module mult_seq_dm
  #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   uwire [iterations-1:0][m-1:0] cand_2d = cand;

   logic [iter_lg:1] iter;
   logic [2*w-1:0] accum;

   // cadence translate_off
   initial iter = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      if ( iter == iter_lg'(iterations) ) begin

         prod = accum;
         accum = 0;
         iter = 0;

      end

      accum += plier * cand_2d[iter] << ( iter * m );

      iter++;

   end

endmodule

// Degree-m sequential multiplier using a chain of m carry save adders.
//
//   w:  Operand width in bits.
//   m:  Number of partial products added per clock.
//
// The two stored words enter the chain at index 0 and each CW_csa stage
// adds one more partial product; the words leaving stage m are stored on
// the next clock edge.  iter counts from 0 to `iterations`.  On the
// clock edge where iter equals `iterations`, prod is written with the
// sum of the two stored words, the words are cleared, and iter returns
// to 0.
//
module mult_seq_csa_m
  #( int w = 16,
     int m = 2   // Number of partial products per cycle.
     )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] plier,
     input logic [w-1:0] cand,
     input clk);

   localparam int iterations = ( w + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   // Bits needed to hold the largest value of iter * m + i, which is
   // ( iterations + 1 ) * m - 1.  A narrower pos would wrap positions at
   // or above w back to low bit positions, defeating the pos < w check
   // below whenever m does not divide w.
   //
   localparam int pos_wid = $clog2( ( iterations + 1 ) * m );

   logic [iter_lg:0] iter;

   // cadence translate_off
   initial iter = 0;
   // cadence translate_on

   uwire [2*w-1:0] accum_sum_a[0:m], accum_sum_b[0:m];
   logic [2*w-1:0] accum_sum_a_reg, accum_sum_b_reg;

   assign accum_sum_a[0] = accum_sum_a_reg;
   assign accum_sum_b[0] = accum_sum_b_reg;

   for ( genvar i=0; i<m; i++ ) begin

      // Bit position handled by stage i in the current iteration.
      uwire [pos_wid:1] pos = iter * m + i;
      uwire co;  // Unconnected.

      // Positions at or beyond w contribute nothing.
      uwire [2*w-1:0] pp = pos < w && cand[pos] ? plier << pos : 0;

      CW_csa #(2*w) csa
        ( .sum(accum_sum_a[i+1]), .carry(accum_sum_b[i+1]), .co(co),
          .a(accum_sum_a[i]), .b(accum_sum_b[i]), .c(pp), .ci(1'b0) );

   end

   always @( posedge clk ) begin

      if ( iter == iterations ) begin

         prod <= accum_sum_a_reg + accum_sum_b_reg;
         accum_sum_a_reg <= 0;
         accum_sum_b_reg <= 0;
         iter <= 0;

      end else begin

         prod <= prod;
         accum_sum_a_reg <= accum_sum_a[m];
         accum_sum_b_reg <= accum_sum_b[m];
         iter <= iter + 1;

      end

   end

endmodule

// Inferred Hardware for m = 2:
//

// Optimization Plan for m = 2:
//

`ifdef DONT_DEFINE_ME

Module Name                             Area   Clock    Total    Init.
                                              Period    Delay   Interv
mult_pipe_2_wid16_pp_per_stage1       652540    1988    -1988     1988
mult_pipe_wid16_pp_per_stage1         747364    1717    27472     1717
mult_pipe_2_wid16_pp_per_stage2       390304    2530    -2530     2530
mult_pipe_wid16_pp_per_stage2         459860    2425    19400     2425
mult_pipe_2_wid16_pp_per_stage4       330368    2913    -2913     2913
mult_pipe_wid16_pp_per_stage4         357580    2983    11932     2983
mult_pipe_2_wid16_pp_per_stage8       256392    3515    -3515     3515
mult_pipe_wid16_pp_per_stage8         264352    3498     6996     3498
Normal exit.
`endif


//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

// cadence translate_off

// Passes the testbench clock and cycle count through a program block.
// The testbench waits on these copies rather than on the originals.
//
program reactivate
   (output uwire clk_reactive, output int cycle_reactive,
    input uwire clk, input int cycle);
   assign clk_reactive = clk;
   assign cycle_reactive = cycle;
endprogram

// Applies num_tests operand pairs to every registered multiplier and
// compares each result against the behavioral multiplier's output,
// prod[0].
//
module testbench;

   localparam int w = 16;
   localparam int num_tests = 1000;
   localparam int NUM_MULT = 20;    // Number of elements in prod.
   // NOTE(review): err_limit is declared but not used; the cap on
   // printed error messages below is a literal 5.
   localparam int err_limit = 7;

   bit use_others;
   logic [w-1:0] plier, cand;
   logic [w-1:0] plierp, candp;
   logic [2*w-1:0] prod[NUM_MULT];

   // Per-multiplier bookkeeping, keyed by display name in pi.
   //
   typedef struct
     {
       int idx;                       // Index of this unit's output in prod.
       int err_count = 0;             // Number of wrong results seen.
       bit seq = 0;                   // Set for clocked units.
       bit pipe = 0;                  // Set by pi_pipe.
       int deg = 1;                   // Partial products per clock.
       logic [2*w-1:0] sout = 'h111;  // Output sampled for the current test.
       int cyc_tot = 0;               // Sum of cycles waited over all tests.
       int latency = 0;               // Cycles waited before sampling.
     } Info;

   Info pi[string];

   // Simulation is ended if cycle reaches this value.
   localparam int cycle_limit = num_tests * w * 4;

   int cycle;
   bit done;
   logic clock;
   logic clk_reactive;
   int cycle_reactive;

   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Generate the clock and count cycles.  Ends the simulation when the
   // tests finish or the cycle limit is reached, whichever is first.
   //
   initial begin
      clock = 0;
      cycle = 0;
      fork
         // clock toggles every 10 time units; cycle is incremented by
         // the value clock had before the toggle.
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;
      $finish();
   end

   // Drive plierp and candp.  The clock after a new test starts they
   // carry that test's operands; on other clocks they carry the cycle
   // number and 256.
   //
   initial begin
      while ( !done ) @( posedge clk_reactive ) #1
        if ( use_others ) begin

           plierp = plier;
           candp = cand;
           use_others = 0;

        end else begin

           plierp = cycle;
           candp = 256;

        end
   end

   // Register a sequential multiplier of degree deg whose output is
   // prod[idx].  The display name is formed from name and deg.
   //
   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx;
      pi[m].seq = 1;
   endtask

   // Same as pi_seq but also sets the pipe flag.
   //
   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx;
      pi[m].seq = 1;
      pi[m].pipe = 1;
   endtask

   /// Modules Under Test
   //
   // Each instance is followed by the code that registers it in pi.

   mult_behav_1 #(w) mb1(prod[0], plier, cand);
   initial pi["Behavioral"].idx = 0;

   mult_linear #(w) ms1(prod[1], plier, cand);
   initial pi["Linear"].idx = 1;

   mult_tree #(w) ms2(prod[2], plier, cand);
   initial pi["Tree"].idx = 2;

   mult_seq #(w) ms3(prod[3], plier, cand, clock);
   initial begin
      automatic string m = "Sequential";
      pi[m].idx = 3;
      pi[m].seq = 1;
   end

   mult_seq_ga #(w) msga1(prod[11], plier, cand, clock);
   initial begin
      automatic string m = "Sequential GA";
      pi[m].idx = 11;
      pi[m].seq = 1;
   end

   mult_seq_stream #(w) mss1(prod[4], plier, cand, clock);
   initial begin
      automatic string m = "Sequential Streamlined";
      pi[m].idx = 4;
      pi[m].seq = 1;
   end

   mult_seq_m #(w,4) ms44(prod[5], plier, cand, clock);
   initial pi_seq(5,"Seq", ms44.m);

   mult_seq_m #(w,3) ms43(prod[6], plier, cand, clock);
   initial pi_seq(6,"Seq", ms43.m);

   // The degree is read from the instance being registered so that the
   // name and latency stay correct if its parameter is changed.
   mult_seq_dm #(w,4) msd44(prod[9], plier, cand, clock);
   initial pi_seq(9,"Seq Rad", msd44.m);

   mult_seq_dm #(w,3) msd43(prod[10], plier, cand, clock);
   initial pi_seq(10,"Seq Rad", msd43.m);

   mult_seq_csa #(w) mcsa(prod[14], plier, cand, clock);
   initial begin
      automatic string m = $sformatf("Mult Seq CSA");
      pi[m].idx = 14;
      pi[m].seq = 1;
   end

   mult_linear_clk #(w) mlc1(prod[15], plier, cand, clock);
   initial begin
      automatic string m = $sformatf("Linear Clock");
      pi[m].idx = 15;
      pi[m].seq = 1;
   end

   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   //
   int tests[$] = {1,1, 1,2, 1,32, 32, 1};

   initial begin

      done = 0;
      use_others = 0;

      @( posedge clk_reactive );

      for ( int i=0; i<num_tests; i++ ) begin
         automatic int cyc_start = cycle;
         // Number of units whose output has not yet been sampled.
         automatic int awaiting = pi.num();

         // Set multiplier and multiplicand values for non-piped units.
         //
         plier = tests.size() ? tests.pop_front() : $random();
         cand = tests.size() ? tests.pop_front() : $random();

         // Set multiplier and multiplicand values for piped units.
         //
         plierp = plier;
         candp = cand;
         use_others = 1;

         // Start one thread per unit.  Each waits for that unit's
         // latency and then samples its output.
         //
         foreach ( pi[muti] ) begin
            automatic string mut = muti; // Bug workaround?

            fork begin
               // Clocks needed to examine every multiplicand bit at
               // this unit's degree.
               automatic int steps = ( w + pi[mut].deg - 1 ) / pi[mut].deg;
               // Combinational units wait one cycle, non-pipelined
               // sequential units wait 2 * steps, pipelined units wait
               // steps.
               automatic int latency =
                 !pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps;
               automatic int eta = 1 + cyc_start + latency;
               pi[mut].latency = latency;
               wait ( cycle_reactive == eta );
               awaiting--;
               pi[mut].sout = prod[pi[mut].idx];
               pi[mut].cyc_tot += cycle - cyc_start;
            end join_none;
         end

         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] )
           if ( prod[0] !== pi[mut].sout ) begin
              pi[mut].err_count++;
              if ( pi[mut].err_count < 5 )
                $write
                  ("%-25s wrong result: %0d * %0d: 0x%0h != 0x%0h (correct)\n",
                   mut, plier, cand, pi[mut].sout, prod[0]);
           end

         @( posedge clk_reactive );

      end

      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on