//////////////////////////////////////////////////////////////////////////////// // /// LSU EE 4755 Fall 2017 Homework 7 -- SOLUTION // /// Assignment http://www.ece.lsu.edu/koppel/v/2017/hw07.pdf ////////////////////////////////////////////////////////////////////////////// /// Problem 1 // /// Complete so that mult_fast sets out_avail as described in the handout. // // [✔] The module must be synthesizable. // [✔] Code must be reasonably efficient. // [✔] Do not change module parameters. // [✔] Do not change ports, EXCEPT changing between var and net kinds. // [✔] Don't assume that parameter values will match those used here. // [✔] USE DEBUGGING TOOLS LIKE SimVision. // [✔] Make sure that Avg cyc shown in testbench for Fast is lower // than Pipelined module of same degree (when all tests pass). /// SOLUTION - Problem 1a // // A new pipeline latch will be created, pl_occ, which will carry the // value of in_valid through the pipeline. The pl_occ register for the // last stage will connect to the output port out_avail. // The timing diagram below is for a w=12, m=4 multiplier in which the // multiplier is always set to 2 (not shown). Three different // multiplicands arrive, 'h123, 'h4, and 'h56, their products, 2 * // 'h123 = 'h246, 2 * 'h4 = 'h8, and 2 * 'h56 = 'hac, appear at the // outputs three cycles later. The progress of 'h123 through the // pipeline is highlighted in blue. // // The timing diagram does not show the plier and cand inputs. As // stated above, the plier is always 2 (for the sample execution shown // in the diagram). The value of input cand appearing at a positive // edge can be seen in pl_cand[0] just after the positive edge. For // example, at the positive edge between cycle 0 and 1 input cand must // be 'h123. // /// SOLUTION -- Problem 1a // module mult_fast_1a #( int w = 16, int m = 4 ) ( output uwire [2*w-1:0] prod, output uwire out_avail, input uwire clk, in_valid, input uwire [w-1:0] plier, cand ); localparam int nstages = ( w + m - 1 ) / m; logic [2*w-1:0] pl_accum[0:nstages]; logic [w-1:0] pl_plier[0:nstages]; logic [w-1:0] pl_cand[0:nstages]; assign prod = pl_accum[nstages]; /// SOLUTION -- Problem 1a // logic pl_occ[0:nstages]; assign out_avail = pl_occ[nstages]; // // Provide a pipeline latch for the in_valid signal to pass through // the pipeline and connect the last stage's latch to the out_avail // port. always_ff @( posedge clk ) begin /// SOLUTION -- Problem 1a // pl_occ[0] = in_valid; // // Connect in_valid port to the first stage. pl_accum[0] = 0; pl_plier[0] = plier; pl_cand[0] = cand; for ( int stage=0; stage<nstages; stage++ ) begin pl_accum[stage+1] <= pl_accum[stage] + ( pl_plier[stage] * pl_cand[stage][m-1:0] << stage*m ); pl_cand[stage+1] <= pl_cand[stage] >> m; pl_plier[stage+1] <= pl_plier[stage]; /// SOLUTION -- Problem 1a // pl_occ[stage+1] <= pl_occ[stage]; // // Pass the in_valid signal through the pipeline. end end endmodule /// SOLUTION -- Problem 1b (and also 1a) // // The fast multiplier is supposed to provide a shortcut connection // from each stage to the multiplier output, prod. Stage x can use the // shortcut connection if the multiplication that the stage is // carrying is complete and if no higher-numbered stages are occupied. // A multiplication (meaning the result of multiplying a plier and // cand) must appear at the output exactly once, and the arriving // multiplications must appear at the outputs in the same order in // which they arrived. /// The Plan // // - Find the highest-numbered occupied stage. Call it oldest_idx. // - Connect the result at that stage, pl_accum[oldest_idx], to prod. // - Set out_avail to true if stage oldest_idx is finished. // - Set pl_occ[oldest_idx] to zero if out_avail is true, to avoid duplicates. // // For the discussion below refer to module mult_fast_1b and to the // timing diagram below. The timing diagram is for a module // instantiated with w=12, m=4, and for which the multiplier is always // 2. The arriving values in the timing diagram below are the same as // the diagram appearing in the solution to Problem 1a. /// Computing oldest_idx // // Combinational logic will be added that computes oldest_idx, the // highest-numbered occupied stage. (Stage x is occupied if pl_occ[x] // is true.) If none of the stages are occupied oldest_idx is set to // zero. // // See the always_comb block in mult_fast_1b below. /// Connect Stage's Result to Prod // // Output prod is set to pl_accum[oldest_idx], see the assign in // mult_fast_1b. The connection is made whether or not stage // oldest_idx is finished or contains a valid value. // In the timing diagram notice that at cycles 2 and 3 prod holds // incomplete multiplications. That's fine because out_avail is zero. // Setting prod to something like zero at cycles like 2 and 3 would // require extra hardware and provide no benefit (based on the problem // statement in the homework handout). /// Set out_avail // // Output out_avail is set to 1 if stage oldest_idx is occupied and if // the multiplicand at that stage is zero, meaning that the // multiplication is complete. Whether it is occupied can be // determined by examining pl_occ[oldest_idx], whether it is finished // can be determined by examining pl_cand[oldest_idx]. See the "assign // out_avail" in mult_fast_1b. /// Set pl_occ[oldest_idx] When Done // // Since the result of a multiplication cannot appear at the output // more than once pl_occ[x] must be set to zero at the end of cycle c // if x was chosen in cycle c. (If this were not done the same // multiplication would appear at the outputs again in the next clock // cycle [c+1], when it is in stage x+1.) For example, stage 2 is // chosen in cycle 5, its product 'h8 appears at the output. In cycle // 6 the calculation is in stage 3, but now its value of pl_occ is // zero and so it won't be chosen a second time. // module mult_fast_1b #( int w = 16, int m = 4 ) ( output uwire [2*w-1:0] prod, output uwire out_avail, input uwire clk, in_valid, input uwire [w-1:0] plier, cand ); localparam int nstages = ( w + m - 1 ) / m; logic [2*w-1:0] pl_accum[0:nstages]; logic [w-1:0] pl_plier[0:nstages], pl_cand[0:nstages]; logic pl_occ[0:nstages]; /// SOLUTION -- Problem 1b // // Determine the idx of the last (highest-numbered) occupied // stage, in which resides the oldest multiplication in the // pipeline. // logic [$clog2(nstages):0] oldest_idx; // always_comb begin oldest_idx = 0; for ( int i=1; i<=nstages; i++ ) if ( pl_occ[i] ) oldest_idx = i; end /// SOLUTION -- Problem 1b // // Connect the last occupied stage to the output .. // assign prod = pl_accum[oldest_idx]; // // .. and set out_avail to true if that stage is occupied and finished. // assign out_avail = pl_occ[oldest_idx] && pl_cand[oldest_idx] == 0; always_ff @( posedge clk ) begin pl_occ[0] = in_valid; pl_accum[0] = 0; pl_plier[0] = plier; pl_cand[0] = cand; for ( int stage=0; stage<nstages; stage++ ) begin pl_accum[stage+1] <= pl_accum[stage] + ( pl_plier[stage] * pl_cand[stage][m-1:0] << stage*m ); pl_cand[stage+1] <= pl_cand[stage][w-1:m]; pl_plier[stage+1] <= pl_plier[stage]; /// SOLUTION -- Problem 1b // // Pass 0 to next stage if this stage is providing the // result, otherwise pass this stage's value of occupied to // the next stage. // pl_occ[stage+1] <= oldest_idx == stage && out_avail ? 0 : pl_occ[stage]; end end endmodule module mult_behav_1 #(int w = 16) (output logic [2*w-1:0] prod, input logic [w-1:0] plier, cand); assign prod = plier * cand; endmodule /// :Example: Basic Pipelined Multiplier -- mult_pipe // // Computes m partial products per stage. // module mult_pipe #( int w = 16, int m = 4 ) ( output logic [2*w-1:0] prod, input logic [w-1:0] plier, input logic [w-1:0] cand, input clk); localparam int nstages = ( w + m - 1 ) / m; // Note: pl is for pipeline latch. logic [2*w-1:0] pl_accum[0:nstages]; logic [w-1:0] pl_plier[0:nstages]; logic [w-1:0] pl_cand[0:nstages]; always_ff @( posedge clk ) begin pl_accum[0] = 0; pl_plier[0] = plier; pl_cand[0] = cand; for ( int stage=0; stage<nstages; stage++ ) begin logic [w-1:0] cand_next; cand_next = pl_cand[stage][w-1:m]; pl_accum[stage+1] <= pl_accum[stage] + ( pl_plier[stage] * pl_cand[stage][m-1:0] << stage*m ); pl_cand[stage+1] <= cand_next; pl_plier[stage+1] <= pl_plier[stage]; end end assign prod = pl_accum[nstages]; endmodule ////////////////////////////////////////////////////////////////////////////// /// Testbench Code // cadence translate_off program reactivate (output uwire clk_reactive, output int cycle_reactive, input uwire clk, input int cycle); assign clk_reactive = clk; assign cycle_reactive = cycle; endprogram module testbench; localparam int w = 16; localparam int num_tests = 400; localparam int NUM_MULT = 20; localparam int err_limit = 7; bit use_others; logic [w-1:0] plier, cand; logic [w-1:0] plierp[NUM_MULT], candp[NUM_MULT]; logic [2*w-1:0] prod[NUM_MULT]; uwire availn[NUM_MULT]; logic avail[NUM_MULT]; logic in_valid[NUM_MULT]; typedef struct { int tidx; int cycle_start; } Test_Vector; typedef struct { int idx; int err_count = 0; int err_timing = 0; Test_Vector tests_active[$]; bit all_tests_started = 0; bit seq = 0; bit pipe = 0; bit bpipe = 0; int deg = 1; int ncompleted = 0; int cyc_tot = 0; int latency = 0; } Info; Info pi[string]; localparam int cycle_limit = num_tests * w * 4; int cycle; bit done; logic clock; logic clk_reactive; int cycle_reactive; reactivate ra(clk_reactive,cycle_reactive,clock,cycle); initial begin clock = 0; cycle = 0; fork forever #10 cycle += clock++; wait( done ); wait( cycle >= cycle_limit ) $write("*** Cycle limit exceeded, ending.\n"); join_any; $finish(); end task pi_seq(input int idx, input string name, input int deg); automatic string m = $sformatf("%s Deg %0d", name, deg); pi[m].deg = deg; pi[m].idx = idx; pi[m].seq = 1; pi[m].bpipe = 0; endtask task pi_pipe(input int idx, input string name, input int deg); automatic string m = $sformatf("%s Deg %0d", name, deg); pi[m].deg = deg; pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 0; endtask task pi_bpipe(input int idx, input string name, input int deg); automatic string m = $sformatf("%s Deg %0d", name, deg); pi[m].deg = deg; pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].bpipe = 1; endtask mult_behav_1 #(w) mb1(prod[0], plierp[0], candp[0]); initial pi["Behavioral"].idx = 0; mult_pipe #(w,4) ms2(prod[2], plierp[2], candp[2], clock); initial pi_pipe(2,"Pipelined",ms2.m); mult_pipe #(w,2) ms3(prod[3], plierp[3], candp[3], clock); initial pi_pipe(3,"Pipelined",ms3.m); mult_fast_1a #(w,4) ms7(prod[7], availn[7], clock, in_valid[7], plierp[7], candp[7]); initial pi_bpipe(7,"Fast 1a",ms7.m); mult_fast_1a #(w,2) ms8(prod[8], availn[8], clock, in_valid[8], plierp[8], candp[8]); initial pi_bpipe(8,"Fast 1a",ms8.m); mult_fast_1a #(w,1) ms9(prod[9], availn[9], clock, in_valid[9], plierp[9], candp[9]); initial pi_bpipe(9,"Fast 1a",ms9.m); mult_fast_1b #(w,4) ms17(prod[17], availn[17], clock, in_valid[17], plierp[17], candp[17]); initial pi_bpipe(17,"Fast 1b",ms17.m); mult_fast_1b #(w,2) ms16(prod[16], availn[16], clock, in_valid[16], plierp[16], candp[16]); initial pi_bpipe(16,"Fast 1b",ms16.m); mult_fast_1b #(w,1) ms15(prod[15], availn[15], clock, in_valid[15], plierp[15], candp[15]); initial pi_bpipe(15,"Fast 1b",ms15.m); always @* foreach ( availn[i] ) if ( availn[i] !== 1'bz ) avail[i] = availn[i]; // Array of multiplier/multiplicand values to try out. // After these values are used a random number generator will be used. // int tests[$] = {1,1, 1,2, 1,3, 1,4, 1,5, 1,32, 32, 1}; initial begin automatic int awaiting = pi.size(); logic [w-1:0] pliers[num_tests], cands[num_tests]; done = 0; foreach ( pi[mut] ) begin automatic int midx = pi[mut].idx; automatic int steps = ( w + pi[mut].deg - 1 ) / pi[mut].deg; automatic int latency = !pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps; pi[mut].latency = latency; if ( pi[mut].bpipe == 0 ) begin avail[midx] = 1; end in_valid[midx] = 0; end for ( int i=0; i<num_tests; i++ ) begin automatic int num_bits_c = {$random()}%w + 1; automatic logic [w-1:0] mask_c = ( (w+1)'(1) << num_bits_c ) - 1; automatic int num_bits_p = {$random()}%w + 1; automatic logic [w-1:0] mask_p = ( (w+1)'(1) << num_bits_p ) - 1; pliers[i] = tests.size() ? tests.pop_front() : {$random()}&mask_p; cands[i] = tests.size() ? tests.pop_front() : {$random()}&mask_c; end fork forever @( negedge clk_reactive ) foreach ( pi[mut] ) begin automatic int midx = pi[mut].idx; if ( !in_valid[midx] && pi[mut].pipe ) begin plierp[midx] = cycle; candp[midx] = 1; end end join_none; repeat ( 2 * w ) @( negedge clock ); foreach ( pi[mutii] ) begin automatic string muti = mutii; fork begin automatic string mut = muti; automatic int midx = pi[mut].idx; for ( int i=0; i<num_tests; i++ ) begin automatic int gap_cyc = ( {$random} % 2 ) ? {$random} % ( w + 2 ) : 0; automatic Test_Vector tv; repeat ( gap_cyc ) @( negedge clock ); plierp[midx] = pliers[i]; candp[midx] = cands[i]; in_valid[midx] = 1; tv.tidx = i; tv.cycle_start = cycle; pi[mut].tests_active.push_back( tv ); @( negedge clock ); in_valid[midx] = 0; end pi[mut].all_tests_started = 1; end join_none; fork begin automatic string mut = muti; automatic int midx = pi[mut].idx; while ( 1 ) begin @( negedge clock ); while ( pi[mut].tests_active.size() == 0 && !pi[mut].all_tests_started ) @( negedge clock ); if ( pi[mut].tests_active.size() == 0 ) break; begin automatic Test_Vector tv = pi[mut].tests_active.pop_front(); automatic int i = tv.tidx; automatic logic [2*w-1:0] shadow_prod = pliers[i] * cands[i]; automatic int eta = tv.cycle_start + pi[mut].latency; automatic bit timing_err = 0; automatic int delta_t; if ( pi[mut].bpipe ) begin while ( !avail[midx] && cycle < eta ) @( negedge clock ); if ( !avail[midx] || cycle > eta ) begin timing_err = 1; if ( pi[mut].err_timing++ < err_limit ) $write("At cyc %4d (eta %0d) avail not set for %s (idx %0d) after %0d cycles for 0x%0h*0x%0h.\n", cycle, eta, mut, midx, cycle - tv.cycle_start, pliers[i], cands[i]); end end else begin wait ( cycle >= eta ); end delta_t = cycle - tv.cycle_start; if ( !timing_err ) begin pi[mut].ncompleted++; pi[mut].cyc_tot += delta_t; end if ( !timing_err && shadow_prod !== prod[midx] ) begin pi[mut].err_count++; if ( pi[mut].err_count < err_limit ) begin $write ("%-15s test %5d cyc %0d+%0d (%0d) wrong: 0x%0h * 0x%0h: 0x%0h != 0x%0h (correct)\n", mut, i, tv.cycle_start, delta_t, pi[mut].latency, pliers[i], cands[i], prod[midx], shadow_prod); end end end end awaiting--; end join_none; end wait( awaiting == 0 || cycle > cycle_limit ); $write("At cycle %0d. Error types: couldn't test / wrong result / timing\n",cycle); foreach ( pi[ mut ] ) $write("For %-18s ran %4d tests, %4d/%4d/%4d errors found. Avg cyc %.1f\n", mut, num_tests, num_tests - pi[mut].ncompleted, pi[mut].err_count, pi[mut].err_timing, pi[mut].seq ? real'(pi[mut].cyc_tot) / pi[mut].ncompleted : 1); done = 1; $finish(2); end endmodule // cadence translate_on