////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755
//
/// Add-Accumulate Sample Problem and SOLUTION
//
 // Based on 2019 Homework 6
 // https://www.ece.lsu.edu/koppel/v/2019/hw06.pdf

 /// Related Problems
 //
 // Fall 2022 Final Exam Problem 3 - Show timing for add_accum hardware.

`default_nettype none

//////////////////////////////////////////////////////////////////////////////
/// Problem 1
//
 /// Complete add_accum so that it accumulates a sum.
 //
 //     [✔] Put your solution in add_accum. No other modules should
 //         be modified. (Except the testbench, to help debug.)
 //
 //     [✔] add_accum must use an add_pipe module to compute the sum.
 //
 //     [✔] Make sure that the testbench does not report errors.
 //     [✔] Module must be synthesizable. Use command: genus -files syn.tcl
 //
 //     [✔] As always, avoid costly, slow, and confusing code.
 //     [✔] As always, don't assume parameters will be at their default values.

/// add_accum - Accumulate values using a single pipelined adder.
//
//  Parameters
//    w:         Width, in bits, of ai and sum.
//    n_stages:  Number of stages in the add_pipe instance.
//
//  Ports
//    sum:       Accumulated value. Meaningful when sum_valid is 1.
//    sum_valid: 1 when no bit of st_occ is set (no addition in flight).
//    ai:        Value to be accumulated.
//    ai_valid:  1 in each cycle that ai carries a value to accumulate.
//    reset:     Synchronous. Clears sum, sum_occupied, and st_occ.
//    clk:       Clock. All state changes on the positive edge.
//
module add_accum
  #( int w = 21, n_stages = 3 )
   ( output logic [w-1:0] sum,
     output logic sum_valid,
     input uwire [w-1:0] ai,
     input uwire ai_valid, reset, clk );

   /// SOLUTION

   // Register to keep track of which stage of add pipeline is occupied.
   //
   logic [n_stages:0] st_occ;
   //
   // If st_occ[i] == 1 then stage i of pipelined adder is occupied.
   //
   // Note: st_occ has n_stages+1 bits. Bit n_stages-1 marks a result
   // at the adder output; bit n_stages is set in the following cycle,
   // so sum_valid also remains 0 during that cycle.

   // If none of the adder stages is occupied the value in sum must be valid.
   //
   assign sum_valid = ! st_occ;

   // If true, there is a useful result at the adder output.
   //
   uwire aout_valid = st_occ[n_stages-1];

   // Connections to adder.
   //
   uwire [w-1:0] aout;
   uwire [w-1:0] a0 = ai_valid ? ai : sum;
   uwire [w-1:0] a1 = aout_valid ? aout : sum;

   add_pipe #(w,n_stages) add_p0( aout, a0, a1, clk );

   // If true, the value in sum will be needed.
   //
   logic sum_occupied;

   // Number of values ready to be added together.
   //
   uwire [1:0] n_values = ai_valid + sum_occupied + aout_valid;
   //
   // If n_values == 0: Nothing to do.
   // If n_values == 1: Put or keep the value in sum.
   // If n_values == 2: Put the two values into the adder.
   // If n_values == 3: Put ai and aout into the adder and leave sum unchanged.

   // If true, a pair of values will be put in the adder in this cycle.
   //
   uwire start_an_addition = n_values >= 2;

   // If true, write sum with either ai or aout.
   //
   uwire write_sum = !sum_occupied && n_values == 1;

   always_ff @( posedge clk )
     if ( reset ) begin

        sum <= 0;
        sum_occupied <= 0;
        st_occ <= 0;  // Set occupied bit of every stage to 0.

     end else begin

        if ( write_sum ) sum <= aout_valid ? aout : ai;

        // sum will be occupied if there are an odd number (1 or 3) values ..
        //
        sum_occupied <= n_values[0];
        //
        // .. because if there were 2 values they both would go into the adder.

        // Advance occupied bit by one stage.
        //
        st_occ <= { st_occ[n_stages-1:0], start_an_addition };

     end

endmodule

`ifdef xxx

Synthesizing at effort level "high"

Module Name                             Area   Delay    Delay
                                              Actual   Target
add_pipe_w24_n_stages1                 29928  10.174   90.000 ns
add_pipe_w24_n_stages2                 47043   5.428   90.000 ns
add_pipe_w24_n_stages3                 64159   3.701   90.000 ns
add_pipe_w24_n_stages4                 81275   2.837   90.000 ns
add_pipe_w24_n_stages6                115506   1.973   90.000 ns
add_pipe_w24_n_stages1                 84351   1.114    0.100 ns
add_pipe_w24_n_stages2                 83959   1.249    0.100 ns
add_pipe_w24_n_stages3                103383   1.105    0.100 ns
add_pipe_w24_n_stages4                117358   1.001    0.100 ns
add_pipe_w24_n_stages6                150854   0.896    0.100 ns
add_accum_w24_n_stages1                87556  11.449   90.000 ns
add_accum_w24_n_stages2               105305   6.349   90.000 ns
add_accum_w24_n_stages3               123530   4.560   90.000 ns
add_accum_w24_n_stages4               141598   3.696   90.000 ns
add_accum_w24_n_stages6               177545   3.061   90.000 ns
add_accum_w24_n_stages1               150738   2.023    0.100 ns
add_accum_w24_n_stages2               149544   1.757    0.100 ns
add_accum_w24_n_stages3               183994   1.514    0.100 ns
add_accum_w24_n_stages4               191611   1.444    0.100 ns
add_accum_w24_n_stages6               224175   1.332    0.100 ns
Normal exit.
`endif

/// add_pipe - Pipelined adder: sum = a + b, low w bits.
//
//  The operands are split into n_stages slices of bits_per_stage bits.
//  Each stage adds one slice (plus the carry from the previous stage)
//  and shifts the remaining operand bits down for the next stage.
//  The result appears at sum n_stages positive clock edges after the
//  operands are presented at a and b.
//
//  Parameters
//    w:         Width, in bits, of a, b, and sum.
//    n_stages:  Number of pipeline stages.
//
module add_pipe
  #( int w = 21, n_stages = 3 )
   ( output uwire [w-1:0] sum,
     input uwire [w-1:0] a, b,
     input uwire clk );

   localparam int bits_per_stage = ( w + n_stages - 1 ) / n_stages;
   localparam int wr = n_stages * bits_per_stage; // w rounded.

   logic [wr-1:0] pl_a[n_stages+1], pl_b[n_stages+1], pl_sum[n_stages+1];
   logic pl_carry[n_stages+1];

   always_ff @( posedge clk ) begin

      // Element 0 is not a register: it just names the module inputs
      // (zero-extended to wr bits) so the loop below is uniform.
      //
      pl_a[0] = a;
      pl_b[0] = b;
      pl_carry[0] = 0;
      pl_sum[0] = 0;  // All of these bits are shifted out by the last stage.

      // NOTE(review): In the copy of this file under review the text
      // between "s" in the loop condition and ">> bits_per_stage" of
      // the first statement was missing. The loop header and the
      // {pl_carry,pl_sum} assignment below are a reconstruction that is
      // consistent with the surviving pl_a / pl_b shift statements and
      // with the output assignment. Confirm against the original.
      //
      for ( int s=0; s<n_stages; s++ ) begin

         // Add the low slice of each operand plus carry in. The
         // bits_per_stage+1 bit result (carry out and slice sum) is
         // placed above the partial sum, then everything is shifted
         // down by one slice: the carry lands in pl_carry[s+1] and the
         // new slice in the most-significant slice of pl_sum[s+1].
         //
         {pl_carry[s+1], pl_sum[s+1]} <=
           { { 1'b0, pl_a[s][bits_per_stage-1:0] }
             + pl_b[s][bits_per_stage-1:0] + pl_carry[s],
             pl_sum[s] } >> bits_per_stage;

         pl_a[s+1] <= pl_a[s] >> bits_per_stage;
         pl_b[s+1] <= pl_b[s] >> bits_per_stage;

      end

   end

   assign sum = pl_sum[ n_stages ][w-1:0];

endmodule


// cadence translate_off

/// reactivate - Copy clk and cycle to outputs from within a program block.
//
//  NOTE(review): Presumably used so that testbench code sensitive to
//  clk_reactive executes after design code has settled -- confirm.
//
program reactivate
   (output uwire clk_reactive, output int cycle_reactive,
    input uwire clk, input var int cycle);
   assign clk_reactive = clk;
   assign cycle_reactive = cycle;
endprogram

/// testbench - Top-level testbench. Totals errors over all tested sizes.
//
module testbench;

   // localparam int n_stages[] = { 1, 2, 3, 5, 6 };
   localparam int n_stages[] = { 3 };

   localparam int nw = 1; // Cadence, please fix this.
   initial if ( nw != n_stages.size() )
     $fatal(1,"Constant nw should be %0d.\n",n_stages.size() );

   int t_errs;     // Total number of errors.
   initial t_errs = 0;
   final $write("Total number of errors: %0d\n",t_errs);

   uwire d[nw:-1]; // Start / Done signals.
   assign d[-1] = 1;  // Initialize first at true.

   // Instantiate a testbench at each size.
   //
   // NOTE(review): Source text is MISSING in the statement below, from
   // just after "i" in the loop condition up to "= cycle_limit )". The
   // lost text appears to have included the rest of the generate loop,
   // the end of module testbench, the header of the per-size testbench
   // module, and declarations that the surviving code below relies on:
   // w, n_tests, cyc_max, a_in_max, lat_limit_empty, lat_limit_full,
   // lat_min_empty, clk, clk_reactive, cycle, cycle_limit, event_trace,
   // tstart, and done. It must be restored from the original file; the
   // surviving tokens are reproduced unchanged.
   //
   for ( genvar i=0; i= cycle_limit )
           $write("Exit from clock loop at cycle %0d, limit %0d. %s\n %s\n",
                  cycle, cycle_limit, "** CYCLE LIMIT EXCEEDED **",
                  event_trace);
      join_any;
      done = 1;
   end

   // Device under test and its connections.
   //
   uwire [w-1:0] sum;
   uwire sum_valid;
   logic [w-1:0] a;
   logic a_valid, reset;

   add_accum #(w,n_stages) add_accum0(sum, sum_valid, a, a_valid, reset, clk);

   int rsum;          // Expected sum, maintained by the stimulus code.
   bit tests_start;   // Set to 1 once the initial reset is complete.
   int series_idx, value_idx, series_n_vals;
   int n_errs, n_underdue_errs, n_overdue_errs, n_tests_done;
   int sum_due_cyc_earliest, sum_due_cyc, n_correct;
   int last_a_cyc;    // Cycle in which the most recent value was applied.
   int latency_sum, latency_sum_n;
   bit error_val_issued, error_late_issued;

   // When testing finishes print a summary and add this instance's
   // errors to the top-level total.
   //
   initial wait ( done ) begin
      automatic int not_done = n_tests - series_idx + 1;
      $write("Done with %0d-stage tests, %0d series.\n Correct, %0d. Errors: %0d not done, %0d val, %0d/%0d early/late.\n",
             n_stages, series_idx-1, n_correct, not_done,
             n_errs, n_underdue_errs, n_overdue_errs );
      $write("For %0d stages average latency %.2f cycles.\n",
             n_stages,
             real'(latency_sum) / ( latency_sum_n ? latency_sum_n : 1 ) );
      testbench.t_errs += n_errs + n_underdue_errs + n_overdue_errs + not_done;
   end

   // Checker: At each positive edge of clk_reactive examine the module
   // outputs for wrong, early, and late results.
   //
   initial begin
      wait( tests_start );
      while ( !done ) @( posedge clk_reactive ) begin

         if ( sum_valid ) begin

            // A result is pending if a due cycle has been set.
            automatic bit pending = sum_due_cyc < cyc_max;

            if ( pending ) begin
               n_tests_done++;
               sum_due_cyc = cyc_max;
               if ( sum === rsum ) n_correct++;
               latency_sum += cycle - last_a_cyc;
               latency_sum_n++;
               if ( cycle < sum_due_cyc_earliest ) begin
                  n_underdue_errs++;
                  if ( n_underdue_errs < 5 ) begin
                     // Report elapsed cycles as cycle - last_a_cyc, the
                     // same way latency is computed above. (Operands
                     // were previously reversed, printing a negative
                     // number.)
                     $write
                       ("At cyc %0d, value ready too soon, %0d, cyc. (Min cyc %0d.)\n",
                        cycle, cycle - last_a_cyc, lat_limit_empty );
                     if ( event_trace != "" ) $write(" %s\n",event_trace);
                  end
               end
            end

            if ( !error_val_issued && sum !== rsum ) begin
               error_val_issued = 1;
               n_errs++;
               if ( n_errs < 5 ) begin
                  // rsum is an int, so it is printed with %0d (was %g,
                  // a real-number format).
                  $write("At cyc %0d, wrong sum, %0d != %0d (correct)\n",
                         cycle, sum, rsum);
                  if ( event_trace != "" ) $write(" %s\n",event_trace);
               end
            end

         end else if ( sum_due_cyc <= cycle ) begin

            if ( !error_late_issued ) begin
               error_late_issued = 1;
               n_overdue_errs++;
               sum_due_cyc = cyc_max;
               if ( n_overdue_errs < 5 ) begin
                  $write("At cycle %0d, sum overdue.\n",cycle);
                  if ( event_trace != "" ) $write(" %s\n",event_trace);
               end
            end

         end
      end
   end

   // A scripted series of inputs. Each entry is applied gap_cycles
   // after the previous one. Entries are consumed (popped) as they are
   // used, so the scripted series is run once.
   //
   typedef struct { int gap_cycles; int value; } Arrival;
   Arrival arrivals[$] =
     '{ '{0,1}, '{5,10}, '{2,100}, '{1,1}, '{1,10} };

   // Stimulus: Apply reset, then run series of input values.
   //
   initial begin
      automatic int seed = 4755;
      automatic int series_sparsity = 0;
      automatic int arrivals_next_event_cyc = -1;
      automatic bit is_arrivals_series = 0;

      rsum = 0;
      n_errs = 0;
      latency_sum_n = 0;
      latency_sum = 0;
      error_val_issued = 0;
      error_late_issued = 1;
      series_idx = 0;
      value_idx = 0;
      series_n_vals = 0;
      n_overdue_errs = 0;
      n_underdue_errs = 0;
      sum_due_cyc = cyc_max;
      sum_due_cyc_earliest = 0;
      n_tests_done = 0;
      n_correct = 0;
      event_trace = "";

      wait( tstart );

      $write("Starting tests for %0d-stage pipeline.\n",n_stages);

      @( negedge clk );
      reset = 1;
      event_trace = $sformatf("R(%0d)",cycle);
      a_valid = 0;
      a = 0;
      @( negedge clk );
      cycle_limit = cycle + n_stages * 2;
      tests_start = 1;
      reset = 0;
      @( negedge clk );
      wait( sum_valid );

      while ( series_idx <= n_tests ) begin

         @( negedge clk );

         // Can be overwritten.
         a = is_arrivals_series ? 99 : $dist_uniform( seed, 1, a_in_max );

         if ( value_idx >= series_n_vals ) begin

            // All values of the current series have been applied.
            // Once the module reports a valid sum, reset it and set up
            // the next series.
            //
            a_valid = 0;
            if ( sum_valid ) begin
               series_idx++;
               value_idx = 0;
               event_trace = $sformatf("R(%0d)",cycle);
               reset = 1;
               a_valid = 0;
               rsum = 0;
               series_n_vals =
                 arrivals.size ? arrivals.size : $dist_uniform( seed, 1, 10 );
               is_arrivals_series = arrivals.size > 0;
               if ( is_arrivals_series )
                 arrivals_next_event_cyc = cycle + arrivals[0].gap_cycles;
               series_sparsity = series_idx % 6;
               sum_due_cyc = cycle + 1;
               sum_due_cyc_earliest = cycle;
               error_val_issued = 0;
               error_late_issued = 0;
               cycle_limit = cycle + 1;
            end

         end else if ( is_arrivals_series ) begin

            // Scripted series: apply the next entry when its time arrives.
            //
            reset = 0;
            if ( arrivals.size && arrivals_next_event_cyc <= cycle ) begin
               automatic Arrival arrival = arrivals.pop_front;
               arrivals_next_event_cyc =
                 cycle
                 + ( arrivals.size ? arrivals[0].gap_cycles : 10*n_stages );
               a_valid = 1;
               a = arrival.value;
            end else begin
               a_valid = 0;
            end
            cycle_limit = cycle + lat_limit_full;

         end else begin

            // Random series: a value is applied every cycle when
            // series_sparsity is 0, otherwise with probability
            // 1 / ( series_sparsity + 1 ).
            //
            reset = 0;
            a_valid = series_sparsity == 0
              || $dist_uniform( seed, 0, series_sparsity ) == 0;
            cycle_limit = cycle + lat_limit_full;

         end

         if ( a_valid ) begin
            value_idx++;
            event_trace = {event_trace,$sformatf("+%0d(%0d)",a,cycle)};
            error_val_issued = 0;
            error_late_issued = 0;
            rsum += a;
            last_a_cyc = cycle;
            sum_due_cyc =
              cycle + ( sum_valid ? lat_limit_empty : lat_limit_full );
            sum_due_cyc_earliest =
              cycle + ( value_idx > 1 ? lat_min_empty : 0 );
         end

      end

      done = 1;

   end

endmodule

// cadence translate_on