////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2016 Homework 6
/// PRELIMINARY SOLUTION
//
 /// Assignment  http://www.ece.lsu.edu/koppel/v/2016/hw06.pdf

 /// Additional Resources
 //
 // Instructions for Account Setup, Verilog, Synthesis, Chipware, Emacs.
 //   http://www.ece.lsu.edu/koppel/v/proc.html
 //
 //
 // Verilog Documentation
 //    The Verilog Standard
 //      http://standards.ieee.org/getieee/1800/download/1800-2012.pdf
 //    Introductory Treatment (Warning: Does not include SystemVerilog)
 //      Brown & Vranesic, Fundamentals of Digital Logic with Verilog, 3rd Ed.
 //
 // ChipWare Component Library Documentation
 //    Documentation for the FP modules (and other) such as CW_fp_add.
 //    Look for the link to ChipWare on: http://www.ece.lsu.edu/v/ref.html
 //

// Load Verilog for ChipWare floating-point multiply and add modules.
//
`include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_fp_mult.v"
`include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_fp_add.v"

`default_nettype none

//////////////////////////////////////////////////////////////////////////////
/// Problem 0
//
//  Look over but don't modify these modules.

// cadence translate_off

 /// Non-Synthesizable Mag Module --- Complete, Don't Edit
//
//  Reference model used by the testbench.  Despite the name, the
//  output is the sum of the squares of the three vector components;
//  no square root is taken.
//
module mag_functional
  ( output shortreal mag,
    input shortreal v [3] );

   always_comb begin
      shortreal sos;  // Running sum of squares.
      sos = 0;
      for ( int i=0; i<3; i++ ) sos += v[i] * v[i];
      mag = sos;
   end

endmodule

// cadence translate_on

 /// Combinational Module --- Complete, Don't Edit
//
//  Computes v[0]^2 + v[1]^2 + v[2]^2 on IEEE 754 single-precision bit
//  patterns using three ChipWare multipliers followed by a chain of
//  two adders, all in one combinational path.
//
module mag_comb
  ( output uwire [31:0] mag,
    input uwire [31:0] v [3] );

   uwire [31:0] vsq[3];     // Squares of the vector components.
   uwire [7:0] status[5];   // FP unit status outputs; not examined.
   uwire [31:0] sum01;      // vsq[0] + vsq[1]

   localparam logic [2:0] rnd = 0; // 0 is round toward even.

   for ( genvar i=0; i<3; i++ )
     CW_fp_mult m1( v[i], v[i], rnd, status[i], vsq[i]); // Product is last!

   CW_fp_add a1( vsq[0], vsq[1], rnd, sum01, status[3] );
   CW_fp_add a2( sum01, vsq[2], rnd, mag, status[4] );

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Problem 1
//
 /// Complete mag_seq so that it computes mag sequentially, using one
 /// fp add and one fp multiply module.
//
//     [x] Learn to use SimVision *before* wasting hours on simple problems.
//     [x] The code must be synthesizable.
//     [x] Make sure that the testbench does not report errors.
//     [x] Can use behavioral or implicit structural code.
//     [x] Do not rename modules or change ports.
//     [x] Must use exactly one CW_fp_add and one CW_fp_mult.
//     [x] Assume that data arrives at module inputs late in the clock cycle.

// cadence translate_off

// Simulation-only container that the testbench fills with information
// about the test in progress so that it can be viewed from inside the
// modules under test while debugging.
//
class Debug;
   int cycle;
   int test_cyc;  // Number of cycles since test began.
   int test_num;
   shortreal vr[3];
   logic [31:0] v[3];
   shortreal magr; // Correct result.
   logic [31:0] mag; // Correct result.
endclass

// cadence translate_on

// Sequential sum-of-squares unit: one multiplier and one adder are
// time-shared over several clock cycles.
//
//  After a positive edge at which start is 1, step is 0.  The
//  schedule below lists what is captured at the END of each step:
//
//    step 0:  accum[0] <= v[0]^2
//    step 1:  accum[1] <= v[1]^2
//    step 2:  accum[0] <= v[2]^2 ;  accum[1] <= v[0]^2 + v[1]^2
//    step 3:  accum[1] <= (v[0]^2 + v[1]^2) + v[2]^2
//    step 4:  ready is 1, mag (accum[1]) holds the result.
//
//  v is read directly by the multiplier during steps 0 to 2, so it
//  must be held steady by the caller during those cycles.
//
module mag_seq
  ( output uwire [31:0] mag,
    output uwire ready,
    input uwire [31:0] v [3],
    input uwire start,
    input uwire clk );

   // cadence translate_off
   Debug db;
   // cadence translate_on

   // 0 is round toward even (1 would be round towards zero).
   localparam logic [2:0] rnd = 0;

   uwire [7:0] sm, sa;        // Status outputs of multiplier, adder; unused.
   logic [31:0] accum[2];     // Operand registers feeding the adder.
   uwire [31:0] prod, sum;    // Multiplier and adder outputs.

   logic [2:0] step;

   /// SOLUTION -- Assign multiplier input.
   //
   //  The component to square is selected by step.  For step 3 and 4
   //  the index is out of range, but prod is not captured in those steps.
   //
   uwire [31:0] ma = v[ step ];

   CW_fp_mult m1( .a(ma), .b(ma), .rnd(rnd), .z(prod), .status(sm));
   CW_fp_add a1( .a(accum[0]), .b(accum[1]), .rnd(rnd), .z(sum), .status(sa));

   localparam int last_step = 4;

   assign ready = step == last_step;

   // Step counter: restart on start, then count up and hold at last_step.
   //
   always_ff @( posedge clk )
     if ( start ) step <= 0;
     else if ( step < last_step ) step <= step + 1;

   always_ff @( posedge clk ) begin

      case ( step )
        0: accum[0] <= prod;  // Save v[0] * v[0].

        /// SOLUTION below.
        1: accum[1] <= prod;  // Save v[1] * v[1].
        2: begin
           accum[0] <= prod;  // Save v[2] * v[2].
           accum[1] <= sum;   // Save (v[0]*v[0]) + (v[1]*v[1])
        end
        3: accum[1] <= sum;   // Save (v[0]*v[0]+v[1]*v[1]) + (v[2]*v[2]).
      endcase

   end

   assign mag = accum[1];

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Problem 2
//
 /// Complete mag_pipe so that it computes mag in pipelined fashion and
 /// has at most one fp operation delay per cycle.
//
//     [x] Learn to use SimVision *before* wasting hours on simple problems.
//     [x] The code must be synthesizable.
//     [x] Make sure that the testbench does not report errors.
//     [x] Can use behavioral or implicit structural code.
//     [x] Do not rename modules or change ports.
//     [x] Choose number of stages to maximize throughput (minimize delay).
//     [x] Use as many CW_fp_add and CW_fp_mult modules as needed, but no more.
//     [x] Assume that data arrives at module inputs late in the clock cycle.

// Pipelined sum-of-squares unit.  A new vector can be accepted every
// cycle.  The path from v to mag passes through four registers
// (pl_v, pl_vsq[1], pl_sos[2], pl_sos[3]), matching nstages.
//
module mag_pipe
  ( output uwire [31:0] mag,
    input uwire [31:0] v [3],
    input uwire clk );

   // cadence translate_off
   Debug db;
   // cadence translate_on

   /// Do not rename nstages. The testbench examines its value and it must be set
   /// correctly.
   //  For a vector arriving at cycle t, magnitude will be available at
   //  cycle t + nstages.
   localparam int nstages = 4;

   // 0 is round toward even (1 would be round towards zero).
   localparam logic [2:0] rnd = 0;

   logic [31:0] pl_vsq[1:2][3];  // Squares, indexed by [stage][component].
   logic [31:0] pl_sos[2:3];     // Sums of squares, indexed by stage.
   uwire [31:0] vsq[3], sum01, sum012;
   uwire [7:0] s[5];             // FP unit status outputs; not examined.

   // Pipeline latches between inputs and stage 0.
   //
   logic [31:0] pl_v[3];

   ///
   /// Logic Within Stages
   ///

   // Stage 0: Three Multipliers.
   //
   // Instantiate 3 multipliers. All of these are in stage 0.
   //
   for ( genvar i=0; i<3; i++ )
     CW_fp_mult m1(.a(pl_v[i]), .b(pl_v[i]), .rnd(rnd), .z(vsq[i]), .status(s[i]));

   // Stage 1: An adder.
   //
   CW_fp_add a1( pl_vsq[1][0], pl_vsq[1][1], rnd, sum01, s[3] );

   // Stage 2: Another adder.
   //
   CW_fp_add a2( pl_sos[2], pl_vsq[2][2], rnd, sum012, s[4] );

   ///
   /// Pipeline Latches (Registers Separating Stages)
   ///

   always_ff @( posedge clk ) begin

      // Module input -> Stage 0
      //
      pl_v <= v;

      // Stage 0 -> 1
      //
      // Result of multiplications done in stage 0.
      //
      pl_vsq[1] <= vsq;  // Note: vsq is a 3-element array of 32-bit vals.

      // Stage 1 -> 2
      //
      // Pass along the v[2]*v[2] product (computed in stage 0) so that
      // it reaches stage 2 together with the partial sum.
      //
      pl_vsq[2][2] <= pl_vsq[1][2];
      //
      // Sum performed in stage 1.
      //
      pl_sos[2] <= sum01;

      // Stage 2 -> 3
      //
      // Sum performed in stage 2.
      //
      pl_sos[3] <= sum012;

   end

   assign mag = pl_sos[3];

endmodule

// Synthesized hardware after optimization:
//
 :


//////////////////////////////////////////////////////////////////////////////
/// Testbench Code
//
//  The code below instantiates some of the modules above,
//  provides test inputs, and verifies the outputs.
//
//  The testbench may be modified to facilitate your solution. Of
//  course, the removal of tests which your module fails is not a
//  method of fixing a broken module. (One might modify the testbench
//  so that the first tests it performs are those which make it easier
//  to determine what the problem is, for example, test inputs that
//  are all 0's or all 1's.)
//

// cadence translate_off

// Return a pseudo-random real drawn from [minv, maxv).
//
function automatic real rand_real(real minv, real maxv);
   rand_real = minv + ( maxv - minv ) * ( real'({$random}) ) / 2.0**32;
endfunction

// Return the absolute value of val.
//
function automatic shortreal fabs(shortreal val);
   fabs = val < 0 ? -val : val;
endfunction

// Re-export the clock and cycle count from a program block so that
// testbench code waiting on them runs in the reactive region, after
// the modules under test have responded to the clock edge.
//
program reactivate
  ( output uwire clk_reactive,
    output int cycle_reactive,
    input uwire clk,
    input var int cycle );
   assign clk_reactive = clk;
   assign cycle_reactive = cycle;
endprogram

module testbench();

   typedef enum { MT_comb, MT_seq, MT_pipe } Module_Type;

   localparam int wid = 32;
   localparam int max_latency = 10;
   localparam int num_tests = 16;
   localparam int nmuts = 10;   // Capacity of per-module output arrays.
   int err[nmuts];

   uwire [31:0] mag[nmuts];     // mag output of each module under test.
   uwire ready[nmuts];          // ready output (sequential modules only).
   shortreal magr;              // Expected result from mag_functional.
   shortreal vr[3];             // Test vector as shortreals.
   logic [31:0] v[3];           // Test vector as IEEE 754 bit patterns.
   logic [31:0] vp[3];          // Input stream for the pipelined module.
   logic start;

   // Per-module bookkeeping, keyed by the module's informal name.
   //
   typedef struct {
      int idx;
      int err_count = 0;
      int ncyc = 0;
      Module_Type mt = MT_comb;
      logic [wid-1:0] sout = 'h111;
      int cyc_tot = 0;
      int latency = 0;
   } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * max_latency * 4;
   int cycle, cyc_start;
   bit done;
   logic clock;
   bit use_others;

   logic clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Register a sequential module under test.
   //
   task pi_seq(input int idx, input string name);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx;
      pi[m].mt = MT_seq;
   endtask

   // Register a pipelined module under test having ncyc stages.
   //
   task pi_pipe(input int idx, input string name, input int ncyc);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx;
      pi[m].mt = MT_pipe;
      pi[m].ncyc = ncyc;
   endtask

   Debug db;
   initial db = new;

   // Clock generator and watchdog.  The clock toggles every 10 time
   // units; cycle is incremented on each 1 -> 0 transition (the old
   // value of clock is added before it is toggled).
   //
   initial begin
      clock = 0;
      cycle = 0;

      fork
         forever #10 begin
            cycle += clock++;
            db.cycle = cycle;
            db.test_cyc = cycle - cyc_start;
         end
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   mag_functional mf( magr, vr );

   mag_comb m1( mag[0], v );
   initial pi["Comb."].idx = 0;

   mag_seq m2( mag[1], ready[1], v, start, clock );
   initial begin pi_seq(1,"Seq."); m2.db = db; end

   mag_pipe m4( mag[3], vp, clock );
   initial begin pi_pipe(3,"Pipe",m4.nstages); m4.db = db; end

   // Drive start and the pipelined module's input.  For one cycle
   // after a new test vector is set up (use_others = 1), vp carries
   // the test vector and start is asserted; in all other cycles vp
   // carries cycle-dependent filler values.
   //
   initial begin
      while ( !done ) @( posedge clk_reactive ) #2
        if ( use_others ) begin

           vp = v;
           use_others = 0;
           start = 1;

        end else begin

           vp[0] = $shortrealtobits(shortreal'(cycle-cyc_start));
           vp[1] = cycle - cyc_start;
           vp[2] = 0;
           start = 0;

        end
   end

   // Main test loop.
   //
   initial begin

      done = 0;
      use_others = 0;
      start = 0;

      @( posedge clk_reactive );

      for ( int i=0; i<num_tests; i++ ) begin
         automatic int awaiting = pi.num();

         db.test_num = i;
         cyc_start = cycle;
         db.test_cyc = 0;

         if ( i < 8 ) begin

            // In first eight test vector components are zero or one.
            // Bit j of the test number selects the value of component j.
            //
            for ( int j=0; j<3; j++ )
              vr[j] = ( i & ( 1 << j ) ) ? 1.0 : 0.0;

         end else begin

            // In other tests vector components are randomly chosen.
            //
            for ( int j=0; j<3; j++ ) vr[j] = rand_real(-10,+10);

         end

         for ( int j=0; j<3; j++ ) v[j] = $shortrealtobits(vr[j]);
         db.vr = vr;
         db.v = v;

         // Record the expected result after a zero delay, giving
         // mag_functional a chance to update magr.
         //
         fork #0 begin
            db.magr = magr;
            db.mag = $shortrealtobits(magr);
         end join_none

         vp = v;
         use_others = 1;

         /// Collect Result (mag) From Each Module Under Test (mut)
         //
         foreach ( pi[muti] ) begin

            automatic string mut = muti; // Informal name of module.

            // Create a child thread to get response from current mut.
            // The parent thread, without delay, proceeds to join_none.
            //
            fork begin

               automatic int steps = pi[mut].ncyc;
               automatic int latency =
                 pi[mut].mt == MT_comb ? 1 :
                 pi[mut].mt == MT_seq ? 2 : steps;

               // Compute time at which result should be ready or
               // when to start examining a READY output.
               //
               automatic int eta = 1 + cyc_start + latency;

               pi[mut].latency = latency;

               // Wait (just this thread waits) until result should be ready.
               //
               wait ( cycle_reactive == eta );

               // If this module has a READY output, wait for it.
               //
               if ( pi[mut].mt == MT_seq ) wait( ready[pi[mut].idx] );

               // Decrement count of the number of modules we are waiting for.
               //
               awaiting--;

               // Store the module MAG output, it will be checked later
               // for correctness.
               //
               pi[mut].sout = mag[pi[mut].idx];

               pi[mut].cyc_tot += cycle - cyc_start;

               // This thread ends execution here.

            end join_none;

         end

         // Wait until data collected from all modules under test.
         //
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] ) begin

            // Assign module output to a shortreal.
            //
            automatic shortreal mmagr = $bitstoshortreal(pi[mut].sout);
            //
            // Note: pi[mut].sout is type logic which is assumed to be
            // an unsigned integer. However, the contents is really an
            // IEEE 754 single-precision float (shortreal in
            // SystemVerilog) and so $bitstoshortreal is used so that
            // pi[mut].sout is copied bit-for-bit unchanged to mmagr.

            // Compute difference between module output and expected
            // output. With FP small differences can be okay, they might
            // occur, for example, due to differences in the order of
            // operations.
            //
            automatic shortreal err_mag = fabs( mmagr - magr );

            automatic bit okay = err_mag < 1e-4;

            if ( !okay ) begin
               pi[mut].err_count++;
               if ( pi[mut].err_count < 5 )
                 $write("%s test #%0d vec (%.1f,%.1f,%.1f) error: h'%8h %7.4f != %7.4f (correct)\n",
                        mut, i, vr[2], vr[1], vr[0],
                        pi[mut].sout, mmagr, magr);
            end

         end

         // Wait a random number of cycles (continue while the low bit
         // of a random number is 1).
         //
         while ( ( {$random} & 1 ) == 1 ) @( posedge clk_reactive );
         //
         // Note: By waiting for reactive clock we can be sure that
         // modules under test have completed all work due to the
         // positive edge of the regular clk. Wait a random amount of
         // time in case any modules are only correct at some stride.

      end

      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].mt == MT_comb ? 1 : real'(pi[mut].cyc_tot) / num_tests);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on