////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2016 Homework 6
/// PRELIMINARY SOLUTION
//

/// Assignment  http://www.ece.lsu.edu/koppel/v/2016/hw06.pdf

//
// Instructions for Account Setup, Verilog, Synthesis, Chipware, Emacs.
//      http://www.ece.lsu.edu/koppel/v/proc.html
//
//
// Verilog Documentation
//    The Verilog Standard
//    Introductory Treatment (Warning: Does not include SystemVerilog)
//      Brown & Vranesic, Fundamentals of Digital Logic with Verilog, 3rd Ed.
//
// ChipWare Component Library Documentation
//    Documentation for the FP modules (and other) such as CW_fp_add.
//    Look for the link to ChipWare on: http://www.ece.lsu.edu/v/ref.html
//

//

`default_nettype none

//////////////////////////////////////////////////////////////////////////////
///  Problem 0
//
//  Look over but don't modify these modules.

/// Non-Synthesizable Mag Module --- Complete, Don't Edit
//
module mag_functional
  ( output shortreal mag,
    input shortreal v [3] );

   // Reference (simulation-only) model.
   //
   // Note: despite the name, the value driven onto mag is the sum of
   // the squares of the three components of v; no square root is taken.

   always_comb begin
      shortreal acc;

      // Accumulate v[0]^2, v[1]^2, v[2]^2 in index order.
      acc = 0;
      foreach ( v[k] ) acc += v[k] * v[k];

      mag = acc;
   end

endmodule

/// Combinational Module  --- Complete, Don't Edit
//
module mag_comb
  ( output uwire [31:0] mag,
    input uwire [31:0] v [3] );

   // Purely combinational sum of squares: three FP multipliers feed a
   // chain of two FP adders.  All values are 32-bit FP bit patterns.

   localparam logic [2:0] rnd = 0; // 0 is round toward even.

   uwire [31:0] vsq[3];    // vsq[i] = v[i] * v[i]
   uwire [31:0] sum01;     // vsq[0] + vsq[1]
   uwire [7:0]  status[5]; // Status outputs of the five FP units (unused).

   // One squarer per vector component.  Connections are by name, so the
   // differing positional order of z and status between the multiplier
   // and the adder does not matter here.
   for ( genvar i=0; i<3; i++ )
     CW_fp_mult m1
       ( .a(v[i]), .b(v[i]), .rnd(rnd), .status(status[i]), .z(vsq[i]) );

   // (vsq[0] + vsq[1]) + vsq[2]
   CW_fp_add a1
     ( .a(vsq[0]), .b(vsq[1]), .rnd(rnd), .z(sum01), .status(status[3]) );
   CW_fp_add a2
     ( .a(sum01), .b(vsq[2]), .rnd(rnd), .z(mag), .status(status[4]) );

endmodule

//////////////////////////////////////////////////////////////////////////////
///  Problem 1
//
/// Complete mag_seq so that it computes mag sequentially, using one
/// fp add and one fp multiply module.
//
//     [x] Learn to use SimVision *before* wasting hours on simple problems.
//     [x] The code must be synthesizable.
//     [x] Can use behavioral or implicit structural code.
//     [x] Do not rename modules or change ports.
//     [x] Must use exactly one CW_fp_add and one CW_fp_mult.
//     [x] Assume that data arrives at module inputs late in the clock cycle.

class Debug;

   // Plain data holder.  The testbench creates one object and stores
   // a handle to it in the modules under test, so that the current test
   // state can be inspected from inside those modules while debugging.

   // Test progress.
   int test_num;
   int cycle;
   int test_cyc;          // Number of cycles since test began.

   // Input vector of the current test: as shortreals and as raw bits.
   shortreal    vr[3];
   logic [31:0] v[3];

   // Correct result: as a shortreal and as raw bits.
   shortreal    magr;
   logic [31:0] mag;

endclass

module mag_seq
  ( output uwire [31:0] mag,
    output uwire ready,
    input uwire [31:0] v [3],
    input uwire start,
    input uwire clk );

   // Sequential sum of squares using exactly one FP multiplier and one
   // FP adder.
   //
   //   mag    Sum of the squares of v[0], v[1], v[2]; valid when ready is 1.
   //   ready  1 when step has reached last_step.
   //   v      Input vector (32-bit FP bit patterns).  The multiplier reads
   //          v directly during steps 0-2.
   //   start  When 1 at a positive clock edge, restarts the computation.
   //   clk    Clock.
   //
   // The ready port is declared here because it is driven below and
   // because the testbench connects it as the second positional port;
   // with `default_nettype none an undeclared ready is a compile error.

   // Handle to shared debug state, set by the testbench.
   Debug db;

   // Rounding mode for both FP units.  0 is round toward even
   // (1 would be round towards zero).
   localparam logic [2:0] rnd = 0;

   uwire [7:0] sm, sa;          // Status outputs of multiplier, adder (unused).

   logic [31:0] accum[2];       // Adder operands / running results.
   uwire [31:0]  prod, sum;     // Multiplier and adder outputs.
   logic [2:0]  step;           // Current step, 0 .. last_step.

   /// SOLUTION -- Assign multiplier input.
   //
   // In steps 0, 1, and 2 this selects v[0], v[1], v[2].  For step >= 3
   // the index is outside v, but prod is not captured in those steps.
   uwire [31:0]  ma = v[ step ];

   CW_fp_mult m1( .a(ma),       .b(ma),       .rnd(rnd), .z(prod), .status(sm));
   CW_fp_add  a1( .a(accum[0]), .b(accum[1]), .rnd(rnd), .z(sum),  .status(sa));

   localparam int last_step = 4;
   assign     ready = step == last_step;

   // Step counter: cleared by start, then counts up and holds at last_step.
   always_ff @( posedge clk )
     if ( start ) step <= 0;
     else if ( step < last_step ) step <= step + 1;

   // Datapath registers.
   always_ff @( posedge clk )
     begin
        case ( step )
          0: accum[0] <= prod;  // Save v[0] * v[0].

          /// SOLUTION below.
          1: accum[1] <= prod;  // Save v[1] * v[1].

          2: begin
             accum[0] <= prod;  // Save v[2] * v[2].
             accum[1] <= sum;   // Save (v[0]*v[0]) + (v[1]*v[1])
          end

          3: accum[1] <= sum;   // Save (v[0]*v[0]+v[1]*v[1]) + (v[2]*v[2]).

        endcase
     end

   assign mag = accum[1];

endmodule

//////////////////////////////////////////////////////////////////////////////
///  Problem 2
//
/// Complete mag_pipe so that it computes mag in pipelined fashion and
/// has at most one fp operation delay per cycle.
//
//     [x] Learn to use SimVision *before* wasting hours on simple problems.
//     [x] The code must be synthesizable.
//     [x] Can use behavioral or implicit structural code.
//     [x] Do not rename modules or change ports.
//     [x] Choose number of stages to maximize throughput (minimize delay).
//     [x] Use as many CW_fp_add and CW_fp_mult modules as needed, but no more.
//     [x] Assume that data arrives at module inputs late in the clock cycle.

module mag_pipe
  ( output uwire [31:0] mag,
    input uwire [31:0] v [3],
    input uwire clk );

   // Pipelined sum of squares with at most one FP operation per stage.
   //
   //   Input latch -> stage 0 (3 multiplies) -> stage 1 (add) -> stage 2 (add)

   // Handle to shared debug state, set by the testbench.
   Debug db;

   /// Do not rename nstages. The testbench examines its value and it must be
   ///  set correctly.
   //  For a vector arriving at cycle t, magnitude will be available at
   //  cycle t + nstages.
   localparam int nstages = 4;

   // Rounding mode passed to every FP unit (1 would be round towards zero).
   localparam logic [2:0] rnd = 0;

   ///
   /// Pipeline Latch Declarations
   ///
   logic [31:0] pl_v[3];          // Module inputs -> stage 0.
   logic [31:0] pl_vsq[1:2][3];   // Squares, indexed by [stage][component].
   logic [31:0] pl_sos[2:3];      // Partial / final sums, indexed by stage.

   ///
   /// Combinational Results Within Stages
   ///
   uwire [31:0]  vsq[3], sum01, sum012;
   uwire [7:0]   s[5];            // FP status outputs (unused).

   // Stage 0: square each latched input component.
   //
   for ( genvar i=0; i<3; i++ )
     CW_fp_mult m1
       ( .a(pl_v[i]), .b(pl_v[i]), .rnd(rnd), .z(vsq[i]), .status(s[i]) );

   // Stage 1: add the first two squares.
   //
   CW_fp_add a1
     ( .a(pl_vsq[1][0]), .b(pl_vsq[1][1]), .rnd(rnd),
       .z(sum01), .status(s[3]) );

   // Stage 2: add the third square to the partial sum.
   //
   CW_fp_add a2
     ( .a(pl_sos[2]), .b(pl_vsq[2][2]), .rnd(rnd),
       .z(sum012), .status(s[4]) );

   ///
   /// Pipeline Latches (Registers Separating Stages)
   ///
   //  All assignments are nonblocking, so their textual order is
   //  irrelevant; they are listed from the output end back to the input.
   //
   always_ff @( posedge clk ) begin

      // Stage 2 -> 3: final sum.
      pl_sos[3] <= sum012;

      // Stage 1 -> 2: partial sum, plus the third square computed in
      // stage 0, carried along for use in stage 2.
      pl_sos[2] <= sum01;
      pl_vsq[2][2] <= pl_vsq[1][2];

      // Stage 0 -> 1: all three squares (whole-array assignment).
      pl_vsq[1] <= vsq;

      // Module input -> stage 0.
      pl_v <= v;

   end

   assign mag = pl_sos[3];

endmodule

// Synthesized hardware after optimization:
// :

//////////////////////////////////////////////////////////////////////////////
/// Testbench Code
//
//  The code below instantiates some of the modules above,
//  provides test inputs, and verifies the outputs.
//
//  The testbench may be modified to facilitate your solution. Of
//  course, the removal of tests which your module fails is not a
//  method of fixing a broken module. (One might modify the testbench
//  so that the first tests it performs are those which make it easier
//  to determine what the problem is, for example, test inputs that
//  are all 0's or all 1's.)

// Return a pseudo-random real computed as
//   minv + (maxv - minv) * u,  where u = {$random} / 2**32.
//
// {$random} is the 32-bit $random result taken as an unsigned value, so
// u is in [0,1).
//
// Note: the system function must be written $random. The form \$random
// is an escaped identifier that extends to the next white space, which
// is a syntax error here.
//
function automatic real rand_real(real minv, real maxv);
   rand_real = minv + ( maxv - minv ) * ( real'({$random}) ) / 2.0**32;
endfunction

// Absolute value of a shortreal: negative arguments are negated, all
// other arguments are returned unchanged.
//
function automatic shortreal fabs(shortreal val);
   if ( val < 0 ) return -val;
   return val;
endfunction

// Copies clk and cycle to outputs from within a program block. The
// testbench waits on these copies (clk_reactive, cycle_reactive) rather
// than on the originals; its own comments state that this ensures the
// modules under test have finished all work due to the clock edge.
//
program reactivate
  ( output uwire clk_reactive,
    output int   cycle_reactive,
    input uwire  clk,
    input var int cycle );

   assign cycle_reactive = cycle;
   assign clk_reactive   = clk;

endprogram

// Testbench: instantiates the reference model and the modules under
// test (mut), applies num_tests input vectors, collects each module's
// mag output after that module's latency, and reports mismatches
// against the reference value.
//
module testbench();

   typedef enum { MT_comb, MT_seq, MT_pipe } Module_Type;

   localparam int wid = 32;
   localparam int max_latency = 10;
   localparam int num_tests = 16;
   localparam int nmuts = 10;
   int err[nmuts];

   uwire [31:0]  mag[nmuts];   // mag outputs, indexed by Info.idx.
   uwire         ready[nmuts]; // ready outputs (driven only by sequential muts).
   shortreal   magr;           // Output of the reference model.
   shortreal vr[3];            // Test vector as shortreals.
   logic [31:0] v[3];          // Test vector as raw bits.
   logic [31:0] vp[3];         // Input to the pipelined module.
   logic        start;

   // Per-module bookkeeping, stored in pi keyed by an informal module name.
   typedef struct
           {
              int idx;
              int err_count = 0;
              int ncyc = 0;
              Module_Type mt = MT_comb;
              logic [wid-1:0] sout = 'h111;
              int cyc_tot = 0;
              int latency = 0;
           } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * max_latency * 4;
   int cycle, cyc_start;
   bit done;
   logic clock;
   bit   use_others;

   logic      clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Register a sequential module under test.
   task pi_seq(input int idx, input string name);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_seq;
   endtask

   // Register a pipelined module under test with ncyc stages.
   task pi_pipe(input int idx, input string name, input int ncyc);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_pipe;
      pi[m].ncyc = ncyc;
   endtask

   Debug db;
   initial db = new;

   // Clock generator and cycle counter. Ends the simulation when done
   // is set or when the cycle limit is reached.
   initial begin
      clock = 0;
      cycle = 0;

      fork
         forever #10 begin
            // clock++ toggles the one-bit clock; its old value is added
            // to cycle, so cycle advances once per full clock period.
            cycle += clock++;
            db.cycle = cycle;
            db.test_cyc = cycle - cyc_start;
         end
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   mag_functional mf( magr, vr );
   mag_comb m1( mag[0], v );
   initial pi["Comb."].idx = 0;
   mag_seq m2( mag[1], ready[1], v, start, clock );
   initial begin pi_seq(1,"Seq."); m2.db = db; end
   mag_pipe m4( mag[3], vp, clock );
   initial begin pi_pipe(3,"Pipe",m4.nstages); m4.db = db; end

   // Shortly after each reactive clock edge either apply the real test
   // vector and pulse start (when use_others is set), or feed the
   // pipelined module filler data that changes every cycle.
   initial begin

      while ( !done ) @( posedge clk_reactive ) #2

        if ( use_others ) begin

           vp = v;
           use_others = 0;
           start = 1;

        end else begin

           vp[0] = $shortrealtobits(shortreal'(cycle-cyc_start));
           vp[1] = cycle - cyc_start;
           vp[2] = 0;
           start = 0;

        end
   end

   initial begin

      automatic int tot_errors = 0;

      done = 0;
      use_others = 0;
      start = 0;

      @( posedge clk_reactive );

      for ( int i=0; i<num_tests; i++ ) begin

         automatic int awaiting = pi.num();

         db.test_num = i;
         cyc_start = cycle;
         db.test_cyc = 0;

         if ( i < 8 ) begin

            // In the first eight tests vector components are zero or one.
            // Bit j of the test number selects the value of component j.
            //
            for ( int j=0; j<3; j++ )
              vr[j] = ( i & ( 1 << j ) ) ? 1.0 : 0.0;

         end else begin

            // In other tests vector components are randomly chosen.
            //
            for ( int j=0; j<3; j++ ) vr[j] = rand_real(-10,+10);

         end

         for ( int j=0; j<3; j++ ) v[j] = $shortrealtobits(vr[j]);
         db.vr = vr;
         db.v = v;
         fork
            // The #0 defers this until the reference model has had a
            // chance to update magr for the new vr.
            #0 begin
               db.magr = magr;
               db.mag = $shortrealtobits(magr);
            end
         join_none

         vp = v;
         use_others = 1;

         /// Collect Result (mag) From Each Module Under Test (mut)
         //
         foreach ( pi[muti] ) begin

            automatic string mut = muti;  // Informal name of module.
            automatic Info p = pi[mut];   // Not referenced below.

            // Create a child thread to get response from current mut.
            // The parent thread, without delay, proceeds to join_none.
            //
            fork begin

               automatic int steps = pi[mut].ncyc;
               automatic int latency =
                 pi[mut].mt == MT_comb ? 1 :
                 pi[mut].mt == MT_seq ? 2 : steps;

               // Compute time at which result should be ready or
               // when to start examining a READY output.
               //
               automatic int eta = 1 + cyc_start + latency;

               pi[mut].latency = latency;

               // Wait for the cycle in which the result is expected.
               //
               wait ( cycle_reactive == eta );

               // If this module has a READY output, wait for it.
               //
               if ( pi[mut].mt == MT_seq ) wait( ready[pi[mut].idx] );

               // Decrement count of the number of modules we are waiting for.
               //
               awaiting--;

               // Store the module MAG output, it will be checked later
               // for correctness.
               //
               pi[mut].sout = mag[pi[mut].idx];

               pi[mut].cyc_tot += cycle - cyc_start;

               // This thread ends execution here.
            end join_none;

         end

         // Wait until data collected from all modules under test.
         //
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] ) begin

            // Assign module output to a shortreal.
            //
            automatic shortreal mmagr = $bitstoshortreal(pi[mut].sout);
            //
            // Note: pi[mut].sout is type logic which is assumed to be
            // an unsigned integer. However, the contents is really an
            // IEEE 754 single-precision float (shortreal in
            // SystemVerilog) and so $bitstoshortreal is used so that
            // pi[mut].sout is copied bit-for-bit unchanged to mmagr.

            // Compute difference between module output and expected
            // output.  With FP small differences can be okay, they might
            // occur, for example, due to differences in the order of
            // operations.
            //
            automatic shortreal err_mag = fabs( mmagr - magr );
            automatic bit okay = err_mag < 1e-4;

            if ( !okay ) begin
               pi[mut].err_count++;
               if ( pi[mut].err_count < 5 )
                 $write("%s test #%0d vec (%.1f,%.1f,%.1f) error: h'%8h  %7.4f != %7.4f (correct)\n",
                        mut, i, vr[2], vr[1], vr[0],
                        pi[mut].sout, mmagr, magr);
            end
         end

         // Wait while the low bit of a random number is 1.
         //
         while ( ( {$random} & 1 ) == 1 ) @( posedge clk_reactive );
         //
         // Note: By waiting for reactive clock we can be sure that
         // modules under test have completed all work due to the
         // positive edge of the regular clk. Wait a random amount of
         // time in case any modules are only correct at some stride.

      end

      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].mt == MT_comb ? 1 : real'(pi[mut].cyc_tot) / num_tests);

      done = 1;

      $finish(2);

   end

endmodule