/// LSU EE 4755
//
// Code based on 2016 Final Exam Problem 1.
// This file includes code written in class on 8 November 2021, 15:52:44 CST.

`default_nettype none

/// Non-Synthesizable Mag Module --- Complete, Don't Edit

module prob1_functional
  ( output shortreal mag,
    input shortreal v0, v1 );

   // Behavioral (non-synthesizable) reference model. It evaluates
   //   mag = v0*v0 + v0*v1 + v1*v1
   // directly on shortreal values. The testbench uses this output as
   // the expected value when checking the synthesizable modules.
   //
   // The products are summed left to right; the parentheses only make
   // the existing precedence explicit.
   always_comb begin
      mag = ( v0 * v0 ) + ( v0 * v1 ) + ( v1 * v1 );
   end

endmodule

//

/// Version of module written in class  8 November 2021, 15:34:37 CST
//
module prob1_seq
  ( output uwire [31:0] result,
    output logic ready,
    input uwire [31:0] v0, v1,
    input uwire start, clk);

   // Sequential computation of v0*v0 + v0*v1 + v1*v1 using a single
   // floating-point multiplier and a single floating-point adder.
   //
   // result : sum of the two accumulators (adder output).
   // ready  : cleared by start, set once the last product has been
   //          accumulated; stays set until the next start.
   // v0, v1 : operands; the testbench drives them with the bit patterns
   //          of shortreal values and they must be held stable while
   //          the computation is in progress.
   // start  : when 1 at a positive clock edge, restarts the computation.
   // clk    : clock.

   // Rounding-mode input given to both floating-point units.
   localparam logic [2:0] rnd = 3'd0;

   // Status outputs of the FP units. They are not examined, but must be
   // declared because the file uses `default_nettype none.
   uwire [7:0]  mul_s, add_s;

   uwire [31:0] prod;

   logic [31:0] ac0, ac1;
   logic [2:0]  step;

   localparam   int last_step = 4;

   // Step counter: reset by start, then counts up and saturates at
   // last_step.
   always_ff @( posedge clk )
     if ( start ) step <= 0; else if ( step < last_step ) step <= step + 1;

   logic [31:0] mul_a, mul_b;

   // Select the multiplier operands for the current step.
   always_comb begin

      case ( step )

        0: begin mul_a = v0;  mul_b = v0; end
        1: begin mul_a = v0;  mul_b = v1; end
        2: begin mul_a = v1;  mul_b = v1; end
        // From step 2 onward keep computing v1*v1. Because ac0 is loaded
        // with prod on every clock edge, selecting any other product
        // here would replace the v1*v1 term and corrupt result in the
        // cycles in which ready is 1.
        default begin mul_a = v1;  mul_b = v1; end

      endcase

   end

   // Step   prod     ac0      ac1              result = ac0 + ac1
   //  0     v0*v0    -        -                -
   //  1     v0*v1    v0*v0    0                v0*v0
   //  2     v1*v1    v0*v1    v0*v0            v0*v0 + v0*v1
   //  3     v1*v1    v1*v1    v0*v0 + v0*v1    complete sum
   //  4     v1*v1    v1*v1    v0*v0 + v0*v1    complete sum, ready = 1

   always_ff @( posedge clk ) begin

      // ac0 always holds the most recently computed product.
      ac0 <= prod;

      // ac1 accumulates the first two products and then holds its value.
      ac1 <= step == 0 ? 32'h0 : step < 3 ? result : ac1;

      ready <= start ? 0 : step == last_step - 1 ? 1 : ready;

   end

   CW_fp_mult m1( .a(mul_a), .b(mul_b), .rnd(rnd), .z(prod), .status(mul_s));

   // The adder drives the module output directly.
   CW_fp_add a1( .a(ac0), .b(ac1), .rnd(rnd), .z(result), .status(add_s));

endmodule

//

module prob1_seq_sol
  ( output logic [31:0] result,
    output logic ready,
    input uwire [31:0] v0, v1,
    input uwire start,
    input uwire clk );

   // Sequential computation of v0*v0 + v0*v1 + v1*v1 using a single
   // floating-point multiplier and a single floating-point adder.
   //
   // result : sum of the two accumulators (adder output).
   // ready  : cleared by start, set at the clock edge that ends step
   //          last_step-1; stays set until the next start.
   // v0, v1 : operands; the testbench drives them with the bit patterns
   //          of shortreal values and they must be held stable while
   //          the computation is in progress.
   // start  : when 1 at a positive clock edge, restarts the computation.
   // clk    : clock.

   localparam logic [2:0] rnd = 0; // 1 is round towards zero.

   logic [2:0]  step;

   // Status outputs of the FP units. They are not examined, but must be
   // declared because the file uses `default_nettype none.
   uwire [7:0]  mul_s, add_s;

   uwire [31:0] mul_a, mul_b;
   uwire [31:0] prod, sum;

   logic [31:0] ac0, ac1;

   localparam   int last_step = 4;

   // Step counter: reset by start, then counts up and saturates at
   // last_step.
   always_ff @( posedge clk )
     if ( start ) step <= 0;
     else if ( step < last_step ) step <= step + 1;

   CW_fp_mult m1( .a(mul_a), .b(mul_b), .rnd(rnd), .z(prod), .status(mul_s));

   // sum = ac0 + ac1. Without this adder sum would be an undriven net.
   CW_fp_add a1( .a(ac0), .b(ac1), .rnd(rnd), .z(sum), .status(add_s));

   // Multiplier operands by step:  0: v0*v0,  1: v0*v1,  2 and later: v1*v1.
   assign mul_a = step < 2  ? v0 : v1;
   assign mul_b = step == 0 ? v0 : v1;

   always_ff @( posedge clk )
     begin

        // ac0 always holds the most recently computed product. From
        // step 3 onward that product is v1*v1, so ac0 stays constant.
        ac0 <= prod;

        // ac1 accumulates the first two products. There is intentionally
        // no default: in steps 3 and later ac1 holds its value.
        case ( step )
          0: ac1 <= 0;
          1: ac1 <= sum;
          2: ac1 <= sum;
        endcase

        if ( start ) ready <= 0; else if ( step == last_step-1 ) ready <= 1;

     end

   assign result = sum;

endmodule

// Return a pseudo-random real in the range [minv, maxv).
//
// {$random} is the 32-bit result of $random interpreted as unsigned, so
// dividing by 2.0**32 yields a fraction in [0,1) that scales the width of
// the requested interval.
function automatic real rand_real(real minv, real maxv);
   rand_real = minv + ( maxv - minv ) * ( real'({$random}) ) / 2.0**32;
endfunction

// Absolute value of a shortreal: values that compare less than zero are
// negated, every other value is returned unchanged.
function automatic shortreal fabs(shortreal val);
   if ( val < 0 ) return -val;
   return val;
endfunction

// Forward the clock and cycle count through a program block. The
// testbench waits on these copies rather than on the originals; see the
// note near the end of the testbench about waiting on the reactive clock.
program reactivate
  ( output uwire clk_reactive,
    output int   cycle_reactive,
    input uwire  clk,
    input var int cycle );

   assign clk_reactive   = clk,
          cycle_reactive = cycle;

endprogram

// Testbench for the sequential magnitude modules.
//
// For each test a pair of shortreal inputs is chosen, applied to every
// module under test (mut) together with a start pulse, and each module's
// output is compared against the output of prob1_functional. A summary
// line is printed for every module after the last test.
module testbench();

   typedef enum { MT_comb, MT_seq, MT_pipe } Module_Type;

   localparam wid = 32;
   localparam max_latency = 10;
   localparam int num_tests = 16;
   localparam   int nmuts = 10;
   int err[nmuts];

   // Per-module outputs, indexed by the idx recorded in pi.
   uwire [31:0]  mag[nmuts];
   uwire         ready[nmuts];
   shortreal   magr;
   shortreal vr[2];
   logic [31:0] v[2];
   logic [31:0] vp[2];
   logic        start;

   // Bookkeeping for one module under test.
   typedef struct
     {
      int idx;
      int err_count = 0;
      int ncyc = 0;
      Module_Type mt = MT_comb;
      logic [wid-1:0] sout = 'h111;
      int cyc_tot = 0;
      int latency = 0;
   } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * max_latency * 4;
   int cycle, cyc_start;
   bit done;
   logic clock;
   bit   use_others;

   logic      clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Register a sequential module (one with a ready output).
   task pi_seq(input int idx, input string name);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_seq;
   endtask

   // Register a pipelined module with ncyc stages.
   task pi_pipe(input int idx, input string name, input int ncyc);
      automatic string m = $sformatf("%s", name);
      pi[m].idx = idx; pi[m].mt = MT_pipe;
      pi[m].ncyc = ncyc;
   endtask

   // Clock generation and watchdog. The simulation ends when the tests
   // finish or when the cycle limit is exceeded, whichever comes first.
   initial begin
      clock = 0;
      cycle = 0;

      fork
         forever #10 begin
            // cycle is incremented on each falling edge: clock++ yields
            // the value of clock before it is toggled.
            cycle += clock++;
         end
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   prob1_functional mf( magr, vr[0], vr[1] );
   prob1_seq_sol m2( mag[1], ready[1], v[0],v[1], start, clock );
   initial begin pi_seq(1,"Seq. Sol"); end
   prob1_seq m2r( mag[2], ready[2], v[0],v[1], start, clock );
   initial begin pi_seq(2,"Seq."); end
   //  prob1_pipe m4( mag[3], vp, clock );
   //  initial begin pi_pipe(3,"Pipe",m4.nstages); m4.db = db; end

   // Input driver: shortly after each clock edge either present the new
   // test vector with start asserted, or de-assert start and put
   // cycle-dependent filler values on vp.
   initial begin

      while ( !done ) @( posedge clk_reactive ) #2

        if ( use_others ) begin

           vp = v;
           use_others = 0;
           start = 1;

        end else begin

           vp[0] = $shortrealtobits(shortreal'(cycle-cyc_start));
           vp[1] = cycle - cyc_start;
           start = 0;

        end
   end

   // Main test loop.
   initial begin

      automatic int tot_errors = 0;

      done = 0;
      use_others = 0;
      start = 0;

      @( posedge clk_reactive );

      for ( int i=0; i<num_tests; i++ ) begin

         automatic int awaiting = pi.num();

         cyc_start = cycle;

         if ( i < 4 ) begin

            // In the first four tests vector components are zero or one.
            //
            for ( int j=0; j<2; j++ ) vr[j] = i & 1 << j ? 1.0 : 0.0;

         end else begin

            // In other tests vector components are randomly chosen.
            //
            for ( int j=0; j<2; j++ ) vr[j] = rand_real(-10,+10);

         end

         for ( int j=0; j<2; j++ ) v[j] = $shortrealtobits(vr[j]);

         vp = v;
         use_others = 1;

         /// Collect Result (mag) From Each Module Under Test (mut)
         //
         foreach ( pi[muti] ) begin

            automatic string mut = muti;  // Informal name of module.
            automatic Info p = pi[mut];

            // Create a child thread to get response from current mut.
            // The parent thread, without delay, proceeds to join_none.
            //
            fork begin

               automatic int steps = pi[mut].ncyc;
               automatic int latency =
                 pi[mut].mt == MT_comb ? 1 :
                 pi[mut].mt == MT_seq ? 2 : steps;

               // Compute time at which result should be ready or
               // when to start examining a READY output.
               //
               automatic int eta = 1 + cyc_start + latency;

               pi[mut].latency = latency;

               //
               wait ( cycle_reactive == eta );

               // If this module has a READY output, wait for it.
               //
               if ( pi[mut].mt == MT_seq ) wait( ready[pi[mut].idx] );

               // Decrement count of the number of modules we are waiting for.
               //
               awaiting--;

               // Store the module MAG output, it will be checked later
               // for correctness.
               //
               pi[mut].sout = mag[pi[mut].idx];

               pi[mut].cyc_tot += cycle - cyc_start;

               // This thread ends execution here.
            end join_none;

         end

         // Wait until data collected from all modules under test.
         //
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] ) begin

            // Assign module output to a shortreal.
            //
            automatic shortreal mmagr = $bitstoshortreal(pi[mut].sout);
            //
            // Note: pi[mut].sout is type logic which is assumed to be
            // an unsigned integer. However, the contents is really an
            // IEEE 754 single-precision float (shortreal in
            // SystemVerilog) and so $bitstoshortreal is used so that
            // pi[mut].sout is copied bit-for-bit unchanged to mmagr.

            // Compute difference between module output and expected
            // output.  With FP small differences can be okay, they might
            // occur, for example, due to differences in the order of
            // operations.
            //
            automatic shortreal err_mag = fabs( mmagr - magr );
            automatic bit okay = err_mag < 1e-4;

            if ( !okay ) begin
               pi[mut].err_count++;
               if ( pi[mut].err_count < 5 )
                 $write("%s test #%0d vec (%.1f,%.1f) error: h'%8h  %7.4f != %7.4f (correct)\n",
                        mut, i, vr[1], vr[0],
                        pi[mut].sout, mmagr, magr);
            end
         end

         while ( {$random} & 1 == 1 ) @( posedge clk_reactive );
         //
         // Note: By waiting for reactive clock we can be sure that
         // modules under test have completed all work due to the
         // positive edge of the regular clk. Wait a random amount of
         // time in case any modules are only correct at some stride.

      end

      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].mt == MT_comb ? 1 : real'(pi[mut].cyc_tot) / num_tests);

      done = 1;

      $finish(2);

   end

endmodule