////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2019 Homework 6 -- SOLUTION
//

 /// Assignment  https://www.ece.lsu.edu/koppel/v/2019/hw06.pdf


`default_nettype none

//////////////////////////////////////////////////////////////////////////////
///  Problem 1
//
 ///    Complete add_accum so that it accumulates a sum.
//
//     [✔] Put your solution in add_accum. No other modules should
//         be modified. (Except the testbench, to help debug.)
//
//     [✔] add_accum must use an add_pipe module to compute the sum.
//
//     [✔] Make sure that the testbench does not report errors.
//     [✔] Module must be synthesizable. Use command: genus -files syn.tcl
//
//     [✔] As always, avoid costly, slow, and confusing code.
//     [✔] As always, don't assume parameters will be at their default values.


module add_accum
  #( int w = 21, n_stages = 3 )
   ( output logic [w-1:0] sum,
     output logic sum_valid,
     input uwire [w-1:0] ai,
     input uwire ai_valid, reset, clk );

   /// Accumulator built around a single pipelined adder (add_pipe).
   //
   //  ai / ai_valid : value to accumulate; used only when ai_valid is 1.
   //  reset         : synchronous; clears sum and all bookkeeping state.
   //  sum           : running total, meaningful when sum_valid is 1.
   //  sum_valid     : 1 when no bit of the occupancy register is set.

   // Occupancy shift register.  A 1 is shifted in at bit 0 whenever an
   // addition is launched and then moves up one position per clock.
   // Bit n_stages-1 marks the cycle in which the adder output is usable.
   // The register has one bit beyond that (bit n_stages), which also
   // holds sum_valid low for the cycle after the adder output is taken.
   //
   logic [n_stages:0] busy;

   // When 1, the register sum holds a value that still has to be used
   // (either kept, or paired with another value in the adder).
   //
   logic sum_pending;

   // Adder output and the flag indicating that it carries a result.
   //
   uwire [w-1:0] adder_out;
   uwire adder_out_ready = busy[n_stages-1];

   // Count the values available this cycle: the input, the contents
   // of sum, and the adder output.
   //
   //   0: nothing happens.
   //   1: the single value is put in (or remains in) sum.
   //   2: both values are sent to the adder.
   //   3: ai and the adder output are sent to the adder; sum is kept.
   //
   uwire [1:0] n_ready = ai_valid + sum_pending + adder_out_ready;

   // A new addition is launched whenever at least two values are ready.
   //
   uwire launch = n_ready >= 2;

   // Operand selection.  ai takes the first operand slot when present,
   // the adder output takes the second slot when present, and sum
   // fills whichever slot is left.
   //
   uwire [w-1:0] op_a = ai_valid        ? ai        : sum;
   uwire [w-1:0] op_b = adder_out_ready ? adder_out : sum;

   add_pipe #(w,n_stages) add_p0( adder_out, op_a, op_b, clk );

   // sum is reported valid only when every occupancy bit is 0.
   //
   assign sum_valid = ~|busy;

   always_ff @( posedge clk ) begin

      if ( reset ) begin

         busy <= 0;
         sum_pending <= 0;
         sum <= 0;

      end else begin

         // Move every occupancy bit up one stage and record whether an
         // addition enters the adder in this cycle.
         //
         busy <= ( busy << 1 ) | launch;

         // With an odd number of ready values (1 or 3) one of them is
         // left over and lives in sum; with 2 both go into the adder.
         //
         sum_pending <= n_ready[0];

         // A lone value that is not already in sum is captured by sum.
         //
         if ( !sum_pending && n_ready == 1 )
           sum <= adder_out_ready ? adder_out : ai;

      end

   end

endmodule

`ifdef xxx
Synthesizing at effort level "high"

Module Name                              Area   Delay   Delay
                                               Actual  Target
add_pipe_w24_n_stages1                  29928  10.174  90.000 ns
add_pipe_w24_n_stages2                  47043   5.428  90.000 ns
add_pipe_w24_n_stages3                  64159   3.701  90.000 ns
add_pipe_w24_n_stages4                  81275   2.837  90.000 ns
add_pipe_w24_n_stages6                 115506   1.973  90.000 ns

add_accum_w24_n_stages1                 87556  11.449  90.000 ns
add_accum_w24_n_stages2                105305   6.349  90.000 ns
add_accum_w24_n_stages3                123530   4.560  90.000 ns
add_accum_w24_n_stages4                141598   3.696  90.000 ns
add_accum_w24_n_stages6                177545   3.061  90.000 ns

add_pipe_w24_n_stages1                  84351   1.114   0.100 ns
add_pipe_w24_n_stages2                  83959   1.249   0.100 ns
add_pipe_w24_n_stages3                 103383   1.105   0.100 ns
add_pipe_w24_n_stages4                 117358   1.001   0.100 ns
add_pipe_w24_n_stages6                 150854   0.896   0.100 ns

add_accum_w24_n_stages1                150738   2.023   0.100 ns
add_accum_w24_n_stages2                149544   1.757   0.100 ns
add_accum_w24_n_stages3                183994   1.514   0.100 ns
add_accum_w24_n_stages4                191611   1.444   0.100 ns
add_accum_w24_n_stages6                224175   1.332   0.100 ns
Normal exit.
`endif


module add_pipe
  #( int w = 21, n_stages = 3 )
   ( output uwire [w-1:0] sum,
     input uwire [w-1:0] a, b,
     input uwire clk );

   /// Pipelined w-bit adder.
   //
   //  The operands are split into n_stages slices.  Stage s adds slice s
   //  of each operand plus the carry produced by stage s-1.  Operands
   //  sampled at a rising edge of clk yield their sum on the output
   //  after n_stages rising edges (counting the sampling edge).

   // Slice width, ceil( w / n_stages ), and the operand width rounded
   // up to a whole number of slices.
   //
   localparam int slice_w = ( w + n_stages - 1 ) / n_stages;
   localparam int padded_w = n_stages * slice_w;

   // Per-stage storage.  Index 0 is the pipeline input, index s+1 is
   // what stage s writes.
   //
   logic [padded_w-1:0] opa[n_stages+1], opb[n_stages+1], acc[n_stages+1];
   logic cy[n_stages+1];

   always_ff @( posedge clk ) begin

      // The index-0 entries are written with blocking assignments so
      // that stage 0 (below) uses the values of a and b present at
      // this clock edge.
      //
      opa[0] = a;
      opb[0] = b;
      cy[0] = 0;

      for ( int stg=0; stg<n_stages; stg++ ) begin

         // Sum of the low slices and carry in; bit slice_w is carry out.
         //
         automatic logic [slice_w:0] part =
           opa[stg][slice_w-1:0] +
             opb[stg][slice_w-1:0] + cy[stg];

         // Drop the slice just consumed so that the next stage finds
         // its slice in the low bits.
         //
         opa[stg+1] <= opa[stg] >> slice_w;
         opb[stg+1] <= opb[stg] >> slice_w;

         cy[stg+1] <= part[slice_w];

         // Insert the new slice at the top of the running result and
         // shift earlier slices down by one slice position.
         //
         acc[stg+1] <=
           { part[slice_w-1:0], acc[stg] } >> slice_w;

      end

   end

   assign sum = acc[ n_stages ][w-1:0];

endmodule



// cadence translate_off


// Forward clk and cycle, unchanged, through a program block.  The
// testbench samples the design on clk_reactive rather than on clk.
// (IEEE 1800 schedules program-block activity in the reactive region.)
//
program reactivate
   ( output uwire clk_reactive,
     output int cycle_reactive,
     input uwire clk,
     input var int cycle );

   assign cycle_reactive = cycle;
   assign clk_reactive = clk;

endprogram

module testbench;

   /// Top-level testbench: runs one testbench_n per pipeline depth,
   /// one after another, and prints the grand total of errors.

   // Pipeline depths to test.
   //
   localparam int n_stages[] = { 1, 2, 3, 5, 6 };

   // Number of entries in n_stages, written out by hand.
   //
   localparam int nw = 5; // Cadence, please fix this.

   // Stop immediately if the hand-written count is stale.
   //
   initial begin
      if ( n_stages.size() != nw )
        $fatal(1,"Constant nw should be %0d.\n",n_stages.size() );
   end

   // Error total.  Each testbench_n instance adds its own count to
   // this variable through the hierarchical name testbench.t_errs.
   //
   int t_errs;
   initial t_errs = 0;
   final $write("Total number of errors: %0d\n",t_errs);

   // Chain of start/done signals.  Instance k starts when element k-1
   // is true and drives element k with its done output.
   //
   uwire start_done[nw:-1];
   assign start_done[-1] = 1;  // The first instance may start right away.

   // One testbench_n per entry of n_stages.
   //
   for ( genvar k=0; k<nw; k++ )
     testbench_n #(n_stages[k]) t2
       ( .done( start_done[k] ), .tstart( start_done[k-1] ) );

endmodule


/// Testbench for one add_accum instance with an n_stages-deep adder.
//
//  tstart : testing begins once this input is true.
//  done   : set to 1 when all series have run or the cycle limit is hit.
//
//  The stimulus process applies n_tests series of random values, each
//  series starting with a reset.  A checker process compares sum with a
//  reference total whenever sum_valid is 1 and tracks whether the
//  result arrives earlier or later than allowed.
//
module testbench_n
  #( int n_stages = 3 )
   ( output logic done, input uwire tstart );

   localparam int n_tests = 10000;  // Number of series to run.
   localparam int w = 30;           // Width of the accumulator under test.

   localparam int a_in_max = 42;    // Largest random input value.
   localparam int cyc_max = 1 << 30; // Sentinel cycle: "nothing is due."

   // Latency bounds, in cycles, measured from the cycle of the most
   // recent valid input.
   //
   // Upper bound when sum_valid was 1 at the time the input was applied.
   localparam int lat_limit_empty = n_stages + 2;
   // Lower bound, applied to every input of a series but the first.
   localparam int lat_min_empty = n_stages;
   // Upper bound when sum_valid was 0 at the time the input was applied.
   localparam int lat_limit_full = 2 + (1+$clog2(n_stages)) * ( n_stages + 1 );

   bit clk;
   int cycle, cycle_limit;
   logic clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clk,cycle);

   // Record of the reset and inputs of the current series, printed with
   // error messages.
   //
   string event_trace;

   // Clock generator and watchdog.
   //
   initial begin
      clk = 0;
      cycle = 0;

      done = 0;
      cycle_limit = cyc_max;
      wait( tstart );

      fork
         // Toggle clk every time unit; cycle is incremented on the
         // toggle at which clk was 1, that is, at each falling edge.
         while ( !done ) #1 cycle += clk++;
         // Watchdog: fires when cycle reaches cycle_limit, which the
         // stimulus process keeps moving forward.
         wait( cycle >= cycle_limit )
           $write("Exit from clock loop at cycle %0d, limit %0d.  %s\n %s\n",
                  cycle, cycle_limit, "** CYCLE LIMIT EXCEEDED **",
                  event_trace);
      join_any;

      done = 1;
   end

   uwire [w-1:0] sum;
   uwire sum_valid;
   logic [w-1:0] a;
   logic a_valid, reset;

   add_accum #(w,n_stages) fpa(sum, sum_valid, a, a_valid, reset, clk);

   int rsum;                 // Reference (expected) sum of current series.
   bit tests_start;          // Set once the initial reset has been applied.
   int series_idx, value_idx, series_n_vals;
   int n_errs, n_underdue_errs, n_overdue_errs, n_tests_done;
   int sum_due_cyc_earliest, sum_due_cyc, n_correct;
   int last_a_cyc;           // Cycle at which the latest input was applied.
   int latency_sum, latency_sum_n;
   bit error_val_issued, error_late_issued;

   // Summary report; also adds this instance's errors to the total
   // kept in the top-level testbench.
   //
   initial wait ( done ) begin
      automatic int not_done = n_tests - series_idx;
      $write("Done with %0d-stage tests, %0d series.\n Correct, %0d.   Errors: %0d not done, %0d val, %0d/%0d early/late.\n",
             n_stages, series_idx,
             n_correct, not_done, n_errs, n_underdue_errs, n_overdue_errs );
      $write("For %0d stages average latency %.2f cycles.\n",
             n_stages,
             real'(latency_sum) / ( latency_sum_n ? latency_sum_n : 1 ) );
      testbench.t_errs += n_errs + n_underdue_errs + n_overdue_errs + not_done;
   end

   // Checker: examines the module outputs at each rising edge of
   // clk_reactive.
   //
   initial begin

      wait( tests_start );

      while ( !done ) @( posedge clk_reactive ) begin

         if ( sum_valid ) begin

            // A result is pending if a due cycle has been set.
            automatic bit pending = sum_due_cyc < cyc_max;

            if ( pending ) begin
               n_tests_done++;
               sum_due_cyc = cyc_max;
               if ( sum === rsum ) n_correct++;
               latency_sum += cycle - last_a_cyc;
               latency_sum_n++;
               if ( cycle < sum_due_cyc_earliest ) begin

                  n_underdue_errs++;
                  if ( n_underdue_errs < 5 ) begin
                     // Report the observed latency and the minimum
                     // latency that the check above enforces.
                     $write
                       ("At cyc %0d, value ready too soon, %0d, cyc. (Min cyc %0d.)\n",
                        cycle, cycle - last_a_cyc, lat_min_empty
                        );
                     if ( event_trace != "" ) $write(" %s\n",event_trace);
                  end

               end
            end

            // Count at most one value error per applied input.
            if ( !error_val_issued && sum !== rsum ) begin
               error_val_issued = 1;
               n_errs++;
               if ( n_errs < 5 ) begin
                  $write("At cyc %0d, wrong sum, %0d != %0d (correct)\n",
                         cycle, sum, rsum);
                  if ( event_trace != "" ) $write(" %s\n",event_trace);
               end
            end

         end else if ( sum_due_cyc <= cycle ) begin

            // sum_valid is 0 at or past the due cycle: count it once.
            if ( !error_late_issued ) begin
               error_late_issued = 1;
               n_overdue_errs++;
               sum_due_cyc = cyc_max;
               if ( n_overdue_errs < 5 ) begin
                  $write("At cycle %0d, sum overdue.\n",cycle);
                  if ( event_trace != "" ) $write(" %s\n",event_trace);
               end
            end
         end

      end

   end

   // Stimulus: drives reset, a, and a_valid at falling edges of clk.
   //
   initial begin

      automatic int seed = 4755;
      automatic int series_sparsity = 0;
      rsum = 0;
      n_errs = 0;
      latency_sum_n = 0;
      latency_sum = 0;
      error_val_issued = 0;
      error_late_issued = 1;  // No overdue reports until a series starts.
      series_idx = 0;
      value_idx = 0;
      series_n_vals = 0;
      n_overdue_errs = 0;
      n_underdue_errs = 0;
      sum_due_cyc = cyc_max;
      sum_due_cyc_earliest = 0;
      n_tests_done = 0;
      n_correct = 0;
      event_trace = "";

      wait( tstart );
      $write("Starting tests for %0d-stage pipeline.\n",n_stages);

      // Initial reset, then wait for the module to report a valid sum.
      @( negedge clk );
      reset = 1;
      event_trace = $sformatf("R(%0d)",cycle);
      a_valid = 0;
      a = 0;
      @( negedge clk );
      cycle_limit = cycle + n_stages * 2;
      tests_start = 1;
      reset = 0;
      @( negedge clk );
      wait( sum_valid );

      while ( series_idx < n_tests ) begin

         @( negedge clk );

         a = $dist_uniform( seed, 1, a_in_max );

         if ( value_idx >= series_n_vals ) begin

            // All values of the series have been applied.  Once the
            // sum is valid, reset the module and set up the next series.

            a_valid = 0;

            if ( sum_valid ) begin

               series_idx++;
               value_idx = 0;
               event_trace = $sformatf("R(%0d)",cycle);
               reset = 1;
               a_valid = 0;
               rsum = 0;
               series_n_vals = $dist_uniform( seed, 1, 10 );
               series_sparsity = series_idx % 6;
               sum_due_cyc = cycle + 1;
               sum_due_cyc_earliest = cycle;
               error_val_issued = 0;
               error_late_issued = 0;
               cycle_limit = cycle + 1;
            end

         end else begin

            // Within a series.  With sparsity 0 a value is applied in
            // every cycle; otherwise it is applied only when a draw
            // from 0..series_sparsity comes up 0.

            reset = 0;
            a_valid = series_sparsity == 0
              || $dist_uniform( seed, 0, series_sparsity ) == 0;
            cycle_limit = cycle + lat_limit_full;

         end

         if ( a_valid ) begin
            // Update the reference sum and the window in which the
            // new result is expected.
            value_idx++;
            event_trace = {event_trace,$sformatf("+%0d(%0d)",a,cycle)};
            error_val_issued = 0;
            error_late_issued = 0;
            rsum += a;
            last_a_cyc = cycle;
            sum_due_cyc = cycle +
              ( sum_valid ? lat_limit_empty : lat_limit_full );
            sum_due_cyc_earliest =
              cycle + ( value_idx > 1 ? lat_min_empty : 0 );
         end

      end

      done = 1;

   end

endmodule

// cadence translate_on