/// EE 4755 - Digital Design Using HDLs
//
//  Pipelining


//////////////////////////////////////////////////////////////////////////////
/// Throughput v. Latency v. Clock Period + Review
//
 /// Review of Timing Definitions
//
// :Def: Critical Path
//  The longest path from launch to capture. Launch points are
//  typically module inputs and register outputs, capture points are
//  typically module outputs and register inputs.
//
// :Def: The Clock
//  A signal in a sequential circuit used to clock edge-triggered
//  registers. Typically a large digital circuit uses one clock,
//  connected to all its registers. (Distributing a clock to so many
//  places in a way that it arrives at all of the trigger inputs at
//  roughly the same time is an important design concern, but one
//  which is beyond the scope of this class.)
//
// :Def: Clock Period
//  The time from positive edge to positive edge (or some other
//  reference point) of the clock. The reciprocal of the clock period
//  is the clock frequency. The length of the critical path determines
//  the clock period. The clock period is set to the critical path,
//  plus the register setup time, plus a safety margin accounting for
//  the fact that the critical path length can vary with temperature,
//  manufacturing variation, and component age.

 /// Throughput-Related Definitions 
//
// :Def: Unit of Work
//  Something a design is supposed to compute. For example, for a
//  multiplier with inputs a and b the unit of work is a product a⨉b.
//
// :Def: Throughput
//  The number of units of work per unit time. In sequential circuits
//  the unit of time for throughput can be given in seconds or in
//  clock cycles. For example, 10 million multiplies per second, or
//  0.01 multiplies per clock cycle (implying that, on average, one
//  multiply completes every 100 cycles).
//
// :Def: Latency
//  The amount of time it takes to do one unit of work from start to
//  finish. In a combinational circuit the latency and critical path
//  are the same. In a sequential circuit the latency is the product
//  of the clock period and the number of clock cycles it takes to
//  complete one unit of work.
//
//
//
 /// Which is better, high throughput or low latency?:
//
//   High Throughput: (Good for some)
//     + Hardware is being used efficiently.
//     - Usually costs more. (More hardware.)
//     - Higher Latency (but not by too much).
//     High throughput best achieved with pipelining.
//
//   Low Latency: (Good for some)
//     + The "result" arrives in a short amount of time.
//     - Lower throughput.
//     Low latency best achieved with combinational logic.
//
//   Impatient people prefer low latency.
//   Productive people prefer high throughput.
//
//   Jokes aside, in reality it depends on the situation.


//////////////////////////////////////////////////////////////////////////////
/// Pipelining Concept

 /// Pipelining Concept    <-- Very Important but Tricky Concept, Pay Attention!
 //
 //
 //  :Def: Pipelining
 //   Performing an operation in *stages* on multiple data items.
 //

// :Example:
//
// Output x has the value that input a had two cycles in the past.
//
module very_simple_pipe
  #( int w = 16 )
   ( output logic [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   // First pipeline latch: after a positive edge it holds the value
   // that a had at that edge.
   logic [w-1:0] stage1;

   // Both registers update on the same positive edge. The assignments
   // are non-blocking, so x receives the value stage1 held *before*
   // the edge. The result is that x trails a by two clock cycles.
   always_ff @( posedge clk ) begin
      x <= stage1;
      stage1 <= a;
   end

endmodule

// :


// :Example:
//
// Pipeline that passes data through unchanged.
// Output x has the value that input a had nstages cycles in the past.
//
module simple_pipe2
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   // One w-bit register per stage. r[0] is nearest the input and
   // r[nstages-1] feeds the output.
   logic [w-1:0] r[nstages];

   // The module output is simply the contents of the last stage.
   assign x = r[nstages-1];

   always_ff @( posedge clk ) begin

      // Advance every stage by one position. Because the assignments
      // are non-blocking each stage captures the value its predecessor
      // held before the clock edge, so statement order does not matter.
      for ( int s=nstages-1; s>0; s-- ) r[s] <= r[s-1];

      // Non-blocking assignment here (making r[0] a register);
      // simple_pipe2_ba uses a blocking assignment instead.
      r[0] <= a;

   end

endmodule
//
 /// Important thing to notice:
 //
 //  At any moment the module holds the most recent nstages values of a.


// :



// :Example:
//
// The pipeline below, simple_pipe2_ba, is almost identical to the one above,
// simple_pipe2. The only difference is that here, r[0] is just a wire
// whereas in simple_pipe2 r[0] is a register. The difference is due
// to the way in which r[0] is assigned.
//
// Output x has the value that input a had nstages-1 cycles in the past.
//
module simple_pipe2_ba
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   // Storage for the stages. Element 0 is written with a blocking
   // assignment below; elements 1 .. nstages-1 are written with
   // non-blocking assignments.
   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin

      // The blocking assignment updates r[0] immediately, before the
      // loop below is executed. Statement order matters here: this
      // line must precede the loop.
      r[0] = a; // Blocking assignment here, non-blocking in simple_pipe2.

      // For i=1 the right-hand side, r[0], has already been set to a
      // by the statement above, so r[1] captures the current value of
      // a at this clock edge. The remaining stages capture the value
      // their predecessor held before the edge.
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];

   end

   // The output is the contents of the last stage.
   // NOTE(review): with nstages == 1 this reads r[0], which is only
   // updated inside the clocked block above -- confirm that
   // nstages >= 2 is the intended range.
   assign x = r[nstages-1];

endmodule

// :



// :Example:
//
// Compute a running average of data arriving one element per clock
// cycle. The pipeline holds recent values. A sum is computed by
// adding together the values in the pipeline stages each cycle.
//
 /// Warning: Uses more adders than are necessary.
//   See pipe_r_avg2, further below, for a version that uses fewer adders.
//
module simple_pipe_avg
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     output logic [w-1:0] avg,
     input uwire [w-1:0] a,
     input uwire clk );

   // Values being averaged. r[0] is the current input (a wire, not a
   // register); r[1] .. r[nstages-1] are registers holding the values
   // a had 1 .. nstages-1 cycles in the past.
   logic [w-1:0] r[nstages];

   assign r[0] = a;

   // One clocked process per registered stage. A generate loop is
   // used, rather than a procedural for loop inside a single always_ff,
   // so that each process writes a statically indexed element, r[i].
   // With a procedural loop index the always_ff is considered to write
   // all of r, which conflicts with the continuous assignment to r[0]
   // above and is rejected by tools that enforce that rule strictly.
   for ( genvar i=1; i<nstages; i++ ) begin : stage
      always_ff @( posedge clk ) r[i] <= r[i-1];
   end

   // Oldest value held in the pipeline.
   assign x = r[nstages-1];

   // Wide enough to hold the sum of nstages w-bit values.
   logic [w+$clog2(nstages):0] sum;

   // Add up every element of r (nstages-1 adders) and divide by the
   // number of elements. The quotient is truncated to an integer.
   always_comb begin

      sum = 0;
      for ( int i=0; i<nstages; i++ ) sum += r[i];
      avg = sum / nstages;

   end

endmodule

 /// Inferred Hardware for simple_pipe_avg

// 

// 


// :Example:
//
// Compute a running average of data arriving one element per clock
// cycle. Do so using just one adder and one subtractor.
//
// Based on 2021 Final Exam Problem 1
//
module pipe_r_avg2
  #( int w = 8, n_samples = 4 )
   ( output logic [w-1:0] r_avg,
     input uwire [w-1:0] sample,
     input uwire reset, clk );

   // The n_samples most recently clocked-in samples. samples[0] is the
   // newest, samples[n_samples-1] is the oldest.
   logic [w-1:0] samples[n_samples];

   // tot must hold the sum of n_samples w-bit values, which needs
   // clog2(n_samples) bits more than a single sample.
   localparam int wm = $clog2( n_samples );
   localparam int ws = w + wm;
   logic [ws-1:0] tot;

   always_ff @( posedge clk ) begin

      if ( reset ) begin

         // Synchronous reset: empty the history and the running total
         // together so that tot == sum of samples[] holds from the
         // first cycle after reset. Without this tot would start out
         // unknown and, since it is computed from its own previous
         // value, would never become known.
         for ( int i=0; i<n_samples; i++ ) samples[i] <= '0;
         tot <= '0;

      end else begin

         // Shift the new sample into the history.
         samples[0] <= sample;
         for ( int i=1; i<n_samples; i++ ) samples[i] <= samples[i-1];

         // Update the total with one subtractor and one adder: remove
         // the sample that is being shifted out and add the sample
         // that is being shifted in. The incoming value, sample, is
         // added (not samples[0]) so that each sample is counted for
         // exactly the n_samples cycles during which it is held in
         // samples[].
         tot <= tot - samples[n_samples-1] + sample;

      end

   end

   // Average of the stored samples, truncated to an integer.
   assign r_avg = tot / n_samples;

endmodule

//