/// EE 4755 - Digital Design Using HDLs
//
//  Pipelining

//////////////////////////////////////////////////////////////////////////////
/// Throughput v. Latency v. Clock Period + Review

//
/// Review of Timing Definitions
//
// :Def: Critical Path
//   The longest path from launch to capture. Launch points are
//   typically module inputs and register outputs, capture points are
//   typically module outputs and register inputs.
//
// :Def: The Clock
//   A signal in a sequential circuit used to clock edge-triggered
//   registers. Typically a large digital circuit uses one clock,
//   connected to all its registers. (Distributing a clock to so many
//   places in a way that it arrives at all of the trigger inputs at
//   roughly the same time is an important design concern, but one
//   which is beyond the scope of this class.)
//
// :Def: Clock Period
//   The time from positive edge to positive edge (or some other
//   reference point) of the clock. The reciprocal of the clock period
//   is the clock frequency. The length of the critical path determines
//   the clock period. The clock period is set to the critical path,
//   plus the register setup time, plus a safety margin accounting for
//   the fact that the critical path length can vary with temperature,
//   manufacturing variation, and component age.

/// Throughput-Related Definitions
//
// :Def: Unit of Work
//   Something a design is supposed to compute. For example, for a
//   multiplier with inputs a and b the unit of work is a product a⨉b.
//
// :Def: Throughput
//   The number of units of work per unit time. In sequential circuits
//   the unit of time for throughput can be given in seconds or in
//   clock cycles. For example, 10 million multiplies per second, or
//   0.01 multiplies per clock cycle (implying that it takes 100
//   cycles to complete a multiply).
//
// :Def: Latency
//   The amount of time it takes to do one unit of work from start to
//   finish. In a combinational circuit the latency and critical path
//   are the same.
//   In a sequential circuit the latency is the product
//   of the clock period and the number of clock cycles it takes to
//   complete one unit of work.
//
//
//
/// Which is better, high throughput or low latency?:
//
//   High Throughput:  (Good for some)
//     + Hardware is being used efficiently.
//     - Usually costs more. (More hardware.)
//     - Higher Latency (but not by too much).
//     High throughput best achieved with pipelining.
//
//   Low Latency: (Good for some)
//     + The "result" arrives in a short amount of time.
//     - Lower throughput.
//     Low latency best achieved with combinational logic.
//
//   Impatient people prefer low latency.
//   Productive people prefer high throughput.
//
//   Jokes aside, in reality it depends on the situation.

//////////////////////////////////////////////////////////////////////////////
/// Pipelining Concept

/// Pipelining Concept <-- Very Important but Tricky Concept, Pay Attention!
//
//
// :Def: Pipelining
//   Performing an operation in *stages* on multiple data items.
//

// :Example:
//
// Output x has the value that input a had two cycles in the past.
//
//   w    Width of the data path in bits.
//   x    Registered output.
//   a    Data input, sampled at each positive edge of clk.
//   clk  Clock.
//
module very_simple_pipe
  #( int w = 16 )
   ( output logic [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   // Value that a had at the most recent positive clock edge.
   logic [w-1:0] a_prev;

   // Both registers are written with non-blocking assignments, so x
   // receives the value a_prev held before this edge.
   always_ff @( posedge clk ) begin
      a_prev <= a;
      x <= a_prev;
   end

endmodule

// :

// :Example:
//
// Pipeline that passes data through unchanged.
// Output x has the value that input a had nstages cycles in the past.
//
//   w        Width of the data path in bits.
//   nstages  Number of registers between a and x.
//   x        Contents of the last register, r[nstages-1].
//   a        Data input, sampled at each positive edge of clk.
//   clk      Clock.
//
module simple_pipe2
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin

      // Move each stage's value one position toward the output. All
      // assignments are non-blocking, so each stage captures the value
      // its predecessor held before this clock edge.
      for ( int s=nstages-1; s>0; s-- ) r[s] <= r[s-1];

      r[0] <= a; // Non-blocking assignment here, blocking in simple_pipe2_ba.

   end

   assign x = r[nstages-1];

endmodule
//
/// Important thing to notice:
//
//   At any moment the module holds the most recent nstages values of a.
// :

// :Example:
//
// The pipeline below, simple_pipe2_ba, is almost identical to the one above,
// simple_pipe2. The only difference is that here, r[0] is just a wire
// whereas in simple_pipe2 r[0] is a register. The difference is due
// to the way in which r[0] is assigned.
//
// Output x has the value that input a had nstages-1 cycles in the past.
//
//   w        Width of the data path in bits.
//   nstages  Number of elements in r; r[0] carries a, the rest are registers.
//   x        Contents of r[nstages-1].
//   a        Data input.
//   clk      Clock.
//
module simple_pipe2_ba
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin

      // The blocking assignment completes before the loop below runs,
      // so r[1] captures the value a has at this clock edge rather
      // than a value from the previous edge.
      r[0] = a; // Blocking assignment here, non-blocking in simple_pipe2.

      // The remaining stages shift with non-blocking assignments: each
      // gets the value its predecessor held before this clock edge.
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];

   end

   assign x = r[nstages-1];

endmodule

// :

// :Example:
//
// Compute a running average of data arriving one element per clock
// cycle. The pipeline holds recent values. A sum is computed by
// adding together the values in the pipeline stages each cycle.
//
/// Warning: Uses more adders than are necessary.
//  See pipe_r_avg2, further below, for a version that uses fewer adders.
//
//   w        Width of the data path in bits.
//   nstages  Number of values averaged: a itself plus nstages-1 registers.
//   x        Contents of r[nstages-1], the oldest value held.
//   avg      Sum of r[0] through r[nstages-1] divided by nstages
//            (integer division). Computed combinationally.
//   a        Data input.
//   clk      Clock.
//
module simple_pipe_avg
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     output logic [w-1:0] avg,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   // Element 0 is not a register, it is connected directly to the input.
   assign r[0] = a;

   // Elements 1 and up shift by one position at each positive clock edge.
   always_ff @( posedge clk ) begin
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];
   end

   assign x = r[nstages-1];

   // sum has w + $clog2(nstages) + 1 bits, which is enough to hold the
   // total of nstages values of w bits each without overflow.
   logic [w+$clog2(nstages):0] sum;

   always_comb begin

      // Add every element of r, including the unregistered r[0].
      sum = 0;
      for ( int i=0; i<nstages; i++ ) sum += r[i];

      avg = sum / nstages;

   end

endmodule

/// Inferred Hardware for simple_pipe_avg
//
//

// :Example:
//
// Compute a running average of data arriving one element per clock
// cycle. Do so using just one adder and one subtractor.
//
// Based on 2021 Final Exam Problem 1
//
// Parameters:
//   w          Width of each sample in bits.
//   n_samples  Number of most-recent samples averaged.
//
// Ports:
//   r_avg   Total of the n_samples most recently captured samples
//           divided by n_samples (integer division).
//   sample  Data input, captured at each positive edge of clk.
//   reset   Synchronous, active high. Clears the stored samples and the
//           running total so that the output is defined (zero) from
//           the start rather than depending on uninitialized state.
//   clk     Clock.
//
module pipe_r_avg2
  #( int w = 8, n_samples = 4 )
   ( output logic [w-1:0] r_avg,
     input uwire [w-1:0] sample,
     input uwire reset, clk );

   // The n_samples most recently captured samples, newest at index 0.
   logic [w-1:0] samples[n_samples];

   // Extra bits needed so that the total of n_samples values fits.
   parameter int wm = $clog2( n_samples );
   parameter int ws = w + wm;

   // Running total. Outside of reset, always equal to the sum of all
   // elements of samples.
   logic [ws-1:0] tot;

   always_ff @( posedge clk )
     if ( reset ) begin

        for ( int i=0; i<n_samples; i++ ) samples[i] <= 0;
        tot <= 0;

     end else begin

        samples[0] <= sample;
        for ( int i=1; i<n_samples; i++ ) samples[i] <= samples[i-1];

        // Remove the sample that is about to be shifted out and add the
        // sample that is about to be shifted in. The incoming sample,
        // not samples[0], must be added: samples[0] is already part of
        // tot, and adding it again while removing samples[n_samples-1]
        // would leave tot covering only n_samples-1 values.
        tot <= tot - samples[n_samples-1] + sample;

     end

   assign r_avg = tot / n_samples;

endmodule
//