/// LSU EE 4755 -- Fall 2023 -- Digital Design Using HDLs
//
///  Verilog Notes -- Combinational v. Sequential v. Pipelined

/// Summary
//
//  An illustration of three implementation methods, combinational,
//  sequential, and pipelined. An integer multiplier is implemented
//  using these three methods. Verilog code and inferred hardware are
//  shown.
//
 /// Combinational.
//
//   Defining Feature: No registers and so no clocks.
//   - Lowest latency, because there are no intervening registers.
//   - Hard to use with clocked systems if latency > clock period.
//
 /// Sequential (Not Pipelined)
//
//   Defining Features:
//   - Registers hold state.
//   - Calculation proceeds over several cycles.
//   - At least one piece of hardware used in more than one cycle.
//   Characteristics
//   - Lowest cost (in most cases).
//
 /// Pipelined, n Stages
//
//   Defining Features
//   - Registers hold state and divide pipeline into stages.
//   - Can start a new calculation every cycle (or every I cycles).
//   - Calculation advances by one stage each cycle.
//   - Latency is n cycles (more or less).
//   - Throughput is one calculation / cylce.
//   Characteristics
//   - Minimum latency is calculation time + n register delays.
//   - Highest throughput.
//   - Highest cost. 

 /// Multipliers in this File
//
//   Perform unsigned integer multiplication.
//
//   Are intended to illustrate combinational/sequential/pipeline tradeoffs.
//
//   Are slower than real life multipliers. 
//     For faster multipliers see the "wfront" multipliers in mult-seq.v.

/// Contents
//
 /// - Review of longhand binary multiplication.
 /// - A Combinational Multiplier
 /// - A Simple Sequential Multiplier
 /// - A Streamlined Sequential Multiplier
 /// - A Pipelined Multiplier.


////////////////////////////////////////////////////////////////////////////////
/// Binary Multiplication Algorithm

 /// Long Hand Procedure Review
//
//  Multiply 5 times 12 in binary:
//
//     0101  cand  -- Multiplicand
//     1100  plier -- Multiplier
//     """"
//     0000  Partial Product
//    0000
//   0101
//  0101
//  """""""
//  0111100  prod  -- Product
//
//  OED Citation
//    The multiplycand, which must alwaies stand aboue.
//    1594 T. Blundeville Exercises i. iv. f. 6 


`default_nettype none


//////////////////////////////////////////////////////////////////////////////
/// Combinational Multiplier


module mult_combinational
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] plier, cand );

   always_comb begin

      logic [2*w-1:0] accum;
      logic [w-1:0] pp;

      accum = 0;

      for ( int pos=0; pos<w; pos++ ) begin

         pp = plier[pos] ?  cand  :  0;
         accum = ( pp << pos ) + accum;

      end

      prod = accum;
   end

endmodule


 /// Wrapper
//
//   The mult_combinational_clk allows mult_combinational to be
//   tested alongside sequential multipliers.
//
module mult_combinational_clk
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] cand, plier,
     input uwire clk);

   uwire [2*w-1:0] p;
   logic [w-1:0] candcpy, pliercpy;

   mult_combinational #(w) ml(p, candcpy, pliercpy);

   always_ff @( posedge clk ) begin
      candcpy <= cand;
      pliercpy <= plier;
      prod <= p;
   end

endmodule



//////////////////////////////////////////////////////////////////////////////
/// Sequential Multipliers


 /// Simple Sequential Multiplier
//
// This sequential multiplier is a straightforward adaptation of
// the combinational multiplier.  It is more costly than
// it needs to be, that is fixed in the mult_seq_stream multiplier.
//
module mult_seq #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     output logic ready,
     input uwire [w-1:0] cand, plier,
     input uwire start, clk);

   localparam int wlog = $clog2(w+1);

   logic [wlog-1:0] pos;
   logic [2*w-1:0] accum;
   logic [w-1:0] pp;

   always_ff @( posedge clk )
      if ( start ) begin

         ready <= 0;
         accum <= 0;
         pos <= 0;

      end else if ( pos != w ) begin

         pp = plier[pos] ?  cand  :  0;
         accum <= ( pp << pos ) + accum;
         pos <= pos + 1;

      end else begin

         ready <= 1;
         prod <= accum;

      end

endmodule
// 



 /// Streamlined Sequential Multiplier
//
 // Techniques For Lowering Cost
//
//   Instead of shifting the multiplier, shift the accumulator.
//   Use part of the accumulator to store the multiplicand.
//
module mult_seq_stream #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     output logic ready,
     input uwire [w-1:0] cand, plier,
     input uwire start, clk);

   localparam int wlog = $clog2(w+1);

   logic [wlog-1:0] pos;
   logic [2*w-1:0] accum;
   logic [w-1:0] pp;

   always_ff @( posedge clk )
      if ( start ) begin

         ready <= 0;
         accum <= plier;
         pos <= 0;

      end else if ( pos != w ) begin

         //  plier[pos]
         pp = accum[0]   ?  cand  :  0;
         accum <= ( pp << w-1 ) + ( accum >> 1 );
         pos <= pos + 1;

      end else begin

         ready <= 1;
         prod <= accum;

      end

endmodule
// 


//////////////////////////////////////////////////////////////////////////////
/// Pipelined Multiplier
//
//   Computes 1 partial product per stage.
//
module mult_pipe1
  #( int w = 16, int m = 1 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] cand, plier,
     input uwire clk);

   localparam int stages = w;

   // Note: pl is for pipeline latch.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_cand[0:stages], pp;
   logic [w-1:0] pl_plier[0:stages];

   always_ff @( posedge clk ) begin

      pl_accum[0] <= 0;
      pl_cand[0]  <= cand;
      pl_plier[0] <= plier;

      for ( int stage=0; stage<stages; stage++ ) begin

         automatic int pos = stage; // For comparison with other multipliers.

         pp = pl_plier[stage][pos] ?  pl_cand[stage]  :  0;

         pl_accum[stage+1] <= ( pp << pos ) + pl_accum[stage];
         pl_cand[stage+1]  <= pl_cand[stage];
         pl_plier[stage+1] <= pl_plier[stage];

      end

   end

   assign prod = pl_accum[stages];

endmodule

 /// Inferred Hardware for mult_pipe1 Without and With Labels.
// 
// 


 /// mult_pipe1
 /// Cost Analysis : Delay Analysis
//
//   Note: Cost is entire module, delay is one stage.
//
// Registers: (Cost per bit: 7;  Delay per bit: 6 )
//  pl_accum:         ( w + 1 ) * 2w * 7 = 14w^2 + 14w
//  plier, cand:  2 * ( w + 1 ) *  w * 7 = 14w^2 + 14w
//  Total: 28w^2 + 28w
//
//  pl_cand[stage] << stage :  No hardware since stage is a constant.
//  pp = pl_plier[stage][stage] ? pl_cand[stage] << stage : 0;
//    Per stage: a w-bit mux with one input of 0, synthesizes into w AND gates.
//      w^2 : 1
//  accum = pl_accum[stage] + pp;
//    Per stage: a 2w-bit ripple adder:
//     w 9 2 w = 18w^2 : 4w
//
//  Total cost:
//    28w^2 + 28w + w^2 + 18w^2  =  47 w^2 + 28w
//
//  Critical Path
//    pp mux -> adder:  1 + 4w
//  Register Delay: 6
//
//  Clock Period
//   1 + 4w + 6  =  4w + 7
//  Latency
//   (w+1) (4w+7) = 4w^2 + 11w + 7


//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

module mult_behav_1
  #(int w = 16)
   (output uwire [2*w-1:0] prod, input uwire [w-1:0] cand, plier);

   assign prod = cand * plier;
endmodule


// cadence translate_off

program reactivate
   (output uwire clk_reactive, output int cycle_reactive,
    input uwire clk, input var int cycle);
   assign clk_reactive = clk;
   assign cycle_reactive = cycle;
endprogram

module testbench;

   localparam int w = 8;
   localparam int num_tests = 1000;
   localparam int NUM_MULT = 20;
   localparam int err_limit = 7;

   bit use_others;
   logic start;
   logic [w-1:0] plier, cand;
   logic [w-1:0] plierp, candp;
   logic [2*w-1:0] prod[NUM_MULT];
   logic ready[NUM_MULT];

   typedef struct
     { int idx; int err_count = 0;
       bit seq = 0;
       bit pipe = 0; bit wf = 0; bit st_rdy = 0;
       int deg = 1;
       logic [2*w-1:0] sout = 'h111; int cyc_tot = 0;
       int latency = 0;
       } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * w * 8;
   int cycle;
   bit done;
   logic clock;

   logic      clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   initial begin
      clock = 0;
      cycle = 0;

      fork
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   initial begin

      while ( !done ) @( posedge clk_reactive ) #1

         if ( use_others ) begin

            start = 1;
            plierp = plier;
            candp = cand;
            use_others = 0;

         end else begin

            start = 0;
            plierp = cycle;
            candp = 256;

         end
   end

   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1;
   endtask

   task pi_seqs(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].st_rdy = 1;
   endtask

   task pi_seqw(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].wf = 1;
   endtask

   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1;
   endtask

   mult_behav_1 #(w) mb1(prod[0], plier, cand);
   initial pi["Behavioral"].idx = 0;

   mult_seq #(w) ms4(prod[4], ready[4], plier, cand, start, clock);
   initial pi_seqs(4,"Seq SR Simple",1);

   mult_seq_stream #(w) ms5(prod[5], ready[5], plier, cand, start, clock);
   initial pi_seqs(5,"Seq SR Streamlined",1);

   mult_combinational_clk  #(w) mlc1(prod[15], plier, cand, clock);
   initial begin
      automatic string m = $sformatf("Combinational Clock");
      pi[m].idx = 15; pi[m].seq = 0;
   end

   mult_pipe1    #(w) ms18(prod[18], plierp, candp, clock);
   initial pi_pipe(18,"Pipelined Simple",1);


   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   //
   int tests[$] = {1,32,  1,1, 1,2,  1,32,  32, 1};

   initial begin

      done = 0;
      use_others = 0;
      start = 1;

      // Make sure that no two multipliers are assigned the same index ..
      // .. and show unused indices.
      #0 begin
         string index_used[NUM_MULT];
         automatic int n_unused = 0, n_reused = 0;
         foreach ( pi[ mut ] ) begin
            automatic int idx = pi[mut].idx;
            if ( index_used[idx].len() )
              begin
                 $write("*** Index %0d used by %s and %s.\n",
                        idx, index_used[idx], mut );
                 n_reused++;
              end
            index_used[idx] = mut;
         end
         $write("Unused positions: ");
         foreach ( index_used[idx] )
           if ( index_used[idx].len() == 0 )
             $write("%s%0d", n_unused++ ? ", " : "", idx);
         $write("%s.\n",n_unused ? "" : "none -- all used");
         if ( n_reused )
           $fatal(2, "\nFound %0d re-used indices. Aborting simulation.\n\n",
                  n_reused);
      end

      @( posedge clk_reactive );

      plier = 1;
      cand = 1;
      use_others = 1;

      wait ( cycle_reactive >= 2*w );
      @( posedge clk_reactive );

      for ( int i=0; i<num_tests; i++ ) begin
         automatic int cyc_start = cycle;
         automatic int awaiting = pi.num();

         // Set multiplier and multiplicand values for non-piped units.
         //
         plier = tests.size() ? tests.pop_front() : $random();
         cand = tests.size() ? tests.pop_front() : $random();
         start = 1;

         // Set multiplier and multiplicand values for piped units.
         //
         plierp = plier;
         candp = cand;
         use_others = 1;

         foreach ( pi[muti] ) begin
            automatic string mut = muti; // Bug workaround?
            automatic Info p = pi[mut];
            fork begin
               automatic int arrival_late = 1;
               automatic int steps = ( w + p.deg - 1 ) / p.deg;
               automatic int latency
                 = !p.seq ? 2 :
                   p.wf ? 5 * steps :
                     !p.pipe ? 2 * steps : arrival_late+steps;
               automatic int eta = 1 + cyc_start + latency;
               automatic int idx = p.idx;
               pi[mut].latency = latency;
               if ( p.st_rdy ) begin
                  if ( !ready[idx] ) begin
                     $write("Expected %s to be ready\n",mut);
                     $finish(2);
                  end
                  wait ( !ready[idx] );
                  wait ( ready[idx] );
               end else begin
                  wait ( cycle_reactive == eta );
               end
               awaiting--;
               pi[mut].sout = prod[idx];
               pi[mut].cyc_tot += cycle - cyc_start;
            end join_none;
         end
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] )
           if ( prod[0] !== pi[mut].sout ) begin
              pi[mut].err_count++;
              if ( pi[mut].err_count < 5 )
                $write
                  ("%-25s wrong result: %0d * %0d:  0x%0h != 0x%0h (correct)\n",
                   mut, plier, cand, pi[mut].sout, prod[0]);
           end

         @( posedge clk_reactive );

      end

      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on