// 111
 // Behavioral unsigned multiplier.
 //
 // Parameters:
 //   w     - width in bits of each operand.
 // Ports:
 //   prod  - 2*w-bit product of cand and plier.
 //   cand  - multiplicand.
 //   plier - multiplier.
 module mult_behav_1
   #( int w = 16 )
    ( output uwire [2*w-1:0] prod,
      input uwire [w-1:0] cand,
      input uwire [w-1:0] plier );

    // Zero-extend both operands to the product width so that the
    // multiplication is carried out on 2*w bits.
    uwire [2*w-1:0] cand_wide  = cand;
    uwire [2*w-1:0] plier_wide = plier;

    assign prod = cand_wide * plier_wide;

endmodule
 // Combinational multiplier built as a linear chain of adders.
 //
 // One adder is generated per multiplier bit. Adder k adds the
 // multiplicand shifted left by k positions when plier[k] is set,
 // and adds zero otherwise.
 //
 // Parameters:
 //   w     - width in bits of each operand.
 // Ports:
 //   prod  - 2*w-bit product.
 //   cand  - multiplicand.
 //   plier - multiplier.
 module mult_linear
   #( int w = 16 )
    ( output logic [2*w-1:0] prod,
      input uwire [w-1:0] cand, plier );

    localparam int wp = 2 * w;

    // running[k] is the sum of the partial products for bits 0..k-1.
    uwire [wp-1:0] running[w:0];
    assign running[0] = '0;

    for ( genvar k = 0;  k < w;  k++ ) begin : add_stage
       // Multiplicand extended to product width before it is shifted.
       uwire [wp-1:0] cand_wide = cand;
       uwire [wp-1:0] term = plier[k] ? cand_wide << k : '0;
       assign running[k+1] = running[k] + term;
    end

    assign prod = running[w];

endmodule
  
 // Recursive (tree-structured) combinational multiplier.
 //
 // Operand a is split into a low part and a high part; each part is
 // multiplied by b in a recursively instantiated mult_tree, and the two
 // partial products are combined with one adder. Recursion ends when a
 // is a single bit.
 //
 // Parameters:
 //   wa - width of a (must be at least 1).
 //   wb - width of b; defaults to wa.
 //   wp - width of the product; defaults to wa + wb.
 // Ports:
 //   prod - product a * b.
 //   a, b - operands.
 module mult_tree
  #( int wa = 16, int wb = wa, int wp = wa + wb )
   ( output uwire [wp-1:0] prod,
     input uwire [wa-1:0] a,
     input uwire [wb-1:0] b );

   if ( wa == 1 ) begin

      // Base case: a one-bit operand selects either b or zero.
      assign prod = a ? b : 0;

   end else begin

      // Split a into a low part of wlo bits and a high part of whi bits.
      // The two widths differ when wa is odd, so each half gets its own
      // width; instantiating both halves with wa/2 would drop the top
      // bit of the high part.
      localparam int wlo = wa / 2;
      localparam int whi = wa - wlo;

      uwire [wb+wlo-1:0] prod_lo;
      uwire [wb+whi-1:0] prod_hi;

      mult_tree #(wlo,wb) mlo( prod_lo, a[wlo-1:0],  b );
      mult_tree #(whi,wb) mhi( prod_hi, a[wa-1:wlo], b );

      // The high partial product is weighted by 2**wlo.
      assign prod = prod_lo + ( prod_hi << wlo );

   end
endmodule
// mult_linear with registered inputs and a registered output.
//
// The operands are captured on a positive clock edge, multiplied
// combinationally by a mult_linear instance, and the product is captured
// on a following positive edge.
//
// Parameters:
//   w     - width in bits of each operand.
// Ports:
//   prod  - registered 2*w-bit product.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock; all registers update on its positive edge.
module mult_linear_clk
  #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] cand, plier,
     input uwire clk );

   logic [w-1:0] cand_reg, plier_reg;
   uwire [2*w-1:0] prod_comb;

   // Input registers.
   always_ff @( posedge clk ) { cand_reg, plier_reg } <= { cand, plier };

   // Combinational multiplier operating on the registered operands.
   mult_linear #(w) ml( prod_comb, cand_reg, plier_reg );

   // Output register.
   always_ff @( posedge clk ) prod <= prod_comb;

endmodule
 // Sequential multiplier: examines one multiplier bit per clock cycle.
 //
 // A free-running counter, pos, selects the multiplier bit handled in
 // the current cycle. Whenever pos is zero the accumulated value is
 // copied to prod and accumulation restarts, so prod is updated once
 // every w cycles.
 //
 // Parameters:
 //   w     - width in bits of each operand; must be a power of 2.
 // Ports:
 //   prod  - 2*w-bit product, updated when pos wraps to zero.
 //   cand  - multiplicand.
 //   plier - multiplier.
 //   clk   - clock; state updates on its positive edge.
 module mult_seq #( int w = 16 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] cand, plier,
     input uwire clk);

   localparam int wlog = $clog2(w);

   // Elaboration-time sanity check, hidden from synthesis.
   // cadence translate_off
   initial if ( w != 1 << wlog ) $fatal(1,"Size must be a power of 2.");
   // cadence translate_on

   bit [wlog-1:0] pos;
   logic [2*w-1:0] accum;

   always_ff @( posedge clk ) begin

      // Working copy of the accumulator for this cycle. It is the only
      // value written with blocking assignments; the registers are
      // written with nonblocking assignments so that other processes
      // reading prod on the same edge do not race with this one.
      logic [2*w-1:0] accum_next;
      accum_next = accum;

      if ( pos == 0 ) begin
         // A full pass over the multiplier has completed: publish the
         // result and start a new accumulation.
         prod <= accum;
         accum_next = 0;
      end

      if ( plier[pos] ) accum_next += cand << pos;

      accum <= accum_next;
      pos <= pos + 1;

   end
endmodule
 
 
    // :Def: Pipelining
// :Example:
//
// Minimal two-register pipeline: the value on a appears on x after two
// positive clock edges.
//
// Parameters:
//   w   - data width in bits.
// Ports:
//   x   - registered output.
//   a   - data input.
//   clk - clock.
module very_simple_pipe
  #( int w = 16 )
   ( output logic [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r;

   // Nonblocking assignments: x receives the value r held before the edge.
   always_ff @( posedge clk ) begin
      r <= a;
      x <= r;
   end

endmodule
// :Example:
//
// Pipeline of nstages registers: the value on a appears on x after
// nstages positive clock edges.
//
// Parameters:
//   w       - data width in bits.
//   nstages - number of registers in the chain.
// Ports:
//   x   - contents of the last register.
//   a   - data input.
//   clk - clock.
module simple_pipe2
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin
      r[0] <= a;
      // Every register takes the value its predecessor held before the edge.
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];
   end

   assign x = r[nstages-1];

endmodule
   
// :Example:
//
// Variant of simple_pipe2 in which r[0] is written with a BLOCKING
// assignment. Because r[0] is updated before the loop runs, r[1] receives
// the new value of a on the same clock edge, so a reaches x after
// nstages-1 edges rather than nstages.
// NOTE(review): the "_ba" suffix suggests the blocking assignment is a
// deliberate illustration, so it is kept as is -- confirm.
//
// Parameters:
//   w       - data width in bits.
//   nstages - number of elements in the register array.
// Ports:
//   x   - contents of the last register.
//   a   - data input.
//   clk - clock.
module simple_pipe2_ba
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin
      // Blocking: takes effect immediately, before r[1] <= r[0] is evaluated.
      r[0] = a;
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1];
   end

   assign x = r[nstages-1];

endmodule
// :Example:
//
// Register chain that also reports the average of the values it holds.
// r[0] is the input itself (not registered); r[1] .. r[nstages-1] are
// registers, so a reaches x after nstages-1 positive clock edges.
//
// Parameters:
//   w       - data width in bits.
//   nstages - number of elements in r, including the unregistered r[0].
// Ports:
//   x   - contents of the last element of r.
//   avg - integer average (truncating division) of all elements of r.
//   a   - data input.
//   clk - clock.
module simple_pipe_avg
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     output logic [w-1:0] avg,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   assign r[0] = a;

   // One always_ff per register, indexed by a genvar. With a constant
   // index each procedure writes exactly one element of r, so the
   // registers do not conflict with the continuous assignment to r[0].
   // (A single always_ff using a loop variable as the index is treated
   // as writing all of r, which may not be mixed with "assign r[0]".)
   for ( genvar i = 1;  i < nstages;  i++ ) begin : stage_reg
      always_ff @( posedge clk ) r[i] <= r[i-1];
   end

   assign x = r[nstages-1];

   // Wide enough to hold the sum of nstages w-bit values.
   logic [w+$clog2(nstages):0] sum;

   always_comb begin
      sum = 0;
      for ( int i=0; i<nstages; i++ ) sum += r[i];
      avg = sum / nstages;
   end

endmodule
 
// :Example:
//
// Pipeline in which every stage adds one to the value passing through,
// so after nstages positive clock edges x holds a + nstages (modulo 2**w).
//
// Parameters:
//   w       - data width in bits.
//   nstages - number of stages.
// Ports:
//   x   - contents of the last register.
//   a   - data input.
//   clk - clock.
module simple_pipe_add1
  #( int w = 16, int nstages = 4 )
   ( output uwire [w-1:0] x,
     input uwire [w-1:0] a,
     input uwire clk );

   logic [w-1:0] r[nstages];

   always_ff @( posedge clk ) begin
      r[0] <= a + 1;
      for ( int i=1; i<nstages; i++ ) r[i] <= r[i-1] + 1;
   end

   assign x = r[nstages-1];

endmodule
 
 // :Def: Throughput
 // :Def: Latency
 // :Def: Critical Path
 // :Def: Pipeline Latch
 // :Example:
 //
 // Pipelined multiplier handling one multiplier bit per stage.
 //
 // Operands are captured into pipeline position 0. Stage s adds the
 // partial product for multiplier bit s to the running sum and passes
 // the sum, together with copies of both operands, to position s+1.
 // There are w stages.
 //
 // Parameters:
 //   w     - width in bits of each operand.
 //   m     - not used by this module.
 // Ports:
 //   prod  - 2*w-bit product taken from the last pipeline position.
 //   cand  - multiplicand.
 //   plier - multiplier.
 //   clk   - clock.
 module mult_pipe1
  #( int w = 16, int m = 1 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input uwire clk);

   localparam int stages = w;

   // Pipeline registers; index 0 holds the freshly captured inputs.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_cand[0:stages];
   logic [w-1:0] pl_plier[0:stages];

   always_ff @( posedge clk ) begin

      pl_accum[0] <= 0;
      pl_cand[0] <= cand;
      pl_plier[0] <= plier;

      for ( int stage=0; stage<stages; stage++ ) begin

         // Partial product for multiplier bit number "stage".
         logic [2*w-1:0] pp;
         pp = pl_plier[stage][stage] ? pl_cand[stage] << stage : 0;

         pl_accum[stage+1] <= pl_accum[stage] + pp;
         pl_cand[stage+1]  <= pl_cand[stage];
         pl_plier[stage+1] <= pl_plier[stage];

      end
   end

   assign prod = pl_accum[stages];

endmodule
 
  
 // :Example:
// Pipelined multiplier handling m multiplier bits per stage.
//
// The number of stages is w/m rounded up. Stage s adds the partial
// products for multiplier bits s*m .. s*m+m-1 (those that lie below w)
// to the running sum, and passes the sum and both operands on to the
// next pipeline position.
//
// Parameters:
//   w     - width in bits of each operand.
//   m     - multiplier bits handled per stage.
// Ports:
//   prod  - 2*w-bit product taken from the last pipeline position.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock.
module mult_pipe #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input clk);

   localparam int stages = ( w + m - 1 ) / m;

   // Pipeline registers; index 0 holds the freshly captured inputs.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_cand[0:stages];
   logic [w-1:0] pl_plier[0:stages];

   always_ff @( posedge clk ) begin

      pl_cand[0] <= cand;
      pl_plier[0] <= plier;
      pl_accum[0] <= 0;

      for ( int s = 0;  s < stages;  s++ ) begin

         logic [2*w-1:0] running;

         // Operands simply move down the pipeline.
         pl_plier[s+1] <= pl_plier[s];
         pl_cand[s+1] <= pl_cand[s];

         // Add this stage's group of partial products.
         running = pl_accum[s];
         for ( int j = 0;  j < m;  j++ ) begin
            int k;
            k = s * m + j;
            if ( k < w && pl_plier[s][k] )
              running += pl_cand[s] << k;
         end

         pl_accum[s+1] <= running;

      end
   end

   assign prod = pl_accum[stages];

endmodule
// :Example:
// Pipelined multiplier handling m multiplier bits per stage, written
// with a multiply operator per stage.
//
// The multiplier is stored as nstages digits of m bits each. Stage s
// multiplies digit s by the multiplicand, shifts the result left by s*m
// positions, and adds it to the running sum.
//
// Parameters:
//   w     - width in bits of each operand.
//   m     - multiplier bits handled per stage.
// Ports:
//   prod  - 2*w-bit product taken from the last pipeline position.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock.
module mult_pipe_2 #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input clk);

   localparam int nstages = ( w + m - 1 ) / m;

   // Pipeline registers; index 0 holds the freshly captured inputs.
   logic [2*w-1:0] pl_accum[0:nstages];
   logic [w-1:0] pl_cand[0:nstages];
   // Multiplier viewed as nstages digits of m bits.
   logic [nstages-1:0][m-1:0] pl_plier[0:nstages];

   always_ff @( posedge clk ) begin

      pl_plier[0] <= plier;
      pl_cand[0] <= cand;
      pl_accum[0] <= 0;

      for ( int s = 0;  s < nstages;  s++ ) begin

         // Digit s of the multiplier times the multiplicand, weighted
         // by the digit's position.
         logic [2*w-1:0] term;
         term = ( pl_plier[s][s] * pl_cand[s] ) << ( s * m );

         pl_accum[s+1] <= pl_accum[s] + term;
         pl_plier[s+1] <= pl_plier[s];
         pl_cand[s+1] <= pl_cand[s];

      end
   end

   assign prod = pl_accum[nstages];

endmodule
 
// Pipelined multiplier built from one-bit, three-input additions.
//
// The pipeline has 2*w stages. Each stage holds w sum bits and w carry
// bits, and each stage writes one bit (bit number "stage") of the
// product. Parameter m is not used inside the module body.
//
// Ports:
//   prod  - 2*w-bit product taken from the last pipeline position.
//   cand, plier - w-bit operands.
//   clk   - clock; all pipeline registers update on its positive edge.
module mult_pipe_wfront #( int w = 16, int m = 1 )
   ( output logic [2*w-1:0] prod,
     input uwire [w-1:0] cand, plier,
     input uwire clk );
   localparam int stages = 2*w;
   // Pipeline registers; index 0 holds the freshly captured inputs.
      logic [2*w-1:0] pl_prod[0:stages];
   logic [w-1:0] pl_cand[0:stages];
   logic [w-1:0] pl_plier[0:stages];
   logic [w-1:0] pl_sum[0:stages];
   logic [w-1:0] pl_carry[0:stages];
   always_ff @( posedge clk ) begin
      pl_cand[0] <= cand;
      pl_plier[0] <= plier;
      pl_sum[0] <= 0;
      pl_carry[0] <= 0;
      pl_prod[0] <= 0;
      for ( int stage = 0; stage < stages; stage++ ) begin
         logic [2*w-1:0] prod_next;
         logic [w-1:0] sum_next, carry_next;
         logic [1:0] sc;
         // Start from the product bits produced by earlier stages.
         prod_next = pl_prod[stage];
         for ( int i=0; i<w; i++ ) begin
            logic a, b, c;
            // a: incoming sum bit from position i-1 (zero in stage 0 and
            //    at position 0).
            // b: incoming carry bit at position i (zero in stage 0).
            // c: AND of plier bit w-1-i and cand bit "stage"; zero once
            //    stage >= w.
            a = stage && i ? pl_sum[stage][i-1] : 0;
            b = stage      ? pl_carry[stage][i] : 0;
            c = stage < w
              ? pl_plier[stage][w-1-i] && pl_cand[stage][stage] : 0;
            // Two-bit sum of the three one-bit inputs.
            sc = a + b + c;
            { carry_next[i], sum_next[i] } = sc;
         end
         // sc still holds the value from the last iteration (i = w-1);
         // its low bit becomes product bit number "stage".
         prod_next[stage] = sc[0];
                           pl_cand[stage+1]  <= pl_cand[stage];
         pl_plier[stage+1] <= pl_plier[stage];
         pl_sum[stage+1] <= sum_next;
         pl_carry[stage+1] <= carry_next;
         pl_prod[stage+1] <= prod_next;
      end
   end
   assign prod = pl_prod[stages];
endmodule
  
// Combinational logic for one stage of a pipelined multiplier.
//
// Adds to accum_in the partial products for multiplier bits
// stage*m .. stage*m+m-1, skipping bit positions at or beyond w.
//
// Parameters:
//   w     - width in bits of each operand.
//   m     - multiplier bits handled by this stage.
//   stage - position of this stage in the pipeline, starting at 0.
// Ports:
//   accum_out - accum_in plus this stage's partial products.
//   accum_in  - running sum from the preceding stage.
//   cand      - multiplicand.
//   plier     - multiplier.
module pipe_stage #( int w = 16, int m = 2, int stage = 0 )
   ( output logic [2*w-1:0] accum_out,
     input logic [2*w-1:0] accum_in,
     input logic [w-1:0] cand, plier );

   // Lowest multiplier bit position handled by this stage.
   localparam int first_bit = stage * m;

   always_comb begin

      logic [2*w-1:0] total;
      total = accum_in;

      for ( int j = 0;  j < m;  j++ ) begin
         int k;
         k = first_bit + j;
         if ( k < w && plier[k] ) total += cand << k;
      end

      accum_out = total;

   end
endmodule
// Pipelined multiplier assembled from pipe_stage instances.
//
// One pipe_stage is generated per pipeline stage. Its output is
// registered, together with copies of the operands, for use by the next
// stage. The number of stages is w/m rounded up.
//
// Parameters:
//   w     - width in bits of each operand.
//   m     - multiplier bits handled per stage.
// Ports:
//   prod  - 2*w-bit product taken from the last pipeline position.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock.
module mult_pipe_c_cpa #( int w = 16, int m = 2 )
   ( output logic [2*w-1:0] prod,
     input logic [w-1:0] cand, plier,
     input clk);

   localparam int stages = ( w + m - 1 ) / m;

   // Pipeline registers; index 0 holds the freshly captured inputs.
   logic [2*w-1:0] pl_accum[0:stages];
   logic [w-1:0] pl_cand[0:stages];
   logic [w-1:0] pl_plier[0:stages];

   // Capture the inputs and clear the running sum.
   always_ff @( posedge clk ) begin
      pl_cand[0] <= cand;
      pl_plier[0] <= plier;
      pl_accum[0] <= 0;
   end

   for ( genvar s = 0;  s < stages;  s++ ) begin : stage_gen

      // Combinational result of stage s.
      uwire [2*w-1:0] accum_next;

      pipe_stage #(w, m, s) our_stage
        ( accum_next, pl_accum[s], pl_cand[s], pl_plier[s] );

      // Registers between stage s and stage s+1.
      always_ff @( posedge clk ) begin
         pl_accum[s+1] <= accum_next;
         pl_cand[s+1] <= pl_cand[s];
         pl_plier[s+1] <= pl_plier[s];
      end

   end

   assign prod = pl_accum[stages];

endmodule
`include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_csa.v"
// Combinational logic for one stage of a pipelined multiplier in which
// the running sum is carried as a pair of values (a, b).
//
// A chain of m CW_csa instances is generated. Link k takes the pair
// produced by link k-1 (or the stage inputs, for k = 0) plus the partial
// product for multiplier bit stage*m+k, and produces a new pair.
// NOTE(review): CW_csa is defined in the included ChipWare file and is
// presumably a carry-save adder whose carry and sum outputs together
// represent a + b + c + ci -- confirm against the ChipWare documentation.
//
// Parameters:
//   wid   - width in bits of each operand.
//   m     - multiplier bits handled by this stage.
//   stage - position of this stage in the pipeline, starting at 0.
// Ports:
//   accum_out_a, accum_out_b - pair produced by the last link.
//   accum_in_a, accum_in_b   - pair from the preceding stage.
//   cand, plier              - operands.
module pipe_stage_csa #( int wid = 16, int m = 2, int stage = 0 )
   ( output uwire [2*wid-1:0] accum_out_a, accum_out_b,
     input uwire [2*wid-1:0] accum_in_a, accum_in_b,
     input uwire [wid-1:0] cand, plier );

   localparam int wp = 2 * wid;

   // Element -1 is the stage input; element k is the output of link k.
   uwire [wp-1:0] accum_a[m-1:-1];
   uwire [wp-1:0] accum_b[m-1:-1];
   // Carry-out of each link; not used further in this module.
   uwire co[-1:m-1];

   assign accum_a[-1] = accum_in_a;
   assign accum_b[-1] = accum_in_b;

   for ( genvar k = 0;  k < m;  k++ ) begin : link

      // Multiplier bit position handled by this link.
      localparam int bit_idx = stage * m + k;

      // Partial product; zero for positions at or beyond wid.
      uwire [wp-1:0] term;
      assign term = ( bit_idx < wid && plier[bit_idx] ) ? cand << bit_idx : 0;

      CW_csa #(wp) csa
        ( .a(accum_a[k-1]), .b(accum_b[k-1]), .c(term), .ci(1'b0),
          .carry(accum_a[k]), .sum(accum_b[k]), .co(co[k]) );

   end

   assign accum_out_a = accum_a[m-1];
   assign accum_out_b = accum_b[m-1];

endmodule
// Pipelined multiplier assembled from pipe_stage_csa instances.
//
// The running sum travels down the pipeline as a pair of values (a, b).
// The pair is combined by a single addition only at the output. The
// number of stages is wid/m rounded up.
//
// Parameters:
//   wid   - width in bits of each operand.
//   m     - multiplier bits handled per stage.
// Ports:
//   prod  - 2*wid-bit product.
//   cand  - multiplicand.
//   plier - multiplier.
//   clk   - clock.
module mult_pipe_c_csa #( int wid = 16, int m = 2 )
   ( output uwire [2*wid-1:0] prod,
     input uwire [wid-1:0] cand, plier,
     input uwire clk);

   localparam int stages = ( wid + m - 1 ) / m;

   // Pipeline registers; index 0 holds the freshly captured inputs.
   logic [2*wid-1:0] pl_accum_a[0:stages];
   logic [2*wid-1:0] pl_accum_b[0:stages];
   logic [wid-1:0] pl_cand[0:stages];
   logic [wid-1:0] pl_plier[0:stages];

   // Capture the inputs and clear both halves of the running sum.
   always_ff @( posedge clk ) begin
      pl_cand[0] <= cand;
      pl_plier[0] <= plier;
      pl_accum_a[0] <= 0;
      pl_accum_b[0] <= 0;
   end

   for ( genvar s = 0;  s < stages;  s++ ) begin : stage_gen

      // Combinational result of stage s.
      uwire [2*wid-1:0] next_a, next_b;

      pipe_stage_csa #(wid, m, s) our_stage
        ( next_a, next_b, pl_accum_a[s], pl_accum_b[s],
          pl_cand[s], pl_plier[s] );

      // Registers between stage s and stage s+1.
      always_ff @( posedge clk ) begin
         pl_accum_a[s+1] <= next_a;
         pl_accum_b[s+1] <= next_b;
         pl_cand[s+1] <= pl_cand[s];
         pl_plier[s+1] <= pl_plier[s];
      end

   end

   // Final addition merges the pair into the product.
   assign prod = pl_accum_a[stages] + pl_accum_b[stages];

endmodule
// cadence translate_off
// Forwards a clock and a cycle count through a program block, so that
// the forwarded copies are driven from within a program.
//
// Ports:
//   clk_reactive   - copy of clk.
//   cycle_reactive - copy of cycle.
//   clk            - clock to forward.
//   cycle          - cycle count to forward.
program reactivate
   ( output uwire clk_reactive,
     output int cycle_reactive,
     input uwire clk,
     input int cycle );

   assign cycle_reactive = cycle;
   assign clk_reactive = clk;

endprogram
// Testbench for the multiplier modules.
//
// Instantiates several multipliers, applies num_tests operand pairs, and
// compares the output sampled from each multiplier with prod[0], the
// output of the behavioral multiplier. A summary line per multiplier is
// printed at the end.
module testbench;
   // Operand width used for every multiplier under test.
   localparam int wid = 16;
   localparam int num_tests = 1000;
   // Number of elements in the prod array, one per multiplier slot.
   localparam int NUM_MULT = 20;
   // NOTE(review): err_limit is not referenced in this module; the
   // error-message limit below uses the literal 5 instead.
   localparam int err_limit = 7;
   // Set by the test loop; cleared by the process that drives
   // plierp / candp once it has loaded the test operands.
   bit use_others;
   // plier / cand feed the behavioral multiplier; plierp / candp feed
   // the clocked multipliers.
   logic [wid-1:0] plier, cand;
   logic [wid-1:0] plierp, candp;
   logic [2*wid-1:0] prod[NUM_MULT];
   // Per-multiplier bookkeeping:
   //   idx       - element of prod connected to the multiplier.
   //   err_count - number of mismatches seen.
   //   seq, pipe, wf, deg - used below to compute the expected latency.
   //   sout      - output value sampled for the current test.
   //   cyc_tot   - total cycles accumulated over all tests.
   //   latency   - latency computed for the most recent test.
   typedef struct { int idx; int err_count = 0;
                    bit seq = 0; bit pipe = 0; bit wf = 0; int deg = 1;
                    logic [2*wid-1:0] sout = 'h111; int cyc_tot = 0;
                    int latency = 0;
                    } Info;
   // Multiplier records, keyed by a descriptive name.
   Info pi[string];
   localparam int cycle_limit = num_tests * wid * 8;
   int cycle;
   bit done;
   logic clock;
   // Copies of clock and cycle forwarded through the reactivate program.
   logic      clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);
   // Clock generator and simulation-end control.
   initial begin
      clock = 0;
      cycle = 0;
      fork
         // Every 10 time units toggle clock and add its value to cycle,
         // so cycle advances once per full clock period.
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;
      // Reached when done is set or the cycle limit is hit.
      $finish();
   end
   // Drives the operands of the clocked multipliers. One time unit after
   // each positive edge of clk_reactive: when use_others is set, load the
   // current test operands and clear use_others; otherwise load filler
   // values (the cycle count and 256).
   initial begin
      while ( !done ) @( posedge clk_reactive ) #1
         if ( use_others ) begin
            plierp = plier;
            candp = cand;
            use_others = 0;
         end else begin
            plierp = cycle;
            candp = 256;
         end
   end
   // The three tasks below create (or update) the record named
   // "<name> Deg <deg>" and set its flags:
   //   pi_seq   - seq
   //   pi_pipe  - seq and pipe
   //   pi_wpipe - seq, pipe and wf
   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1;
   endtask
   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1;
   endtask
   task pi_wpipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1; pi[m].wf = 1;
   endtask
   // Multipliers under test. Each instance drives its own element of
   // prod and is registered in pi with that same index.
   mult_behav_1 #(wid) mb1(prod[0], plier, cand);
   initial pi["Behavioral"].idx = 0;
   mult_pipe1    #(wid) ms18(prod[18], plierp, candp, clock);
   initial pi_pipe(18,"Pipelined Simple",1);
   mult_pipe    #(wid,4) ms54(prod[7], plierp, candp, clock);
   initial pi_pipe(7,"Pipelined",ms54.m);
   mult_pipe    #(wid,3) ms53(prod[8], plierp, candp, clock);
   initial pi_pipe(8,"Pipelined",ms53.m);
   mult_pipe_wfront #(wid,1) ms4(prod[4], plierp, candp, clock);
   initial pi_wpipe(4,"Pipelined WF",ms4.m);
   mult_pipe_2    #(wid,4) ms17(prod[17], plierp, candp, clock);
   initial pi_pipe(17,"Pipelined 2",ms17.m);
   mult_pipe_2    #(wid,3) ms16(prod[16], plierp, candp, clock);
   initial pi_pipe(16,"Pipelined 2",ms16.m);
   mult_pipe_c_cpa #(wid,4) pgam4(prod[12], plierp, candp, clock);
   initial pi_pipe(12,"Pipelined Comb CPA",pgam4.m);
   mult_pipe_c_cpa    #(wid,3) pgam13(prod[13], plierp, candp, clock);
   initial pi_pipe(13,"Pipelined Comb CPA",pgam13.m);
   mult_pipe_c_csa    #(wid,3) pgam2(prod[2], plierp, candp, clock);
   initial pi_pipe(2,"Pipelined CSA",pgam2.m);
   mult_pipe_c_csa    #(wid,4) pgam3(prod[3], plierp, candp, clock);
   initial pi_pipe(3,"Pipelined CSA",pgam3.m);
   // Fixed operand values consumed, two per test, before random
   // operands are used.
            int tests[$] = {1,1, 1,2,  1,32,  2,1,  32, 1};
   // Main test sequence.
   initial begin
      done = 0;
      use_others = 0;
      // The #0 delay lets the registration initial blocks above run
      // first. The block then checks that no two records share a prod
      // index and lists the indices that are not used.
                  #0 begin
         string index_used[NUM_MULT];
         automatic int n_unused = 0, n_reused = 0;
         foreach ( pi[ mut ] ) begin
            automatic int idx = pi[mut].idx;
            if ( index_used[idx].len() )
              begin
                 $write("*** Index %0d used by %s and %s.\n",
                        idx, index_used[idx], mut );
                 n_reused++;
              end
            index_used[idx] = mut;
         end
         $write("Unused positions: ");
         foreach ( index_used[idx] )
           if ( index_used[idx].len() == 0 )
             $write("%s%0d", n_unused++ ? ", " : "", idx);
         $write("%s.\n",n_unused ? "" : "none -- all used");
         if ( n_reused )
           $fatal(2, "\nFound %0d re-used indices. Aborting simulation.\n\n",
                  n_reused);
      end
      @( posedge clk_reactive );
      for ( int i=0; i<num_tests; i++ ) begin
         automatic int cyc_start = cycle;
         // Number of multipliers whose output has not been sampled yet.
         automatic int awaiting = pi.num();
         // Take operands from the fixed list while it lasts, then use
         // random values.
                           plier = tests.size() ? tests.pop_front() : $random();
         cand = tests.size() ? tests.pop_front() : $random();
                           plierp = plier;
         candp = cand;
         use_others = 1;
         // Start one process per multiplier. Each waits until the cycle
         // at which that multiplier's result is expected, then samples
         // the multiplier's output.
         foreach ( pi[muti] ) begin
            // p is not referenced in the code below.
            automatic string mut = muti;             automatic Info p = pi[mut];
            fork begin
               automatic int arrival_late = 1;
               // Number of steps: wid / deg, rounded up.
               automatic int steps = ( wid + pi[mut].deg - 1 ) / pi[mut].deg;
               // Expected latency in cycles:
               //   seq clear         : 1
               //   seq set, no pipe  : 2 * steps
               //   pipe and wf set   : arrival_late + 2 * steps
               //   pipe set, no wf   : arrival_late + steps
               automatic int latency
                 = !pi[mut].seq ? 1 :
                   !pi[mut].pipe ? 2 * steps :
                    pi[mut].wf ? arrival_late + 2 * steps :
                    arrival_late + steps;
               // Cycle number at which the output is sampled.
               automatic int eta = 1 + cyc_start + latency;
               pi[mut].latency = latency;
               wait ( cycle_reactive == eta );
               awaiting--;
               pi[mut].sout = prod[pi[mut].idx];
               pi[mut].cyc_tot += cycle - cyc_start;
            end join_none;
         end
         // Continue once every multiplier's output has been sampled.
         wait ( awaiting == 0 );
         // Compare each sampled output with the behavioral result. Only
         // the first four mismatches per multiplier are reported.
                           foreach ( pi[ mut ] )
           if ( prod[0] !== pi[mut].sout ) begin
              pi[mut].err_count++;
              if ( pi[mut].err_count < 5 )
                $write
                  ("%-25s wrong result: %0d * %0d:  0x%0h != 0x%0h (correct)\n",
                   mut, plier, cand, pi[mut].sout, prod[0]);
           end
         @( posedge clk_reactive );
      end
      // Summary: error count and average cycles per test for every
      // multiplier (reported as 1 for records with seq clear).
      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);
      done = 1;
      $finish(2);
   end
endmodule
// cadence translate_on