/// EE 4755 - Digital Design Using HDLs
//
//  Classroom demo code.

// Time-stamp: <14 November 2016, 11:52:48 CST, koppel@cyc.ece.lsu.edu>


//////////////////////////////////////////////////////////////////////////////
/// Behavioral Multiplier


// Reference multiplier described with the * operator.
//
//   wid:    Width, in bits, of each operand.
//   prod:   2*wid-bit product of plier and cand.
//   plier:  Multiplier operand.
//   cand:   Multiplicand operand.
//
module mult_behav_1
  #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand );

   // Combinational: prod follows the operands with no clock.
   always_comb prod = plier * cand;

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Linear Multiplier

 /// Simple Adder, Don't Modify
// Adder described behaviorally with the + operator.
//
//   w:     Width, in bits, of the operands and of the sum.
//   s:     Sum a + b. It is w bits wide, so a carry out of the most
//          significant bit is discarded.
//   a, b:  Operands. Note that bits are numbered w down to 1, not w-1 down to 0.
//
module carry_prop_adder#(int w=16)(output [w:1] s, input [w:1] a,b);
   assign s = a + b;
endmodule


// Combinational multiplier built from a chain of wid adders.
//
// Adder i adds partial product i to the running sum produced by
// adder i-1, so the sum passes through every adder in turn.
//
//   wid:    Width, in bits, of each operand.
//   prod:   2*wid-bit product of plier and cand.
//
module mult_linear
  #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand );

   // rsum[i] holds the sum of partial products 0 through i.
   // Element -1 is the zero supplied to the first adder in the chain.
   //
   logic [2*wid-1:0] rsum [wid-1:-1];

   assign rsum[-1] = 0;

   for ( genvar i=0; i<wid; i++ ) begin

      // Partial product i: the multiplicand shifted left by i if bit i
      // of the multiplier is set, zero otherwise.
      //
      wire [2*wid-1:0] pprod;
      assign pprod = plier[i] ? cand << i : 0;

      carry_prop_adder #(2*wid) adder
        ( .s( rsum[i] ), .a( rsum[i-1] ), .b( pprod ) );

   end

   // The output of the last adder is the complete product.
   //
   assign prod = rsum[wid-1];

endmodule

// Linear multiplier with registered inputs and a registered output.
//
// The operands are captured in pliercpy and candcpy on a positive
// clock edge and it is these registered copies that feed the
// combinational multiplier, so the path through the multiplier starts
// and ends at a register. The product of operands presented before
// one positive edge is captured into prod on the following positive edge.
//
//   wid:    Width, in bits, of each operand.
//   prod:   Registered 2*wid-bit product.
//   plier:  Multiplier operand, sampled at the positive edge of clk.
//   cand:   Multiplicand operand, sampled at the positive edge of clk.
//   clk:    Clock.
//
module mult_linear_clk #(int wid = 16)
   (output logic[2*wid-1:0] prod, input logic[wid-1:0] plier, cand, input clk);

   wire [2*wid-1:0] p;
   logic [wid-1:0] pliercpy, candcpy;

   // Multiply the registered copies of the operands, not the raw inputs.
   //
   mult_linear #(wid) ml(p, pliercpy, candcpy);

   always_ff @( posedge clk ) begin
      pliercpy <= plier;
      candcpy <= cand;
      prod <= p;
   end

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Tree Multiplier


// Combinational multiplier that adds the partial products using a
// binary tree of adders.
//
// The tree is sized for widp2 leaves, where widp2 is wid rounded up to
// a power of two. Leaves numbered wid and above are tied to zero, so
// the module also works when wid is not a power of two. When wid is a
// power of two widp2 equals wid and no padding leaves are generated.
//
//   wid:    Width, in bits, of each operand.
//   prod:   2*wid-bit product of plier and cand.
//
module mult_tree
  #(int wid = 16)
   (output logic[2*wid-1:0] prod, input logic[wid-1:0] plier, cand);

   // Number of leaves in the tree.
   localparam int widp2 = 1 << $clog2(wid);

   // Elements 0 to widp2-1 are the leaves (partial products), elements
   // widp2 to 2*widp2-2 are adder outputs. Element 2*widp2-1 is unused.
   //
   logic [2*wid-1:0] rsum [2*widp2-1:0];

   // Used to wrap a doubled node index back into the range of rsum.
   localparam int mask = 2*widp2-1;

   // Compute partial products.
   //
   for ( genvar i=0; i<widp2; i++ ) begin
      if ( i < wid ) begin
         assign rsum[i] = plier[i] ? cand << i : 0;
      end else begin
         // Padding leaf: there is no multiplier bit at this position.
         assign rsum[i] = 0;
      end
   end

   // Add partial products together.
   //
   // The children of node i are at ( 2i mod 2*widp2 ) and at the
   // position after it. For example, with widp2 = 4 node 4 adds leaves
   // 0 and 1, node 5 adds leaves 2 and 3, and node 6 adds nodes 4 and 5.
   //
   for ( genvar i=widp2; i<2*widp2-1; i++ )
     carry_prop_adder #( 2*wid ) adder
                 ( rsum[i],
                   rsum[ mask &   (i<<1)       ],   // Left child.
                   rsum[ mask & ( (i<<1) + 1 ) ]    // Right child.
                   );

   // The root of the tree is the last adder (or the only leaf if wid is 1).
   //
   assign    prod = rsum[2*widp2-2];

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Simple Sequential Multiplier

// This multiplier works correctly, but is unnecessarily slow.


// Sequential multiplier that examines one bit of the multiplicand per
// clock cycle.
//
// On each positive clock edge, if bit pos of cand is 1 then plier
// shifted left by pos is added to the accumulator. When pos is zero
// the accumulator is first copied to prod and cleared. There is no
// handshaking: pos counts continuously.
//
//   wid:    Width, in bits, of each operand.
//   prod:   Most recently completed product.
//   plier:  Multiplier operand.
//   cand:   Multiplicand operand.
//   clk:    Clock.
//
module mult_seq #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier, cand,
     input clk );

   localparam int wlog = $clog2(wid);

   logic [2*wid-1:0] accum;  // Sum of the partial products added so far.
   logic [wlog-1:0] pos;     // Bit position examined in this cycle.

   // cadence translate_off
   initial pos = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      // Position zero starts a new multiplication: hand over the
      // finished product and start again from an empty accumulator.
      //
      if ( pos == 0 ) begin
         prod = accum;
         accum = 0;
      end

      // Add this position's partial product if its multiplicand bit is set.
      //
      if ( cand[pos] == 1 ) accum = accum + ( plier << pos );

      pos = pos + 1;

   end

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Sequential Multiplier, Using Instantiated Adder
//
//  Simple multiplier, no handshaking.

// Sequential multiplier that adds one partial product per clock cycle
// using an instantiated adder. There is no handshaking.
//
// pos counts from 0 to wid-1 and then returns to 0, so cand[pos] is
// always a valid bit select, including when wid is not a power of two.
// In the cycle in which pos is zero the adder output, which at that
// point includes every partial product, is captured into prod and the
// accumulator is cleared. In every other cycle the adder output is
// captured into the accumulator.
//
// Clocked state is updated with nonblocking assignments so that the
// values read at a clock edge are those from before the edge.
//
//   wid:    Width, in bits, of each operand.
//   prod:   Most recently completed product.
//   plier:  Multiplier operand.
//   cand:   Multiplicand operand.
//   clk:    Clock.
//
module mult_seq_ga #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk );

   localparam int wlog = $clog2(wid);

   logic [wlog-1:0] pos;
   logic [2*wid-1:0] accum;
   wire [2*wid-1:0] sum;

   // cadence translate_off
   initial begin pos = 0; accum = 0; end
   // cadence translate_on

   // Partial product for the current position.
   //
   wire [2*wid-1:0] pp = cand[pos] ? plier << pos : 0;

   carry_prop_adder #(2*wid) ga( sum, accum, pp );

   always @( posedge clk ) begin

      // Advance to the next position, returning to zero after the
      // last multiplicand bit rather than relying on counter overflow.
      //
      if ( pos == wid-1 ) pos <= 0;
      else                pos <= pos + 1;

      if ( pos == 0 ) begin
         prod <= sum;
         accum <= 0;
      end else begin
         accum <= sum;
      end

   end

endmodule

`include "/apps/linux/cadence/RC142/share/synth/lib/chipware/sim/verilog/CW/CW_csa.v"

// Sequential multiplier that accumulates one partial product per clock
// cycle using a carry-save adder, keeping the running total as a pair
// of values whose sum is the total.
//
// pos counts from 0 to wid-1 and then returns to 0, so cand[pos] is
// always a valid bit select, including when wid is not a power of two.
// In the cycle in which pos is wid-1 the two carry-save adder outputs
// are added together to form prod and the accumulator pair is cleared.
//
// Clocked state is updated with nonblocking assignments so that both
// accumulator registers are loaded from carry-save adder outputs
// computed from the register values held before the clock edge.
//
//   wid:    Width, in bits, of each operand.
//   prod:   Most recently completed product.
//   plier:  Multiplier operand.
//   cand:   Multiplicand operand.
//   clk:    Clock.
//
module mult_seq_csa #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   localparam int wlog = $clog2(wid);

   logic [wlog-1:0] pos;

   logic [2*wid-1:0] accum_sum_a_reg, accum_sum_b_reg;
   wire             co;  // Carry out of the carry-save adder, not used.

   // cadence translate_off
   initial begin pos = 0; accum_sum_a_reg = 0; accum_sum_b_reg = 0;  end
   // cadence translate_on

   wire [2*wid-1:0] accum_sum_a, accum_sum_b;

   // Partial product for the current position.
   //
   wire [2*wid-1:0] pp = cand[pos] ? plier << pos : 0;

   // Instantiate a carry save adder from the ChipWare library.
   //
   CW_csa #(2*wid) csa
     ( .carry(accum_sum_a), .sum(accum_sum_b), .co(co),
       .a(accum_sum_a_reg), .b(accum_sum_b_reg), .c(pp), .ci(1'b0) );

   always @( posedge clk ) begin

      // Advance to the next position, returning to zero after the
      // last multiplicand bit rather than relying on counter overflow.
      //
      if ( pos == wid-1 ) pos <= 0;
      else                pos <= pos + 1;

      if ( pos == wid-1 ) begin

         // Last partial product: combine the two halves of the
         // carry-save result into an ordinary binary product.
         //
         prod <= accum_sum_a + accum_sum_b;
         accum_sum_a_reg <= 0;
         accum_sum_b_reg <= 0;

      end else begin

         accum_sum_a_reg <= accum_sum_a;
         accum_sum_b_reg <= accum_sum_b;

      end

   end

endmodule



//////////////////////////////////////////////////////////////////////////////
/// Streamlined Sequential Multiplier

 /// Techniques For Lowering Cost
//
//   Instead of shifting the multiplier, shift the accumulator.
//   Use part of the accumulator to store the multiplicand.

// Sequential multiplier in which the accumulator is shifted right by
// one bit per cycle instead of the multiplier being shifted left.
//
// The multiplicand is loaded into the low wid bits of the accumulator.
// In each cycle bit 0 of the accumulator selects whether plier is
// added to the upper half, after which the accumulator is shifted
// right by one position. When pos equals wid-1 the accumulator is
// copied to prod, the multiplicand is reloaded, and the first step of
// the next multiplication is performed in that same cycle.
//
//   wid:    Width, in bits, of each operand.
//   prod:   Most recently completed product.
//   plier:  Multiplier operand.
//   cand:   Multiplicand operand.
//   clk:    Clock.
//
module mult_seq_stream #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier, cand,
     input clk );

   localparam int wlog = $clog2(wid);

   logic [2*wid-1:0] accum;
   logic [wlog-1:0] pos;

   // cadence translate_off
   initial pos = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      logic [wid:0] pp;     // Partial product, with room for a carry.
      logic [wid:0] upper;  // Upper half of accumulator plus pp.

      if ( pos == wid - 1 ) begin

         prod = accum;
         accum = cand;
         pos = 0;

      end else begin

         pos = pos + 1;

      end

      // Note: the multiplicand is in the lower bits of the accumulator,
      // so bit 0 is the multiplicand bit for the current step.
      //
      pp = accum[0] ? { 1'b0, plier } : 0;

      // Add the partial product to the upper half of the accumulator.
      //
      upper = { 1'b0, accum[2*wid-1:wid] } + pp;

      // Shift right: bit 0, which has now been used, is dropped.
      //
      accum = { upper, accum[wid-1:1] };

   end

endmodule

//////////////////////////////////////////////////////////////////////////////
/// Degree-m Sequential Multipliers

// Compute m partial products in each iteration.
//
// Will the synthesis program figure it out?


// Sequential multiplier that handles m multiplicand bits per clock
// cycle by adding up to m partial products to the accumulator.
//
//   wid:    Width, in bits, of each operand.
//   m:      Number of multiplicand bits examined per clock cycle.
//   prod:   Most recently completed product.
//   plier:  Multiplier operand.
//   cand:   Multiplicand operand.
//   clk:    Clock.
//
module mult_seq_m #( int wid = 16, int m = 2 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier, cand,
     input clk );

   // Number of cycles needed to examine all wid bits, m at a time.
   localparam int iterations = ( wid + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   logic [2*wid-1:0] accum;
   logic [iter_lg:1] iter;

   // cadence translate_off
   initial iter = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      // The iteration count is truncated to the width of iter before
      // the comparison. When iterations is a power of two the
      // truncated value is zero, so the test is true when iter has
      // wrapped around to zero.
      //
      if ( iter == iter_lg'(iterations) ) begin

         prod = accum;
         accum = 0;
         iter = 0;

      end

      // Add the partial products for the m bit positions assigned to
      // this iteration.
      //
      for ( int k=0; k<m; k++ )
        begin
           int bit_no;
           bit_no = iter * m + k;
           if ( cand[bit_no] ) accum = accum + ( plier << bit_no );
        end

      iter = iter + 1;

   end

endmodule


// Sequential multiplier that handles m multiplicand bits per clock
// cycle by multiplying plier by an m-bit digit of the multiplicand.
//
//   wid:    Width, in bits, of each operand.
//   m:      Number of multiplicand bits (digit size) per clock cycle.
//   prod:   Most recently completed product.
//   plier:  Multiplier operand.
//   cand:   Multiplicand operand.
//   clk:    Clock.
//
module mult_seq_dm
  #( int wid = 16,
     int m = 2 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier, cand,
     input clk );

   // Number of cycles needed to examine all wid bits, m at a time.
   localparam int iterations = ( wid + m - 1 ) / m;
   localparam int iter_lg = $clog2(iterations);

   // The multiplicand viewed as an array of m-bit digits, digit 0
   // being the least significant.
   //
   wire [iterations-1:0][m-1:0] cand_2d;
   assign cand_2d = cand;

   logic [2*wid-1:0] accum;
   logic [iter_lg:1] iter;

   // cadence translate_off
   initial iter = 0;
   // cadence translate_on

   always @( posedge clk ) begin

      // The iteration count is truncated to the width of iter before
      // the comparison. When iterations is a power of two the
      // truncated value is zero, so the test is true when iter has
      // wrapped around to zero.
      //
      if ( iter == iter_lg'(iterations) ) begin

         prod = accum;
         accum = 0;
         iter = 0;

      end

      // Multiply by the current digit, then shift the result to the
      // digit's position before adding it to the accumulator.
      //
      accum = accum + ( ( plier * cand_2d[iter] ) << ( iter * m ) );

      iter = iter + 1;

   end

endmodule


// Sequential multiplier that adds pp_per_cycle partial products per
// clock cycle using a chain of carry-save adders. The running total is
// kept as a pair of registers whose sum is the total.
//
// iter counts from 0 to iterations. In cycles 0 to iterations-1 the
// output of the carry-save adder chain is captured into the register
// pair. In the cycle in which iter equals iterations the register pair
// is added together to form prod and then cleared.
//
// The bit position pos is wide enough to hold every value that
// iter * pp_per_cycle + i can take. This matters when pp_per_cycle
// does not divide wid evenly: positions at or beyond wid must compare
// as not less than wid so that their partial products are zero. If pos
// were only $clog2(wid) bits wide those positions would wrap around to
// low bit positions and be added a second time.
//
//   wid:           Width, in bits, of each operand.
//   pp_per_cycle:  Number of partial products added per clock cycle.
//   prod:          Most recently completed product.
//   plier:         Multiplier operand.
//   cand:          Multiplicand operand.
//   clk:           Clock.
//
module mult_seq_csa_m #( int wid = 16, int pp_per_cycle = 2 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   localparam int iterations = ( wid + pp_per_cycle - 1 ) / pp_per_cycle;
   localparam int iter_lg = $clog2(iterations);

   // Number of bits needed for the largest bit position computed,
   // which is ( iterations + 1 ) * pp_per_cycle - 1.
   //
   localparam int pos_bits = $clog2( ( iterations + 1 ) * pp_per_cycle );

   logic [iter_lg:0] iter;

   // cadence translate_off
   initial iter = 0;
   // cadence translate_on

   // Element i is the input to carry-save adder i, element i+1 is its output.
   //
   wire [2*wid-1:0] accum_sum_a[0:pp_per_cycle], accum_sum_b[0:pp_per_cycle];
   logic [2*wid-1:0] accum_sum_a_reg, accum_sum_b_reg;

   assign           accum_sum_a[0] = accum_sum_a_reg;
   assign           accum_sum_b[0] = accum_sum_b_reg;

   for ( genvar i=0; i<pp_per_cycle; i++ ) begin

      wire [pos_bits-1:0] pos = iter * pp_per_cycle + i;
      wire            co; // Unconnected.

      // Positions at or beyond wid contribute nothing.
      //
      wire [2*wid-1:0] pp = pos < wid && cand[pos] ? plier << pos : 0;

      CW_csa #(2*wid) csa
        ( .sum(accum_sum_a[i+1]), .carry(accum_sum_b[i+1]), .co(co),
          .a(accum_sum_a[i]), .b(accum_sum_b[i]), .c(pp), .ci(1'b0) );

   end

   always @( posedge clk ) begin

      if ( iter == iterations ) begin

         // All partial products have been accumulated: combine the
         // register pair into an ordinary binary product.
         //
         prod <= accum_sum_a_reg + accum_sum_b_reg;

         accum_sum_a_reg <= 0;
         accum_sum_b_reg <= 0;
         iter <= 0;

      end else begin

         // prod is not assigned here and so keeps its value.

         accum_sum_a_reg <= accum_sum_a[pp_per_cycle];
         accum_sum_b_reg <= accum_sum_b[pp_per_cycle];
         iter <= iter + 1;

      end

   end

endmodule


`ifdef DONT_DEFINE_ME
Module Name                             Area   Clock    Total    Init.
                                              Period    Delay   Interv
mult_pipe_2_wid16_pp_per_stage1       652540    1988    -1988     1988
mult_pipe_wid16_pp_per_stage1         747364    1717    27472     1717

mult_pipe_2_wid16_pp_per_stage2       390304    2530    -2530     2530
mult_pipe_wid16_pp_per_stage2         459860    2425    19400     2425

mult_pipe_2_wid16_pp_per_stage4       330368    2913    -2913     2913
mult_pipe_wid16_pp_per_stage4         357580    2983    11932     2983

mult_pipe_2_wid16_pp_per_stage8       256392    3515    -3515     3515
mult_pipe_wid16_pp_per_stage8         264352    3498     6996     3498
Normal exit.
`endif



//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

// cadence translate_off

// Copies the clock and the cycle count from within a program block.
// The testbench waits on these copies rather than on the original
// signals.
//
//   clk_reactive:    Copy of clk.
//   cycle_reactive:  Copy of cycle.
//
program reactivate
   ( output wire clk_reactive,
     output int cycle_reactive,
     input wire clk,
     input int cycle );

   assign cycle_reactive = cycle;
   assign clk_reactive = clk;

endprogram

// Testbench for the multiplier modules.
//
// Each multiplier under test is instantiated with its product output
// connected to an element of prod and is registered, by name, in the
// associative array pi. For each test the operands are applied, the
// testbench waits for the number of cycles computed for each
// multiplier, samples that multiplier's output, and compares it with
// the output of the behavioral multiplier, prod[0].
//
module testbench;

   localparam int wid = 16;          // Operand width of every multiplier.
   localparam int num_tests = 1000;  // Number of operand pairs to try.
   localparam int NUM_MULT = 20;     // Number of elements in prod.
   localparam int err_limit = 7;     // Not referenced in this module.

   bit use_others;
   logic [wid-1:0] plier, cand;
   logic [wid-1:0] plierp, candp;
   logic [2*wid-1:0] prod[NUM_MULT];

   // Information about one multiplier under test.
   //
   //   idx:        Index into prod of the multiplier's output.
   //   err_count:  Number of wrong results seen so far.
   //   seq:        1 if registered by one of the sequential entries.
   //   pipe:       1 if registered using pi_pipe.
   //   deg:        Degree: number of partial products per cycle.
   //   sout:       Output sampled for the current test.
   //   cyc_tot:    Total number of cycles waited over all tests.
   //   latency:    Number of cycles waited for in the current test.
   //
   typedef struct { int idx; int err_count = 0;
                    bit seq = 0; bit pipe = 0; int deg = 1;
                    logic [2*wid-1:0] sout = 'h111; int cyc_tot = 0;
                    int latency = 0;
                    } Info;
   Info pi[string];

   localparam int cycle_limit = num_tests * wid * 4;
   int cycle;
   bit done;
   logic clock;

   logic      clk_reactive;
   int cycle_reactive;
   reactivate ra(clk_reactive,cycle_reactive,clock,cycle);

   // Generate the clock, count cycles, and end the simulation when the
   // tests are done or the cycle limit is reached.
   //
   initial begin
      clock = 0;
      cycle = 0;

      fork
         // Toggle clock every 10 time units. The post-increment yields
         // the value of clock before the toggle, so cycle is
         // incremented each time clock goes from 1 to 0.
         forever #10 cycle += clock++;
         wait( done );
         wait( cycle >= cycle_limit )
           $write("*** Cycle limit exceeded, ending.\n");
      join_any;

      $finish();
   end

   // Set the operands intended for piped units. The operands of the
   // current test are applied after one clock edge, other values are
   // applied after the remaining clock edges.
   //
   initial begin

      while ( !done ) @( posedge clk_reactive ) #1

         if ( use_others ) begin

            plierp = plier;
            candp = cand;
            use_others = 0;

         end else begin

            plierp = cycle;
            candp = 256;

         end
   end

   // Register a sequential multiplier of the given degree. The entry
   // is named using name followed by the degree.
   //
   task pi_seq(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1;
   endtask

   // Register a pipelined multiplier of the given degree. The entry
   // is named using name followed by the degree.
   //
   task pi_pipe(input int idx, input string name, input int deg);
      automatic string m = $sformatf("%s Deg %0d", name, deg);
      pi[m].deg = deg;
      pi[m].idx = idx; pi[m].seq = 1; pi[m].pipe = 1;
   endtask

   // Instantiate and register the multipliers. The behavioral
   // multiplier at index 0 provides the expected result.
   //
   mult_behav_1 #(wid) mb1(prod[0], plier, cand);
   initial pi["Behavioral"].idx = 0;

   mult_linear  #(wid) ms1(prod[1], plier, cand);
   initial pi["Linear"].idx = 1;

   mult_tree    #(wid) ms2(prod[2], plier, cand);
   initial pi["Tree"].idx = 2;

   mult_seq #(wid) ms3(prod[3], plier, cand, clock);
   initial begin
      automatic string m = "Sequential";
      pi[m].idx = 3; pi[m].seq = 1;
   end

   mult_seq_ga  #(wid) msga1(prod[11], plier, cand, clock);
   initial begin
      automatic string m = "Sequential GA";
      pi[m].idx = 11; pi[m].seq = 1;
   end

   mult_seq_stream #(wid) mss1(prod[4], plier, cand, clock);
   initial begin
      automatic string m = "Sequential Streamlined";
      pi[m].idx = 4; pi[m].seq = 1;
   end

   mult_seq_m   #(wid,4) ms44(prod[5], plier, cand, clock);
   initial pi_seq(5,"Seq", ms44.m);

   mult_seq_m   #(wid,3) ms43(prod[6], plier, cand, clock);
   initial pi_seq(6,"Seq", ms43.m);

   // Each mult_seq_dm entry takes its degree from its own instance.
   //
   mult_seq_dm  #(wid,4) msd44(prod[9], plier, cand, clock);
   initial pi_seq(9,"Seq Rad", msd44.m);

   mult_seq_dm  #(wid,3) msd43(prod[10], plier, cand, clock);
   initial pi_seq(10,"Seq Rad", msd43.m);

   mult_seq_csa  #(wid) mcsa(prod[14], plier, cand, clock);
   initial begin
      automatic string m = $sformatf("Mult Seq CSA");
      pi[m].idx = 14; pi[m].seq = 1;
   end

   mult_linear_clk  #(wid) mlc1(prod[15], plier, cand, clock);
   initial begin
      automatic string m = $sformatf("Linear Clock");
      pi[m].idx = 15; pi[m].seq = 1;
   end


   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   //
   int tests[$] = {1,1, 1,2,  1,32,  32, 1};

   // Apply the tests, check the results, and print a summary.
   //
   initial begin

      done = 0;
      use_others = 0;

      @( posedge clk_reactive );

      for ( int i=0; i<num_tests; i++ ) begin
         automatic int cyc_start = cycle;

         // Number of multipliers whose output has not yet been sampled.
         automatic int awaiting = pi.num();

         // Set multiplier and multiplicand values for non-piped units.
         //
         plier = tests.size() ? tests.pop_front() : $random();
         cand = tests.size() ? tests.pop_front() : $random();

         // Set multiplier and multiplicand values for piped units.
         //
         plierp = plier;
         candp = cand;
         use_others = 1;

         // Start one process per multiplier. Each waits until the
         // cycle in which its multiplier's result is expected and then
         // samples the multiplier's output.
         //
         foreach ( pi[muti] ) begin
            automatic string mut = muti; // Bug workaround?
            automatic Info p = pi[mut];  // Not used below.
            fork begin
               automatic int steps = ( wid + pi[mut].deg - 1 ) / pi[mut].deg;

               // Combinational units are given one cycle, sequential
               // units two full passes, piped units one pass.
               automatic int latency
                 = !pi[mut].seq ? 1 : !pi[mut].pipe ? 2 * steps : steps;
               automatic int eta = 1 + cyc_start + latency;
               pi[mut].latency = latency;
               wait ( cycle_reactive == eta );
               awaiting--;
               pi[mut].sout = prod[pi[mut].idx];
               pi[mut].cyc_tot += cycle - cyc_start;
            end join_none;
         end
         wait ( awaiting == 0 );

         // Check the output of each Module Under Test.
         //
         foreach ( pi[ mut ] )
           if ( prod[0] !== pi[mut].sout ) begin
              pi[mut].err_count++;

              // Only the first few errors of each module are printed.
              if ( pi[mut].err_count < 5 )
                $write
                  ("%-25s wrong result: %0d * %0d:  0x%0h != 0x%0h (correct)\n",
                   mut, plier, cand, pi[mut].sout, prod[0]);
           end

         @( posedge clk_reactive );

      end

      foreach ( pi[ mut ] )
        $write("Ran %4d tests for %-25s, %4d errors found. Avg cyc %.1f\n",
               num_tests, mut, pi[mut].err_count,
               pi[mut].seq ? real'(pi[mut].cyc_tot) / num_tests : 1);

      done = 1;

      $finish(2);

   end

endmodule

// cadence translate_on