////////////////////////////////////////////////////////////////////////////////
//
/// LSU EE 4755 Fall 2014 Homework 3
//

 /// SOLUTION

 /// Assignment  http://www.ece.lsu.edu/koppel/v/2014/hw03.pdf
 /// Solution  http://www.ece.lsu.edu/koppel/v/2014/hw03_sol.pdf

 /// Instructions:
  //
  // (1) Find the undergraduate workstation laboratory, room 126 EE
  //     Building.
  //
  // (2) Locate your account.  If you did not get an account please
  //     E-mail: koppel@ece.lsu.edu
  //
  // (3) Log in to a Linux workstation.
  //     The account should start up with a WIMP interface (windows, icons,
  //     mouse, pull-down menus)  ( :-) ) but one or two things need
  //     to be done from a command-line shell.  If you need to brush up
  //     on Unix commands follow http://www.ece.lsu.edu/koppel/v/4ltrwrd/.
  //
  // (4) If you haven't already, follow the account setup instructions here:
  //     http://www.ece.lsu.edu/koppel/v/proc.html
  //
  // (5) Copy this assignment, local path name
  //     /home/faculty/koppel/pub/ee4755/hw/2014f/hw03
  //     to a directory ~/hw02 in your class account. (~ is your home
  //     directory.) Use this file for your solution.
  //
  // (6) Find the problems in this file and solve them.
  //
  //     Your entire solution should be in this file.
  //
  //     Do not change module names.
  //
  // (7) Your solution will automatically be copied from your account by
  //     the TA-bot.


 /// Additional Resources
  //
  // Verilog Documentation
  //    The Verilog Standard
  //      http://standards.ieee.org/getieee/1800/download/1800-2012.pdf
  //    Introductory Treatment (Warning: Does not include SystemVerilog)
  //      Brown & Vranesic, Fundamentals of Digital Logic with Verilog, 3rd Ed.
  //
  // Account Setup and Emacs (Text Editor) Instructions
  //      http://www.ece.lsu.edu/koppel/v/proc.html
  //      To learn Emacs look for Emacs tutorial.
  //
  // Unix Help
  //      http://www.ece.lsu.edu/koppel/v/4ltrwrd/



//////////////////////////////////////////////////////////////////////////////
/// Behavioral Multiplier


// Behavioral (combinational) unsigned multiplier.
//
// Parameters:
//   wid    Width in bits of each operand.
// Ports:
//   prod   2*wid-bit product of plier and cand.
//   plier  Multiplier operand.
//   cand   Multiplicand operand.
//
module mult_behav_1
  #(int wid = 16)
   (output logic [2*wid-1:0] prod,
    input logic [wid-1:0] plier,
    input logic [wid-1:0] cand);

   localparam int prod_wid = 2 * wid;

   // Zero-extend both operands to the product width so that the
   // multiplication is carried out at full precision.
   //
   wire [prod_wid-1:0] plier_ext = plier;
   wire [prod_wid-1:0] cand_ext = cand;

   assign prod = plier_ext * cand_ext;

endmodule




//////////////////////////////////////////////////////////////////////////////
/// Simple m-Step Sequential Multiplier


// Sequential unsigned multiplier that adds pp_per_cycle partial products
// to an accumulator on every positive clock edge.
//
// Parameters:
//   wid           Width in bits of each operand; prod is 2*wid bits.
//   pp_per_cycle  Number of partial products added per clock cycle.
// Ports:
//   prod   Product. Updated once every `iterations' clock cycles with
//          the value accumulated over the preceding `iterations' cycles.
//   plier  Multiplier operand.
//   cand   Multiplicand operand.
//   clk    Clock; all state changes on the positive edge.
//
// The operands are read in every cycle, so they must be held constant
// for a complete pass for prod to be their product.
//
module mult_seq_m #( int wid = 16, int pp_per_cycle = 2 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   // Number of clock cycles needed to cover all wid bits of cand.
   localparam int iterations = ( wid + pp_per_cycle - 1 ) / pp_per_cycle;
   localparam int iter_lg = $clog2(iterations);

   // iter counts 0 .. iterations inclusive, so it needs iter_lg+1 bits.
   // (With only iter_lg bits the value `iterations' itself is not
   // representable when iterations is a power of two, and the declaration
   // degenerates when iterations is 1.)
   //
   logic [iter_lg:0] iter;
   logic [2*wid-1:0] accum;

   // cadence translate_off
   initial begin iter = 0; accum = 0; end
   // cadence translate_on

   always @( posedge clk ) begin

      if ( iter == iterations ) begin

         // A full pass has completed: publish the result and start over.
         // The nonblocking assignment samples accum before it is cleared
         // and avoids races with other processes reading prod on this edge.
         //
         prod <= accum;
         accum = 0;
         iter = 0;

      end

      for ( int i=0; i<pp_per_cycle; i++ )
        begin
           int pos;  pos = iter * pp_per_cycle + i;

           // When pp_per_cycle does not divide wid the last iteration
           // reaches positions past the end of cand; skip those rather
           // than reading an out-of-range bit.
           //
           if ( pos < wid && cand[pos] ) accum += plier << pos;
        end

      iter++;

   end

endmodule


//////////////////////////////////////////////////////////////////////////////
/// A Sequential Multiplier using a Carry-Save Adder


// Examine this module for Problem 1.
// Don't modify the module.


`include "/apps/linux/cadence/RC141/share/synth/lib/chipware/sim/verilog/CW/CW_csa.v"

// Sequential multiplier that handles one partial product per clock cycle
// and keeps its running total as two words (accum_sum_a_reg and
// accum_sum_b_reg) that are fed back through an instantiated CW_csa.
// The two words are added together only when prod is written.
//
// Parameters:
//   wid    Width in bits of each operand; prod is 2*wid bits.
// Ports:
//   prod   Product; written on the clock edge at which pos == wid-1.
//   plier  Multiplier operand.
//   cand   Multiplicand operand.
//   clk    Clock; all state changes on the positive edge.
//
// NOTE(review): CW_csa comes from the included ChipWare library and is
// not visible here; presumably carry + sum == a + b + c -- confirm
// against the ChipWare documentation.
//
module mult_seq_csa #( int wid = 16 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   localparam int wlog = $clog2(wid);

   // Index of the bit of cand examined in the current cycle.  It is
   // wlog bits wide and is never explicitly reset, so it wraps after
   // 2**wlog cycles.
   logic [wlog-1:0] pos;

   // Registered copies of the two CSA output words.
   logic [2*wid-1:0] accum_sum_a_reg, accum_sum_b_reg;
   wire             co;  // Connected to the CSA co port; not used here.

   // cadence translate_off
   initial begin pos = 0; accum_sum_a_reg = 0; accum_sum_b_reg = 0;  end
   // cadence translate_on

   // CSA outputs for the current cycle.
   wire [2*wid-1:0] accum_sum_a, accum_sum_b;

   // Partial product for the current bit position: plier shifted into
   // place if bit pos of cand is set, otherwise zero.
   wire [2*wid-1:0] pp = cand[pos] ? plier << pos : 0;

   // Combine the two registered words with the new partial product.
   CW_csa #(2*wid) csa
     ( .carry(accum_sum_a), .sum(accum_sum_b), .co(co),
       .a(accum_sum_a_reg), .b(accum_sum_b_reg), .c(pp), .ci(1'b0) );
                        
   always @( posedge clk ) pos <= pos + 1;

   always @( posedge clk ) begin

      if ( pos == wid-1 ) begin

         // Last bit position: the product is the sum of the two CSA
         // outputs (which already include this cycle's partial
         // product).  Clear the registers for the next multiplication.
         prod = accum_sum_a + accum_sum_b;
         accum_sum_a_reg = 0;
         accum_sum_b_reg = 0;

      end else begin

         // Otherwise just latch the CSA outputs for the next cycle.
         accum_sum_a_reg = accum_sum_a;
         accum_sum_b_reg = accum_sum_b;

      end

   end

endmodule

//////////////////////////////////////////////////////////////////////////////
/// An m-bit Sequential Multiplier using a CSA


 /// Problem 2: Modify this module.


// Sequential multiplier that handles pp_per_cycle partial products per
// clock cycle using a chain of pp_per_cycle CW_csa instances.  The running
// total is kept as two words (accum_sum_a_reg, accum_sum_b_reg); they are
// added together in one extra cycle at the end of each pass to form prod.
//
// Parameters:
//   wid           Width in bits of each operand; prod is 2*wid bits.
//   pp_per_cycle  Number of partial products (and CSAs) per clock cycle.
// Ports:
//   prod   Product; written once every iterations+1 clock cycles.
//   plier  Multiplier operand.
//   cand   Multiplicand operand.
//   clk    Clock; all state changes on the positive edge.
//
// NOTE(review): CW_csa comes from the included ChipWare library and is
// not visible here; presumably carry + sum == a + b + c -- confirm
// against the ChipWare documentation.
//
module mult_seq_csa_m #( int wid = 16, int pp_per_cycle = 2 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   /// SOLUTION

   // Number of accumulate cycles needed to cover all wid bits of cand.
   localparam int iterations = ( wid + pp_per_cycle - 1 ) / pp_per_cycle;
   localparam int iter_lg = $clog2(iterations);

   // Largest value iter * pp_per_cycle + i can take (iter reaches
   // `iterations' during the cycle in which prod is written).  pos must
   // be wide enough to hold it: if pos were truncated, positions past
   // the end of cand would alias low-order bits and the pos < wid test
   // below could not reject them.
   //
   localparam int pos_max = ( iterations + 1 ) * pp_per_cycle - 1;
   localparam int pos_lg = $clog2( pos_max + 1 );

   // iter counts 0 .. iterations inclusive.
   logic [iter_lg:0] iter;

   // accum_sum_x[i] is the input to CSA i; accum_sum_x[i+1] is its output.
   wire [2*wid-1:0] accum_sum_a[0:pp_per_cycle], accum_sum_b[0:pp_per_cycle];
   logic [2*wid-1:0] accum_sum_a_reg, accum_sum_b_reg;

   // cadence translate_off
   initial begin iter = 0; accum_sum_a_reg = 0; accum_sum_b_reg = 0; end
   // cadence translate_on

   // The first CSA in the chain is fed from the registers.
   assign           accum_sum_a[0] = accum_sum_a_reg;
   assign           accum_sum_b[0] = accum_sum_b_reg;

   for ( genvar i=0; i<pp_per_cycle; i++ ) begin

      // Bit position of cand handled by this CSA in the current cycle.
      wire [pos_lg-1:0] pos = iter * pp_per_cycle + i;
      wire              co; // Unconnected.

      // Partial product; zero for positions past the end of cand, which
      // occur when pp_per_cycle does not divide wid.
      wire [2*wid-1:0] pp = ( pos < wid && cand[pos] ) ? plier << pos : 0;

      CW_csa #(2*wid) csa
        ( .sum(accum_sum_a[i+1]), .carry(accum_sum_b[i+1]), .co(co),
          .a(accum_sum_a[i]), .b(accum_sum_b[i]), .c(pp), .ci(1'b0) );

   end

   always @( posedge clk ) begin

      if ( iter == iterations ) begin

         // The commented-out line below shows the wrong way of
         // designing this module.
         //
         //  prod = accum_sum_a[pp_per_cycle] + accum_sum_b[pp_per_cycle];

         // Note that the product is computed by using the register
         // outputs, rather than the output of the last CSA.
         //
         prod <= accum_sum_a_reg + accum_sum_b_reg;

         accum_sum_a_reg <= 0;
         accum_sum_b_reg <= 0;
         iter <= 0;

      end else begin

         // Latch the output of the last CSA in the chain.
         accum_sum_a_reg <= accum_sum_a[pp_per_cycle];
         accum_sum_b_reg <= accum_sum_b[pp_per_cycle];
         iter <= iter + 1;

      end

   end

endmodule



//////////////////////////////////////////////////////////////////////////////
/// Pipelined Multiplier


// Pipelined unsigned multiplier described with a single clocked block.
// Each stage adds up to pp_per_stage partial products to the running sum
// handed to it by the previous stage and passes the operands along.
//
// Parameters:
//   wid           Width in bits of each operand; prod is 2*wid bits.
//   pp_per_stage  Number of partial products added in each stage.
// Ports:
//   prod   Contents of the last pipeline register.
//   plier  Multiplier operand.
//   cand   Multiplicand operand.
//   clk    Clock; pipeline registers are written on the positive edge.
//
module mult_pipe #( int wid = 16, int pp_per_stage = 2 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   localparam int stages = ( wid + pp_per_stage - 1 ) / pp_per_stage;

   // Element s holds the values entering stage s; element `stages'
   // holds the values leaving the last stage.
   //
   logic [2*wid-1:0] sum_at[0:stages];
   logic [wid-1:0] plier_at[0:stages];
   logic [wid-1:0] cand_at[0:stages];

   assign prod = sum_at[stages];

   always @( posedge clk ) begin

      // Stage 0 inputs are taken directly from the module inputs.
      // (Blocking assignments, so stage 0 below sees these values.)
      //
      sum_at[0] = 0;
      plier_at[0] = plier;
      cand_at[0] = cand;

      for ( int s=0; s<stages; s++ ) begin

         logic [2*wid-1:0] partial;
         int first_bit;

         first_bit = s * pp_per_stage;
         partial = sum_at[s];

         // Add this stage's partial products, ignoring bit positions
         // past the end of the multiplicand.
         //
         for ( int b=first_bit; b<first_bit+pp_per_stage; b++ )
           if ( b < wid )
             if ( cand_at[s][b] )
               partial = partial + ( plier_at[s] << b );

         // Write the pipeline registers feeding the next stage.
         //
         plier_at[s+1] <= plier_at[s];
         cand_at[s+1] <= cand_at[s];
         sum_at[s+1] <= partial;

      end

   end

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Pipelined Multiplier, Instantiated Stages


// Combinational logic for one stage of a pipelined multiplier: adds the
// partial products for bit positions stage*pp_per_stage through
// stage*pp_per_stage + pp_per_stage - 1 of cand to accum_in.
//
// Parameters:
//   wid           Width in bits of each operand.
//   pp_per_stage  Number of partial products handled by this stage.
//   stage         Index of this stage; selects the bit positions used.
// Ports:
//   accum_out  accum_in plus this stage's partial products.
//   accum_in   Running sum from the previous stage.
//   plier      Multiplier operand.
//   cand       Multiplicand operand.
//
module mult_pipe_stage #( int wid = 16, int pp_per_stage = 2, int stage = 0 )
   ( output logic [2*wid-1:0] accum_out,
     input [2*wid-1:0] accum_in,
     input [wid-1:0] plier,
     input [wid-1:0] cand);

   // First bit position of cand handled by this stage.
   localparam int first_bit = stage * pp_per_stage;

   always @* begin

      logic [2*wid-1:0] running;

      running = accum_in;

      // Positions past the end of cand contribute nothing.
      for ( int b=first_bit; b<first_bit+pp_per_stage; b++ )
        if ( b < wid )
          if ( cand[b] )
            running = running + ( plier << b );

      accum_out = running;

   end

endmodule

// Pipelined unsigned multiplier built from instantiated stages: the
// combinational logic of each stage is a mult_pipe_stage instance and
// the pipeline registers between stages are written in a generate loop.
//
// Parameters:
//   wid           Width in bits of each operand; prod is 2*wid bits.
//   pp_per_stage  Number of partial products added in each stage.
// Ports:
//   prod   Contents of the last pipeline register.
//   plier  Multiplier operand.
//   cand   Multiplicand operand.
//   clk    Clock; pipeline registers are written on the positive edge.
//
module mult_pipe_ia #( int wid = 16, int pp_per_stage = 2 )
   ( output logic [2*wid-1:0] prod,
     input logic [wid-1:0] plier,
     input logic [wid-1:0] cand,
     input clk);

   localparam int stages = ( wid + pp_per_stage - 1 ) / pp_per_stage;

   // Element s holds the values entering stage s; element `stages'
   // holds the values leaving the last stage.
   //
   logic [2*wid-1:0] pl_accum[0:stages];
   logic [wid-1:0] pl_plier[0:stages];
   logic [wid-1:0] pl_cand[0:stages];

   // Stage 0 is fed directly (combinationally) from the module inputs.
   //
   always @* begin

      pl_accum[0] = 0;
      pl_plier[0] = plier;
      pl_cand[0] = cand;

   end

   for ( genvar stage = 0;  stage < stages;  stage++ ) begin

      // Output of this stage's combinational logic.
      wire logic [2*wid-1:0] accum;

      // Instantiate the stage logic defined in this file.  (The name
      // mult_pipe_stage_x used previously is not defined anywhere in
      // this file.)
      //
      mult_pipe_stage
        #( .wid(wid), .pp_per_stage(pp_per_stage), .stage(stage) )
      this_stage
        ( .accum_out(accum),
          .accum_in(pl_accum[stage]),
          .plier(pl_plier[stage]),
          .cand(pl_cand[stage]) );

      // Pipeline registers feeding the next stage.
      //
      always @( posedge clk ) begin
         pl_accum[stage+1] <= accum;
         pl_plier[stage+1] <= pl_plier[stage];
         pl_cand[stage+1] <= pl_cand[stage];
      end

   end

   assign prod = pl_accum[stages];

endmodule


//////////////////////////////////////////////////////////////////////////////
/// Testbench Code

// cadence translate_off

// Simulation testbench for the multipliers in this file.
//
// Every multiplier's output is compared against mult_behav_1 (prod[0]).
// Sequential units are given constant operands and checked after a long
// delay.  Pipelined units drive prodp[]; their outputs are copied into
// prod[] at the time the result for the operands under test should
// emerge, while other operand values are fed through the pipeline
// before and (if pipeline_test_exact is set) after them.
//
module testbench;

   localparam int wid = 16;            // Operand width.
   localparam int num_tests = 1000;    // Number of operand pairs tested.
   localparam int NUM_MULT = 10;       // Number of multiplier instances.
   localparam int err_limit = 7;       // Errors are printed while count < this.
   localparam bit pipeline_test_exact = 1;

   logic clock;

   // Clock period is 2 time units; initialized to 0 in the initial block.
   always #1 clock <= !clock;

   logic [wid-1:0] plier, cand;        // Operands for non-pipelined units.
   logic [wid-1:0] plierp, candp;      // Operands for pipelined units.
   logic [2*wid-1:0] prod[NUM_MULT];   // Values that are checked.
   logic [2*wid-1:0] prodp[NUM_MULT];  // Raw outputs of pipelined units.

   mult_behav_1 #(wid) mb1(prod[0], plier, cand);

   // NOTE(review): ms44 is instantiated with pp_per_cycle = 8 but its
   // entry in names[] is "Seq m4" -- confirm which one is intended.
   mult_seq_m   #(wid,8) ms44(prod[1], plier, cand, clock);
   mult_seq_m   #(wid,3) ms43(prod[2], plier, cand, clock);
   mult_seq_csa   #(wid) mc(prod[3], plier, cand, clock);
   mult_seq_csa_m   #(wid,4) mc4(prod[4], plier, cand, clock);
   mult_seq_csa_m   #(wid,1) mc1(prod[5], plier, cand, clock);

   localparam int ppps_2 = 1;

   mult_pipe    #(wid,4) mp4(prodp[6], plierp, candp, clock);
   mult_pipe    #(wid,ppps_2) mp3(prodp[7], plierp, candp, clock);
   mult_pipe_ia #(wid,4) mpi4(prodp[8], plierp, candp, clock);
   mult_pipe_ia #(wid,ppps_2) mpi3(prodp[9], plierp, candp, clock);

   // Names used in messages; indexed the same way as prod[].
   string names[] = '{"Behav_1",
                      "Seq m4",
                      "Seq m3",
                      "Seq CSA",
                      "Seq CSA m4",
                      "Seq CSA m1",
                      "Pipelined m4",
                      "Pipelined m1",
                      "Pipelined IA m4",
                      "Pipelined IA m1"
                      };

   int err_cnt[NUM_MULT];

   // Array of multiplier/multiplicand values to try out.
   // After these values are used a random number generator will be used.
   //
   int tests[$] = {1,1, 1,2,  2,1, 'h10,'h20, 1,32,  32, 1};

   initial begin

      clock = 0;

      for ( int i=0; i<num_tests; i++ ) begin

         // Change input to pipelined units.
         //
         for ( int t=0; t<=wid; t++ ) begin
            plierp = t;
            candp = 256;
            #2;
         end

         // Set multiplier and multiplicand values for non-piped units.
         //
         plier = tests.size() ? tests.pop_front() : $random();
         cand = tests.size() ? tests.pop_front() : $random();

         // Set multiplier and multiplicand values for piped units.
         //
         plierp = plier;
         candp = cand;

         // For pipelined units, copy output at the time it should be ready.
         // The delay is one clock period (2 time units) per pipeline stage.
         // Each prod[k] must be taken from the unit driving prodp[k].
         //
         fork
            #(2 * wid/4) prod[6] = prodp[6];
            #(2 * wid/4) prod[8] = prodp[8];
            #(2 * ((wid+ppps_2-1)/ppps_2)) prod[7] = prodp[7];
            #(2 * ((wid+ppps_2-1)/ppps_2)) prod[9] = prodp[9];
         join_none

         if ( pipeline_test_exact ) begin

            // Modify the inputs to the pipelined units in subsequent cycles.
            //
            for ( int t=0; t<=wid; t++ ) begin
               #2;
               plierp = t;
               candp = 1;
            end

            plierp = 0;
            candp = 0;

         end

         // Give the sequential units ample time to finish.
         //
         #1000;

         // Make sure each module's output is correct.
         //
         for ( int mut=1; mut<NUM_MULT; mut++ ) begin

            if ( prod[0] !== prod[mut] ) begin

               err_cnt[mut]++;

               if ( err_cnt[mut] < err_limit )
                 $display("Error in %s test %4d:  %x != %x (correct)\n",
                          names[mut], i, prod[mut], prod[0]);
            end

         end

      end

      // Tests completed, report error count for each device.
      //
      for ( int mut=1; mut<NUM_MULT; mut++ ) begin

         $display("Mut %s, %d errors (%.1f%% of tests)\n",
                  names[mut], err_cnt[mut],
                  100.0 * err_cnt[mut]/real'(num_tests) );

      end

      $finish(2);

   end

endmodule

// cadence translate_on