/// LSU EE 3755 -- Spring 2002 -- Computer Organization
//
/// Verilog Notes 7 -- Integer Multiply and Divide

// Time-stamp: <13 March 2002, 10:18:27 CST, koppel@sol>

/// Contents
//
// Binary Multiplication Algorithm
// Simple Multiplication Hardware
// Streamlined Multiplication Hardware
// Booth Recoding for Higher-Radix and Signed Multiplication
// Binary Division Algorithm
// Division Hardware (Simple and Streamlined)
// "Real" Arithmetic Circuits


/// References
//
// :P:   Palnitkar, "Verilog HDL"
// :Q:   Qualis, "Verilog HDL Quick Reference Card Revision 1.0"
// :H:   Hyde, "Handbook on Verilog HDL"
// :LRM: IEEE, Verilog Language Reference Manual  (Hawaii Section Numbering)
// :PH:  Patterson & Hennessy, "Computer Organization & Design"
// :HP:  Hennessy & Patterson, "Computer Architecture: A Quantitative Approach"


////////////////////////////////////////////////////////////////////////////////
/// Binary Multiplication Algorithm

// :PH: 4.6

 /// Long Hand Procedure Review
//
//  Multiply 5 times 12 in binary:
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//     0000
//    0000
//   0101
//  0101
//  """""""
//  0111100  Product


////////////////////////////////////////////////////////////////////////////////
/// Simple Multiplication Hardware

// :PH: 4.6

 /// Simple Multipliers
//
// Designed to show basic techniques, but are inefficient.
// Following sections show improved multipliers.

 /// Multipliers in this section:
//
// Simple Unsigned Combinational Multiplier
//   See :PH: Figure 4.25
//   Multiplies using combinational logic only.
//   Cost of n-bit multiplier: proportional to n^2.
//   Speed of n-bit multiplier: time needed for n additions:
//      Proportional to n.
//
// Simple Unsigned Multiplier
//   Unlike combinational multiplier, uses a clock.
//   Cost of n-bit multiplier: 
//      Proportional to n.
//      Includes 5n bits of register storage.
//   Speed of n-bit multiplier: 
//      Product in n clocks.
//      Clock period includes 2n-bit add.
//   Uses more hardware than streamlined multipliers in next section.


// :Example:
//
// Simple Unsigned Combinational Multiplier
// Based on :PH: Figure 4.25
//
// This is a straightforward coding of the longhand multiplication
// technique.

module simple_combinational_mult(product,multiplier,multiplicand); //P
   input [15:0]  multiplier, multiplicand;
   output        product;

   reg [31:0]    product;

   integer       i;

   always @( multiplier or multiplicand )
     begin
        
        product = 0;
            
        for(i=0; i<16; i=i+1)
          if( multiplier[i] == 1'b1 ) product = product + ( multiplicand << i );
  
     end
     
endmodule


// :Example:
//
// Simple Unsigned Multiplier
//
// Sequential multiplier. Handshaking signals are added to command
// multiplier to start and to indicate when finished.  Registers are
// provided for shifted versions of the multiplier and multiplicand.

module simple_mult(product,ready,multiplier,multiplicand,start,clk); //P
   input [15:0]  multiplier, multiplicand;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;

   reg [15:0]    multiplier_copy;
   reg [31:0]    multiplicand_copy;
   
   reg [4:0]     bit; 
   wire          ready = !bit;
   
   initial bit = 0;

   always @( posedge clk )

     if( ready && start ) begin

        bit               = 16;
        product           = 0;
        multiplicand_copy = { 16'd0, multiplicand };
        multiplier_copy   = multiplier;
        
     end else if( bit ) begin

        if( multiplier_copy[0] == 1'b1 ) product = product + multiplicand_copy;

        multiplier_copy = multiplier_copy >> 1;
        multiplicand_copy = multiplicand_copy << 1;
        bit = bit - 1;

     end

endmodule


////////////////////////////////////////////////////////////////////////////////
/// Streamlined Multiplication Hardware

// :PH: 4.6

 /// Streamlined Multipliers
//
// Uses less hardware than simple multiplier.
// Much slower than "real" multipliers.

 /// Multipliers in this section:
//
// Streamlined Unsigned Multiplier
//   See :PH: Figure 4.31
//   Less register storage than simple multiplier.
//   Cost of n-bit multiplier: 
//      Proportional to n.
//      Includes 2n bits of register storage.
//   Speed of n-bit multiplier: (Same as simple.)
//      Product in n clocks.
//      Clock period includes a 2n-bit add.
//
// Streamlined Signed Multiplier
//   Like previous multiplier, but handles signed numbers.
//   A better signed multiplier covered later (using Booth recoding).
//   Cost of n-bit multiplier: 
//      Proportional to n.
//      Includes 3n bits of register storage.
//   Speed of n-bit multiplier:
//      Product in n clocks.
//      Clock period includes 2n-bit add.


// :Example:
//
// Streamlined Unsigned Multiplier
// Based on :PH: Figure 4.31
//
// Improvements:
//
//   The product register also holds the multiplier.
//   Only the product register is shifted.
//   A register is not needed to hold the multiplicand.
//   Wow, you save 2n bits! (To multiply n-bit numbers.)

module streamlined_mult(product,ready,multiplier,multiplicand,start,clk);
   input [15:0]  multiplier, multiplicand;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;

   reg [4:0]     bit; 
   wire          ready = !bit;
   
   initial bit = 0;

   always @( posedge clk )

     if( ready && start ) begin

        bit     = 16;
        product = { 16'd0, multiplier };
        
     end else if( bit ) begin:A
        
        reg lsb;

        lsb     = product[0];
        product = product >> 1;
        bit     = bit - 1;

        if( lsb ) product[31:15] = product[30:15] + multiplicand;

     end

endmodule


// :Example:
//
// Streamlined Signed Multiplier
//
// Multiplies absolute value of multiplier and multiplicand.
// Negates product if multiplier and multiplicand differ in sign.
// Otherwise similar to streamlined unsigned multiplier.

module streamlined_signed_mult(product,ready,multiplier,multiplicand,
                               start,clk);

   input [15:0]  multiplier, multiplicand;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;
   reg [15:0]    abs_multiplicand;

   reg [4:0]     bit; 
   wire          ready = !bit;
   
   initial bit = 0;

   always @( posedge clk )

     if( ready && start ) begin

        bit     = 16;
        product = { 16'd0, multiplier[15] ? -multiplier : multiplier };
        abs_multiplicand = multiplicand[15] ? -multiplicand : multiplicand;
        
     end else if( bit ) begin:A
        
        reg lsb;

        lsb     = product[0];
        product = product >> 1;
        bit     = bit - 1;

        if( lsb ) product[31:15] = product[30:15] + abs_multiplicand;

        if( !bit && multiplicand[15] ^ multiplier[15] ) product = -product;

     end

endmodule


////////////////////////////////////////////////////////////////////////////////
/// Booth Recoding for Higher-Radix and Signed Multiplication

// :PH: 4.6  Only describes Radix-2 Booth Recoding

 /// Basic Idea
//
// Rather than repeatedly adding either 0 or 1 times multiplicand,
// repeatedly add 0, 1, -1, 2, -2, etc. times the multiplicand.
//
 /// Benefits
//
// Performs signed multiplication without having to first compute the
// absolute value of the multiplier and multiplicand.
//
// Improved performance (when radix higher than 2).

 /// Multipliers Described Here
//
// Ordinary Radix-4 Unsigned Multiplier
//   Presented for pedagogical reasons, Booth multipliers better.
//   Twice as fast as earlier multipliers.
//   Uses more hardware than Booth multipliers below.
//
// Booth Recoding Radix-4 Multiplier
//   Multiplies signed numbers.
//   Twice as fast as earlier multipliers.
//
// Booth Recoding Radix-2 Multiplier
//   Multiplies signed numbers.
//   Uses about the same amount of hardware than earlier signed multiplier.


 /// Ordinary Radix-4 Multiplier Idea
//
// Review of Radix-2 Multiplication
//
//  Multiply 5 times 12.  Radix-2 multiplication (the usual way).
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//     0000  0 x 0101
//    0000   0 x 0101
//   0101    1 x 0101
//  0101     1 x 0101
//  """""""
//  0111100  Product
//
// Radix-4 Multiplication
//   Let "a" denote the multiplicand and b denote the multiplier.
//   Pre-compute 2a and 3a.
//   Examine multiplier two bits at a time (rather than one bit at a time).
//   Based on the value of those bits add 0, a, 2a, or 3a (shifted by
//     the appropriate amount).
//
//   Uses n/2 additions to multiply two n-bit numbers.
//
// Two Radix-4 Multiplication Examples
//
//  Multiply 5 times 12.  Radix-4 multiplication (the faster way).
//
//     Precompute: 2a: 01010,  3a: 01111
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//    00000  00 x 0101  
//  01111    11 x 0101
//  """""""
//  0111100  Product
//
//  Multiply 5 times 9.  Radix-4 multiplication (the faster way).
//
//     0101  Multiplicand
//     1001  Multiplier
//     """"
//    00101  01 x 0101
//  01010    10 x 0101
//  """""""
//  0101101  Product

 // Ordinary Radix-2^d Multiplier
//
// This is a generalization of the Radix-4 multiplier.
//
//   Let "a" denote the multiplicand and b denote the multiplier.
//   Pre-compute 2a, 3a, 4a, 5a, ..., (2^d-1)a
//   Examine multiplier d bits at a time.
//   Let the value of those bits be v.
//   Add v shifted by the appropriate amount.
//
//   Uses n/d additions to multiply two n-bit numbers.


// :Example:
//
// A Radix-4 multiplier.  Takes unsigned numbers.

module imult_ord_radix_4(prod,ready,multiplicand,multiplier,start,clk);

   input [15:0]  multiplicand, multiplier;
   input         start, clk;
   output        prod;
   output        ready;

   reg [32:0]    product;
   wire [31:0]   prod = product[31:0];

   reg [4:0]     bit; 
   wire          ready = !bit;

   reg [17:0]    pp;
   
   initial bit = 0;

   wire [17:0]   multiplicand_X_1 = {1'b0,multiplicand};
   wire [17:0]   multiplicand_X_2 = {multiplicand,1'b0};
   wire [17:0]   multiplicand_X_3 = multiplicand_X_2 + multiplicand_X_1;

   always @( posedge clk )

     if( ready && start ) begin

        bit     = 8;
        product = { 16'd0, multiplier };
        
     end else if( bit ) begin

        case ( {product[1:0]} )
          2'd0: pp = {2'b0, product[31:16] };
          2'd1: pp = {2'b0, product[31:16] } + multiplicand_X_1;
          2'd2: pp = {2'b0, product[31:16] } + multiplicand_X_2;
          2'd3: pp = {2'b0, product[31:16] } + multiplicand_X_3;
        endcase

        product = { pp, product[15:2] };
        bit     = bit - 1;

     end

endmodule


 /// Booth Recoding
//
// Goal: Reduce the number of pre-computed multiples of the multiplicand.
// Bonus: Handles signed numbers.
//
// Idea: Use something like carries.
//
// Why It's Called Recoding
//   Numbers like 3 are recoded into 4-1, avoiding the need to deal with 3.
//
// Consider need to compute multiplicand_X_3 for radix-4 multiplier.
// Instead:
//   Subtract multiplicand and add 4 times multiplicand.
//   The subtraction is done in the "current" step.
//   The addition is done in the "next" step.

//  Multiply 5 times 3.  Booth radix-4 multiplication.
//
//     0101  Multiplicand
//     0011  Multiplier
//  """""""
//  1111011  -1      x 0101  Multiplier bits 11 -> -1 + 4 -> -1 now, +1 next step
//  00101     (00+1) x 0101  Multiplier bits 00 -> 0 (now) + 1 (from last step)
//  """""""
//  0001111  Product


// The table below indicates:
//
// MB: Multiplier bits value.
//  C: Carry bit (lostbit in modules) from previous step.
//  x: Amount to multiply multiplicand by.
//  c: Carry bit to next step.
//
// Radix-4 Booth Table
//
//  MB C  |  x      c
//  """""""""""""""""
//  00 0  |  0 + 0  0
//  00 1  |  1 + 0  0
//  01 0  |  1 + 0  0
//  01 1  |  2 + 0  0
//  10 0  | -2 + 4  1
//  10 1  | -1 + 4  1
//  11 0  | -1 + 4  1
//  11 1  |  0 + 4  1
//
// For example above:
// Step 1: MB -> 11,  C -> 0,  lookup reveals  x -> -1,  c -> 1
// Step 2: MB -> 00,  C -> 1,  lookup reveals  x -> 1,  c -> 0
//
// Radix-2 Booth Table
//
//  MB C  |  x      c
//  """""""""""""""""
//  0  0  |  0 + 0  0
//  0  1  |  1 + 0  0
//  1  0  | -1 + 2  1
//  1  1  |  0 + 2  1


// Note:
//  The carry out is the same as the MSB of the multiplier bits examined.
//  That's intentional, so that no special storage or computation is need
//    for the carry.
//
//  The carry out of the last step is not used.
//
//  Construction of higher-radix tables is straightforward.


// :Example:
//
// A Booth radix-4 multiplier.  This takes signed numbers.

module imult_Booth_radix_4(prod,ready,multiplicand,multiplier,start,clk); // P

   input [15:0]  multiplicand, multiplier;
   input         start, clk;
   output        prod;
   output        ready;

   reg [32:0]    product;
   wire [31:0]   prod = product[31:0];


   reg [4:0]     bit; 
   wire          ready = !bit;
   reg           lostbit;
   
   initial bit = 0;

   wire [16:0]   multsx = {multiplicand[15],multiplicand};

   always @( posedge clk )

     if( ready && start ) begin

        bit     = 8;
        product = { 17'd0, multiplier };
        lostbit = 0;
        
     end else if( bit ) begin:A

        case ( {product[1:0],lostbit} )
          3'b001: product[32:16] = product[32:16] + multsx;
          3'b010: product[32:16] = product[32:16] + multsx;
          3'b011: product[32:16] = product[32:16] + 2 * multiplicand;
          3'b100: product[32:16] = product[32:16] - 2 * multiplicand;
          3'b101: product[32:16] = product[32:16] - multsx;
          3'b110: product[32:16] = product[32:16] - multsx;
        endcase

        lostbit = product[1];
        product = { product[32], product[32], product[32:2] };
        bit     = bit - 1;

     end

endmodule


// :Example:
//
// Radix-2 Booth multiplier.  Unlike signed multiplier, does not
// require time or hardware for taking absolute value of operands.

module imult_Booth(product,ready,multiplicand,multiplier,start,clk); // P

   input [15:0]  multiplicand, multiplier;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;

   reg [4:0]     bit; 
   wire          ready = !bit;
   reg           lostbit;
   
   initial bit = 0;

   always @( posedge clk )

     if( ready && start ) begin

        bit     = 16;
        product = { 16'd0, multiplier };
        lostbit = 0;
        
     end else if( bit ) begin:A

        case ( {product[0],lostbit} )
          2'b01: product[31:16] = product[31:16] + multiplicand;
          2'b10: product[31:16] = product[31:16] - multiplicand;
        endcase

        lostbit = product[0];
        product = { product[31], product[31:1] };
        bit     = bit - 1;

     end

endmodule


////////////////////////////////////////////////////////////////////////////////
/// Binary Division Algorithm

// :PH: 4.7

// Binary Version of Longhand Division Technique
//
// 11 divided by 3:
//
// 11 (1011) is dividend.
//  3 (0011) is divider.
//
//  """"""""|
//     1011 |
// -0011    |
//  """"""""|    0  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |
//  -0011   |
//  """"""""|   00  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |
//   -0011  |
//  """"""""|  001  Difference is positive: use difference and put 1 in quotient.
//     0101 |
//    -0011 |
//  """"""""| 0011  Difference is positive: use difference and put 1 in quotient.
//       10 | 
//
// Quotient, 3 (0011); remainder 2 (10).


////////////////////////////////////////////////////////////////////////////////
/// Division Hardware (Simple and Streamlined)

// :PH: 4.7

 /// Division Hardware
//
// Simple Divider
//   See :PH: Figure 4.36
//   A straightforward translation of binary division algorithm into hardware.
//   Cost of n-bit divider:
//      Proportional to n.
//      Uses 5n bits of register storage.
//   Speed of n-bit divider:
//      Quotient in n clock cycles.
//
// Streamlined Divider
//   See :PH: Figure 4.41
//   Less storage needed.
//   Cost of n-bit divider:
//      Proportional to n.
//      Uses 2n bits of register storage.
//   Speed of n-bit divider:
//      Quotient in n clock cycles.


// :Example:
//
// Simple divider.
// Based on :PH: Figure 4.36

module simple_divider(quotient,remainder,ready,dividend,divider,start,clk);
   
   input [15:0]  dividend,divider;
   input         start, clk;
   output        quotient,remainder;
   output        ready;

//  """"""""|
//     1011 |  <----  dividend_copy
// -0011    |  <----  divider_copy
//  """"""""|    0  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |  <----  dividend_copy
//  -0011   |  <----  divider_copy
//  """"""""|   00  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |  <----  dividend_copy
//   -0011  |  <----  divider_copy
//  """"""""|  001  Difference is positive: use difference and put 1 in quotient.
//            quotient (numbers above)   


   reg [15:0]    quotient;
   reg [31:0]    dividend_copy, divider_copy, diff;
   wire [15:0]   remainder = dividend_copy[15:0];

   reg [4:0]     bit; 
   wire          ready = !bit;
   
   initial bit = 0;

   always @( posedge clk ) 

     if( ready && start ) begin

        bit = 16;
        quotient = 0;
        dividend_copy = {16'd0,dividend};
        divider_copy = {1'b0,divider,15'd0};

     end else begin

        diff = dividend_copy - divider_copy;

        quotient = quotient << 1;

        if( !diff[31] ) begin

           dividend_copy = diff;
           quotient[0] = 1'd1;

        end

        divider_copy = divider_copy >> 1;
        bit = bit - 1;

     end

endmodule


// :Example:
//
// Streamlined divider.  
// Based on :PH: Figure 4.41
//
// Uses less register storage than simple divider.

module streamlined_divider(quotient,remainder,ready,dividend,divider,start,clk);
   
   input [15:0]  dividend,divider;
   input         start, clk;
   output        quotient,remainder;
   output        ready;

   reg [31:0]    qr;
   reg [16:0]    diff;

//
//              0000 1011
//  """"""""|
//     1011 |   0001 0110     <- qr reg
// -0011    |  -0011          <- divider (never changes)
//  """"""""|           
//     1011 |   0010 110o     <- qr reg
//  -0011   |  -0011
//  """"""""|
//     1011 |   0101 10oo     <- qr reg
//   -0011  |  -0011
//  """"""""|   0010 1000     <- qr reg before shift
//     0101 |   0101 0ooi     <- after shift
//    -0011 |  -0011
//  """"""""|   0010 ooii
//       10 |   
//
// Quotient, 3 (0011); remainder 2 (10).

   
   wire [15:0]   remainder = qr[31:16];
   wire [15:0]   quotient = qr[15:0];

   reg [4:0]     bit; 
   wire          ready = !bit;
   
   initial bit = 0;

   always @( posedge clk ) 

     if( ready && start ) begin

        bit = 16;
        qr = {16'd0,dividend};

     end else begin

        diff = qr[31:15] - {1'b0,divider};

        if( diff[16] )
          qr = {qr[30:0],1'd0};
        else
          qr = {diff[15:0],qr[14:0],1'd1};
        
        bit = bit - 1;

     end

endmodule


////////////////////////////////////////////////////////////////////////////////
/// "Real" Arithmetic Circuits

// :HP: Appendix A

 /// Adders
//
// Multiple-level CLA covered in class reasonably close to adders used
// in processors.
//
// Other fast adder techniques also used. (See :PH: problems.)

 /// Multipliers
//
// The multiplication hardware presented above is much slower than
// the hardware used in real processors.
//
// "Real" n-bit Multiplier Features
//
//   Multiplication done in one or two cycles (assume one cycle).
//   Uses higher-radix (say 4) Booth recoding or something similar.
//   Enough adders are provided so that product computed in one cycle.
//   These adders can add n numbers quickly without the cost of n CLA's.
//     Cost may be something like 3/2 n ripple adders plus 1 CLA.
//     Delay through each adder is only a handful of gates.
//   Adders may be connected as a tree, so that number of adders
//     from input to output is proportional to log n (rather than n).

 /// Dividers
//
// The division hardware presented above is slower than hardware used
// in real processors.
//
// Division trickier than multiplication because result of step i
// needed for i+1.  This precludes the tree structures used in
// fast multipliers.
//
// FP Dividers in general-purpose processors typically take 10-20 cycles.
//
// Several division techniques used.  Examples:
//   SRT: Sweeney, Robertson, Tocher
//   Newton Iteration
//   Goldschmidt's Algorithm
//