```/// LSU EE 3755 -- Spring 2002 -- Computer Organization
//
/// Verilog Notes 7 -- Integer Multiply and Divide

// Time-stamp: <13 March 2002, 10:18:27 CST, koppel@sol>

/// Contents
//
// Binary Multiplication Algorithm
// Simple Multiplication Hardware
// Streamlined Multiplication Hardware
// Booth Recoding for Higher-Radix and Signed Multiplication
// Binary Division Algorithm
// Division Hardware (Simple and Streamlined)
// "Real" Arithmetic Circuits

/// References
//
// :P:   Palnitkar, "Verilog HDL"
// :Q:   Qualis, "Verilog HDL Quick Reference Card Revision 1.0"
// :H:   Hyde, "Handbook on Verilog HDL"
// :LRM: IEEE, Verilog Language Reference Manual  (Hawaii Section Numbering)
// :PH:  Patterson & Hennessy, "Computer Organization & Design"
// :HP:  Hennessy & Patterson, "Computer Architecture: A Quantitative Approach"

////////////////////////////////////////////////////////////////////////////////
/// Binary Multiplication Algorithm

// :PH: 4.6

/// Long Hand Procedure Review
//
//  Multiply 5 times 12 in binary:
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//     0000
//    0000
//   0101
//  0101
//  """""""
//  0111100  Product

////////////////////////////////////////////////////////////////////////////////
/// Simple Multiplication Hardware

// :PH: 4.6

/// Simple Multipliers
//
// Designed to show basic techniques, but are inefficient.
// Following sections show improved multipliers.

/// Multipliers in this section:
//
// Simple Unsigned Combinational Multiplier
//   See :PH: Figure 4.25
//   Multiplies using combinational logic only.
//   Cost of n-bit multiplier: proportional to n^2.
//   Speed of n-bit multiplier: time needed for n additions:
//      Proportional to n.
//
// Simple Unsigned Multiplier
//   Unlike combinational multiplier, uses a clock.
//   Cost of n-bit multiplier:
//      Proportional to n.
//      Includes 5n bits of register storage.
//   Speed of n-bit multiplier:
//      Product in n clocks.
//      Clock period includes 2n-bit add.
//   Uses more hardware than streamlined multipliers in next section.

// :Example:
//
// Simple Unsigned Combinational Multiplier
// Based on :PH: Figure 4.25
//
// This is a straightforward coding of the longhand multiplication
// technique.

// Combinational 16 x 16 -> 32 bit unsigned multiplier.
//
// Ports:
//   product       (output, 32 bits)  multiplier * multiplicand.
//   multiplier    (input,  16 bits)  unsigned operand, examined bit by bit.
//   multiplicand  (input,  16 bits)  unsigned operand, shifted and added.
//
// The always block re-evaluates whenever either operand changes and
// accumulates one shifted copy of the multiplicand for every 1 bit of
// the multiplier (the longhand method).
module simple_combinational_mult(product,multiplier,multiplicand); //P
   input [15:0]  multiplier, multiplicand;
   // The port range must be identical to the range of the reg declaration
   // below; a scalar port declaration paired with a vector reg is rejected
   // by standard-conforming tools.
   output [31:0] product;

   reg [31:0]    product;

   integer       i;

   always @( multiplier or multiplicand )
     begin

        product = 0;

        // Bit i of the multiplier has weight 2^i, so the multiplicand is
        // shifted left by i before being added.  The shift is evaluated
        // at the 32-bit width of product, so no bits are lost.
        for ( i=0; i<16; i=i+1 )
          if ( multiplier[i] == 1'b1 )
            product = product + ( multiplicand << i );

     end

endmodule

// :Example:
//
// Simple Unsigned Multiplier
//
// Sequential multiplier. Handshaking signals are added to command
// multiplier to start and to indicate when finished.  Registers are
// provided for shifted versions of the multiplier and multiplicand.

// Body of a sequential 16 x 16 -> 32 bit unsigned shift-and-add multiplier.
//
// NOTE(review): the module header and the declaration of `ready` are not
// present in this excerpt; `ready` is presumably asserted while bit == 0
// -- confirm against the complete file.
//
// Operation (all on rising edges of clk):
//   ready && start : latch the operands into the copy registers, clear
//                    product, and set bit to 16.
//   bit != 0       : add multiplicand_copy to product when the LSB of
//                    multiplier_copy is 1, shift multiplier_copy right,
//                    shift multiplicand_copy left, decrement bit.
//   After 16 steps bit is 0 again and product is no longer modified.
input [15:0]  multiplier, multiplicand;
input         start, clk;
// Range must be identical to the reg declaration of product below.
output [31:0] product;

reg [31:0]    product;

reg [15:0]    multiplier_copy;    // Remaining multiplier bits, LSB next.
reg [31:0]    multiplicand_copy;  // Multiplicand weighted for the current bit.

reg [4:0]     bit;                // Number of steps still to perform.

initial bit = 0;

always @( posedge clk )

  if( ready && start ) begin

     bit               = 16;
     product           = 0;
     multiplicand_copy = { 16'd0, multiplicand };
     multiplier_copy   = multiplier;

  end else if( bit ) begin

     if( multiplier_copy[0] == 1'b1 )
       product = product + multiplicand_copy;

     // Advance to the next multiplier bit and double the weight of the
     // multiplicand for it.
     multiplier_copy   = multiplier_copy >> 1;
     multiplicand_copy = multiplicand_copy << 1;
     bit               = bit - 1;

  end

endmodule

////////////////////////////////////////////////////////////////////////////////
/// Streamlined Multiplication Hardware

// :PH: 4.6

/// Streamlined Multipliers
//
// Uses less hardware than simple multiplier.
// Much slower than "real" multipliers.

/// Multipliers in this section:
//
// Streamlined Unsigned Multiplier
//   See :PH: Figure 4.31
//   Less register storage than simple multiplier.
//   Cost of n-bit multiplier:
//      Proportional to n.
//      Includes 2n bits of register storage.
//   Speed of n-bit multiplier: (Same as simple.)
//      Product in n clocks.
//      Clock period includes a 2n-bit add.
//
// Streamlined Signed Multiplier
//   Like previous multiplier, but handles signed numbers.
//   A better signed multiplier covered later (using Booth recoding).
//   Cost of n-bit multiplier:
//      Proportional to n.
//      Includes 3n bits of register storage.
//   Speed of n-bit multiplier:
//      Product in n clocks.
//      Clock period includes 2n-bit add.

// :Example:
//
// Streamlined Unsigned Multiplier
// Based on :PH: Figure 4.31
//
// Improvements:
//
//   The product register also holds the multiplier.
//   Only the product register is shifted.
//   A register is not needed to hold the multiplicand.
//   Wow, you save 2n bits! (To multiply n-bit numbers.)

// Body of a streamlined sequential 16 x 16 -> 32 bit unsigned multiplier.
//
// NOTE(review): the module header and the declaration of `ready` are not
// present in this excerpt; `ready` is presumably asserted while bit == 0
// -- confirm against the complete file.
//
// The product register doubles as the multiplier register: it is loaded
// with the multiplier in its low half, and every step consumes its LSB
// while shifting the whole register right by one.
input [15:0]  multiplier, multiplicand;
input         start, clk;
// Range must be identical to the reg declaration of product below.
output [31:0] product;

reg [31:0]    product;

reg [4:0]     bit;                // Number of steps still to perform.

initial bit = 0;

always @( posedge clk )

  if( ready && start ) begin

     bit     = 16;
     product = { 16'd0, multiplier };

  end else if( bit ) begin:A

     reg lsb;

     // Capture the current multiplier bit before it is shifted out.
     lsb     = product[0];
     product = product >> 1;
     bit     = bit - 1;

     // The shift has already been done, so the former upper half now sits
     // in bits 30:15.  The 17-bit destination keeps the carry out of the
     // 16-bit addition.
     if( lsb ) product[31:15] = product[30:15] + multiplicand;

  end

endmodule

// :Example:
//
// Streamlined Signed Multiplier
//
// Multiplies absolute value of multiplier and multiplicand.
// Negates product if multiplier and multiplicand differ in sign.
// Otherwise similar to streamlined unsigned multiplier.

start,clk);
// NOTE(review): the line above is the tail of a module header whose first
// line is not present in this excerpt, and `ready` is not declared here;
// `ready` is presumably asserted while bit == 0 -- confirm against the
// complete file.
//
// Body of a streamlined sequential 16 x 16 -> 32 bit signed multiplier.
//
// The absolute values of the operands are multiplied with the same
// shift-right scheme as the unsigned streamlined multiplier, and the
// product is negated in the final step when the operand signs differ.

input [15:0]  multiplier, multiplicand;
input         start, clk;
// Range must be identical to the reg declaration of product below.
output [31:0] product;

reg [31:0]    product;
reg [15:0]    abs_multiplicand;   // |multiplicand|, latched at start.

reg [4:0]     bit;                // Number of steps still to perform.

initial bit = 0;

always @( posedge clk )

  if( ready && start ) begin

     bit     = 16;
     // Low half of product holds |multiplier|; bit 15 is the sign bit.
     product = { 16'd0, multiplier[15] ? -multiplier : multiplier };
     abs_multiplicand = multiplicand[15] ? -multiplicand : multiplicand;

  end else if( bit ) begin:A

     reg lsb;

     // Capture the current multiplier bit before it is shifted out.
     lsb     = product[0];
     product = product >> 1;
     bit     = bit - 1;

     // After the shift the former upper half sits in bits 30:15; the
     // 17-bit destination keeps the carry out of the addition.
     if( lsb ) product[31:15] = product[30:15] + abs_multiplicand;

     // Final step (bit has just reached 0): apply the sign.  The sign bits
     // are read from the input ports, not from latched copies, so the
     // inputs must still hold the operands at this point.
     if( !bit && ( multiplicand[15] ^ multiplier[15] ) ) product = -product;

  end

endmodule

////////////////////////////////////////////////////////////////////////////////
/// Booth Recoding for Higher-Radix and Signed Multiplication

// :PH: 4.6  Only describes Radix-2 Booth Recoding

/// Basic Idea
//
// Rather than repeatedly adding either 0 or 1 times multiplicand,
// repeatedly add 0, 1, -1, 2, -2, etc. times the multiplicand.
//
/// Benefits
//
// Performs signed multiplication without having to first compute the
// absolute value of the multiplier and multiplicand.
//
// Improved performance (when radix higher than 2).

/// Multipliers Described Here
//
//   Presented for pedagogical reasons, Booth multipliers better.
//   Twice as fast as earlier multipliers.
//   Uses more hardware than Booth multipliers below.
//
//   Multiplies signed numbers.
//   Twice as fast as earlier multipliers.
//
//   Multiplies signed numbers.
//   Uses about the same amount of hardware as the earlier signed multiplier.

//
//
//  Multiply 5 times 12.  Radix-2 multiplication (the usual way).
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//     0000  0 x 0101
//    0000   0 x 0101
//   0101    1 x 0101
//  0101     1 x 0101
//  """""""
//  0111100  Product
//
//   Let "a" denote the multiplicand and b denote the multiplier.
//   Pre-compute 2a and 3a.
//   Examine multiplier two bits at a time (rather than one bit at a time).
//   Based on the value of those bits add 0, a, 2a, or 3a (shifted by
//     the appropriate amount).
//
//   Uses n/2 additions to multiply two n-bit numbers.
//
//
//  Multiply 5 times 12.  Radix-4 multiplication (the faster way).
//
//     Precompute: 2a: 01010,  3a: 01111
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//    00000  00 x 0101
//  01111    11 x 0101
//  """""""
//  0111100  Product
//
//  Multiply 5 times 9.  Radix-4 multiplication (the faster way).
//
//     0101  Multiplicand
//     1001  Multiplier
//     """"
//    00101  01 x 0101
//  01010    10 x 0101
//  """""""
//  0101101  Product

//
// This is a generalization of the Radix-4 multiplier.
//
//   Let "a" denote the multiplicand and b denote the multiplier.
//   Pre-compute 2a, 3a, 4a, 5a, ..., (2^d-1)a
//   Examine multiplier d bits at a time.
//   Let the value of those bits be v.
//   Add v times a (one of the pre-computed multiples), shifted by the
//     appropriate amount.
//
//   Uses n/d additions to multiply two n-bit numbers.

// :Example:
//
// A Radix-4 multiplier.  Takes unsigned numbers.

// Body of a sequential radix-4 16 x 16 -> 32 bit unsigned multiplier.
//
// NOTE(review): the module header and the declaration of `ready` are not
// present in this excerpt; `ready` is presumably asserted while bit == 0
// -- confirm against the complete file.
//
// Two multiplier bits are retired per clock, so a multiplication takes
// 8 steps.  The multiples 1x, 2x and 3x of the multiplicand are formed
// combinationally and one of them (or nothing) is added in each step.
input [15:0]  multiplicand, multiplier;
input         start, clk;
// Range must be identical to the wire declaration of prod below.
output [31:0] prod;

reg [32:0]    product;
wire [31:0]   prod = product[31:0];

reg [4:0]     bit;                // Number of steps still to perform.

reg [17:0]    pp;                 // Upper half plus selected multiple.

initial bit = 0;

// Multiples of the multiplicand, zero-extended to 18 bits.
wire [17:0]   multiplicand_X_1 = {1'b0,multiplicand};
wire [17:0]   multiplicand_X_2 = {multiplicand,1'b0};
wire [17:0]   multiplicand_X_3 = multiplicand_X_2 + multiplicand_X_1;

always @( posedge clk )

  if( ready && start ) begin

     bit     = 8;
     product = { 16'd0, multiplier };

  end else if( bit ) begin

     // The two low bits of product are the next radix-4 multiplier digit.
     case ( {product[1:0]} )
       2'd0: pp = {2'b0, product[31:16] };
       2'd1: pp = {2'b0, product[31:16] } + multiplicand_X_1;
       2'd2: pp = {2'b0, product[31:16] } + multiplicand_X_2;
       2'd3: pp = {2'b0, product[31:16] } + multiplicand_X_3;
     endcase

     // Shift right by two: the 18-bit sum lands in bits 31:14 and the
     // remaining multiplier bits move down to bits 13:0.
     product = { pp, product[15:2] };
     bit     = bit - 1;

  end

endmodule

/// Booth Recoding
//
// Goal: Reduce the number of pre-computed multiples of the multiplicand.
// Bonus: Handles signed numbers.
//
// Idea: Use something like carries.
//
// Why It's Called Recoding
//   Numbers like 3 are recoded into 4-1, avoiding the need to deal with 3.
//
// Consider need to compute multiplicand_X_3 for radix-4 multiplier.
//   Subtract multiplicand and add 4 times multiplicand.
//   The subtraction is done in the "current" step.
//   The addition is done in the "next" step.

//  Multiply 5 times 3.  Booth radix-4 multiplication.
//
//     0101  Multiplicand
//     0011  Multiplier
//  """""""
//  1111011  -1      x 0101  Multiplier bits 11 -> -1 + 4 -> -1 now, +1 next step
//  00101     (00+1) x 0101  Multiplier bits 00 -> 0 (now) + 1 (from last step)
//  """""""
//  0001111  Product

// The table below indicates:
//
// MB: Multiplier bits value.
//  C: Carry bit (lostbit in modules) from previous step.
//  x: Amount to multiply multiplicand by.
//  c: Carry bit to next step.
//
// Radix-4 Booth recoding table (two multiplier bits examined per step):
//  MB C  |  x      c
//  """""""""""""""""
//  00 0  |  0 + 0  0
//  00 1  |  1 + 0  0
//  01 0  |  1 + 0  0
//  01 1  |  2 + 0  0
//  10 0  | -2 + 4  1
//  10 1  | -1 + 4  1
//  11 0  | -1 + 4  1
//  11 1  |  0 + 4  1
//
// For example above:
// Step 1: MB -> 11,  C -> 0,  lookup reveals  x -> -1,  c -> 1
// Step 2: MB -> 00,  C -> 1,  lookup reveals  x -> 1,  c -> 0
//
// Radix-2 Booth recoding table (one multiplier bit examined per step):
//  MB C  |  x      c
//  """""""""""""""""
//  0  0  |  0 + 0  0
//  0  1  |  1 + 0  0
//  1  0  | -1 + 2  1
//  1  1  |  0 + 2  1

// Note:
//  The carry out is the same as the MSB of the multiplier bits examined.
//  That's intentional, so that no special storage or computation is needed
//    for the carry.
//
//  The carry out of the last step is not used.
//
//  Construction of higher-radix tables is straightforward.

// :Example:
//
// A Booth radix-4 multiplier.  This takes signed numbers.

// Body of a sequential Booth radix-4 16 x 16 -> 32 bit signed multiplier.
//
// NOTE(review): the module header and the declaration of `ready` are not
// present in this excerpt; `ready` is presumably asserted while bit == 0
// -- confirm against the complete file.
//
// Each of the 8 steps examines two multiplier bits plus the bit shifted
// out in the previous step (lostbit) and adds -2, -1, 0, +1 or +2 times
// the multiplicand to the upper 17 bits of product, then shifts product
// right by two with sign extension.
//
// NOTE(review): with multiplicand == 16'h8000 the "- 2 * multiplicand"
// case appears to overflow the 17-bit accumulator -- confirm whether that
// operand needs to be supported.
input [15:0]  multiplicand, multiplier;
input         start, clk;
// Range must be identical to the wire declaration of prod below.
output [31:0] prod;

reg [32:0]    product;
wire [31:0]   prod = product[31:0];

reg [4:0]     bit;                // Number of steps still to perform.
reg           lostbit;            // MSB of the previously examined pair.

initial bit = 0;

// Multiplicand sign-extended to the 17-bit accumulator width.
wire [16:0]   multsx = {multiplicand[15],multiplicand};

always @( posedge clk )

  if( ready && start ) begin

     bit     = 8;
     product = { 17'd0, multiplier };
     lostbit = 0;

  end else if( bit ) begin:A

     // Booth recoding; 3'b000 and 3'b111 add nothing and so have no entry.
     case ( {product[1:0],lostbit} )
       3'b001: product[32:16] = product[32:16] + multsx;
       3'b010: product[32:16] = product[32:16] + multsx;
       3'b011: product[32:16] = product[32:16] + 2 * multiplicand;
       3'b100: product[32:16] = product[32:16] - 2 * multiplicand;
       3'b101: product[32:16] = product[32:16] - multsx;
       3'b110: product[32:16] = product[32:16] - multsx;
     endcase

     // Remember the bit about to be shifted out, then shift right by two
     // while replicating the sign bit.
     lostbit = product[1];
     product = { product[32], product[32], product[32:2] };
     bit     = bit - 1;

  end

endmodule

// :Example:
//
// Radix-2 Booth multiplier.  Unlike signed multiplier, does not
// require time or hardware for taking absolute value of operands.

// Body of a sequential Booth radix-2 16 x 16 -> 32 bit signed multiplier.
//
// NOTE(review): the module header and the declaration of `ready` are not
// present in this excerpt; `ready` is presumably asserted while bit == 0
// -- confirm against the complete file.
//
// Each of the 16 steps examines the LSB of product together with the bit
// shifted out in the previous step (lostbit), adds or subtracts the
// multiplicand in the upper half accordingly, and then shifts product
// right by one with sign extension.
//
// NOTE(review): with multiplicand == 16'h8000 the subtraction appears to
// overflow the 16-bit upper half -- confirm whether that operand needs to
// be supported.
input [15:0]  multiplicand, multiplier;
input         start, clk;
// Range must be identical to the reg declaration of product below.
output [31:0] product;

reg [31:0]    product;

reg [4:0]     bit;                // Number of steps still to perform.
reg           lostbit;            // Bit shifted out in the previous step.

initial bit = 0;

always @( posedge clk )

  if( ready && start ) begin

     bit     = 16;
     product = { 16'd0, multiplier };
     lostbit = 0;

  end else if( bit ) begin:A

     // Booth recoding; 2'b00 and 2'b11 add nothing and so have no entry.
     case ( {product[0],lostbit} )
       2'b01: product[31:16] = product[31:16] + multiplicand;
       2'b10: product[31:16] = product[31:16] - multiplicand;
     endcase

     // Remember the bit about to be shifted out, then shift right by one
     // while replicating the sign bit.
     lostbit = product[0];
     product = { product[31], product[31:1] };
     bit     = bit - 1;

  end

endmodule

////////////////////////////////////////////////////////////////////////////////
/// Binary Division Algorithm

// :PH: 4.7

// Binary Version of Longhand Division Technique
//
// 11 divided by 3:
//
// 11 (1011) is dividend.
//  3 (0011) is divider.
//
//  """"""""|
//     1011 |
// -0011    |
//  """"""""|    0  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |
//  -0011   |
//  """"""""|   00  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |
//   -0011  |
//  """"""""|  001  Difference is positive: use difference and put 1 in quotient.
//     0101 |
//    -0011 |
//  """"""""| 0011  Difference is positive: use difference and put 1 in quotient.
//       10 |
//
// Quotient, 3 (0011); remainder 2 (10).

////////////////////////////////////////////////////////////////////////////////
/// Division Hardware (Simple and Streamlined)

// :PH: 4.7

/// Division Hardware
//
// Simple Divider
//   See :PH: Figure 4.36
//   A straightforward translation of binary division algorithm into hardware.
//   Cost of n-bit divider:
//      Proportional to n.
//      Uses 5n bits of register storage.
//   Speed of n-bit divider:
//      Quotient in n clock cycles.
//
// Streamlined Divider
//   See :PH: Figure 4.41
//   Less storage needed.
//   Cost of n-bit divider:
//      Proportional to n.
//      Uses 2n bits of register storage.
//   Speed of n-bit divider:
//      Quotient in n clock cycles.

// :Example:
//
// Simple divider.
// Based on :PH: Figure 4.36

// Body of a sequential 16-bit unsigned divider (restoring, shifted divider).
//
// NOTE(review): the module header and the declaration of `ready` are not
// present in this excerpt; `ready` is presumably asserted while bit == 0
// -- confirm against the complete file.
//
// The divider is loaded shifted left by 15 and moved right one place per
// step.  In each of the 16 steps it is subtracted from the running
// dividend; a non-negative difference replaces the dividend and shifts a
// 1 into the quotient, otherwise a 0 is shifted in.
input [15:0]  dividend,divider;
input         start, clk;
// Ranges must be identical to the 16-bit declarations of quotient and
// remainder below.
output [15:0] quotient,remainder;

//  """"""""|
//     1011 |  <----  dividend_copy
// -0011    |  <----  divider_copy
//  """"""""|    0  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |  <----  dividend_copy
//  -0011   |  <----  divider_copy
//  """"""""|   00  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |  <----  dividend_copy
//   -0011  |  <----  divider_copy
//  """"""""|  001  Difference is positive: use difference and put 1 in quotient.
//            quotient (numbers above)

reg [15:0]    quotient;
reg [31:0]    dividend_copy, divider_copy, diff;
wire [15:0]   remainder = dividend_copy[15:0];

reg [4:0]     bit;                // Number of steps still to perform.

initial bit = 0;

always @( posedge clk )

  if( ready && start ) begin

     bit = 16;
     quotient = 0;
     dividend_copy = {16'd0,dividend};
     divider_copy = {1'b0,divider,15'd0};

  // Step only while bit != 0.  Without this guard the quotient would keep
  // shifting and the 5-bit counter would wrap from 0 to 31 once the
  // division has completed, destroying the result.
  end else if( bit ) begin

     diff = dividend_copy - divider_copy;

     quotient = quotient << 1;

     // Bit 31 of the difference is its sign: clear means the divider fit.
     if( !diff[31] ) begin

        dividend_copy = diff;
        quotient[0] = 1'd1;

     end

     divider_copy = divider_copy >> 1;
     bit = bit - 1;

  end

endmodule

// :Example:
//
// Streamlined divider.
// Based on :PH: Figure 4.41
//
// Uses less register storage than simple divider.

// Body of a streamlined sequential 16-bit unsigned divider.
//
// NOTE(review): the module header and the declaration of `ready` are not
// present in this excerpt; `ready` is presumably asserted while bit == 0
// -- confirm against the complete file.
//
// A single 32-bit register, qr, holds the running remainder in its upper
// half and the not yet examined dividend bits followed by the quotient
// bits in its lower half.  The divider itself is never shifted; qr is
// shifted left one place per step instead.
input [15:0]  dividend,divider;
input         start, clk;
// Ranges must be identical to the 16-bit declarations of quotient and
// remainder below.
output [15:0] quotient,remainder;

reg [31:0]    qr;
reg [16:0]    diff;

//
//              0000 1011
//  """"""""|
//     1011 |   0001 0110     <- qr reg
// -0011    |  -0011          <- divider (never changes)
//  """"""""|
//     1011 |   0010 110o     <- qr reg
//  -0011   |  -0011
//  """"""""|
//     1011 |   0101 10oo     <- qr reg
//   -0011  |  -0011
//  """"""""|   0010 1000     <- qr reg before shift
//     0101 |   0101 0ooi     <- after shift
//    -0011 |  -0011
//  """"""""|   0010 ooii
//       10 |
//
// Quotient, 3 (0011); remainder 2 (10).

wire [15:0]   remainder = qr[31:16];
wire [15:0]   quotient = qr[15:0];

reg [4:0]     bit;                // Number of steps still to perform.

initial bit = 0;

always @( posedge clk )

  if( ready && start ) begin

     bit = 16;
     qr = {16'd0,dividend};

  // Step only while bit != 0.  Without this guard qr would keep shifting
  // and the 5-bit counter would wrap from 0 to 31 once the division has
  // completed, destroying the result.
  end else if( bit ) begin

     // Trial subtraction from the upper 17 bits of qr, i.e. from the
     // remainder as it will be positioned after this step's left shift.
     diff = qr[31:15] - {1'b0,divider};

     if( diff[16] )
       // Negative: keep the remainder, shift in a quotient bit of 0.
       qr = {qr[30:0],1'd0};
     else
       // Non-negative: use the difference, shift in a quotient bit of 1.
       qr = {diff[15:0],qr[14:0],1'd1};

     bit = bit - 1;

  end

endmodule

////////////////////////////////////////////////////////////////////////////////
/// "Real" Arithmetic Circuits

// :HP: Appendix A

//
// Multiple-level CLA covered in class reasonably close to adders used
// in processors.
//
// Other fast adder techniques also used. (See :PH: problems.)

/// Multipliers
//
// The multiplication hardware presented above is much slower than
// the hardware used in real processors.
//
// "Real" n-bit Multiplier Features
//
//   Multiplication done in one or two cycles (assume one cycle).
//   Uses higher-radix (say 4) Booth recoding or something similar.
//   Enough adders are provided so that product computed in one cycle.
//   These adders can add n numbers quickly without the cost of n CLA's.
//     Cost may be something like 3/2 n ripple adders plus 1 CLA.
//     Delay through each adder is only a handful of gates.
//   Adders may be connected as a tree, so that number of adders
//     from input to output is proportional to log n (rather than n).

/// Dividers
//
// The division hardware presented above is slower than hardware used
// in real processors.
//
// Division trickier than multiplication because result of step i
// needed for i+1.  This precludes the tree structures used in
// fast multipliers.
//
// FP Dividers in general-purpose processors typically take 10-20 cycles.
//
// Several division techniques used.  Examples:
//   SRT: Sweeney, Robertson, Tocher
//   Newton Iteration
//   Goldschmidt's Algorithm
//
```