/// LSU EE 3755 -- Fall 2013 -- Computer Organization
//
/// Verilog Notes 7 -- Integer Multiply and Divide

// Time-stamp: <8 October 2013, 19:03:19 CDT, koppel@sky.ece.lsu.edu>

/// Contents
//
// Binary Multiplication Algorithm
// Simple Multiplication Hardware
// Streamlined Multiplication Hardware
// Booth Recoding for Higher-Radix and Signed Multiplication
// Binary Division Algorithm
// Division Hardware (Simple and Streamlined)
// "Real" Arithmetic Circuits


/// References
//
// :PH:  Patterson & Hennessy, "Computer Organization & Design", 4th Edition
// :C:   Michael D. Ciletti, "Adv. Digital Design with the Verilog HDL" 2nd Ed
// :HP:  Hennessy & Patterson, "Computer Architecture: A Quantitative Approach"


////////////////////////////////////////////////////////////////////////////////
/// Binary Multiplication Algorithm

// :PH: 3.3

 /// Long Hand Procedure Review
//
//  Multiply 5 times 12 in binary:
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//     0000
//    0000
//   0101
//  0101
//  """""""
//  0111100  Product


////////////////////////////////////////////////////////////////////////////////
/// Simple Multiplication Hardware

// :PH: 3.3

 /// Simple Multipliers
//
// Designed to show basic techniques, but are inefficient.
// Following sections show improved multipliers.

 /// Multipliers in this section:
//
// Simple Unsigned Combinational Multiplier
//   See :PH: Figure 3.4
//   Multiplies using combinational logic only.
//   Cost of n-bit multiplier: proportional to n^2.
//   Speed of n-bit multiplier: time needed for n additions:
//      Proportional to n.
//
// Simple Unsigned Multiplier
//   Unlike combinational multiplier, uses a clock.
//   Cost of n-bit multiplier:
//      Proportional to n.
//      Includes 5n bits of register storage.
//   Speed of n-bit multiplier:
//      Product in n clocks.
//      Clock period includes 2n-bit add.
//   Uses more hardware than streamlined multipliers in next section.


// :Example:
//
// Simple Unsigned Combinational Multiplier
// Based on :PH: Figure 3.4
//
// This is a straightforward coding of the longhand multiplication
// technique.

module simple_combinational_mult(product,multiplier,multiplicand);
   input [15:0]  multiplier, multiplicand;
   output        product;

   reg [31:0]    product;

   integer       i;

   always @( multiplier or multiplicand )
     begin

        product = 0;

        for (i=0; i<16; i=i+1)
          product = product + multiplier[i] * ( multiplicand << i );

     end

endmodule


// :Example:
//
// Simple Unsigned Multiplier
//
// Sequential multiplier. Handshaking signals are added to command
// multiplier to start and to indicate when finished.  Registers are
// provided for shifted versions of the multiplier and multiplicand.

module simple_mult(product,ready,multiplier,multiplicand,start,clk); //P
   input [15:0]  multiplier, multiplicand;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;

   reg [15:0]    multiplier_copy;
   reg [31:0]    multiplicand_copy;

   reg [4:0]     bit;
   wire          ready = !bit;

   initial bit = 0;

   always @( posedge clk )

     if ( ready && start ) begin

        bit               = 16;
        product           = 0;
        multiplicand_copy = { 16'd0, multiplicand };
        multiplier_copy   = multiplier;

     end else if ( bit ) begin

        if ( multiplier_copy[0] == 1'b1 ) product = product + multiplicand_copy;

        multiplier_copy = multiplier_copy >> 1;
        multiplicand_copy = multiplicand_copy << 1;
        bit = bit - 1;

     end

endmodule


////////////////////////////////////////////////////////////////////////////////
/// Streamlined Multiplication Hardware

// :PH: 3.3

 /// Streamlined Multipliers
//
// Uses less hardware than simple multiplier.
// Still much slower than "real" multipliers.

 /// Multipliers in this section:
//
// Streamlined Unsigned Multiplier
//   See :PH: Figure 3.6
//
//   Fewer register bits than simple multiplier.
//
//   Cost of n-bit multiplier:
//      Proportional to n.
//      Includes 2n bits of register storage.
//
//   Speed of n-bit multiplier: (Same as simple.)
//      Product in n clocks.
//      Clock period includes a 2n-bit add.
//
// Streamlined Signed Multiplier
//   Like previous multiplier, but handles signed numbers.
//   A better signed multiplier covered later (using Booth recoding).
//   Cost of n-bit multiplier:
//      Proportional to n.
//      Includes 3n bits of register storage.
//   Speed of n-bit multiplier:
//      Product in n clocks.
//      Clock period includes 2n-bit add.


// :Example:
//
// Streamlined Unsigned Multiplier
// Based on :PH: Figure 3.6
//
// Improvements:
//
//   The product register also holds the multiplier.
//   Only the product register is shifted.
//   A register is not needed to hold the multiplicand.
//   Wow, you save 2n bits! (To multiply n-bit numbers.)

module streamlined_mult(product,ready,multiplier,multiplicand,start,clk);
   input [15:0]  multiplier, multiplicand;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;

   reg [4:0]     bit;
   wire          ready = !bit;

   initial bit = 0;

   always @( posedge clk )

     if ( ready && start ) begin

        bit     = 16;
        product = { 16'd0, multiplier };

     end else if ( bit ) begin:A

        reg lsb;

        lsb     = product[0];
        product = product >> 1;
        bit     = bit - 1;

        if ( lsb ) product[31:15] = product[30:15] + multiplicand;

     end

endmodule


// :Example:
//
// Streamlined Signed Multiplier
//
// Multiplies absolute value of multiplier and multiplicand.
// Negates product if multiplier and multiplicand differ in sign.
// Otherwise similar to streamlined unsigned multiplier.

module streamlined_signed_mult(product,ready,multiplier,multiplicand,
                               start,clk);

   input [15:0]  multiplier, multiplicand;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;
   reg [15:0]    abs_multiplicand;

   reg [4:0]     bit;
   wire          ready = !bit;

   initial bit = 0;

   always @( posedge clk )

     if ( ready && start ) begin

        bit     = 16;
        product = { 16'd0, multiplier[15] ? -multiplier : multiplier };
        abs_multiplicand = multiplicand[15] ? -multiplicand : multiplicand;

     end else if ( bit ) begin:A

        reg lsb;

        lsb     = product[0];
        product = product >> 1;
        bit     = bit - 1;

        if ( lsb ) product[31:15] = product[30:15] + abs_multiplicand;

        if ( !bit && multiplicand[15] ^ multiplier[15] ) product = -product;

     end

endmodule


////////////////////////////////////////////////////////////////////////////////
/// Booth Recoding for Higher-Radix and Signed Multiplication

// See http://www.geoffknagge.com/fyp/booth.shtml
// :C: 10.3.11  Radix-2 Booth recoding.
// :C: 10.3.12  Radix-4 Booth recoding, called Bit-Pair encoding in text.


 /// Basic Idea
//
// Rather than repeatedly adding either 0 or 1 times multiplicand,
// repeatedly add 0, 1, -1, 2, -2, etc. times the multiplicand.
//
 /// Benefits
//
// Performs signed multiplication without having to first compute the
// absolute value of the multiplier and multiplicand.
//
// Improved performance (when radix higher than 2).

 /// Multipliers Described Here
//
// Ordinary Radix-4 Unsigned Multiplier
//   Presented for pedagogical reasons, Booth multipliers better.
//   Twice as fast as earlier multipliers.
//   Uses more hardware than Booth multipliers below.
//
// Booth Recoding Radix-4 Multiplier
//   Multiplies signed numbers.
//   Twice as fast as earlier multipliers.
//
// Booth Recoding Radix-2 Multiplier
//   Multiplies signed numbers.
//   Uses about the same amount of hardware than earlier signed multiplier.


 /// Ordinary Radix-4 Multiplier Idea
//
// Review of Radix-2 Multiplication
//
//  Multiply 5 times 12.  Radix-2 multiplication (the usual way).
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//     0000  0 x 0101
//    0000   0 x 0101
//   0101    1 x 0101
//  0101     1 x 0101
//  """""""
//  0111100  Product
//
// Radix-4 Multiplication
//   Let "a" denote the multiplicand and b denote the multiplier.
//   Pre-compute 2a and 3a.
//   Examine multiplier two bits at a time (rather than one bit at a time).
//   Based on the value of those bits add 0, a, 2a, or 3a (shifted by
//     the appropriate amount).
//
//   Uses n/2 additions to multiply two n-bit numbers.
//
// Two Radix-4 Multiplication Examples
//
//  Multiply 5 times 12.  Radix-4 multiplication (the faster way).
//
//     Precompute: 2a: 01010,  3a: 01111
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//    00000  00 x 0101
//  01111    11 x 0101
//  """""""
//  0111100  Product
//
//  Multiply 5 times 9.  Radix-4 multiplication (the faster way).
//
//     0101  Multiplicand
//     1001  Multiplier
//     """"
//    00101  01 x 0101
//  01010    10 x 0101
//  """""""
//  0101101  Product

 // Ordinary Radix-2^d Multiplier
//
// This is a generalization of the Radix-4 multiplier.
//
//   Let "a" denote the multiplicand and b denote the multiplier.
//   Pre-compute 2a, 3a, 4a, 5a, ..., (2^d-1)a
//   Examine multiplier d bits at a time.
//   Let the value of those bits be v.
//   Add v shifted by the appropriate amount.
//
//   Uses n/d additions to multiply two n-bit numbers.


// :Example:
//
// A Radix-4 multiplier.  Takes unsigned numbers.

module imult_ord_radix_4(product,ready,multiplicand,multiplier,start,clk);

   input [15:0]  multiplicand, multiplier;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;

   reg [4:0]     bit;
   wire          ready = !bit;

   initial bit = 0;

   wire [17:0]   multiplicand_X_1 = {2'b0,multiplicand};
   wire [17:0]   multiplicand_X_2 = {1'b0,multiplicand,1'b0};
   wire [17:0]   multiplicand_X_3 = multiplicand_X_2 + multiplicand_X_1;

   always @( posedge clk )

     if ( ready && start ) begin

        bit     = 8;
        product = { 16'd0, multiplier };

     end else if ( bit ) begin:A

        reg [17:0]    pp;  // Partial Product
        reg [1:0]     mb;  // Multiplier Bits

        mb = product[1:0];

        case ( mb )
          2'd0: pp = {2'b0, product[31:16] };
          2'd1: pp = {2'b0, product[31:16] } + multiplicand_X_1;
          2'd2: pp = {2'b0, product[31:16] } + multiplicand_X_2;
          2'd3: pp = {2'b0, product[31:16] } + multiplicand_X_3;
        endcase

        product = { pp, product[15:2] };
        bit     = bit - 1;

     end

endmodule

// Synthesized hardware without optimization:
//
// 
/// Booth Recoding // // Goal: Reduce the number of pre-computed multiples of the multiplicand. // Bonus: Handles signed numbers. // // Idea: Use something like carries. // // Why It's Called Recoding // Numbers like 3 are recoded into 4-1, avoiding the need to deal with 3. // // Consider need to compute multiplicand_X_3 for radix-4 multiplier. // Instead: // Subtract multiplicand and add 4 times multiplicand. // The subtraction is done in the "current" step. // The addition is done in the "next" step. // Multiply 5 times 3. Booth radix-4 multiplication. // // 0101 Multiplicand // 0011 Multiplier // """"""" // 1111011 -1 x 0101 Multiplier bits 11 -> -1 + 4 -> -1 now, +1 next step // 00101 (00+1) x 0101 Multiplier bits 00 -> 0 (now) + 1 (from last step) // """"""" // 0001111 Product // The table below indicates: // // MB: Multiplier bits value. // C: Carry bit (lostbit in modules) from previous step. // x: Amount to multiply multiplicand by. // c: Carry bit to next step. // // Radix-4 Booth Table // // MB C | x c // """"""""""""""""" // 00 0 | 0 + 0 0 // 00 1 | 1 + 0 0 // 01 0 | 1 + 0 0 // 01 1 | 2 + 0 0 // 10 0 | -2 + 4 1 // 10 1 | -1 + 4 1 // 11 0 | -1 + 4 1 // 11 1 | 0 + 4 1 // // For example above: // Step 1: MB -> 11, C -> 0, lookup reveals x -> -1, c -> 1 // Step 2: MB -> 00, C -> 1, lookup reveals x -> 1, c -> 0 // // Radix-2 Booth Table // // MB C | x c // """"""""""""""""" // 0 0 | 0 + 0 0 // 0 1 | 1 + 0 0 // 1 0 | -1 + 2 1 // 1 1 | 0 + 2 1 // Note: // The carry out is the same as the MSB of the multiplier bits examined. // That's intentional, so that no special storage or computation is need // for the carry. // // The carry out of the last step is not used. // // Construction of higher-radix tables is straightforward. // :Example: // // A Booth radix-4 multiplier. This takes signed numbers. module imult_Booth_radix_4(prod,ready,mcand,mplier,start,clk); input [15:0] mcand, mplier; input start, clk; output [31:0] prod; output ready; reg [31:0] prod; reg [3:0] bit; wire ready = !bit; reg lostbit; initial bit = 0; // Pre-compute products of multiplicand. // wire [17:0] mcand_X_p1 = { mcand[15], mcand[15], mcand }; // Times +1 wire [17:0] mcand_X_p2 = { mcand[15], mcand, 1'b0 }; // Times +2 wire [17:0] mcand_X_m1 = -mcand_X_p1; // Times -1 wire [17:0] mcand_X_m2 = -mcand_X_p2; // Timex -2 always @( posedge clk ) if ( ready && start ) begin bit = 8; prod = { 16'd0, mplier }; lostbit = 0; end else if ( bit ) begin:A reg [1:0] mb; // Multiplier Bits // Upper 18 bits of product to be computed in this step. reg [17:0] high_prod_next; // Upper 16 bits of product from previous step, // sign extended to 18 bits. reg [17:0] high_prod_prev; high_prod_prev = { prod[31], prod[31], prod[31:16] }; mb = prod[1:0]; // Same as MB in table above. // At this point lostbit is same as upper-case C in table above. case ( {prod[1:0],lostbit} ) 3'b000: high_prod_next = high_prod_prev; 3'b001: high_prod_next = high_prod_prev + mcand_X_p1; 3'b010: high_prod_next = high_prod_prev + mcand_X_p1; 3'b011: high_prod_next = high_prod_prev + mcand_X_p2; 3'b100: high_prod_next = high_prod_prev + mcand_X_m2; 3'b101: high_prod_next = high_prod_prev + mcand_X_m1; 3'b110: high_prod_next = high_prod_prev + mcand_X_m1; 3'b111: high_prod_next = high_prod_prev; endcase lostbit = prod[1]; // Same as lower-case c in table above. // Combine product bits just computed with lower product bits. // Note that some of the lower product bits hold the multiplier. prod = { high_prod_next, prod[15:2] }; bit = bit - 1; end endmodule // :Example: // // Radix-2 Booth multiplier. Unlike signed multiplier, does not // require time or hardware for taking absolute value of operands. module imult_Booth(product,ready,mcand,mplier,start,clk); input [15:0] mcand, mplier; input start, clk; output product; output ready; reg [31:0] product; reg [4:0] bit; wire ready = !bit; reg lostbit; initial bit = 0; always @( posedge clk ) if ( ready && start ) begin bit = 16; product = { 16'd0, mplier }; lostbit = 0; end else if ( bit ) begin:A case ( {product[0],lostbit} ) 2'b01: product[31:16] = product[31:16] + mcand; 2'b10: product[31:16] = product[31:16] - mcand; endcase lostbit = product[0]; product = { product[31], product[31:1] }; bit = bit - 1; end endmodule //////////////////////////////////////////////////////////////////////////////// /// Binary Division Algorithm // :PH: 3.4 // Binary Version of Longhand Division Technique // // 11 divided by 3: // // 11 (1011) is dividend. // 3 (0011) is divider. // // """"""""| // 1011 | // -0011 | // """"""""| 0 Difference is negative: copy dividend and put 0 in quotient. // 1011 | // -0011 | // """"""""| 00 Difference is negative: copy dividend and put 0 in quotient. // 1011 | // -0011 | // """"""""| 001 Difference is positive: use difference and put 1 in quotient. // 0101 | // -0011 | // """"""""| 0011 Difference is positive: use difference and put 1 in quotient. // 10 | // // Quotient, 3 (0011); remainder 2 (10). //////////////////////////////////////////////////////////////////////////////// /// Division Hardware (Simple and Streamlined) // :PH: 3.4 /// Division Hardware // // Simple Divider // See :PH: Figure 4.36 // A straightforward translation of binary division algorithm into hardware. // Cost of n-bit divider: // Proportional to n. // Uses 5n bits of register storage. // Speed of n-bit divider: // Quotient in n clock cycles. // // Streamlined Divider // See :PH: Figure 4.41 // Less storage needed. // Cost of n-bit divider: // Proportional to n. // Uses 2n bits of register storage. // Speed of n-bit divider: // Quotient in n clock cycles. // :Example: // // Simple divider. // Based on :PH: Figure 4.36 module simple_divider(quotient,remainder,ready,dividend,divider,start,clk); input [15:0] dividend,divider; input start, clk; output [15:0] quotient,remainder; output ready; // """"""""| // 1011 | <---- dividend_copy // -0011 | <---- divider_copy // """"""""| 0 Difference is negative: copy dividend and put 0 in quotient. // 1011 | <---- dividend_copy // -0011 | <---- divider_copy // """"""""| 00 Difference is negative: copy dividend and put 0 in quotient. // 1011 | <---- dividend_copy // -0011 | <---- divider_copy // """"""""| 001 Difference is positive: use difference and put 1 in quotient. // quotient (numbers above) reg [15:0] quotient; reg [31:0] dividend_copy, divider_copy, diff; wire [15:0] remainder = dividend_copy[15:0]; reg [4:0] bit; wire ready = !bit; initial bit = 0; always @( posedge clk ) if ( ready && start ) begin bit = 16; quotient = 0; dividend_copy = {16'd0,dividend}; divider_copy = {1'b0,divider,15'd0}; end else begin diff = dividend_copy - divider_copy; quotient = { quotient[14:0], ~diff[31] }; divider_copy = { 1'b0, divider_copy[31:1] }; if ( !diff[31] ) dividend_copy = diff; bit = bit - 1; end endmodule // :Example: // // Streamlined divider. // Based on :PH: Figure 4.41 // // Uses less register storage than simple divider. module streamlined_divider(quotient,remainder,ready,dividend,divider,start,clk); input [15:0] dividend,divider; input start, clk; output quotient,remainder; output ready; reg [31:0] qr; reg [16:0] diff; // // 0000 1011 // """"""""| // 1011 | 0001 0110 <- qr reg // -0011 | -0011 <- divider (never changes) // """"""""| // 1011 | 0010 110o <- qr reg // -0011 | -0011 // """"""""| // 1011 | 0101 10oo <- qr reg // -0011 | -0011 // """"""""| 0010 1000 <- qr reg before shift // 0101 | 0101 0ooi <- after shift // -0011 | -0011 // """"""""| 0010 ooii // 10 | // // Quotient, 3 (0011); remainder 2 (10). wire [15:0] remainder = qr[31:16]; wire [15:0] quotient = qr[15:0]; reg [4:0] bit; wire ready = !bit; initial bit = 0; always @( posedge clk ) if ( ready && start ) begin bit = 16; qr = {16'd0,dividend}; end else begin diff = qr[31:15] - {1'b0,divider}; if ( diff[16] ) qr = {qr[30:0],1'd0}; else qr = {diff[15:0],qr[14:0],1'd1}; bit = bit - 1; end endmodule //////////////////////////////////////////////////////////////////////////////// /// "Real" Arithmetic Circuits // :HP: Appendix A /// Adders // // Multiple-level CLA covered in class reasonably close to adders used // in processors. // // Other fast adder techniques also used. (See :PH: problems.) /// Multipliers // // The multiplication hardware presented above is much slower than // the hardware used in real processors. // // "Real" n-bit Multiplier Features // // Multiplication done in one or two cycles (assume one cycle). // Uses higher-radix (say 4) Booth recoding or something similar. // Enough adders are provided so that product computed in one cycle. // These adders can add n numbers quickly without the cost of n CLA's. // Cost may be something like 3/2 n ripple adders plus 1 CLA. // Delay through each adder is only a handful of gates. // Adders may be connected as a tree, so that number of adders // from input to output is proportional to log n (rather than n). /// Dividers // // The division hardware presented above is slower than hardware used // in real processors. // // Division trickier than multiplication because result of step i // needed for i+1. This precludes the tree structures used in // fast multipliers. // // FP Dividers in general-purpose processors typically take 10-20 cycles. // // Several division techniques used. Examples: // SRT: Sweeney, Robertson, Tocher // Newton Iteration // Goldschmidt's Algorithm // //////////////////////////////////////////////////////////////////////////////// /// Testbenches /// // cadence translate_off module l07_test_mult(); wire [31:0] or4_prod, prod_2; reg [31:0] shadow_prod; reg signed [15:0] a, b; integer i; parameter num_tests = 1000; reg clk; initial clk = 0; always #1 clk = ~clk; reg or4_start; wire or4_ready; // imult_ord_radix_4 c1(or4_prod,or4_ready,a,b,or4_start,clk); // imult_Booth_radix_4 c1(or4_prod,or4_ready,a,b,or4_start,clk); imult_Booth c1(or4_prod,or4_ready,a,b,or4_start,clk); initial begin # 0.5; while ( !or4_ready ) #1; for (i=0; i<num_tests; i=i+1) begin a = $random; b = $random; or4_start = 1; while ( or4_ready ) #1; or4_start = 0; while ( !or4_ready ) #1; shadow_prod = a * b; #1; if ( or4_prod != shadow_prod ) begin $display("Wrong prod: %h * %h = %h != %h (correct)", a, b, or4_prod, shadow_prod); $stop; end end $display("Tried %d multiply tests",num_tests); $stop; end endmodule module l07_test_div(); wire [15:0] quot, rem; reg [15:0] shadow_quot, shadow_rem; reg [15:0] a, b; integer i; parameter num_tests = 1000; reg clk; initial clk = 0; always #1 clk = ~clk; reg start; wire ready; wire [15:0] infinity; assign infinity = 16'hffff; // simple_divider div(quot,rem,ready,a,b,start,clk); streamlined_divider div(quot,rem,ready,a,b,start,clk); initial begin # 0.5; while ( !ready ) #1; for (i=0; i<num_tests; i=i+1) begin:A integer shadow_quot, shadow_rem; a = $random; b = i & 1 ? $random : $random & 3; start = 1; while ( ready ) #1; start = 0; while ( !ready ) #1; shadow_quot = b ? a / b : infinity; shadow_rem = b ? a % b : a; #1; if ( quot != shadow_quot || rem != shadow_rem ) begin $display("Wrong quot: %h / %h = %h r %h != %h r %h (correct)", a, b, quot, rem, shadow_quot, shadow_rem); $stop; end end $display("Tried %d divide tests",num_tests); $stop; end endmodule // cadence translate_on