/// LSU EE 3755 -- Fall 2012 -- Computer Organization
//
/// Verilog Notes 7 -- Integer Multiply and Divide

// Time-stamp: <18 October 2012, 16:57:57 CDT, koppel@sky.ece.lsu.edu>

/// Contents
//
// Binary Multiplication Algorithm
// Simple Multiplication Hardware
// Streamlined Multiplication Hardware
// Booth Recoding for Higher-Radix and Signed Multiplication
// Binary Division Algorithm
// Division Hardware (Simple and Streamlined)
// "Real" Arithmetic Circuits


/// References
//
// :PH:  Patterson & Hennessy, "Computer Organization & Design", 4th Edition
// :C:   Michael D. Ciletti, "Adv. Digital Design with the Verilog HDL" 2nd Ed
// :HP:  Hennessy & Patterson, "Computer Architecture: A Quantitative Approach"


////////////////////////////////////////////////////////////////////////////////
/// Binary Multiplication Algorithm

// :PH: 3.3

 /// Long Hand Procedure Review
//
//  Multiply 5 times 12 in binary:
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//     0000
//    0000
//   0101
//  0101
//  """""""
//  0111100  Product


////////////////////////////////////////////////////////////////////////////////
/// Simple Multiplication Hardware

// :PH: 3.3

 /// Simple Multipliers
//
// Designed to show basic techniques, but are inefficient.
// Following sections show improved multipliers.

 /// Multipliers in this section:
//
// Simple Unsigned Combinational Multiplier
//   See :PH: Figure 3.4
//   Multiplies using combinational logic only.
//   Cost of n-bit multiplier: proportional to n^2.
//   Speed of n-bit multiplier: time needed for n additions:
//      Proportional to n.
//
// Simple Unsigned Multiplier
//   Unlike combinational multiplier, uses a clock.
//   Cost of n-bit multiplier:
//      Proportional to n.
//      Includes 5n bits of register storage.
//   Speed of n-bit multiplier:
//      Product in n clocks.
//      Clock period includes 2n-bit add.
//   Uses more hardware than streamlined multipliers in next section.


// :Example:
//
// Simple Unsigned Combinational Multiplier
// Based on :PH: Figure 3.4
//
// This is a straightforward coding of the longhand multiplication
// technique.

module simple_combinational_mult(product,multiplier,multiplicand); //P
   input [15:0]  multiplier, multiplicand;
   output        product;

   reg [31:0]    product;

   integer       i;

   always @( multiplier or multiplicand )
     begin

        product = 0;

        for (i=0; i<16; i=i+1)
          if ( multiplier[i] == 1'b1 )
            product = product + ( multiplicand << i );

     end

endmodule


// :Example:
//
// Simple Unsigned Multiplier
//
// Sequential multiplier. Handshaking signals are added to command
// multiplier to start and to indicate when finished.  Registers are
// provided for shifted versions of the multiplier and multiplicand.

module simple_mult(product,ready,multiplier,multiplicand,start,clk); //P
   input [15:0]  multiplier, multiplicand;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;

   reg [15:0]    multiplier_copy;
   reg [31:0]    multiplicand_copy;

   reg [4:0]     bit;
   wire          ready = !bit;

   initial bit = 0;

   always @( posedge clk )

     if ( ready && start ) begin

        bit               = 16;
        product           = 0;
        multiplicand_copy = { 16'd0, multiplicand };
        multiplier_copy   = multiplier;

     end else if ( bit ) begin

        if ( multiplier_copy[0] == 1'b1 ) product = product + multiplicand_copy;

        multiplier_copy = multiplier_copy >> 1;
        multiplicand_copy = multiplicand_copy << 1;
        bit = bit - 1;

     end

endmodule


////////////////////////////////////////////////////////////////////////////////
/// Streamlined Multiplication Hardware

// :PH: 3.3

 /// Streamlined Multipliers
//
// Uses less hardware than simple multiplier.
// Still much slower than "real" multipliers.

 /// Multipliers in this section:
//
// Streamlined Unsigned Multiplier
//   See :PH: Figure 3.6
//
//   Fewer register bits than simple multiplier.
//
//   Cost of n-bit multiplier:
//      Proportional to n.
//      Includes 2n bits of register storage.
//
//   Speed of n-bit multiplier: (Same as simple.)
//      Product in n clocks.
//      Clock period includes a 2n-bit add.
//
// Streamlined Signed Multiplier
//   Like previous multiplier, but handles signed numbers.
//   A better signed multiplier covered later (using Booth recoding).
//   Cost of n-bit multiplier:
//      Proportional to n.
//      Includes 3n bits of register storage.
//   Speed of n-bit multiplier:
//      Product in n clocks.
//      Clock period includes 2n-bit add.


// :Example:
//
// Streamlined Unsigned Multiplier
// Based on :PH: Figure 3.6
//
// Improvements:
//
//   The product register also holds the multiplier.
//   Only the product register is shifted.
//   A register is not needed to hold the multiplicand.
//   Wow, you save 2n bits! (To multiply n-bit numbers.)

module streamlined_mult(product,ready,multiplier,multiplicand,start,clk);
   input [15:0]  multiplier, multiplicand;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;

   reg [4:0]     bit;
   wire          ready = !bit;

   initial bit = 0;

   always @( posedge clk )

     if ( ready && start ) begin

        bit     = 16;
        product = { 16'd0, multiplier };

     end else if ( bit ) begin:A

        reg lsb;

        lsb     = product[0];
        product = product >> 1;
        bit     = bit - 1;

        if ( lsb ) product[31:15] = product[30:15] + multiplicand;

     end

endmodule


// :Example:
//
// Streamlined Signed Multiplier
//
// Multiplies absolute value of multiplier and multiplicand.
// Negates product if multiplier and multiplicand differ in sign.
// Otherwise similar to streamlined unsigned multiplier.

module streamlined_signed_mult(product,ready,multiplier,multiplicand,
                               start,clk);

   input [15:0]  multiplier, multiplicand;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;
   reg [15:0]    abs_multiplicand;

   reg [4:0]     bit;
   wire          ready = !bit;

   initial bit = 0;

   always @( posedge clk )

     if ( ready && start ) begin

        bit     = 16;
        product = { 16'd0, multiplier[15] ? -multiplier : multiplier };
        abs_multiplicand = multiplicand[15] ? -multiplicand : multiplicand;

     end else if ( bit ) begin:A

        reg lsb;

        lsb     = product[0];
        product = product >> 1;
        bit     = bit - 1;

        if ( lsb ) product[31:15] = product[30:15] + abs_multiplicand;

        if ( !bit && multiplicand[15] ^ multiplier[15] ) product = -product;

     end

endmodule


////////////////////////////////////////////////////////////////////////////////
/// Booth Recoding for Higher-Radix and Signed Multiplication

// See http://www.geoffknagge.com/fyp/booth.shtml
// :C: 10.3.11  Radix-2 Booth recoding.
// :C: 10.3.12  Radix-4 Booth recoding, called Bit-Pair encoding in text.


 /// Basic Idea
//
// Rather than repeatedly adding either 0 or 1 times multiplicand,
// repeatedly add 0, 1, -1, 2, -2, etc. times the multiplicand.
//
 /// Benefits
//
// Performs signed multiplication without having to first compute the
// absolute value of the multiplier and multiplicand.
//
// Improved performance (when radix higher than 2).

 /// Multipliers Described Here
//
// Ordinary Radix-4 Unsigned Multiplier
//   Presented for pedagogical reasons, Booth multipliers better.
//   Twice as fast as earlier multipliers.
//   Uses more hardware than Booth multipliers below.
//
// Booth Recoding Radix-4 Multiplier
//   Multiplies signed numbers.
//   Twice as fast as earlier multipliers.
//
// Booth Recoding Radix-2 Multiplier
//   Multiplies signed numbers.
//   Uses about the same amount of hardware than earlier signed multiplier.


 /// Ordinary Radix-4 Multiplier Idea
//
// Review of Radix-2 Multiplication
//
//  Multiply 5 times 12.  Radix-2 multiplication (the usual way).
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//     0000  0 x 0101
//    0000   0 x 0101
//   0101    1 x 0101
//  0101     1 x 0101
//  """""""
//  0111100  Product
//
// Radix-4 Multiplication
//   Let "a" denote the multiplicand and b denote the multiplier.
//   Pre-compute 2a and 3a.
//   Examine multiplier two bits at a time (rather than one bit at a time).
//   Based on the value of those bits add 0, a, 2a, or 3a (shifted by
//     the appropriate amount).
//
//   Uses n/2 additions to multiply two n-bit numbers.
//
// Two Radix-4 Multiplication Examples
//
//  Multiply 5 times 12.  Radix-4 multiplication (the faster way).
//
//     Precompute: 2a: 01010,  3a: 01111
//
//     0101  Multiplicand
//     1100  Multiplier
//     """"
//    00000  00 x 0101
//  01111    11 x 0101
//  """""""
//  0111100  Product
//
//  Multiply 5 times 9.  Radix-4 multiplication (the faster way).
//
//     0101  Multiplicand
//     1001  Multiplier
//     """"
//    00101  01 x 0101
//  01010    10 x 0101
//  """""""
//  0101101  Product

 // Ordinary Radix-2^d Multiplier
//
// This is a generalization of the Radix-4 multiplier.
//
//   Let "a" denote the multiplicand and b denote the multiplier.
//   Pre-compute 2a, 3a, 4a, 5a, ..., (2^d-1)a
//   Examine multiplier d bits at a time.
//   Let the value of those bits be v.
//   Add v shifted by the appropriate amount.
//
//   Uses n/d additions to multiply two n-bit numbers.


// :Example:
//
// A Radix-4 multiplier.  Takes unsigned numbers.

module imult_ord_radix_4(product,ready,multiplicand,multiplier,start,clk);

   input [15:0]  multiplicand, multiplier;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;

   reg [4:0]     bit;
   wire          ready = !bit;

   initial bit = 0;

   wire [17:0]   multiplicand_X_1 = {2'b0,multiplicand};
   wire [17:0]   multiplicand_X_2 = {1'b0,multiplicand,1'b0};
   wire [17:0]   multiplicand_X_3 = multiplicand_X_2 + multiplicand_X_1;

   always @( posedge clk )

     if ( ready && start ) begin

        bit     = 8;
        product = { 16'd0, multiplier };

     end else if ( bit ) begin:A

        reg [17:0]    pp;  // Partial Product
        reg [1:0]     mb;  // Multiplier Bits

        mb = product[1:0];

        case ( mb )
          2'd0: pp = {2'b0, product[31:16] };
          2'd1: pp = {2'b0, product[31:16] } + multiplicand_X_1;
          2'd2: pp = {2'b0, product[31:16] } + multiplicand_X_2;
          2'd3: pp = {2'b0, product[31:16] } + multiplicand_X_3;
        endcase

        product = { pp, product[15:2] };
        bit     = bit - 1;

     end

endmodule


 /// Booth Recoding
//
// Goal: Reduce the number of pre-computed multiples of the multiplicand.
// Bonus: Handles signed numbers.
//
// Idea: Use something like carries.
//
// Why It's Called Recoding
//   Numbers like 3 are recoded into 4-1, avoiding the need to deal with 3.
//
// Consider need to compute multiplicand_X_3 for radix-4 multiplier.
// Instead:
//   Subtract multiplicand and add 4 times multiplicand.
//   The subtraction is done in the "current" step.
//   The addition is done in the "next" step.

//  Multiply 5 times 3.  Booth radix-4 multiplication.
//
//     0101  Multiplicand
//     0011  Multiplier
//  """""""
//  1111011  -1      x 0101  Multiplier bits 11 -> -1 + 4 -> -1 now, +1 next step
//  00101     (00+1) x 0101  Multiplier bits 00 -> 0 (now) + 1 (from last step)
//  """""""
//  0001111  Product


// The table below indicates:
//
// MB: Multiplier bits value.
//  C: Carry bit (lostbit in modules) from previous step.
//  x: Amount to multiply multiplicand by.
//  c: Carry bit to next step.
//
// Radix-4 Booth Table
//
//  MB C  |  x      c
//  """""""""""""""""
//  00 0  |  0 + 0  0
//  00 1  |  1 + 0  0
//  01 0  |  1 + 0  0
//  01 1  |  2 + 0  0
//  10 0  | -2 + 4  1
//  10 1  | -1 + 4  1
//  11 0  | -1 + 4  1
//  11 1  |  0 + 4  1
//
// For example above:
// Step 1: MB -> 11,  C -> 0,  lookup reveals  x -> -1,  c -> 1
// Step 2: MB -> 00,  C -> 1,  lookup reveals  x -> 1,  c -> 0
//
// Radix-2 Booth Table
//
//  MB C  |  x      c
//  """""""""""""""""
//  0  0  |  0 + 0  0
//  0  1  |  1 + 0  0
//  1  0  | -1 + 2  1
//  1  1  |  0 + 2  1


// Note:
//  The carry out is the same as the MSB of the multiplier bits examined.
//  That's intentional, so that no special storage or computation is need
//    for the carry.
//
//  The carry out of the last step is not used.
//
//  Construction of higher-radix tables is straightforward.


// :Example:
//
// A Booth radix-4 multiplier.  This takes signed numbers.

module imult_Booth_radix_4(prod,ready,mcand,mplier,start,clk);

   input [15:0]  mcand, mplier;
   input         start, clk;
   output [31:0] prod;
   output        ready;

   reg [31:0]   prod;

   reg [3:0]     bit;
   wire          ready = !bit;
   reg           lostbit;

   initial bit = 0;

   // Pre-compute products of multiplicand.
   //
   wire [17:0]   mcand_X_p1 = { mcand[15], mcand[15], mcand };   // Times +1
   wire [17:0]   mcand_X_p2 = { mcand[15], mcand, 1'b0 };        // Times +2
   wire [17:0]   mcand_X_m1 = -mcand_X_p1;                       // Times -1
   wire [17:0]   mcand_X_m2 = -mcand_X_p2;                       // Timex -2

   always @( posedge clk )

     if ( ready && start ) begin

        bit     = 8;
        prod = { 16'd0, mplier };
        lostbit = 0;

     end else if ( bit ) begin:A

        reg [1:0] mb;  // Multiplier Bits

        // Upper 18 bits of product to be computed in this step.
        reg [17:0] high_prod_next;

        // Upper 16 bits of product from previous step,
        // sign extended to 18 bits.
        reg [17:0] high_prod_prev;
        high_prod_prev = { prod[31], prod[31], prod[31:16] };

        mb = prod[1:0];  // Same as MB in table above.
        // At this point lostbit is same as upper-case C in table above.

        case ( {mb,lostbit} )
          3'b000: high_prod_next = high_prod_prev;
          3'b001: high_prod_next = high_prod_prev + mcand_X_p1;
          3'b010: high_prod_next = high_prod_prev + mcand_X_p1;
          3'b011: high_prod_next = high_prod_prev + mcand_X_p2;
          3'b100: high_prod_next = high_prod_prev + mcand_X_m2;
          3'b101: high_prod_next = high_prod_prev + mcand_X_m1;
          3'b110: high_prod_next = high_prod_prev + mcand_X_m1;
          3'b111: high_prod_next = high_prod_prev;
        endcase

        lostbit = prod[1];  // Same as lower-case c in table above.

        // Combine product bits just computed with lower product bits.
        // Note that some of the lower product bits hold the multiplier.
        prod = { high_prod_next, prod[15:2] };

        bit = bit - 1;

     end

endmodule


// :Example:
//
// Radix-2 Booth multiplier.  Unlike signed multiplier, does not
// require time or hardware for taking absolute value of operands.

module imult_Booth(product,ready,mcand,mplier,start,clk);
   input [15:0]  mcand, mplier;
   input         start, clk;
   output        product;
   output        ready;

   reg [31:0]    product;

   reg [4:0]     bit;
   wire          ready = !bit;
   reg           lostbit;

   initial bit = 0;

   always @( posedge clk )

     if ( ready && start ) begin

        bit     = 16;
        product = { 16'd0, mplier };
        lostbit = 0;

     end else if ( bit ) begin:A

        case ( {product[0],lostbit} )
          2'b01: product[31:16] = product[31:16] + mcand;
          2'b10: product[31:16] = product[31:16] - mcand;
        endcase

        lostbit = product[0];
        product = { product[31], product[31:1] };
        bit     = bit - 1;

     end

endmodule


////////////////////////////////////////////////////////////////////////////////
/// Binary Division Algorithm

// :PH: 3.4

// Binary Version of Longhand Division Technique
//
// 11 divided by 3:
//
// 11 (1011) is dividend.
//  3 (0011) is divider.
//
//  """"""""|
//     1011 |
// -0011    |
//  """"""""|    0  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |
//  -0011   |
//  """"""""|   00  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |
//   -0011  |
//  """"""""|  001  Difference is positive: use difference and put 1 in quotient.
//     0101 |
//    -0011 |
//  """"""""| 0011  Difference is positive: use difference and put 1 in quotient.
//       10 |
//
// Quotient, 3 (0011); remainder 2 (10).


////////////////////////////////////////////////////////////////////////////////
/// Division Hardware (Simple and Streamlined)

// :PH: 3.4

 /// Division Hardware
//
// Simple Divider
//   See :PH: Figure 4.36
//   A straightforward translation of binary division algorithm into hardware.
//   Cost of n-bit divider:
//      Proportional to n.
//      Uses 5n bits of register storage.
//   Speed of n-bit divider:
//      Quotient in n clock cycles.
//
// Streamlined Divider
//   See :PH: Figure 4.41
//   Less storage needed.
//   Cost of n-bit divider:
//      Proportional to n.
//      Uses 2n bits of register storage.
//   Speed of n-bit divider:
//      Quotient in n clock cycles.


// :Example:
//
// Simple divider.
// Based on :PH: Figure 4.36

module simple_divider(quotient,remainder,ready,dividend,divider,start,clk);

   input [15:0]  dividend,divider;
   input         start, clk;
   output [15:0] quotient,remainder;
   output        ready;

//  """"""""|
//     1011 |  <----  dividend_copy
// -0011    |  <----  divider_copy
//  """"""""|    0  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |  <----  dividend_copy
//  -0011   |  <----  divider_copy
//  """"""""|   00  Difference is negative: copy dividend and put 0 in quotient.
//     1011 |  <----  dividend_copy
//   -0011  |  <----  divider_copy
//  """"""""|  001  Difference is positive: use difference and put 1 in quotient.
//            quotient (numbers above)


   reg [15:0]    quotient;
   reg [31:0]    dividend_copy, divider_copy, diff;
   wire [15:0]   remainder = dividend_copy[15:0];

   reg [4:0]     bit;
   wire          ready = !bit;

   initial bit = 0;

   always @( posedge clk )

     if ( ready && start ) begin

        bit = 16;
        quotient = 0;
        dividend_copy = {16'd0,dividend};
        divider_copy = {1'b0,divider,15'd0};

     end else begin

        diff = dividend_copy - divider_copy;
        quotient = { quotient[14:0], ~diff[31] };
        divider_copy = { 1'b0, divider_copy[31:1] };
        if ( !diff[31] ) dividend_copy = diff;
        bit = bit - 1;

     end

endmodule


// :Example:
//
// Streamlined divider.
// Based on :PH: Figure 4.41
//
// Uses less register storage than simple divider.

module streamlined_divider(quotient,remainder,ready,dividend,divider,start,clk);

   input [15:0]  dividend,divider;
   input         start, clk;
   output        quotient,remainder;
   output        ready;

   reg [31:0]    qr;
   reg [16:0]    diff;

//
//              0000 1011
//  """"""""|
//     1011 |   0001 0110     <- qr reg
// -0011    |  -0011          <- divider (never changes)
//  """"""""|
//     1011 |   0010 110o     <- qr reg
//  -0011   |  -0011
//  """"""""|
//     1011 |   0101 10oo     <- qr reg
//   -0011  |  -0011
//  """"""""|   0010 1000     <- qr reg before shift
//     0101 |   0101 0ooi     <- after shift
//    -0011 |  -0011
//  """"""""|   0010 ooii
//       10 |
//
// Quotient, 3 (0011); remainder 2 (10).


   wire [15:0]   remainder = qr[31:16];
   wire [15:0]   quotient = qr[15:0];

   reg [4:0]     bit;
   wire          ready = !bit;

   initial bit = 0;

   always @( posedge clk )

     if ( ready && start ) begin

        bit = 16;
        qr = {16'd0,dividend};

     end else begin

        diff = qr[31:15] - {1'b0,divider};

        if ( diff[16] )
          qr = {qr[30:0],1'd0};
        else
          qr = {diff[15:0],qr[14:0],1'd1};

        bit = bit - 1;

     end

endmodule


////////////////////////////////////////////////////////////////////////////////
/// "Real" Arithmetic Circuits

// :HP: Appendix A

 /// Adders
//
// Multiple-level CLA covered in class reasonably close to adders used
// in processors.
//
// Other fast adder techniques also used. (See :PH: problems.)

 /// Multipliers
//
// The multiplication hardware presented above is much slower than
// the hardware used in real processors.
//
// "Real" n-bit Multiplier Features
//
//   Multiplication done in one or two cycles (assume one cycle).
//   Uses higher-radix (say 4) Booth recoding or something similar.
//   Enough adders are provided so that product computed in one cycle.
//   These adders can add n numbers quickly without the cost of n CLA's.
//     Cost may be something like 3/2 n ripple adders plus 1 CLA.
//     Delay through each adder is only a handful of gates.
//   Adders may be connected as a tree, so that number of adders
//     from input to output is proportional to log n (rather than n).

 /// Dividers
//
// The division hardware presented above is slower than hardware used
// in real processors.
//
// Division trickier than multiplication because result of step i
// needed for i+1.  This precludes the tree structures used in
// fast multipliers.
//
// FP Dividers in general-purpose processors typically take 10-20 cycles.
//
// Several division techniques used.  Examples:
//   SRT: Sweeney, Robertson, Tocher
//   Newton Iteration
//   Goldschmidt's Algorithm
//


////////////////////////////////////////////////////////////////////////////////
/// Testbenches
///

// cadence translate_off

module l07_test_mult();
   wire [31:0] or4_prod, prod_2;
   reg [31:0] shadow_prod;
   reg signed [15:0]  a, b;
   integer    i;
   parameter  num_tests = 1000;

   reg        clk;
   initial clk = 0;
   always #1 clk = ~clk;

   reg        or4_start;
   wire       or4_ready;
   //  imult_ord_radix_4 c1(or4_prod,or4_ready,a,b,or4_start,clk);
   //  imult_Booth_radix_4 c1(or4_prod,or4_ready,a,b,or4_start,clk);
   imult_Booth c1(or4_prod,or4_ready,a,b,or4_start,clk);

   initial begin

      # 0.5;

      while ( !or4_ready ) #1;

      for (i=0; i<num_tests; i=i+1) begin

         a = $random;
         b = $random;
         or4_start = 1;

         while ( or4_ready ) #1;

         or4_start = 0;

         while ( !or4_ready ) #1;

         shadow_prod = a * b;
         #1;
         if ( or4_prod != shadow_prod ) begin
            $display("Wrong prod: %h * %h = %h != %h (correct)",
                     a, b, or4_prod, shadow_prod);
            $stop;
         end
      end

      $display("Tried %d multiply tests",num_tests);
      $stop;
   end

endmodule


module l07_test_div();
   wire [15:0] quot, rem;
   reg [15:0]  shadow_quot, shadow_rem;
   reg [15:0]  a, b;
   integer    i;
   parameter  num_tests = 1000;

   reg        clk;
   initial clk = 0;
   always #1 clk = ~clk;

   reg        start;
   wire       ready;

   wire [15:0] infinity;
   assign     infinity = 16'hffff;

   // simple_divider div(quot,rem,ready,a,b,start,clk);
   streamlined_divider div(quot,rem,ready,a,b,start,clk);

   initial begin

      # 0.5;

      while ( !ready ) #1;

      for (i=0; i<num_tests; i=i+1) begin:A

         integer shadow_quot, shadow_rem;

         a = $random;
         b = i & 1 ? $random : $random & 3;
         start = 1;

         while ( ready ) #1;

         start = 0;

         while ( !ready ) #1;

         shadow_quot = b ? a / b : infinity;
         shadow_rem  = b ? a % b : a;

         #1;
         if ( quot != shadow_quot || rem != shadow_rem ) begin
            $display("Wrong quot: %h / %h  =  %h r %h  !=  %h r %h (correct)",
                     a, b, quot, rem, shadow_quot, shadow_rem);
            $stop;
         end
      end

      $display("Tried %d divide tests",num_tests);
      $stop;
   end

endmodule

// cadence translate_on