/// LSU EE 3755 Spring 2009 Computer Organization
//
/// Verilog Notes 10 -- Floating Point
//
/// Contents
//
// Binary Floating-Point Representation and Arithmetic
// IEEE 754 FLP Standard
// FLP Addition Hardware
//
/// References
//
// :P:  Palnitkar, "Verilog HDL"
// :Q:  Qualis, "Verilog HDL Quick Reference Card Revision 1.0"
// :PH: Patterson & Hennessy, "Computer Organization & Design"
////////////////////////////////////////////////////////////////////////////////
/// Binary Floating-Point Representation and Arithmetic
// :PH: 4.8
//
/// Binary Floating-Point (FLP) Representations
//
// The floating-point (FLP) representations in this section (before
// IEEE 754) are NOT computer representations.
//
// Among other things, that means the number of bits needed to store a
// number is not specified.
//
// Computer representations for FLP numbers are covered in the next
// section, IEEE 754.
/// Binary Fixed Point Representation
//
//
// Each digit position has a weight.
//
//
//  FXP Binary Number:   1   0   1   0   1 .  1    0    0    1
//  Digit Position:      4   3   2   1   0   -1   -2   -3   -4
//  Weight:             16   8   4   2   1   1/2  1/4  1/8  1/16
//
//  Value of number: 1*16 + 0*8 + 1*4 + 0*2 + 1*1 + 1/2 + 0/4 + 0/8 + 1/16
//                 = 21.5625
//
// Other Examples (binary = decimal):
//
//  1.1       = 1.5
//  1.01      = 1.25
//  1.11      = 1.75
//  1.001     = 1.125
//  1111.1111 = 15.9375
//
//
// Fixed Point Decimal to Binary Conversion
//
// To convert decimal number x, 0 < x < 1.
//
// Method 1:
//
//  For bit position -1:
//    if x >= 1/2,  bit is 1, x = x - 1/2;
//    if x <  1/2,  bit is 0, x unchanged.
//  For bit position -2:
//    if x >= 1/4,  bit is 1, x = x - 1/4;
//    if x <  1/4,  bit is 0, x unchanged.
//  For bit position -3:
//    if x >= 1/8,  bit is 1, x = x - 1/8;
//    if x <  1/8,  bit is 0, x unchanged.
//  For bit position -4:
//    if x >= 1/16, bit is 1, x = x - 1/16;
//    if x <  1/16, bit is 0, x unchanged.
//  Etc.
//
//  Example:
//    x = .75
//  For bit position -1:
//    x = .75 >= 1/2, bit is 1 and updated x is (0.75) - (0.5) = 0.25;
//    first bit = 1 (MSB of fraction).
//  For bit position -2:
//    x = .25 >= 1/4, bit is 1 and updated x is (0.25) - (0.25) = 0;
//    second bit = 1.
//  For bit position -3:
//    x = 0 < 1/8, bit is 0 and x unchanged; x = 0;
//    third bit = 0.
//  For bit position -4:
//    x = 0 < 1/16, bit is 0 and x unchanged; x = 0;
//    fourth bit = 0.
//  So the result is .1100.
// Method 2:
//
//  Let r be the number of bits past the binary point desired.
//
//  Convert x * 2^r to binary.
//
//  MSB is the first bit past the binary point, etc.
//
//
//  Example:
//    r = 4, x = .75
//    Convert .75 * 2^4 = 12 to binary: 1100
//    x in binary is: .1100
//
//  This is equivalent to repeatedly multiplying by 2 and keeping
//  the integer part each time.
//  The first bit kept is the MSB of the fraction.
//    .75 * 2 = 1.5   keep 1
//    .5  * 2 = 1.0   keep 1
//    0   * 2 = 0     keep 0
//    0   * 2 = 0     keep 0
//  So the result is .1100 for a 4 bit representation.
// Examples to 12 digits:
//
//  1.1 = 1.000110011001...   1.1 * 2^12 = 4505.6, truncated: 4505 = 1000110011001
//  1.2 = 1.001100110011...
//  1.3 = 1.010011001100...
//  1.4 = 1.011001100110...
//  1.5 = 1.1
//
//
//  Note:
//
//  Common numbers such as 0.2 do not have exact binary representations.
/// Binary Scientific Notation
//
// Binary Scientific Representation is Similar to
// Decimal Scientific Notation
//
//  Decimal:  SIGN  SIGNIFICAND(FRACTION)  x  10^{EXPONENT}
//  Binary:   SIGN  SIGNIFICAND(FRACTION)  x  2^{EXPONENT}
//
//  In scientific notation the significand is not necessarily a
//  purely fractional number (it may have digits before the radix point).
//
// Decimal Examples:
//
//  1.23 x 10^{2}  = 123
//  1.23 x 10^{0}  = 1.23
//  1.23 x 10^{-1} = .123
//  Examples above are normalized
//  (only one non-zero digit before the radix point).
//  Examples below are not (more than one non-zero digit, or a
//  zero digit, before the radix point).
//
//  12.3 x 10^{1}  = 123
//  .123 x 10^{1}  = 1.23
//  123  x 10^{-3} = .123
//
// Binary Examples
//
//  1   x 2^{0}  = 1   = 1
//  1   x 2^{1}  = 10  = 2
//  1   x 2^{2}  = 100 = 4
//  1.1 x 2^{2}  = 110 = 6
//  1.1 x 2^{1}  = 11  = 3
//  1.1 x 2^{0}  = 1.1 = 1.5
//  1.1 x 2^{-1} = .11 = .75
//  Examples above are normalized (only one digit, a 1, before the
//  radix point).
//  The examples at the end of this section are not (more than one
//  digit, or a zero digit, before the radix point).
//  So when a binary number is normalized, there is always a 1 before
//  the radix point.
//  IEEE 754 drops that 1 (the hidden 1) and saves only the digits
//  (.xxxxx) after the radix point (saving 1 bit).
//  For the IEEE 754 format the significand (fraction) is 1.xxxxxx.
//
//  Examples that are not normalized:
//  11 x 2^{1}  = 110 = 6
//  11 x 2^{0}  = 11  = 3
//  11 x 2^{-1} = 1.1 = 1.5
//  11 x 2^{-2} = .11 = .75
/// Addition Using Scientific Notation
//
//
// Consider:
//
//  a_scand x 2^{a_exp}
//  b_scand x 2^{b_exp}
//
// Assume a is the larger magnitude number
// (a > b or, for simplicity, a_exp >= b_exp).
//
// To add these:
//
//  Set b'_exp   = a_exp.                           // adjustment
//  Set b'_scand = b_scand / 2^(a_exp - b_exp)      // shift right
//  Set s_scand  = a_scand + b'_scand               // add
//  Normalize result.
//
//
//
//
// Subtraction is similar.
/// Multiplication Using Scientific Notation
// Exponents here are not biased.
//
//
// Consider:
//
//  a_scand x 2^{a_exp}
//  b_scand x 2^{b_exp}
//
// To multiply these:
//
//  Set p_scand = a_scand x b_scand
//  Set p_exp   = a_exp + b_exp
//  Normalize p   // leaving only one digit before the radix point
//
//  Product is p_scand x 2^{p_exp}
//
//
////////////////////////////////////////////////////////////////////////////////
/// IEEE 754 FLP Standard
// :PH: 4.8
//
/// Standard Specifies
//
// Formats of FLP numbers. (There are several sizes.)
//
/// Features
//
// Can Represent:
//  Floating-point numbers.
//  + and - Infinity, and other special values.
//
// Special Properties
//  Positive Zero is 0 (all bits zero).
//
/// Sizes
//
//  Single: 32 bits.
//  Double: 64 bits.
//
// Format Specifies:
//
//  Sign.
//  Exponent.
//  Significand (Fraction)
//
// Slight Complications:
//
//  Exponent is biased.
//  Significand may not include MSB.
//  We assume a normalized fraction, and a normalized fraction means
//  there is always a 1 in the MSB position.
//  So we drop the MSB (hidden 1), and when we convert the FLP number
//  to a decimal number we bring back the hidden 1.
//  See the examples below.
/// IEEE 754 Single Format
//
// Format: SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//  31:    S: Sign bit: 1 negative, 0 positive.
//  30-23: E: Biased Exponent. (Exponent is E-127)
//  22-0:  F: Significand (Fraction)
//
//  E, the Biased Exponent, will be in the range 0000 0000 to 1111 1111
//  (0 to 255),
//  so the actual Exponent value will be from
//  0000 0000 - 127, which is -127, to 1111 1111 - 127, which is 128
//  (-127 to 128).
//  The IEEE 754 single format uses a bias of 127.
//
//  Case                       Value formula.
//  0 < E < 255, S = 0 :        ( 1.0 + F / 2^{23} ) x 2^{E-127}  // this 1.0
//                                                                // is the hidden 1.
//  0 < E < 255, S = 1 :      - ( 1.0 + F / 2^{23} ) x 2^{E-127}
//  E = 0,   S = 0, F = 0 :     0
//  E = 0,   S = 1, F = 0 :   - 0
//  E = 255, S = 0, F = 0 :     Infinity
//  E = 255, S = 1, F = 0 :   - Infinity
/// IEEE 754 Double Format
//
// Format: SEEEEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
//  63:    S: Sign bit: 1 negative, 0 positive.
//  62-52: E: Biased Exponent. (Exponent is E-1023)
//  51-0:  F: Significand (Fraction)
//
//  E, the Biased Exponent, will be in the range (0 to 2047),
//  so the actual Exponent value will be in the range
//  (-1023 to 1024).
//  The IEEE 754 double format uses a bias of 1023.
//
//  Case                        Value formula.
//  0 < E < 2047, S = 0 :         ( 1.0 + F / 2^{52} ) x 2^{E-1023}
//  0 < E < 2047, S = 1 :       - ( 1.0 + F / 2^{52} ) x 2^{E-1023}
//  E = 0,    S = 0, F = 0 :      0
//  E = 0,    S = 1, F = 0 :    - 0
//  E = 2047, S = 0, F = 0 :      Infinity
//  E = 2047, S = 1, F = 0 :    - Infinity
/// IEEE 754 Single Format Examples: IEEE 754 to Value
//
// Single FLP: 32'h3fc00000
//
//  = 32'b00111111110000000000000000000000
//        SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//
//    0 01111111 10000000000000000000000
//    S EEEEEEEE FFFFFFFFFFFFFFFFFFFFFFF
//
//  S = 0,  E = 7f = 127,  F = 400000 (hex) = 4194304
//
// Based on the values of S and E, the following case applies:
//
//  0 < E < 255, S = 0 :   ( 1.0 + F / 2^{23} ) x 2^{E-127}
//
//  Value = ( 1.0 + 4194304 / 2^{23} ) x 2^{127-127}
//        = ( 1.0 + 0.5 )
//        = 1.5
// Single FLP: 32'h456ab000
//
//  = 32'b01000101011010101011000000000000
//        SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//
//    0 10001010 11010101011000000000000
//    S EEEEEEEE FFFFFFFFFFFFFFFFFFFFFFF
//
//  S = 0,  E = 8a = 138,  F = 6ab000 (hex) = 6991872
//
// Based on the values of S and E, the following case applies:
//
//  0 < E < 255, S = 0 :   ( 1.0 + F / 2^{23} ) x 2^{E-127}
//
//  Value = ( 1.0 + 6991872 / 2^{23} ) x 2^{138-127}
//        = ( 1.0 + 0.833496 ) x 2048
//        = 3755
// Single FLP: 32'hc0490fdb
//
//  = 32'b11000000010010010000111111011011
//        SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//
//    1 10000000 10010010000111111011011
//    S EEEEEEEE FFFFFFFFFFFFFFFFFFFFFFF
//
//  S = 1,  E = 80 = 128,  F = 490fdb (hex) = 4788187
//
// Based on the values of S and E, the following case applies:
//
//  0 < E < 255, S = 1 :   - ( 1.0 + F / 2^{23} ) x 2^{E-127}
//
//  Value = - ( 1.0 + 4788187 / 2^{23} ) x 2^{128-127}
//        = - ( 1.0 + 0.570796 ) x 2
//        = -3.14159
// Single FLP: 32'h0
//
//  = 32'b00000000000000000000000000000000
//        SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//
//    0 00000000 00000000000000000000000
//    S EEEEEEEE FFFFFFFFFFFFFFFFFFFFFFF
//
//  S = 0,  E = 0,  F = 0
//
// Based on the values of S and E, the following case applies:
//
//  E = 0, S = 0, F = 0 :   0
//
//  Value = 0
// Single FLP: 32'h7f800000
//
//  = 32'b01111111100000000000000000000000
//        SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//
//    0 11111111 00000000000000000000000
//    S EEEEEEEE FFFFFFFFFFFFFFFFFFFFFFF
//
//  S = 0,  E = 255,  F = 0
//
// Based on the values of S and E, the following case applies:
//
//  E = 255, S = 0, F = 0 :   Infinity
//
//  Value = Infinity.
/// IEEE 754 Single Format Examples: Value to IEEE 754
//
// Value (decimal): 12.75
//  Convert to binary: 1100.11
//  Convert to normalized binary scientific notation: 1.10011 x 2^3
//
//
//  S = 0 (it's positive)
//  E = 127 + 3 = 130 = 1000 0010
//  F = 10011 000000000000000000   (Notice we dropped the 1 (hidden 1)
//                                  before the binary point.)
//
//  Single: 0 1000 0010 10011 000000000000000000
//        = 0100 0001 0100 1100 0000 0000 0000 0000
//        = 32'h414c0000
////////////////////////////////////////////////////////////////////////////////
/// FLP Addition Hardware
// FLP arithmetic hardware is simple in principle,
// but the details can be very complicated.
// Only hardware for an FLP adder is shown.
//
/// FLP Addition Hardware Examples
//
// Two Adders
//
//  Combinational.
//  Sequential.
//
//
// :Example:
// Add IEEE 754 Single
// Combinational floating point adder. Computes the sum of two 32 bit
// floating point numbers
// that consist of one sign bit, an 8 bit biased exponent, and a 23 bit
// unsigned normalized fraction.
// For simplicity the two numbers are considered to be positive.
// The format for the numbers is:
// Format: SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//   31:    S: Sign bit
//   30:23  E: Exponent
//   22:0   F: Fraction
module fp_add(sum, exp_overflow, a_original, b_original);
   // Combinational adder for two IEEE 754 single-format operands
   // (bit 31 = S, bits 30:23 = biased exponent E, bits 22:0 = fraction F).
   //
   // Ports:
   //   sum          - result in the same S/E/F layout.  The sign bit is
   //                  copied from a_original; the operands are assumed
   //                  to be positive.
   //   exp_overflow - 1 when the larger exponent is 255 and the fraction
   //                  add produced a carry, 0 otherwise.
   //   a_original,
   //   b_original   - the two operands.
   //
   // Simplifications in this model:
   //   - A hidden 1 is always inserted, so E = 0 and E = 255 operands
   //     are treated like ordinary normalized numbers.
   //   - Bits shifted out of the smaller operand during alignment are
   //     discarded (no rounding).
   input [31:0]  a_original, b_original;
   output [31:0] sum;
   output        exp_overflow;

   reg           exp_overflow;
   reg [7:0]     sumexp;
   reg [23:0]    sumfraction;

   // Sign bit: always positive in this case.
   assign sum[31]    = a_original[31];
   assign sum[30:23] = sumexp;
   // Only bits 22:0 are kept: this drops the hidden 1 (sumfraction[23],
   // or c below), which is always 1.
   assign sum[22:0]  = sumfraction[22:0];

   reg [31:0]    a, b;
   reg [23:0]    afraction, bfraction;  // 24 bits to include hidden 1.
   reg [7:0]     aexp, bexp;
   reg           c;                     // Carry out of the fraction add.
   reg [7:0]     diff;                  // Exponent difference, aexp - bexp.

   always @( a_original or b_original )
     begin

        /// Compute Floating-Point Sum in Four Steps

        /// Step 1: Adjust and Alignment
        //
        // Put the number with the larger exponent in a.
        if ( a_original[30:23] < b_original[30:23] ) begin
           a = b_original;
           b = a_original;
        end else begin
           a = a_original;
           b = b_original;
        end

        /// Break operands into exponent and fraction.
        aexp = a[30:23];
        bexp = b[30:23];
        afraction = {1'b1, a[22:0]};  // Inserting hidden 1
        bfraction = {1'b1, b[22:0]};  // Inserting hidden 1

        /// Alignment so that aexp == bexp.
        //
        diff = aexp - bexp;
        bfraction = bfraction >> diff;

        /// Step 2: Add fractions.
        //  The left-hand side is 25 bits wide, so c receives the carry.
        {c, sumfraction} = afraction + bfraction;

        /// Step 3: Post normalize.
        if ( c ) begin
           // Shift sumfraction right, shifting c in, and increment
           // the exponent.  After the shift c is the hidden 1
           // (sumfraction[23]), which is dropped when sum is formed.
           sumexp = aexp + 1;
           sumfraction = {c, sumfraction[23:1]};
        end else begin
           // sumfraction does not change.  sumfraction[23] is the
           // hidden 1, which is dropped when sum is formed.
           sumexp = aexp;
        end

        /// Step 4: Check for exponent overflow.
        //  aexp is checked instead of sumexp: sumexp is only 8 bits, so
        //  after the increment it has already wrapped around to 0 and
        //  checking it would be too late.
        //  The flag is assigned on every evaluation (0 as well as 1) so
        //  that it never holds a stale or unknown value.
        //  NOTE(review): this model treats E = 255 as an ordinary
        //  exponent; a result exponent of exactly 255 is not flagged.
        exp_overflow = ( aexp == 8'd255 && c == 1'b1 );

     end

endmodule
//
// :Example:
// Add IEEE 754 Single
// Floating point sequential adder. Computes the sum of two 32 bit
// floating point numbers
// that consist of one sign bit, an 8 bit biased exponent, and a 23 bit
// unsigned normalized fraction.
// For simplicity the two numbers are considered to be positive.
// The format for the numbers is:
// Format: SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//   31:    S: Sign bit
//   30:23  E: Exponent
//   22:0   F: Fraction
module fp_add_seq(sum, exp_overflow, ready, a_original, b_original, start, clk);
   // Sequential adder for two IEEE 754 single-format operands
   // (bit 31 = S, bits 30:23 = biased exponent E, bits 22:0 = fraction F).
   // One addition takes four clock cycles, one per step.
   //
   // Ports:
   //   sum          - result in the same S/E/F layout.  The sign bit is
   //                  copied from a_original; the operands are assumed
   //                  to be positive.
   //   exp_overflow - updated in the last cycle of each addition: 1 when
   //                  the larger exponent is 255 and the fraction add
   //                  produced a carry, 0 otherwise.
   //   ready        - 1 while the state machine is idle.
   //   a_original,
   //   b_original   - operands, sampled on the clock edge at which start
   //                  is seen while idle.
   //   start        - starts an addition when asserted while idle.
   //   clk          - clock; all state changes on the positive edge.
   //
   // Simplifications in this model:
   //   - A hidden 1 is always inserted, so E = 0 and E = 255 operands
   //     are treated like ordinary normalized numbers.
   //   - Bits shifted out of the smaller operand during alignment are
   //     discarded (no rounding).
   input [31:0]  a_original, b_original;
   input         start, clk;
   output [31:0] sum;
   output        ready;
   output        exp_overflow;

   reg           exp_overflow;
   reg [7:0]     sumexp;
   reg [23:0]    sumfraction;

   // Sign bit: always positive in this case.
   assign sum[31]    = a_original[31];
   assign sum[30:23] = sumexp;
   // Only bits 22:0 are kept: this drops the hidden 1 (sumfraction[23],
   // or c below), which is always 1.
   assign sum[22:0]  = sumfraction[22:0];

   reg [31:0]    a, b;                  // Temporaries used only in step 1.
   reg [23:0]    afraction, bfraction;  // 24 bits to include hidden 1.
   reg [7:0]     aexp, bexp;
   reg           c;                     // Carry out of the fraction add.
   reg [7:0]     diff;                  // Temporary: exponent difference.

   parameter     st_idle  = 0;
   parameter     st_cyc_1 = 1;
   parameter     st_cyc_2 = 2;
   parameter     st_cyc_3 = 3;

   reg [1:0]     state;

   initial begin
      state = st_idle;
      exp_overflow = 1'b0;  // Known value before the first addition.
   end

   assign ready = state == st_idle;

   // Values that are carried from one clock cycle to the next use
   // nonblocking assignments, so logic outside this module that samples
   // ready, sum or exp_overflow on the same clock edge does not race
   // with these updates.  a, b, bexp and diff are written and read
   // within a single cycle, so they use blocking assignments.
   always @( posedge clk )
     case ( state )

       st_idle:
         if ( start ) begin

            /// Step 1: Alignment and Adjust.
            //
            // Put the number with the larger exponent in a.
            if ( a_original[30:23] < b_original[30:23] ) begin
               a = b_original;
               b = a_original;
            end else begin
               a = a_original;
               b = b_original;
            end

            /// Break operands into exponent and fraction, inserting the
            /// hidden 1, and align b's fraction so that both operands
            /// share the exponent aexp.
            bexp = b[30:23];
            diff = a[30:23] - bexp;

            aexp      <= a[30:23];
            afraction <= {1'b1, a[22:0]};
            bfraction <= {1'b1, b[22:0]} >> diff;

            state <= st_cyc_1;

         end

       st_cyc_1:
         begin

            /// Step 2: Add fractions.
            //  The left-hand side is 25 bits wide, so c receives the carry.
            {c, sumfraction} <= afraction + bfraction;

            state <= st_cyc_2;

         end

       st_cyc_2:
         begin

            /// Step 3: Post normalize.
            if ( c ) begin
               // Shift the fraction right, shifting c in, and increment
               // the exponent.  After the shift c is the hidden 1
               // (sumfraction[23]), which is dropped when sum is formed.
               sumexp      <= aexp + 1;
               sumfraction <= {c, sumfraction[23:1]};
            end else begin
               // sumfraction does not change.  sumfraction[23] is the
               // hidden 1, which is dropped when sum is formed.
               sumexp <= aexp;
            end

            state <= st_cyc_3;

         end

       st_cyc_3:
         begin

            /// Step 4: Check for exponent overflow.
            //  aexp is checked instead of sumexp because sumexp has
            //  already wrapped around to 0 after the increment.
            //  The flag is written with 0 as well as 1 so that it does
            //  not stay set after an earlier overflow.
            //  NOTE(review): this model treats E = 255 as an ordinary
            //  exponent; a result exponent of exactly 255 is not flagged.
            exp_overflow <= ( aexp == 8'd255 && c == 1'b1 );

            state <= st_idle;

         end

     endcase

endmodule