///
LSU EE 3755 Computer Organization
//
///
Verilog Notes 10 -- Floating Point
///
Contents
//
//
Binary Floating-Point Representation and Arithmetic
//
IEEE 754 FLP Standard
//
FLP Addition Hardware
///
References
//
//
:P: Palnitkar, "Verilog HDL"
//
:Q: Qualis, "Verilog HDL Quick
Reference Card Revision 1.0"
//
:PH: Patterson & Hennessy,
"Computer Organization & Design"
////////////////////////////////////////////////////////////////////////////////
///
Binary Floating-Point Representation and Arithmetic
//
:PH: 4.8
/// Binary Floating-Point (FLP)
Representations
//
//
// The floating-point (FLP) representations in this section (before
// IEEE 754) are NOT computer representations.
//
// Among other things, that means the number of bits needed to store a
// number is not specified.
//
// Computer representations for FLP numbers are covered in the next
// section, IEEE 754.
///
Binary Fixed Point Representation
//
//
//
Each digit position has a weight.
//
//
//
FXP Binary Number: 1 0 1
0 1. 1 0 0 1
//
Digit Position: 4 3 2 1 0
-1 -2
-3 -4
//
Weight: 16 8
4 2 1 1/2 1/4 1/8 1/16
//
//
Value of number: 1*16 + 0*8 + 1*4 + 0*2
+ 1*1 + 1/2 + 0/4 + 0/8 + 1/16
// = 21.5625
//
//
Other Examples:
//
// 1.1
= 1.5
// 1.01
= 1.25
// 1.11
= 1.75
// 1.001
= 1.125
// 1111.1111 = 15.9375
//
//
//
Fixed Point Decimal to Binary Conversion
//
// To convert decimal number x, 0 < x < 1.
//
// Method 1:
//
// For bit position -1:
// if x >= 1/2, bit is 1, x = x - 1/2;
// if x < 1/2, bit is 0, x
unchanged.
// For bit position -2:
// if x >= 1/4, bit is 1, x = x - 1/4
// if x < 1/4, bit is 0, x
unchanged.
// For bit position -3:
// if x >= 1/8, bit is 1, x = x - 1/8
// if x < 1/8, bit is 0, x
unchanged.
// For bit position -4:
// if x >= 1/16, bit is 1, x = x -
1/16
// if x < 1/16, bit is 0, x unchanged.
// Etc.
//
// Example:
// x= .75
// For bit position -1:
// x = .75 >= 1/2, bit is 1 and updated
x is (0.75) - (0.5) = 0.25;
// first
bit = 1(MSB of fraction).
// For bit position -2:
// x = .25 >= 1/4, bit is 1 and updated
x is (0.25) - (0.25) = 0;
// second
bit = 1.
// For bit position -3:
// x = 0 < 1/8, bit is 0 and x unchanged ; x= 0;
// third bit = 0.
// For bit position -4:
// x = 0 < 1/16, bit is 0 and x unchanged ; x= 0;
// fourth bit = 0.
// so result is .1100.
// Method 2:
//
// Let r be the number of bits past binary
point desired.
//
// Convert x * 2^r to binary.
//
// MSB is first bit past binary point, etc.
//
// Example:
// r = 4,
x = .75
// Convert .75 * 2^4 = 12 to binary: 1100
// x in binary is: .1100
//
// This is equivalent to repeatedly multiplying by 2 and keeping
// the integer part of each product as the next bit (the remaining
// fractional part is carried into the next step).
// The first bit produced is the MSB of the fraction.
// .75 * 2 = 1.5 keep 1
// .5  * 2 = 1.0 keep 1
// 0   * 2 = 0   keep 0
// 0   * 2 = 0   keep 0
// so result is .1100 for a 4-bit representation.
//
Examples to 12 digits:
//
// 1.1 = 1.000110011001... 1.1 * 2^12 = 4505 = 1000110011001
// 1.2 = 1.001100110011...
// 1.3 = 1.010011001100...
// 1.4 = 1.011001100110...
// 1.5 = 1.1
//
//
Note:
//
//
Common numbers such as 0.2 do not have exact representations.
/// Binary Scientific Notation
//
//
Binary Scientific Representation Similar to
// Decimal Scientific Notation
//
// Decimal: SIGN SIGNIFICAND(FRACTION) x 10^{EXPONENT}
// Binary:
SIGN SIGNIFICAND(FRACTION) x 2^{EXPONENT}
//
// In scientific notation the significand is not necessarily a pure
// fraction: it may have digits before the radix point.
//
// Decimal Examples:
//
// 1.23 x 10^{2} = 123
// 1.23 x 10^{0} = 1.23
// 1.23 x 10^{-1} = .123
// Examples above are normalized
// (only one non-zero digit before radix
point).
// Examples below are not(more than one
non-zero digit or
// zero digit before radix point).
//
// 12.3 x 10^{1} = 123
// .123 x 10^{1} = 1.23
// 123 x 10^{-3} = .123
//
// Binary Examples
//
// 1 x 2^{0} = 1 = 1
// 1 x 2^{1} = 10 = 2
// 1 x 2^{2} = 100 = 4
// 1.1 x 2^{2} = 110 = 6
// 1.1 x 2^{1} = 11 = 3
// 1.1 x 2^{0} = 1.1 = 1.5
// 1.1 x 2^{-1} = .11 = .75
// Examples above are normalized (exactly one digit, a 1, before the
// radix point).
// The examples at the end of this list (11 x 2^{...}) are not normalized
// (more than one digit, or a zero digit, before the radix point).
// So when a binary number is normalized, there is always a 1 before
// the radix point.
// IEEE 754 drops that 1 (the "hidden 1") and stores only the bits
// after the radix point (.xxxxx), saving 1 bit.
// For the IEEE 754 format the significand (fraction) is therefore 1.xxxxxx.
// 11 x 2^{1} = 110 = 6
// 11 x 2^{0} = 11 = 3
// 11 x 2^{-1} = 1.1 = 1.5
// 11 x 2^{-2} = .11 = .75
/// Addition Using Scientific Notation
//
//
Consider:
//
// a_scand x 2^{a_exp}
// b_scand x 2^{b_exp}
//
// Assume a is larger magnitude number.
//
(a>b or for simplicity a_exp >= b_exp).
//
// To add these:
//
// Set b'_exp = a_exp. //adjustment
// Set b'_scand = b_scand / 2^(a_exp - b_exp) //shift right
// Set s_scand = a_scand + b'_scand //add
// Normalize result.
//
//
//
//
//
Subtraction is similar.
/// Multiplication Using Scientific Notation
//
Not biased exponents.
//
//
Consider:
//
// a_scand x 2^{a_exp}
// b_scand x 2^{b_exp}
//
// To multiply these:
//
// Set p_scand = a_scand x b_scand
// Set p_exp = a_exp + b_exp
// Normalize p //having only one digit
before radix point
//
// Product is p_scand x 2^{p_exp}
//
//
////////////////////////////////////////////////////////////////////////////////
///
IEEE 754 FLP Standard
//
:PH: 4.8
/// Standard Specifies
//
//
Formats of FLP numbers. (There are several sizes.)
/// Features
//
//
Can Represent:
// Floating-point number.
// + and - Infinity, and other special values.
//
//
Special Properties
// Positive Zero is 0.
/// Sizes
//
//
Single: 32 bits.
//
Double: 64 bits.
//
//
Format Specifies:
//
// Sign.
// Exponent.
// Significand (Fraction)
//
//
Slight Complications :
//
// Exponent is biased.
// Significand may not include MSB.
// We assume normalized fraction and normalized
fraction means
// there is always 1 at MSB part
// So we drop the MSB(hidden 1) and when we
convert the FLP to
// decimal number we bring back the hidden 1.
// See the examples below.
/// IEEE 754 Single Format
//
//
Format:
SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//
31: S: Sign bit: 1 negative, 0
positive.
//
30-23: E: Biased Exponent. (Exponent is E-127)
//
22-0: F: Significand (Fraction)
//
//
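// E, the biased exponent, is in the range 0000 0000 to 1111 1111
// (0 to 255).
// So the raw value E - 127 ranges from
// 0000 0000 - 127, which is -127, to 1111 1111 - 127, which is 128
// (-127 to 128).
// E = 0 and E = 255 are reserved for the special cases listed below,
// so normalized numbers use 0 < E < 255, i.e. exponents -126 to 127.
// The IEEE 754 single format uses a bias of 127.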
// Case Value formula.
// 0 < E < 255, S = 0 : ( 1.0 + F / 2^{23} ) 2^{E-127} //this 1.0
// is
hidden 1.
// 0 < E < 255, S = 1 : - ( 1.0 + F / 2^{23} ) 2^{E-127}
// E = 0, S = 0, F = 0 :
0
// E = 0, S = 1, F = 0 : -
0
// E = 255, S = 0, F = 0: Infinity
// E = 255, S = 1, F = 0: - Infinity
/// IEEE 754 Double Format
//
//
Format: SEEEEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
//
63: S: Sign bit: 1 negative, 0
positive.
//
62-52: E: Biased Exponent. (Exponent is E-1023)
//
51-0: F: Significand (Fraction)
//
//
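// E, the biased exponent, is in the range (0 to 2047),
// so the raw value E - 1023 ranges over (-1023 to 1024).
// E = 0 and E = 2047 are reserved for the special cases listed below,
// so normalized numbers use 0 < E < 2047, i.e. exponents -1022 to 1023.
// The IEEE 754 double format uses a bias of 1023.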
//
// Case Value formula.
// 0 < E < 2047, S = 0
: ( 1.0 + F / 2^{52} )
2^{E-1023}
// 0 < E < 2047, S = 1
: - ( 1.0 + F / 2^{52} )
2^{E-1023}
// E = 0, S = 0, F = 0 :
0
// E = 0, S = 1, F = 0 :
- 0
// E = 2047, S = 0, F = 0 : Infinity
// E = 2047, S = 1, F = 0 : - Infinity
/// IEEE 754 Single Format Examples: IEEE 754 to Value
//
//
Single FLP: 32h'3fc00000
// =
32b'00111111110000000000000000000000
//
SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//
// 0 01111111
10000000000000000000000
// S EEEEEEEE
FFFFFFFFFFFFFFFFFFFFFFF
//
// S = 0, E = 7f (hex) = 127, F = 400000 (hex) = 4194304
//
//
Based on value of S and E, the following case applies:
//
// 0 < E < 255, S = 0 : ( 1.0 + F / 2^{23} ) 2^{E-127}
//
// Value = ( 1.0 + 4194304 / 2^{23} )
2^{127-127}
// = ( 1.0 + 0.5 )
// = 1.5
//
Single FLP: 32h'456ab000
// =
32b'01000101011010101011000000000000
//
SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//
// 0 10001010
11010101011000000000000
// S EEEEEEEE FFFFFFFFFFFFFFFFFFFFFFF
//
// S = 0, E = 8a = 138, F 6ab000 = 6991872
//
//
Based on value of S and E, the following case applies:
//
// 0 < E < 255, S = 0 : ( 1.0 + F / 2^{23} ) 2^{E-127}
//
// Value = ( 1.0 + 6991872 / 2^{23} ) 2^{138-127}
// = ( 1.0 + 0.833496 ) 2048
// = 3755
//
Single FLP: 32h'c0490fdb
// =
32b'11000000010010010000111111011011
//
SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//
// 1 10000000
10010010000111111011011
// S EEEEEEEE
FFFFFFFFFFFFFFFFFFFFFFF
//
// S = 1, E = 80 = 128, F 490fdb = 4788187
//
//
Based on value of S and E, the following case applies:
//
// 0 < E < 255, S = 1 : - ( 1.0 + F / 2^{23} ) 2^{E-127}
//
// Value = - ( 1.0 + 4788187 / 2^{23} )
2^{128-127}
// = - ( 1.0 + 0.570796 ) 2
// = -3.14159
//
Single FLP: 32h'0
// =
32b'00000000000000000000000000000000
//
SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//
// 0 00000000
00000000000000000000000
// S EEEEEEEE
FFFFFFFFFFFFFFFFFFFFFFF
//
// S = 0, E = 0,
F = 0
//
//
Based on value of S and E, the following case applies:
//
// E = 0, S = 0, F = 0 :
0
//
// Value = 0
//
Single FLP: 32h'7f800000
// =
32b'01111111100000000000000000000000
//
SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
//
// 0 11111111
00000000000000000000000
// S EEEEEEEE
FFFFFFFFFFFFFFFFFFFFFFF
//
// S = 0, E = 255,
F = 0
//
//
Based on value of S and E, the following case applies:
//
// E = 255, S = 0, F = 0: Infinity
//
// Value = Infinity.
/// IEEE 754 Single Format Examples: Value to IEEE 754
//
//
Value (decimal): 12.75
// Convert to binary: 1100.11
// Convert to normalized binary scientific
notation: 1.10011 x 2^3
//
//
// S = 0
(its positive)
// E = 127 + 3 = 130 = 1000 0010
// F = 10011 000000000000000000 (Notice we
dropped the 1(hidden 1)
// before the
binary point).
//
// Single: 0
1000 0010 10011
000000000000000000
// = 0100 0001 0100 1100 0000 0000 0000
0000
// = 32h'414c0000
////////////////////////////////////////////////////////////////////////////////
///
FLP Addition Hardware
//
FLP arithmetic hardware simple in principle,
//
but details can be very complicated.
//
Only hardware for FLP adder shown.
/// FLP Addition Hardware Examples
//
// Two Adders
//
// Combinational.
// Sequential.
//
//
//
:Example:
//
Add IEEE 754 Single
//
Combinational floating point adder.
Computes the sum of two 32bit
//
floating point numbers
//
that consist of one sign bit and 8 bit biased exponents and 23 bit
//
unsigned normalized fractions.
//
for simplicity the two numbers are considered to be positive.
//
the format for the number is :
// Format:
SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
// 31: S: Sign bit:
// 30:23 E: Exponent :
// 22:0 F: Fraction :
// fp_add: combinational adder for two IEEE 754 single-format numbers.
//
// Ports:
//   sum          (output, 32 bits) result in SEEEEEEEEFFF...F layout.
//   exp_overflow (output, 1 bit)   1 when the larger exponent is already
//                                  255 and the fraction add carries out,
//                                  so the 8-bit result exponent wraps;
//                                  0 otherwise.
//   a_original,
//   b_original   (input, 32 bits)  operands; both are treated as positive,
//                                  normalized numbers (hidden 1 is
//                                  inserted unconditionally).
//
// The sign of the result is copied from a_original; no rounding is done.
module fp_add(sum, exp_overflow, a_original, b_original);
   input [31:0]  a_original, b_original;
   output [31:0] sum;
   output        exp_overflow;

   reg           exp_overflow;
   reg [7:0]     sumexp;
   reg [23:0]    sumfraction;

   // Sign bit: operands are assumed positive in this example, so the
   // sign of a_original is passed through.
   assign sum[31]    = a_original[31];
   assign sum[30:23] = sumexp;
   // Only bits 22:0 are stored: this drops the hidden 1
   // (sumfraction[23]), which is always 1 after normalization.
   assign sum[22:0]  = sumfraction[22:0];

   reg [31:0] a, b;
   reg [23:0] afraction, bfraction; // 24 bits to include hidden 1.
   reg [7:0]  aexp, bexp;
   reg        c;                    // Carry out of the fraction add.
   reg [7:0]  diff;

   always @( a_original or b_original )
     begin

        /// Compute Floating-Point Sum in Four Steps

        /// Step 1: Adjust and Alignment
        //
        // Put the number with the larger exponent in a.
        if ( a_original[30:23] < b_original[30:23] ) begin
           a = b_original;  b = a_original;
        end else begin
           a = a_original;  b = b_original;
        end

        // Break operands into exponent and fraction.
        aexp = a[30:23];  bexp = b[30:23];
        afraction = {1'b1, a[22:0]}; // Inserting hidden 1
        bfraction = {1'b1, b[22:0]}; // Inserting hidden 1

        // Alignment: shift b right so that both use exponent aexp.
        diff = aexp - bexp;
        bfraction = bfraction >> diff;

        /// Step 2: Add fractions.
        c = 0;
        {c, sumfraction} = afraction + bfraction;

        /// Step 3: Post normalize.
        if ( c ) begin
           // Carry out: shift the 25-bit sum {c,sumfraction} right by
           // one and increment the exponent.  After the shift c sits in
           // bit 23 as the hidden 1, which is dropped at the output.
           sumexp = aexp + 1;
           sumfraction = {c, sumfraction[23:1]};
        end else begin
           // No carry: sumfraction is unchanged and sumfraction[23] is
           // the hidden 1, which is dropped at the output.
           sumexp = aexp;
        end

        /// Step 4: Check for exponent overflow.
        //
        // Give the flag a value on every evaluation so that no latch is
        // inferred and a previous overflow does not stick.
        // aexp is checked rather than sumexp because sumexp has already
        // wrapped around by the time the increment has been done.
        exp_overflow = 0;
        if ( aexp == 8'd255 && c == 1 ) exp_overflow = 1;

     end

endmodule
//
:Example:
//
Add IEEE 754 Single
// Floating Point Sequential adder. Computes the sum of two 32bit
//
floating point numbers
//
that consist of one sign bit and 8 bit biased exponents and 23 bit
//
unsigned normalized fractions.
//
for simplicity the two numbers are considered to be positive.
//
the format for the number is :
//
Format:
SEEEEEEEEFFFFFFFFFFFFFFFFFFFFFFF
// 31: S: Sign bit:
// 30:23 E: Exponent:
// 22:0 F: Fraction:
//
// fp_add_seq: sequential adder for two IEEE 754 single-format numbers.
//
// Ports:
//   sum          (output, 32 bits) result in SEEEEEEEEFFF...F layout.
//   exp_overflow (output, 1 bit)   1 when the larger exponent is already
//                                  255 and the fraction add carries out;
//                                  cleared when a new addition starts.
//   ready        (output, 1 bit)   1 while the state machine is idle.
//   a_original,
//   b_original   (input, 32 bits)  operands; both are treated as positive,
//                                  normalized numbers.
//   start        (input)           sampled at a rising clock edge while
//                                  idle; begins an addition.
//   clk          (input)           clock; all state changes on posedge.
//
// One step is performed per clock: align, add, normalize, overflow
// check.  The operands are captured in the cycle that start is seen.
// sum[31] is driven continuously from a_original[31].
module fp_add_seq(sum, exp_overflow, ready, a_original, b_original, start, clk);
   input [31:0]  a_original, b_original;
   output [31:0] sum;
   input         start, clk;
   output        ready;
   output        exp_overflow;

   reg           exp_overflow;
   reg [7:0]     sumexp;
   reg [23:0]    sumfraction;

   // Sign bit: operands are assumed positive in this example, so the
   // sign of a_original is passed through.
   assign sum[31]    = a_original[31];
   assign sum[30:23] = sumexp;
   // Only bits 22:0 are stored: this drops the hidden 1
   // (sumfraction[23]), which is always 1 after normalization.
   assign sum[22:0]  = sumfraction[22:0];

   reg [31:0] a, b;
   reg [23:0] afraction, bfraction; // 24 bits to include hidden 1.
   reg [7:0]  aexp, bexp;
   reg        c;                    // Carry out of the fraction add.
   reg [7:0]  diff;

   parameter  st_idle  = 0;
   parameter  st_cyc_1 = 1;
   parameter  st_cyc_2 = 2;
   parameter  st_cyc_3 = 3;

   reg [1:0]  state;

   initial begin
      state = st_idle;
      exp_overflow = 0;  // Defined value before the first addition.
   end

   assign ready = state == st_idle;

   // Blocking assignments are kept here on purpose: Step 1 relies on
   // in-order evaluation (a/b, then aexp/bexp, then diff, then shift).
   always @( posedge clk )
     case( state )

       st_idle:
         if ( start ) begin

            // A new addition begins: drop any flag left from the
            // previous one.
            exp_overflow = 0;

            /// Step 1: Alignment and Adjust.
            //
            // Put the number with the larger exponent in a.
            if ( a_original[30:23] < b_original[30:23] ) begin
               a = b_original;  b = a_original;
            end else begin
               a = a_original;  b = b_original;
            end

            // Break operands into exponent and fraction.
            aexp = a[30:23];  bexp = b[30:23];
            afraction = {1'b1, a[22:0]}; // Inserting hidden 1
            bfraction = {1'b1, b[22:0]}; // Inserting hidden 1

            // Alignment: shift b right so that both use exponent aexp.
            diff = aexp - bexp;
            bfraction = bfraction >> diff;

            state = st_cyc_1;
         end

       st_cyc_1:
         begin
            /// Step 2: Add fractions.
            c = 0;
            {c, sumfraction} = afraction + bfraction;
            state = st_cyc_2;
         end

       st_cyc_2:
         begin
            /// Step 3: Post normalize.
            if ( c ) begin
               // Carry out: shift the 25-bit sum {c,sumfraction} right
               // by one and increment the exponent.  After the shift c
               // sits in bit 23 as the hidden 1, which is dropped at
               // the output.
               sumexp = aexp + 1;
               sumfraction = {c, sumfraction[23:1]};
            end else begin
               // No carry: sumfraction is unchanged and sumfraction[23]
               // is the hidden 1, which is dropped at the output.
               sumexp = aexp;
            end
            state = st_cyc_3;
         end

       st_cyc_3:
         begin
            /// Step 4: Check for exponent overflow.
            //
            // aexp is checked rather than sumexp because sumexp has
            // already wrapped around after the increment in step 3.
            if ( aexp == 8'd255 && c == 1'b1 ) exp_overflow = 1;
            state = st_idle;
         end

     endcase

endmodule