## LSU EE 3755 --
Fall 2009-- Computer
Organization
#
## MIPS Notes 5 -- More Procedure Code and
Compiler
#procedure swap
#1. registers to program
variables.
#2. Produce code for the body of the
procedure
#3. Preserve registers across the procedure
invocation
#
swap(int v[ ], int k)
#
{
#
int temp;
#
temp = v[k];
# v[k]
= v[k+1];
#
v[k+1] = temp;
#
}
# Swap
#
Register Usage
#
#
$a0 : Address of array element v[0].
#
$a1: index k.
#
$t1 : k * 4 ; because array v element is integer which is 4byte long.
#       $t0 : temp ; holds the old v[k] during the swap.
#       $t2 : holds the old v[k+1] during the swap.
#-----------------------------------------------------------------------
# swap(int v[], int k)  --  exchange v[k] and v[k+1]
# In:    $a0 = v (address of v[0]),  $a1 = k (element index)
# Out:   memory: v[k] and v[k+1] exchanged
# Clobb: $t0, $t1, $t2   (leaf procedure: $ra untouched, no stack frame)
#-----------------------------------------------------------------------
swap:
        sll     $t1, $a1, 2             # $t1 = k * 4 (ints are 4 bytes)
        add     $t1, $a0, $t1           # $t1 = v + k*4 = address of v[k]
        lw      $t0, 0($t1)             # $t0 = v[k]    (temp)
        lw      $t2, 4($t1)             # $t2 = v[k+1]
        sw      $t2, 0($t1)             # v[k]   = old v[k+1]
        sw      $t0, 4($t1)             # v[k+1] = old v[k]  -- swapped
        jr      $ra                     # return to calling routine
# other
way of doing this
# add $t1, $a1, $zero //only these two lines
are different
#       sll $t1,$t1,2      //......... $t1 = k*4
# add $t1, $a0, $t1 //
remaining are same
# lw $t0,0($t1)
# $t0 (temp) = v[k]
# $t0 has the value of v[k]
# lw $t2,4($t1)
# $t2 =
v[k+1]
#
$t2 has the value of v[k+1]
# sw $t2,0($t1)
# save the value of v[k+1] to address of v[k]
# sw $t0,4($t1)
# save the value of v[k] to address of v[k+1]
# so swapped
# jr $ra
# return to calling routine
# # changing that to
one instruction
# sll $a1, $a1, 2
// now we are using only one instruction
// instead of two instructions.
// code becomes short.
//
is this better than above?
# add $t1, $a0, $a1 // # $t1 has the address of
v[k]
// only above two are different
// remaining are
same
# lw $t0,0($t1)
# $t0 (temp) = v[k]
# $t0 has the value of v[k]
# lw $t2,4($t1)
# $t2 =
v[k+1]
# $t2 has the value of v[k+1]
# sw $t2,0($t1)
# save the value of v[k+1] to address of v[k]
# sw $t0,4($t1)
# save the value of v[k] to address of v[k+1]
# so swapped
# jr $ra
# return to calling routine
### This
is Why..
#
main() // incomplete
code.
# for (k =
1; k < 10; k++)
{
# swap(v[],
k);
#
}
#
##
# procedure sort
# sort (int v[], int n)
# {
#
int i, j;
#
for(i = 0; i < n; i = i +1) {
#
for (j = i-1; j >= 0 && v[j] > v[j+1]; j = j-1) {
swap(v,j);}
#
}
#
}
# }
## primitive way of sorting is shown
below
##(slower than above one)
## sort (int v[], int n)
## {
##
int i, j;
##
for(i = 0; i < n; i = i +1) {
##
for (j = 0; j <n; j = j+1) {
##
if(v[j]
> v[j+1])
swap(v,j);
##
}
##
}
##
}
## }
#
MIPS code for the first sort program
#
Register usage :
#
$a0 : Address of array element v[0].
#
$a1: Array size n
#
$s0: index i for for loop
#
$s1: index j for for loop
# saving registers
#
#-----------------------------------------------------------------------
# sort(int v[], int n)  --  insertion sort, calls swap(v, j)
# In:    $a0 = v (address of v[0]),  $a1 = n (array size)
# Out:   memory: v[0..n-1] sorted ascending
# Uses:  $s0 = i, $s1 = j, $s2 = saved v, $s3 = saved n
# Fixes vs. lecture transcription:
#   - "beg" typo corrected to beq (no such MIPS instruction as beg).
#   - Restored the lost inner-loop back edge (j = j-1; goto for2tst)
#     after the call to swap; without it the inner loop never iterates.
#-----------------------------------------------------------------------
sort:
        # Prologue: make room on the stack for 5 registers.
        # $s0-$s3 are callee-saved; $ra is clobbered by "jal swap".
        addi    $sp, $sp, -20           # allocate 5 words
        sw      $ra, 16($sp)            # save return address
        sw      $s3, 12($sp)            # save $s3
        sw      $s2,  8($sp)            # save $s2
        sw      $s1,  4($sp)            # save $s1
        sw      $s0,  0($sp)            # save $s0

        # Copy the arguments: swap() reuses $a0/$a1, so keep them safe.
        move    $s2, $a0                # $s2 = v (array base)
        move    $s3, $a1                # $s3 = n (array size)

        move    $s0, $zero              # i = 0
for1tst:
        slt     $t0, $s0, $s3           # $t0 = 1 if i < n, else 0
        beq     $t0, $zero, exit1       # leave outer loop when i >= n
        addi    $s1, $s0, -1            # j = i - 1 for the inner loop
for2tst:
        slti    $t0, $s1, 0             # $t0 = 1 if j < 0
        bne     $t0, $zero, exit2       # leave inner loop when j < 0
        add     $t1, $s1, $s1           # $t1 = j * 2
        add     $t1, $t1, $t1           # $t1 = j * 4
        add     $t2, $s2, $t1           # $t2 = v + j*4 = address of v[j]
        lw      $t3, 0($t2)             # $t3 = v[j]
        lw      $t4, 4($t2)             # $t4 = v[j+1]
        slt     $t0, $t4, $t3           # $t0 = 1 if v[j+1] < v[j]
        beq     $t0, $zero, exit2       # already in order: inner loop done
        move    $a0, $s2                # 1st parameter of swap is v
        move    $a1, $s1                # 2nd parameter of swap is j
        jal     swap                    # swap(v, j); jal overwrites $ra
        addi    $s1, $s1, -1            # j = j - 1
        j       for2tst                 # re-test the inner-loop condition
exit2:
        addi    $s0, $s0, 1             # i = i + 1
        j       for1tst                 # re-test the outer-loop condition

        # Epilogue: restore saved registers, release the frame, return.
exit1:
        lw      $s0,  0($sp)
        lw      $s1,  4($sp)
        lw      $s2,  8($sp)
        lw      $s3, 12($sp)
        lw      $ra, 16($sp)
        addi    $sp, $sp, 20            # deallocate 5 words
        jr      $ra                     # return to calling procedure
#
Compiler
# Contents
#
# Optimization Introduction
# Steps in Preparing a
Program
# Compiler Definitions
# High-Level Optimizations
# Low-Level Optimizations
# Compiler Optimization
Options
# Use of Compiler Switches
#Lecture Goals
#
# Understand the Program Building and Compilation
Process
#
Describe steps in program building and optimization,
including
#
intermediate files (assembler, object, ...) and tool names
(preprocessor,
#
compiler, etc.).
#
# Understand Specific Optimizations and Assumption
Switches
#
Describe, work example, explain benefit.
#
# Understand Profiling
# Steps,
how performance is improved.
#
# Understand ISA and Implementation
Options
# How
programmer chooses them, how compiler uses them.
#
# Understand how Programmers Use Compilation
Options
# Normal
program development, high-performance programs, SPEC
disclosure.
###################
# Optimization Introduction
# :Def: Optimization
# The optional steps in compiling a program that
reduce the program's
#execution time, reduce the size of the program, or
both.
# Typically, the only time optimization is NOT done
is when a program
# is being debugged.
#
# In most cases, the programmer sets overall
optimization effort (low,
#medium, high).
#
#When performance is very important the programmer
can specify which
# specific optimizations to
use.
# :Example:
#
# Data on a program that computes pi with and without
optimization.
#
# System:
SunOS sol 5.6 Generic_105181-31 sun4u sparc
SUNW,Ultra-Enterprise
# Compiler: Sun WorkShop 6 2000/04/07 C
5.1
#Clock Frequency: 300 MHz
#
# Without Optimization
#
Size
: 6408 bytes
#
Run Time
: 3.00 s
#
Instruction Count : 325,338,749
#
CPI
: (/ (* 3.00 300e6) 325338749.0 ) = 2.7663
#
# With Optimization
#
Size
: 6340 bytes
Smaller! (OK, only a tiny bit smaller.)
!
Run Time
: 1.65 s
Faster!
!
Instruction Count : 100,338,751
!
CPI
: (/ (* 1.65 300e6) 100338751.0) = 4.9333
!
# Reasons *not* to Optimize
#
# Percentage shows roughly how often reason
applies.
#
# 95% It
makes debugging difficult, so don't optimize while
debugging.
# This is true for
almost everyone that uses a debugger.
#
#10% It
slows down compilation.
# Only important when
there is a very large amount of code to
# recompile. (Back in the 20th century when computers
were slow
# this was
important.)
#
#
1% Optimization introduces
bugs.
# It does, but very
rarely.
##############################
# Steps in Building a
Program
# Typical Steps in Building a
Program
#
#
Pre-Process, Compile, Assemble, Link, Load
#
pi.c ->
pi.i -> pi.s -> pi.o -> pi
#
# These steps can all be automatically performed by
the compiler
# driver. (cc, gcc, MS Visual Studio,
etc.)
#
#:Sample: cc pi.c -o pi
#
# They can also be specified
individually.
#
# More details appear below, compile is what we're
interested in.
# Pre-Process
# Load
include files, expand macros.
#
Typical pre-processor name: cpp
# Input
File: pi.c
# Output
File: pi.i (High-Level
Language)
# Compile
#
Convert high-level language into assembly
language.
#
Typical compiler name: cc1 (not cc, or gcc, or Visual
Studio)
# Input
File: pi.i
# Output
File: pi.s (Assembly
language)
# Assemble
#
Convert assembly language into machine
language.
#
Typical assembler name: as
#
Input File: pi.s
#
Output File: pi.o (Object
file containing machine code.)
# Link
#
Combine object files, libraries, and other code into
executable.
#
Typical linker name: ld
# Input
File: pi.o
# Output
File: pi
# Load
#
Copy executable into memory, link with shared libraries, and
start.
#
Loader name: exec system call.
#######################################
# Compiler Definitions
# A
program is compiled using steps or
/passes/.
#
# The
first pass reads the pre-processed high-level source
code.
#
#
The last pass produces assembly code.
#
#
Between the first and last passes the program is in
some
#
intermediate representation.
#
#
The way passes are defined and organized varies greatly
from
#
compiler to compiler.
# :Def: Pass
#
A step in compiling a program.
A pass usually looks at an entire function.
# :Def: Intermediate
Representation
# The form of a program (sort of a special-purpose
language) used internally
# by a compiler. A compiler might use several
intermediate representations.
# Typical Passes
#
# Parse
#
Convert the source code to a high-level intermediate representation
(H-IR).
#
#High-Level Optimization (Optional, may be done in
several passes.)
#
Also called front-end optimization.
#
Modify H-IR to improve performance or reduce code
size.
# Reads
and writes H-IR
#
# Low-Level Intermediate Representation (L-IR)
Generation
#
Convert H-IR to a low-level intermediate representation
(L-IR).
#
# Low-Level Optimization (Optional, may be done in several
passes.)
#
Also called back-end optimization.
#
Modify L-IR to improve performance or reduce code
size.
#
# Register Assignment (Part of low-level
optimization.)
# Choose
machine registers.
#
# Code Generation
#
Convert L-IR to assembly code.
#
# Pigeonhole Optimizations (Optional, may be done in several
passes.)
#
These are also called low-level optimizations.
#
Modify L-IR to improve performance or reduce code
size.
# Some
of these can be done at link time.
#:Def: Compiler Front End
# The parts of the compiler that do the parsing and
high-level
# optimization passes.
# :Def: Compiler Back End
# The parts of the compiler that do the low-level
optimization,
#register assignment, and code generation
passes.
##############################
# High-Level Optimizations
# Easy high-level optimizations presented
here.
#Some Easy-To-Explain Front-End
Optimizations
#
# Dead-Code Elimination
(DCE)
# Common Subexpression Elimination
(CSE)
# Constant Propagation,
Folding
# Dead-Code Elimination
(DCE)
#
Removal of code which isn't used.
# Yes,
it happens.
# :Example:
#
# Code benefiting from DCE
# High-level code shown for clarity. Most compilers will
transform
# an intermediate
representation.
#
# Before:
#
int
main(int argv, char **argc)
{
double
i; double sum =
0;
for(i=1; i<50000000;)
{
sum = sum + 4.0
/ i; i +=
2;
sum = sum - 4.0
/ i; i +=
2;
}
printf("Performed %d iterations.
Thank you for running me.\n",i);
}
#
# After:
#
int
main(int argv, char **argc)
{
double
i;
for(i=1; i<50000000;)
{
i +=
2;
i +=
2;
}
printf("Performed %d iterations.
Thank you for running me.\n",i);
}
#
# Note:
Other optimizations would leave only the printf.
# Common
Subexpression Elimination
# Remove
duplicated code.
#
# Before:
r = ( a
+ b ) / ( x + y );
s = ( a
+ b ) / ( x - y );
# After:
temp =
a + b;
r = (
temp ) / ( x + y );
s = (
temp ) / ( x - y );
# Constant Propagation,
Folding
# The
compiler performs whatever arithmetic it can at compile
time
#
rather than emitting code to perform the arithmetic at run
time.
#
# Before:
int
sec_per_day = 60 * 60 * 24;
int
sec_per_year = sec_per_day * 365;
some_routine(sec_per_day * x, sec_per_year *
y);
# After:
int
sec_per_day = 86400;
int
sec_per_year = 31536000;
some_routine(86400 * x, 31536000 * y);
#####################################
# Low-Level Optimizations
# Some Low-Level
Optimizations
#
# This is not a complete
list.
#
# Register Assignment
# Instruction Selection
# Scheduling
# :Def: Register Assignment
# Selection of which values will be held in
registers.
Values
#not held in registers are stored in
memory.
#
# Without Register Assignment
Optimizations
# All values corresponding to variables (in
high-level program) are
# written to memory (and not held in registers). Intermediate results
are
# held in registers.(accessing memory takes longer
time than accessing regs.)
#
# With Register Assignment
Optimization
# Registers are assigned to as many variables as
possible, with priority
# given to frequently used
variables.
#
#Advantage of Register Assignment
Optimization
# Fewer memory writes and
reads.(faster)
# :Def: Scheduling
# Re-arranging instructions to minimize the amount of
time one instruction
# has to wait for another.
#
#For example, if an instruction takes a long time it
will be started early
# so that other instructions will not have to wait
for its result.
#
#
######################################
## Compiler Optimization
Options
# Optimization options tell the
compiler:
#
# How
much EFFORT to put into optimization.
E.g., -O2(do level 2 optimization)
#
Which PARTICULAR optimizations to perform. E.g.,
-fno-strength-reduce
# (loop
optimization)
# The
TARGET system the code will be running on.
E.g., -xultra2 (ultra2
machine)
#
Whether to make certain ASSUMPTIONS about the
code.
#
Whether to use PROFILING data from a training
run.
##Optimization EFFORT
#
# Most compilers have optimization
levels.
#
# The higher the number, the more optimizations
done.
# :Example:
#
# Optimization Levels for Sun Workshop 6
Compiler
# -xO
[1|2|3|4|5]
#
Optimizes the object code. Note the upper-case
letter
#
O.
#
The levels (1, 2, 3, 4, or 5) you can use
differ
#
according to the platform you are using.
#
( SPARC)
#
-xO1 Does basic local optimization (peephole).
#
peephole: compiler is looking only small part of code
#
and does optimization only for that part.
#
if
compiler is looking larger part of code,
#
it may produce better code. but that kind of
#
process
takes more time and complicates.
#
-xO2 Does basic local and global optimization.
#
This is induction variable elimination, local
#
and global common subexpression elimination,
#
algebraic simplification, copy propagation,
#
constant propagation, loop-invariant optimi-
#
zation, register allocation, basic block
#
merging, tail recursion elimination, dead
#
code elimination, tail call elimination and
#
complex expression expansion.
#
: Any compiler Book will explain these.
#
The -xO2 level does not assign global, exter-
#
nal, or indirect references or definitions to
#
registers. It treats these references and
#
definitions as if they were declared "vola-
#
tile." In general, the -xO2 level results in
#
minimum code size.
#
-xO3 Performs like -xO2 but, also optimizes
refer-
#
ences or definitions for external variables.
#
Loop unrolling and software pipelining are
#
also performed. The -xO3 level does not trace
#
the effects of pointer assignments. When com-
#
piling either device drivers, or programs
#
that modify external variables from within
#
signal handlers, you may need to use the
#
volatile type qualifier to protect the object
#
from optimization. In
general, the -xO3
#
level results in increased code size.
#
: loop unrolling : saving time at branch
instruction.
#
for(i=0;i<3;++i) s
= s+i;
#
becomes : s =
s+0;
#
s
= s+1;
#
s= s+ 2;
#
longer code but no conditional branch.
#
-xO4 Performs like -xO3 but, also does
automatic
#
inlining
of functions contained in the same
#
file; this usually improves execution speed.
#
The -xO4 level does trace the effects of
#
pointer assignments. In
general, the -xO4
#
level
results in increased code size.
#
If you want to control which functions are
#
inlined, see -xinline .
#
-xO5 Generates the highest level of
optimization.
#
Uses optimization algorithms that take more
#
compilation time or that do not have as high
#
a certainty of improving execution time.
#
Optimization at this level is more likely to
#
improve performance if it is done with pro-
#
file feedback.
## PARTICULAR Optimizations
#
# The levels specify sets of optimizations (like
ordering the "Sport Package"
# for a new car).
#
# In contrast to optimization levels (-O3), the
compiler can be told which
# particular optimizations to
make.
#
# These are typically used by skilled programmers
trying to get
# fastest code.
#
# Sun Workshop 6
#
# Prefetch instructions read memory in advance,
eliminating some cache
# misses .
#
Cache is expensive but a
high speed memory.
#
#
# They can also increase
# cache misses or increase the time to access memory,
hurting
# performance.
The compiler does not know if they will help or
hurt.
#
#
-xprefetch[=val],val
#
(SPARC) Enable prefetch instructions on those
architec-
#
tures that support prefetch, such as UltraSPARC
II.
#
(-xarch=v8plus, v9plusa, v9, or v9a)
#
Explicit prefetching should only be used under
special
#
circumstances that are supported by
measurements.
## TARGET
#
# Specify exact type of machine code will be run
on.
#
# What might be specified:
#
# ISA
# The exact instruction set
used.
# Specifies not just the ISA family, but a particular
variation.
# A poor choice will limit the number of machines
code can run on.
#
# Implementation
# Specify the implementation code will run
on.
# A poor choice will result in slower
code.
#
# Cache
#Specify configuration of
cache.
#A poor choice will result in slower
code.
## Background
#
# Sun SPARC Architecture
#
#
Developed by Sun
#
# Major Versions: v7, v8, v9 (first 64-bit SPARC
ISA).
# Sub Version: v8plusa (v9 subset for 32-bit
operating systems)
#
# Some Implementations
# UltraSPARC III Cu Implements v9, (On newer
Suns.)
# UltraSPARC II Implements v9, (On
sol.)
#:Example:
#
# Switches for Sun Workshop 6
Compiler.
#
#
-xarch Specifies
ISA
#
-xchip
Specifies implementation.
#
-xcache
Specifies cache.
# -xtarget Specifies system.
ISA/implementation and cache.
#
# -xarch
Values:
#
v7, v8a, v8, v8plus (v8 with 32-bit v9), v8plusa (v8 with 32 bit v9
etc.)
#
-xchip=c
#
c must be one of: generic, generic64, native,
native64,
#
old, super, super2, micro, micro2, hyper,
hyper2,
#
powerup, ultra, ultra2, ultra2i, 386, 486,
pentium,
#
pentium_pro, 603, 604.
## ASSUMPTIONS About the
Program
# Compilers must generate correct
code.
#
# That
is, the code must execute in the way specified by the
#
high-level language definition.
#
# Correct code can be slow.
#
#
# Some options tell the compiler to make assumptions
about the program.
#
# These
assumptions would not hold for every program.
#
# The
compiled program runs faster ...
# ... and correctly if the assumptions are
valid.
# Some switches specifying
assumptions:
#gcc, Assume the program does not require strict IEEE
754 FP features.
# `-ffast-math'
# This option allows GCC
to violate some ANSI or IEEE rules and/or
# specifications in the
interest of optimizing code for speed.
For
#
example, it allows the compiler to assume arguments to the
`sqrt'
# function are
non-negative numbers and that no floating-point values
# are
NaNs.
# cc (Sun Workshop 6 Compiler) Assume certain
pointers do not overlap.
#
#
-xrestrict=f
#
(SPARC) Treats pointer-valued function parameters
as
#
restricted pointers. f is a comma-separated list
that
#
consists of one or more function parameters,
%all,
#
%none. This command-line
option can be used on its
# own,
but is best used with optimization of -xO3 or
#
greater.
#
#
The default is %none. Specifying -xrestrict is
#
equivalent to specifying -xrestrict=%all.
#
#
## PROFILING
# :Def: Profiling
# A compilation technique in which data taken from a
sample run is used
# to guide compiler
decisions.
# Branches occur frequently in code. There is a performance
penalty
# in taking a branch and so it's best if the compiler
organizes code
# (rearranges the intermediate representation) so
that branches are
# not taken as much as possible. To do that the compiler needs
to
# know how often an "if" or other condition (for
which the branch is
# emitted) is true. Only in a few cases can the compiler
figure that
# out on its own, because, for example, "if"
conditions depend on
# input data.
To obtain this useful information a two-step
# compilation process called profiling is used. In the first step
the
# code is compiled so that it writes branch
information (more
# precisely basic block, covered later) to a
file. The program is
run
# with typical input data, called the training input,
and it writes
# the branch information to a file. In the second step the
compiler
# reads the information and uses that to better
organize the code.
# Sun Profiling Compiler
Switches
#
#
-xprofile=p
#
Collects data for a profile or use a profile to
optim-
#
ize.
#
#
p must be collect[:name], use[:name], or tcov.
#
#
This option causes execution frequency data to be
col-
#
lected and saved during execution, then the data can
be
#
used in subsequent runs to improve performance.
This
#
option is only valid when a level of optimization
is
#
specified.
## Use of Compiler Switches
# ISA
# Type of system all uses have. (IA-32 for PCs, SPARC
for Sun users, etc.)
# Users can't normally run code compiled for a
different ISA.
# Implementation
# Type
of system most users have.
# Other
users can run code, but won't run as fast.
# Optimization
#
Select medium or high optimization level.
# If
market very sensitive to performance, use specific
optimizations.