## LSU EE 3755 -- Fall 2003 -- Computer Organization
#
## MIPS Notes 5 -- More Procedure Code and Compiler
# Procedure swap
#
# Compiling a procedure takes three steps:
#   1. Allocate registers to program variables.
#   2. Produce code for the body of the procedure.
#   3. Preserve registers across the procedure invocation.
#
# swap(int v[], int k)
# {
#   int temp;
#   temp = v[k];
#   v[k] = v[k+1];
#   v[k+1] = temp;
# }
# Swap -- Register Usage
#
# $a0 : address of array element v[0]
# $a1 : index k
# $t1 : k * 4 (each array element is a 4-byte integer),
#       then the address of v[k]
# $t0 : temp = v[k]  ; for swapping
# $t2 : v[k+1]       ; for swapping
#-----------------------------------------------------------------------
# swap(int v[], int k) -- exchange v[k] and v[k+1]
# In:    $a0 = address of v[0], $a1 = index k
# Uses:  $t0 = temp (v[k]), $t1 = address of v[k], $t2 = v[k+1]
# Note:  leaf procedure -- uses only $t registers, so no stack frame
#        is needed and nothing has to be saved.
#-----------------------------------------------------------------------
swap:
        add     $t1, $a1, $a1       # $t1 = k * 2
        add     $t1, $t1, $t1       # $t1 = k * 4 (elements are 4-byte ints)
        add     $t1, $a0, $t1       # $t1 = v + k*4 = address of v[k]
        lw      $t0, 0($t1)         # $t0 (temp) = v[k]
        lw      $t2, 4($t1)         # $t2 = v[k+1]
        sw      $t2, 0($t1)         # v[k]   = v[k+1]
        sw      $t0, 4($t1)         # v[k+1] = temp -- swapped
        jr      $ra                 # return to calling routine
# Another way of doing this -- compute k*4 with a shift.
# Only the first two instructions differ from the version above.
#
#        add  $t1, $a1, $zero   # $t1 = k
#        sll  $t1, $t1, 2       # $t1 = k * 4
#        add  $t1, $a0, $t1     # $t1 = address of v[k]
#        lw   $t0, 0($t1)       # $t0 (temp) = v[k]
#        lw   $t2, 4($t1)       # $t2 = v[k+1]
#        sw   $t2, 0($t1)       # v[k] = v[k+1]
#        sw   $t0, 4($t1)       # v[k+1] = temp -- swapped
#        jr   $ra               # return to calling routine
# Changing the first two instructions to one:
#
#        sll  $a1, $a1, 2       # $a1 = k * 4 -- one instruction instead
#                               # of two, so the code becomes shorter.
#                               # Is this better than the version above?
#                               # (Note: this overwrites argument
#                               # register $a1.)
#        add  $t1, $a0, $a1     # $t1 = address of v[k]
#        # Only the two instructions above differ; the rest are the same.
#        lw   $t0, 0($t1)       # $t0 (temp) = v[k]
#        lw   $t2, 4($t1)       # $t2 = v[k+1]
#        sw   $t2, 0($t1)       # v[k] = v[k+1]
#        sw   $t0, 4($t1)       # v[k+1] = temp -- swapped
#        jr   $ra               # return to calling routine
### This is why swap is written as a procedure:
# main()
# {                       // incomplete code
#   for (k = 1; k < 10; k++) {
#     swap(v, k);
#   }
# }
##
# Procedure sort
#
# sort(int v[], int n)
# {
#   int i, j;
#   for (i = 0; i < n; i = i + 1) {
#     for (j = i - 1; j >= 0 && v[j] > v[j+1]; j = j - 1) {
#       swap(v, j);
#     }
#   }
# }
## A more primitive way of sorting is shown below
## (slower than the one above):
##
## sort(int v[], int n)
## {
##   int i, j;
##   for (i = 0; i < n; i = i + 1) {
##     for (j = 0; j < n; j = j + 1) {    // NOTE(review): j < n-1 is
##       if (v[j] > v[j+1]) swap(v, j);   // needed; j < n reads v[n]
##     }                                  // past the end of the array.
##   }
## }
# MIPS code for the first sort program
#
# Register usage:
#   $a0 : address of array element v[0] (argument)
#   $a1 : array size n (argument)
#   $s0 : index i for outer for loop
#   $s1 : index j for inner for loop
#   $s2 : saved copy of v (the base address $a0)
#   $s3 : saved copy of n
#
# Saving registers:
#
#-----------------------------------------------------------------------
# sort(int v[], int n) -- sort v in ascending order using swap
# In:    $a0 = address of v[0], $a1 = n (number of elements)
# Regs:  $s0 = i, $s1 = j, $s2 = saved v, $s3 = saved n
# Calls swap, so $ra and the $s registers it uses are saved on the
# stack: a procedure call must not modify $s registers, and the jal
# to swap overwrites $ra.
# Fixes vs. original notes: `beg` corrected to `beq`, and the inner
# loop's `j = j - 1` / jump back to for2tst (missing in the notes)
# restored so the inner loop actually iterates.
#-----------------------------------------------------------------------
sort:
        addi    $sp, $sp, -20       # make room on stack for 5 registers
        sw      $ra, 16($sp)        # save $ra on stack
        sw      $s3, 12($sp)        # save $s3 on stack
        sw      $s2,  8($sp)        # save $s2 on stack
        sw      $s1,  4($sp)        # save $s1 on stack
        sw      $s0,  0($sp)        # save $s0 on stack

        # Procedure body
        move    $s2, $a0            # $s2 = v (move is a pseudo-instruction:
                                    # add $s2, $a0, $zero)
        move    $s3, $a1            # $s3 = array size n
        move    $s0, $zero          # i = 0

for1tst:
        slt     $t0, $s0, $s3       # $t0 = 0 if i >= n
        beq     $t0, $zero, exit1   # outer-loop condition: exit when i >= n
        addi    $s1, $s0, -1        # j = i - 1 for inner loop

for2tst:
        slti    $t0, $s1, 0         # $t0 = 1 if j < 0
        bne     $t0, $zero, exit2   # inner-loop condition: exit when j < 0
        add     $t1, $s1, $s1       # $t1 = j * 2
        add     $t1, $t1, $t1       # $t1 = j * 4
        add     $t2, $s2, $t1       # $t2 = v + j*4 = address of v[j]
        lw      $t3, 0($t2)         # $t3 = v[j]
        lw      $t4, 4($t2)         # $t4 = v[j+1]
        slt     $t0, $t4, $t3       # $t0 = 0 if v[j+1] >= v[j]
        beq     $t0, $zero, exit2   # already in order -- exit inner loop
        move    $a0, $s2            # 1st parameter of swap is v
        move    $a1, $s1            # 2nd parameter of swap is j
        jal     swap                # jal sets $ra to the next instruction
        addi    $s1, $s1, -1        # j = j - 1
        j       for2tst             # retest inner-loop condition

exit2:
        addi    $s0, $s0, 1         # i = i + 1
        j       for1tst

        # Restore the saved $s registers and $ra, pop the stack frame.
exit1:
        lw      $s0,  0($sp)
        lw      $s1,  4($sp)
        lw      $s2,  8($sp)
        lw      $s3, 12($sp)
        lw      $ra, 16($sp)
        addi    $sp, $sp, 20        # restore stack pointer
        jr      $ra                 # return to calling procedure
# Compiler
# Contents
#
# Optimization
Introduction
# Steps in Preparing a
Program
# Compiler Definitions
# High-Level
Optimizations
# Low-Level
Optimizations
# Compiler Optimization
Options
# Use of Compiler
Switches
#Lecture Goals
#
# Understand the
Program Building and Compilation Process
# Describe steps in program building and
optimization, including
# intermediate files (assembler, object, ...)
and tool names (preprocessor,
# compiler, etc.).
#
# Understand Specific
Optimizations and Assumption Switches
# Describe, work example, explain benefit.
#
# Understand Profiling
# Steps, how performance is improved.
#
# Understand ISA and
Implementation Options
# How programmer chooses them, how compiler
uses them.
#
# Understand how
Programmers Use Compilation Options
# Normal program development, high-performance
programs, SPEC disclosure.
###################
# Optimization
Introduction
# :Def: Optimization
# The optional steps in
compiling a program that reduce the program's
#execution time, reduce
the size of the program, or both.
# Typically, the only
time optimization is NOT done is when a program
# is being debugged.
#
# In most cases, the
programmer sets overall optimization effort (low,
#medium, high).
#
#When performance is
very important the programmer can specify which
# specific
optimizations to use.
# :Example:
#
# Data on a program
that computes pi with and without optimization.
#
# System: SunOS sol 5.6 Generic_105181-31 sun4u sparc
SUNW,Ultra-Enterprise
# Compiler: Sun
WorkShop 6 2000/04/07 C 5.1
#Clock Frequency: 300
MHz
#
# Without Optimization
# Size : 6408 bytes
# Run Time : 3.00 s
# Instruction Count : 325,338,749
# CPI : (/ (* 3.00 300e6) 325338749.0 ) = 2.7663
#
# With Optimization
#   Size              : 6340 bytes    -- Smaller! (only a tiny bit smaller)
#   Run Time          : 1.65 s        -- Faster!
#   Instruction Count : 100,338,751
#   CPI               : (/ (* 1.65 300e6) 100338751.0) = 4.9333
#
# Reasons *not* to
Optimize
#
# Percentage shows
roughly how often reason applies.
#
# 95% It makes debugging difficult, so don't
optimize while debugging.
# This is true for almost everyone that
uses a debugger.
#
#10% It slows down compilation.
# Only important when there is a very large
amount of code to
# recompile. (Back in the 20th century when computers were slow
# this was important.)
#
# 1%
Optimization introduces bugs.
# It does, but very rarely.
##############################
# Steps in Building a
Program
# Typical Steps in
Building a Program
#
# Pre-Process, Compile, Assemble, Link, Load
# pi.c -> pi.i -> pi.s ->
pi.o -> pi
#
# These steps can all
be automatically performed of the compiler
# driver. (cc, gcc, MS
Visual Studio, etc.)
#
#:Sample: cc pi.c -o pi
#
# They can also be
specified individually.
#
# More details appear
below, compile is what we're interested in.
# Pre-Process
# Load include files, expand macros.
# Typical pre-processor name: cpp
# Input File: pi.c
# Output File: pi.i (High-Level Language)
# Compile
# Convert high-level language into assembly
language.
# Typical compiler name: cc1 (not cc, or gcc,
or Visual Studio)
# Input File: pi.i
# Output File: pi.s (Assembly language)
# Assemble
# Convert assembly language in to machine
language.
# Typical assembler name: as
# Input File: pi.s
# Output File: pi.o (Object file containing machine code.)
# Link
# Combine object files, libraries, and other
code into executable.
# Typical linker name: ld
# Input File: pi.o
# Output File: pi
# Load
# Copy executable into memory, link with
shared libraries, and start.
# Loader name: exec system call.
#######################################
# Compiler Definitions
# A program is compiled using steps or /passes/.
#
# The first pass reads the pre-processed
high-level source code.
#
# The last pass produces assembly code.
#
# Between the first and last passes the
program is in some
# intermediate representation.
#
# The way passes are defined and organized
varies greatly from
# compiler to compiler.
# :Def: Pass
# A step in compiling a program. A pass usually looks at an entire function.
# :Def: Intermediate
Representation
# The form of a program
(sort of a special-purpose language) used internally
# by a compiler. A compiler might use several intermediate
representations.
# Typical Passes
#
# Parse
# Convert the source code to a high-level
intermediate representation (H-IR).
#
#High-Level
Optimization (Optional, may be done in several passes.)
# Also called front-end optimization.
# Modify H-IR to improve performance or
reduce code size.
# Reads and writes H-IR
#
# Low-Level Intermediate
Representation (L-IR) Generation
# Convert H-IR to a low-level intermediate
representation (L-IR).
#
# Low-Level
Optimization (Optional, may be done in
several passes.)
# Also called back-end optimization.
# Modify L-IR to improve performance or
reduce code size.
#
# Register Assignment
(Part of low-level optimization.)
# Choose machine registers.
#
# Code Generation
# Convert L-IR to assembly code.
#
# Pigeonhole
Optimizations (Optional, may be done in
several passes.)
# These are also called low-level
optimizations.
# Modify L-IR to improve performance or
reduce code size.
# Some of these can be done at link time.
#:Def: Compiler Front
End
# The parts of the
compiler that do the parsing and high-level
# optimization passes.
# :Def: Compiler Back
End
# The parts of the
compiler that do the low-level optimization,
#register assignment,
and code generation passes.
##############################
# High-Level
Optimizations
# Easy high-level
optimizations presented here.
#Some Easy-To-Explain
Front-End Optimizations
#
# Dead-Code Elimination
(DCE)
# Common Subexpression
Elimination (CSE)
# Constant Propagation,
Folding
# Dead-Code Elimination
(DCE)
# Removal of code which isn't used.
# Yes, it happens.
# :Example:
#
# Code benefiting from
DCE
# High-level code shown
for clarity. Most compilers will
transform
# an intermediate
representation.
#
# Before:
#
int
main(int argv,
char **argc)
{
double i;
double sum = 0;
for(i=1; i<50000000;)
{
sum = sum + 4.0 / i; i
+= 2;
sum = sum - 4.0 / i; i += 2;
}
printf("Performed %d iterations. Thank you for running me.\n",i);
}
#
# After:
#
int
main(int argv, char
**argc)
{
double i;
for(i=1; i<50000000;)
{
i += 2;
i += 2;
}
printf("Performed %d iterations. Thank you for running me.\n",i);
}
#
# Note: Other optimizations would leave only
the printf.
# Common Subexpression Elimination
# Remove duplicated code.
#
# Before:
r = ( a + b ) / ( x + y );
s = ( a + b ) / ( x - y );
# After:
temp = a + b;
r = ( temp ) / ( x + y );
s = ( temp ) / ( x - y );
# Constant Propagation,
Folding
# The compiler performs whatever arithmetic it
can at compile time
# rather than emitting code to perform the
arithmetic at run time.
#
# Before:
int sec_per_day = 60 * 60 * 24;
int sec_per_year = sec_per_day * 365;
some_routine(sec_per_day * x, sec_per_year *
y);
# After:
int sec_per_day = 86400;
int sec_per_year = 31536000;
some_routine(86400 * x, 31536000 * y);
#####################################
# Low-Level
Optimizations
# Some Low-Level
Optimizations
#
# This is not a
complete list.
#
# Register Assignment
# Instruction Selection
# Scheduling
# :Def: Register
Assignment
# Selection of which
values will be held in registers.
Values
#not held in registers
are stored in memory.
#
# Without Register
Assignment Optimizations
# All values
corresponding to variables (in high-level program) are
# written to memory
(and not held in registers).
Intermediate results are
# held in
registers.(accessing memory takes longer time than accessing regs.)
#
# With Register
Assignment Optimization
# Registers are
assigned to as many variables as possible, with priority
# given to frequently
used variables.
#
#Advantage of Register
Assignment Optimization
# Fewer memory writes
and reads.(faster)
# :Def: Scheduling
# Re-arranging
instructions to minimize the amount of time one instruction
# has to wait for another.
#
#For example, if an
instruction takes a long time it will be started early
# so that other
instructions will not have to wait for its result.
#
#
######################################
## Compiler
Optimization Options
# Optimization options
tell the compiler:
#
# How much EFFORT to put into
optimization. E.g., -O2(do level 2
optimization)
# Which PARTICULAR optimizations to
perform. E.g., -fno-strength-reduce
# (loop optimization)
# The TARGET system the code will be running
on. E.g., -xultra2 (ultra2 machine)
# Whether to make certain ASSUMPTIONS about
the code.
# Whether to use PROFILING data from a
training run.
##Optimization EFFORT
#
# Most compilers have
optimization levels.
#
# The higher the
number, the more optimizations done.
# :Example:
#
# Optimization Levels
for Sun Workshop 6 Compiler
# -xO [1|2|3|4|5]
# Optimizes the object code. Note the
upper-case letter
# O.
# The levels (1, 2, 3, 4, or 5) you
can use differ
# according to the platform you are
using.
# ( SPARC)
# -xO1 Does basic local
optimization (peephole).
#
peephole: compiler is looking only small part of code
#
and does optimization only for that part.
# if
compiler is looking larger part of code,
#
it may produce better code. but that kind of
#
process
takes more time and complicates.
# -xO2 Does basic local and
global optimization.
# This is induction variable
elimination, local
# and global common
subexpression elimination,
# algebraic simplification,
copy propagation,
# constant propagation,
loop-invariant optimi-
# zation, register
allocation, basic block
# merging, tail recursion
elimination, dead
# code elimination, tail
call elimination and
# complex expression
expansion.
# : Any compiler Book will
explain these.
# The -xO2 level does not
assign global, exter-
# nal, or indirect
references or definitions to
# registers. It treats these
references and
# definitions as if they
were declared "vola-
# tile." In general,
the -xO2 level results in
# minimum code size.
# -xO3 Performs like -xO2 but,
also optimizes refer-
# ences or definitions for
external variables.
# Loop unrolling and
software pipelining are
# also performed. The -xO3
level does not trace
# the effects of pointer
assignments. When com-
# piling either device
drivers, or programs
# that modify external
variables from within
# signal handlers, you may
need to use the
# volatile type qualifier
to protect the object
# from optimization. In general, the -xO3
# level results in increased
code size.
# : loop unrolling
: saving time at branch instruction.
#
for(i=0;i<3;++i) s = s+i;
# becomes :
s = s+0;
# s = s+1;
# s= s+
2;
# longer code
but no conditional branch.
# -xO4 Performs like -xO3 but,
also does automatic
# inlining of functions contained in the same
# file; this usually
improves execution speed.
# The -xO4 level does trace
the effects of
# pointer assignments. In general, the -xO4
# level results in increased code size.
# If you want to control
which functions are
# inlined, see -xinline .
# -xO5 Generates the highest
level of optimization.
# Uses optimization
algorithms that take more
# compilation time or that
do not have as high
# a certainty of improving
execution time.
# Optimization at this
level is more likely to
# improve performance if it
is done with pro-
# file feedback.
## PARTICULAR
Optimizations
#
# The levels specify
sets of optimizations (like ordering the "Sport Package"
# for a new car).
#
# In contrast to
optimization levels (-O3), the compiler can be told which
# particular
optimizations to make.
#
# These are typically
used by skilled programmers trying to get
# fastest code.
#
# Sun Workshop 6
#
# Prefetch instructions
read memory in advance, eliminating some cache
# misses .
# Cache is
expensive but a high speed memory.
#
#
# They can also
increase
# cache misses or
increase the time to access memory, hurting
# performance. The compiler does not know if they will help
or hurt.
#
# -xprefetch[=val],val
# (SPARC) Enable prefetch instructions
on those architec-
# tures that support prefetch, such as
UltraSPARC II.
# (-xarch=v8plus, v9plusa, v9, or v9a)
# Explicit prefetching should only be
used under special
# circumstances that are supported by
measurements.
## TARGET
#
# Specify exact type of
machine code will be run on.
#
# What might be
specified:
#
# ISA
# The exact instruction
set used.
# Specifies not just
the ISA family, but a particular variation.
# A poor choice will
limit the number of machines code can run on.
#
# Implementation
# Specify the
implementation code will run on.
# A poor choice will
result in slower code.
#
# Cache
#Specify configuration
of cache.
#A poor choice will
result in slower code.
## Background
#
# Sun SPARC
Architecture
#
# Developed by Sun
#
# Major Versions: v7, v8, v9 (first 64-bit SPARC ISA).
# Sub Version: v8plusa
(v9 subset for 32-bit operating systems)
#
# Some Implementations
# UltraSPARC III
Cu Implements v9, (On newer Suns.)
# UltraSPARC II Implements v9, (On sol.)
#:Example:
#
# Switches for Sun
Workshop 6 Compiler.
#
# -xarch
Specifies ISA
# -xchip
Specifies implementation.
# -xcache
Specifies cache.
# -xtarget Specifies system. ISA/implementation and cache.
#
# -xarch Values:
# v7, v8a, v8, v8plus (v8 with 32-bit v9),
v8plusa (v8 with 32 bit v9 etc.)
# -xchip=c
# c must be one of: generic,
generic64, native, native64,
# old, super, super2, micro, micro2,
hyper, hyper2,
# powerup, ultra, ultra2, ultra2i,
386, 486, pentium,
# pentium_pro, 603, 604.
## ASSUMPTIONS About
the Program
# Compilers must
generate correct code.
#
# That is, the code must execute in the way
specified by the
# high-level language definition.
#
# Correct code can be
slow.
#
#
# Some options tell the
compiler to make assumptions about the program.
#
# These assumptions would not hold for every
program.
#
# The compiled program runs faster ...
# ... and correctly if
the assumptions are valid.
# Some switches
specifying assumptions:
#gcc, Assume the
program does not require strict IEEE 754 FP features.
# `-ffast-math'
# This option allows GCC to violate some
ANSI or IEEE rules and/or
# specifications in the interest of
optimizing code for speed. For
# example, it allows the compiler to assume
arguments to the `sqrt'
# function are non-negative numbers and
that no floating-point values
# are NaNs.
# cc (Sun Workshop 6
Compiler) Assume certain pointers do not overlap.
#
# -xrestrict=f
# (SPARC) Treats pointer-valued
function parameters as
# restricted pointers. f is a
comma-separated list that
# consists of one or more function
parameters, %all,
# %none. This command-line option can be used on its
# own, but is best used with optimization of -xO3 or
# greater.
#
# The default is %none. Specifying
-xrestrict is
# equivalent to specifying
-xrestrict=%all.
#
#
## PROFILING
# :Def: Profiling
# A compilation
technique in which data taken from a sample run is used
# to guide compiler
decisions.
# Branches occur
frequently in code. There is a
performance penalty
# in taking a branch
and so it's best if the compiler organizes code
# (rearranges the
intermediate representation) so that branches are
# not taken as much as
possible. To do that the compiler needs
to
# know how often an
"if" or other condition (for which the branch is
# emitted) is
true. Only in a few cases can the compiler
figure that
# out on its own,
because, for example, "if" conditions depend on
# input data. To obtain this useful information a two-step
# compilation process
called profiling is used. In the first
step the
# code is compiled so
that it writes branch information (more
# precisely basic
block, covered later) to a file. The
program is run
# with typical input
data, called the training input, and it writes
# the branch
information to a file. In the second
step the compiler
# reads the information
and uses that to better organize the code.
# Sun Profiling
Compiler Switches
#
# -xprofile=p
# Collects data for a profile or use a
profile to optim-
# ize.
#
# p must be collect[:name],
use[:name], or tcov.
#
# This option causes execution
frequency data to be col-
# lected and saved during execution,
then the data can be
# used in subsequent runs to improve
performance. This
# option is only valid when a level of
optimization is
# specified.
## Use of Compiler
Switches
# ISA
# Type of system all
uses have. (IA-32 for PCs, SPARC for Sun users, etc.)
# Users can't normally
run code compiled for a different ISA.
# Implementation
# Type of system most users have.
# Other users can run code, but won't run as
fast.
# Optimization
# Select medium or high optimization level.
# If market very sensitive to performance, use
specific optimizations.