## LSU EE 3755 -- Oct 18, Fall 2004 -- Computer Organization

#

## MIPS Notes 5 -- More Procedure Code and Compiler

 

 

# Procedure swap
#
# 1. Allocate registers to program variables.
# 2. Produce code for the body of the procedure.
# 3. Preserve registers across the procedure invocation.

 

#    swap(int v[], int k)
#    {
#        int temp;
#        temp = v[k];
#        v[k] = v[k+1];
#        v[k+1] = temp;
#    }

 

# Swap
# Register Usage
#
#    $a0 : Address of array element v[0].
#    $a1 : Index k.
#    $t1 : k * 4, because each element of array v is an integer, which is 4 bytes long.
#    $t0 : v[k], for swapping.
#    $t2 : v[k+1], for swapping.

 

swap:
        add  $t1, $a1, $a1   # $t1 = k * 2
        add  $t1, $t1, $t1   # $t1 = k * 4
        add  $t1, $a0, $t1   # $t1 = v + (k * 4)
                             # $t1 has the address of v[k]

        lw   $t0, 0($t1)     # $t0 (temp) = v[k]
                             # $t0 has the value of v[k]
        lw   $t2, 4($t1)     # $t2 = v[k+1]
                             # $t2 has the value of v[k+1]

        sw   $t2, 0($t1)     # store the value of v[k+1] at the address of v[k]
        sw   $t0, 4($t1)     # store the value of v[k] at the address of v[k+1]
                             # now swapped
        jr   $ra             # return to calling routine

 

# Another way of doing this:
        # add $t1, $a1, $zero   # only these two lines are different
        # sll $t1, $t1, 2       # ... $t1 = k * 4
        # add $t1, $a0, $t1     # the remaining lines are the same

        # lw $t0, 0($t1)        # $t0 (temp) = v[k]
        # lw $t2, 4($t1)        # $t2 = v[k+1]
        # sw $t2, 0($t1)        # store v[k+1] at the address of v[k]
        # sw $t0, 4($t1)        # store v[k] at the address of v[k+1]
        # jr $ra                # return to calling routine

 

 

        # Changing that to one instruction:
        # sll $a1, $a1, 2       # now we use only one instruction
                                # instead of two; the code becomes shorter.
                                # Is this better than the version above?
        # add $t1, $a0, $a1     # $t1 has the address of v[k]
                                # only the above two lines are different;
                                # the remaining lines are the same

        # lw $t0, 0($t1)        # $t0 (temp) = v[k]
        # lw $t2, 4($t1)        # $t2 = v[k+1]
        # sw $t2, 0($t1)        # store v[k+1] at the address of v[k]
        # sw $t0, 4($t1)        # store v[k] at the address of v[k+1]
        # jr $ra                # return to calling routine

 

 

### This is why:
#
#    main() {                  // incomplete code
#        for (k = 1; k < 10; k++) {
#            swap(v, k);
#        }
#    }

 


 

 

# Procedure sort
#
# sort(int v[], int n)
# {
#     int i, j;
#     for (i = 0; i < n; i = i + 1) {
#         for (j = i - 1; j >= 0 && v[j] > v[j+1]; j = j - 1) {
#             swap(v, j);
#         }
#     }
# }

## A more primitive way of sorting is shown below
## (slower than the one above):
##
## sort(int v[], int n)
## {
##     int i, j;
##     for (i = 0; i < n; i = i + 1) {
##         for (j = 0; j < n-1; j = j + 1) {
##             if (v[j] > v[j+1]) swap(v, j);
##         }
##     }
## }

# MIPS code for the first sort program
# Register usage:
#    $a0 : Address of array element v[0].
#    $a1 : Array size n.
#    $s0 : Index i for the outer for loop.
#    $s1 : Index j for the inner for loop.
#    $s2 : Copy of $a0 (address of v), preserved across calls to swap.
#    $s3 : Copy of $a1 (n), preserved across calls to swap.

 

# Saving registers
#
sort:
        addi $sp, $sp, -20    # make room on stack for 5 registers
        sw   $ra, 16($sp)     # save $ra on stack
        sw   $s3, 12($sp)     # save $s3 on stack
        sw   $s2,  8($sp)     # save $s2 on stack
        sw   $s1,  4($sp)     # save $s1 on stack
        sw   $s0,  0($sp)     # save $s0 on stack
        # We save the $s registers because a procedure call
        # must not modify them.
        # We save $ra because we call the swap procedure,
        # and jal will modify $ra at that time.

# Procedure body
        move $s2, $a0         # copy the array address v ($a0) to $s2.
                              # (This is a pseudoinstruction. How do you do this?)
        move $s3, $a1         # save the array size n ($a1) to $s3.
        move $s0, $zero       # index i for the outer loop (i = 0)
for1tst:
        slt  $t0, $s0, $s3    # reg $t0 = 0 if $s0 >= $s3 (i >= n)
        beq  $t0, $zero, exit1  # outer for loop condition check
        addi $s1, $s0, -1     # set j = i - 1 for the inner for loop
for2tst:
        slti $t0, $s1, 0      # reg $t0 = 1 if $s1 < 0 (j < 0)
        bne  $t0, $zero, exit2
        add  $t1, $s1, $s1    # reg $t1 = j * 2
        add  $t1, $t1, $t1    # reg $t1 = j * 4
        add  $t2, $s2, $t1    # reg $t2 = v + (j * 4), the address of v[j]
        lw   $t3, 0($t2)      # reg $t3 = v[j]
        lw   $t4, 4($t2)      # reg $t4 = v[j+1]
        slt  $t0, $t4, $t3    # reg $t0 = 0 if v[j+1] >= v[j]
        beq  $t0, $zero, exit2
        move $a0, $s2         # first parameter of swap is v
        move $a1, $s1         # second parameter of swap is j
        jal  swap             # $ra is modified by jal; swap returns to
                              # the instruction after jal.
        addi $s1, $s1, -1     # j = j - 1
        j    for2tst          # repeat inner loop test
exit2:
        addi $s0, $s0, 1      # i = i + 1
        j    for1tst          # repeat outer loop test

        # Now we restore the saved $s registers and $ra
        # and adjust the stack pointer.
exit1:
        lw   $s0,  0($sp)
        lw   $s1,  4($sp)
        lw   $s2,  8($sp)
        lw   $s3, 12($sp)
        lw   $ra, 16($sp)
        addi $sp, $sp, 20
        jr   $ra              # return to calling procedure

 

# Compiler

 

 

# Contents

#

# Optimization Introduction

# Steps in Preparing a Program

# Compiler Definitions

# High-Level Optimizations

# Low-Level Optimizations

# Compiler Optimization Options

# Use of Compiler Switches

 

# Lecture Goals
#
# Understand the Program Building and Compilation Process
#    Describe steps in program building and optimization, including
#    intermediate files (assembly, object, ...) and tool names (preprocessor,
#    compiler, etc.).
#
# Understand Specific Optimizations and Assumption Switches
#    Describe, work an example, explain the benefit.
#
# Understand Profiling
#    Steps, and how performance is improved.
#
# Understand ISA and Implementation Options
#    How the programmer chooses them, how the compiler uses them.
#
# Understand How Programmers Use Compilation Options
#    Normal program development, high-performance programs, SPEC disclosure.

 

###################

# Optimization Introduction

 

 

 

# :Def: Optimization
#    The optional steps in compiling a program that reduce the program's
#    execution time, reduce the size of the program, or both.

# Typically, the only time optimization is NOT done is when a program
# is being debugged.
#
# In most cases, the programmer sets the overall optimization effort (low,
# medium, high).
#
# When performance is very important the programmer can specify which
# specific optimizations to use.

 

# :Example:
#
# Data on a program that computes pi, with and without optimization.
#
# System:   SunOS sol 5.6 Generic_105181-31 sun4u sparc SUNW,Ultra-Enterprise
# Compiler: Sun WorkShop 6 2000/04/07 C 5.1
# Clock Frequency: 300 MHz
#
# Without Optimization
#    Size              : 6408 bytes
#    Run Time          : 3.00 s
#    Instruction Count : 325,338,749
#    CPI               : (/ (* 3.00 300e6) 325338749.0) = 2.7663
#
# With Optimization
#    Size              : 6340 bytes    Smaller! (OK, only a tiny bit smaller.)
#    Run Time          : 1.65 s        Faster!
#    Instruction Count : 100,338,751
#    CPI               : (/ (* 1.65 300e6) 100338751.0) = 4.9333

 

# Reasons *not* to Optimize
#
# The percentage shows roughly how often each reason applies.
#
# 95%  It makes debugging difficult, so don't optimize while debugging.
#      This is true for almost everyone who uses a debugger.
#
# 10%  It slows down compilation.
#      Only important when there is a very large amount of code to
#      recompile. (Back in the 20th century, when computers were slow,
#      this was important.)
#
#  1%  Optimization introduces bugs.
#      It does, but very rarely.

 

 

##############################

# Steps in Building a Program

 

# Typical Steps in Building a Program
#
#    Pre-Process, Compile, Assemble, Link, Load
#    pi.c -> pi.i -> pi.s -> pi.o -> pi
#
# These steps can all be performed automatically by the compiler
# driver. (cc, gcc, MS Visual Studio, etc.)
#
# :Sample: cc pi.c -o pi
#
# They can also be invoked individually.
#
# More details appear below; the compile step is what we're interested in.

 

# Pre-Process
#    Load include files, expand macros.
#    Typical pre-processor name: cpp
#    Input File:  pi.c
#    Output File: pi.i  (high-level language)

# Compile
#    Convert high-level language into assembly language.
#    Typical compiler name: cc1 (not cc, or gcc, or Visual Studio)
#    Input File:  pi.i
#    Output File: pi.s  (assembly language)

# Assemble
#    Convert assembly language into machine language.
#    Typical assembler name: as
#    Input File:  pi.s
#    Output File: pi.o  (object file containing machine code)

# Link
#    Combine object files, libraries, and other code into an executable.
#    Typical linker name: ld
#    Input File:  pi.o
#    Output File: pi

# Load
#    Copy the executable into memory, link with shared libraries, and start it.
#    Loader name: the exec system call.

 

 

#######################################

# Compiler Definitions

 

# A program is compiled in a series of steps, or /passes/.
#
#    The first pass reads the pre-processed high-level source code.
#
#    The last pass produces assembly code.
#
#    Between the first and last passes the program is in some
#    intermediate representation.
#
#    The way passes are defined and organized varies greatly from
#    compiler to compiler.

 

# :Def: Pass
#    A step in compiling a program. A pass usually looks at an entire function.

# :Def: Intermediate Representation
#    The form of a program (sort of a special-purpose language) used internally
#    by a compiler. A compiler might use several intermediate representations.

 

# Typical Passes
#
# Parse
#    Convert the source code to a high-level intermediate representation (H-IR).
#
# High-Level Optimization  (Optional, may be done in several passes.)
#    Also called front-end optimization.
#    Modify the H-IR to improve performance or reduce code size.
#    Reads and writes H-IR.
#
# Low-Level Intermediate Representation (L-IR) Generation
#    Convert the H-IR to a low-level intermediate representation (L-IR).
#
# Low-Level Optimization  (Optional, may be done in several passes.)
#    Also called back-end optimization.
#    Modify the L-IR to improve performance or reduce code size.
#
# Register Assignment  (Part of low-level optimization.)
#    Choose machine registers.
#
# Code Generation
#    Convert the L-IR to assembly code.
#
# Peephole Optimizations  (Optional, may be done in several passes.)
#    These are also called low-level optimizations.
#    Modify the generated code to improve performance or reduce code size.
#    Some of these can be done at link time.

 

# :Def: Compiler Front End
#    The parts of the compiler that do the parsing and high-level
#    optimization passes.

# :Def: Compiler Back End
#    The parts of the compiler that do the low-level optimization,
#    register assignment, and code generation passes.

 

 

##############################

# High-Level Optimizations

 

# Easy high-level optimizations are presented here.

# Some Easy-To-Explain Front-End Optimizations
#
# Dead-Code Elimination (DCE)
# Common Subexpression Elimination (CSE)
# Constant Propagation, Folding

# Dead-Code Elimination (DCE)
#    Removal of code which isn't used.
#    Yes, it happens.

 

# :Example:
#
# Code benefiting from DCE.
# High-level code is shown for clarity. Most compilers transform
# an intermediate representation instead.
#
# Before:
#
#include <stdio.h>

int
main(int argc, char **argv)
{
  double i;
  double sum = 0;

  for (i = 1; i < 50000000;)
    {
      sum = sum + 4.0 / i;  i += 2;
      sum = sum - 4.0 / i;  i += 2;
    }

  printf("Performed %.0f iterations. Thank you for running me.\n", i);
  return 0;
}

#
# After:
#
#include <stdio.h>

int
main(int argc, char **argv)
{
  double i;

  for (i = 1; i < 50000000;)
    {
      i += 2;
      i += 2;
    }

  printf("Performed %.0f iterations. Thank you for running me.\n", i);
  return 0;
}

 

#

# Note: Other optimizations would leave only the printf.

 

# Common Subexpression Elimination
#    Remove duplicated computation.

#

# Before:

r = ( a + b ) / ( x + y );

s = ( a + b ) / ( x - y );

# After:

temp = a + b;

r = ( temp ) / ( x + y );

s = ( temp ) / ( x - y );

 

# Constant Propagation, Folding
#    The compiler performs whatever arithmetic it can at compile time
#    rather than emitting code to perform the arithmetic at run time.

#

# Before:

int sec_per_day = 60 * 60 * 24;

int sec_per_year = sec_per_day * 365;

some_routine(sec_per_day * x, sec_per_year * y);

# After:

int sec_per_day = 86400;

int sec_per_year = 31536000;

some_routine(86400 * x, 31536000 * y);

 

 

#####################################

# Low-Level Optimizations

 

# Some Low-Level Optimizations

#

# This is not a complete list.

#

# Register Assignment

# Instruction Selection

# Scheduling

 

# :Def: Register Assignment
#    Selection of which values will be held in registers. Values
#    not held in registers are stored in memory.
#
# Without Register Assignment Optimizations
#    All values corresponding to variables (in the high-level program) are
#    written to memory (and not held in registers). Intermediate results are
#    held in registers. (Accessing memory takes longer than accessing registers.)
#
# With Register Assignment Optimization
#    Registers are assigned to as many variables as possible, with priority
#    given to frequently used variables.
#
# Advantage of Register Assignment Optimization
#    Fewer memory writes and reads. (Faster.)

 

# :Def: Scheduling
#    Re-arranging instructions to minimize the amount of time one instruction
#    has to wait for another.
#
#    For example, if an instruction takes a long time it will be started early
#    so that other instructions will not have to wait for its result.

 

 

######################################

## Compiler Optimization Options

 

 

 

# Optimization options tell the compiler:
#
#    How much EFFORT to put into optimization.  E.g., -O2 (do level-2 optimization).
#    Which PARTICULAR optimizations to perform.  E.g., -fno-strength-reduce
#      (a loop optimization).
#    The TARGET system the code will be running on.  E.g., -xultra2 (an Ultra 2 machine).
#    Whether to make certain ASSUMPTIONS about the code.
#    Whether to use PROFILING data from a training run.

 

 

## Optimization EFFORT
#
# Most compilers have optimization levels.
#
# The higher the number, the more optimizations are done.

 

# :Example:
#
# Optimization Levels for the Sun Workshop 6 Compiler

#    -xO[1|2|3|4|5]
#         Optimizes the object code. Note the upper-case letter O.

#         The levels (1, 2, 3, 4, or 5) you can use differ
#         according to the platform you are using.

 

#              (SPARC)

#              -xO1  Does basic local optimization (peephole).
#                    Peephole: the compiler looks at only a small part of
#                    the code and optimizes just that part. Looking at a
#                    larger part of the code might produce better code, but
#                    that kind of processing takes more time and is more
#                    complicated.

 

#              -xO2  Does basic local and global optimization.
#                    This is induction variable elimination, local
#                    and global common subexpression elimination,
#                    algebraic simplification, copy propagation,
#                    constant propagation, loop-invariant optimization,
#                    register allocation, basic block merging,
#                    tail recursion elimination, dead code elimination,
#                    tail call elimination, and complex expression
#                    expansion.
#                    (Any compiler book will explain these.)

 

#                    The -xO2 level does not assign global, external,
#                    or indirect references or definitions to
#                    registers. It treats these references and
#                    definitions as if they were declared "volatile."
#                    In general, the -xO2 level results in minimum
#                    code size.

 

#              -xO3  Performs like -xO2 but also optimizes references
#                    or definitions for external variables.
#                    Loop unrolling and software pipelining are
#                    also performed. The -xO3 level does not trace
#                    the effects of pointer assignments. When
#                    compiling either device drivers, or programs
#                    that modify external variables from within
#                    signal handlers, you may need to use the
#                    volatile type qualifier to protect the object
#                    from optimization. In general, the -xO3
#                    level results in increased code size.
#
#                    Loop unrolling: saves time at branch instructions.
#                        for (i = 0; i < 3; ++i)  s = s + i;
#                    becomes:
#                        s = s + 0;
#                        s = s + 1;
#                        s = s + 2;
#                    Longer code, but no conditional branch.

 

#              -xO4  Performs like -xO3 but also does automatic
#                    inlining of functions contained in the same
#                    file; this usually improves execution speed.
#                    The -xO4 level does trace the effects of
#                    pointer assignments. In general, the -xO4
#                    level results in increased code size.

#                    If you want to control which functions are
#                    inlined, see -xinline.

 

#              -xO5  Generates the highest level of optimization.
#                    Uses optimization algorithms that take more
#                    compilation time or that do not have as high
#                    a certainty of improving execution time.
#                    Optimization at this level is more likely to
#                    improve performance if it is done with
#                    profile feedback.

 

## PARTICULAR Optimizations
#
# The levels specify sets of optimizations (like ordering the "Sport Package"
# for a new car).
#
# In contrast to optimization levels (-O3), the compiler can be told which
# particular optimizations to perform.
#
# These are typically used by skilled programmers trying to get the
# fastest code.
#
# Sun Workshop 6
#
# Prefetch instructions read memory in advance, eliminating some cache
# misses. (A cache is an expensive but high-speed memory.)

 

# Prefetch instructions can also increase
# cache misses or increase the time to access memory, hurting
# performance. The compiler does not know whether they will help or hurt.
#
#    -xprefetch[=val],val
#         (SPARC) Enable prefetch instructions on those architectures
#         that support prefetch, such as UltraSPARC II.
#         (-xarch=v8plus, v9plusa, v9, or v9a)

#         Explicit prefetching should only be used under special
#         circumstances that are supported by measurements.

 

 

## TARGET
#
# Specify the exact type of machine the code will be run on.
#
# What might be specified:
#
# ISA
#    The exact instruction set used.
#    Specifies not just the ISA family, but a particular variation.
#    A poor choice will limit the number of machines the code can run on.
#
# Implementation
#    Specify the implementation the code will run on.
#    A poor choice will result in slower code.
#
# Cache
#    Specify the configuration of the cache.
#    A poor choice will result in slower code.

 

## Background
#
# Sun SPARC Architecture
#
#    Developed by Sun.
#
#    Major Versions: v7, v8, v9 (first 64-bit SPARC ISA).
#    Sub Version: v8plusa (v9 subset for 32-bit operating systems)
#
#    Some Implementations
#       UltraSPARC III Cu   Implements v9. (On newer Suns.)
#       UltraSPARC II       Implements v9. (On sol.)

 

 

# :Example:
#
# Switches for the Sun Workshop 6 Compiler.
#
#    -xarch    Specifies ISA.
#    -xchip    Specifies implementation.
#    -xcache   Specifies cache.
#    -xtarget  Specifies system: ISA, implementation, and cache.
#
#    -xarch values:
#       v7, v8a, v8, v8plus (v8 with 32-bit v9), v8plusa (v8 with 32-bit v9, etc.)
#
#    -xchip=c
#       c must be one of: generic, generic64, native, native64,
#       old, super, super2, micro, micro2, hyper, hyper2,
#       powerup, ultra, ultra2, ultra2i, 386, 486, pentium,
#       pentium_pro, 603, 604.

 

## ASSUMPTIONS About the Program

# Compilers must generate correct code.
#
#    That is, the code must execute in the way specified by the
#    high-level language definition.
#
#    Correct code can be slow.
#
# Some options tell the compiler to make assumptions about the program.
#
#    These assumptions would not hold for every program.
#
#    The compiled program runs faster ...
#    ... and correctly, if the assumptions are valid.

# Some switches specifying assumptions:

# gcc: Assume the program does not require strict IEEE 754 FP features.
# `-ffast-math'
#    This option allows GCC to violate some ANSI or IEEE rules and/or
#    specifications in the interest of optimizing code for speed. For
#    example, it allows the compiler to assume arguments to the `sqrt'
#    function are non-negative numbers and that no floating-point values
#    are NaNs.

 

 

# cc (Sun Workshop 6 Compiler): Assume certain pointers do not overlap.
#
#    -xrestrict=f
#         (SPARC) Treats pointer-valued function parameters as
#         restricted pointers. f is a comma-separated list that
#         consists of one or more function parameters, %all, or
#         %none. This command-line option can be used on its
#         own, but is best used with optimization of -xO3 or
#         greater.
#
#         The default is %none. Specifying -xrestrict is
#         equivalent to specifying -xrestrict=%all.

 

## PROFILING

 

# :Def: Profiling

# A compilation technique in which data taken from a sample run is used

# to guide compiler decisions.

 

# Branches occur frequently in code. There is a performance penalty
# in taking a branch, so it is best if the compiler organizes code
# (rearranges the intermediate representation) so that branches are
# taken as rarely as possible. To do that the compiler needs to
# know how often an "if" or other condition (for which the branch is
# emitted) is true. Only in a few cases can the compiler figure that
# out on its own because, for example, "if" conditions depend on
# input data.
#
# To obtain this useful information a two-step compilation process
# called profiling is used. In the first step the code is compiled so
# that it writes branch information (more precisely, basic-block
# information, covered later) to a file. The program is run with
# typical input data, called the training input, and writes the branch
# information to that file. In the second step the compiler reads the
# information and uses it to better organize the code.

 

# Sun Profiling Compiler Switches
#
#    -xprofile=p
#         Collects data for a profile, or uses a profile to optimize.
#
#         p must be collect[:name], use[:name], or tcov.
#
#         This option causes execution frequency data to be collected
#         and saved during execution; the data can then be used in
#         subsequent runs to improve performance. This option is only
#         valid when a level of optimization is specified.

 

 

## Use of Compiler Switches

# ISA
#    The type of system all users have. (IA-32 for PCs, SPARC for Sun users, etc.)
#    Users can't normally run code compiled for a different ISA.

# Implementation
#    The type of system most users have.
#    Other users can run the code, but it won't run as fast.

# Optimization
#    Select a medium or high optimization level.
#    If the market is very sensitive to performance, use specific optimizations.

 

 

 

 

 

 

 
