## LSU EE 3755 -- Fall 2009-- Computer Organization

#

## MIPS  Notes 5 -- More Procedure Code and Compiler

 

 

#procedure swap

#1. registers to program variables.

#2. Produce code for the body of the procedure

#3. Preserve registers across the procedure invocation

 

#    swap(int v[ ], int k)

#   {

#         int temp;

#         temp = v[k];

#        v[k] = v[k+1];

#        v[k+1] = temp;

#    }

 

#  Swap

#  Register Usage

#

#    $a0 : Address of array element v[0].

#    $a1: index k.

#    $t1 : k * 4 ; because array v element is integer which is 4byte long.

#    $t1: v[k]  ;      for swapping

#    $t2: v[k+1] ;  for swapping

 

swap:
# void swap(int v[], int k) -- exchange v[k] and v[k+1].
# In:   $a0 = v (base address of int array), $a1 = k (element index)
# Tmps: $t0 = temp (old v[k]), $t1 = address of v[k], $t2 = old v[k+1]
# Leaf procedure: only $t/$a registers are used, so no stack frame is needed
# and $ra is left untouched.
           add $t1, $a1, $a1   #  $t1 = k * 2
           add $t1, $t1,  $t1    #  $t1 = k * 4 (each int element is 4 bytes)
           add $t1, $a0, $t1   #   $t1 = v + k*4 = address of v[k]
           lw $t0,0($t1)            # $t0 (temp) = v[k]
           lw $t2,4($t1)            #  $t2 = v[k+1] (4 bytes past v[k])
           sw $t2,0($t1)          # v[k]   = old v[k+1]
           sw $t0,4($t1)          # v[k+1] = old v[k] (temp) -- swap complete
           jr  $ra                       # return to calling routine

 

 # other way of doing this

           # add $t1, $a1, $zero     //only these two lines are different

           # sll    $t1, $t1, 2              //  $t1 = k * 4  (sll takes rd, rt, shamt)

           # add  $t1, $a0, $t1        // remaining are same

          

                                             

          # lw $t0,0($t1)            # $t0 (temp) = v[k] 

                                              # $t0 has the value of v[k]

          # lw $t2,4($t1)            #  $t2 = v[k+1]

                                              # $t2 has the value of v[k+1]

                                            

          # sw $t2,0($t1)          # save the value of v[k+1] to address of v[k]

          # sw $t0,4($t1)          # save the value of v[k] to address of v[k+1]

                                             # so swapped

          # jr  $ra                       # return to calling routine

 

 

       # #  changing that  to  one instruction

            #  sll $a1, $a1, 2         // now we are using only one instruction

                                              // instead of two instructions.

                                              // code becomes short.

                                              // is this better than above?

           # add  $t1, $a0, $a1  //    #  $t1 has the address of v[k]

                                                // only above two are different

                                                 //    remaining are same

         

                                             

          # lw $t0,0($t1)            # $t0 (temp) = v[k] 

                                              # $t0 has the value of v[k]

          # lw $t2,4($t1)            #  $t2 = v[k+1]

                                              # $t2 has the value of v[k+1]

                                            

          # sw $t2,0($t1)          # save the value of v[k+1] to address of v[k]

          # sw $t0,4($t1)          # save the value of v[k] to address of v[k+1]

                                             # so swapped

          # jr  $ra                       # return to calling routine

 

 

###  This is Why..

#       main()  //  incomplete code.

#       for (k = 1; k < 10; k++)  {

#               swap(v[], k);

#       }

#

 

##

 

 

# procedure sort

# sort (int v[], int n)

# {

#         int i, j;

#               for(i = 0; i < n; i = i +1) {

#                     for (j = i - 1; j >= 0 && v[j] > v[j+1]; j = j - 1) { swap(v,j); }

#          }

#     }

#

## primitive way of sorting is shown below

##(slower than above one)

## sort (int v[], int n)

## {

##         int i, j;

##               for(i = 0; i < n; i = i +1) {

##                     for (j = 0; j <n; j = j+1) {

##                             if(v[j] > v[j+1])  swap(v,j);

##                      }

##         }

##    }

##

 

#   MIPS code for the first sort program

#   Register usage :   

#    $a0 : Address of array element v[0].

#    $a1: Array size n

#    $s0: index i for for loop

#    $s1: index j for for loop

 

# saving registers

#

   sort:            

                          addi $sp,$sp, -20  # make room on stack for 5 registers

                          sw  $ra,16($sp)      #save $ra on stack

                          sw  $s3,12($sp)    #save $s3 on stack

                          sw  $s2, 8($sp)     #save $s2 on stack

                          sw  $s1, 4($sp)     #save $s1 on stack

                          sw  $s0, 0($sp)     #save $s0 on stack

                         #we save $s' registers.

                         #procedure call shouldn't modify $s' registers.

                         #we save $ra register.

                         #we call swap procedure, and we will modify $ra at that time.

#procedure body

                          move $s2, $a0    # This is pseudo instruction .How do you do this?

                          move $s3, $a1    # save the array size n($a1)  to $s3.

                          move $s0,$zero # index i for for loop ( i =0)

 for1tst:           slt $t0, $s0,$s3    # reg $t0 = 0 if $s0 >= $s3 (i>=n)

                          beq  $t0,$zero,exit1  # outside for loop conditon check.

                          addi $s1,$s0,-1  # set j = i -1 for second for loop.

for2tst:            slti  $t0,$s1,0      # reg $t0 = 1 if $s1 <0 (j <0)

                          bne $t0,$zero,exit2

                          add $t1,$s1,$s1  # reg $t1 = j * 2

                          add $t1,$t1,$t1    # reg $t1 = j * 4

                          add $t2,$s2,$t1   # reg $t2 = v +(j * 4)//address of v[j]

                          lw $t3,0($t2)         # reg $t3 = v[j]

                          lw $t4,4($t2)         # reg $t4 = v[j + 1]

                         slt   $t0,$t4,$t3      # reg $t0 = 0 if v[j+1] >= v[j]

                         beg $t0,$zero,exit2

                         move $a0,$s2     #1-st parameter of swap is v

                         move $a1,$s1     #2 nd parameter of swap is j

                         jal swap               # $ra will be modified by jal, $ra <=exit2+4.

 exit2:             addi $s0,$s0, 1  # i= i +1

                         j for1tst               # will be returned here after swap.

 

                        #now we restore saved $s' registers and $ra

                        #and adjust stack pointer.

exit1:              lw $s0,0($sp)

                         lw $s1,4($sp)

                         lw $s2,8($sp)

                         lw $s3,12($sp)

                         lw $ra,16($sp)

                         addi $sp,$sp,20

                         jr  $ra                         # return to calling procedure.

 

#  Compiler

 

 

# Contents

#

# Optimization Introduction

# Steps in Preparing a Program

# Compiler Definitions

# High-Level Optimizations

# Low-Level Optimizations

# Compiler Optimization Options

# Use of Compiler Switches

 

#Lecture Goals

#

# Understand the Program Building and Compilation Process

#  Describe steps in program building and optimization, including

#  intermediate files (assembler, object, ...) and tool names (preprocessor,

#  compiler, etc.).

#

# Understand Specific Optimizations and Assumption Switches

#  Describe, work example, explain benefit.

#

# Understand Profiling

#  Steps, how performance is improved.

#

# Understand ISA and Implementation Options

#  How programmer chooses them, how compiler uses them.

#

# Understand how Programmers Use Compilation Options

#  Normal program development, high-performance programs, SPEC disclosure.

 

###################

# Optimization Introduction

 

 

 

# :Def: Optimization

# The optional steps in compiling a program that reduce the program's

#execution time, reduce the size of the program, or both.

 

# Typically, the only time optimization is NOT done is when a program

# is being debugged.

#

# In most cases, the programmer sets overall optimization effort (low,

#medium, high).

#

#When performance is very important the programmer can specify which

# specific optimizations to use.

 

# :Example:

#

# Data on a program that computes pi with and without optimization.

#

# System:   SunOS sol 5.6 Generic_105181-31 sun4u sparc SUNW,Ultra-Enterprise

# Compiler: Sun WorkShop 6 2000/04/07 C 5.1

#Clock Frequency: 300 MHz

#

# Without Optimization

#  Size              : 6408 bytes

#   Run Time          : 3.00 s

#  Instruction Count : 325,338,749

#  CPI               : (/ (* 3.00 300e6) 325338749.0 ) = 2.7663

#

# With Optimization

#   Size              : 6340 bytes    Smaller! (OK, only a tiny bit smaller.)

#   Run Time          : 1.65 s        Faster!

#   Instruction Count : 100,338,751

#   CPI               : (/ (* 1.65 300e6) 100338751.0) = 4.9333

#

 

# Reasons *not* to Optimize

#

# Percentage shows roughly how often reason applies.

#

# 95%  It makes debugging difficult, so don't optimize while debugging.

#      This is true for almost everyone that uses a debugger.

#

#10%  It slows down compilation.

#     Only important when there is a very large amount of code to

#      recompile.  (Back in the 20th century when computers were slow

#      this was important.)

#

#  1%  Optimization introduces bugs.

#     It does, but very rarely.

 

 

##############################

# Steps in Building a Program

 

# Typical Steps in Building a Program

#

#    Pre-Process, Compile, Assemble, Link, Load

#   pi.c ->     pi.i  -> pi.s  ->  pi.o  -> pi

#

# These steps can all be automatically performed by the compiler

# driver. (cc, gcc, MS Visual Studio, etc.)

#

#:Sample: cc pi.c -o pi

#

# They can also be specified individually.

#

# More details appear below, compile is what we're interested in.

 

# Pre-Process

#  Load include files, expand macros.

#   Typical pre-processor name: cpp

#  Input File: pi.c

#  Output File: pi.i  (High-Level Language)

 

# Compile

#   Convert high-level language into assembly language.

#   Typical compiler name: cc1 (not cc, or gcc, or Visual Studio)

#  Input File: pi.i

#  Output File: pi.s  (Assembly language)

 

# Assemble

#   Convert assembly language into machine language.

#   Typical assembler name: as

#   Input File: pi.s

#   Output File: pi.o  (Object file containing machine code.)

 

# Link

#  Combine object files, libraries, and other code into executable.

#  Typical linker name: ld

#  Input File: pi.o

#  Output File: pi

 

# Load

#   Copy executable into memory, link with shared libraries, and start.

#   Loader name: exec system call.

 

 

#######################################

# Compiler Definitions

 

#  A program is compiled using  steps or /passes/.

#

#  The first pass reads the pre-processed high-level source code.

#

#   The last pass produces assembly code.

#

#   Between the first and last passes the program is in some

#   intermediate representation.

#

#   The way passes are defined and organized varies greatly from

#  compiler to compiler.

 

# :Def: Pass

#   A step in compiling a program.  A pass usually looks at an entire function.

 

# :Def: Intermediate Representation

# The form of a program (sort of a special-purpose language) used internally

# by a compiler.  A compiler might use several intermediate representations.

 

# Typical Passes

#

# Parse

#   Convert the source code to a high-level intermediate representation (H-IR).

#

#High-Level Optimization (Optional, may be done in several passes.)

#   Also called front-end optimization.

#   Modify H-IR to improve performance or reduce code size.

#  Reads and writes H-IR

#

# Low-Level Intermediate Representation (L-IR) Generation

#  Convert H-IR to a low-level intermediate representation (L-IR).

#

# Low-Level Optimization  (Optional, may be done in several passes.)

#   Also called back-end optimization.

#   Modify L-IR to improve performance or reduce code size.

#

# Register Assignment (Part of low-level optimization.)

#  Choose machine registers.

#

# Code Generation

#   Convert L-IR to assembly code.

#

# Pigeonhole Optimizations  (Optional, may be done in several passes.)

#   These are also called low-level optimizations.

#   Modify L-IR to improve performance or reduce code size.

#  Some of these can be done at link time.

 

#:Def: Compiler Front End

# The parts of the compiler that do the parsing and high-level

# optimization passes.

 

# :Def: Compiler Back End

# The parts of the compiler that do the low-level optimization,

#register assignment, and code generation passes.

 

 

##############################

# High-Level Optimizations

 

# Easy high-level optimizations presented here.

 

#Some Easy-To-Explain Front-End Optimizations

#

# Dead-Code Elimination (DCE)

# Common Subexpression Elimination (CSE)

# Constant Propagation, Folding

 

# Dead-Code Elimination (DCE)

#  Removal of code which isn't used.

#  Yes, it happens.

 

# :Example:

#

# Code benefiting from DCE

# High-level code shown for clarity.  Most compilers will transform

# an intermediate representation.

#

# Before:

#

int
main(int argc, char **argv)   /* fixed: original had the names swapped (argv, argc) */
{
  /* "Before" example for dead-code elimination (DCE): `sum` is updated
     every iteration but never read afterward, so an optimizing compiler
     may delete the entire sum computation. */
  double i;   double sum = 0;

  for(i=1; i<50000000;)
    {
      sum = sum + 4.0 / i;   i += 2;
      sum = sum - 4.0 / i;   i += 2;
    }

  /* Bug fix: i is a double; passing it to %d is undefined behavior.
     Cast to int so the argument matches the conversion specifier. */
  printf("Performed %d iterations.  Thank you for running me.\n", (int)i);
  return 0;
}

#

# After:

#

int
main(int argc, char **argv)   /* fixed: original had the names swapped (argv, argc) */
{
  /* "After" example for dead-code elimination: the unused `sum`
     computation has been removed, leaving only the counter updates. */
  double i;

  for(i=1; i<50000000;)
    {
      i += 2;
      i += 2;
    }

  /* Bug fix: i is a double; %d requires an int argument (UB otherwise). */
  printf("Performed %d iterations.  Thank you for running me.\n", (int)i);
  return 0;
}

 

#

#  Note: Other optimizations would leave only the printf.

 

#  Common Subexpression Elimination

#  Remove duplicated code.

#

# Before:

  r = ( a + b ) / ( x + y );
  s = ( a + b ) / ( x - y );          /* (a + b) is computed twice */
# After:
  temp = a + b;                       /* shared subexpression computed once */
  r = ( temp ) / ( x + y );
  s = ( temp ) / ( x - y );

 

# Constant Propagation, Folding

#  The compiler performs whatever arithmetic it can at compile time

#   rather than emitting code to perform the arithmetic at run time.

#

# Before:

  int sec_per_day = 60 * 60 * 24;               /* foldable at compile time */
  int sec_per_year = sec_per_day * 365;         /* foldable once sec_per_day is known */
  some_routine(sec_per_day * x, sec_per_year * y);
# After:
  int sec_per_day = 86400;                      /* folded: 60 * 60 * 24 */
  int sec_per_year = 31536000;                  /* folded: 86400 * 365 */
  some_routine(86400 * x, 31536000 * y);        /* constants propagated into the call */

 

 

#####################################

# Low-Level Optimizations

 

# Some Low-Level Optimizations

#

# This is not a complete list.

#

# Register Assignment

# Instruction Selection

# Scheduling

 

# :Def: Register Assignment

# Selection of which values will be held in registers.  Values

#not held in registers are stored in memory.

#

# Without Register Assignment Optimizations

# All values corresponding to variables (in high-level program) are

# written to memory (and not held in registers).  Intermediate results are

# held in registers.(accessing memory takes longer time than accessing regs.)

#

# With Register Assignment Optimization

# Registers are assigned to as many variables as possible, with priority

# given to frequently used variables.

#

#Advantage of Register Assignment Optimization

# Fewer memory writes and reads.(faster)

 

# :Def: Scheduling

# Re-arranging instructions to minimize the amount of time one instruction

# has to wait for another.

#

#For example, if an instruction takes a long time it will be started early

# so that other instructions will not have to wait for its result.

#

#

 

 

######################################

## Compiler Optimization Options

 

 

 

# Optimization options tell the compiler:

#

#  How much EFFORT to put into optimization.  E.g., -O2(do level 2 optimization)

#   Which PARTICULAR optimizations to perform.  E.g., -fno-strength-reduce

#  (loop optimization)

#  The TARGET system the code will be running on.  E.g., -xultra2  (ultra2 machine)

#   Whether to make certain ASSUMPTIONS about the code.

#   Whether to use PROFILING data from a training run.

 

 

##Optimization EFFORT

#

# Most compilers have optimization levels.

#

# The higher the number, the more optimizations done.

 

# :Example:

#

# Optimization Levels for Sun Workshop 6 Compiler

 

#     -xO [1|2|3|4|5]

#           Optimizes the object code. Note the upper-case letter

#         O.

 

#          The levels (1, 2, 3, 4, or 5) you can use differ

#          according to the platform you are using.

 

#                ( SPARC)

 

#              -xO1 Does basic local optimization (peephole).

#                     peephole: the compiler looks at only a small window of
#                     code at a time and optimizes just that window.  Looking
#                     at a larger region of code can produce better results,
#                     but takes more compile time and is more complex.

 

#                -xO2 Does basic local and global optimization.

#                    This is induction variable elimination, local

#                     and global common subexpression elimination,

#                    algebraic simplification, copy propagation,

#                    constant propagation, loop-invariant optimi-

#                    zation, register allocation, basic block

#                     merging, tail recursion elimination, dead

#                    code elimination, tail call elimination and

#                     complex expression expansion.

#                     : Any compiler Book will explain these.

 

#                     The -xO2 level does not assign global, exter-

#                    nal, or indirect references or definitions to

#                    registers. It treats these references and

#                    definitions as if they were declared "vola-

#                    tile." In general, the -xO2 level results in

#                     minimum code size.

 

#                -xO3 Performs like -xO2 but, also optimizes refer-

#                     ences or definitions for external variables.

#                     Loop unrolling and software pipelining are

#                     also performed. The -xO3 level does not trace

#                     the effects of pointer assignments. When com-

#                     piling either device drivers, or programs

#                     that modify external variables from within

#                     signal handlers, you may need to use the

#                     volatile type qualifier to protect the object

#                     from optimization.  In general, the -xO3

#                    level results in increased code size.

#                              : loop unrolling : saving time at branch instruction.

#                                 for(i=0;i<3;++i)   s = s+i;

#                                  becomes  :       s = s+0;  

#                                                             s = s+1;

#                                                             s= s+ 2;

#                                  longer code but no conditional branch.

 

#                -xO4 Performs like -xO3 but, also does automatic

#                     inlining of functions contained in the same

#                    file; this usually improves execution speed.

#                     The -xO4 level does trace the effects of

#                    pointer assignments.  In general, the -xO4

#                     level results in increased code size.

 

#                     If you want to control which functions are

#                    inlined, see -xinline .

 

#               -xO5 Generates the highest level of optimization.

#                     Uses optimization algorithms that take more

#                     compilation time or that do not have as high

#                     a certainty of improving execution time.

#                     Optimization at this level is more likely to

#                    improve performance if it is done with pro-

#                    file feedback.

 

## PARTICULAR Optimizations

#

# The levels specify sets of optimizations (like ordering the "Sport Package"

# for a new car).

#

# In contrast to optimization levels (-O3), the compiler can be told which

# particular optimizations to make.

#

# These are typically used by skilled programmers trying to get

# fastest code.

#

# Sun Workshop 6   

#

# Prefetch instructions read memory in advance, eliminating some cache

# misses .

#                             Cache is expensive but  a high speed memory.

#

#

 

# They can also increase

# cache misses or increase the time to access memory, hurting

# performance.  The compiler does not know if they will help or hurt.

#

#     -xprefetch[=val],val

#         (SPARC) Enable prefetch instructions on those architec-

#         tures that support prefetch, such as UltraSPARC II.

#         (-xarch=v8plus, v9plusa, v9, or v9a)

 

#          Explicit prefetching should only be used under special

#          circumstances that are supported by measurements.

 

 

## TARGET

#

# Specify exact type of machine code will be run on.

#

# What might be specified:

#

# ISA

# The exact instruction set used.

# Specifies not just the ISA family, but a particular variation.

# A poor choice will limit the number of machines code can run on.

#

# Implementation

# Specify the implementation code will run on.

# A poor choice will result in slower code.

#

# Cache

#Specify configuration of cache.

#A poor choice will result in slower code.

 

## Background

#

# Sun SPARC Architecture

#

#  Developed by Sun

#

# Major Versions:  v7, v8, v9 (first 64-bit SPARC ISA).

# Sub Version: v8plusa (v9 subset for 32-bit operating systems)

#

# Some Implementations

# UltraSPARC III Cu   Implements v9, (On newer Suns.)

# UltraSPARC II   Implements v9, (On sol.)

 

 

#:Example:

#

# Switches for Sun Workshop 6 Compiler.

#

#    -xarch     Specifies ISA

#    -xchip      Specifies implementation.

#    -xcache    Specifies cache.

 

#     -xtarget   Specifies system. ISA/implementation and cache.

#

 

#     -xarch Values:

#   v7, v8a, v8, v8plus (v8 with 32-bit v9), v8plusa (v8 with 32 bit v9 etc.)

 

#    -xchip=c

#          c must be one of: generic, generic64, native, native64,

#          old, super, super2, micro, micro2, hyper, hyper2,

#          powerup, ultra, ultra2, ultra2i, 386, 486, pentium,

#         pentium_pro, 603, 604.

 

## ASSUMPTIONS About the Program

 

# Compilers must generate correct code.

#

#  That is, the code must execute in the way specified by the

#  high-level language definition.

#

# Correct code can be slow.

#

#

# Some options tell the compiler to make assumptions about the program.

#

#  These assumptions would not hold for every program.

#

#  The compiled program runs faster ...

# ... and correctly if the assumptions are valid.

 

# Some switches specifying assumptions:

 

#gcc, Assume the program does not require strict IEEE 754 FP features.

# `-ffast-math'

#     This option allows GCC to violate some ANSI or IEEE rules and/or

#     specifications in the interest of optimizing code for speed.  For

#    example, it allows the compiler to assume arguments to the `sqrt'

#     function are non-negative numbers and that no floating-point values

#      are NaNs.

 

 

# cc (Sun Workshop 6 Compiler) Assume certain pointers do not overlap.

#

#      -xrestrict=f

#          (SPARC) Treats pointer-valued function parameters as

#         restricted pointers. f is a comma-separated list that

#          consists of one or more function parameters, %all,

#         %none.  This command-line option can be used on its

#           own, but is best used with optimization of -xO3 or

#          greater.

#

#          The default is %none. Specifying -xrestrict is

#          equivalent to specifying -xrestrict=%all.

#

#

 

## PROFILING

 

# :Def: Profiling

# A compilation technique in which data taken from a sample run is used

# to guide compiler decisions.

 

# Branches occur frequently in code.  There is a performance penalty

# in taking a branch and so it's best if the compiler organizes code

# (rearranges the intermediate representation) so that branches are

# not taken as much as possible.  To do that the compiler needs to

# know how often an "if" or other condition (for which the branch is

# emitted) is true.  Only in a few cases can the compiler figure that

# out on its own, because, for example, "if" conditions depend on

# input data.  To obtain this useful information a two-step

# compilation process called profiling is used.  In the first step the

# code is compiled so that it writes branch information (more

# precisely basic block, covered later) to a file.  The program is run

# with typical input data, called the training input, and it writes

# the branch information to a file.  In the second step the compiler

# reads the information and uses that to better organize the code.

 

# Sun Profiling Compiler Switches

#

#     -xprofile=p

#          Collects data for a profile or use a profile to optim-

#          ize.

#

#          p must be collect[:name], use[:name], or tcov.

#

#          This option causes execution frequency data to be col-

#           lected and saved during execution, then the data can be

#          used in subsequent runs to improve performance. This

#          option is only valid when a level of optimization is

#          specified.

 

 

## Use of Compiler Switches

 

# ISA

# Type of system all uses have. (IA-32 for PCs, SPARC for Sun users, etc.)

# Users can't normally run code compiled for a different ISA.

 

# Implementation

#  Type of system most users have.

#  Other users can run code, but won't run as fast.

 

# Optimization

#   Select medium or high optimization level.

#  If market very sensitive to performance, use specific optimizations.