/*
* LSU EE 7700-1 Spring 2006 Sample Code
*
* Some Branch Predictor Simulators (Using Shade)
*
* gshare
* Bimodal
* YAGS: http://www.ece.lsu.edu/tca/s/eden-98.pdf
* Perceptron http://www.ece.lsu.edu/tca/s/p280-tarjan.pdf
*
* Note: The branch predictors for a given run the branch predictors
* use the same amount of global history but they DO NOT use the
* same amount of storage. Bimodal is the smallest, then gshare, then
* YAGS. The size of the perceptron predictor varies.
*
*
* To Run:
*
* echo MY_BENCHMARK MY_BENCHMARK_ARGS | percep HISTORY_LEN [BHT_LG]
*
*/
#define TR_REGS
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <IHASH.h>
#include <ITYPES.h>
#include <trace.h>
#include <stdtr.h>
#include <trctl.h>
extern int shade_sshell(char *command, void (*analyzer)());
typedef enum
{
BPU_Normal = 0, // Update GHR using outcome. For gshare.
BPU_Include = 0,
BPU_None = 1, // Don't touch GHR. For bimodal.
BPU_Exclude = 1,
BPU_Pad = 2 // Shift a zero into GHR.
}
BP_GHR_Update_Method;
const char* const bpu_str = "IEP";
static void sim_init(int argc, char **argv, char *bm_command);
static void sim_analyzer_phase_one();
static void sim_after_phase_one();
static void sim_results_phase_one_write();
int shade_main(int argc, char **argv)
{
/* Copy first line of stdin to bm_command. */
static char bm_command[200];
{
char *bptr = bm_command;
char* const bstop = bm_command + sizeof(bm_command);
while( 1 )
{
char c = getchar();
if( c == EOF || c == '\n' ) break;
if( bptr == bstop )
{
fprintf(stderr,"Benchmark command too long, exiting.\n");
return 1;
}
*bptr++ = c;
}
*bptr++ = 0;
}
/* Shade Documentation
*
* Overview: http://www.ece.lsu.edu/tca/intro.pdf
* Ref Manual: http://www.ece.lsu.edu/tca/shade_man.pdf
*
* Trace Structure (trace->tr_pc, trace->ih, etc.)
* /data4/koppel/shadekit/shade.v9.elf/src/include/trace.h
* Trace Information Constants (TC_PC, TC_TAKEN, etc.)
* /data4/koppel/shadekit/shade.v9.elf/src/include/trctl.h
* Instruction Types (IT_ANY, IT_BRANCH, etc.)
* /data4/koppel/shadekit/spixtools.v9.elf/src/include/ITYPES.h
*/
/* Tell Shade size of trace structure. (It can be customized.) */
shade_trctl_trsize( sizeof(Trace) );
/*
* Tell Shade which instructions to trace and what information to collect.
*/
shade_trctl_it
(IT_ANY, // Which instructions. (All)
1, // Turn tracing on or off. (On)
0, // Trace annulled instructions? (No)
TC_TAKEN | TC_PC | TC_IH // What info to collect? (Outcome,pc,opcode)
);
/*
* Phase One: Collect data on bimodal and gshare predictors.
*/
sim_init(argc, argv, bm_command);
shade_sshell(bm_command,sim_analyzer_phase_one);
sim_after_phase_one();
return 0;
}
/* ************************************************************************* */
/*
* Branch Predictors
*
*/
#define SINC2(a) { if( (a) < 3 ) (a)++; }
#define SDEC2(a) { if( (a) > 0 ) (a)--; }
#define BP_ACCURACY(bp) bp_gshare_accuracy((BP_gshare*)bp)
typedef struct {
char counter;
int tag;
}
BP_yags_pht_elt;
typedef struct
{
int correct, wrong;
double accuracy;
unsigned int ghr;
int history_length;
BP_yags_pht_elt *pht_t, *pht_n;
int bht_lg;
int bht_size;
int bht_mask;
char *bht;
int tag_size; // Size of tag stored in phts.
int pht_size;
int pht_mask;
int tag_mask;
int capacity_bits;
}
BP_yags;
void bp_yags_reset(BP_yags *g);
BP_yags*
bp_yags_new(int history_length, int bht_lg)
{
BP_yags* const g = (BP_yags*) calloc(1,sizeof(BP_yags));
g->history_length = history_length;
g->pht_size = 1 << history_length;
g->pht_mask = g->pht_size - 1;
g->pht_t = (BP_yags_pht_elt*) calloc(g->pht_size,sizeof(g->pht_t[0]));
g->pht_n = (BP_yags_pht_elt*) calloc(g->pht_size,sizeof(g->pht_n[0]));
g->bht_lg = bht_lg;
g->bht_size = 1 << g->bht_lg;
g->bht_mask = g->bht_size - 1;
g->bht = (char*) calloc(g->bht_size,1);
g->tag_size = 6;
g->tag_mask = ( 1 << ( g->tag_size + 2 ) ) - 1;
const int pht_capacity = g->pht_size * ( 2 + g->tag_size );
const int bht_capacity = g->bht_size * 2;
g->capacity_bits = pht_capacity * 2 + bht_capacity;
bp_yags_reset(g);
return g;
}
void
bp_yags_reset(BP_yags *g)
{
memset(g->pht_t, 0, g->pht_size * sizeof(g->pht_t[0]) );
memset(g->pht_n, 0, g->pht_size * sizeof(g->pht_t[0]) );
memset(g->bht, 0, g->bht_size * sizeof(g->bht[0]) );
g->ghr = 0;
g->accuracy = -1;
}
char*
bp_yags_config(BP_yags *g)
{
static char buffer[100];
sprintf(buffer,"Y%d/%d/%d",
g->history_length,g->bht_lg,g->tag_size);
return buffer;
}
int
bp_yags_predict(BP_yags *g, unsigned int pc, int outcome)
{
char* const bht_entry = &g->bht[ ( pc >> 2 ) & g->pht_mask ];
const int pred_bm = bht_entry[0] > 1;
const int pht_idx = ( g->ghr ^ ( pc >> 2 ) ) & g->pht_mask;
BP_yags_pht_elt* const pht_elt = ( pred_bm ? g->pht_t : g->pht_n ) + pht_idx;
const int tag = pc & g->tag_mask;
const int hit = pht_elt->tag == tag;
const int prediction = hit ? pht_elt->counter > 1 : pred_bm;
const int bm_correct = pred_bm == outcome;
const int yg_correct = prediction == outcome;
if( bm_correct || !yg_correct )
{
if( outcome ) { SINC2( *bht_entry ); } else { SDEC2( *bht_entry ); }
}
if( !bm_correct && !hit )
{
pht_elt->tag = tag;
pht_elt->counter = outcome ? 3 : 0;
}
else if ( hit )
{
if( outcome ) { SINC2( pht_elt->counter ); }
else { SDEC2( pht_elt->counter ); }
}
if( yg_correct ) g->correct++; else g->wrong++;
g->ghr = ( ( g->ghr << 1 ) | outcome ) & g->pht_mask;
return yg_correct;
}
typedef struct
{
int correct, wrong;
double accuracy; /* Set after call to bp_gshare_accuracy. */
unsigned int ghr;
char *pht;
int history_length;
int pht_size;
int pht_mask;
int capacity_bits;
}
BP_gshare;
void bp_gshare_reset(BP_gshare *g);
/*
* Create and return a new branch predictor.
*/
BP_gshare*
bp_gshare_new(int history_length)
{
BP_gshare* const g = (BP_gshare*) malloc(sizeof(BP_gshare));
g->history_length = history_length;
g->pht_size = 1 << history_length;
g->pht_mask = g->pht_size - 1;
g->pht = (char*) malloc(g->pht_size);
g->capacity_bits = g->pht_size * 2;
bp_gshare_reset(g);
return g;
}
/*
* Reset branch predictor tables and statistics.
*/
void
bp_gshare_reset(BP_gshare *g)
{
g->ghr = 0;
g->correct = g->wrong = 0;
g->accuracy = -1;
bzero(g->pht,g->pht_size);
}
/*
* Predict outcome of branch and update GHR using actual outcome.
*
* Method specifies how to update GHR: BPU_Include for gshare, BPU_Exclude
* for bimodal, and BPU_Pad for ??.
* Return 1 if prediction correct and 0 if prediction wrong.
*/
int
bp_gshare_predict
(BP_gshare *g, unsigned int pc, int outcome, BP_GHR_Update_Method method)
{
char* const pht_entry = &g->pht[ ( g->ghr ^ ( pc >> 2 ) ) & g->pht_mask ];
const int prediction = pht_entry[0] > 1;
if( prediction == outcome ) g->correct++; else g->wrong++;
if( outcome && pht_entry[0] < 3 ) pht_entry[0]++;
if( !outcome && pht_entry[0] > 0 ) pht_entry[0]--;
if( method == BPU_Include )
g->ghr = ( ( g->ghr << 1 ) | outcome ) & g->pht_mask;
else if ( method == BPU_Pad )
g->ghr = ( g->ghr << 1 ) & g->pht_mask;
return prediction == outcome;
}
/*
* Return accuracy of predictor. Once this is called the accuracy is
* available in g->accuracy.
*/
double
bp_gshare_accuracy(BP_gshare *g)
{
if( g->accuracy < 0 )
{
const double exec = g->correct + g->wrong;
g->accuracy = exec ? g->correct / exec : 0;
}
return g->accuracy;
}
typedef struct
{
int correct, wrong;
double accuracy; /* Set after call to bp_gshare_accuracy. */
unsigned int ghr;
unsigned int *gpr; // Global path register: Holds addresses of recent br.
int gpr_idx;
char** weight_tables;
char** weight_entries; // Entries used to make a particular prediction.
int weight_bits;
int weight_max, weight_min;
char *bt; // Bias table.
int bt_lg, bt_size, bt_mask;
int spec_wt_per_pred; // Weights per prediction.
int spec_b_per_wt; // Branches per weight.
int spec_a; // If non-zero, use predicted branch pc for wt idx.
int spec_p;
int spec_g_mask;
int history_length;
int wt_lg, wt_size, wt_mask;
int capacity_bits; // Storage needed to implement.
}
BP_perceptron;
BP_perceptron*
bp_perceptron_new(int history_lengthp, int bt_lg, int b_per_wt, int tight)
{
BP_perceptron* const bp = (BP_perceptron*) malloc(sizeof(BP_perceptron));
bp->weight_bits = 7;
bp->spec_b_per_wt = b_per_wt;
bp->spec_wt_per_pred = history_lengthp / bp->spec_b_per_wt;
bp->spec_g_mask = ( 1 << bp->spec_b_per_wt ) - 1;
bp->spec_p = 0;
bp->spec_a = 1;
bp->bt_lg = bt_lg;
bp->bt_size = 1 << bt_lg;
bp->bt_mask = bp->bt_size - 1;
bp->wt_lg = tight || bt_lg < bp->spec_b_per_wt ? bp->spec_b_per_wt : bt_lg;
bp->wt_size = 1 << bp->wt_lg;
bp->wt_mask = bp->wt_size - 1;
bp->history_length = bp->spec_wt_per_pred * bp->spec_b_per_wt;
bp->weight_tables =
(char**) malloc( ( 1 + bp->spec_wt_per_pred ) * sizeof(char*) );
bp->weight_entries =
(char**) malloc( ( 1 + bp->spec_wt_per_pred ) * sizeof(char*) );
bp->capacity_bits =
bp->wt_size * bp->spec_wt_per_pred * bp->weight_bits
+ bp->bt_size * bp->weight_bits;
bp->weight_max = 1 << ( bp->weight_bits - 1 );
bp->weight_min = - bp->weight_max;
int i;
for(i=0; i<bp->spec_wt_per_pred; i++)
bp->weight_tables[i] = (char*) calloc( bp->wt_size, 1 );
bp->weight_tables[i] = (char*) calloc( bp->bt_size, 1 );
bp->gpr = (unsigned int*) calloc( bp->history_length, sizeof(int) );
bp->gpr_idx = 0;
bp->correct = bp->wrong = 0;
bp->accuracy = -1;
return bp;
}
char*
bp_perceptron_config(BP_perceptron *bp)
{
static char buffer[100];
snprintf(buffer,sizeof(buffer),"%s%s%s-%2d/%2d-%2d-%2d",
bp->spec_a ? "A" : "_",
bp->spec_p ? "P" : "_",
bp->spec_g_mask ? "G" : "_",
bp->history_length,
bp->spec_wt_per_pred,
bp->wt_lg,
bp->bt_lg
);
return buffer;
}
#define INC(a) { if( (a) < g->weight_max ) (a)++; }
#define DEC(a) { if( (a) > g->weight_min ) (a)--; }
int
bp_perceptron_predict(BP_perceptron *g, unsigned int pc, int outcome)
{
int ghr = g->ghr;
int gpr_idx = g->gpr_idx;
char **wt_ptrs = g->weight_entries;
const int idx_init_val = g->spec_a ? pc >> 2 : 0;
wt_ptrs[g->spec_wt_per_pred] =
&g->weight_tables[g->spec_wt_per_pred][(pc>>2)&g->bt_mask];
int i;
for( i = 0; i < g->spec_wt_per_pred; i++)
{
int idx = idx_init_val;
if( g->spec_p ) idx = idx ^ g->gpr[gpr_idx];
if( g->spec_g_mask ) idx = idx ^ ( ghr & g->spec_g_mask );
char* const wt = g->weight_tables[i];
wt_ptrs[i] = &wt[ idx & g->wt_mask ];
ghr >>= g->spec_b_per_wt;
gpr_idx -= g->spec_b_per_wt;
if( gpr_idx < 0 ) gpr_idx += g->history_length;
}
int score = 0;
for( i = 0; i <= g->spec_wt_per_pred; i++) score += *wt_ptrs[i];
const int pred = score >= 0;
const int correct = pred == outcome;
const int abs_score = score < 0 ? -score : score;
if( correct ) g->correct++; else g->wrong++;
if( !correct || abs_score < 10 )
{
if( outcome )
for( i = 0; i <= g->spec_wt_per_pred; i++) { INC( *wt_ptrs[i] ); }
else
for( i = 0; i <= g->spec_wt_per_pred; i++) { DEC( *wt_ptrs[i] ); }
}
g->gpr_idx++;
if( g->gpr_idx == g->history_length ) g->gpr_idx = 0;
g->ghr = ( g->ghr << 1 ) | outcome;
g->gpr[g->gpr_idx] = pc >> 2;
return correct;
}
double
bp_perceptron_accuracy(BP_perceptron *g)
{
if( g->accuracy < 0 )
{
const double exec = g->correct + g->wrong;
g->accuracy = exec ? g->correct / exec : 0;
}
return g->accuracy;
}
/* ************************************************************************* */
/*
* Branch Information Table (BIT)
*
* Holds information about each branch instruction in program.
*/
typedef struct
{
unsigned int pc; /* Address of branch. */
int rank; /* Rank in number of executions: 0 most frequent, etc. */
int round;
int correct_p1, wrong_p1; /* Prediction results for phase-one gshare. */
int correct_bm, wrong_bm; /* Prediction results for phase-one bimodal. */
int correct_yg, wrong_yg;
int correct_pr, wrong_pr;
int icount; /* Number of time this branch executes. */
double icount_inv;
double icount_frac; /* icount / executions of all branches. */
double scaled_change;
}
BIT_Elt;
typedef struct
{
int icount; /* Number of instructions. */
int icount_branch; /* Number of branches. */
char *bm_command;
int print_amt_max; /* Maximum number of insn to print. */
int history_length; /* Length of ghr. */
/* Branch information table size variables. */
int bit_lg; /* Log_2 size. (Input or constant.) */
int bit_size; /* Computed. */
int bit_mask; /* Computed. */
int bit_occ; /* Number of branches in table. */
BP_yags *yg_phase_one;
BP_perceptron *pr_phase_one; /* perceptron */
BP_perceptron *pr_array[20];
BP_gshare *gs_phase_one; /* gshare */
BP_gshare *bm_phase_one; /* bimodal */
BIT_Elt *bit; /* Branch information table. */
BIT_Elt **bit_sorted; /* Branch information table sorted by executions. */
}
Sim_Info;
Sim_Info sim_info;
/*
* Return branch information table entry for PC.
*
* If there is a collision (two branches map to same element) table
* entry is reset. This should happen rarely, if it does increase
* sim_info.bit_lg.
*/
BIT_Elt*
bit_get(Sim_Info *si, unsigned int pc)
{
BIT_Elt* const bi =
&si->bit[ si->bit_mask &
( ( pc ^ ( pc >> si->bit_lg ) ^ ( pc >> 8 ) ) >> 2 )];
if( bi->pc != pc )
{
bi->pc = pc;
bi->correct_p1 = bi->wrong_p1 = 0;
bi->correct_bm = bi->wrong_bm = 0;
}
return bi;
}
/*
* Return ordering of two BIT entries (for qsort).
*
* When assigned, sort order is number of times branch executes.
*/
int
comp_mispred(const void *ap, const void *bp)
{
BIT_Elt* const a = *((BIT_Elt**)ap);
BIT_Elt* const b = *((BIT_Elt**)bp);
const int order = b->correct_p1 + b->wrong_p1 - a->wrong_p1 - a->correct_p1;
/* If it's a tie put them in prog order. */
if( order == 0 ) return a->pc - b->pc;
return order;
}
/* ************************************************************************* */
/*
* Simulation Routines
*
*/
void
sim_init(int argv, char **argc, char *bm_command)
{
int i;
if( argv < 2 )
{
fprintf(stderr,"Usage: %s HISTORY_LEN [WT_LG] [BHT_LG]\n",argc[0]);
exit(1);
}
/* Tune this for preference. (Number of insn to print info about.) */
sim_info.print_amt_max = 20;
/* Make this large enough so branches don't share entries. */
sim_info.bit_lg = 19; // Size of branch information table.
sim_info.history_length = atoi(argc[1]);
int wt_lg = argv > 2 ? atoi(argc[2]) : sim_info.history_length;
int bht_lg = argv > 3 ? atoi(argc[3]) : sim_info.history_length;
sim_info.bit_size = 1 << sim_info.bit_lg;
sim_info.bit_mask = sim_info.bit_size - 1;
sim_info.bit = (BIT_Elt*) calloc(sim_info.bit_size,sizeof(sim_info.bit[0]));
sim_info.bit_sorted = (BIT_Elt**) malloc(sim_info.bit_size*sizeof(void*));
for(i=0; i<sim_info.bit_size; i++) sim_info.bit_sorted[i] = &sim_info.bit[i];
sim_info.gs_phase_one = bp_gshare_new(sim_info.history_length);
sim_info.bm_phase_one = bp_gshare_new(sim_info.history_length);
sim_info.yg_phase_one = bp_yags_new(sim_info.history_length,bht_lg);
sim_info.pr_phase_one =
bp_perceptron_new(32,sim_info.history_length-3,4,0);
BP_perceptron **bpptr = sim_info.pr_array;
for(i=0; i < 6; i++)
{
const int wt_table_lg = sim_info.history_length - i;
const int b_per_wt = sim_info.history_length >> i;
if( b_per_wt )
*bpptr++ =
bp_perceptron_new(sim_info.history_length,wt_table_lg,b_per_wt,0);
const int b_per_wt_32 = 32 >> i;
if( b_per_wt_32 && b_per_wt_32 < 32 )
*bpptr++ =
bp_perceptron_new(32,wt_table_lg,b_per_wt_32,0);
}
sim_info.icount_branch = 0;
sim_info.bm_command = bm_command;
}
/*
* Phase One
*
* Compute gshare and bimodal prediction accuracies.
* Record per-instruction prediction accuracies.
*/
void
sim_analyzer_phase_one()
{
Trace *trace_entry;
while( ( trace_entry = shade_step() ) )
{
/* One iteration per (traced) instruction. */
/* ih: Instruction hash, sort of an ideal opcode. */
const int ih = trace_entry->tr_ih;
sim_info.icount++;
if( !ih_isbranch(ih) ) continue; // Continue if not an integer branch.
sim_info.icount_branch++;
int i;
for(i=0; i<sim_info.history_length && sim_info.pr_array[i]; i++)
bp_perceptron_predict
(sim_info.pr_array[i], trace_entry->tr_pc, trace_entry->tr_taken);
const int pr_correct = bp_perceptron_predict
(sim_info.pr_phase_one, trace_entry->tr_pc, trace_entry->tr_taken);
/* Bimodal Prediction */
const int bm_correct = bp_gshare_predict
(sim_info.bm_phase_one,
trace_entry->tr_pc, trace_entry->tr_taken, BPU_Exclude);
/* gshare Prediction */
const int gs_correct = bp_gshare_predict
(sim_info.gs_phase_one,
trace_entry->tr_pc, trace_entry->tr_taken, BPU_Include);
/* yags Prediction */
const int yg_correct = bp_yags_predict
(sim_info.yg_phase_one, trace_entry->tr_pc, trace_entry->tr_taken);
/* Record outcome for this branch. */
BIT_Elt* const bi = bit_get(&sim_info,trace_entry->tr_pc);
if( gs_correct ) bi->correct_p1++; else bi->wrong_p1++;
if( bm_correct ) bi->correct_bm++; else bi->wrong_bm++;
if( pr_correct ) bi->correct_pr++; else bi->wrong_pr++;
if( yg_correct ) bi->correct_yg++; else bi->wrong_yg++;
}
}
void
sim_after_phase_one()
{
BP_ACCURACY(sim_info.gs_phase_one);
BP_ACCURACY(sim_info.bm_phase_one);
BP_ACCURACY(sim_info.pr_phase_one);
BP_ACCURACY(sim_info.yg_phase_one);
/* Prepare sorted list of BIT entries. */
qsort(sim_info.bit_sorted,sim_info.bit_size,
sizeof(sim_info.bit_sorted[0]),comp_mispred);
const double icount_all_inv = 1.0 / sim_info.icount_branch;
int i;
for(i = 0; i < sim_info.bit_size; i++)
{
BIT_Elt* const bi = sim_info.bit_sorted[i];
bi->icount = bi->correct_p1 + bi->wrong_p1;
if( bi->icount == 0 ) break;
bi->rank = i;
bi->icount_frac = bi->icount * icount_all_inv;
bi->icount_inv = 1.0 / bi->icount;
}
sim_info.bit_occ = i;
sim_results_phase_one_write();
}
/* ************************************************************************* */
/*
* Write Results
*
*/
void
sim_results_phase_one_write()
{
BP_gshare* const gs = sim_info.gs_phase_one;
BP_gshare* const bm = sim_info.bm_phase_one;
BP_perceptron* const pr = sim_info.pr_phase_one;
BP_yags* const yg = sim_info.yg_phase_one;
int i;
printf("Found %d instructions of which %d were branches.\n",
sim_info.icount, sim_info.icount_branch);
for(i=0; i<sim_info.history_length && sim_info.pr_array[i]; i++)
printf("Percep %s acc %.4f size %d\n",
bp_perceptron_config(sim_info.pr_array[i]),
BP_ACCURACY(sim_info.pr_array[i]),
sim_info.pr_array[i]->capacity_bits
);
printf("Perceptron config: %s, size %d bits\n",
bp_perceptron_config(pr), pr->capacity_bits);
printf("YAGS config %s, size %d bits\n",
bp_yags_config(yg),yg->capacity_bits);
printf("gshare size %d bits, bm size %d bits.\n",
gs->capacity_bits, bm->capacity_bits);
printf("For ghr len %d: ", sim_info.history_length);
printf("Bimodal %.4f, gshare %.4f, yags %.4f, percep %.4f\n",
bm->accuracy, gs->accuracy, yg->accuracy, pr->accuracy);
printf("Rnk Insn Addr bimodal GS:Corr Wrong gshare yags percep\n");
for(i=0; i<sim_info.print_amt_max; i++)
{
BIT_Elt* const bi = sim_info.bit_sorted[i];
const int c = bi->correct_p1;
const int w = bi->wrong_p1;
const double cb = bi->correct_bm;
if( c + w == 0 ) break;
printf("%3d 0x%08x %.4f %7d %7d %.4f %.4f %.4f\n",
bi->rank, bi->pc, cb/(c+w),
c, w, c/((double)(c+w)),
((double)bi->correct_yg) / (c+w),
((double)bi->correct_pr) / (c+w)
);
}
}