/* * LSU EE 7700-1 Spring 2006 Sample Code * * Some Branch Predictor Simulators (Using Shade) * * gshare * Bimodal * YAGS: http://www.ece.lsu.edu/tca/s/eden-98.pdf * Perceptron http://www.ece.lsu.edu/tca/s/p280-tarjan.pdf * * Note: The branch predictors for a given run the branch predictors * use the same amount of global history but they DO NOT use the * same amount of storage. Bimodal is the smallest, then gshare, then * YAGS. The size of the perceptron predictor varies. * * * To Run: * * echo MY_BENCHMARK MY_BENCHMARK_ARGS | percep HISTORY_LEN [BHT_LG] * */ #define TR_REGS #include <stdio.h> #include <stdlib.h> #include <strings.h> #include <IHASH.h> #include <ITYPES.h> #include <trace.h> #include <stdtr.h> #include <trctl.h> extern int shade_sshell(char *command, void (*analyzer)()); typedef enum { BPU_Normal = 0, // Update GHR using outcome. For gshare. BPU_Include = 0, BPU_None = 1, // Don't touch GHR. For bimodal. BPU_Exclude = 1, BPU_Pad = 2 // Shift a zero into GHR. } BP_GHR_Update_Method; const char* const bpu_str = "IEP"; static void sim_init(int argc, char **argv, char *bm_command); static void sim_analyzer_phase_one(); static void sim_after_phase_one(); static void sim_results_phase_one_write(); int shade_main(int argc, char **argv) { /* Copy first line of stdin to bm_command. */ static char bm_command[200]; { char *bptr = bm_command; char* const bstop = bm_command + sizeof(bm_command); while( 1 ) { char c = getchar(); if( c == EOF || c == '\n' ) break; if( bptr == bstop ) { fprintf(stderr,"Benchmark command too long, exiting.\n"); return 1; } *bptr++ = c; } *bptr++ = 0; } /* Shade Documentation * * Overview: http://www.ece.lsu.edu/tca/intro.pdf * Ref Manual: http://www.ece.lsu.edu/tca/shade_man.pdf * * Trace Structure (trace->tr_pc, trace->ih, etc.) * /data4/koppel/shadekit/shade.v9.elf/src/include/trace.h * Trace Information Constants (TC_PC, TC_TAKEN, etc.) * /data4/koppel/shadekit/shade.v9.elf/src/include/trctl.h * Instruction Types (IT_ANY, IT_BRANCH, etc.) * /data4/koppel/shadekit/spixtools.v9.elf/src/include/ITYPES.h */ /* Tell Shade size of trace structure. (It can be customized.) */ shade_trctl_trsize( sizeof(Trace) ); /* * Tell Shade which instructions to trace and what information to collect. */ shade_trctl_it (IT_ANY, // Which instructions. (All) 1, // Turn tracing on or off. (On) 0, // Trace annulled instructions? (No) TC_TAKEN | TC_PC | TC_IH // What info to collect? (Outcome,pc,opcode) ); /* * Phase One: Collect data on bimodal and gshare predictors. */ sim_init(argc, argv, bm_command); shade_sshell(bm_command,sim_analyzer_phase_one); sim_after_phase_one(); return 0; } /* ************************************************************************* */ /* * Branch Predictors * */ #define SINC2(a) { if( (a) < 3 ) (a)++; } #define SDEC2(a) { if( (a) > 0 ) (a)--; } #define BP_ACCURACY(bp) bp_gshare_accuracy((BP_gshare*)bp) typedef struct { char counter; int tag; } BP_yags_pht_elt; typedef struct { int correct, wrong; double accuracy; unsigned int ghr; int history_length; BP_yags_pht_elt *pht_t, *pht_n; int bht_lg; int bht_size; int bht_mask; char *bht; int tag_size; // Size of tag stored in phts. int pht_size; int pht_mask; int tag_mask; int capacity_bits; } BP_yags; void bp_yags_reset(BP_yags *g); BP_yags* bp_yags_new(int history_length, int bht_lg) { BP_yags* const g = (BP_yags*) calloc(1,sizeof(BP_yags)); g->history_length = history_length; g->pht_size = 1 << history_length; g->pht_mask = g->pht_size - 1; g->pht_t = (BP_yags_pht_elt*) calloc(g->pht_size,sizeof(g->pht_t[0])); g->pht_n = (BP_yags_pht_elt*) calloc(g->pht_size,sizeof(g->pht_n[0])); g->bht_lg = bht_lg; g->bht_size = 1 << g->bht_lg; g->bht_mask = g->bht_size - 1; g->bht = (char*) calloc(g->bht_size,1); g->tag_size = 6; g->tag_mask = ( 1 << ( g->tag_size + 2 ) ) - 1; const int pht_capacity = g->pht_size * ( 2 + g->tag_size ); const int bht_capacity = g->bht_size * 2; g->capacity_bits = pht_capacity * 2 + bht_capacity; bp_yags_reset(g); return g; } void bp_yags_reset(BP_yags *g) { memset(g->pht_t, 0, g->pht_size * sizeof(g->pht_t[0]) ); memset(g->pht_n, 0, g->pht_size * sizeof(g->pht_t[0]) ); memset(g->bht, 0, g->bht_size * sizeof(g->bht[0]) ); g->ghr = 0; g->accuracy = -1; } char* bp_yags_config(BP_yags *g) { static char buffer[100]; sprintf(buffer,"Y%d/%d/%d", g->history_length,g->bht_lg,g->tag_size); return buffer; } int bp_yags_predict(BP_yags *g, unsigned int pc, int outcome) { char* const bht_entry = &g->bht[ ( pc >> 2 ) & g->pht_mask ]; const int pred_bm = bht_entry[0] > 1; const int pht_idx = ( g->ghr ^ ( pc >> 2 ) ) & g->pht_mask; BP_yags_pht_elt* const pht_elt = ( pred_bm ? g->pht_t : g->pht_n ) + pht_idx; const int tag = pc & g->tag_mask; const int hit = pht_elt->tag == tag; const int prediction = hit ? pht_elt->counter > 1 : pred_bm; const int bm_correct = pred_bm == outcome; const int yg_correct = prediction == outcome; if( bm_correct || !yg_correct ) { if( outcome ) { SINC2( *bht_entry ); } else { SDEC2( *bht_entry ); } } if( !bm_correct && !hit ) { pht_elt->tag = tag; pht_elt->counter = outcome ? 3 : 0; } else if ( hit ) { if( outcome ) { SINC2( pht_elt->counter ); } else { SDEC2( pht_elt->counter ); } } if( yg_correct ) g->correct++; else g->wrong++; g->ghr = ( ( g->ghr << 1 ) | outcome ) & g->pht_mask; return yg_correct; } typedef struct { int correct, wrong; double accuracy; /* Set after call to bp_gshare_accuracy. */ unsigned int ghr; char *pht; int history_length; int pht_size; int pht_mask; int capacity_bits; } BP_gshare; void bp_gshare_reset(BP_gshare *g); /* * Create and return a new branch predictor. */ BP_gshare* bp_gshare_new(int history_length) { BP_gshare* const g = (BP_gshare*) malloc(sizeof(BP_gshare)); g->history_length = history_length; g->pht_size = 1 << history_length; g->pht_mask = g->pht_size - 1; g->pht = (char*) malloc(g->pht_size); g->capacity_bits = g->pht_size * 2; bp_gshare_reset(g); return g; } /* * Reset branch predictor tables and statistics. */ void bp_gshare_reset(BP_gshare *g) { g->ghr = 0; g->correct = g->wrong = 0; g->accuracy = -1; bzero(g->pht,g->pht_size); } /* * Predict outcome of branch and update GHR using actual outcome. * * Method specifies how to update GHR: BPU_Include for gshare, BPU_Exclude * for bimodal, and BPU_Pad for ??. * Return 1 if prediction correct and 0 if prediction wrong. */ int bp_gshare_predict (BP_gshare *g, unsigned int pc, int outcome, BP_GHR_Update_Method method) { char* const pht_entry = &g->pht[ ( g->ghr ^ ( pc >> 2 ) ) & g->pht_mask ]; const int prediction = pht_entry[0] > 1; if( prediction == outcome ) g->correct++; else g->wrong++; if( outcome && pht_entry[0] < 3 ) pht_entry[0]++; if( !outcome && pht_entry[0] > 0 ) pht_entry[0]--; if( method == BPU_Include ) g->ghr = ( ( g->ghr << 1 ) | outcome ) & g->pht_mask; else if ( method == BPU_Pad ) g->ghr = ( g->ghr << 1 ) & g->pht_mask; return prediction == outcome; } /* * Return accuracy of predictor. Once this is called the accuracy is * available in g->accuracy. */ double bp_gshare_accuracy(BP_gshare *g) { if( g->accuracy < 0 ) { const double exec = g->correct + g->wrong; g->accuracy = exec ? g->correct / exec : 0; } return g->accuracy; } typedef struct { int correct, wrong; double accuracy; /* Set after call to bp_gshare_accuracy. */ unsigned int ghr; unsigned int *gpr; // Global path register: Holds addresses of recent br. int gpr_idx; char** weight_tables; char** weight_entries; // Entries used to make a particular prediction. int weight_bits; int weight_max, weight_min; char *bt; // Bias table. int bt_lg, bt_size, bt_mask; int spec_wt_per_pred; // Weights per prediction. int spec_b_per_wt; // Branches per weight. int spec_a; // If non-zero, use predicted branch pc for wt idx. int spec_p; int spec_g_mask; int history_length; int wt_lg, wt_size, wt_mask; int capacity_bits; // Storage needed to implement. } BP_perceptron; BP_perceptron* bp_perceptron_new(int history_lengthp, int bt_lg, int b_per_wt, int tight) { BP_perceptron* const bp = (BP_perceptron*) malloc(sizeof(BP_perceptron)); bp->weight_bits = 7; bp->spec_b_per_wt = b_per_wt; bp->spec_wt_per_pred = history_lengthp / bp->spec_b_per_wt; bp->spec_g_mask = ( 1 << bp->spec_b_per_wt ) - 1; bp->spec_p = 0; bp->spec_a = 1; bp->bt_lg = bt_lg; bp->bt_size = 1 << bt_lg; bp->bt_mask = bp->bt_size - 1; bp->wt_lg = tight || bt_lg < bp->spec_b_per_wt ? bp->spec_b_per_wt : bt_lg; bp->wt_size = 1 << bp->wt_lg; bp->wt_mask = bp->wt_size - 1; bp->history_length = bp->spec_wt_per_pred * bp->spec_b_per_wt; bp->weight_tables = (char**) malloc( ( 1 + bp->spec_wt_per_pred ) * sizeof(char*) ); bp->weight_entries = (char**) malloc( ( 1 + bp->spec_wt_per_pred ) * sizeof(char*) ); bp->capacity_bits = bp->wt_size * bp->spec_wt_per_pred * bp->weight_bits + bp->bt_size * bp->weight_bits; bp->weight_max = 1 << ( bp->weight_bits - 1 ); bp->weight_min = - bp->weight_max; int i; for(i=0; i<bp->spec_wt_per_pred; i++) bp->weight_tables[i] = (char*) calloc( bp->wt_size, 1 ); bp->weight_tables[i] = (char*) calloc( bp->bt_size, 1 ); bp->gpr = (unsigned int*) calloc( bp->history_length, sizeof(int) ); bp->gpr_idx = 0; bp->correct = bp->wrong = 0; bp->accuracy = -1; return bp; } char* bp_perceptron_config(BP_perceptron *bp) { static char buffer[100]; snprintf(buffer,sizeof(buffer),"%s%s%s-%2d/%2d-%2d-%2d", bp->spec_a ? "A" : "_", bp->spec_p ? "P" : "_", bp->spec_g_mask ? "G" : "_", bp->history_length, bp->spec_wt_per_pred, bp->wt_lg, bp->bt_lg ); return buffer; } #define INC(a) { if( (a) < g->weight_max ) (a)++; } #define DEC(a) { if( (a) > g->weight_min ) (a)--; } int bp_perceptron_predict(BP_perceptron *g, unsigned int pc, int outcome) { int ghr = g->ghr; int gpr_idx = g->gpr_idx; char **wt_ptrs = g->weight_entries; const int idx_init_val = g->spec_a ? pc >> 2 : 0; wt_ptrs[g->spec_wt_per_pred] = &g->weight_tables[g->spec_wt_per_pred][(pc>>2)&g->bt_mask]; int i; for( i = 0; i < g->spec_wt_per_pred; i++) { int idx = idx_init_val; if( g->spec_p ) idx = idx ^ g->gpr[gpr_idx]; if( g->spec_g_mask ) idx = idx ^ ( ghr & g->spec_g_mask ); char* const wt = g->weight_tables[i]; wt_ptrs[i] = &wt[ idx & g->wt_mask ]; ghr >>= g->spec_b_per_wt; gpr_idx -= g->spec_b_per_wt; if( gpr_idx < 0 ) gpr_idx += g->history_length; } int score = 0; for( i = 0; i <= g->spec_wt_per_pred; i++) score += *wt_ptrs[i]; const int pred = score >= 0; const int correct = pred == outcome; const int abs_score = score < 0 ? -score : score; if( correct ) g->correct++; else g->wrong++; if( !correct || abs_score < 10 ) { if( outcome ) for( i = 0; i <= g->spec_wt_per_pred; i++) { INC( *wt_ptrs[i] ); } else for( i = 0; i <= g->spec_wt_per_pred; i++) { DEC( *wt_ptrs[i] ); } } g->gpr_idx++; if( g->gpr_idx == g->history_length ) g->gpr_idx = 0; g->ghr = ( g->ghr << 1 ) | outcome; g->gpr[g->gpr_idx] = pc >> 2; return correct; } double bp_perceptron_accuracy(BP_perceptron *g) { if( g->accuracy < 0 ) { const double exec = g->correct + g->wrong; g->accuracy = exec ? g->correct / exec : 0; } return g->accuracy; } /* ************************************************************************* */ /* * Branch Information Table (BIT) * * Holds information about each branch instruction in program. */ typedef struct { unsigned int pc; /* Address of branch. */ int rank; /* Rank in number of executions: 0 most frequent, etc. */ int round; int correct_p1, wrong_p1; /* Prediction results for phase-one gshare. */ int correct_bm, wrong_bm; /* Prediction results for phase-one bimodal. */ int correct_yg, wrong_yg; int correct_pr, wrong_pr; int icount; /* Number of time this branch executes. */ double icount_inv; double icount_frac; /* icount / executions of all branches. */ double scaled_change; } BIT_Elt; typedef struct { int icount; /* Number of instructions. */ int icount_branch; /* Number of branches. */ char *bm_command; int print_amt_max; /* Maximum number of insn to print. */ int history_length; /* Length of ghr. */ /* Branch information table size variables. */ int bit_lg; /* Log_2 size. (Input or constant.) */ int bit_size; /* Computed. */ int bit_mask; /* Computed. */ int bit_occ; /* Number of branches in table. */ BP_yags *yg_phase_one; BP_perceptron *pr_phase_one; /* perceptron */ BP_perceptron *pr_array[20]; BP_gshare *gs_phase_one; /* gshare */ BP_gshare *bm_phase_one; /* bimodal */ BIT_Elt *bit; /* Branch information table. */ BIT_Elt **bit_sorted; /* Branch information table sorted by executions. */ } Sim_Info; Sim_Info sim_info; /* * Return branch information table entry for PC. * * If there is a collision (two branches map to same element) table * entry is reset. This should happen rarely, if it does increase * sim_info.bit_lg. */ BIT_Elt* bit_get(Sim_Info *si, unsigned int pc) { BIT_Elt* const bi = &si->bit[ si->bit_mask & ( ( pc ^ ( pc >> si->bit_lg ) ^ ( pc >> 8 ) ) >> 2 )]; if( bi->pc != pc ) { bi->pc = pc; bi->correct_p1 = bi->wrong_p1 = 0; bi->correct_bm = bi->wrong_bm = 0; } return bi; } /* * Return ordering of two BIT entries (for qsort). * * When assigned, sort order is number of times branch executes. */ int comp_mispred(const void *ap, const void *bp) { BIT_Elt* const a = *((BIT_Elt**)ap); BIT_Elt* const b = *((BIT_Elt**)bp); const int order = b->correct_p1 + b->wrong_p1 - a->wrong_p1 - a->correct_p1; /* If it's a tie put them in prog order. */ if( order == 0 ) return a->pc - b->pc; return order; } /* ************************************************************************* */ /* * Simulation Routines * */ void sim_init(int argv, char **argc, char *bm_command) { int i; if( argv < 2 ) { fprintf(stderr,"Usage: %s HISTORY_LEN [WT_LG] [BHT_LG]\n",argc[0]); exit(1); } /* Tune this for preference. (Number of insn to print info about.) */ sim_info.print_amt_max = 20; /* Make this large enough so branches don't share entries. */ sim_info.bit_lg = 19; // Size of branch information table. sim_info.history_length = atoi(argc[1]); int wt_lg = argv > 2 ? atoi(argc[2]) : sim_info.history_length; int bht_lg = argv > 3 ? atoi(argc[3]) : sim_info.history_length; sim_info.bit_size = 1 << sim_info.bit_lg; sim_info.bit_mask = sim_info.bit_size - 1; sim_info.bit = (BIT_Elt*) calloc(sim_info.bit_size,sizeof(sim_info.bit[0])); sim_info.bit_sorted = (BIT_Elt**) malloc(sim_info.bit_size*sizeof(void*)); for(i=0; i<sim_info.bit_size; i++) sim_info.bit_sorted[i] = &sim_info.bit[i]; sim_info.gs_phase_one = bp_gshare_new(sim_info.history_length); sim_info.bm_phase_one = bp_gshare_new(sim_info.history_length); sim_info.yg_phase_one = bp_yags_new(sim_info.history_length,bht_lg); sim_info.pr_phase_one = bp_perceptron_new(32,sim_info.history_length-3,4,0); BP_perceptron **bpptr = sim_info.pr_array; for(i=0; i < 6; i++) { const int wt_table_lg = sim_info.history_length - i; const int b_per_wt = sim_info.history_length >> i; if( b_per_wt ) *bpptr++ = bp_perceptron_new(sim_info.history_length,wt_table_lg,b_per_wt,0); const int b_per_wt_32 = 32 >> i; if( b_per_wt_32 && b_per_wt_32 < 32 ) *bpptr++ = bp_perceptron_new(32,wt_table_lg,b_per_wt_32,0); } sim_info.icount_branch = 0; sim_info.bm_command = bm_command; } /* * Phase One * * Compute gshare and bimodal prediction accuracies. * Record per-instruction prediction accuracies. */ void sim_analyzer_phase_one() { Trace *trace_entry; while( ( trace_entry = shade_step() ) ) { /* One iteration per (traced) instruction. */ /* ih: Instruction hash, sort of an ideal opcode. */ const int ih = trace_entry->tr_ih; sim_info.icount++; if( !ih_isbranch(ih) ) continue; // Continue if not an integer branch. sim_info.icount_branch++; int i; for(i=0; i<sim_info.history_length && sim_info.pr_array[i]; i++) bp_perceptron_predict (sim_info.pr_array[i], trace_entry->tr_pc, trace_entry->tr_taken); const int pr_correct = bp_perceptron_predict (sim_info.pr_phase_one, trace_entry->tr_pc, trace_entry->tr_taken); /* Bimodal Prediction */ const int bm_correct = bp_gshare_predict (sim_info.bm_phase_one, trace_entry->tr_pc, trace_entry->tr_taken, BPU_Exclude); /* gshare Prediction */ const int gs_correct = bp_gshare_predict (sim_info.gs_phase_one, trace_entry->tr_pc, trace_entry->tr_taken, BPU_Include); /* yags Prediction */ const int yg_correct = bp_yags_predict (sim_info.yg_phase_one, trace_entry->tr_pc, trace_entry->tr_taken); /* Record outcome for this branch. */ BIT_Elt* const bi = bit_get(&sim_info,trace_entry->tr_pc); if( gs_correct ) bi->correct_p1++; else bi->wrong_p1++; if( bm_correct ) bi->correct_bm++; else bi->wrong_bm++; if( pr_correct ) bi->correct_pr++; else bi->wrong_pr++; if( yg_correct ) bi->correct_yg++; else bi->wrong_yg++; } } void sim_after_phase_one() { BP_ACCURACY(sim_info.gs_phase_one); BP_ACCURACY(sim_info.bm_phase_one); BP_ACCURACY(sim_info.pr_phase_one); BP_ACCURACY(sim_info.yg_phase_one); /* Prepare sorted list of BIT entries. */ qsort(sim_info.bit_sorted,sim_info.bit_size, sizeof(sim_info.bit_sorted[0]),comp_mispred); const double icount_all_inv = 1.0 / sim_info.icount_branch; int i; for(i = 0; i < sim_info.bit_size; i++) { BIT_Elt* const bi = sim_info.bit_sorted[i]; bi->icount = bi->correct_p1 + bi->wrong_p1; if( bi->icount == 0 ) break; bi->rank = i; bi->icount_frac = bi->icount * icount_all_inv; bi->icount_inv = 1.0 / bi->icount; } sim_info.bit_occ = i; sim_results_phase_one_write(); } /* ************************************************************************* */ /* * Write Results * */ void sim_results_phase_one_write() { BP_gshare* const gs = sim_info.gs_phase_one; BP_gshare* const bm = sim_info.bm_phase_one; BP_perceptron* const pr = sim_info.pr_phase_one; BP_yags* const yg = sim_info.yg_phase_one; int i; printf("Found %d instructions of which %d were branches.\n", sim_info.icount, sim_info.icount_branch); for(i=0; i<sim_info.history_length && sim_info.pr_array[i]; i++) printf("Percep %s acc %.4f size %d\n", bp_perceptron_config(sim_info.pr_array[i]), BP_ACCURACY(sim_info.pr_array[i]), sim_info.pr_array[i]->capacity_bits ); printf("Perceptron config: %s, size %d bits\n", bp_perceptron_config(pr), pr->capacity_bits); printf("YAGS config %s, size %d bits\n", bp_yags_config(yg),yg->capacity_bits); printf("gshare size %d bits, bm size %d bits.\n", gs->capacity_bits, bm->capacity_bits); printf("For ghr len %d: ", sim_info.history_length); printf("Bimodal %.4f, gshare %.4f, yags %.4f, percep %.4f\n", bm->accuracy, gs->accuracy, yg->accuracy, pr->accuracy); printf("Rnk Insn Addr bimodal GS:Corr Wrong gshare yags percep\n"); for(i=0; i<sim_info.print_amt_max; i++) { BIT_Elt* const bi = sim_info.bit_sorted[i]; const int c = bi->correct_p1; const int w = bi->wrong_p1; const double cb = bi->correct_bm; if( c + w == 0 ) break; printf("%3d 0x%08x %.4f %7d %7d %.4f %.4f %.4f\n", bi->rank, bi->pc, cb/(c+w), c, w, c/((double)(c+w)), ((double)bi->correct_yg) / (c+w), ((double)bi->correct_pr) / (c+w) ); } }