diff --git a/def/pref.param.def b/def/pref.param.def index fd40581..c868a39 100644 --- a/def/pref.param.def +++ b/def/pref.param.def @@ -127,3 +127,218 @@ param< PREF_STRIDE_SINGLE_STRIDE_MODE, pref_stride_single_stride_mode, bool, fal param< PREF_STRIDE_ON, pref_stride_on, bool, false > param< PREF_STRIDE_ON_MEDIUM_CORE, pref_stride_on_medium_core, bool, false > param< PREF_STRIDE_ON_LARGE_CORE, pref_stride_on_large_core, bool, false > + +/* added on 2024-03-26 +/* ===================================================================================== + * Prefetcher + ==================================================================================== */ +param< PREF_2DC_ON, pref_2dc_on, bool, false > +param< PREF_2DC_ON_MEDIUM_CORE, pref_2dc_on_medium_core, bool, false > +param< PREF_2DC_ON_LARGE_CORE, pref_2dc_on_large_core, bool, false > + +param< DEBUG_PREF_2DC, debug_pref_2dc, bool, false > +DEF_PARAM(pref_2dc_on , PREF_2DC_ON , bool , bool , false , ) +DEF_PARAM(debug_pref_2dc , DEBUG_PREF_2DC , bool , bool , false , ) + +param< PREF_2DC_CACHE_SIZE, pref_2dc_cache_size, uns, (1 * 1024) > +param< PREF_2DC_CACHE_ASSOC, pref_2dc_cache_assoc, uns, 4 > +param< PREF_2DC_CACHE_LINE_SIZE, pref_2dc_cache_line_size, uns, 1 > + +param< PREF_2DC_MAX_DEGREE, pref_2dc_max_degree, uns, 32 > + +param< PREF_2DC_DEGREE, pref_2dc_degree, uns, 16 > + +param< PREF_2DC_ZONE_SHIFT, pref_2dc_zone_shift, uns, 10 > + +param< PREF_2DC_TAG_SIZE, pref_2dc_tag_size, uns, 16 > + +param< PREF_2DC_NUM_REGIONS, pref_2dc_num_regions, uns, 64 > + +param< PREF_2DC_REGION_HASH, pref_2dc_region_hash, uns, 61 > + +/* should be checked with Hyesoon */ +param< PREF_2DC_BANKS, pref_2dc_banks, uns, 32 > + +param< DEBUG_PREF_GHB, debug_pref_ghb, bool, false > +// the size of the global history buffer +param< PREF_GHB_BUFFER_N, pref_ghb_buffer_n, uns, 256 > + // size of ghb index table +param< PREF_GHB_INDEX_N, pref_ghb_index_n, uns, 128 > + // number of high order bits to use to determine the czone + // 
12 works best +param< PREF_GHB_CZONE_BITS, pref_ghb_czone_bits, uns, 12 > + // Number of prefetches sent out +param< PREF_GHB_DEGREE, pref_ghb_degree, uns, 16 > + +param< PREF_GHB_MAX_DEGREE, pref_ghb_max_degree, uns, 32 > + +/* Heterogeneous Features */ +param< PREF_GHB_ON, pref_ghb_on, bool, false > + +param< PREF_GHB_ON_MEDIUM_CORE, pref_ghb_on_medium_core, bool, false > + +param< PREF_GHB_ON_LARGE_CORE, pref_ghb_on_large_core, bool, false > + + +param< PREF_PHASE_ON, pref_phase_on, bool, false > +param< PREF_PHASE_ON_MEDIUM_CORE, pref_phase_on_medium_core, bool, false > +param< PREF_PHASE_ON_LARGE_CORE, pref_phase_on_large_core, bool, false > + +param< PREF_PHASE_STUDY, pref_phase_study, bool, false > +param< DEBUG_PREF_PHASE, debug_pref_phase, bool, false > + +param< PREF_PHASE_PRIME_HASH, pref_phase_prime_hash, uns, 16381 > + +param< PREF_PHASE_INFOSIZE, pref_phase_infosize, uns, 16384 > + +param< PREF_PHASE_LOG2REGIONSIZE, pref_phase_log2regionsize, uns, 12 > +param< PREF_PHASE_REGIONENTRIES, pref_phase_regionentries, uns, 64 > + +param< PREF_PHASE_TRACKEDREGIONS, pref_phase_trackedregions, uns, 32 > + +param< PREF_PHASE_INTERVAL, pref_phase_interval, uns, 100000 > + +param< PREF_PHASE_TABLE_SIZE, pref_phase_table_size, uns, 64 > +param< PREF_PHASE_MAXDIFF_THRESH, pref_phase_maxdiff_thresh, uns, 64 > + +param< PREF_PHASE_MIN_MISSES, pref_phase_min_misses, uns, 64 > + +param< PREF_PHASE_MISSPER, pref_phase_missper, float, 0.4 > + +param< PREF_STRIDEPC_ON, pref_stridepc_on, bool, false > +param< PREF_STRIDEPC_ON_MEDIUM_CORE, pref_stridepc_on_medium_core, bool, false > +param< PREF_STRIDEPC_ON_LARGE_CORE, pref_stridepc_on_large_core, bool, false > + +param< DEBUG_PREF_STRIDEPC, debug_pref_stridepc, bool, false > +// the size of the stridepc table +param< PREF_STRIDEPC_TABLE_N, pref_stridepc_table_n, uns, 256 > + // Number of prefetches sent out on a miss/prefetch +param< PREF_STRIDEPC_DEGREE, pref_stridepc_degree, uns, 4 > +param< PREF_STRIDEPC_DISTANCE, 
pref_stridepc_distance, uns, 16 > + // Useloadaddr -> Do we stride based on load address +param< PREF_STRIDEPC_USELOADADDR, pref_stridepc_useloadaddr, bool, false > + +param< PREF_STRIDEPC_TRAINNUM, pref_stridepc_trainnum, uns, 2 > + +param< PREF_STRIDEPC_STARTDIS, pref_stridepc_startdis, uns, 4 > + + +// the number of stream buffer +param< STREAM_BUFFER_N, stream_buffer_n, uns, 256 > + // how many prefetch is requested every miss +param< STREAM_PREFETCH_N, stream_prefetch_n, uns, 2 > + // the fist stream prefetch request +param< STREAM_START_DIS, stream_start_dis, uns, 5 > + // how far the stream prefetcher request the prefetcher +param< STREAM_LENGTH, stream_length, uns, 16 > +param< STREAM_TRAIN_LENGTH, stream_train_length, uns, 16 > +param< STREAM_TRAIN_NUM, stream_train_num, uns, 1 > +param< STREAM_ACC_THROTTLE, stream_acc_throttle, bool, false > +param< PREF_ACC_USE_CACHE, pref_acc_use_cache, bool, false > +param< STREAM_STALL_ON_QUEUE_FULL, stream_stall_on_queue_full, bool, false > +param< STREAM_L1Q_DEMAND_RESERVE, stream_l1q_demand_reserve, uns, 0 > + +param + + // the number of prefetcher request every cycle +param< PREF_SCHEDULE_NUM, pref_schedule_num, uns, 4 > + // filter for training the same miss address +param< TRAIN_FILTER_SIZE, train_filter_size, uns, 32 > + // how many prefetching request can be buffered +param< PREF_REQ_Q_SIZE, pref_req_q_size, uns, 64 > +param< STREAM_CREATE_ON_DC_MISS,stream_create_on_dc_miss , uns, 64 > +param< STREAM_CREATE_ON_L1_MISS, stream_create_on_l1_miss, bool, true > +param< STREAM_TRAIN_ON_WRONGPATH, stream_train_on_wrongpath, bool, true > +param< STREAM_CREATE_ON_WRONGPATH, stream_create_on_wrongpath, bool, true > +param< STREAM_PREF_INTO_DCACHE, stream_pref_into_dcache, bool, false > + +param< STREAM_TRAIN_ON_DC_MISS, stream_train_on_dc_miss, bool, true > +param< REMOVE_REDUNDANT_STREAM, remove_redundant_stream, bool, false > + + + + // DEF_PARAM(l2hit_stream_pref_on , L2HIT_STREAM_PREF_ON , bool , bool ,false , ) 
// l0 prefetcher request +param< L2HIT_STREAM_SCHEDULE_NUM, l2hit_stream_schedule_num, uns, 4 > + +param< PREF_REQ_QUEUE_FILTER_ON, pref_req_queue_filter_on, bool, false > +param< HW_PREF_HIT_TRAIN_STREAM, hw_pref_hit_train_stream, bool, false > + +param< L2HIT_STREAM_BUFFER_N, l2hit_stream_buffer_n, uns, 32 > +param< L2HIT_STREAM_PREFETCH_N, l2hit_stream_prefetch_n, uns, 2 > + +param< L2HIT_STREAM_L2MISS_DROP, l2hit_stream_l2miss_drop, bool, true > + + // the fist stream prefetch request +param< L2HIT_STREAM_START_DIS, l2hit_stream_start_dis, uns, 2 > + // how far the stream prefetcher request the prefetcher +param< L2HIT_STREAM_LENGTH, l2hit_stream_length, uns, 8 > + +param< L2HIT_PREF_REQ_Q_SIZE, l2hit_pref_req_q_size, uns, 128 > +param< L2HIT_L2ACCESS_REQ_Q_SIZE, l2hit_l2access_req_q_size, uns, 64 > + + +param< PREF_STREAM_ACCPERSTREAM, pref_stream_accperstream, bool, false > + +param< PREF_ACC_ON, pref_acc_on, bool, false > +// Number of cache lines monitored in a given region +param< PREF_ACC_REGION_SIZE, pref_acc_region_size, uns, 256 > +// Number of distinct regions +param< PREF_ACC_NUM_REGIONS, pref_acc_num_regions, uns, 128 > + +param< PREF_ACC_INCDEC_LENGTH,pref_acc_incdec_length, bool, false > +param< PREF_TRAIN_THRESH_1, pref_train_thresh_1, float, 0.25 > +param< PREF_TRAIN_THRESH_2, pref_train_thresh_2, float, 0.50 > +param< PREF_TRAIN_THRESH_3, pref_train_thresh_3, float, 0.75 > + +param< PREF_STREAM_TRAIN_NUM_0, tream_train_num_0, uns, 6 > +param< PREF_STREAM_TRAIN_NUM_1, tream_train_num_1, uns, 4 > +param< PREF_STREAM_TRAIN_NUM_2, tream_train_num_2, uns, 2 > +param< PREF_STREAM_TRAIN_NUM_3, tream_train_num_3, uns, 1 > + +param< PREF_ACC_TRAIN_ACCOFFSET_0, pref_acc_train_accoffset_0, float, -0.4 > +param< PREF_ACC_TRAIN_ACCOFFSET_1, pref_acc_train_accoffset_1, float, -0.2 > +param< PREF_ACC_TRAIN_ACCOFFSET_2, pref_acc_train_accoffset_2, float, -0.1 > +param< PREF_ACC_TRAIN_ACCOFFSET_3, pref_acc_train_accoffset_3, float, +0.1 > + +param< 
PREF_ACC_DISTANCE_1, pref_acc_distance_1, uns, 3 > +param< PREF_ACC_DISTANCE_2, pref_acc_distance_2, uns, 8 > +param< PREF_ACC_DISTANCE_3, pref_acc_distance_3, uns, 10 > +param< PREF_ACC_DISTANCE_4, pref_acc_distance_4, uns, 12 > +param< PREF_ACC_DISTANCE_5, pref_acc_distance_5, uns, 16 > +param< PREF_ACC_DISTANCE_6, pref_acc_distance_6, uns, 24 > +param< PREF_ACC_DISTANCE_7, pref_acc_distance_7, uns, 42 > +param< PREF_ACC_DISTANCE_8, pref_acc_distance_8, uns, 64 > +param< PREF_ACC_DISTANCE_9, pref_acc_distance_9, uns, 96 > +param< PREF_ACC_DISTANCE_10, pref_acc_distance_10, uns, 128 > + +param< PREF_ACC_REGION_MOVE_ON, pref_acc_region_move_on, bool, true > +param< PREF_ACC_REGION_MOVE_FRACT, pref_acc_region_move_fract, float, 0.125 > + +param< PREF_ACC_CREATE_REG_ON_L2_ACCESS, pref_acc_create_reg_on_l2_access, bool, false > + +param< PREF_ACC_USE_REGION, pref_acc_use_region, bool, false > + +param< PREF_ACC_UPDATE_DELAY, pref_acc_update_delay, uns, 50 > +param< PREF_ACC_USE_OVERALLALSO, pref_acc_use_overallalso, bool, false > +param< PREF_ACC_USE_ONLYGLOBAL, pref_acc_use_onlyglobal, bool, false > + + // MAX DISTANCE FOR ACC BASED THROTTLING +param< PREF_STREAM_MAX_DISTANCE, pref_stream_max_distance, uns, 64 > +param< PREF_STREAM_DYN_TRAIN_ON, pref_stream_dyn_train_on, bool, true > +param< PREF_STREAM_DYN_DIST_ON, pref_stream_dyn_dist_on, bool, true > + + +/* Heterogeneous Feature */ +param + +param + +param + + +param< PREF_THROTTLE_ON, pref_throttle_on, bool, false > +param< PREF_THROTTLEFB_ON, pref_throttlefb_on, bool, false > +param< PREF_ACC_THRESH_4, pref_acc_thresh_4, float, 0.40 > +param< PREF_ACCRATIOTHROTTLE, pref_accratiothrottle, bool, false > +param< PREF_ACCRATIO_1, pref_accratio_1, float, 0.70 > +param \ No newline at end of file diff --git a/src/hw_prefetcher/pref_2dc.cc b/src/hw_prefetcher/pref_2dc.cc new file mode 100644 index 0000000..b6d42a3 --- /dev/null +++ b/src/hw_prefetcher/pref_2dc.cc @@ -0,0 +1,287 @@ 
+/***************************************************************************************
+ * File : pref_2dc.cc
+ * Author : Santhosh Srinath
+ * Date : 1/19/2006
+ * CVS : $Id: pref_2dc.cc,v 1.2 2008/09/12 03:42:01 kacear Exp $:
+ * Description : 2-delta correlation hardware prefetcher
+ ***************************************************************************************/
+
+
+#include "pref_2dc.h"
+
+#include "../global_defs.h"
+#include "../global_types.h"
+#include "../debug_macros.h"
+
+#include "../utils.h"
+#include "../assert_macros.h"
+#include "../uop.h"
+
+#include "../cache.h"
+#include "../statistics.h"
+#include "../memory.h"
+#include "../pref_common.h"
+
+#include "../all_knobs.h"
+
+#include "../core.h"
+
+/*
+  2dc_prefetcher : 2 delta-correlation prefetcher
+
+  O.k... So far 2-delta correlation prefetchers have just gone for
+  the basic approach - 2-d table. So we are implementing a cache like
+  table which can achieve most of the benefits from a much smaller
+  structure.
+
+  Implementation -> Take the deltas and the PC, and the address and
+  come up with a hash function that works. Use this to access the
+  cache.
+*/
+
+/**************************************************************************************/
+/* Macros */
+// Same form as the GHB prefetcher: the debug switch is read from the knob container.
+#define DEBUG(args...) _DEBUG(*m_simBase->m_knobs->KNOB_DEBUG_PREF_2DC, ## args)
+
+
+/**************************************************************************************/
+/* Global variables */
+// Pref_2DC * tdc_hwp;
+
+
+/*
+ * Builds a 2DC prefetcher bound to the shared prefetch framework `hcc`.
+ * `type` selects which per-core-size enable knob is copied into knob_enable.
+ * Nothing is allocated here; see init_func().
+ */
+pref_2dc_c::pref_2dc_c(hwp_common_c *hcc, Unit_Type type, macsim_c* simBase)
+: pref_base_c(simBase)
+{
+  name = "2dc";
+  hwp_type = Mem_To_UL1;
+  hwp_common = hcc;
+  switch (type) {
+    case UNIT_SMALL:
+      knob_enable = *m_simBase->m_knobs->KNOB_PREF_2DC_ON;
+      break;
+    case UNIT_MEDIUM:
+      knob_enable = *m_simBase->m_knobs->KNOB_PREF_2DC_ON_MEDIUM_CORE;
+      break;
+    case UNIT_LARGE:
+      knob_enable = *m_simBase->m_knobs->KNOB_PREF_2DC_ON_LARGE_CORE;
+      break;
+  }
+
+  // init_func() returns early when the prefetcher is disabled, so keep the
+  // owning pointers in a state the destructor can release safely.
+  cache = NULL;
+  regions = NULL;
+
+  l2_miss = true;
+  l2_pref_hit = true;
+}
+
+
+/* Releases the region table and the delta cache allocated by init_func(). */
+pref_2dc_c::~pref_2dc_c()
+{
+  free(regions);  // calloc'ed in init_func(); free(NULL) is a no-op
+  delete cache;   // new'ed in init_func(); delete NULL is a no-op
+}
+
+/*
+ * Allocates the per-region delta history and the delta cache, and loads the
+ * initial prefetch degree from the knobs. Does nothing when knob_enable is false.
+ */
+void pref_2dc_c::init_func(int core_id)
+{
+  if (!knob_enable)
+    return;
+
+  // NOTE(review): self-assignment of the parameter (no effect); possibly meant
+  // to store the id in a member - confirm against pref_base_c.
+  core_id = core_id;
+
+  // CHECKME
+  // tdc_hwp = (Pref_2DC*)malloc(sizeof(Pref_2DC));
+  this->hwp_info->enabled = true;
+
+  // zero-initialised, so every region starts with deltaA/B/C == 0
+  this->regions = (Pref_2DC_Region*)calloc(*m_simBase->m_knobs->KNOB_PREF_2DC_NUM_REGIONS, sizeof(Pref_2DC_Region));
+
+  this->last_access = 0;
+  this->last_loadPC = 0;
+
+  // CHECKME
+  // init_cache(&this->cache, "PREF_2DC_CACHE", *m_simBase->m_knobs->KNOB_PREF_2DC_CACHE_SIZE, *m_simBase->m_knobs->KNOB_PREF_2DC_CACHE_ASSOC, *m_simBase->m_knobs->KNOB_PREF_2DC_CACHE_LINE_SIZE, sizeof(Pref_2DC_Cache_Data), *m_simBase->m_knobs->KNOB_REPL_true_LRU);
+  cache = new cache_c("PREF_2DC_CACHE",
+                      *m_simBase->m_knobs->KNOB_PREF_2DC_CACHE_SIZE,
+                      *m_simBase->m_knobs->KNOB_PREF_2DC_CACHE_ASSOC,
+                      *m_simBase->m_knobs->KNOB_PREF_2DC_CACHE_LINE_SIZE,
+                      sizeof(Pref_2DC_Cache_Data),
+                      *m_simBase->m_knobs->KNOB_PREF_2DC_BANKS,
+                      false, -1, CACHE_DL2, false, 1, 0, m_simBase);
+
+  // NOTE(review): the divisor 4 is hard-coded; presumably the associativity - confirm.
+  this->cache_index_bits = log2_int(*m_simBase->m_knobs->KNOB_PREF_2DC_CACHE_SIZE/4);
+  this->hash_func = PREF_2DC_HASH_FUNC_DEFAULT;
+  this->pref_degree = *m_simBase->m_knobs->KNOB_PREF_2DC_DEGREE;
+}
+
+
+/* L2 prefetch-hit hook: trains with l2_hit == true (never inserts new cache entries). */
+void pref_2dc_c::l2_pref_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop)
+{
+  pref_2dc_l2_train(tid, lineAddr, loadPC, true);
+}
+
+
+/* L2 miss hook: trains with l2_hit == false (may insert new cache entries). */
+void pref_2dc_c::l2_miss_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop)
+{
+  pref_2dc_l2_train(tid, lineAddr, loadPC, false);
+}
+
+
+/*
+ * Trains on one access and issues prefetches.
+ *
+ * Training: the delta to the previous access is stored in the delta cache under
+ * the hash of (previous access, previous PC, the region's two latest deltas);
+ * the region's three-deep delta history is then shifted.
+ * Prediction: if the three latest deltas are equal the stream is treated as a
+ * stride; otherwise the delta cache is walked, each hit supplying the next delta.
+ * At most pref_degree requests are queued per call.
+ */
+void pref_2dc_c::pref_2dc_l2_train(int tid, Addr lineAddr, Addr loadPC, bool l2_hit)
+{
+  int delta;
+  Addr hash;
+  Addr lineIndex = lineAddr >> LOG2_DCACHE_LINE_SIZE;
+  Addr dummy_lineaddr;
+  // The region slot is chosen modulo REGION_HASH, so REGION_HASH must not
+  // exceed NUM_REGIONS (the size of the regions array).
+  Pref_2DC_Region * region = &this->regions[(lineIndex >> *m_simBase->m_knobs->KNOB_PREF_2DC_ZONE_SHIFT) % *m_simBase->m_knobs->KNOB_PREF_2DC_REGION_HASH];
+
+  if (this->last_access != 0) {
+    delta = lineIndex - this->last_access;
+    if (delta == 0) { // no point updating if we have the same address twice
+      return;
+    }
+    // update state of the cache for deltaA, deltaB
+    // no point inserting same deltas in. so if deltaA == deltaB and deltaB == delta then dont insert.
+    if (region->deltaA != 0 && region->deltaB != 0 && (!(region->deltaA == region->deltaB && region->deltaB == delta))) {
+      hash = pref_2dc_hash(this->last_access, this->last_loadPC, region->deltaA, region->deltaB);
+      // CHECKME
+      // FIXME (jaekyu, 4-26-2011) use appropriate application id
+      Pref_2DC_Cache_Data * data = (Pref_2DC_Cache_Data *)cache->access_cache(hash, &dummy_lineaddr, true, 0);
+      if (!data) {
+        Addr repl_addr;
+        if (!l2_hit) { //insert only on miss
+          data = (Pref_2DC_Cache_Data *)cache->insert_cache(hash, &dummy_lineaddr, &repl_addr, 0, false);
+        } else {
+          return;
+        }
+      }
+      data->delta = delta;
+    }
+    region->deltaC = region->deltaB;
+    region->deltaB = region->deltaA;
+    region->deltaA = delta;
+  }
+  this->last_access = lineIndex;
+  this->last_loadPC = loadPC;
+
+  if (region->deltaA == 0 || region->deltaB == 0) {
+    return;
+  } // No useful deltas yet
+
+  {
+    // Send out prefetches
+    Pref_2DC_Cache_Data * data;
+    uns num_pref_sent = 0;
+    int delta1 = region->deltaB;
+    int delta2 = region->deltaA;
+
+    // The 4th argument used to be `num_pref_sent = this->pref_degree`, an
+    // assignment that clobbered the loop counter (and was unsequenced with the
+    // read in the 3rd argument). It is a comparison, as in the GHB prefetcher.
+    // NOTE(review): with a `<` loop bound the comparison is never true; confirm
+    // the intended "last request of the set" semantics.
+    if (region->deltaA == region->deltaB && region->deltaB == region->deltaC) {
+      // Now just assume that this is a strided access and send out the next few.
+      for (; num_pref_sent < this->pref_degree; num_pref_sent++) {
+        lineIndex += region->deltaA;
+        hwp_common->pref_addto_l2req_queue_set(lineIndex, this->hwp_info->id, (num_pref_sent==0), (num_pref_sent==this->pref_degree), loadPC);
+      }
+    }
+    while (num_pref_sent < this->pref_degree) {
+      hash = pref_2dc_hash(lineIndex, loadPC, delta1, delta2);
+      // FIXME (jaekyu, 4-26-2011) use appropriate application id
+      data = (Pref_2DC_Cache_Data *)cache->access_cache(hash, &dummy_lineaddr, true, 0);
+      if (!data) { // no hit for this hash
+        return;
+      }
+      lineIndex += data->delta;
+
+      delta1 = delta2;
+      delta2 = data->delta;
+
+      hwp_common->pref_addto_l2req_queue_set(lineIndex, this->hwp_info->id, (num_pref_sent==0), (num_pref_sent==this->pref_degree), loadPC);
+      num_pref_sent++;
+    }
+  }
+}
+
+
+/*
+ * Maps the measured prefetch accuracy onto pref_degree (64/32/16/8/2) via the
+ * PREF_ACC_THRESH_* knobs and records accuracy/distance statistics.
+ */
+void pref_2dc_c::pref_2dc_throttle(void)
+{
+  int dyn_shift = 0;
+
+  float acc = hwp_common->pref_get_accuracy(this->hwp_info->id);
+
+  if (acc != 1.0) {
+    if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_1) {
+      dyn_shift += 2;
+    } else if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_2) {
+      dyn_shift += 1;
+    } else if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_3) {
+      dyn_shift = 0;
+    } else if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_4) {
+      dyn_shift = dyn_shift - 1;
+    } else {
+      dyn_shift = dyn_shift - 2;
+    }
+  }
+
+  // COLLECT STATS
+  if (acc > 0.9) {
+    STAT_EVENT(PREF_ACC_1);
+  } else if (acc > 0.8) {
+    STAT_EVENT(PREF_ACC_2);
+  } else if (acc > 0.7) {
+    STAT_EVENT(PREF_ACC_3);
+  } else if (acc > 0.6) {
+    STAT_EVENT(PREF_ACC_4);
+  } else if (acc > 0.5) {
+    STAT_EVENT(PREF_ACC_5);
+  } else if (acc > 0.4) {
+    STAT_EVENT(PREF_ACC_6);
+  } else if (acc > 0.3) {
+    STAT_EVENT(PREF_ACC_7);
+  } else if (acc > 0.2) {
+    STAT_EVENT(PREF_ACC_8);
+  } else if (acc > 0.1) {
+    STAT_EVENT(PREF_ACC_9);
+  } else {
+    STAT_EVENT(PREF_ACC_10);
+  }
+
+  if (acc==1.0){
+    this->pref_degree = 64;
+  } else {
+    if (dyn_shift >= 2 ) {
+      this->pref_degree = 64;
+      STAT_EVENT(PREF_DISTANCE_5);
+    } else if (dyn_shift == 1 ) {
+      this->pref_degree = 32;
+      STAT_EVENT(PREF_DISTANCE_4);
+    } else if (dyn_shift == 0 ) {
+      this->pref_degree = 16;
+      STAT_EVENT(PREF_DISTANCE_3);
+    } else if (dyn_shift == -1 ) {
+      this->pref_degree = 8;
+      STAT_EVENT(PREF_DISTANCE_2);
+    } else if (dyn_shift <= -2 ) {
+      this->pref_degree = 2;
+      STAT_EVENT(PREF_DISTANCE_1);
+    }
+  }
+}
+
+
+/*
+ * Builds the delta-cache lookup key.
+ * Index part: the low bits of deltaA, then the low bits of deltaB, together
+ * cache_index_bits wide. Tag part (placed above the index): XOR of the
+ * remaining delta bits and the zone number, masked to PREF_2DC_TAG_SIZE bits.
+ * loadPC is currently not part of the hash.
+ */
+Addr pref_2dc_c::pref_2dc_hash(Addr lineIndex, Addr loadPC, int deltaA, int deltaB)
+{
+  Addr res = 0;
+  uns cache_indexbitsA;
+  uns cache_indexbitsB;
+  uns tagbits;
+
+  if (0) { // to prevent compilation error, jaekyu (11-3-2009)
+    printf("%lld", loadPC);
+  }
+
+  switch(this->hash_func) {
+    case PREF_2DC_HASH_FUNC_DEFAULT:
+      // In this function, we just use the lower bits from each delta
+      // to form the hash.
+      cache_indexbitsA = this->cache_index_bits >> 1;
+      cache_indexbitsB = this->cache_index_bits - cache_indexbitsA;
+
+      tagbits = (((deltaA >> cache_indexbitsA) ^ (deltaB >> cache_indexbitsB) ^
+                  ( lineIndex >> *m_simBase->m_knobs->KNOB_PREF_2DC_ZONE_SHIFT) ) & N_BIT_MASK(*m_simBase->m_knobs->KNOB_PREF_2DC_TAG_SIZE));
+
+      res = (((deltaA & N_BIT_MASK(cache_indexbitsA)) |
+              ((deltaB & N_BIT_MASK(cache_indexbitsB)) << cache_indexbitsA)) |
+             (tagbits << this->cache_index_bits));
+      break;
+    default:
+      ASSERT(0);
+      break;
+  }
+  return res;
+}
\ No newline at end of file
diff --git a/src/hw_prefetcher/pref_2dc.h b/src/hw_prefetcher/pref_2dc.h
new file mode 100644
index 0000000..c3f76cc
--- /dev/null
+++ b/src/hw_prefetcher/pref_2dc.h
@@ -0,0 +1,72 @@
+/***************************************************************************************
+ * File : pref_2dc.h
+ * Author : Santhosh Srinath
+ * Date : 11/16/2004
+ * CVS : $Id: pref_2dc.h,v 1.1 2008/07/30 14:18:15 kacear Exp $:
+ * Description : 2-delta correlation hardware prefetcher (declarations)
+ ***************************************************************************************/
+#ifndef __PREF_2DC_H__
+#define __PREF_2DC_H__
+
+//#include "dcu.h"
+#include "../pref_common.h"
+#include "../pref.h"
+
+
+
+// Selects the hash used to index the delta cache.
+typedef enum Pref_2DC_HashFunc_Enum {
+  PREF_2DC_HASH_FUNC_DEFAULT,
+}Pref_2DC_HashFunc;
+
+
+// Payload of one delta-cache entry: the delta that followed the hashed history.
+typedef struct Pref_2DC_Cache_Data_struct {
+  int delta;
+}Pref_2DC_Cache_Data;
+
+
+// Per-region history of the three most recent deltas (deltaA is the newest).
+typedef struct Pref_2DC_Region_Struct {
+  int deltaA, deltaB, deltaC;
+}Pref_2DC_Region;
+
+
+class pref_2dc_c : public pref_base_c
+{
+  private:
+    // 2DC Cache
+    cache_c *cache;        // owned; created in init_func()
+    uns cache_index_bits;
+
+    uns pref_degree;       // max prefetches queued per training call
+    Addr last_access;      // line index of the previous trained access
+    Addr last_loadPC;
+    Pref_2DC_HashFunc hash_func;
+    Pref_2DC_Region * regions;  // owned; calloc'ed in init_func()
+
+    pref_2dc_c();
+
+  public:
+
+    pref_2dc_c(hwp_common_c *, Unit_Type, macsim_c* simBase);
+    ~pref_2dc_c();
+
+    /*************************************************************/
+    /* HWP Interface */
+    void init_func(int);
+    void done_func() {}
+    void l1_miss_func(int, Addr, Addr, uop_c *) {}
+    void l1_hit_func(int, Addr, Addr, uop_c *) {}
+    void l1_pref_hit_func(int, Addr, Addr, uop_c *) {}
+    void l2_miss_func(int, Addr, Addr, uop_c *);
+    void l2_hit_func(int, Addr, Addr, uop_c *) {}
+    void l2_pref_hit_func(int, Addr, Addr, uop_c *);
+
+    void pref_2dc_l2_train(int, Addr, Addr, bool);
+
+
+    /*************************************************************/
+    /* Misc functions */
+    void pref_2dc_throttle(void);
+    Addr pref_2dc_hash(Addr lineIndex, Addr loadPC, int deltaA, int deltaB);
+};
+
+#endif /* __PREF_2DC_H__*/
\ No newline at end of file
diff --git a/src/hw_prefetcher/pref_ghb.cc b/src/hw_prefetcher/pref_ghb.cc
new file mode 100644
index 0000000..72ff464
--- /dev/null
+++ b/src/hw_prefetcher/pref_ghb.cc
@@ -0,0 +1,418 @@
+/***************************************************************************************
+ * File : pref_ghb.cc
+ * Author : Santhosh Srinath
+ * Date : 11/16/2004
+ * CVS : $Id: pref_ghb.cc,v 1.2 2008/09/12 03:42:01 kacear Exp $:
+ * Description : Global History Buffer (C/DC) hardware prefetcher
+ ***************************************************************************************/
+
+
+#include "pref_ghb.h"
+
+#include "../global_defs.h"
+#include "../global_types.h"
+#include "../debug_macros.h"
+
+#include "../utils.h"
+#include "../assert_macros.h"
+#include "../uop.h"
+
+#include "../cache.h"
+#include "../statistics.h"
+#include "../memory.h"
+#include "../pref_common.h"
+#include "../core.h"
+
+#include "../all_knobs.h"
+
+/*
+  ghb_prefetcher : Global History Buffer prefetcher
+  Based on the C/DC prefetcher described in the AC/DC paper
+
+  Divides memory into "regions" - static partition of the address space
+  The index table is indexed by the region id and gives a pointer to the
+  last access in that region in the GHB
+*/
+
+/**************************************************************************************/
+/* Macros */
+#define DEBUG(args...) _DEBUG(*m_simBase->m_knobs->KNOB_DEBUG_PREF_GHB, ## args)
+
+// Previous slot of a circular buffer of `num` entries.
+#define CIRC_DEC(val, num) (((val) == 0) ? (num) - 1 : (val) - 1)
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////
+//pref_ghb_c::pref_ghb_c()
+//{
+//}
+
+
+// Releases the tables allocated by init_func().
+pref_ghb_c::~pref_ghb_c()
+{
+  // free(NULL) is a no-op, which covers a prefetcher that was never enabled
+  free(index_table);
+  free(ghb_buffer);
+  free(delta_buffer);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////
+// Builds a GHB prefetcher bound to `hcc`; `type` selects the enable knob.
+// Tables are allocated later, in init_func().
+pref_ghb_c::pref_ghb_c(hwp_common_c *hcc, Unit_Type type, macsim_c* simBase)
+: pref_base_c(simBase)
+{
+  name = "ghb";
+  hwp_type = Mem_To_UL1;
+  hwp_common = hcc;
+
+  switch (type) {
+    case UNIT_SMALL:
+      knob_enable = *m_simBase->m_knobs->KNOB_PREF_GHB_ON;
+      break;
+    case UNIT_MEDIUM:
+      knob_enable = *m_simBase->m_knobs->KNOB_PREF_GHB_ON_MEDIUM_CORE;
+      break;
+    case UNIT_LARGE:
+      knob_enable = *m_simBase->m_knobs->KNOB_PREF_GHB_ON_LARGE_CORE;
+      break;
+  }
+
+  // init_func() returns early when disabled; keep the owning pointers
+  // releasable by the destructor in that case.
+  index_table = NULL;
+  ghb_buffer = NULL;
+  delta_buffer = NULL;
+
+  shift_bit = LOG2_DCACHE_LINE_SIZE;
+  l1_miss = true;
+  l1_hit = true;
+  l2_miss = true;
+  l2_pref_hit = true;
+
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////
+// Allocates the index table, the history buffer and the delta scratch buffer,
+// and loads the initial prefetch degree. Does nothing when knob_enable is false.
+void pref_ghb_c::init_func(int core_id)
+{
+  if (!knob_enable)
+    return;
+
+  // NOTE(review): self-assignment of the parameter (no effect); possibly meant
+  // to store the id in a member - confirm against pref_base_c.
+  core_id = core_id;
+
+  this->hwp_info->enabled = true;
+  // calloc instead of malloc: the index-table lookup compares tid and
+  // czone_tag before testing `valid`, so those fields must not be
+  // indeterminate; it also zeroes miss_index in the history buffer.
+  this->index_table = (GHB_Index_Table_Entry *) calloc(*m_simBase->m_knobs->KNOB_PREF_GHB_INDEX_N, sizeof(GHB_Index_Table_Entry));
+  this->ghb_buffer = (GHB_Entry *) calloc(*m_simBase->m_knobs->KNOB_PREF_GHB_BUFFER_N, sizeof(GHB_Entry));
+
+  this->ghb_head = -1;
+  this->ghb_tail = -1;
+  // room for MAX_DEGREE deltas plus the two used for pattern matching
+  this->deltab_size = *m_simBase->m_knobs->KNOB_PREF_GHB_MAX_DEGREE+2;
+
+  this->delta_buffer = (int *) calloc(this->deltab_size, sizeof(int));
+  this->pref_degree = *m_simBase->m_knobs->KNOB_PREF_GHB_DEGREE;
+
+  for (int ii = 0; ii < *m_simBase->m_knobs->KNOB_PREF_GHB_INDEX_N; ++ii)
+  {
+    this->index_table[ii].valid = false;
+    this->index_table[ii].last_access = 0;
+    this->index_table[ii].ghb_ptr = -1;  // -1 == no history entry yet
+  }
+
+  for (int ii = 0; ii < *m_simBase->m_knobs->KNOB_PREF_GHB_BUFFER_N; ++ii)
+  {
+    this->ghb_buffer[ii].ghb_ptr = -1;
+    this->ghb_buffer[ii].ghb_reverse_ptr = -1;
+    this->ghb_buffer[ii].idx_reverse_ptr = -1;
+  }
+
+
+  // degree table indexed by hwp_info->dyn_degree (see pref_ghb_throttle_fb)
+  this->pref_degree_vals[0] = 2;
+  this->pref_degree_vals[1] = 4;
+  this->pref_degree_vals[2] = 8;
+  this->pref_degree_vals[3] = 12;
+  this->pref_degree_vals[4] = 16;
+}
+
+
+// L1 hits are forwarded to the L2-miss training path.
+void pref_ghb_c::l1_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop)
+{
+  l2_miss_func(tid, lineAddr, loadPC, uop);
+}
+
+
+// L1 misses are forwarded to the L2-miss training path.
+void pref_ghb_c::l1_miss_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop)
+{
+  l2_miss_func(tid, lineAddr, loadPC, uop);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////
+// L2 prefetch-hit hook: trains with is_l2_hit == true and no uop.
+void pref_ghb_c::l2_pref_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop)
+{
+  pref_ghb_l2_train(tid, lineAddr, loadPC, true, NULL);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////// +void pref_ghb_c::l2_miss_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop) +{ + pref_ghb_l2_train(tid, lineAddr, loadPC, false, uop); +} + + +/////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////// +void pref_ghb_c::pref_ghb_l2_train(int tid, Addr lineAddr, Addr loadPC, bool is_l2_hit, uop_c *uop) +{ +// if (g_simulation_cycle == 314) +// cout << "found\n"; + + // 1. adds address to ghb + // 2. sends upto "degree" prefetches to the prefQ + int ii; + int czone_idx = -1; + int old_ptr = -1; + + int ghb_idx = -1; + int delta1 = 0; + int delta2 = 0; + + int num_pref_sent = 0; + int deltab_head = -1; + int curr_deltab_size = 0; + + Addr lineIndex = lineAddr >> shift_bit; + Addr currLineIndex = lineIndex; + Addr index_tag = CZONE_TAG(lineAddr); + + DEBUG("GHB : Miss Addr %d czone: %d\n", (int) lineAddr, (int) index_tag); + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_PREF_GHB_INDEX_N; ++ii) + { + if ((!*m_simBase->m_knobs->KNOB_PREF_THREAD_INDEX || tid == this->index_table[ii].tid) && + index_tag == this->index_table[ii].czone_tag && + this->index_table[ii].valid) + { + // got a hit in the index table + czone_idx = ii; + old_ptr = this->index_table[ii].ghb_ptr; + break; + } + } + + if (czone_idx == -1) + { + if (is_l2_hit) // ONLY TRAIN on l2_hit + return; + + // Not present in index table. 
+ // Make new czone + // First look if any entry is unused + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_PREF_GHB_INDEX_N; ii++) { + if (!this->index_table[ii].valid) { + czone_idx = ii; + break; + } + if (czone_idx == -1 || (this->index_table[czone_idx].last_access < this->index_table[ii].last_access)) { + czone_idx = ii; + } + } + DEBUG("Created new entry in index: %d idx:%d \n", (int) index_tag , czone_idx); + } + + if (old_ptr != -1 && this->ghb_buffer[old_ptr].miss_index == lineIndex) { + return; + } + + if (*m_simBase->m_knobs->KNOB_PREF_THROTTLE_ON) + pref_ghb_throttle(); + + if (*m_simBase->m_knobs->KNOB_PREF_THROTTLEFB_ON) + pref_ghb_throttle_fb(); + + pref_ghb_create_newentry(czone_idx, lineAddr, index_tag, old_ptr, tid); + + for (ii = 0; ii < this->deltab_size; ii++) + this->delta_buffer[ii] = 0; + + // Now ghb_tail points to the new entry. Work backwards to find a 2 delta match... + ghb_idx = this->ghb_buffer[this->ghb_tail].ghb_ptr; + DEBUG("l2hit:%d lineidx:%llx loadPC:%llx\n", is_l2_hit, lineIndex, loadPC); + while (ghb_idx != -1 && num_pref_sentpref_degree) { + int delta = currLineIndex - this->ghb_buffer[ghb_idx].miss_index; + if (delta > 100 || delta < -100) + break; + + // insert into delta buffer + deltab_head = (deltab_head + 1) % this->deltab_size; + this->delta_buffer[deltab_head] = delta; + curr_deltab_size++; + if (delta1==0) { + delta1 = delta; + } else if (delta2==0) { + delta2 = delta; + } else { + DEBUG("delta1:%d, delta2:%d\n", delta1, delta2); + // Catch strides quickly + if (delta1 == delta2) { + for (; num_pref_sentpref_degree; num_pref_sent++) { + lineIndex += delta1; + hwp_common->pref_addto_l2req_queue_set(lineIndex, this->hwp_info->id, (num_pref_sent==0), (num_pref_sent==this->pref_degree), loadPC); + } + } else { + if (delta1 == this->delta_buffer[(deltab_head-1)%this->deltab_size] && delta2 == this->delta_buffer[deltab_head]) { + // found a match + // lets go for a walk + int deltab_idx = (deltab_head - 2) % this->deltab_size; 
+ int deltab_start_idx = deltab_idx; + for (; num_pref_sentpref_degree; num_pref_sent++) { + lineIndex += this->delta_buffer[deltab_idx]; + hwp_common->pref_addto_l2req_queue_set(lineIndex, this->hwp_info->id, (num_pref_sent==0), (num_pref_sent==this->pref_degree), loadPC); + DEBUG("Sent %llx\n", lineIndex); + deltab_idx = CIRC_DEC(deltab_idx, this->deltab_size); + if (deltab_idx > curr_deltab_size) { + deltab_idx = deltab_start_idx; + } + } + break; + } + } + } + currLineIndex = this->ghb_buffer[ghb_idx].miss_index; + ghb_idx = this->ghb_buffer[ghb_idx].ghb_ptr; + } + if (num_pref_sent) { + DEBUG("Num sent %d\n", num_pref_sent); + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////// +void pref_ghb_c::pref_ghb_create_newentry (int idx, Addr line_addr, Addr czone_tag, int old_ptr, int tid) +{ + int rev_ptr; + int rev_idx_ptr; + + this->index_table[idx].valid = true; + this->index_table[idx].czone_tag = czone_tag; + this->index_table[idx].last_access = m_simBase->m_simulation_cycle; + this->index_table[idx].tid = tid; + + // Now make entry in ghb + this->ghb_tail = (this->ghb_tail + 1) % *m_simBase->m_knobs->KNOB_PREF_GHB_BUFFER_N; + if (this->ghb_tail == old_ptr) { // takes care of some bad corner cases + old_ptr = -1; + } + if (this->ghb_head == -1) { + this->ghb_head = 0; + } else if (this->ghb_tail == this->ghb_head) { + // wrap-around + this->ghb_head = (this->ghb_head + 1) % *m_simBase->m_knobs->KNOB_PREF_GHB_BUFFER_N; + } + + rev_ptr = this->ghb_buffer[this->ghb_tail].ghb_reverse_ptr; + rev_idx_ptr = this->ghb_buffer[this->ghb_tail].idx_reverse_ptr; + if (rev_ptr!=-1) { + this->ghb_buffer[rev_ptr].ghb_ptr = -1; + } + + if (rev_idx_ptr != -1 && this->index_table[rev_idx_ptr].ghb_ptr == this->ghb_tail && rev_idx_ptr != idx) { + this->index_table[rev_idx_ptr].ghb_ptr = -1; + this->index_table[rev_idx_ptr].valid = 
false; + } + + this->ghb_buffer[this->ghb_tail].miss_index = line_addr >> shift_bit; + this->ghb_buffer[this->ghb_tail].ghb_ptr = old_ptr ; + this->ghb_buffer[this->ghb_tail].ghb_reverse_ptr = -1; + this->ghb_buffer[this->ghb_tail].idx_reverse_ptr = idx; + if (old_ptr != -1) + this->ghb_buffer[old_ptr].ghb_reverse_ptr = this->ghb_tail; + + this->index_table[idx].ghb_ptr = this->ghb_tail; +} + + +/////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////// +void pref_ghb_c::pref_ghb_throttle(void) +{ + int dyn_shift = 0; + + float acc = hwp_common->pref_get_accuracy(this->hwp_info->id); + //float acc = pref_get_overallaccuracy(Mem_To_UL1); + float regacc = hwp_common->pref_get_regionbased_acc(); + float accratio = acc/regacc; + // float cov = pref_get_coverage(this->hwp_info->id); + + if (acc != 1.0) { + if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_1) { + dyn_shift += 2; + } else if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_2) { + dyn_shift += 1; + } else if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_3) { + dyn_shift = 0; + } else if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_4) { + dyn_shift = dyn_shift - 1; + } else { + dyn_shift = dyn_shift - 2; + } + } + if (*m_simBase->m_knobs->KNOB_PREF_ACCRATIOTHROTTLE) { + if (accratio < *m_simBase->m_knobs->KNOB_PREF_ACCRATIO_1 ) { + dyn_shift = dyn_shift - 1; + } + } + /* + // Adjust for high coverage + if (cov > 0.70) { + if (dyn_shift < 0) { + dyn_shift = 0; + } + } + */ + // COLLECT STATS + if (acc > 0.9) { + STAT_EVENT(PREF_ACC_1); + } else if (acc > 0.8) { + STAT_EVENT(PREF_ACC_2); + } else if (acc > 0.7) { + STAT_EVENT(PREF_ACC_3); + } else if (acc > 0.6) { + STAT_EVENT(PREF_ACC_4); + } else if (acc > 0.5) { + STAT_EVENT(PREF_ACC_5); + } else if (acc > 0.4) { + STAT_EVENT(PREF_ACC_6); + } else if (acc > 0.3) { + STAT_EVENT(PREF_ACC_7); + } else if (acc > 0.2) { 
+ STAT_EVENT(PREF_ACC_8); + } else if (acc > 0.1) { + STAT_EVENT(PREF_ACC_9); + } else { + STAT_EVENT(PREF_ACC_10); + } + + if (acc==1.0){ + this->pref_degree = 64; + } else { + if (dyn_shift >= 2 ) { + this->pref_degree = 64; + STAT_EVENT(PREF_DISTANCE_5); + } else if (dyn_shift == 1 ) { + this->pref_degree = 32; + STAT_EVENT(PREF_DISTANCE_4); + } else if (dyn_shift == 0 ) { + this->pref_degree = 16; + STAT_EVENT(PREF_DISTANCE_3); + } else if (dyn_shift == -1 ) { + this->pref_degree = 8; + STAT_EVENT(PREF_DISTANCE_2); + } else if (dyn_shift <= -2 ) { + this->pref_degree = 2; + STAT_EVENT(PREF_DISTANCE_1); + } + } +} + + +/////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////// +void pref_ghb_c::pref_ghb_throttle_fb(void) +{ + hwp_common->pref_get_degfb(this->hwp_info->id); + ASSERT(this->hwp_info->dyn_degree<=4 ); + this->pref_degree = this->pref_degree_vals[this->hwp_info->dyn_degree]; +} \ No newline at end of file diff --git a/src/hw_prefetcher/pref_ghb.h b/src/hw_prefetcher/pref_ghb.h new file mode 100644 index 0000000..f503f59 --- /dev/null +++ b/src/hw_prefetcher/pref_ghb.h @@ -0,0 +1,82 @@ +/*************************************************************************************** + * File : pref_ghb.h + * Author : Santhosh Srinath + * Date : 11/16/2004 + * CVS : $Id: pref_ghb.h,v 1.1 2008/07/30 14:18:16 kacear Exp $: + * Description : + ***************************************************************************************/ +#ifndef __PREF_GHB_H__ + +#include "../pref_common.h" +#include "../pref.h" + +#define CZONE_TAG( x ) ( x >> ( *m_simBase->m_knobs->KNOB_PREF_GHB_CZONE_BITS ) ) + +typedef struct GHB_Index_Table_Entry_Struct { + int tid; + Addr czone_tag; + bool valid; + // CHECKME + int ghb_ptr; // ptr to last entry in ghb with same czone + int last_access; // for lru +} GHB_Index_Table_Entry; + + +typedef struct 
// One node of the global history buffer (GHB).
// Nodes that were inserted through the same index-table slot are chained
// together in both directions by buffer position.
typedef struct GHB_Entry_Struct {
  Addr miss_index;     // shifted line address recorded for this miss
  int ghb_ptr; // -1 == invalid            (link to the previous node inserted via the same slot)
  int ghb_reverse_ptr; // -1 == invalid    (link from this node to the next, newer node)
  int idx_reverse_ptr; // index-table slot this node was inserted through
} GHB_Entry;


// Global-history-buffer prefetcher.
// pref_common_c is a friend and can therefore reach the private state below.
class pref_ghb_c : public pref_base_c{
  friend class pref_common_c;
  private:
    // Index table
    GHB_Index_Table_Entry * index_table;
    // GHB
    GHB_Entry             * ghb_buffer;

    int                     ghb_tail;   // buffer position used when a new entry is created
    int                     ghb_head;   // NOTE(review): presumably the oldest live entry - confirm in pref_ghb.cc

    // NOTE(review): presumably scratch space for address deltas - confirm in pref_ghb.cc
    int                     deltab_size;
    int                   * delta_buffer;

    // Current prefetch degree; written by pref_ghb_throttle / pref_ghb_throttle_fb.
    uns                     pref_degree;

    // Degree lookup table indexed by hwp_info->dyn_degree (0..4) in pref_ghb_throttle_fb.
    uns                     pref_degree_vals[5];

    // Declared private so the class cannot be default-constructed from outside.
    pref_ghb_c();

  public:
    pref_ghb_c(hwp_common_c *, Unit_Type, macsim_c*);
    ~pref_ghb_c();

    /*************************************************************/
    /* HWP Interface */
    void init_func(int core_id);
    void done_func() {}
    void l1_miss_func(int, Addr, Addr, uop_c *) ;
    void l1_hit_func(int, Addr, Addr, uop_c *);
    void l1_pref_hit_func(int, Addr, Addr, uop_c *) {}
    void l2_miss_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop);
    void l2_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c *) {}
    void l2_pref_hit_func( int tid, Addr lineAddr, Addr loadPC, uop_c *uop);


    // Common training entry point; l2_hit tells hits apart from misses.
    void pref_ghb_l2_train(int tid, Addr lineAddr, Addr loadPC, bool l2_hit, uop_c *uop);

    /*************************************************************/
    /* Misc functions */
    // Appends a GHB entry at ghb_tail and links it to old_ptr and to index-table slot idx.
    void pref_ghb_create_newentry (int idx, Addr line_addr, Addr czone_tag, int old_ptr, int tid);

    // Accuracy-threshold based and feedback based selection of pref_degree.
    void pref_ghb_throttle(void);
    void pref_ghb_throttle_fb(void);

};
***************************************************************************************/ + + +#include "pref_phase.h" + +#include "../global_defs.h" +#include "../global_types.h" +#include "../debug_macros.h" + +#include "../utils.h" +#include "../assert_macros.h" +#include "../uop.h" + +#include "../cache.h" +#include "../statistics.h" +#include "../memory.h" +#include "../core.h" + +#include "../all_knobs.h" + +/**************************************************************************************/ +/* Macros */ +#define DEBUG(args...) _DEBUG(DEBUG_PREF_PHASE, ## args) + +#define PAGENUM(x) (x>>*m_simBase->m_knobs->KNOB_PREF_PHASE_LOG2REGIONSIZE) + +/**************************************************************************************/ +/* Global variables */ +// Pref_PHASE * phase_hwp; +// FIXME + +FILE * PREF_PHASE_OUT; + + +//pref_phase_c::pref_phase_c() {} + +pref_phase_c::pref_phase_c(hwp_common_c *hcc, Unit_Type type, macsim_c* simBase) +: pref_base_c(simBase) +{ + name = "phase"; + hwp_type = Mem_To_UL1; + hwp_common = hcc; + switch (type) { + case UNIT_SMALL: + knob_enable = *m_simBase->m_knobs->KNOB_PREF_PHASE_ON; + break; + case UNIT_MEDIUM: + knob_enable = *m_simBase->m_knobs->KNOB_PREF_PHASE_ON_MEDIUM_CORE; + break; + case UNIT_LARGE: + knob_enable = *m_simBase->m_knobs->KNOB_PREF_PHASE_ON_LARGE_CORE; + break; + } + + l2_miss = true; + l2_hit = true; + l2_pref_hit = true; + +} + +void pref_phase_c::init_func(int cid) +{ + int ii; + static char pref_phase_filename[] = "pref_phase"; + + core_id = cid; + + if (!knob_enable) + return; + + // phase_hwp = (Pref_PHASE*)malloc(sizeof(Pref_PHASE)); + this->hwp_info->enabled = true; + + this->phase_table = (PhaseInfoEntry *)calloc(*m_simBase->m_knobs->KNOB_PREF_PHASE_TABLE_SIZE,sizeof(PhaseInfoEntry)); + + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_PREF_PHASE_TABLE_SIZE; ii++) { + this->phase_table[ii].MemAccess = (bool *)calloc(*m_simBase->m_knobs->KNOB_PREF_PHASE_INFOSIZE, sizeof(bool)); + 
this->phase_table[ii].mapped_regions = (Phase_Region *)calloc(*m_simBase->m_knobs->KNOB_PREF_PHASE_TRACKEDREGIONS, sizeof(Phase_Region)); + } + this->MemAccess = (bool *) calloc(*m_simBase->m_knobs->KNOB_PREF_PHASE_INFOSIZE, sizeof(bool)); + this->mapped_regions = (Phase_Region *) calloc(*m_simBase->m_knobs->KNOB_PREF_PHASE_TRACKEDREGIONS, sizeof(Phase_Region)); + this->interval_start = 0; + this->curr_phaseid = 0; + this->num_misses = 0; + + if (*m_simBase->m_knobs->KNOB_PREF_PHASE_STUDY) { + PREF_PHASE_OUT = file_tag_fopen(pref_phase_filename, "w", m_simBase); + } +} + +// CHECKME +void pref_phase_c::l2_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop) +{ + // Do nothing on a l2 hit + // pref_phase_l2_train(lineAddr, loadPC, true); + if (0) { + printf("%lld %lld", lineAddr, loadPC); + } +} + +void pref_phase_c::l2_pref_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop) +{ + pref_phase_l2_train(tid, lineAddr, loadPC, true); +} + +void pref_phase_c::l2_miss_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop) +{ + pref_phase_l2_train(tid, lineAddr, loadPC, false); +} + +void pref_phase_c::pref_phase_l2_train(int tid, Addr lineAddr, Addr loadPC, bool pref_hit) +{ + if (0) { // to prevent compilation error, jaekyu (11-3-2009) + printf("%lld %lld %d", lineAddr, loadPC, pref_hit); + } + // CHECKME / Jaekyu / If we want to use phase prefetcher, then each core should contain one phare prefetcher +#if NEED_TO_DEBUG + int next_phaseid; + + bool qFull = false; + Addr lineIndex = lineAddr >> LOG2_DCACHE_LINE_SIZE; + Addr hashIndex = lineIndex % *m_simBase->m_knobs->KNOB_PREF_PHASE_PRIME_HASH; + + // Update access pattern + this->MemAccess[hashIndex] = true; + + pref_phase_updateregioninfo(this->mapped_regions, lineIndex); + + this->num_misses++; + if (m_inst_count - this->interval_start > *m_simBase->m_knobs->KNOB_PREF_PHASE_INTERVAL) { + this->interval_start = m_inst_count; + if (this->num_misses > *m_simBase->m_knobs->KNOB_PREF_PHASE_MIN_MISSES) { + if 
/*
 * Training entry point shared by the L2 miss and L2 prefetch-hit callbacks.
 *
 * With NEED_TO_DEBUG undefined (or 0) this function does nothing.  The
 * guarded body (a) marks the access in the current miss pattern and region
 * list, (b) at the end of an instruction interval with enough misses swaps
 * in the pattern/regions of the predicted next phase, and (c) issues
 * prefetches for the regions recorded for the current phase until the
 * request queue is full.
 *
 * NOTE(review): the guarded body refers to m_valid, m_id, hw_prefetcher,
 * m_inst_count and g_simulation_cycle, while the structs in pref_phase.h
 * declare the flag as `valid` - it looks un-ported and will need fixing
 * before NEED_TO_DEBUG can be enabled.
 */
void pref_phase_c::pref_phase_l2_train(int tid, Addr lineAddr, Addr loadPC, bool pref_hit)
{
  if (0) { // to prevent compilation error, jaekyu (11-3-2009)
    printf("%lld %lld %d", lineAddr, loadPC, pref_hit);
  }
  // CHECKME / Jaekyu / If we want to use phase prefetcher, then each core should contain one phase prefetcher
#if NEED_TO_DEBUG
  int next_phaseid;

  bool qFull = false;
  Addr lineIndex = lineAddr >> LOG2_DCACHE_LINE_SIZE;
  Addr hashIndex = lineIndex % *m_simBase->m_knobs->KNOB_PREF_PHASE_PRIME_HASH;

  // Update access pattern
  this->MemAccess[hashIndex] = true;

  pref_phase_updateregioninfo(this->mapped_regions, lineIndex);

  this->num_misses++;
  // End of interval: decide whether a phase change happened.
  if (m_inst_count - this->interval_start > *m_simBase->m_knobs->KNOB_PREF_PHASE_INTERVAL) {
    this->interval_start = m_inst_count;
    if (this->num_misses > *m_simBase->m_knobs->KNOB_PREF_PHASE_MIN_MISSES) {
      if (*m_simBase->m_knobs->KNOB_PREF_PHASE_STUDY) {
        // Dump the interval's miss pattern as one line of 0/1 characters.
        int ii;
        for (ii = 0; ii<*m_simBase->m_knobs->KNOB_PREF_PHASE_INFOSIZE; ii++) {
          fprintf(PREF_PHASE_OUT, (this->MemAccess[ii] ? "1" : "0"));
        }
        fprintf(PREF_PHASE_OUT, "\n");
      }

      this->num_misses = 0;
      next_phaseid = pref_phase_computenextphase();
      STAT_EVENT(PREF_PHASE_NEWPHASE_DET);
      {
        // Set the memAccess pattern for the next phase correctly
        // (swap buffers with the table entry, then clear the working copy)
        bool * tmp = this->MemAccess;
        this->MemAccess = this->phase_table[next_phaseid].MemAccess;
        this->phase_table[next_phaseid].MemAccess = tmp;

        memset(this->MemAccess, 0, sizeof(bool)**m_simBase->m_knobs->KNOB_PREF_PHASE_INFOSIZE);
      }
      {
        // Set the mapped regions for the current region correctly
        // (the regions seen in this interval are stored under the phase being left)
        Phase_Region * tmp = this->mapped_regions;
        this->mapped_regions = this->phase_table[this->curr_phaseid].mapped_regions;
        this->phase_table[this->curr_phaseid].mapped_regions = tmp;

        memset(this->mapped_regions, 0, sizeof(Phase_Region)**m_simBase->m_knobs->KNOB_PREF_PHASE_TRACKEDREGIONS);
      }

      if (!this->phase_table[next_phaseid].m_valid) {
        STAT_EVENT(PREF_PHASE_NEWPHASE_NOTVALID);

        this->phase_table[next_phaseid].m_valid = true;
        memset(this->phase_table[next_phaseid].mapped_regions, 0, sizeof(Phase_Region)**m_simBase->m_knobs->KNOB_PREF_PHASE_TRACKEDREGIONS);
      }

      this->curr_phaseid = next_phaseid;
      this->phase_table[next_phaseid].last_access = g_simulation_cycle;
      // Restart prefetch issue from the first region of the new phase.
      this->currsent_regid = 0;
      this->currsent_regid_offset = 0;
    }
  }
  // Send out prefetches
  // NOTE(review): the region cursor is bounded by PREF_PHASE_REGIONENTRIES,
  // but mapped_regions holds KNOB_PREF_PHASE_TRACKEDREGIONS entries - confirm.
  while (this->currsent_regid < PREF_PHASE_REGIONENTRIES) {
    Addr lineIndex, startIndex;
    Phase_Region * region = &this->phase_table[this->curr_phaseid].mapped_regions[this->currsent_regid];
    if (region->m_valid) {
      startIndex = region->PageNumber<<( *m_simBase->m_knobs->KNOB_PREF_PHASE_LOG2REGIONSIZE - LOG2_DCACHE_LINE_SIZE);

      for (; this->currsent_regid_offset < PREF_PHASE_REGIONENTRIES; this->currsent_regid_offset++) {
        if ( region->RegionMemAccess[this->currsent_regid_offset]) {
          lineIndex = startIndex + this->currsent_regid_offset;
          if(!hw_prefetcher->pref_addto_l2req_queue(lineIndex, this->hwp_info->m_id)) {
            // Queue full: keep the cursor so the next call resumes here.
            qFull = true;
            break;
          }
          STAT_EVENT(PREF_PHASE_SENTPREF);
        }
      }
      if (qFull)
        break;
    }
    this->currsent_regid++;
    this->currsent_regid_offset = 0;
  }

#endif
}
region->RegionMemAccess[this->currsent_regid_offset]) { + lineIndex = startIndex + this->currsent_regid_offset; + if(!hw_prefetcher->pref_addto_l2req_queue(lineIndex, this->hwp_info->m_id)) { + qFull = true; + break; + } + STAT_EVENT(PREF_PHASE_SENTPREF); + } + } + if (qFull) + break; + } + this->currsent_regid++; + this->currsent_regid_offset = 0; + } + +#endif +} + +void pref_phase_c::pref_phase_updateregioninfo(Phase_Region *mapped_regions, Addr lineAddr) +{ + int ii, id; + Addr pagenum = PAGENUM(lineAddr); + int region_offset = (lineAddr >> LOG2_DCACHE_LINE_SIZE) & N_BIT_MASK(*m_simBase->m_knobs->KNOB_PREF_PHASE_LOG2REGIONSIZE - LOG2_DCACHE_LINE_SIZE); + id = -1; + + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_PREF_PHASE_TRACKEDREGIONS; ii++) { + if (mapped_regions[ii].valid && mapped_regions[ii].PageNumber == pagenum) { + id = ii; + break; + } + if (!mapped_regions[ii].valid || id==-1) { + id = ii; + } else if (mapped_regions[id].valid && mapped_regions[ii].last_access < mapped_regions[id].last_access) { + id = ii; + } + } + if (!mapped_regions[id].valid || mapped_regions[id].PageNumber!=pagenum ) { + memset(mapped_regions[id].RegionMemAccess, 0, sizeof(bool)*PREF_PHASE_REGIONENTRIES); + } + if (mapped_regions[id].PageNumber != pagenum) { + STAT_EVENT(PREF_PHASE_OVERWRITE_PAGE); + } + mapped_regions[id].PageNumber = pagenum; + mapped_regions[id].last_access = m_simBase->m_simulation_cycle; + mapped_regions[id].valid = true; + mapped_regions[id].RegionMemAccess[region_offset] = true; +} + +int pref_phase_c::pref_phase_computenextphase(void) +{ + int ii, jj; + int id = -1; + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_PREF_PHASE_TABLE_SIZE; ii++) { + if (this->phase_table[ii].valid) { + int diffnum = 0; + int missnum = 0; + float missper = 0.0; + for (jj = 0; jj < *m_simBase->m_knobs->KNOB_PREF_PHASE_INFOSIZE; jj++) { + if (this->MemAccess[jj] != this->phase_table[ii].MemAccess[jj]) { + diffnum++; + } + if (this->MemAccess[jj] == 1) { + missnum++; + } + } + 
missper = (1.0*diffnum) / (1.0*missnum); + if (diffnum < *m_simBase->m_knobs->KNOB_PREF_PHASE_MAXDIFF_THRESH && missper < *m_simBase->m_knobs->KNOB_PREF_PHASE_MISSPER) { + // Found a match + return ii; + } + } + if (id == -1 || !this->phase_table[ii].valid) { + id = ii; + } else if (this->phase_table[id].valid && this->phase_table[ii].last_access < this->phase_table[id].last_access) { + id = ii; + } + } + // Taken another entry... + // So set it to invalid + this->phase_table[id].valid = false; + return id; +} \ No newline at end of file diff --git a/src/hw_prefetcher/pref_phase.h b/src/hw_prefetcher/pref_phase.h new file mode 100644 index 0000000..ca36524 --- /dev/null +++ b/src/hw_prefetcher/pref_phase.h @@ -0,0 +1,93 @@ +/*************************************************************************************** + * File : pref_phase.h + * Author : Santhosh Srinath + * Date : 11/16/2004 + * CVS : $Id: pref_phase.h,v 1.1 2008/07/30 14:18:16 kacear Exp $: + * Description : + ***************************************************************************************/ +#ifndef __PREF_PHASE_H__ + +#include "../pref_common.h" +#include "../pref.h" + +/*************************************************************************************** + * Phase Based Prefetching: + * This prefetcher works by predicting the future memory access pattern based on + * the current access pattern. This is currently modeled more as collection of + * accesses rather than as a permutation of the accesses -> Order is not important. + + * Currently this prefetcher collects the L2 miss pattern for the current "phase" + * which is based on number of instructions retired. + * + * Largest Prime < 16384 = 16381 + ***************************************************************************************/ +#define PREF_PHASE_REGIONENTRIES 64 + + +// This struct keeps info on the regions being targeted. 
// This struct keeps info on the regions being targeted.
typedef struct Phase_Region_Struct {
  Addr          PageNumber;  // region number, i.e. PAGENUM() of the accessed address
  bool          RegionMemAccess[PREF_PHASE_REGIONENTRIES]; // This is the access pattern for this region

  Counter       last_access; // simulation cycle of the latest update; used to pick the oldest entry
  bool          valid;       // entry holds a live region
} Phase_Region;


// One phase-table entry: the miss pattern that identifies a phase plus the
// regions recorded for it.
typedef struct PhaseInfoEntry_Struct {
  bool  * MemAccess;       // This is the access pattern for the whole of memory
                           // for the last interval
                           // (KNOB_PREF_PHASE_INFOSIZE flags, allocated in init_func)

  Phase_Region * mapped_regions; // Given the last phase, what is the current access pattern
                                 // (KNOB_PREF_PHASE_TRACKEDREGIONS entries, allocated in init_func)

  Counter last_access;     // used for lru
  bool  valid;             // cleared when the entry is taken as a replacement victim
} PhaseInfoEntry;


// Phase-based prefetcher: classifies execution intervals by their L2 miss
// pattern and keeps per-phase region information.
class pref_phase_c : public pref_base_c
{
  private:
    PhaseInfoEntry * phase_table;  // KNOB_PREF_PHASE_TABLE_SIZE entries

    Counter     interval_start;    // start of the current interval (reset to 0 in init_func)

    uns         curr_phaseid;      // Current phase entry we are prefetching for

    bool  * MemAccess;       // Current miss pattern - used to find the next phase
    Phase_Region * mapped_regions; // Used to update the phase table

    // Cursor into the regions of the current phase: region index and offset
    // inside that region.
    uns         currsent_regid;
    uns         currsent_regid_offset;
    Counter     num_misses;        // reset to 0 in init_func

    // Declared private so the class cannot be default-constructed from outside.
    pref_phase_c();

  public:
    pref_phase_c(hwp_common_c *, Unit_Type, macsim_c*);

    /*************************************************************/
    /* HWP Interface */
    void init_func(int core_id);
    void done_func() {}
    void l1_miss_func(int, Addr, Addr, uop_c *) {}
    void l1_hit_func(int, Addr, Addr, uop_c *) {}
    void l1_pref_hit_func(int, Addr, Addr, uop_c *) {}
    void l2_miss_func(int tid, Addr lineAddr, Addr loadPC, uop_c * );
    void l2_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c * );
    void l2_pref_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c * );

    // Shared training routine behind the L2 miss / prefetch-hit callbacks.
    void pref_phase_l2_train(int tid, Addr lineAddr, Addr loadPC, bool pref_hit );


    /*************************************************************/
    /* Misc functions */

    // Marks one access in the given region list, replacing an entry if needed.
    void pref_phase_updateregioninfo(Phase_Region *, Addr lineAddr);

    // Returns the phase-table index to use for the next phase.
    int pref_phase_computenextphase(void);
};
index 0000000..23c2732 --- /dev/null +++ b/src/hw_prefetcher/pref_stream.cc @@ -0,0 +1,748 @@ +/*************************************************************************************** + * File : pref_stream.c + * Author : Santhosh Srinath ( based on Hyesoon's code ) + * Date : 1/20/2005 + * CVS : $Id: pref_stream.cc,v 1.2 2008/09/12 03:42:02 kacear Exp $: + * Description : Stream Prefetcher + ***************************************************************************************/ + + +#include "pref_stream.h" + +#include "../global_defs.h" +#include "../global_types.h" +#include "../debug_macros.h" + +#include "../utils.h" +#include "../assert_macros.h" +#include "../uop.h" +#include "../memory.h" +#include "../core.h" +#include "../cache.h" +#include "../statistics.h" +#include "../memory.h" +#include "../pref_common.h" + +#include "../all_knobs.h" +#include "../knob.h" + +/**************************************************************************************/ +/* Macros */ +#define DEBUG(args...) _DEBUG((*m_simBase->m_knobs->KNOB_DEBUG_STREAM || *m_simBase->m_knobs->KNOB_DEBUG_MEM_TRACE), ## args) +#define DEBUG_PREFACC(args...) 
_DEBUG(DEBUG_PREFACC, ## args) + +/**************************************************************************************/ +/* Global Variables */ + +#if 0 +extern Memory *mem; +extern Dcache_Stage *dc; +#endif + +/***************************************************************************************/ +/* Local Prototypes */ + +/**************************************************************************************/ +/* stream prefetcher */ +/* prefetch is initiated by dcache miss but the request fills the l1 cache (second level) cache */ +/* each stream has the starting pointer and the ending pointer and those pointers tell whether the l1 miss is within the boundary (buffer) */ +/* stream buffer fetches one by one (end point requests the fetch address */ +/* stream buffer just holds the stream boundary not the data itself and data is stored in the second level cache */ +/* At the beginning we will wait until we see 2 miss addresses */ +/* using 2 miss addresses we decide the direction of the stream ( upward or downward) and in the begining fill the half of buffer ( so many request !!! 
) */ +/* Reference : IBM POWER 4 White paper */ + + +// Default Constructor +//pref_stream_c::pref_stream_c() {} + +pref_stream_c::pref_stream_c(hwp_common_c *hcc, Unit_Type type, macsim_c* simBase) +: pref_base_c(simBase) +{ + name = "stream"; + hwp_type = Mem_To_UL1; + hwp_common = hcc; + + // configuration + switch (type) { + case UNIT_SMALL: + knob_enable = *m_simBase->m_knobs->KNOB_PREF_STREAM_ON; + break; + case UNIT_MEDIUM: + knob_enable = *m_simBase->m_knobs->KNOB_PREF_STREAM_ON_MEDIUM_CORE; + break; + case UNIT_LARGE: + knob_enable = *m_simBase->m_knobs->KNOB_PREF_STREAM_ON_LARGE_CORE; + break; + } + + + done = true; + l1_hit = true; + l2_miss = true; + l2_hit = true; + +} + + +void pref_stream_c::init_func(int cid) +{ + if (!knob_enable) { + return; + } + + core_id = cid; + hwp_info->enabled = true; + pref_stream = new Pref_Stream; + pref_stream->hwp_info = hwp_info; + + pref_stream->stream = new Stream_Buffer[*m_simBase->m_knobs->KNOB_STREAM_BUFFER_N]; + pref_stream->train_filter = new Addr[*m_simBase->m_knobs->KNOB_TRAIN_FILTER_SIZE]; + + pref_stream->train_num = *m_simBase->m_knobs->KNOB_STREAM_TRAIN_NUM; + pref_stream->distance = *m_simBase->m_knobs->KNOB_STREAM_LENGTH; + pref_stream->pref_degree_vals[0] = 4; + pref_stream->pref_degree_vals[1] = 8; + pref_stream->pref_degree_vals[2] = 16; + pref_stream->pref_degree_vals[3] = 32; + pref_stream->pref_degree_vals[4] = 64; + + pref_stream->num_tosend = *m_simBase->m_knobs->KNOB_STREAM_PREFETCH_N; + pref_stream->num_tosend_vals[0] = 1; + pref_stream->num_tosend_vals[1] = 1; + pref_stream->num_tosend_vals[2] = 2; + pref_stream->num_tosend_vals[3] = 4; + pref_stream->num_tosend_vals[4] = 4; +} + + +void pref_stream_c::l1_hit_func(int tid, Addr line_addr, Addr load_PC, uop_c *uop) +{ + l2_miss_func(tid, line_addr, load_PC, uop); +} + + +/* line_addr: the first address of the cache block */ +void pref_stream_c::pref_stream_train(int tid, Addr line_addr, Addr load_PC, bool create) +{ + // search the stream 
/* line_addr: the first address of the cache block */
/*
 * Train the stream prefetcher with one access and, when the access falls in
 * a trained stream, issue up to num_tosend prefetches ahead of it.
 *
 *   tid      thread id, forwarded to the stream-buffer lookup
 *   load_PC  forwarded to the L2 request queue together with each prefetch
 *   create   whether a new (untrained) stream may be allocated
 *
 * Accesses whose line index is already in the training filter are ignored.
 */
void pref_stream_c::pref_stream_train(int tid, Addr line_addr, Addr load_PC, bool create)
{
  // search the stream buffer
  int hit_index = -1;
  int ii;
  int dis, maxdistance;
  Addr line_index = line_addr >> LOG2_DCACHE_LINE_SIZE;
  /* training filter */

  DEBUG("[DL0MISS%s]:0x%7llx mi:0x%lld core_id:%d\n", "L1", line_addr, line_index, core_id);

  if (!pref_stream_train_stream_filter(line_index)) {

    // Refresh the throttling state before looking at the streams.
    if (*m_simBase->m_knobs->KNOB_PREF_THROTTLE_ON) {
      pref_stream_throttle();

      if (*m_simBase->m_knobs->KNOB_PREF_STREAM_ACCPERSTREAM)
        pref_stream_throttle_streams(line_index);
    }

    if (*m_simBase->m_knobs->KNOB_PREF_THROTTLEFB_ON) {
      pref_stream_throttle_fb();
    }

    /* search for stream buffer */
    // so we create on dcache misses also? - confusing... - onur
    hit_index = pref_stream_train_create_stream_buffer(line_index, true, create, tid);

    if (hit_index == -1) /* we do not have a trained buffer, nor did we create it */
      return;

    pref_stream_addto_train_stream_filter(line_index);

    if (pref_stream->stream[hit_index].trained) {

      pref_stream->stream[hit_index].lru = m_simBase->m_simulation_cycle; // update lru
      STAT_EVENT(HIT_TRAIN_STREAM);

      /* hit the stream_buffer, request the prefetch */

      for (ii = 0 ; ii < pref_stream->num_tosend; ii++) {

        if ((pref_stream->stream[hit_index].sp == line_index) && (pref_stream->stream[hit_index].buffer_full)) {
          // when is buffer_full set to false except for buffer creation? - onur
          // stream prefetch is requesting enough far ahead
          // stop prefetch and wait until miss address is within the buffer area
          return;
        }

        // Request the line one step beyond the current end pointer; give up
        // for this access when the request queue rejects it.
        if (!hwp_common->pref_addto_l2req_queue(pref_stream->stream[hit_index].ep + pref_stream->stream[hit_index].dir, pref_stream->hwp_info->id, load_PC)){
          //if (!hwp_common->pref_addto_l2req_queue(pref_stream->stream[hit_index].ep + pref_stream->stream[hit_index].dir, pref_stream->hwp_info->id)){
          return;
        }

        // Advance the end pointer; once the window exceeds the allowed
        // distance, mark the buffer full and slide the start pointer too.
        pref_stream->stream[hit_index].ep =  pref_stream->stream[hit_index].ep + pref_stream->stream[hit_index].dir;
        dis = pref_stream->stream[hit_index].ep - pref_stream->stream[hit_index].sp ;
        // Per-stream length when per-stream accuracy is on, global distance otherwise.
        maxdistance = (*m_simBase->m_knobs->KNOB_PREF_STREAM_ACCPERSTREAM ? pref_stream->stream[hit_index].length : pref_stream->distance);
        if (((pref_stream->stream[hit_index].dir == 1) && (dis > maxdistance) ) ||
            ((pref_stream->stream[hit_index].dir == -1) && (dis < -maxdistance) )) {
          pref_stream->stream[hit_index].buffer_full = true;
          pref_stream->stream[hit_index].sp = pref_stream->stream[hit_index].sp + pref_stream->stream[hit_index].dir;
        }

        if (*m_simBase->m_knobs->KNOB_REMOVE_REDUNDANT_STREAM)
          pref_stream_remove_redundant_stream(hit_index);

        DEBUG("[InQ:0x%s]ma:0x%7llx mi:0x%7llx d:%2d ri:0x%7llx, sp:0x%7llx ep:0x%7llx core_id:%d\n",
            "L1",
            line_addr, line_index, pref_stream->stream[hit_index].dir,
            pref_stream->stream[hit_index].ep + pref_stream->stream[hit_index].dir,
            pref_stream->stream[hit_index].sp, pref_stream->stream[hit_index].ep, core_id);
      }
    }
    else STAT_EVENT(MISS_TRAIN_STREAM);
  }
}
- onur + // stream prefetch is requesting enough far ahead + // stop prefetch and wait until miss address is within the buffer area + return; + } + + if (!hwp_common->pref_addto_l2req_queue(pref_stream->stream[hit_index].ep + pref_stream->stream[hit_index].dir, pref_stream->hwp_info->id, load_PC)){ + //if (!hwp_common->pref_addto_l2req_queue(pref_stream->stream[hit_index].ep + pref_stream->stream[hit_index].dir, pref_stream->hwp_info->id)){ + return; + } + + pref_stream->stream[hit_index].ep = pref_stream->stream[hit_index].ep + pref_stream->stream[hit_index].dir; + dis = pref_stream->stream[hit_index].ep - pref_stream->stream[hit_index].sp ; + maxdistance = (*m_simBase->m_knobs->KNOB_PREF_STREAM_ACCPERSTREAM ? pref_stream->stream[hit_index].length : pref_stream->distance); + if (((pref_stream->stream[hit_index].dir == 1) && (dis > maxdistance) ) || + ((pref_stream->stream[hit_index].dir == -1) && (dis < -maxdistance) )) { + pref_stream->stream[hit_index].buffer_full = true; + pref_stream->stream[hit_index].sp = pref_stream->stream[hit_index].sp + pref_stream->stream[hit_index].dir; + } + + if (*m_simBase->m_knobs->KNOB_REMOVE_REDUNDANT_STREAM) + pref_stream_remove_redundant_stream(hit_index); + + DEBUG("[InQ:0x%s]ma:0x%7llx mi:0x%7llx d:%2d ri:0x%7llx, sp:0x%7llx ep:0x%7llx core_id:%d\n", + "L1", + line_addr, line_index, pref_stream->stream[hit_index].dir, + pref_stream->stream[hit_index].ep + pref_stream->stream[hit_index].dir, + pref_stream->stream[hit_index].sp, pref_stream->stream[hit_index].ep, core_id); + } + } + else STAT_EVENT(MISS_TRAIN_STREAM); + } + } + + +void pref_stream_c::l2_miss_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop) +{ + pref_stream_train(tid, lineAddr, loadPC, true); +} + + +void pref_stream_c::l2_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop) +{ + pref_stream_train(tid, lineAddr, loadPC, false); +} + + +int pref_stream_c::pref_stream_train_create_stream_buffer(Addr line_index, bool train, bool create, int tid) +{ + int 
ii; + int dir; + int lru_index = -1; + bool found_closeby = false; + // First check for a trained buffer + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_STREAM_BUFFER_N; ii++) { + if ((!*m_simBase->m_knobs->KNOB_PREF_THREAD_INDEX || pref_stream->stream[ii].tid == tid) && + pref_stream->stream[ii].valid && + pref_stream->stream[ii].trained ) { + if(((pref_stream->stream[ii].sp <= line_index ) && ((pref_stream->stream[ii].ep + *m_simBase->m_knobs->KNOB_PREF_TRAIN_WINDOW_SLACK) >= (line_index )) && (pref_stream->stream[ii].dir == 1)) || + (((pref_stream->stream[ii].sp -*m_simBase->m_knobs->KNOB_PREF_TRAIN_WINDOW_SLACK) >= (line_index )) && (pref_stream->stream[ii].ep <= line_index) && (pref_stream->stream[ii].dir == -1))) { + // found a trained buffer + return ii; + } + } + } + + if (train || create) { + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_STREAM_BUFFER_N; ii++) { + if ((!*m_simBase->m_knobs->KNOB_PREF_THREAD_INDEX || pref_stream->stream[ii].tid == tid) && + pref_stream->stream[ii].valid + && (!pref_stream->stream[ii].trained)) { + if ((pref_stream->stream[ii].sp <= (line_index + *m_simBase->m_knobs->KNOB_STREAM_TRAIN_LENGTH)) && + (pref_stream->stream[ii].sp >= (line_index - *m_simBase->m_knobs->KNOB_STREAM_TRAIN_LENGTH))) { + + if (train) { // do these only if we are training + // decide the train dir + if (pref_stream->stream[ii].sp > line_index) dir = -1; + else dir = 1; + pref_stream->stream[ii].train_hit++; + if (pref_stream->stream[ii].train_hit > pref_stream->train_num) { + pref_stream->stream[ii].trained = true; + pref_stream->stream[ii].start_vline = pref_stream->stream[ii].sp; + pref_stream->stream[ii].ep = (dir > 0 ) ? line_index + *m_simBase->m_knobs->KNOB_STREAM_START_DIS : line_index - *m_simBase->m_knobs->KNOB_STREAM_START_DIS ; // 04/17/03 BUG !!! 
ins_model.c + pref_stream->stream[ii].dir = dir; + DEBUG("stream trained stream_index:%3d sp %7llx ep %7llx dir %2d miss_index %llx core_id:%d\n", + ii, pref_stream->stream[ii].sp, pref_stream->stream[ii].ep, pref_stream->stream[ii].dir, line_index, core_id); + } + } + + // create a new stream buffer + // create_stream_buffer(dir, line_index ); + return ii; + } + } + } + + if (!create || found_closeby) + return -1; + } + + if (create) { + // search for invalid buffer + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_STREAM_BUFFER_N; ii++) { + if (!pref_stream->stream[ii].valid){ + lru_index = ii; + break; + } + } + + // search for oldest buffer + + if (lru_index == -1) { + uns len; + lru_index = 0; + for ( ii = 0 ; ii < *m_simBase->m_knobs->KNOB_STREAM_BUFFER_N; ii++) { + if (pref_stream->stream[ii].lru < pref_stream->stream[lru_index].lru ) { + lru_index = ii; + } + } + STAT_EVENT(REPLACE_OLD_STREAM); + if (pref_stream->stream[lru_index].dir==0 || !pref_stream->stream[lru_index].trained) { + len = 0; + } + else if (pref_stream->stream[lru_index].dir == 1) { + len = pref_stream->stream[lru_index].ep - pref_stream->stream[lru_index].start_vline + 1; + } + else { + len = pref_stream->stream[lru_index].start_vline - pref_stream->stream[lru_index].ep + 1; + } + if (len!=0) { + STAT_EVENT( STREAM_LENGTH_0 + MIN2(len / 10, 10)); + } + } + + // create new train buffer + + pref_stream->stream[lru_index].lru = m_simBase->m_simulation_cycle; + pref_stream->stream[lru_index].valid = true; + pref_stream->stream[lru_index].sp = line_index; + pref_stream->stream[lru_index].ep = line_index; + pref_stream->stream[lru_index].train_hit = 1; + pref_stream->stream[lru_index].trained = false; + pref_stream->stream[lru_index].buffer_full = false; + + pref_stream->stream[lru_index].length = *m_simBase->m_knobs->KNOB_STREAM_LENGTH; + pref_stream->stream[lru_index].pref_issued = 0; + pref_stream->stream[lru_index].pref_useful = 0; + + STAT_EVENT(STREAM_TRAIN_CREATE); + DEBUG("create new stream 
: stream_no :%3d, line_index %7llx sp = %7llx core_id:%d\n", + lru_index, line_index, pref_stream->stream[lru_index].sp, core_id); + return lru_index; + } + + return -1; +} + + +void pref_stream_c::pref_stream_throttle(void) +{ + int dyn_shift = 0; + float acc = hwp_common->pref_get_accuracy(pref_stream->hwp_info->id); + float regacc = hwp_common->pref_get_regionbased_acc(); + float accratio = acc/regacc; + + if (acc != 1.0) { + if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_1) { + dyn_shift += 2; + } + else if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_2) { + dyn_shift += 1; + } + else if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_3) { + dyn_shift = 0; + } + else if (acc > *m_simBase->m_knobs->KNOB_PREF_ACC_THRESH_4) { + dyn_shift = dyn_shift - 1; + } + else { + dyn_shift = dyn_shift - 2; + } + } + if (*m_simBase->m_knobs->KNOB_PREF_ACCRATIOTHROTTLE) { + if (accratio < *m_simBase->m_knobs->KNOB_PREF_ACCRATIO_1 ) { + dyn_shift = dyn_shift - 1; + } + } + if (acc==1.0) { + pref_stream->distance = 64; + //pref_stream->train_num = PREF_STREAM_TRAIN_NUM_3; + STAT_EVENT(PREF_DISTANCE_4); + } + else { + if (dyn_shift >= 2 ) { + pref_stream->distance = 128; + // pref_stream->train_num = PREF_STREAM_TRAIN_NUM_3; + STAT_EVENT(PREF_DISTANCE_5); + } + else if (dyn_shift == 1 ) { + pref_stream->distance = 64; + //pref_stream->train_num = PREF_STREAM_TRAIN_NUM_3; + STAT_EVENT(PREF_DISTANCE_4); + } + else if (dyn_shift == 0 ) { + pref_stream->distance = 32; + //pref_stream->train_num = PREF_STREAM_TRAIN_NUM_2; + STAT_EVENT(PREF_DISTANCE_3); + } + else if (dyn_shift == -1 ) { + pref_stream->distance = 16; + //pref_stream->train_num = PREF_STREAM_TRAIN_NUM_1; + STAT_EVENT(PREF_DISTANCE_2); + } + else if (dyn_shift <= -2 ) { + pref_stream->distance = 5; + //pref_stream->train_num = PREF_STREAM_TRAIN_NUM_0; + STAT_EVENT(PREF_DISTANCE_1); + } + } +} + + +//////////////////////////////////////////////////////////////////////// +// Rest Used when throttling for each 
stream separately - NON FUNCTIONAL +bool pref_stream_c::pref_stream_train_stream_filter(Addr line_index) +{ + int ii; + for ( ii = 0 ; ii < *m_simBase->m_knobs->KNOB_TRAIN_FILTER_SIZE; ii++) { + if (pref_stream->train_filter[ii] == line_index) { + return true; + } + } + return false; +} + +inline void pref_stream_c::pref_stream_addto_train_stream_filter(Addr line_index) +{ + pref_stream->train_filter[(pref_stream->train_filter_no++)%*m_simBase->m_knobs->KNOB_TRAIN_FILTER_SIZE] = line_index; +} + + +void pref_stream_c::pref_stream_remove_redundant_stream(int hit_index) +{ + int ii; + + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_STREAM_BUFFER_N; ii++) { + if ((ii == hit_index) || (!pref_stream->stream[ii].valid)) continue; + if (((pref_stream->stream[ii].ep < pref_stream->stream[hit_index].ep ) && + (pref_stream->stream[ii].ep > pref_stream->stream[hit_index].sp ) ) || + ((pref_stream->stream[ii].sp < pref_stream->stream[hit_index].ep ) && + (pref_stream->stream[ii].sp > pref_stream->stream[hit_index].sp ) )) { + pref_stream->stream[ii].valid = false; + STAT_EVENT(REMOVE_REDUNDANT_STREAM_STAT); + DEBUG("stream[%d] sp:0x%llx ep:0x%llx is removed by stream[%d] sp:0x%llx ep:0x%llx core_id:%d\n", + ii, pref_stream->stream[ii].sp, pref_stream->stream[ii].ep, hit_index, + pref_stream->stream[hit_index].sp, pref_stream->stream[hit_index].ep, core_id); + } + } +} + + +float pref_stream_c::pref_stream_acc_getacc(int index, float pref_acc) +{ + float acc = pref_stream->stream[index].pref_issued>40?((float) pref_stream->stream[index].pref_useful) / ((float)pref_stream->stream[index].pref_issued) : pref_acc; + return acc; +} + + +void pref_stream_c::pref_stream_acc_l2_useful(Addr line_index) +{ + if (!knob_enable) { + return; + } + + for (int ii = 0; ii < *m_simBase->m_knobs->KNOB_STREAM_BUFFER_N; ii++) + { + if (pref_stream->stream[ii].valid && pref_stream->stream[ii].trained ) + { + if(((pref_stream->stream[ii].start_vline <= line_index ) && (pref_stream->stream[ii].ep >= 
line_index) && (pref_stream->stream[ii].dir == 1)) || + ((pref_stream->stream[ii].start_vline >= line_index ) && (pref_stream->stream[ii].ep <= line_index) && (pref_stream->stream[ii].dir == -1))) { + // found a trained buffer + pref_stream->stream[ii].pref_useful += 1; + } + } + } + // pref_stream->pref_useful += 1; +} + + +void pref_stream_c::pref_stream_acc_l2_issued(Addr line_index) +{ + for (int ii = 0; ii < *m_simBase->m_knobs->KNOB_STREAM_BUFFER_N; ++ii) + { + if (pref_stream->stream[ii].valid && pref_stream->stream[ii].trained ) + { + if(((pref_stream->stream[ii].start_vline <= line_index ) && + (pref_stream->stream[ii].ep >= line_index) && + (pref_stream->stream[ii].dir == 1)) || + ((pref_stream->stream[ii].start_vline >= line_index ) && + (pref_stream->stream[ii].ep <= line_index) && + (pref_stream->stream[ii].dir == -1))) + { + // found a trained buffer + pref_stream->stream[ii].pref_issued += 1; + } + } + } + // pref_stream->pref_issued += 1; +} + + +void pref_stream_c::pref_stream_throttle_streams(Addr line_index) +{ + for (int ii = 0; ii < *m_simBase->m_knobs->KNOB_STREAM_BUFFER_N; ++ii) + { + if (pref_stream->stream[ii].valid && pref_stream->stream[ii].trained ) + { + if(((pref_stream->stream[ii].ep - *m_simBase->m_knobs->KNOB_PREF_ACC_DISTANCE_10 <= line_index ) && + (pref_stream->stream[ii].ep >= line_index) && (pref_stream->stream[ii].dir == 1)) || + ((pref_stream->stream[ii].ep + *m_simBase->m_knobs->KNOB_PREF_ACC_DISTANCE_10 >= line_index ) && ( + pref_stream->stream[ii].ep <= line_index) && (pref_stream->stream[ii].dir == -1))) + { + // found a trained buffer + pref_stream_throttle_stream(ii); + return; + } + } + } +} + +/* + throttle_stream_pf -> reset the stream length and train length for this stream buffer + */ +void pref_stream_c::pref_stream_throttle_stream(int index) +{ + if (0) { // to prevent compilation error, jaekyu (11-3-2009) + printf("%d", index); + } + /* + float stream_acc; + float pref_acc; + uns thresh_num = 0; + uns 
old_length = pref_stream->stream[index].length; + + pref_acc = (pref_stream->pref_issued > 100) ?((float)pref_stream->pref_useful)/((float)pref_stream->pref_issued):1; + + if (PREF_ACC_USE_CACHE) { + stream_acc = pref_stream_acc_getacc(index, pref_acc); + } else if (PREF_ACC_USE_REGION) { + stream_acc = Pref_getaccuracy(pref_stream->stream[index].ep, index); + } else { + stream_acc = Pref_getaccuracy_stream(pref_stream->stream[index].ep, index, pref_stream->stream[index].start_vline, pref_stream->stream[index].ep); + } + + // First set train distance based on pref_acc + if (PREF_ACC_DYN_TRAIN_ON) { + if (pref_acc >= PREF_TRAIN_THRESH_3) { + pref_stream->train_num = PREF_ACC_TRAIN_NUM_3; + } + else if (pref_acc >= PREF_TRAIN_THRESH_2) { + pref_stream->train_num = PREF_ACC_TRAIN_NUM_2; + } + else if (pref_acc >= PREF_TRAIN_THRESH_1) { + pref_stream->train_num = PREF_ACC_TRAIN_NUM_1; + } + else { + pref_stream->train_num = PREF_ACC_TRAIN_NUM_0; + } + } + if (!PREF_ACC_DYN_DIST_ON) { + return; + } + if (PREF_ACC_USE_OVERALLALSO && stream_acc < PREF_ACC_THRESH_7 ) { + if (pref_acc >= PREF_TRAIN_THRESH_3) { + stream_acc = stream_acc + PREF_ACC_TRAIN_ACCOFFSET_3; + } + else if (pref_acc >= PREF_TRAIN_THRESH_2) { + stream_acc = stream_acc + PREF_ACC_TRAIN_ACCOFFSET_2; + } + else if (pref_acc >= PREF_TRAIN_THRESH_1) { + stream_acc = stream_acc + PREF_ACC_TRAIN_ACCOFFSET_1; + } + else { + stream_acc = stream_acc + PREF_ACC_TRAIN_ACCOFFSET_0; + } + } + if (PREF_ACC_USE_ONLYGLOBAL) { + stream_acc = pref_acc; + } + // Now set the stream length for this buffer based on stream_acc + if (stream_acc >= PREF_ACC_THRESH_9) { + if (PREF_ACC_INCDEC_LENGTH && pref_stream->stream[index].length < PREF_ACC_DISTANCE_10) { + pref_stream->stream[index].length += 6; + } else { + pref_stream->stream[index].length = PREF_ACC_DISTANCE_10; + } + thresh_num = 9; + } else if (stream_acc >= PREF_ACC_THRESH_8) { + if (PREF_ACC_INCDEC_LENGTH && pref_stream->stream[index].length < PREF_ACC_DISTANCE_9) { 
+ pref_stream->stream[index].length += 5; + } else { + pref_stream->stream[index].length = PREF_ACC_DISTANCE_9; + } + thresh_num = 8; + } else if (stream_acc >= PREF_ACC_THRESH_7) { + if (PREF_ACC_INCDEC_LENGTH && pref_stream->stream[index].length < PREF_ACC_DISTANCE_8) { + pref_stream->stream[index].length += 4; + } else { + pref_stream->stream[index].length = PREF_ACC_DISTANCE_8; +} +thresh_num = 7; +} else if (stream_acc >= PREF_ACC_THRESH_6) { + if (PREF_ACC_INCDEC_LENGTH && pref_stream->stream[index].length < PREF_ACC_DISTANCE_7) { + pref_stream->stream[index].length += 3; + } else { + pref_stream->stream[index].length = PREF_ACC_DISTANCE_7; + } + thresh_num = 6; +} else if (stream_acc >= PREF_ACC_THRESH_5) { + if (PREF_ACC_INCDEC_LENGTH && pref_stream->stream[index].length < PREF_ACC_DISTANCE_6) { + pref_stream->stream[index].length += 2; + } else { + pref_stream->stream[index].length = PREF_ACC_DISTANCE_6; + } + thresh_num = 5; +} else if (stream_acc >= PREF_ACC_THRESH_4) { + if (PREF_ACC_INCDEC_LENGTH && pref_stream->stream[index].length < PREF_ACC_DISTANCE_5) { + pref_stream->stream[index].length += 1; + } else { + pref_stream->stream[index].length = PREF_ACC_DISTANCE_5; + } + thresh_num = 4; +} else if (stream_acc >= PREF_ACC_THRESH_3) { + if (PREF_ACC_INCDEC_LENGTH && pref_stream->stream[index].length > PREF_ACC_DISTANCE_4) { + pref_stream->stream[index].length -= 1; + } else { + pref_stream->stream[index].length = PREF_ACC_DISTANCE_4; + } + thresh_num = 3; +} else if (stream_acc >= PREF_ACC_THRESH_2) { + if (PREF_ACC_INCDEC_LENGTH && pref_stream->stream[index].length > PREF_ACC_DISTANCE_3) { + pref_stream->stream[index].length -= 2; + } else { + pref_stream->stream[index].length = PREF_ACC_DISTANCE_3; + } + thresh_num = 2; +} else if (stream_acc >= PREF_ACC_THRESH_1) { + if (PREF_ACC_INCDEC_LENGTH && pref_stream->stream[index].length > PREF_ACC_DISTANCE_2) { + pref_stream->stream[index].length -= 3; + } else { + pref_stream->stream[index].length = 
PREF_ACC_DISTANCE_2; + } + thresh_num = 1; +} else { + if (PREF_ACC_INCDEC_LENGTH && pref_stream->stream[index].length > PREF_ACC_DISTANCE_1) { + pref_stream->stream[index].length -= 4; + } else { + pref_stream->stream[index].length = PREF_ACC_DISTANCE_1; + } + thresh_num = 0; +} +if (pref_stream->stream[index].dir == 1) { + Addr newsp = pref_stream->stream[index].ep - pref_stream->stream[index].length; + if (newsp > pref_stream->stream[index].start_vline) { + pref_stream->stream[index].sp = newsp; + } else { + pref_stream->stream[index].sp = pref_stream->stream[index].start_vline; + } +} else if (pref_stream->stream[index].dir == -1) { + Addr newsp = pref_stream->stream[index].ep + pref_stream->stream[index].length; + if (newsp < pref_stream->stream[index].start_vline) { + pref_stream->stream[index].sp = pref_stream->stream[index].ep + pref_stream->stream[index].length; + } else { + pref_stream->stream[index].sp = pref_stream->stream[index].start_vline; + } +} + +STAT_EVENT(PREF_ACC_NUM_1 + thresh_num); + +if (pref_stream->stream[index].length > old_length) { + STAT_EVENT(PREF_ACC_INC_LENGTH); +} else if (pref_stream->stream[index].length < old_length) { + STAT_EVENT(PREF_ACC_DEC_LENGTH); +} +*/ +} + + +void pref_stream_c::done_func(void) +{ + int ii; + uns len; + + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_STREAM_BUFFER_N; ++ii) + { + if (pref_stream->stream[ii].dir == 0 || !pref_stream->stream[ii].valid) + { + len = 0; + } + else if (pref_stream->stream[ii].dir == 1) + { + len = pref_stream->stream[ii].ep - pref_stream->stream[ii].start_vline + 1; + } + else + { + len = pref_stream->stream[ii].start_vline - pref_stream->stream[ii].ep + 1; + } + + if (len != 0) + STAT_EVENT( STREAM_LENGTH_0 + MIN2(len / 10, 10)); + } +} + +/* + void pref_stream_runahead(Counter op_num) + { +// Try to run ahead each stream having an accuracy > STREAM_RA_ACC by STREAM_RA_NUM +// on a l2 miss. 
+int ii, jj; +float stream_acc; +bool queue_full = false; +float pref_acc = (pref_stream->pref_issued > 100) ?((float)pref_stream->pref_useful)/((float)pref_stream->pref_issued):1; + +if (pref_stream->curr_op_num != op_num) { +pref_stream->ra_num_sent = 0; +pref_stream->ra_stream_index = 0; +pref_stream->curr_op_num = op_num; +STAT_EVENT(STREAM_ENTER_RA); +} + +for ( jj = pref_stream->ra_num_sent; jj < STREAM_RA_NUM; jj++) { +for ( ii = pref_stream->ra_stream_index ; ii < STREAM_BUFFER_N ; ii++) { +stream_acc = pref_acc_stream_getacc(ii, pref_acc ); +if (stream_acc > STREAM_RA_ACC && pref_stream->stream[ii].trained && pref_stream->stream[ii].valid) { +// Send out request +Pref_Mem_Req new_req; + +new_req.line_index = pref_stream->stream[ii].ep + pref_stream->stream[ii].dir; +new_req.line_addr = (new_req.line_index) << LOG2_DCACHE_LINE_SIZE; +// new_req.hit_stream_buffer = ii; +new_req.valid = true; + +if (pref_stream->pref_req_queue[stream_pref_req_no%PREF_REQ_Q_SIZE].valid) { +queue_full = true; +break; +} + +pref_stream->pref_req_queue[stream_pref_req_no++%PREF_REQ_Q_SIZE] = new_req; + +pref_stream->stream[ii].ep = pref_stream->stream[ii].ep + pref_stream->stream[ii].dir; +STAT_EVENT(STREAM_BUFFER_REQ); +} +} +if (queue_full) { +break; +} +} +pref_stream->ra_stream_index = ii; +pref_stream->ra_num_sent = jj; +} + +*/ + + +void pref_stream_c::pref_stream_throttle_fb(void) +{ + // on pref_dhal, we update the dyn_degree based on sent pref + if (*m_simBase->m_knobs->KNOB_PREF_DHAL) + { + pref_stream->distance = pref_stream->hwp_info->dyn_degree; + } + else + { + hwp_common->pref_get_degfb(pref_stream->hwp_info->id); + ASSERT(pref_stream->hwp_info->dyn_degree<=4 ); + // ASSERT(pref_stream->hwp_info->dyn_degree>=0 && pref_stream->hwp_info->dyn_degree<=4 ); + pref_stream->distance = pref_stream->pref_degree_vals[pref_stream->hwp_info->dyn_degree]; + pref_stream->num_tosend = pref_stream->num_tosend_vals[pref_stream->hwp_info->dyn_degree]; + } +} \ No newline at end of 
file diff --git a/src/hw_prefetcher/pref_stream.h b/src/hw_prefetcher/pref_stream.h new file mode 100644 index 0000000..620ba9f --- /dev/null +++ b/src/hw_prefetcher/pref_stream.h @@ -0,0 +1,103 @@ +/*************************************************************************************** + * File : pref_stream.h + * Author : Santhosh Srinath ( based on Hyesoon's code ) + * Date : 1/20/2005 + * CVS : $Id: pref_stream.h,v 1.1 2008/07/30 14:18:16 kacear Exp $: + * Description : Stream Prefetcher + ***************************************************************************************/ +#ifndef __STREAM_PREF_H__ + +#include "../memory.h" +#include "../pref_common.h" +#include "../pref.h" + +typedef struct Stream_Buffer_Struct { + int tid; + Addr load_pc[4]; + Addr line_index; + Addr sp; + Addr ep; + Addr start_vline; + int dir; + int lru; + bool valid; + bool buffer_full; + bool trained; + int train_hit; + uns length; // Now with the pref accuracy, we can dynamically tune the length + uns pref_issued; + uns pref_useful; +}Stream_Buffer; + +// stream HWP +typedef struct Pref_Stream_Struct { + pref_info_s* hwp_info; + + Stream_Buffer *stream; + Stream_Buffer *l2hit_stream; + + Addr *train_filter; + int train_filter_no; + + uns train_num; // With pref accuracy, dynamically tune the train length + uns distance; + uns pref_degree_vals[5]; + + uns num_tosend; + uns num_tosend_vals[5]; +}Pref_Stream; + + +class pref_stream_c : public pref_base_c +{ + private: + Pref_Stream *pref_stream; + pref_stream_c(); + + public: + pref_stream_c(hwp_common_c *, Unit_Type, macsim_c* simBase); + + void init_func(int ); + void done_func(void); + void l1_miss_func(int, Addr, Addr, uop_c *) {} + void l1_hit_func(int, Addr, Addr, uop_c *); + void l1_pref_hit_func(int, Addr, Addr, uop_c *) {} + void l2_miss_func(int tid, Addr lineAddr, Addr loadPC, uop_c *); + void l2_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c *); + void l2_pref_hit_func(int, Addr, Addr, uop_c*) {} + + + void 
pref_stream_train(int tid, Addr lineAddr, Addr loadPC, bool create); + + int pref_stream_train_create_stream_buffer(Addr line_index, bool train, bool create, int tid); + bool pref_stream_train_stream_filter(Addr line_index); + + inline void pref_stream_addto_train_stream_filter(Addr line_index); + + bool pref_stream_req_queue_filter(Addr line_addr); + + void pref_stream_remove_redundant_stream(int hit_index); + + //void pref_stream_runahead(Counter op_num); + + // Used when throttling using the overall accuracy numbers + void pref_stream_throttle(void); + + void pref_stream_throttle_fb(void); + + ///////////////////////////////////////////////// + // Used when throttling for each stream separately + // NON FUNCTIONAL CURRENTLY + void pref_stream_throttle_streams(Addr line_index); + void pref_stream_throttle_stream(int index); + // Again - Use ONLY when throttling per stream + float pref_stream_acc_getacc(int index, float pref_acc); + void pref_stream_acc_l2_useful(Addr line_index); + void pref_stream_acc_l2_issued(Addr line_index); + /////////////////////////////////////////////////// + +}; + + +#define __STREAM_PREF_H__ +#endif /* __STREAM_PREF_H__*/ \ No newline at end of file diff --git a/src/hw_prefetcher/pref_stridepc.cc b/src/hw_prefetcher/pref_stridepc.cc new file mode 100644 index 0000000..48d550a --- /dev/null +++ b/src/hw_prefetcher/pref_stridepc.cc @@ -0,0 +1,198 @@ +/*************************************************************************************** + * File : pref_stride.c + * Author : Santhosh Srinath + * Date : 1/23/2005 + * CVS : $Id: pref_stridepc.cc,v 1.2 2008/09/12 03:42:02 kacear Exp $: + * Description : Stride Prefetcher - Based on load's PC address + ***************************************************************************************/ + + +#include "pref_stridepc.h" + +#include "../global_defs.h" +#include "../global_types.h" +#include "../debug_macros.h" + +#include "../utils.h" +#include "../assert_macros.h" +#include "../uop.h" 
+#include "../statistics.h" +#include "../pref_common.h" +#include "../core.h" +#include "../memory.h" + +#include "../all_knobs.h" + +/* + stride prefetcher : Stride prefetcher based on the original stride work + - Essentially use the load's PC to index into a table of prefetch entries + +*/ + +/**************************************************************************************/ +/* Macros */ +#define DEBUG(args...) _DEBUG(*m_simBase->m_knobs->KNOB_DEBUG_PREF_STRIDEPC, ## args) + +/**************************************************************************************/ +/* Global variables */ + + +// Default Constructor +//pref_stridepc_c::pref_stridepc_c() {} + +pref_stridepc_c::pref_stridepc_c(hwp_common_c *hcc, Unit_Type type, macsim_c* simBase) +: pref_base_c(simBase) +{ + name = "stridepc"; + hwp_type = Mem_To_UL1; + hwp_common = hcc; + switch (type) { + case UNIT_SMALL: + knob_enable = *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_ON; + break; + case UNIT_MEDIUM: + knob_enable = *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_ON_MEDIUM_CORE; + break; + case UNIT_LARGE: + knob_enable = *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_ON_LARGE_CORE; + break; + } + + l1_miss = true; + l1_hit = true; + l2_miss = true; + +} + + +void pref_stridepc_c::init_func(int cid) +{ + if (!knob_enable) + return; + + core_id = cid; + hwp_info->enabled = true; + // stridepc_hwp = (Pref_StridePC*)malloc(sizeof(Pref_StridePC)); + this->stride_table = (StridePC_Table_Entry *) calloc(*m_simBase->m_knobs->KNOB_PREF_STRIDEPC_TABLE_N, sizeof(StridePC_Table_Entry)); +} + +void pref_stridepc_c::l1_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop) +{ + l2_miss_func(tid, lineAddr, loadPC, uop); +} + +void pref_stridepc_c::l1_miss_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop) +{ + l2_miss_func(tid, lineAddr, loadPC, uop); +} + +void pref_stridepc_c::l2_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop) +{ + pref_stridepc_l2_train(tid, lineAddr, loadPC, uop, true); +} + + +void 
pref_stridepc_c::l2_miss_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop) +{ + pref_stridepc_l2_train(tid, lineAddr, loadPC, uop, false); +} + +void pref_stridepc_c::pref_stridepc_l2_train(int tid, Addr lineAddr, Addr loadPC, uop_c *uop, bool l2_hit) +{ + int ii; + int idx = -1; + + Addr lineIndex = lineAddr >> LOG2_DCACHE_LINE_SIZE; + StridePC_Table_Entry * entry = NULL; + + int stride; + + if (loadPC==0) { + return; // no point hashing on a null address + } + + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_TABLE_N; ii++) { + if ((!*m_simBase->m_knobs->KNOB_PREF_THREAD_INDEX || this->stride_table[ii].tid == tid) && + this->stride_table[ii].load_addr == loadPC && + this->stride_table[ii].valid) { + idx = ii; + break; + } + } + + + if (idx == -1) { + if (l2_hit) { // ONLY TRAIN on hit + return; + } + for (ii = 0; ii < *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_TABLE_N; ii++) { + if (!this->stride_table[ii].valid) { + idx = ii; + break; + } + if (idx == -1 || (this->stride_table[idx].last_access < this->stride_table[ii].last_access)) { + idx = ii; + } + } + this->stride_table[idx].trained = false; + this->stride_table[idx].valid = true; + this->stride_table[idx].stride = 0; + this->stride_table[idx].train_num = 0; + this->stride_table[idx].pref_sent = 0; + this->stride_table[idx].last_addr = ( *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_USELOADADDR ? lineAddr : lineIndex); + //this->stride_table[idx].last_addr = ( *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_USELOADADDR ? lineAddr : lineIndex); + this->stride_table[idx].last_addr = uop->m_vaddr; + this->stride_table[idx].load_addr = loadPC; + this->stride_table[idx].last_access= m_simBase->m_simulation_cycle; + this->stride_table[idx].tid = tid; + return; + } + + entry = &this->stride_table[idx]; + entry->last_access = m_simBase->m_simulation_cycle; + stride = ( *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_USELOADADDR ? 
( lineAddr - entry->last_addr ) : ( lineIndex - entry->last_addr ) ); + + // printf("loadaddr:%llx trained:%d entrystride:%d currstride:%d lineaddr:%llx pfaddr:%llx\n", entry->load_addr, entry->trained, entry->stride, stride, lineIndex, entry->pref_last_index); + + if (!entry->trained) { + // Now let's train + if (stride == 0) + return; + if (entry->stride!=stride) { + entry->stride = stride; + entry->train_num = 1; + } else { + entry->train_num++; + } + if ( entry->train_num == *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_TRAINNUM) { + entry->trained = true; + entry->start_index = (*m_simBase->m_knobs->KNOB_PREF_STRIDEPC_USELOADADDR ? lineAddr : lineIndex); + entry->pref_last_index = entry->start_index + (*m_simBase->m_knobs->KNOB_PREF_STRIDEPC_STARTDIS*entry->stride); + entry->pref_sent = 0; + } + } else { + Addr pref_index; + Addr curr_idx = (*m_simBase->m_knobs->KNOB_PREF_STRIDEPC_USELOADADDR ? lineAddr : lineIndex); + + if (entry->pref_sent) + entry->pref_sent--; + + if ((stride % entry->stride==0) && + (((stride > 0) && (curr_idx >= entry->start_index) && (curr_idx <= entry->pref_last_index)) || + ((stride < 0) && (curr_idx <= entry->start_index) && (curr_idx >= entry->pref_last_index)) )) { + // all good. continue sending out prefetches + for (ii = 0; (ii < *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_DEGREE && entry->pref_sent < *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_DISTANCE); ii++, entry->pref_sent++) { + pref_index = entry->pref_last_index + entry->stride; + if (!hwp_common->pref_addto_l2req_queue( (*m_simBase->m_knobs->KNOB_PREF_STRIDEPC_USELOADADDR ? (pref_index>>LOG2_DCACHE_LINE_SIZE) : pref_index), this->hwp_info->id )) + break; // q is full + entry->pref_last_index = pref_index; + } + } else { + // stride has changed... + // lets retrain + entry->trained = false; + entry->train_num = 1; + } + } + entry->last_addr = ( *m_simBase->m_knobs->KNOB_PREF_STRIDEPC_USELOADADDR ? 
lineAddr : lineIndex); +} diff --git a/src/hw_prefetcher/pref_stridepc.h b/src/hw_prefetcher/pref_stridepc.h new file mode 100644 index 0000000..8527150 --- /dev/null +++ b/src/hw_prefetcher/pref_stridepc.h @@ -0,0 +1,64 @@ +/*************************************************************************************** + * File : pref_stridepc.h + * Author : Santhosh Srinath + * Date : 1/23/2005 + * CVS : $Id: pref_stridepc.h,v 1.1 2008/07/30 14:18:16 kacear Exp $: + * Description : Stride Prefetcher - Based on load's PC address + ***************************************************************************************/ +#ifndef __PREF_STRIDEPC_H__ + +#include "../pref_common.h" +#include "../pref.h" + +typedef struct StridePC_Table_Entry_Struct { + bool trained; + bool valid; + + Addr last_addr; + Addr load_addr; + Addr start_index; + Addr pref_last_index; + int stride; + int tid; + + Counter train_num; + Counter pref_sent; + Counter last_access; // for lru +} StridePC_Table_Entry; + + +class pref_stridepc_c : public pref_base_c +{ + private: + StridePC_Table_Entry * stride_table; + pref_stridepc_c(); + + public: + + pref_stridepc_c(hwp_common_c *, Unit_Type, macsim_c* ); + /*************************************************************/ + /* HWP Interface */ + void init_func(int); + void done_func() {} + void l1_miss_func(int, Addr, Addr, uop_c *); + void l1_hit_func(int, Addr, Addr, uop_c *); + void l1_pref_hit_func(int, Addr, Addr, uop_c *) {} + void l2_miss_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop); + void l2_hit_func(int tid, Addr lineAddr, Addr loadPC, uop_c *uop ); + void l2_pref_hit_func(int, Addr, Addr, uop_c *) {} + + + void pref_stridepc_l2_train(int tid, Addr lineAddr, Addr loadPC, uop_c *uop, bool l2_hit); + void set_core_id(int cid) { + core_id = cid; + } + +}; + + + +/*************************************************************/ +/* Misc functions */ + +#define __PREF_STRIDEPC_H__ +#endif /* __PREF_STRIDEPC_H__*/ \ No newline at end of file