# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 20


# SASS execution (only supported with CUDA >= 4.0)
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0

# high level architecture configuration
-gpgpu_n_clusters 15
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 6
-gpgpu_n_sub_partition_per_mchannel 2

# Fermi clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
# In Fermi, each pipeline has 16 execution units, so the Core clock needs to be divided
# by 2. (GPGPU-Sim simulates a warp (32 threads) in a single cycle). 1400/2 = 700
-gpgpu_clock_domains 700.0:700.0:700.0:924.0
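# Note: the 924 MHz DRAM clock is the GDDR5 command clock; with the QDR
# data/command ratio of 4 set below, this corresponds to the GTX 480's
# 3696 MT/s effective memory data rate (924 x 4).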

# shader core pipeline config
-gpgpu_shader_registers 32768

# This implies a maximum of 48 warps/SM (1536 threads / 32 threads per warp)
-gpgpu_shader_core_pipeline 1536:32
-gpgpu_shader_cta 8
-gpgpu_simd_model 1
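# Occupancy sanity check: 32768 registers / 1536 threads ~ 21 registers per
# thread at full occupancy, with at most 8 concurrent CTAs per SM.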

# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_pipeline_widths 2,1,1,2,1,1,2
-gpgpu_num_sp_units 2
-gpgpu_num_sfu_units 1
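# The SP-path widths here (ID_OC_SP=2, OC_EX_SP=2) line up with the two SP
# units, and the SFU-path width of 1 with the single SFU; EX_WB=2 lets both
# SP pipelines write back in the same cycle.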

# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
-ptx_opcode_latency_int 4,13,4,5,145
-ptx_opcode_initiation_int 1,2,2,1,8
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_initiation_fp 1,2,1,1,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 8,16,8,8,130
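# The initiation interval is how many cycles must pass before the unit accepts
# another instruction of that type: a single-precision MAD can start every
# cycle, while a double-precision MAD can start only every 8 cycles, matching
# the GTX 480's FP64 rate of 1/8 the FP32 rate.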


# In Fermi, the L1 data cache and shared memory can be configured as 16KB:48KB (default) or 48KB:16KB
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# Note: Hashing set index function (H) only applies to caches with 32 or 64 sets.
-gpgpu_cache:dl1 32:128:4,L:L:m:N:H,A:32:8,8
-gpgpu_shmem_size 49152
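# Default split: 32 sets x 128-byte lines x 4-way = 16KB of L1 data cache,
# paired with 49152 bytes (48KB) of shared memory per SM.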

# Alternative configuration for Fermi when cudaFuncCachePreferL1 is selected
#-gpgpu_cache:dl1 64:128:6,L:L:m:N:H,A:32:8,8
#-gpgpu_shmem_size 16384
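# That variant gives 64 x 128 x 6 = 48KB of L1 data cache with 16384 bytes
# (16KB) of shared memory, i.e. the 48KB:16KB split.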

# 64 sets, 128-byte lines, 8-way associative per memory sub-partition. This gives 768KB of L2 cache in total.
-gpgpu_cache:dl2 64:128:8,L:B:m:W:L,A:32:4,4:0,32
-gpgpu_cache:dl2_texture_only 0
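# L2 sizing: 64 sets x 128B x 8 ways = 64KB per sub-partition; 6 memory
# channels x 2 sub-partitions = 12 sub-partitions, so 12 x 64KB = 768KB total.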

-gpgpu_cache:il1 4:128:4,L:R:f:N:L,A:2:32,4
-gpgpu_tex_cache:l1 4:128:24,L:R:m:N:L,F:128:4,128:2
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4
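# Capacities implied by the <nsets>:<bsize>:<assoc> fields: instruction cache
# 4 x 128 x 4 = 2KB, texture L1 4 x 128 x 24 = 12KB, constant L1
# 64 x 64 x 2 = 8KB, each per core.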

# enable operand collector
-gpgpu_operand_collector_num_units_sp 6
-gpgpu_operand_collector_num_units_sfu 8
-gpgpu_operand_collector_num_in_ports_sp 2
-gpgpu_operand_collector_num_out_ports_sp 2
-gpgpu_num_reg_banks 16
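# With 16 banks over a 32768-entry register file, each bank holds 2048
# registers; a warp's operands are spread across banks so the collector units
# can fetch sources in parallel instead of serializing on one bank.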

# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
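# 32 banks matches Fermi's shared memory organization; warp_parts=1 means the
# whole 32-thread warp is checked for bank conflicts as one access group
# (rather than as two half-warps as on earlier architectures).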

-gpgpu_max_insn_issue_per_warp 1

# interconnection
-network_mode 1
-inter_config_file config_fermi_islip.icnt

# memory partition latency config
-rop_latency 120
-dram_latency 100

# dram model config
-gpgpu_dram_scheduler 1
# The DRAM return queue and the scheduler queue together should provide enough
# buffering to sustain the memory-level parallelism needed to tolerate DRAM latency.
# To allow 100% DRAM utilization, there should be at least enough buffer entries to
# cover the minimum DRAM latency (100 core cycles), i.e.
# total buffer space required = 100 x 924MHz / 700MHz = 132
-gpgpu_frfcfs_dram_sched_queue_size 16
-gpgpu_dram_return_queue_size 116
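# The two queues configured here add up to that estimate:
# 16 scheduler-queue entries + 116 return-queue entries = 132.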

# For Fermi, the total bus width is 384 bits, i.e. 8 bytes per memory partition
# (4 bytes at each of the two DRAM chips on that partition)
-gpgpu_n_mem_per_ctrlr 2
-gpgpu_dram_buswidth 4
-gpgpu_dram_burst_length 8
-dram_data_command_freq_ratio 4 # GDDR5 is QDR
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS
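# Assuming the access granularity is buswidth x burst length x chips per
# controller, one burst moves 4B x 8 x 2 = 64 bytes per channel, i.e. half of
# a 128-byte L2 line per DRAM access.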

# GDDR5 timing from Hynix H5GQ1H24AFR
# to disable bank groups, set nbkgrp to 1 and tCCDL and tRTPL to 0
-gpgpu_dram_timing_opt "nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40:
                        CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2"
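# These timings are counted in DRAM-clock (924 MHz) cycles; the 16 banks form
# nbkgrp=4 bank groups of 4 banks each, and column accesses within the same
# bank group use the longer CCDL=3 spacing instead of CCD=2.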

# Fermi has two schedulers per core
-gpgpu_num_sched_per_core 2
# Two-level scheduler with active and pending pools
#-gpgpu_scheduler two_level_active:6:0:1
# Loose round robin scheduler
#-gpgpu_scheduler lrr
# Greedy-then-oldest scheduler
-gpgpu_scheduler gto
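# With 2 schedulers and up to 48 resident warps, each scheduler arbitrates
# among at most 24 warps. gto (greedy-then-oldest) keeps issuing from the same
# warp until it stalls, then falls back to the oldest ready warp.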

# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0

# power model configs
-power_simulation_enabled 1
-gpuwattch_xml_file gpuwattch_gtx480.xml

# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0