
Commit 5dc5bf8

tgrogers authored and GitHub Enterprise committed
Merge pull request accel-sim#21 from abdallm/dev-purdue-integration
A few little things and some sizeable changes: added memory partition indexing, updated some configuration files, and fixed the texture cache so apps that use texture memory no longer crash.
2 parents faa0531 + eb3d06a commit 5dc5bf8

39 files changed: +1266 / -854 lines

configs/3.x-cfgs/SM7_TITANV/config_fermi_islip.icnt renamed to configs/3.x-cfgs/GTX480/config_fermi_islip.icnt

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@ network_count = 2;
 
 // Topology
 topology = fly;
-k = 64;
+k = 27;
 n = 1;
 
 // Routing
Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 20


# SASS execution (only supported with CUDA >= 4.0)
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0

# high level architecture configuration
-gpgpu_n_clusters 15
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 6
-gpgpu_n_sub_partition_per_mchannel 2

# Fermi clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
# In Fermi, each pipeline has 16 execution units, so the Core clock needs to be divided
# by 2. (GPGPU-Sim simulates a warp (32 threads) in a single cycle). 1400/2 = 700
-gpgpu_clock_domains 700.0:700.0:700.0:924.0

# shader core pipeline config
-gpgpu_shader_registers 32768

# This implies a maximum of 48 warps/SM
-gpgpu_shader_core_pipeline 1536:32
-gpgpu_shader_cta 8
-gpgpu_simd_model 1

# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_pipeline_widths 2,1,1,2,1,1,2
-gpgpu_num_sp_units 2
-gpgpu_num_sfu_units 1

# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
-ptx_opcode_latency_int 4,13,4,5,145
-ptx_opcode_initiation_int 1,2,2,1,8
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_initiation_fp 1,2,1,1,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 8,16,8,8,130


# In Fermi, the cache and shared memory can be configured to 16kb:48kb(default) or 48kb:16kb
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# Note: Hashing set index function (H) only applies to a set size of 32 or 64.
-gpgpu_cache:dl1 32:128:4,L:L:m:N:H,A:32:8,8
-gpgpu_shmem_size 49152

# The alternative configuration for fermi in case cudaFuncCachePreferL1 is selected
#-gpgpu_cache:dl1 64:128:6,L:L:m:N:H,A:32:8,8
#-gpgpu_shmem_size 16384

# 64 sets, each 128 bytes, 8-way for each memory sub-partition. This gives 768KB of L2 cache
-gpgpu_cache:dl2 64:128:8,L:B:m:W:L,A:32:4,4:0,32
-gpgpu_cache:dl2_texture_only 0

-gpgpu_cache:il1 4:128:4,L:R:f:N:L,A:2:32,4
-gpgpu_tex_cache:l1 4:128:24,L:R:m:N:L,F:128:4,128:2
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4

# enable operand collector
-gpgpu_operand_collector_num_units_sp 6
-gpgpu_operand_collector_num_units_sfu 8
-gpgpu_operand_collector_num_in_ports_sp 2
-gpgpu_operand_collector_num_out_ports_sp 2
-gpgpu_num_reg_banks 16

# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1

-gpgpu_max_insn_issue_per_warp 1

# interconnection
-network_mode 1
-inter_config_file config_fermi_islip.icnt

# memory partition latency config
-rop_latency 120
-dram_latency 100

# dram model config
-gpgpu_dram_scheduler 1
# The DRAM return queue and the scheduler queue together should provide enough buffering
# to sustain the memory-level parallelism needed to tolerate DRAM latency.
# To allow 100% DRAM utilization, there should be at least enough buffer to cover
# the minimum DRAM latency (100 core cycles), i.e.
# Total buffer space required = 100 x 924MHz / 700MHz = 132
-gpgpu_frfcfs_dram_sched_queue_size 16
-gpgpu_dram_return_queue_size 116

# for Fermi, bus width is 384 bits; this is 8 bytes (4 bytes at each DRAM chip) per memory partition
-gpgpu_n_mem_per_ctrlr 2
-gpgpu_dram_buswidth 4
-gpgpu_dram_burst_length 8
-dram_data_command_freq_ratio 4 # GDDR5 is QDR
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS

# GDDR5 timing from hynix H5GQ1H24AFR
# to disable bank groups, set nbkgrp to 1 and tCCDL and tRTPL to 0
-gpgpu_dram_timing_opt "nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40:
CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2"

# Fermi has two schedulers per core
-gpgpu_num_sched_per_core 2
# Two Level Scheduler with active and pending pools
#-gpgpu_scheduler two_level_active:6:0:1
# Loose round robin scheduler
#-gpgpu_scheduler lrr
# Greedy then oldest scheduler
-gpgpu_scheduler gto

# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0

# power model configs
-power_simulation_enabled 1
-gpuwattch_xml_file gpuwattch_gtx480.xml

# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0
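As a reader aid (not part of the commit), the cache option strings above can be sanity-checked against their documented <nsets>:<bsize>:<assoc> prefix. The hypothetical Python helper below works out the 16 KB L1 data cache and the 768 KB aggregate L2 implied by the GTX480 values:

# Hypothetical helper (not GPGPU-Sim code): derive cache capacity in KiB
# from the <nsets>:<bsize>:<assoc> prefix of a -gpgpu_cache option string.
def cache_kib(option: str) -> float:
    nsets, bsize, assoc = (int(x) for x in option.split(",")[0].split(":"))
    return nsets * bsize * assoc / 1024

l1d = cache_kib("32:128:4,L:L:m:N:H,A:32:8,8")          # 16.0 KiB per SM
l2_sub = cache_kib("64:128:8,L:B:m:W:L,A:32:4,4:0,32")  # 64.0 KiB per sub-partition
l2_total = l2_sub * 6 * 2   # 6 channels x 2 sub-partitions = 768 KiB, as the L2 comment says
print(l1d, l2_sub, l2_total)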
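Similarly, the sizes chosen for the FR-FCFS scheduler queue and the DRAM return queue follow from the clock-domain conversion described in the comment; a minimal check of that arithmetic (again my own sketch, not simulator code):

# Quick check of the buffer-sizing comment (not simulator code): buffering
# needed to cover 100 core-clock cycles of DRAM latency, expressed in DRAM cycles.
core_clk_mhz = 700.0   # core/interconnect/L2 domain from -gpgpu_clock_domains
dram_clk_mhz = 924.0   # DRAM domain
min_dram_latency_core_cycles = 100

required = min_dram_latency_core_cycles * dram_clk_mhz / core_clk_mhz
print(round(required))  # 132 = 16 (sched queue) + 116 (return queue)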
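Finally, the DRAM bus parameters line up with the 384-bit comment; a small sketch of how that total splits across partitions and chips (assumed arithmetic, not simulator code):

# Sketch (my arithmetic, not simulator code): GTX480's 384-bit bus split across
# 6 memory partitions, each with 2 GDDR5 chips of 4-byte (32-bit) width.
total_bus_bits = 384
n_mem = 6                        # -gpgpu_n_mem
bytes_per_partition = total_bus_bits // 8 // n_mem          # 8 bytes per partition
chips_per_ctrlr = 2              # -gpgpu_n_mem_per_ctrlr
buswidth_per_chip = bytes_per_partition // chips_per_ctrlr  # 4, matching -gpgpu_dram_buswidth
print(bytes_per_partition, buswidth_per_chip)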
