# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
-gpgpu_ptx_force_max_capability 20


# SASS execution (only supported with CUDA >= 4.0)
-gpgpu_ptx_convert_to_ptxplus 0
-gpgpu_ptx_save_converted_ptxplus 0

# high level architecture configuration
-gpgpu_n_clusters 15
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 6
-gpgpu_n_sub_partition_per_mchannel 2

# Fermi clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
# In Fermi, each pipeline has 16 execution units, so the Core clock needs to be divided
# by 2. (GPGPU-Sim simulates a warp (32 threads) in a single cycle). 1400/2 = 700
-gpgpu_clock_domains 700.0:700.0:700.0:924.0
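# Note: the 924 MHz DRAM clock is the GDDR5 command clock; with the QDR
# data/command ratio of 4 set below, this corresponds to the GTX 480's
# 3696 MT/s effective memory data rate (924 x 4).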

# shader core pipeline config
-gpgpu_shader_registers 32768

# This implies a maximum of 48 warps/SM (1536 threads / 32 threads per warp)
-gpgpu_shader_core_pipeline 1536:32
-gpgpu_shader_cta 8
-gpgpu_simd_model 1
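# Occupancy sanity check: 32768 registers / 1536 threads ~ 21 registers per
# thread at full occupancy, with at most 8 concurrent CTAs per SM.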

# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_SFU,OC_EX_MEM,EX_WB
-gpgpu_pipeline_widths 2,1,1,2,1,1,2
-gpgpu_num_sp_units 2
-gpgpu_num_sfu_units 1
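# The SP-path widths here (ID_OC_SP=2, OC_EX_SP=2) line up with the two SP
# units, and the SFU-path width of 1 with the single SFU; EX_WB=2 lets both
# SP pipelines write back in the same cycle.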

# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
-ptx_opcode_latency_int 4,13,4,5,145
-ptx_opcode_initiation_int 1,2,2,1,8
-ptx_opcode_latency_fp 4,13,4,5,39
-ptx_opcode_initiation_fp 1,2,1,1,4
-ptx_opcode_latency_dp 8,19,8,8,330
-ptx_opcode_initiation_dp 8,16,8,8,130
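# The initiation interval is how many cycles must pass before the unit accepts
# another instruction of that type: a single-precision MAD can start every
# cycle, while a double-precision MAD can start only every 8 cycles, matching
# the GTX 480's FP64 rate of 1/8 the FP32 rate.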


# In Fermi, the L1 data cache and shared memory can be configured as 16KB:48KB (default) or 48KB:16KB
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo
# Note: Hashing set index function (H) only applies to caches with 32 or 64 sets.
-gpgpu_cache:dl1 32:128:4,L:L:m:N:H,A:32:8,8
-gpgpu_shmem_size 49152
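# Default split: 32 sets x 128-byte lines x 4-way = 16KB of L1 data cache,
# paired with 49152 bytes (48KB) of shared memory per SM.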

# Alternative configuration for Fermi when cudaFuncCachePreferL1 is selected
#-gpgpu_cache:dl1 64:128:6,L:L:m:N:H,A:32:8,8
#-gpgpu_shmem_size 16384
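# That variant gives 64 x 128 x 6 = 48KB of L1 data cache with 16384 bytes
# (16KB) of shared memory, i.e. the 48KB:16KB split.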

# 64 sets, 128-byte lines, 8-way associative per memory sub-partition. This gives 768KB of L2 cache in total.
-gpgpu_cache:dl2 64:128:8,L:B:m:W:L,A:32:4,4:0,32
-gpgpu_cache:dl2_texture_only 0
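# L2 sizing: 64 sets x 128B x 8 ways = 64KB per sub-partition; 6 memory
# channels x 2 sub-partitions = 12 sub-partitions, so 12 x 64KB = 768KB total.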

-gpgpu_cache:il1 4:128:4,L:R:f:N:L,A:2:32,4
-gpgpu_tex_cache:l1 4:128:24,L:R:m:N:L,F:128:4,128:2
-gpgpu_const_cache:l1 64:64:2,L:R:f:N:L,A:2:32,4
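# Capacities implied by the <nsets>:<bsize>:<assoc> fields: instruction cache
# 4 x 128 x 4 = 2KB, texture L1 4 x 128 x 24 = 12KB, constant L1
# 64 x 64 x 2 = 8KB, each per core.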

# enable operand collector
-gpgpu_operand_collector_num_units_sp 6
-gpgpu_operand_collector_num_units_sfu 8
-gpgpu_operand_collector_num_in_ports_sp 2
-gpgpu_operand_collector_num_out_ports_sp 2
-gpgpu_num_reg_banks 16
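# With 16 banks over a 32768-entry register file, each bank holds 2048
# registers; a warp's operands are spread across banks so the collector units
# can fetch sources in parallel instead of serializing on one bank.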

# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
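# 32 banks matches Fermi's shared memory organization; warp_parts=1 means the
# whole 32-thread warp is checked for bank conflicts as one access group
# (rather than as two half-warps as on earlier architectures).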

-gpgpu_max_insn_issue_per_warp 1

# interconnection
-network_mode 1
-inter_config_file config_fermi_islip.icnt

# memory partition latency config
-rop_latency 120
-dram_latency 100

# dram model config
-gpgpu_dram_scheduler 1
# The DRAM return queue and the scheduler queue together should provide enough
# buffering to sustain the memory-level parallelism needed to tolerate DRAM latency.
# To allow 100% DRAM utilization, there should be at least enough buffer entries to
# cover the minimum DRAM latency (100 core cycles), i.e.
# total buffer space required = 100 x 924MHz / 700MHz = 132
-gpgpu_frfcfs_dram_sched_queue_size 16
-gpgpu_dram_return_queue_size 116
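# The two queues configured here add up to that estimate:
# 16 scheduler-queue entries + 116 return-queue entries = 132.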

# For Fermi, the total bus width is 384 bits, i.e. 8 bytes per memory partition
# (4 bytes at each of the two DRAM chips on that partition)
-gpgpu_n_mem_per_ctrlr 2
-gpgpu_dram_buswidth 4
-gpgpu_dram_burst_length 8
-dram_data_command_freq_ratio 4 # GDDR5 is QDR
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.BBBCCCCB.CCSSSSSS
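# Assuming the access granularity is buswidth x burst length x chips per
# controller, one burst moves 4B x 8 x 2 = 64 bytes per channel, i.e. half of
# a 128-byte L2 line per DRAM access.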

# GDDR5 timing from Hynix H5GQ1H24AFR
# to disable bank groups, set nbkgrp to 1 and tCCDL and tRTPL to 0
-gpgpu_dram_timing_opt "nbk=16:CCD=2:RRD=6:RCD=12:RAS=28:RP=12:RC=40:
                        CL=12:WL=4:CDLR=5:WR=12:nbkgrp=4:CCDL=3:RTPL=2"
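# These timings are counted in DRAM-clock (924 MHz) cycles; the 16 banks form
# nbkgrp=4 bank groups of 4 banks each, and column accesses within the same
# bank group use the longer CCDL=3 spacing instead of CCD=2.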

# Fermi has two schedulers per core
-gpgpu_num_sched_per_core 2
# Two-level scheduler with active and pending pools
#-gpgpu_scheduler two_level_active:6:0:1
# Loose round robin scheduler
#-gpgpu_scheduler lrr
# Greedy-then-oldest scheduler
-gpgpu_scheduler gto
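# With 2 schedulers and up to 48 resident warps, each scheduler arbitrates
# among at most 24 warps. gto (greedy-then-oldest) keeps issuing from the same
# warp until it stalls, then falls back to the oldest ready warp.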

# stat collection
-gpgpu_memlatency_stat 14
-gpgpu_runtime_stat 500
-enable_ptx_file_line_stats 1
-visualizer_enabled 0

# power model configs
-power_simulation_enabled 1
-gpuwattch_xml_file gpuwattch_gtx480.xml

# tracing functionality
#-trace_enabled 1
#-trace_components WARP_SCHEDULER,SCOREBOARD
#-trace_sampling_core 0