Commit 51a2667

Merge pull request #21 from accel-sim/dev

Updating release-accelwattch with latest dev

2 parents fdc1a6a + a10de9e

23 files changed: +1212 additions, -388 deletions

CHANGES

Lines changed: 12 additions & 0 deletions
@@ -1,4 +1,16 @@
 LOG:
+Version 4.1.0 versus 4.0.0
+-Features:
+1- Supporting L1 write-allocate with sub-sector writing policy as in Volta+ hardware, and changing the Volta+ cards config to make L1 write-allocate with write-through
+2- Making the L1 adaptive cache policy to be configurable
+3- Adding Ampere RTX 3060 config files
+-Bugs:
+1- Fixing L1 bank hash function bug
+2- Fixing L1 read hit counters in gpgpu-sim to match nvprof, to achieve more accurate L1 correlation with the HW
+3- Fixing bugs in lazy write handling, thanks to Gwendolyn Voskuilen from Sandia labs for this fix
+4- Fixing the backend pipeline for sub_core model
+5- Fixing Memory stomp bug at the shader_config
+6- Some code refactoring:
 Version 4.0.0 (development branch) versus 3.2.3
 -Front-End:
 1- Support .nc cache modifier and __ldg function to access the read-only L1D cache
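
The "configurable adaptive cache policy" feature corresponds to the new knobs that appear in the Volta config diffs further down. A minimal sketch of the resulting block, with values copied from the updated QV100/TITANV configs below; the comments are a best-effort reading added here for illustration and are not part of the shipped files:

-gpgpu_adaptive_cache_config 1
# shared-memory carveout sizes (KB) the adaptive policy may pick per kernel
-gpgpu_shmem_option 0,8,16,32,64,96
# total unified L1D + shared-memory capacity per SM, in KB
-gpgpu_unified_l1d_size 128
# presumably the percentage of L1 reserved for writes under the new
# write-allocate scheme
-gpgpu_l1_cache_write_ratio 25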

README.md

Lines changed: 7 additions & 2 deletions
@@ -11,22 +11,26 @@ This version of GPGPU-Sim has been tested with a subset of CUDA version 4.2,
 Please see the copyright notice in the file COPYRIGHT distributed with this
 release in the same directory as this file.
 
+GPGPU-Sim 4.0 is compatible with Accel-Sim simulation framework. With the support
+of Accel-Sim, GPGPU-Sim 4.0 can run NVIDIA SASS traces (trace-based simulation)
+generated by NVIDIA's dynamic binary instrumentation tool (NVBit). For more information
+about Accel-Sim, see [https://accel-sim.github.io/](https://accel-sim.github.io/)
+
 If you use GPGPU-Sim 4.0 in your research, please cite:
 
 Mahmoud Khairy, Zhesheng Shen, Tor M. Aamodt, Timothy G Rogers.
 Accel-Sim: An Extensible Simulation Framework for Validated GPU Modeling.
 In proceedings of the 47th IEEE/ACM International Symposium on Computer Architecture (ISCA),
 May 29 - June 3, 2020.
 
-If you use CuDNN or PyTorch support, checkpointing or our new debugging tool for functional
+If you use CuDNN or PyTorch support (execution-driven simulation), checkpointing or our new debugging tool for functional
 simulation errors in GPGPU-Sim for your research, please cite:
 
 Jonathan Lew, Deval Shah, Suchita Pati, Shaylin Cattell, Mengchi Zhang, Amruth Sandhupatla,
 Christopher Ng, Negar Goli, Matthew D. Sinclair, Timothy G. Rogers, Tor M. Aamodt
 Analyzing Machine Learning Workloads Using a Detailed GPU Simulator, arXiv:1811.08933,
 https://arxiv.org/abs/1811.08933
 
-
 If you use the Tensor Core model in GPGPU-Sim or GPGPU-Sim's CUTLASS Library
 for your research please cite:
 
@@ -261,6 +265,7 @@ To clean the docs run
 The documentation resides at doc/doxygen/html.
 
 To run Pytorch applications with the simulator, install the modified Pytorch library as well by following instructions [here](https://github.com/gpgpu-sim/pytorch-gpgpu-sim).
+
 ## Step 3: Run
 
 Before we run, we need to make sure the application's executable file is dynamically linked to CUDA runtime library. This can be done during compilation of your program by introducing the nvcc flag "--cudart shared" in makefile (quotes should be excluded).
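
As a concrete sketch of that last point, a makefile might add the flag like this (target and file names here are hypothetical; only the --cudart shared flag comes from the README):

# link against the shared CUDA runtime so GPGPU-Sim's libcudart.so
# can be substituted at run time via LD_LIBRARY_PATH
NVCC_FLAGS += --cudart shared

app: app.cu
	nvcc $(NVCC_FLAGS) -o app app.cu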

configs/tested-cfgs/SM75_RTX2060/gpgpusim.config

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@
 
 # warp scheduling
 -gpgpu_num_sched_per_core 4
--gpgpu_scheduler gto
+-gpgpu_scheduler lrr
 # a warp scheduler issue mode
 -gpgpu_max_insn_issue_per_warp 1
 -gpgpu_dual_issue_diff_exec_units 1

configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@
 # Turing has four schedulers per core
 -gpgpu_num_sched_per_core 4
 # Greedy then oldest scheduler
--gpgpu_scheduler gto
+-gpgpu_scheduler lrr
 ## In Turing, a warp scheduler can issue 1 inst per cycle
 -gpgpu_max_insn_issue_per_warp 1
 -gpgpu_dual_issue_diff_exec_units 1
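
For context on the swap made in the two Turing configs above (and repeated in the Volta configs below), a sketch of the option with the two values involved; the descriptions paraphrase the scheduler names used in GPGPU-Sim:

# gto: greedy-then-oldest - keep issuing from the same warp until it
#      stalls, then fall back to the oldest ready warp
# lrr: loose round-robin across all ready warps
-gpgpu_scheduler lrr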

configs/tested-cfgs/SM7_QV100/gpgpusim.config

Lines changed: 13 additions & 9 deletions
@@ -125,12 +125,12 @@
 -gpgpu_shmem_num_banks 32
 -gpgpu_shmem_limited_broadcast 0
 -gpgpu_shmem_warp_parts 1
--gpgpu_coalesce_arch 60
+-gpgpu_coalesce_arch 70
 
 # Volta has four schedulers per core
 -gpgpu_num_sched_per_core 4
 # Greedy then oldest scheduler
--gpgpu_scheduler gto
+-gpgpu_scheduler lrr
 ## In Volta, a warp scheduler can issue 1 inst per cycle
 -gpgpu_max_insn_issue_per_warp 1
 -gpgpu_dual_issue_diff_exec_units 1
@@ -144,17 +144,21 @@
 # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
 # disable this mode in case of multi kernels/apps execution
 -gpgpu_adaptive_cache_config 1
-# Volta unified cache has four banks
+-gpgpu_shmem_option 0,8,16,32,64,96
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
 -gpgpu_l1_banks 4
--gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32
+-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_l1_cache_write_ratio 25
+-gpgpu_l1_latency 20
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+# shared memory configuration
 -gpgpu_shmem_size 98304
 -gpgpu_shmem_sizeDefault 98304
 -gpgpu_shmem_per_block 65536
--gpgpu_gmem_skip_L1D 0
--gpgpu_n_cluster_ejection_buffer_size 32
--gpgpu_l1_latency 20
 -gpgpu_smem_latency 20
--gpgpu_flush_l1_cache 1
 
 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache
 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
@@ -229,4 +233,4 @@
 # tracing functionality
 #-trace_enabled 1
 #-trace_components WARP_SCHEDULER,SCOREBOARD
-#-trace_sampling_core 0
+#-trace_sampling_core 0
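
Decoding the new dl1 string, assuming the field layout of GPGPU-Sim's cache-config grammar (<sector?>:<sets>:<line size>:<assoc>, <replacement>:<write policy>:<alloc>:<write-alloc>:<set index>, <mshr type>:<entries>:<max merge>, <miss-queue>:<result-fifo>, <data-port width>); this is a best-effort reading, not an authoritative one:

# S:4:128:64   sectored cache, 4 sets x 64 ways x 128 B lines = 32 KB baseline
# L:T:m:L:L    LRU replacement, write-through, allocate-on-miss,
#              lazy-fetch-on-read write-allocate, linear set indexing
# A:512:8      associative MSHRs, 512 entries, up to 8 merged requests each
# 16:0,32      16-entry miss queue, no result FIFO, 32 B data port
-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32

The write-through plus lazy-fetch-on-read pairing matches the CHANGES entry about L1 write-allocate with sub-sector writing on Volta+ hardware.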

configs/tested-cfgs/SM7_TITANV/gpgpusim.config

Lines changed: 11 additions & 7 deletions
@@ -100,7 +100,7 @@
 # Volta has four schedulers per core
 -gpgpu_num_sched_per_core 4
 # Greedy then oldest scheduler
--gpgpu_scheduler gto
+-gpgpu_scheduler lrr
 ## In Volta, a warp scheduler can issue 1 inst per cycle
 -gpgpu_max_insn_issue_per_warp 1
 -gpgpu_dual_issue_diff_exec_units 1
@@ -114,17 +114,21 @@
 # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
 # disable this mode in case of multi kernels/apps execution
 -gpgpu_adaptive_cache_config 1
-# Volta unified cache has four banks
+-gpgpu_shmem_option 0,8,16,32,64,96
+-gpgpu_unified_l1d_size 128
+# L1 cache configuration
 -gpgpu_l1_banks 4
--gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32
+-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32
+-gpgpu_l1_cache_write_ratio 25
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_l1_latency 20
+-gpgpu_flush_l1_cache 1
+-gpgpu_n_cluster_ejection_buffer_size 32
+# shared memory configuration
 -gpgpu_shmem_size 98304
 -gpgpu_shmem_sizeDefault 98304
 -gpgpu_shmem_per_block 65536
--gpgpu_gmem_skip_L1D 0
--gpgpu_n_cluster_ejection_buffer_size 32
--gpgpu_l1_latency 20
 -gpgpu_smem_latency 20
--gpgpu_flush_l1_cache 1
 
 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 4.5MB L2 cache
 -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 40;
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 78;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+
+// Flow control
+
+num_vcs = 1;
+vc_buf_size = 256;
+input_buffer_size = 256;
+ejection_buffer_size = 256;
+boundary_buffer_size = 256;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters = 1;
+
+credit_delay = 0;
+routing_delay = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup = 1;
+output_speedup = 1;
+internal_speedup = 2.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
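
This is a BookSim-style interconnect file of the kind a gpgpusim.config selects when the detailed network model is enabled. A minimal sketch of the hookup, assuming the standard GPGPU-Sim options and a hypothetical file name:

# use the BookSim network model instead of the ideal interconnect
-network_mode 1
# point it at the fly-topology file above (file name is illustrative)
-inter_config_file config_fly_islip.icnt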
