From 0e9ec61d2c3fa48595b691585b9fdd081940a28e Mon Sep 17 00:00:00 2001
From: lxrzlyr <1289539524@qq.com>
Date: Tue, 7 May 2024 17:10:42 +0800
Subject: [PATCH] Update md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Modified:  README.md
Renamed:   document/Decumentation.md -> document/Documentation.md
New file:  document/Quick_Start.md
Deleted:   include/dawn/algorithm/gpu/sgemv.cuh
Deleted:   include/dawn/algorithm/gpu/spgemm.cuh
Deleted:   include/dawn/algorithm/gpu/spmv.cuh
New file:  test/Performance.md
New file:  test/Test_Guide.md
Deleted:   test/performance.md
New file:  tool/process.sh
Deleted:   tool/process_all.sh
Modified:  .gitattributes
---
 .gitattributes                             |   5 -
 README.md                                  | 169 +------
 .../{Decumentation.md => Documentation.md} |   0
 document/Quick_Start.md                    |  92 ++++
 include/dawn/algorithm/gpu/sgemv.cuh       | 170 -------
 include/dawn/algorithm/gpu/spgemm.cuh      | 414 ------------------
 include/dawn/algorithm/gpu/spmv.cuh        | 413 -----------------
 test/Performance.md                        |  68 +++
 test/Test_Guide.md                         |  21 +
 test/performance.md                        |  68 ---
 tool/process.sh                            |  52 +++
 tool/process_all.sh                        |   0
 12 files changed, 255 insertions(+), 1217 deletions(-)
 rename document/{Decumentation.md => Documentation.md} (100%)
 create mode 100644 document/Quick_Start.md
 delete mode 100644 include/dawn/algorithm/gpu/sgemv.cuh
 delete mode 100644 include/dawn/algorithm/gpu/spgemm.cuh
 delete mode 100644 include/dawn/algorithm/gpu/spmv.cuh
 create mode 100644 test/Performance.md
 create mode 100644 test/Test_Guide.md
 delete mode 100644 test/performance.md
 create mode 100755 tool/process.sh
 delete mode 100755 tool/process_all.sh

diff --git a/.gitattributes b/.gitattributes
index bdc2268..43c7816 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,9 +1,4 @@
-*.c filter=copyright
-*.cc filter=copyright
-*.h filter=copyright
 *.cpp filter=copyright
 *.hxx filter=copyright
 *.cu filter=copyright
 *.cuh filter=copyright
-*.md filter=copyrightW
-*.sh filter=copyright
\ No newline at end of file
diff --git a/README.md b/README.md
index bdcc229..18f962e 100755
--- a/README.md
+++ b/README.md
@@ -1,154 +1,19 @@
 # DAWN: An Noval SSSP/APSP Algorithm, CUDA/C++

-DAWN is a novel shortest paths algorithm, which is suitable for weighted and unweighted graphs. DAWN requires $O(m)$ space and $O(S_{wcc} \cdot E_{wcc})$ times on the unweighted graphs, which can also process SSSP tasks and requires $O(E_{wcc}(i))$ time. $S_{wcc}$ and $E_{wcc}$ denote the number of nodes and edges included in the largest WCC (Weakly Connected Component) in the graphs.
+DAWN is a novel shortest-paths algorithm suitable for both weighted and unweighted graphs. In contrast to the prevalent optimizations of state-of-the-art BFS implementations, which commonly rely on priority queues, our approach leverages matrix operations to endow DAWN with enhanced parallelism. DAWN is capable of solving shortest-path problems on graphs with negative weights, and automatically excludes the influence of negative-weight cycles.

-DAWN is capable of solving the APSP and SSSP problems on graphs with negative weights, and can automatically exclude the influence of negative weight cycles.
+DAWN requires $O(m)$ space and $O(S_{wcc} \cdot E_{wcc})$ time on unweighted graphs; it can also process SSSP tasks, which require $O(E_{wcc}(i))$ time. $S_{wcc}$ and $E_{wcc}$ denote the number of nodes and edges included in the largest WCC (Weakly Connected Component) of the graph.

-The rapid closeness centrality algorithm based on DAWN has been implemented, while work on the betweenness centrality algorithm is still ongoing. We are very interested in developing a new BC algorithm based on DAWN, and we will not simply replicate the Brandes algorithm, which means that the BC algorithm will not appear in this library in the short term. We encourage colleagues to complete the implementation of the Brandes algorithm before the new algorithm is implemented.
+| [**Examples**](https://github.com/lxrzlyr/DAWN-An-Noval-SSSP-APSP-Algorithm/tree/dev/algorithm) | [**Documentation**](https://github.com/lxrzlyr/DAWN-An-Noval-SSSP-APSP-Algorithm/tree/dev/document) | [**Test**](https://github.com/lxrzlyr/DAWN-An-Noval-SSSP-APSP-Algorithm/tree/dev/test) |
+| ----------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------- |
+
+- Examples: Demonstrates the usage of the algorithms in DAWN.
+- Document: Provides a detailed description of DAWN, including **Quick_Start** and **Code_Guide**. **Quick_Start** is a guide for quickly getting started with DAWN. **Code_Guide** describes in detail how to implement your own algorithms in DAWN.
+- Test: Provides detailed information about testing.

-To add NEW functions within DAWN, please refer to the [document/Code_Guide](https://github.com/lxrzlyr/DAWN-An-Noval-SSSP-APSP-Algorithm/blob/dev/document/Code_Guide.md).

-## Quick Start Guide
+## Development Status

-### 0. Before getting started
-
-Depending on your GPU, you may also want to edit the CUDA_ARCHITECTURES variable in $PROJECT_ROOT/algorithm/gpu/CMakeLists.txt
-
-```c++
-export PROJECT_ROOT="to_your_project_path"
-```
-
-### 1. Modify $PROJECT_ROOT/algorithm/gpu/CMakeLists.txt
-
-According to your GPU, we use RTX 3080ti for computing, so CUDA_ARCHITECTURES is set to 86
-
-```c++
-set(CUDA_ARCHITECTURES "86")
-```
-
-If the machine does not have a GPU available, DAWN will automatically detect the CUDA environment and only build artifacts for the CPU. However, if you are certain that the machine has a usable GPU and you have been unable to build artifacts for the GPU correctly, we suspect there may be an issue with the CUDA environment. Please further check for any path-related problems.
-
-### 2.Download testing data
-
-Unzip the compressed package and put it in the directory you need
-
-The input data can be found on the Science Data Bank
-
-```c++
-URL=https://www.scidb.cn/s/6BjM3a
-GRAPH_DIR="to_your_graph_path"
-```
-
-### 3. Dependencies
-
-DAWN builds, runs, and has been tested on Ubuntu/Rocky Linux. Even though DAWN may work on other linux systems, we have not tested correctness or performance. DAWN is not available on Windows and cannot achieve the same performance on WSL systems. Please beware.
-
-At the minimum, DAWN depends on the following software:
-
-- A modern C++ compiler compliant with the C++ 14 standard
-- GCC (>= 9.4.0 or Clang >= 10.0.0)
-- CMake (>= 3.10)
-- libomp (>= 10.0)
-
-If you need run DAWN on the GPU, expand:
-
-- CUDA (>= 11.0)
-- thrust (>= 2.0)
-
-### 4. Build and RUN
-
-```c++
-cd $PROJECT_ROOT
-mkdir build && cd build
-cmake .. 
&& make -j -``` - -If compilation succeeds without errors, you can run your code as before, for example - -```c++ -cd $PROJECT_ROOT/build -./apsp_cpu SG $GRAPH_DIR/mouse_gene.mtx ../output.txt false 0 unweighted - -./apsp_cpu SG $GRAPH_DIR/cage10.mtx ../output.txt false 0 weighted - -``` - -When the version is built, it will also prepare SSSP and MSSP applications, which can be used directly. - -If you need to use DAWN in your own solution, please check the source code under the **src**,**include** folder and call them. - -#### Using script - -```c++ -cd .. -vim ./process.sh -MAIN = ${main} -GRAPH_DIR = ${test_graph} -OUTPUT= ${outputfile} -LOG_DIR= ${GRAPH_DIR}/log -ESC && wq -sudo chmod +x ../process.sh -bash ../process.sh -``` - -Please note that the normal operation of the batch script needs to ensure that the test machine meets the minimum requirements. Insufficient memory or GPU memory needs to be manually adjusted according to amount of resources. - -#### For general graphs - -```c++ -CPU: Multi-threaded processor supporting OpenMP API -RAM: 8GB or more -GPU: 1GB or more -Compiler: NVCC of CUDA 11.0 above -OS: Ubuntu 20.04 and above -``` - -#### For large-scale graphs - -```c++ -CPU: Multi-threaded processor supporting OpenMP API -RAM: 24GB or more -GPU: 4GB or more -Compiler: NVCC of CUDA 11.0 above -OS: Ubuntu 20.04 and above -``` - -### 5.Release version - -| Algorithm | Implementation | Weigthed | -| --------- | -------------- | -------- | -| APSP | GOVM | True | -| APSP | SOVM | False | -| MSSP | GOVM | True | -| MSSP | SOVM | False | -| SSSP | GOVM | True | -| BFS | SOVM | False | - -For the GPU version, please make sure that there is enough GPU memory for the graph. The size of the thread block and the scale of the graph is set reasonably according to the device parameters. - -```c++ -int device; -cudaDeviceProp props; -cudaGetDevice(&device); -cudaGetDeviceProperties(&props, device); -printf("Max shared memory per block: %ld\n", props.sharedMemPerBlock); -``` - -### 6.Performance - -We have presented a performance comparison of algorithms for DAWN, GAPBS, and Gunrock in a [table](https://github.com/lxrzlyr/DAWN-An-Noval-SSSP-APSP-Algorithm/blob/dev/test/performance.md). The benchmark tests were run on the Gunrock benchmark dataset and the Suite Sparse Collection dataset. The table provides specific information about the graphs and their corresponding runtime. - -We provide the file **check_unweighted.py** and **check_weighted.py**, based on networkx, which can be used to check the results printed by DAWN. - -We also provide the test code for Gunrock and GAPBS in the **test** directory. Due to differences in code build environments and other aspects among the repositories, it is not possible to pull and build them uniformly. If you need to verify the results of Gunrock and GAPBS, please visit the repositories for [Gunrock](https://github.com/gunrock/gunrock) and [GAPBS](https://github.com/sbeamer/gapbs) respectively, follow the repository build instructions, and replace the source files in the repository with the ones we provide. Alternatively, you can pull our modified fork branch and build directly([Gunrock](https://github.com/lxrzlyr/gunrock),[GAPBS](https://github.com/lxrzlyr/gapbs)). - -### 7.Documentation - -Please refer to [document/Documentation](https://github.com/lxrzlyr/DAWN-An-Noval-SSSP-APSP-Algorithm/blob/dev/document/Decumentation.md) for commands. - -## New version - -The version of DWAN on the weighted graph has been included in DAWN V2.1. 
Currently, DAWN includes the version that runs on unweighted graphs of int type index values, and the version that runs on negative weighted graphs of float type. (SOVM and GOVM have been the default implementation, if you want to use BOVM, please change the kernel function.)
+Currently, the rapid closeness centrality algorithm based on DAWN has been implemented, while work on the betweenness centrality algorithm is still ongoing. We are very interested in developing a new BC algorithm based on DAWN rather than simply replicating the Brandes algorithm, which means that a BC algorithm will not appear in this library in the short term. We encourage colleagues to complete an implementation of the Brandes algorithm before the new algorithm is ready.

 | Algorithm | Release |
 | ------------------- | ------- |
@@ -158,9 +23,6 @@ The version of DWAN on the weighted graph has been included in DAWN V2.1. Curren
 | BFS | V2.1 |
 | BC | Doing |
 | CC | V2.3 |
-| SPGEMM | Doing |
-| SPGEMV | Doing |
-| SPMV | Doing |
 | Cluster Analysis | Future |
 | Community Detection | Future |

@@ -170,6 +32,19 @@ We welcome any interest and ideas related to DAWN and its applications. If you a
 The DAWN component based on Gunrock may be released to the main/develop branch in the near future, so please stay tuned to the [Gunrock](https://github.com/gunrock/gunrock). We will release new features of DAWN and the application algorithms based on DAWN on this repository. If the algorithms are also needed by Gunrock, we will contribute them to the Gunrock repository later.

+## How to Cite DAWN
+Thank you for citing our work.
+
+```bibtex
+@InProceedings{Feng:2024:DAWN,
+  author = {Yelai Feng and Huaixi Wang and Yining Zhu and Xiandong Liu and Hongyi Lu and Qing Liu},
+  title = {DAWN: Matrix Operation-Optimized Algorithm for Shortest Paths Problem on Unweighted Graphs},
+  booktitle = {Proceedings of the 38th ACM International Conference on Supercomputing},
+  year = {2024},
+  doi = {10.1145/3650200.3656600}
+}
+```
+
 ## Copyright & License

 All source code are released under [Apache 2.0](https://github.com/lxrzlyr/DAWN-An-Noval-SSSP-APSP-Algorithm/blob/4266d98053678ce76e34be64477ac2364f0f4291/LICENSE).

diff --git a/document/Decumentation.md b/document/Documentation.md
similarity index 100%
rename from document/Decumentation.md
rename to document/Documentation.md
diff --git a/document/Quick_Start.md b/document/Quick_Start.md
new file mode 100644
index 0000000..b7fcf7b
--- /dev/null
+++ b/document/Quick_Start.md
@@ -0,0 +1,92 @@
+# Quick Start Guide
+
+## 0. Before getting started
+
+If the machine does not have a GPU available, DAWN will automatically detect the CUDA environment and only build artifacts for the CPU. Start by setting the project root:
+
+```bash
+export PROJECT_ROOT="to_your_project_path"
+```
+
+## 1. Modify $PROJECT_ROOT/algorithm/gpu/CMakeLists.txt
+
+If you want to use the GPU version of DAWN, you need to modify the following code in $PROJECT_ROOT/algorithm/gpu/CMakeLists.txt according to your GPU. We use an RTX 3080 Ti for computing, so CUDA_ARCHITECTURES is set to 86.
+
+```cmake
+set(CUDA_ARCHITECTURES "86")
+```
+
+If you do not know the right CUDA_ARCHITECTURES value, we have implemented code to select it automatically. However, the automatic choice may not be optimal.
+
+If you are certain that the machine has a usable GPU and you have been unable to build artifacts for the GPU correctly, we suspect there may be an issue with the CUDA environment. Please further check for any path-related problems.
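+As a rough illustration (this is not part of the DAWN build, and it assumes a driver new enough that nvidia-smi reports compute_cap), you can look up the right value for your own GPU from the shell:
+
+```bash
+# Query the compute capability of the first GPU (e.g. "8.6") and strip the
+# dot to obtain the CUDA_ARCHITECTURES value (e.g. "86").
+ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n1 | tr -d '.')
+echo "set(CUDA_ARCHITECTURES \"${ARCH}\")"
+```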
+## 2. Download testing data
+
+Unzip the compressed package and put it in the directory you need. The input data can be found on the Science Data Bank.
+
+```bash
+URL=https://www.scidb.cn/s/6BjM3a
+GRAPH_DIR="to_your_graph_path"
+```
+
+## 3. Dependencies
+
+DAWN builds, runs, and has been tested on Ubuntu/Rocky Linux. Even though DAWN may work on other Linux systems, we have not tested their correctness or performance. DAWN is not available on Windows and cannot achieve the same performance on WSL systems. Please beware.
+
+At a minimum, DAWN depends on the following software:
+
+- A modern C++ compiler compliant with the C++ 14 standard
+- GCC (>= 9.4.0) or Clang (>= 10.0.0)
+- CMake (>= 3.10)
+- libomp (>= 10.0)
+
+If you need to run DAWN on the GPU, you will also need:
+
+- CUDA (>= 11.0)
+- thrust (>= 2.0)
+
+## 4. Build and Run
+
+```bash
+cd $PROJECT_ROOT
+mkdir build && cd build
+cmake .. && make -j
+```
+
+If compilation succeeds without errors, you can run your code as before, for example:
+
+```bash
+cd $PROJECT_ROOT/build
+./sssp_cpu $GRAPH_DIR/XX.mtx ../output.txt false 0
+./sssp_gpu $GRAPH_DIR/XXX.mtx ../output.txt 1024 false 0
+```
+
+If you need to use DAWN in your own solution, please check the source code under the **src** and **include** folders and call it.
+
+### 4.1 Using the script
+
+```bash
+cd ..
+vim ./process.sh
+# Set the following variables in the script, then save and quit (ESC, :wq):
+#   MAIN="${main}"
+#   GRAPH_DIR="${test_graph}"
+#   OUTPUT="${outputfile}"
+#   LOG_DIR="${GRAPH_DIR}/log"
+sudo chmod +x ../process.sh
+bash ../process.sh
+```
+
+Please note that the batch script can only run normally if the test machine meets the minimum requirements below. If memory or GPU memory is insufficient, the parameters need to be adjusted manually according to the available resources. For the GPU version, please make sure that there is enough GPU memory for the graph.
+
+```
+For general graphs:
+CPU: Multi-threaded processor supporting the OpenMP API
+RAM: 8GB or more
+GPU: 1GB or more
+Compiler: NVCC of CUDA 11.0 or above
+OS: Ubuntu 20.04 or above
+```
diff --git a/include/dawn/algorithm/gpu/sgemv.cuh b/include/dawn/algorithm/gpu/sgemv.cuh
deleted file mode 100644
index 426a9f8..0000000
--- a/include/dawn/algorithm/gpu/sgemv.cuh
+++ /dev/null
@@ -1,170 +0,0 @@
-/**
- * @author lxrzlyr (1289539524@qq.com)
- * @date 2024-04-21
- *
- * @copyright Copyright (c) 2024
- */
-// It is copy from https://github.com/Liu-xiandong/How_to_optimize_in_GPU.git,
-// will be revised to adapt the repository in time.
-#include
-#include
-#include
-
-// CUDA runtime
-#include
-#include
-#include
-
-// cal offset from row col and ld , in row-major matrix, ld is the width of the
-// matrix
-#define OFFSET(row, col, ld) ((row) * (ld) + (col))
-
-// transfer float4
-#define FETCH_FLOAT4(pointer) (reinterpret_cast(&(pointer))[0])
-
-#define checkCudaErrors(func) \
-  { \
-    cudaError_t e = (func); \
-    if (e != cudaSuccess) \
-      printf("%s %d CUDA: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
-  }
-
-template
-__device__ __forceinline__ float warpReduceSum(float sum) {
-  if (WarpSize >= 32)
-    sum += __shfl_down_sync(0xffffffff, sum, 16);  // 0-16, 1-17, 2-18, etc.
-  if (WarpSize >= 16)
-    sum += __shfl_down_sync(0xffffffff, sum, 8);  // 0-8, 1-9, 2-10, etc.
-  if (WarpSize >= 8)
-    sum += __shfl_down_sync(0xffffffff, sum, 4);  // 0-4, 1-5, 2-6, etc.
-  if (WarpSize >= 4)
-    sum += __shfl_down_sync(0xffffffff, sum, 2);  // 0-2, 1-3, 4-6, 5-7, etc.
-  if (WarpSize >= 2)
-    sum += __shfl_down_sync(0xffffffff, sum, 1);  // 0-1, 2-3, 4-5, etc.
- return sum; -} - -// if N <= 16 -template -__global__ void Sgemv_v2(float* __restrict__ A, - float* __restrict__ x, - float* __restrict__ y, - const int M, - const int N) { - // Block index - int bx = blockIdx.x; - - // Thread index - int tx = threadIdx.x; - int ty = threadIdx.y; - - const int warp_size = 32; - int laneId = tx % warp_size; - int current_warp_row = (blockDim.y * bx + ty) * ROW_PER_WARP; - const int kWarp_size = warp_size / ROW_PER_WARP; - int kLaneId = laneId % kWarp_size; - int current_thread_row = current_warp_row + laneId / kWarp_size; - - if (current_thread_row < M) { - float res = 0; - int current_col = kLaneId; - res += A[current_thread_row * N + current_col] * x[current_col]; - res = warpReduceSum(res); - if (kLaneId == 0) - y[current_thread_row] = res; - } -} - -int main(int argc, char** argv) { - if (argc != 3) { - printf("usage: ./main [M] [N]\n"); - exit(0); - } - size_t M = atoi(argv[1]); - size_t N = atoi(argv[2]); - - size_t bytes_A = sizeof(float) * M * N; - size_t bytes_x = sizeof(float) * N; - size_t bytes_y = sizeof(float) * M; - float* h_A = (float*)malloc(bytes_A); - float* h_x = (float*)malloc(bytes_x); - float* h_y = (float*)malloc(bytes_y); - float* h_y1 = (float*)malloc(bytes_y); - - float* d_A; - float* d_x; - float* d_y; - - checkCudaErrors(cudaMalloc(&d_A, bytes_A)); - checkCudaErrors(cudaMalloc(&d_x, bytes_x)); - checkCudaErrors(cudaMalloc(&d_y, bytes_y)); - - const int WARP_SIZE = 32; - const int ROW_PER_WARP = 2; - const int THREAD_PER_BLOCK = 128; - const int WARP_PER_BLOCK = THREAD_PER_BLOCK / WARP_SIZE; - const int ROW_PER_BLOCK = WARP_PER_BLOCK * ROW_PER_WARP; - - // 生成A的数据 - for (int i = 0; i < M * N; i++) { - h_A[i] = (float)i / N; - } - - // 生成x的数据 - for (int i = 0; i < N; i++) { - h_x[i] = 1; - } - memset(h_y, 0, M * sizeof(float)); - memset(h_y1, 0, M * sizeof(float)); - - int nIter = 1000; - checkCudaErrors(cudaMemcpy(d_A, h_A, bytes_A, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_x, h_x, bytes_x, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_y, h_y, bytes_y, cudaMemcpyHostToDevice)); - for (int run = 0; run < nIter; run++) { - dim3 dimGrid(M / ROW_PER_BLOCK); - dim3 dimBlock(32, THREAD_PER_BLOCK / WARP_SIZE); - Sgemv_v2<<>>(d_A, d_x, d_y, M, N); - } - checkCudaErrors(cudaMemcpy(h_y, d_y, bytes_y, cudaMemcpyDeviceToHost)); - - // cublas - cublasHandle_t blas_handle; - cublasCreate(&blas_handle); - float alpha = 1.0; - float beta = 0; - checkCudaErrors(cudaMemcpy(d_y, h_y1, bytes_y, cudaMemcpyHostToDevice)); - for (int run = 0; run < nIter; run++) { - cublasSgemv(blas_handle, CUBLAS_OP_T, N, M, &alpha, d_A, N, d_x, 1, &beta, - d_y, 1); - } - checkCudaErrors(cudaMemcpy(h_y1, d_y, bytes_y, cudaMemcpyDeviceToHost)); - cublasDestroy(blas_handle); - - double eps = 1.e-6; // machine zero - bool correct = true; - for (int i = 0; i < M; i++) { - double abs_err = fabs(h_y[i] - h_y1[i]); - double dot_length = M; - double abs_val = fabs(h_y[i]); - double rel_err = abs_err / abs_val / dot_length; - if (rel_err > eps) { - printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, - h_y[i], h_y1[i], eps); - correct = false; - break; - } - } - - printf("%s\n", correct ? 
"Result= PASS" : "Result= FAIL"); - - // Free Memory - cudaFree(d_A); - cudaFree(d_x); - cudaFree(d_y); - - free(h_A); - free(h_x); - free(h_y); - free(h_y1); -} \ No newline at end of file diff --git a/include/dawn/algorithm/gpu/spgemm.cuh b/include/dawn/algorithm/gpu/spgemm.cuh deleted file mode 100644 index 7861b46..0000000 --- a/include/dawn/algorithm/gpu/spgemm.cuh +++ /dev/null @@ -1,414 +0,0 @@ -/** - * @author lxrzlyr (1289539524@qq.com) - * @date 2024-04-21 - * - * @copyright Copyright (c) 2024 - */ -// optimize sgemm - -// It is copy from https://github.com/Liu-xiandong/How_to_optimize_in_GPU.git, -// will be revised to adapt the repository in time. -#include -// CUDA runtime -#include -#include - -// cal offset from row col and ld , in row-major matrix, ld is the width of the -// matrix -#define OFFSET(row, col, ld) ((row) * (ld) + (col)) - -// transfer float4 -#define FETCH_FLOAT4(pointer) (reinterpret_cast(&(pointer))[0]) - -#define checkCudaErrors(func) \ - { \ - cudaError_t e = (func); \ - if (e != cudaSuccess) \ - printf("%s %d CUDA: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ - } - -// K: ldA -// N: ldB -template < - const int BLOCK_SIZE_M, // height of block of C that each thread block - // calculate - const int BLOCK_SIZE_K, // width of block of A that each thread block load - // into shared memory - const int BLOCK_SIZE_N, // width of block of C that each thread block - // calculate - const int THREAD_SIZE_Y, // height of block of C that each thread calculate - const int THREAD_SIZE_X, // width of block of C that each thread calculate - const bool ENABLE_DOUBLE_BUFFER // whether enable double buffering or not - > -__global__ void Sgemm(float* __restrict__ A, - float* __restrict__ B, - float* __restrict__ C, - const int M, - const int N, - const int K) { - // Block index - int bx = blockIdx.x; - int by = blockIdx.y; - - // Thread index - int tx = threadIdx.x; - int ty = threadIdx.y; - - // the threads number in Block of X,Y - const int THREAD_X_PER_BLOCK = BLOCK_SIZE_N / THREAD_SIZE_X; - const int THREAD_Y_PER_BLOCK = BLOCK_SIZE_M / THREAD_SIZE_Y; - const int THREAD_NUM_PER_BLOCK = THREAD_X_PER_BLOCK * THREAD_Y_PER_BLOCK; - - // thread id in cur Block - const int tid = ty * THREAD_X_PER_BLOCK + tx; - - // shared memory - __shared__ float As[2][BLOCK_SIZE_K][BLOCK_SIZE_M]; - __shared__ float Bs[2][BLOCK_SIZE_K][BLOCK_SIZE_N]; - // registers for C - float accum[THREAD_SIZE_Y][THREAD_SIZE_X]; -#pragma unroll - for (int i = 0; i < THREAD_SIZE_Y; i++) { -#pragma unroll - for (int j = 0; j < THREAD_SIZE_X; j++) { - accum[i][j] = 0.0; - } - } - // registers for A and B - float frag_a[2][THREAD_SIZE_Y]; - float frag_b[2][THREAD_SIZE_X]; - // registers load global memory - const int ldg_num_a = - BLOCK_SIZE_M * BLOCK_SIZE_K / (THREAD_NUM_PER_BLOCK * 4); - const int ldg_num_b = - BLOCK_SIZE_K * BLOCK_SIZE_N / (THREAD_NUM_PER_BLOCK * 4); - float ldg_a_reg[4 * ldg_num_a]; - float ldg_b_reg[4 * ldg_num_b]; - - // threads number in one row - const int A_TILE_THREAD_PER_ROW = BLOCK_SIZE_K / 4; - const int B_TILE_THREAD_PER_ROW = BLOCK_SIZE_N / 4; - - // row number and col number that needs to be loaded by this thread - const int A_TILE_ROW_START = tid / A_TILE_THREAD_PER_ROW; - const int B_TILE_ROW_START = tid / B_TILE_THREAD_PER_ROW; - - const int A_TILE_COL = tid % A_TILE_THREAD_PER_ROW * 4; - const int B_TILE_COL = tid % B_TILE_THREAD_PER_ROW * 4; - - // row stride that thread uses to load multiple rows of a tile - const int A_TILE_ROW_STRIDE = THREAD_NUM_PER_BLOCK / 
A_TILE_THREAD_PER_ROW; - const int B_TILE_ROW_STRIDE = THREAD_NUM_PER_BLOCK / B_TILE_THREAD_PER_ROW; - - A = &A[(BLOCK_SIZE_M * by) * K]; - B = &B[BLOCK_SIZE_N * bx]; - - // load index of the tile - const int warp_id = tid / 32; - const int lane_id = tid % 32; - const int a_tile_index = - warp_id / 2 * 16 + - lane_id / 8 * 4; // warp_id * 8 + (lane_id / 16)*4; // (warp_id/4)*32 + - // ((lane_id%16)/2)*4; - const int b_tile_index = - warp_id % 2 * 32 + - lane_id % 8 * 4; //(lane_id % 16) * 4; // (warp_id%4)*16 + (lane_id/16)*8 - //+ (lane_id%2)*4; - -// transfer first tile from global mem to shared mem -// load A from global memory to shared memory -#pragma unroll - for (int i = 0; i < BLOCK_SIZE_M; i += A_TILE_ROW_STRIDE) { - int ldg_index = i / A_TILE_ROW_STRIDE * 4; - FETCH_FLOAT4(ldg_a_reg[ldg_index]) = - FETCH_FLOAT4(A[OFFSET(A_TILE_ROW_START + i, // row - A_TILE_COL, // col - K)]); - As[0][A_TILE_COL][A_TILE_ROW_START + i] = ldg_a_reg[ldg_index]; - As[0][A_TILE_COL + 1][A_TILE_ROW_START + i] = ldg_a_reg[ldg_index + 1]; - As[0][A_TILE_COL + 2][A_TILE_ROW_START + i] = ldg_a_reg[ldg_index + 2]; - As[0][A_TILE_COL + 3][A_TILE_ROW_START + i] = ldg_a_reg[ldg_index + 3]; - } -// load B from global memory to shared memory -#pragma unroll - for (int i = 0; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) { - FETCH_FLOAT4(Bs[0][B_TILE_ROW_START + i][B_TILE_COL]) = - FETCH_FLOAT4(B[OFFSET(B_TILE_ROW_START + i, // row - B_TILE_COL, // col - N)]); - } - __syncthreads(); - - // load A from shared memory to register - FETCH_FLOAT4(frag_a[0][0]) = FETCH_FLOAT4(As[0][0][a_tile_index]); - FETCH_FLOAT4(frag_a[0][4]) = FETCH_FLOAT4(As[0][0][a_tile_index + 64]); - - // load B from shared memory to register - FETCH_FLOAT4(frag_b[0][0]) = FETCH_FLOAT4(Bs[0][0][b_tile_index]); - FETCH_FLOAT4(frag_b[0][4]) = FETCH_FLOAT4(Bs[0][0][b_tile_index + 64]); - - int write_stage_idx = 1; - int tile_idx = 0; - do { - // next tile index - tile_idx += BLOCK_SIZE_K; - // load next tile from global mem - if (tile_idx < K) { -#pragma unroll - for (int i = 0; i < BLOCK_SIZE_M; i += A_TILE_ROW_STRIDE) { - int ldg_index = i / A_TILE_ROW_STRIDE * 4; - FETCH_FLOAT4(ldg_a_reg[ldg_index]) = - FETCH_FLOAT4(A[OFFSET(A_TILE_ROW_START + i, // row - A_TILE_COL + tile_idx, // col - K)]); - } -#pragma unroll - for (int i = 0; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) { - int ldg_index = i / B_TILE_ROW_STRIDE * 4; - FETCH_FLOAT4(ldg_b_reg[ldg_index]) = - FETCH_FLOAT4(B[OFFSET(tile_idx + B_TILE_ROW_START + i, // row - B_TILE_COL, // col - N)]); - } - } - - int load_stage_idx = write_stage_idx ^ 1; - -#pragma unroll - for (int j = 0; j < BLOCK_SIZE_K - 1; ++j) { - // load next tile from shared mem to register - // load A from shared memory to register - FETCH_FLOAT4(frag_a[(j + 1) % 2][0]) = - FETCH_FLOAT4(As[load_stage_idx][(j + 1)][a_tile_index]); - FETCH_FLOAT4(frag_a[(j + 1) % 2][4]) = - FETCH_FLOAT4(As[load_stage_idx][(j + 1)][a_tile_index + 64]); - // load B from shared memory to register - FETCH_FLOAT4(frag_b[(j + 1) % 2][0]) = - FETCH_FLOAT4(Bs[load_stage_idx][(j + 1)][b_tile_index]); - FETCH_FLOAT4(frag_b[(j + 1) % 2][4]) = - FETCH_FLOAT4(Bs[load_stage_idx][(j + 1)][b_tile_index + 64]); -// compute C THREAD_SIZE_X x THREAD_SIZE_Y -#pragma unroll - for (int thread_y = 0; thread_y < THREAD_SIZE_Y; ++thread_y) { -#pragma unroll - for (int thread_x = 0; thread_x < THREAD_SIZE_X; ++thread_x) { - accum[thread_y][thread_x] += - frag_a[j % 2][thread_y] * frag_b[j % 2][thread_x]; - } - } - } - - if (tile_idx < K) { -// load A from global memory to 
shared memory -#pragma unroll - for (int i = 0; i < BLOCK_SIZE_M; i += A_TILE_ROW_STRIDE) { - int ldg_index = i / A_TILE_ROW_STRIDE * 4; - As[write_stage_idx][A_TILE_COL][A_TILE_ROW_START + i] = - ldg_a_reg[ldg_index]; - As[write_stage_idx][A_TILE_COL + 1][A_TILE_ROW_START + i] = - ldg_a_reg[ldg_index + 1]; - As[write_stage_idx][A_TILE_COL + 2][A_TILE_ROW_START + i] = - ldg_a_reg[ldg_index + 2]; - As[write_stage_idx][A_TILE_COL + 3][A_TILE_ROW_START + i] = - ldg_a_reg[ldg_index + 3]; - } -// load B from global memory to shared memory -#pragma unroll - for (int i = 0; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) { - int ldg_index = i / B_TILE_ROW_STRIDE * 4; - FETCH_FLOAT4(Bs[write_stage_idx][B_TILE_ROW_START + i][B_TILE_COL]) = - FETCH_FLOAT4(ldg_b_reg[ldg_index]); - } - // use double buffer, only need one sync - __syncthreads(); - // switch - write_stage_idx ^= 1; - } - - // load first tile from shared mem to register of next iter - // load A from shared memory to register - FETCH_FLOAT4(frag_a[0][0]) = - FETCH_FLOAT4(As[load_stage_idx ^ 1][0][a_tile_index]); - FETCH_FLOAT4(frag_a[0][4]) = - FETCH_FLOAT4(As[load_stage_idx ^ 1][0][a_tile_index + 64]); - // load B from shared memory to register - FETCH_FLOAT4(frag_b[0][0]) = - FETCH_FLOAT4(Bs[load_stage_idx ^ 1][0][b_tile_index]); - FETCH_FLOAT4(frag_b[0][4]) = - FETCH_FLOAT4(Bs[load_stage_idx ^ 1][0][b_tile_index + 64]); -// compute C THREAD_SIZE_X x THREAD_SIZE_Y -#pragma unroll - for (int thread_y = 0; thread_y < THREAD_SIZE_Y; ++thread_y) { -#pragma unroll - for (int thread_x = 0; thread_x < THREAD_SIZE_X; ++thread_x) { - accum[thread_y][thread_x] += frag_a[1][thread_y] * frag_b[1][thread_x]; - } - } - } while (tile_idx < K); - - const int c_block_row = a_tile_index; - const int c_block_col = b_tile_index; - - // store C00 block - for (int i = 0; i < 4; i++) { - FETCH_FLOAT4(C[OFFSET(BLOCK_SIZE_M * by + c_block_row + i, - BLOCK_SIZE_N * bx + c_block_col, N)]) = - FETCH_FLOAT4(accum[i][0]); - } - // store C01 block - for (int i = 0; i < 4; i++) { - FETCH_FLOAT4(C[OFFSET(BLOCK_SIZE_M * by + c_block_row + i, - BLOCK_SIZE_N * bx + c_block_col + 64, N)]) = - FETCH_FLOAT4(accum[i][4]); - } - // store C10 block - for (int i = 0; i < 4; i++) { - FETCH_FLOAT4(C[OFFSET(BLOCK_SIZE_M * by + c_block_row + 64 + i, - BLOCK_SIZE_N * bx + c_block_col, N)]) = - FETCH_FLOAT4(accum[i + 4][0]); - } - // store C11 block - for (int i = 0; i < 4; i++) { - FETCH_FLOAT4(C[OFFSET(BLOCK_SIZE_M * by + c_block_row + 64 + i, - BLOCK_SIZE_N * bx + c_block_col + 64, N)]) = - FETCH_FLOAT4(accum[i + 4][4]); - } -} - -int main(int argc, char** argv) { - if (argc != 4) { - printf("usage: ./main [M] [K] [N]\n"); - exit(0); - } - size_t M = atoi(argv[1]); - size_t K = atoi(argv[2]); - size_t N = atoi(argv[3]); - - assert(M % 8 == 0); - assert(N % 8 == 0); - assert(K % 8 == 0); - - size_t bytes_A = sizeof(float) * M * K; - size_t bytes_B = sizeof(float) * K * N; - size_t bytes_C = sizeof(float) * M * N; - float* h_A = (float*)malloc(bytes_A); - float* h_B = (float*)malloc(bytes_B); - float* h_C = (float*)malloc(bytes_C); - float* h_C1 = (float*)malloc(bytes_C); - - float* d_A; - float* d_B; - float* d_C; - - checkCudaErrors(cudaMalloc(&d_A, bytes_A)); - checkCudaErrors(cudaMalloc(&d_B, bytes_B)); - checkCudaErrors(cudaMalloc(&d_C, bytes_C)); - double msecPerMatrixMul[2] = {0, 0}; - double gigaFlops[2] = {0, 0}; - double flopsPerMatrixMul = 2.0 * M * N * K; - - // don't edit it - const int BLOCK_SIZE_M = 128; - const int BLOCK_SIZE_K = 8; - const int BLOCK_SIZE_N = 128; - const 
int THREAD_SIZE_X = 8; - const int THREAD_SIZE_Y = 8; - const bool ENABLE_DOUBLE_BUFFER = false; - - // 生成A的数据 - for (int i = 0; i < M * K; i++) { - h_A[i] = i / 13; - } - - // 生成B的数据 - for (int i = 0; i < K * N; i++) { - h_B[i] = i % 13; - } - - checkCudaErrors(cudaMemcpy(d_A, h_A, bytes_A, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_B, h_B, bytes_B, cudaMemcpyHostToDevice)); - - cudaEvent_t start, stop; - checkCudaErrors(cudaEventCreate(&start)); - checkCudaErrors(cudaEventCreate(&stop)); - float msecTotal = 0; - int nIter = 1000; - - checkCudaErrors(cudaMemcpy(d_C, h_C, bytes_C, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaEventRecord(start)); - for (int run = 0; run < nIter; run++) { - dim3 dimBlock(BLOCK_SIZE_N / THREAD_SIZE_X, BLOCK_SIZE_M / THREAD_SIZE_Y); - dim3 dimGrid(N / BLOCK_SIZE_N, M / BLOCK_SIZE_M); - Sgemm - <<>>(d_A, d_B, d_C, M, N, K); - } - checkCudaErrors(cudaEventRecord(stop)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); - - checkCudaErrors(cudaMemcpy(h_C, d_C, bytes_C, cudaMemcpyDeviceToHost)); - - msecPerMatrixMul[0] = msecTotal / nIter; - gigaFlops[0] = - (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul[0] / 1000.0f); - printf( - "My gemm Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,\n", - gigaFlops[0], msecPerMatrixMul[0], flopsPerMatrixMul); - - // cublas - - cublasHandle_t blas_handle; - cublasCreate(&blas_handle); - float alpha = 1.0; - float beta = 0; - checkCudaErrors(cudaMemcpy(d_C, h_C, bytes_C, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaEventRecord(start)); - for (int run = 0; run < nIter; run++) { - cublasSgemm(blas_handle, CUBLAS_OP_T, CUBLAS_OP_T, M, N, K, &alpha, d_A, K, - d_B, N, &beta, d_C, N); - } - checkCudaErrors(cudaEventRecord(stop)); - checkCudaErrors(cudaEventSynchronize(stop)); - checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); - - checkCudaErrors(cudaMemcpy(h_C1, d_C, bytes_C, cudaMemcpyDeviceToHost)); - - msecPerMatrixMul[1] = msecTotal / nIter; - gigaFlops[1] = - (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul[1] / 1000.0f); - printf("CuBlas Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,\n", - gigaFlops[1], msecPerMatrixMul[1], flopsPerMatrixMul); - - cublasDestroy(blas_handle); - - double eps = 1.e-6; // machine zero - bool correct = true; - for (int i = 0; i < M * N; i++) { - int row = i / N; - int col = i % N; - double abs_err = fabs(h_C[i] - h_C1[col * M + row]); - double dot_length = M; - double abs_val = fabs(h_C[i]); - double rel_err = abs_err / abs_val / dot_length; - if (rel_err > eps) { - printf("Error! Matrix[%d][%d]=%.8f, ref=%.8f error term is > %E\n", row, - col, h_C[i], h_C1[col * M + row], eps); - correct = false; - break; - } - } - - printf("%s\n", correct ? "Result= PASS" : "Result= FAIL"); - printf("ratio= %f\n", gigaFlops[0] / gigaFlops[1]); - - // Free Memory - cudaFree(d_A); - cudaFree(d_B); - cudaFree(d_C); - - free(h_A); - free(h_B); - free(h_C); - free(h_C1); -} \ No newline at end of file diff --git a/include/dawn/algorithm/gpu/spmv.cuh b/include/dawn/algorithm/gpu/spmv.cuh deleted file mode 100644 index 3707419..0000000 --- a/include/dawn/algorithm/gpu/spmv.cuh +++ /dev/null @@ -1,413 +0,0 @@ -/** - * @author lxrzlyr (1289539524@qq.com) - * @date 2024-04-21 - * - * @copyright Copyright (c) 2024 - */ -// It is copy from https://github.com/Liu-xiandong/How_to_optimize_in_GPU.git, -// will be revised to adapt the repository in time. 
-#include -#include -#include "device_launch_parameters.h" -#include -#include -#include -#include - -using namespace std; - -#define checkCudaErrors(func) \ - { \ - cudaError_t e = (func); \ - if (e != cudaSuccess) \ - printf("%s %d CUDA: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ - } - -#define CHECK_CUDA(func) \ - { \ - cudaError_t status = (func); \ - if (status != cudaSuccess) { \ - printf("CUDA API failed at line %d with error: %s (%d)\n", __LINE__, \ - cudaGetErrorString(status), status); \ - return EXIT_FAILURE; \ - } \ - } - -#define CHECK_CUSPARSE(func) \ - { \ - cusparseStatus_t status = (func); \ - if (status != CUSPARSE_STATUS_SUCCESS) { \ - printf("CUSPARSE API failed at line %d with error: %s (%d)\n", __LINE__, \ - cusparseGetErrorString(status), status); \ - return EXIT_FAILURE; \ - } \ - } - -void add(int a, int b, float c, int* h, int* e, int* ne, float* w, int& idx) { - e[idx] = b, w[idx] = c, ne[idx] = h[a], h[a] = idx++; -} - -void readVerEdges(int& is_weighted, int& n, int& t, int& m, std::string& file) { - std::ifstream input; - input.open("matrix/" + file + ".mtx"); - - while (input.peek() == '%') - input.ignore(2048, '\n'); - - input >> n >> t >> m; - - std::string str; - input.ignore(); - getline(input, str); - int cnt = 0; - for (auto c : str) { - if (c == ' ') { - cnt++; - } - } - if (cnt == 1) { - is_weighted = 0; - } else if (cnt == 2) { - is_weighted = 1; - } else { - std::cout << "error! you need to get right mtx input\n"; - exit(0); - } - input.close(); -} - -void readMtxFile(int is_weighted, - int n, - int m, - int* row_offset, - int* col_index, - float* val, - std::string& file) { - ifstream input; - input.open("matrix/" + file + ".mtx"); - - while (input.peek() == '%') - input.ignore(2048, '\n'); - - int t; - input >> n >> t >> m; - int* h = (int*)malloc((n + 10) * sizeof(int)); - memset(h, -1, sizeof(int) * (n + 10)); - int* e = (int*)malloc((m + 10) * sizeof(int)); - int* ne = (int*)malloc((m + 10) * sizeof(int)); - float* w = (float*)malloc((m + 10) * sizeof(float)); - int idx = 0; - - int a, b; - double c; - srand((int)time(0)); - if (is_weighted == 0) { - while (input >> a >> b) { - a--; - b--; - c = a % 13; - float tc = static_cast(c); - add(a, b, tc, h, e, ne, w, idx); - } - } else if (is_weighted == 1) { - while (input >> a >> b >> c) { - a--; - b--; - float tc = static_cast(c); - add(a, b, tc, h, e, ne, w, idx); - } - } else { - std::cout << "error! you need to get right mtx input\n"; - exit(0); - } - - row_offset[0] = 0; - int nnz_num = 0; - - for (int i = 0; i < n; i++) { - int count = 0; - for (int j = h[i]; j != -1; j = ne[j]) { - count++; - int nextNode = e[j]; - float nextWeight = w[j]; - col_index[nnz_num] = nextNode; - val[nnz_num] = nextWeight; - nnz_num++; - } - row_offset[i + 1] = row_offset[i] + count; - } - - input.close(); - free(h); - free(e); - free(ne); - free(w); -} - -template -__device__ __forceinline__ float warpReduceSum(float sum) { - if (WarpSize >= 32) - sum += __shfl_down_sync(0xffffffff, sum, 16); // 0-16, 1-17, 2-18, etc. - if (WarpSize >= 16) - sum += __shfl_down_sync(0xffffffff, sum, 8); // 0-8, 1-9, 2-10, etc. - if (WarpSize >= 8) - sum += __shfl_down_sync(0xffffffff, sum, 4); // 0-4, 1-5, 2-6, etc. - if (WarpSize >= 4) - sum += __shfl_down_sync(0xffffffff, sum, 2); // 0-2, 1-3, 4-6, 5-7, etc. - if (WarpSize >= 2) - sum += __shfl_down_sync(0xffffffff, sum, 1); // 0-1, 2-3, 4-5, etc. 
- return sum; -} - -template -__global__ void My_spmv_csr_kernel(const IndexType row_num, - const IndexType* A_row_offset, - const IndexType* A_col_index, - const ValueType* A_value, - const ValueType* x, - ValueType* y) { - const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR; - const IndexType thread_id = - THREADS_PER_BLOCK * blockIdx.x + threadIdx.x; // global thread index - const IndexType thread_lane = - threadIdx.x & (THREADS_PER_VECTOR - 1); // thread index within the vector - const IndexType row_id = - thread_id / THREADS_PER_VECTOR; // global vector index - - if (row_id < row_num) { - const IndexType row_start = - A_row_offset[row_id]; // same as: row_start = Ap[row]; - const IndexType row_end = A_row_offset[row_id + 1]; - - // initialize local sum - ValueType sum = 0; - - // accumulate local sums - for (IndexType jj = row_start + thread_lane; jj < row_end; - jj += THREADS_PER_VECTOR) - sum += A_value[jj] * x[A_col_index[jj]]; - - sum = warpReduceSum(sum); - if (thread_lane == 0) { - y[row_id] = sum; - } - } -} - -template -void vec_print(vector array) { - for (auto x : array) { - cout << x << " "; - } - cout << std::endl; -} - -template -void spmv_cpu_kernel(vector& row_offset, - vector& col_index, - vector& value, - vector& x, - vector& y, - IndexType row_num) { - for (int i = 0; i < row_num; i++) { - ValueType res = 0; - IndexType num = row_offset[i + 1] - row_offset[i]; - for (int j = 0; j < num; j++) { - IndexType index = row_offset[i] + j; - res += value[index] * x[col_index[index]]; - } - y[i] = res; - } -} - -int main(int argc, char** argv) { - if (argc != 3) { - printf("usage: ./spmv -f [matrix]\n"); - exit(0); - } - string file; - for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "-f") == 0) { - file = argv[i + 1]; - } - } - - // read mtx file and convert to csr - int is_weighted = -1; - int row_num; - int col_num; - int nnz_num; - readVerEdges(is_weighted, row_num, col_num, nnz_num, file); - vector row_offset(row_num + 1); - vector col_index(nnz_num); - vector value(nnz_num); - vector x(col_num, 1.0); - vector y(row_num); - vector y_res(row_num); - vector y_cusparse_res(row_num); - int iter = 2000; - readMtxFile(is_weighted, row_num, nnz_num, row_offset.data(), - col_index.data(), value.data(), file); - - // check input - // std::cout<<" The row_offset is: "<(row_offset); - // std::cout<<" The col_index is: "<(col_index); - // std::cout<<" The value is: "<(value); - - // allocate memory in GPU device - int* d_A_row_offset; - int* d_A_col_index; - float* d_A_value; - float* d_x; - float* d_y; - float* d_y_cusparse; - - checkCudaErrors(cudaMalloc(&d_A_row_offset, (row_num + 1) * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_A_col_index, nnz_num * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_A_value, nnz_num * sizeof(float))); - checkCudaErrors(cudaMalloc(&d_x, col_num * sizeof(float))); - checkCudaErrors(cudaMalloc(&d_y, row_num * sizeof(float))); - checkCudaErrors(cudaMalloc(&d_y_cusparse, row_num * sizeof(float))); - checkCudaErrors(cudaMemcpy(d_A_row_offset, row_offset.data(), - (row_num + 1) * sizeof(int), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_A_col_index, col_index.data(), - nnz_num * sizeof(int), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_A_value, value.data(), nnz_num * sizeof(float), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_x, x.data(), col_num * sizeof(float), - cudaMemcpyHostToDevice)); - - // spmv - // 32 thread for a row - int mean_col_num = (nnz_num + (row_num - 1)) / row_num; 
- std::cout << "The average col num is: " << mean_col_num << std::endl; - - // const int THREADS_PER_VECTOR = 32; - // const unsigned int VECTORS_PER_BLOCK = 256 / THREADS_PER_VECTOR; - // const unsigned int THREADS_PER_BLOCK = VECTORS_PER_BLOCK * - // THREADS_PER_VECTOR; const unsigned int NUM_BLOCKS = static_cast((row_num + (VECTORS_PER_BLOCK - 1)) / VECTORS_PER_BLOCK); - // My_spmv_csr_kernel - // <<>> - // (row_num, d_A_row_offset, d_A_col_index, d_A_value, d_x, d_y); - - for (int i = 0; i < iter; i++) { - if (mean_col_num <= 2) { - const int THREADS_PER_VECTOR = 2; - const unsigned int VECTORS_PER_BLOCK = 128; - const unsigned int NUM_BLOCKS = static_cast( - (row_num + (VECTORS_PER_BLOCK - 1)) / VECTORS_PER_BLOCK); - My_spmv_csr_kernel - <<>>(row_num, d_A_row_offset, d_A_col_index, - d_A_value, d_x, d_y); - } else if (mean_col_num > 2 && mean_col_num <= 4) { - const int THREADS_PER_VECTOR = 4; - const unsigned int VECTORS_PER_BLOCK = 64; - const unsigned int NUM_BLOCKS = static_cast( - (row_num + (VECTORS_PER_BLOCK - 1)) / VECTORS_PER_BLOCK); - My_spmv_csr_kernel - <<>>(row_num, d_A_row_offset, d_A_col_index, - d_A_value, d_x, d_y); - } else if (mean_col_num > 4 && mean_col_num <= 8) { - const int THREADS_PER_VECTOR = 8; - const unsigned int VECTORS_PER_BLOCK = 32; - const unsigned int NUM_BLOCKS = static_cast( - (row_num + (VECTORS_PER_BLOCK - 1)) / VECTORS_PER_BLOCK); - My_spmv_csr_kernel - <<>>(row_num, d_A_row_offset, d_A_col_index, - d_A_value, d_x, d_y); - } else if (mean_col_num > 8 && mean_col_num <= 16) { - const int THREADS_PER_VECTOR = 16; - const unsigned int VECTORS_PER_BLOCK = 16; - const unsigned int NUM_BLOCKS = static_cast( - (row_num + (VECTORS_PER_BLOCK - 1)) / VECTORS_PER_BLOCK); - My_spmv_csr_kernel - <<>>(row_num, d_A_row_offset, d_A_col_index, - d_A_value, d_x, d_y); - } else if (mean_col_num > 16) { - const int THREADS_PER_VECTOR = 32; - const unsigned int VECTORS_PER_BLOCK = 8; - const unsigned int NUM_BLOCKS = static_cast( - (row_num + (VECTORS_PER_BLOCK - 1)) / VECTORS_PER_BLOCK); - My_spmv_csr_kernel - <<>>(row_num, d_A_row_offset, d_A_col_index, - d_A_value, d_x, d_y); - } - } - checkCudaErrors(cudaMemcpy(y.data(), d_y, row_num * sizeof(float), - cudaMemcpyDeviceToHost)); - - // cusparse spmv - //-------------------------------------------------------------------------- - // CUSPARSE APIs - float alpha = 1.0f; - float beta = 0.0f; - - cusparseHandle_t handle = NULL; - cusparseSpMatDescr_t matA; - cusparseDnVecDescr_t vecX, vecY; - void* dBuffer = NULL; - size_t bufferSize = 0; - CHECK_CUSPARSE(cusparseCreate(&handle)) - // Create sparse matrix A in CSR format - CHECK_CUSPARSE(cusparseCreateCsr(&matA, row_num, col_num, nnz_num, - d_A_row_offset, d_A_col_index, d_A_value, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)) - // Create dense vector X - CHECK_CUSPARSE(cusparseCreateDnVec(&vecX, col_num, d_x, CUDA_R_32F)) - // Create dense vector y - CHECK_CUSPARSE(cusparseCreateDnVec(&vecY, row_num, d_y_cusparse, CUDA_R_32F)) - // allocate an external buffer if needed - CHECK_CUSPARSE(cusparseSpMV_bufferSize( - handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, - CUDA_R_32F, CUSPARSE_MV_ALG_DEFAULT, &bufferSize)) - CHECK_CUDA(cudaMalloc(&dBuffer, bufferSize)) - - // execute SpMV - for (int i = 0; i < iter; i++) { - CHECK_CUSPARSE(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, - CUSPARSE_MV_ALG_DEFAULT, dBuffer)) - } - - // destroy matrix/vector 
descriptors - CHECK_CUSPARSE(cusparseDestroySpMat(matA)) - CHECK_CUSPARSE(cusparseDestroyDnVec(vecX)) - CHECK_CUSPARSE(cusparseDestroyDnVec(vecY)) - CHECK_CUSPARSE(cusparseDestroy(handle)) - //-------------------------------------------------------------------------- - // device result check - CHECK_CUDA(cudaMemcpy(y_cusparse_res.data(), d_y_cusparse, - row_num * sizeof(float), cudaMemcpyDeviceToHost)) - - bool check_result = true; - for (int i = 0; i < row_num; i++) { - if (fabs(y[i] - y_cusparse_res[i]) > 1e-3) { - std::cout << "The result is error!" << std::endl; - printf("The row is: %d the y is:%f and the cusparse_y is:%f\n", i, y[i], - y_cusparse_res[i]); - check_result = false; - break; - } - } - if (check_result) { - std::cout << "The result is right!" << std::endl; - } - - // Free Memory - cudaFree(d_A_row_offset); - cudaFree(d_A_col_index); - cudaFree(d_A_value); - cudaFree(d_x); - cudaFree(d_y); - - return 0; -} \ No newline at end of file diff --git a/test/Performance.md b/test/Performance.md new file mode 100644 index 0000000..0df65e9 --- /dev/null +++ b/test/Performance.md @@ -0,0 +1,68 @@ +| Number | graph | DAWN(20) | DAWN | Gunrock | GAP | Nodes | Edges | +| -----: | :---------------- | ---------: | ------: | ---------: | ---------: | ----------: | ----------: | +| 1 | crankseg_1 | 0.4329 | 1.9271 | 2.6063 | 1.6238 | 52,804 | 5,333,507 | +| 2 | loc-Brightkite | 0.0476 | 0.2038 | 0.3325 | 0.1170 | 58,228 | 214,078 | +| 3 | p2p-Gnutella31 | 0.0537 | 0.0554 | 0.1565 | 0.1034 | 62,586 | 147,892 | +| 4 | crankseg_2 | 0.5845 | 2.2151 | 3.0943 | 2.2400 | 63,838 | 7,106,348 | +| 5 | nd24k | 1.3253 | 0.9969 | 2.5757 | 3.0087 | 72,000 | 14,393,817 | +| 6 | m_t1 | 0.4056 | 1.0409 | 1.7550 | 1.6848 | 97,578 | 4,925,574 | +| 7 | x104 | 0.4186 | 1.9346 | 2.1477 | 1.4713 | 108,384 | 5,138,004 | +| 8 | Ge99H100 | 0.4398 | 0.5809 | 1.6101 | 0.5165 | 112,985 | 4,282,190 | +| 9 | Ga10As10H30 | 0.3185 | 0.5182 | 1.6472 | 0.8665 | 113,081 | 3,114,357 | +| 10 | torso1 | 0.6779 | 3.1728 | 8.3669 | 1.6456 | 116,158 | 8,516,500 | +| 11 | boneS01 | 0.3197 | 0.636 | 1.2072 | 1.5120 | 127,224 | 3,421,188 | +| 12 | Ga19As19H42 | 0.4815 | 0.6732 | 1.9866 | 0.7224 | 133,123 | 4,508,981 | +| 13 | bmw7st_1 | 0.3491 | 1.441 | 2.0788 | 0.8944 | 141,347 | 3,740,507 | +| 14 | bmwcra_1 | 0.4915 | 1.7262 | 2.2999 | 1.8295 | 148,770 | 5,396,386 | +| 15 | pkustk14 | 0.6529 | 1.8374 | 2.7029 | 2.0295 | 151,926 | 7,494,215 | +| 16 | SiO2 | 0.6265 | 0.9721 | 3.4822 | 2.3220 | 155,331 | 5,719,417 | +| 17 | wave | 0.1836 | 0.2697 | 0.7051 | 1.3731 | 156,317 | 1,059,331 | +| 18 | xenon2 | 0.3860 | 0.5752 | 1.3700 | 0.6304 | 157,464 | 3,866,688 | +| 19 | PR02R | 0.8503 | 2.8204 | 3.7410 | 1.6571 | 161,070 | 8,185,136 | +| 20 | mono_500Hz | 0.7519 | 0.7299 | 1.2950 | 2.2294 | 169,410 | 5,036,288 | +| 21 | Si41Ge41H72 | 0.9252 | 0.8908 | 2.7422 | 1.3664 | 185,639 | 7,598,452 | +| 22 | fullb | 0.5588 | 1.1828 | 1.8003 | 1.5714 | 199,187 | 5,953,632 | +| 23 | com-DBLP | 0.6767 | 0.2183 | 0.5305 | 0.4102 | 317,080 | 2,099,732 | +| 24 | web-BerkStan | 1.0788 | 16.3167 | 12.3961 | 2.5108 | 685,230 | 7,600,595 | +| 25 | rgg_n_2_20_s0 | 5.3429 | 2.7489 | 50.6211 | 6.3703 | 1,048,576 | 13,783,240 | +| 26 | roadNet-PA | 0.8937 | 1.5831 | 46.2673 | 3.3110 | 1,090,920 | 3,083,796 | +| 27 | rgg_n_2_21_s0 | 15.9903 | 2.6134 | 70.1951 | 18.9606 | 2,097,152 | 28,975,990 | +| 28 | rgg_n_2_22_s0 | 39.7581 | 1.5922 | 101.4020 | 48.8593 | 4,194,304 | 60,718,396 | +| 29 | kmer_P1a | 1,172.8234 | 1.2926 | 109.0780 | 1,176.1875 | 139,353,211 | 
595,659,968 | +| 30 | ak2010 | 0.0300 | 0.4593 | 0.7305 | 0.3957 | 45,292 | 108,549 | +| 31 | asia_osm | 10.9470 | 0.1233 | 2,790.2900 | 166.5654 | 11,950,757 | 12,711,603 | +| 32 | belgium_osm | 1.3884 | 0.9094 | 114.3420 | 6.2594 | 1,441,295 | 1,549,970 | +| 33 | caidaRouterLevel | 0.2296 | 0.3539 | 0.3719 | 1.8770 | 192,244 | 609,066 | +| 34 | coAuthorsCiteseer | 0.2801 | 0.3435 | 0.4572 | 0.8020 | 227,320 | 814,134 | +| 35 | coAuthorsDBLP | 0.4938 | 0.1872 | 0.3666 | 0.3507 | 299,067 | 977,676 | +| 36 | coPapersCiteseer | 3.1737 | 0.9108 | 3.2352 | 2.4913 | 434,102 | 16,036,720 | +| 37 | coPapersDBLP | 4.1394 | 0.8439 | 3.6637 | 2.4649 | 540,486 | 15,245,729 | +| 38 | delaunay_n10 | 0.0007 | 0.0616 | 0.2454 | 0.0469 | 1,024 | 3,056 | +| 39 | delaunay_n11 | 0.0014 | 0.0825 | 0.3064 | 0.1864 | 2,048 | 6,127 | +| 40 | delaunay_n12 | 0.0030 | 0.1182 | 0.4273 | 0.6764 | 4,096 | 12,264 | +| 41 | delaunay_n13 | 0.0068 | 0.1678 | 0.5871 | 0.3086 | 8,192 | 24,547 | +| 42 | delaunay_n14 | 0.0141 | 0.2271 | 0.8151 | 0.2320 | 16,384 | 49,122 | +| 43 | delaunay_n15 | 0.0280 | 0.3073 | 1.0599 | 0.4040 | 32,768 | 98,274 | +| 44 | delaunay_n16 | 0.0570 | 0.423 | 1.5167 | 0.3864 | 65,536 | 196,575 | +| 45 | delaunay_n17 | 0.1121 | 0.5943 | 2.0615 | 0.6685 | 131,072 | 393,176 | +| 46 | delaunay_n18 | 0.2376 | 1.0268 | 3.0184 | 1.4653 | 262,144 | 786,396 | +| 47 | delaunay_n19 | 0.4945 | 1.9673 | 23.2473 | 2.2740 | 524,288 | 1,572,823 | +| 48 | delaunay_n20 | 1.0880 | 2.2982 | 32.5912 | 2.4806 | 1,048,576 | 3,145,686 | +| 49 | delaunay_n21 | 2.8250 | 2.1149 | 45.4094 | 6.2044 | 2,097,152 | 6,291,408 | +| 50 | delaunay_n22 | 7.5536 | 2.1781 | 65.0226 | 16.0525 | 4,194,304 | 12,582,869 | +| 51 | delaunay_n23 | 19.7583 | 2.4131 | 93.4047 | 26.6763 | 8,388,608 | 25,165,784 | +| 52 | delaunay_n24 | 49.9102 | 2.4384 | 137.2600 | 107.3878 | 16,777,216 | 50,331,601 | +| 53 | europe_osm | 155.7345 | 0.3709 | 1,534.5400 | 270.3253 | 50,912,018 | 54,054,660 | +| 54 | germany_osm | 26.3836 | 0.0987 | 374.1470 | 52.3477 | 11,548,845 | 12,369,181 | +| 55 | great-britain_osm | 13.2017 | 0.2877 | 537.9020 | 43.1726 | 7,733,822 | 8,156,517 | +| 56 | indochina-2004 | 12.7614 | 3.5667 | 30.3925 | 53.9166 | 7,414,866 | 194,109,311 | +| 57 | italy_osm | 7.9301 | 0.3223 | 678.8460 | 43.5775 | 6,686,493 | 7,013,978 | +| 58 | luxembourg_osm | 0.0625 | 2.7094 | 12.7140 | 2.5926 | 114,599 | 119,666 | +| 59 | netherlands_osm | 1.7650 | 0.4556 | 140.5030 | 11.0192 | 2,216,688 | 2,441,238 | +| 60 | road_central | 58.5574 | 1.4313 | 330.8790 | 60.6113 | 1,971,281 | 2,766,607 | +| 61 | roadNet-CA | 1.9172 | 1.0457 | 52.3170 | 7.6407 | 14,081,816 | 16,933,413 | +| 62 | road_usa | 58.0327 | 0.7557 | 499.7490 | 227.5694 | 23,947,347 | 28,854,312 | +| 63 | uk-2002 | 12.2523 | 3.2653 | 11.3005 | 81.8695 | 18,520,486 | 298,113,762 | +| 64 | webbase-1M | 0.0362 | 0.4072 | 1.9878 | 0.5330 | 1,000,005 | 3,105,536 | +| 65 | uk-2005 | 118.3243 | 4.4685 | 0.0000 | 179.2459 | 39,459,925 | 921,345,078 | +| 66 | arabic-2005 | 43.1304 | 5.8986 | 0.0000 | 150.0003 | 22,744,080 | 639,999,458 | diff --git a/test/Test_Guide.md b/test/Test_Guide.md new file mode 100644 index 0000000..c8db63d --- /dev/null +++ b/test/Test_Guide.md @@ -0,0 +1,21 @@ +# Introduction +We have presented a performance comparison of algorithms for DAWN, GAPBS, and Gunrock in [Performance](https://github.com/lxrzlyr/DAWN-An-Noval-SSSP-APSP-Algorithm/tree/dev/test/Performance.md). The benchmark tests were run on the Gunrock benchmark dataset and the Suite Sparse Collection dataset. 
The table provides specific information about the graphs and their corresponding runtimes. The baseline implementations from Gunrock and GAPBS are provided in the **test** directory.
+
+# Test Environment
+
+The test environment is as follows:
+
+- OS: Ubuntu 20.04.5 LTS
+- CPU: Intel Core i5-13600KF
+- GPU: NVIDIA GeForce RTX 2080 Ti
+- Memory: 32GB
+- CUDA: 12.1
+
+# Code
+
+We also provide the test code for Gunrock in **test/gunrock** and for GAPBS in **test/gapbs**. Due to differences in build environments and other aspects among the repositories, it is not possible to pull and build them uniformly. Instead, you can pull our modified fork branches and build them directly ([Gunrock](https://github.com/lxrzlyr/gunrock), [GAPBS](https://github.com/lxrzlyr/gapbs)).
+
+If you need to verify the results of Gunrock and GAPBS, please visit the repositories for [Gunrock](https://github.com/gunrock/gunrock) and [GAPBS](https://github.com/sbeamer/gapbs) respectively, follow the repository build instructions, and replace the source files in the repository with the ones we provide.
+
+# Check the Results
+We provide the files **check_unweighted.py** and **check_weighted.py**, based on networkx, which can be used to check the results printed by DAWN.
\ No newline at end of file
diff --git a/test/performance.md b/test/performance.md
deleted file mode 100644
index 7771e91..0000000
--- a/test/performance.md
+++ /dev/null
@@ -1,68 +0,0 @@
-| Number | graph | DAWN(20) | DAWN | Gunrock | GAP | Nodes | Edges |
-|---------:|:------------------|-----------:|--------:|-----------:|-----------:|------------:|------------:|
-| 1 | crankseg_1 | 0.4329 | 1.9271 | 2.6063 | 1.6238 | 52,804 | 5,333,507 |
-| 2 | loc-Brightkite | 0.0476 | 0.2038 | 0.3325 | 0.1170 | 58,228 | 214,078 |
-| 3 | p2p-Gnutella31 | 0.0537 | 0.0554 | 0.1565 | 0.1034 | 62,586 | 147,892 |
-| 4 | crankseg_2 | 0.5845 | 2.2151 | 3.0943 | 2.2400 | 63,838 | 7,106,348 |
-| 5 | nd24k | 1.3253 | 0.9969 | 2.5757 | 3.0087 | 72,000 | 14,393,817 |
-| 6 | m_t1 | 0.4056 | 1.0409 | 1.7550 | 1.6848 | 97,578 | 4,925,574 |
-| 7 | x104 | 0.4186 | 1.9346 | 2.1477 | 1.4713 | 108,384 | 5,138,004 |
-| 8 | Ge99H100 | 0.4398 | 0.5809 | 1.6101 | 0.5165 | 112,985 | 4,282,190 |
-| 9 | Ga10As10H30 | 0.3185 | 0.5182 | 1.6472 | 0.8665 | 113,081 | 3,114,357 |
-| 10 | torso1 | 0.6779 | 3.1728 | 8.3669 | 1.6456 | 116,158 | 8,516,500 |
-| 11 | boneS01 | 0.3197 | 0.636 | 1.2072 | 1.5120 | 127,224 | 3,421,188 |
-| 12 | Ga19As19H42 | 0.4815 | 0.6732 | 1.9866 | 0.7224 | 133,123 | 4,508,981 |
-| 13 | bmw7st_1 | 0.3491 | 1.441 | 2.0788 | 0.8944 | 141,347 | 3,740,507 |
-| 14 | bmwcra_1 | 0.4915 | 1.7262 | 2.2999 | 1.8295 | 148,770 | 5,396,386 |
-| 15 | pkustk14 | 0.6529 | 1.8374 | 2.7029 | 2.0295 | 151,926 | 7,494,215 |
-| 16 | SiO2 | 0.6265 | 0.9721 | 3.4822 | 2.3220 | 155,331 | 5,719,417 |
-| 17 | wave | 0.1836 | 0.2697 | 0.7051 | 1.3731 | 156,317 | 1,059,331 |
-| 18 | xenon2 | 0.3860 | 0.5752 | 1.3700 | 0.6304 | 157,464 | 3,866,688 |
-| 19 | PR02R | 0.8503 | 2.8204 | 3.7410 | 1.6571 | 161,070 | 8,185,136 |
-| 20 | mono_500Hz | 0.7519 | 0.7299 | 1.2950 | 2.2294 | 169,410 | 5,036,288 |
-| 21 | Si41Ge41H72 | 0.9252 | 0.8908 | 2.7422 | 1.3664 | 185,639 | 7,598,452 |
-| 22 | fullb | 0.5588 | 1.1828 | 1.8003 | 1.5714 | 199,187 | 5,953,632 |
-| 23 | com-DBLP | 0.6767 | 0.2183 | 0.5305 | 0.4102 | 317,080 | 2,099,732 |
-| 24 | web-BerkStan | 1.0788 | 16.3167 | 12.3961 | 2.5108 | 685,230 | 7,600,595 |
-| 25 | rgg_n_2_20_s0 | 5.3429 | 2.7489 | 50.6211 | 6.3703 | 1,048,576 | 13,783,240 |
-| 
26 | roadNet-PA | 0.8937 | 1.5831 | 46.2673 | 3.3110 | 1,090,920 | 3,083,796 | -| 27 | rgg_n_2_21_s0 | 15.9903 | 2.6134 | 70.1951 | 18.9606 | 2,097,152 | 28,975,990 | -| 28 | rgg_n_2_22_s0 | 39.7581 | 1.5922 | 101.4020 | 48.8593 | 4,194,304 | 60,718,396 | -| 29 | kmer_P1a | 1,172.8234 | 1.2926 | 109.0780 | 1,176.1875 | 139,353,211 | 595,659,968 | -| 30 | ak2010 | 0.0300 | 0.4593 | 0.7305 | 0.3957 | 45,292 | 108,549 | -| 31 | asia_osm | 10.9470 | 0.1233 | 2,790.2900 | 166.5654 | 11,950,757 | 12,711,603 | -| 32 | belgium_osm | 1.3884 | 0.9094 | 114.3420 | 6.2594 | 1,441,295 | 1,549,970 | -| 33 | caidaRouterLevel | 0.2296 | 0.3539 | 0.3719 | 1.8770 | 192,244 | 609,066 | -| 34 | coAuthorsCiteseer | 0.2801 | 0.3435 | 0.4572 | 0.8020 | 227,320 | 814,134 | -| 35 | coAuthorsDBLP | 0.4938 | 0.1872 | 0.3666 | 0.3507 | 299,067 | 977,676 | -| 36 | coPapersCiteseer | 3.1737 | 0.9108 | 3.2352 | 2.4913 | 434,102 | 16,036,720 | -| 37 | coPapersDBLP | 4.1394 | 0.8439 | 3.6637 | 2.4649 | 540,486 | 15,245,729 | -| 38 | delaunay_n10 | 0.0007 | 0.0616 | 0.2454 | 0.0469 | 1,024 | 3,056 | -| 39 | delaunay_n11 | 0.0014 | 0.0825 | 0.3064 | 0.1864 | 2,048 | 6,127 | -| 40 | delaunay_n12 | 0.0030 | 0.1182 | 0.4273 | 0.6764 | 4,096 | 12,264 | -| 41 | delaunay_n13 | 0.0068 | 0.1678 | 0.5871 | 0.3086 | 8,192 | 24,547 | -| 42 | delaunay_n14 | 0.0141 | 0.2271 | 0.8151 | 0.2320 | 16,384 | 49,122 | -| 43 | delaunay_n15 | 0.0280 | 0.3073 | 1.0599 | 0.4040 | 32,768 | 98,274 | -| 44 | delaunay_n16 | 0.0570 | 0.423 | 1.5167 | 0.3864 | 65,536 | 196,575 | -| 45 | delaunay_n17 | 0.1121 | 0.5943 | 2.0615 | 0.6685 | 131,072 | 393,176 | -| 46 | delaunay_n18 | 0.2376 | 1.0268 | 3.0184 | 1.4653 | 262,144 | 786,396 | -| 47 | delaunay_n19 | 0.4945 | 1.9673 | 23.2473 | 2.2740 | 524,288 | 1,572,823 | -| 48 | delaunay_n20 | 1.0880 | 2.2982 | 32.5912 | 2.4806 | 1,048,576 | 3,145,686 | -| 49 | delaunay_n21 | 2.8250 | 2.1149 | 45.4094 | 6.2044 | 2,097,152 | 6,291,408 | -| 50 | delaunay_n22 | 7.5536 | 2.1781 | 65.0226 | 16.0525 | 4,194,304 | 12,582,869 | -| 51 | delaunay_n23 | 19.7583 | 2.4131 | 93.4047 | 26.6763 | 8,388,608 | 25,165,784 | -| 52 | delaunay_n24 | 49.9102 | 2.4384 | 137.2600 | 107.3878 | 16,777,216 | 50,331,601 | -| 53 | europe_osm | 155.7345 | 0.3709 | 1,534.5400 | 270.3253 | 50,912,018 | 54,054,660 | -| 54 | germany_osm | 26.3836 | 0.0987 | 374.1470 | 52.3477 | 11,548,845 | 12,369,181 | -| 55 | great-britain_osm | 13.2017 | 0.2877 | 537.9020 | 43.1726 | 7,733,822 | 8,156,517 | -| 56 | indochina-2004 | 12.7614 | 3.5667 | 30.3925 | 53.9166 | 7,414,866 | 194,109,311 | -| 57 | italy_osm | 7.9301 | 0.3223 | 678.8460 | 43.5775 | 6,686,493 | 7,013,978 | -| 58 | luxembourg_osm | 0.0625 | 2.7094 | 12.7140 | 2.5926 | 114,599 | 119,666 | -| 59 | netherlands_osm | 1.7650 | 0.4556 | 140.5030 | 11.0192 | 2,216,688 | 2,441,238 | -| 60 | road_central | 58.5574 | 1.4313 | 330.8790 | 60.6113 | 1,971,281 | 2,766,607 | -| 61 | roadNet-CA | 1.9172 | 1.0457 | 52.3170 | 7.6407 | 14,081,816 | 16,933,413 | -| 62 | road_usa | 58.0327 | 0.7557 | 499.7490 | 227.5694 | 23,947,347 | 28,854,312 | -| 63 | uk-2002 | 12.2523 | 3.2653 | 11.3005 | 81.8695 | 18,520,486 | 298,113,762 | -| 64 | webbase-1M | 0.0362 | 0.4072 | 1.9878 | 0.5330 | 1,000,005 | 3,105,536 | -| 65 | uk-2005 | 118.3243 | 4.4685 | 0.0000 | 179.2459 | 39,459,925 | 921,345,078 | -| 66 | arabic-2005 | 43.1304 | 5.8986 | 0.0000 | 150.0003 | 22,744,080 | 639,999,458 | diff --git a/tool/process.sh b/tool/process.sh new file mode 100755 index 0000000..1f610b5 --- /dev/null +++ b/tool/process.sh @@ 
-0,0 +1,52 @@
+#!/bin/bash
+#
+# @author lxrzlyr (1289539524@qq.com)
+# @date 2024-05-07
+#
+# @copyright Copyright (c) 2024
+
+# Set MAIN and GRAPH_DIR to absolute paths, or to paths relative to the directory the script is run from.
+MAIN=""
+GRAPH_DIR=""
+OUTPUT=""
+SourceList=""
+Algorithm="SSSP"
+Interval="100"
+Print="false"
+Source="0"
+Stream="4"
+Block_size="1024"
+Weighted="unweighted"
+
+# Check that the GRAPH_DIR path exists and is a directory
+if [[ ! -d "${GRAPH_DIR}" ]]; then
+  echo "Error: ${GRAPH_DIR} does not exist or is not a directory!"
+  exit 1
+fi
+
+# Set directory path for the graph log files
+LOG_DIR=""
+
+# Create LOG_DIR if it doesn't exist already
+[[ ! -d "${LOG_DIR}" ]] && mkdir "${LOG_DIR}"
+
+
+# Loop over all mtx files in the GRAPH_DIR directory
+for file in "${GRAPH_DIR}"/*.mtx; do
+  if [[ ! -f "${file}" ]]; then
+    continue
+  fi
+
+  # Extract the filename from the filepath, without the .mtx extension
+  filename=$(basename -- "${file}")
+  filename="${filename%.*}"
+  echo "Processing ${file}! Please check the log file for details."
+  # Run the solver on the mtx file and redirect output to the logfile;
+  # uncomment the invocation that matches the artifact assigned to MAIN.
+  # "${MAIN}" "${Algorithm}" "${file}" "${OUTPUT}" "${Interval}" "${Print}" "${SourceList}" "${Weighted}" | tee "${LOG_DIR}/${filename}_log.txt" #cpu
+  # "${MAIN}" "${Algorithm}" "${file}" "${OUTPUT}" "${Stream}" "${Block_size}" "${Interval}" "${Print}" "${Source}" | tee "${LOG_DIR}/${filename}_log.txt" #gpu
+  # "${MAIN}" "${Algorithm}" "${file}" "${OUTPUT}" "${Block_size}" "${Print}" "${SourceList}" "${Weighted}" | tee "${LOG_DIR}/${filename}_log.txt" #mssp
+  "${MAIN}" "${file}" "${OUTPUT}" "${Block_size}" "${Print}" "${SourceList}" "${Weighted}" | tee "${LOG_DIR}/${filename}_log.txt" #test
+done
+
+echo "All done!"
diff --git a/tool/process_all.sh b/tool/process_all.sh
deleted file mode 100755
index e69de29..0000000
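
For reference, a minimal sketch of how the new tool/process.sh is meant to be configured and invoked; the paths and the binary name here are placeholders, not part of the patch:

```bash
# Hypothetical values; substitute your own build artifact and graph directory:
#   MAIN="$PROJECT_ROOT/build/sssp_cpu"   # solver binary to benchmark
#   GRAPH_DIR="$HOME/graphs"              # directory containing the .mtx files
#   OUTPUT="$PROJECT_ROOT/output.txt"     # result file passed to the solver
#   LOG_DIR="$HOME/graphs/log"            # one log file is written per graph
chmod +x tool/process.sh
bash tool/process.sh   # runs the uncommented invocation once per .mtx file
```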