diff --git a/src/GPUGraphsMatrix.jl b/src/GPUGraphsMatrix.jl
index 49dfb03..dc8ff9f 100644
--- a/src/GPUGraphsMatrix.jl
+++ b/src/GPUGraphsMatrix.jl
@@ -205,6 +205,7 @@ mutable struct SparseGPUMatrixSELL{
 } <: AbstractSparseGPUMatrix{Tv,Ti}
     m::Int
     n::Int
+    perm::Vector{Int} # Row permutation to reduce padding (not implemented yet)
     slice_size::Int
     nslices::Int # Number of slices
     nnz::Int # Number of nonzeros
@@ -219,6 +220,7 @@ mutable struct SparseGPUMatrixSELL{
     function SparseGPUMatrixSELL(
         m::Int,
         n::Int,
+        perm::Vector{Int},
         slice_size::Int,
         nslices::Int,
         nnz::Int,
@@ -269,6 +271,7 @@ mutable struct SparseGPUMatrixSELL{
         new{Tv,Ti,typeof(nzval_gpu),typeof(slice_ptr_gpu)}(
             m,
             n,
+            perm,
             slice_size,
             nslices,
             nnz,
@@ -304,6 +307,11 @@ function SparseGPUMatrixSELL(
     max_nnz_per_slice = zeros(Int, n_slices)
     nnz_per_row = diff(rowptr)
+    # Compute optimal permutation of rows to minimize padding (not implemented yet)
+    #perm = reverse!(sortperm(nnz_per_row[:]))
+    perm = collect(1:size(m_t, 2))
+    nnz_per_row = nnz_per_row[perm]
+
     # Compute the maximum number of nonzeros per row for each slice
     n_stored = 0
     for i = 1:n_slices
@@ -333,11 +341,15 @@ function SparseGPUMatrixSELL(
             if row > size(m_t, 2)
                 break
            end
-            start = rowptr[row]
-            end_ = rowptr[row+1] - 1
+
+            start = rowptr[perm[row]]
+            end_ = rowptr[perm[row]+1] - 1
             temp_colval[row-slice_start+1, 1:(end_-start+1)] = colval[start:end_]
             temp_nzval[row-slice_start+1, 1:(end_-start+1)] = nzval[start:end_]
+
+
         end
+
         # Reshape the sub-matrix to make it column-major vector and copy it to final storage
         colval_padded[slice_ptr[slice]:(slice_ptr[slice+1]-1)] =
@@ -346,11 +358,13 @@ function SparseGPUMatrixSELL(
             collect(Iterators.flatten(temp_nzval))
+
+
     end
     SparseGPUMatrixSELL(
         size(m_t, 2),
         size(m_t, 1),
+        perm,
         slice_size,
         n_slices,
         nnz(m_t),
@@ -372,7 +386,7 @@ function SparseGPUMatrixSELL(
 end
-# Base methods for the SparseGPUMatrixCSR type
+# Base methods for the SparseGPUMatrixSELL type
 Base.size(A::SparseGPUMatrixSELL) = (A.m, A.n)
 Base.size(A::SparseGPUMatrixSELL, i::Int) = (i == 1) ? A.m : A.n
 Base.length(A::SparseGPUMatrixSELL) = A.m * A.n
@@ -385,7 +399,7 @@ Base.display(A::SparseGPUMatrixSELL) = show(stdout, A)
 function Base.getindex(A::SparseGPUMatrixSELL, i::Int, j::Int)
-    #@warn "Scalar indexing on a SparseGPUMatrixCSR is slow. For better performance, vectorize the operation."
+    #@warn "Scalar indexing on a SparseGPUMatrixSELL is slow. For better performance, vectorize the operation."
     if i < 1 || i > A.m || j < 1 || j > A.n
         throw(BoundsError(A, (i, j)))
     end
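Note on the SELL constructor change above: the new `perm` field is still the identity permutation, and the commented-out `reverse!(sortperm(nnz_per_row[:]))` hints at the intended strategy of grouping rows of similar length into the same slice so each slice pads less. Below is a minimal CPU-side sketch of that idea, under the assumption that a slice stores `slice_size * max_nnz_in_slice` entries as in the constructor; `sell_padding` and the surrounding names are hypothetical and not part of the package.

```julia
# Hypothetical helper: count the padded entries a SELL layout would store for a
# given row permutation, assuming each slice holds slice_size * (max nnz among its rows).
function sell_padding(rowptr::Vector{<:Integer}, perm::Vector{Int}, slice_size::Int)
    nnz_per_row = diff(rowptr)[perm]
    nrows = length(nnz_per_row)
    padded = 0
    for s in 1:cld(nrows, slice_size)
        rows = ((s - 1) * slice_size + 1):min(s * slice_size, nrows)
        padded += slice_size * maximum(nnz_per_row[rows])
    end
    return padded - sum(nnz_per_row)   # stored entries that are pure padding
end

# Identity permutation (what the constructor currently uses) versus rows sorted
# by decreasing length (what the commented-out line would compute):
# perm_id     = collect(1:length(rowptr) - 1)
# perm_sorted = sortperm(diff(rowptr); rev = true)
# sell_padding(rowptr, perm_sorted, 32) <= sell_padding(rowptr, perm_id, 32)
```

Keeping `perm` inside the struct would then allow mapping permuted rows back to their original indices after an SpMV, which is presumably why the field was added ahead of the actual reordering.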
diff --git a/src/algorithms/shortest_path.jl b/src/algorithms/shortest_path.jl
index bd05a22..0cd13ba 100644
--- a/src/algorithms/shortest_path.jl
+++ b/src/algorithms/shortest_path.jl
@@ -57,6 +57,7 @@ function shortest_path!(
     while true
         iter += one(Ti)
+
         gpu_spmv!(
            next,
            A_T,
@@ -67,10 +68,13 @@
            #mask = updated, Not used yet
        )
        # Diff : where we made progress
-        @. diff = next < dist
-        if reduce(|, diff) == zero(Td)
-            return nothing
+        if iter%8 == 0
+            @. diff = next < dist
+            if reduce(|, diff) == zero(Td)
+                return nothing
+            end
        end
+
        # Update the dist array
        dist .= next
diff --git a/src/spmv.jl b/src/spmv.jl
index d08ae9e..0fd47d0 100644
--- a/src/spmv.jl
+++ b/src/spmv.jl
@@ -340,15 +340,18 @@ end
    add,
    accum,
 )
+    #slice, offset = @index(Global, NTuple)
+    #offset = offset - 1
+    #row = (slice-1) * slice_size + offset + 1
     row = @index(Global, Linear)
-    if mask[row] != mask_zero
-        slice = (row-1) ÷ slice_size + 1
-        offset = (row-1) % slice_size
+    slice = (row-1) ÷ slice_size + 1
+    offset = (row-1) % slice_size
+    if row <= n && mask[row] != mask_zero
         acc = monoid_neutral_element
         for i = (a_slice_ptr[slice]+offset):slice_size:(a_slice_ptr[slice+1]-1)
             col = a_col_val[i]
-            if col == -1
+            if col == -1
                 break
             end
             acc = add(acc, mul(a_nz_val[i], b[col], row, col, col, 1), row, col, col, 1)
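For reference, the indexing used by the reworked SELL SpMV kernel above can be mirrored by a plain CPU loop. This is only a sketch, specialized to the ordinary `+`/`*` semiring with no mask, assuming `-1` is the column padding sentinel (consistent with `const PAD_VAL = -1` in the tests); `sell_spmv_reference!` is a hypothetical helper, not an exported function.

```julia
# CPU sketch of the traversal in the kernel above, for the plain +/* semiring and
# no mask. Entries of a row sit in its slice's column-major block, spaced
# slice_size apart; a column value of -1 marks padding.
function sell_spmv_reference!(y::AbstractVector{T}, slice_ptr, col_val, nz_val,
                              x::AbstractVector, slice_size::Integer) where {T}
    for row in eachindex(y)
        slice = (row - 1) ÷ slice_size + 1
        offset = (row - 1) % slice_size
        acc = zero(T)
        for i in (slice_ptr[slice] + offset):slice_size:(slice_ptr[slice + 1] - 1)
            col = col_val[i]
            col == -1 && break   # remaining entries of this row are padding
            acc += nz_val[i] * x[col]
        end
        y[row] = acc
    end
    return y
end
```

The `row <= n && ...` guard added in the kernel presumably covers the extra work-items launched when the row count is not a multiple of the work-group size; a sequential loop like the one above does not need it.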
diff --git a/test/runtests.jl b/test/runtests.jl
index 6cdb786..5181a12 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -41,9 +41,9 @@ const PAD_VAL = -1
-    #include("structs.jl")
-    #include("spmv.jl")
-    #include("spmm.jl")
-    #include("bfs.jl")
+    include("structs.jl")
+    include("spmv.jl")
+    include("spmm.jl")
+    include("bfs.jl")
     include("shortest_path.jl")
 end
diff --git a/test/scratch.jl b/test/scratch.jl
index f0e410e..4dcff7c 100644
--- a/test/scratch.jl
+++ b/test/scratch.jl
@@ -109,41 +109,46 @@ using HarwellRutherfordBoeing
 MAIN_TYPE = Bool
 #MAIN_TYPE = Float32
+RES_TYPE = ifelse(MAIN_TYPE == Bool, Int32, MAIN_TYPE)
 BACKEND = CUDA.CUDABackend()
-ssmc = ssmc_db();
-orkut_path = fetch_ssmc(ssmc_matrices(ssmc, "SNAP", "Orkut"), format = "RB")[1]
-loaded_matrix = RutherfordBoeingData(joinpath(orkut_path, "com-Orkut.rb"));
-graph = SimpleDiGraph(loaded_matrix.data)
-
-#graph = dorogovtsev_mendes(150)
-A_cpu = transpose(
-    convert(
+use_dataset = true
+graph = dorogovtsev_mendes(150)
+if use_dataset
+    ssmc = ssmc_db();
+    orkut_path = fetch_ssmc(ssmc_matrices(ssmc, "SNAP", "Orkut"), format = "RB")[1]
+    nlpkkt_path = fetch_ssmc(ssmc_matrices(ssmc, "Schenk", "nlpkkt160"), format = "RB")[1]
+
+    #loaded_matrix = RutherfordBoeingData(joinpath(orkut_path, "com-Orkut.rb"));
+    loaded_matrix = RutherfordBoeingData(joinpath(nlpkkt_path, "nlpkkt160.rb"));
+    graph = SimpleDiGraph(loaded_matrix.data)
+else
+    graph = dorogovtsev_mendes(150)
+end
+A_cpu = transpose(convert(
         SparseMatrixCSC{MAIN_TYPE,Int32},
         adjacency_matrix(graph, MAIN_TYPE; dir = :both),
-    ),
-)
-#A2 = convert(
-#    SparseMatrixCSC{Bool,Int32},
-#    adjacency_matrix(graph, Bool; dir = :both),
-#)
-
-#A_T_gpu2 = SparseGPUMatrixCSR(transpose(A2), BACKEND)
+    ))
+
+
+
 SIZE = size(A_cpu, 2)
-SIZE_2 = 32
+A_T = SparseGPUMatrixSELL(A_cpu, BACKEND)
+A_T2 = SparseGPUMatrixCSR(A_cpu, BACKEND)
+
+SIZE_2 = 8
 B_cpu = rand(MAIN_TYPE, SIZE, SIZE_2);
 b_cpu = B_cpu[:, 1];
-C_cpu = zeros(MAIN_TYPE, SIZE, SIZE_2);
+C_cpu = zeros(RES_TYPE, SIZE, SIZE_2);
+
 
-A_T = SparseGPUMatrixSELL(A_cpu, BACKEND)
-A_T2 = SparseGPUMatrixCSR(A_cpu, BACKEND)
 B = KernelAbstractions.zeros(BACKEND, MAIN_TYPE, SIZE, SIZE_2);
 copyto!(B, B_cpu);
 b = KernelAbstractions.zeros(BACKEND, MAIN_TYPE, SIZE);
 copyto!(b, b_cpu);
-C = KernelAbstractions.zeros(BACKEND, MAIN_TYPE, SIZE, SIZE_2);
-c = KernelAbstractions.zeros(BACKEND, MAIN_TYPE, SIZE);
+C = KernelAbstractions.zeros(BACKEND, RES_TYPE, SIZE, SIZE_2);
+c = KernelAbstractions.zeros(BACKEND, RES_TYPE, SIZE);
 mask = rand(Bool, SIZE)
 mask_dense = KernelAbstractions.zeros(BACKEND, Bool, SIZE)
@@ -169,7 +174,14 @@ isapprox(C_cpu, C_res)
     end
     CUDA.synchronize()
 end
-c_res = zeros(MAIN_TYPE, SIZE);
+@benchmark begin
+    for _ = 1:SIZE_2
+        gpu_spmv!(c, A_T2, b, mask = mask_dense)
+    end
+    CUDA.synchronize()
+end
+
+c_res = zeros(RES_TYPE, SIZE);
 copyto!(c_res, c);
 c_cpu = A_cpu * b_cpu .* mask;
 isapprox(c_cpu, c_res)
@@ -201,7 +213,7 @@ isapprox(mat_res_cpu, vec_res)
 @benchmark begin
-    mat_res = GPUGraphs.shortest_path(A_T2, convert(Vector{Int32}, range(1, SIZE_2)));
+    mat_res = GPUGraphs.shortest_path(A_T, convert(Vector{Int32}, range(1, SIZE_2)));
     KernelAbstractions.synchronize(BACKEND)
 end
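A short note on the `RES_TYPE` introduced in test/scratch.jl: when `MAIN_TYPE == Bool`, the CPU reference `A_cpu * b_cpu` accumulates Bool products into integers, so the GPU-side result buffers are allocated with an integer element type before the `isapprox` comparison. A small illustration of that standard Julia promotion (not package code):

```julia
using SparseArrays

A = sparse(Bool[1 1; 0 1])   # Bool adjacency-style matrix
b = Bool[true, true]

A * b           # 2-element Vector{Int64}: [2, 1]; Bool matvec accumulates to Int
eltype(A * b)   # Int64 on 64-bit Julia, hence an integer RES_TYPE on the GPU side
```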