Skip to content

Commit a26041a

Browse files
authored
Merge pull request karpathy#495 from ChrisDryden/shared_memory
Removed unnecessary dynamic shared memory allocation, because blockreduce uses statically defined shared memory
2 parents 9cf8c2f + 5450632 commit a26041a

File tree

2 files changed

+2
-2
lines changed

2 files changed

+2
-2
lines changed

dev/cuda/classifier_fused.cu

+1-1
Original file line numberDiff line numberDiff line change
@@ -664,7 +664,7 @@ void fused_classifier5(float* dlogits, float* losses,
664664
int B, int T, int V, int P, int block_size) {
665665
const int N = B * T;
666666
const int grid_size = N;
667-
fused_classifier_kernel5<true,false><<<grid_size, block_size, 512>>>((floatX*)dlogits, (floatX*)losses, NULL, (floatX*)logits, (floatX*)dlosses, targets, B, T, V, P);
667+
fused_classifier_kernel5<true,false><<<grid_size, block_size>>>((floatX*)dlogits, (floatX*)losses, NULL, (floatX*)logits, (floatX*)dlosses, targets, B, T, V, P);
668668
cudaCheck(cudaGetLastError());
669669
}
670670

train_gpt2.cu

+1-1
Original file line numberDiff line numberDiff line change
@@ -1874,7 +1874,7 @@ void fused_classifier(Type* logits, Type* losses,
18741874
const int block_size = 1024;
18751875
const int N = B * T;
18761876
const int grid_size = N;
1877-
fused_classifier_kernel5<<<grid_size, block_size, 512>>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P);
1877+
fused_classifier_kernel5<<<grid_size, block_size>>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P);
18781878
cudaCheck(cudaGetLastError());
18791879
}
18801880

0 commit comments

Comments
 (0)