diff --git a/utils.h b/utils.h
index c286d0c..4140873 100644
--- a/utils.h
+++ b/utils.h
@@ -99,9 +99,9 @@ float testF16F16GemmMaxError(
     srand(time(0));
     for (int i = 0; i < M * K; i++)
-        h_a[i] = (T)(rand() / 65504 - 65504/2);
+        h_a[i] = (T)((rand() % 200 - 100) * 0.01); // -1 ~ 1
     for (int i = 0; i < K * N; i++)
-        h_b[i] = (T)(rand() / 65504 - 65504/2);
+        h_b[i] = (T)((rand() % 200 - 100) * 0.01);
 
     cpuF16F16Gemm(h_a, h_b, h_c, M, N, K);
@@ -148,9 +148,9 @@ float testF16F16GemmMaxError_V2(
     srand(time(0));
     for (int i = 0; i < M * K; i++)
-        h_a[i] = (T)(rand() / 65504 - 65504/2);
+        h_a[i] = (T)((rand() % 200 - 100) * 0.01); // -1 ~ 1
     for (int i = 0; i < K * N; i++)
-        h_b[i] = (T)(rand() / 65504 - 65504/2);
+        h_b[i] = (T)((rand() % 200 - 100) * 0.01);
 
     cublasHandle_t handle;
     cublasCreate(&handle);
@@ -160,12 +160,12 @@ float testF16F16GemmMaxError_V2(
     cudaMemcpy(d_a, h_a, size_a, cudaMemcpyHostToDevice);
     cudaMemcpy(d_b, h_b, size_b, cudaMemcpyHostToDevice);
 
-    cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, M, N, K,
-                &alpha, (half *)d_a, K, (half *)d_b, K,
-                &beta, (half *)d_c_ref, N);
-    // cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, K,
-    //             &alpha, (half *)d_b, K, (half *)d_a, K,
+    // cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, M, N, K,
+    //             &alpha, (half *)d_a, K, (half *)d_b, K,
     //             &beta, (half *)d_c_ref, N);
+    cublasHgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, K,
+                &alpha, (half *)d_b, K, (half *)d_a, K,
+                &beta, (half *)d_c_ref, N);
     // The two cuBLAS calls above are equivalent
 
     gpuF16F16Gemm(d_a, d_b, d_c, M, N, K);
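
Note on the reference call that is now active: cuBLAS assumes column-major storage, so a row-major C = A*B is usually obtained by computing C^T = B^T * A^T, i.e. swapping the two operands and swapping M and N, which is the shape of the uncommented cublasHgemm above. The standalone sketch below is illustrative only (not part of utils.h; the file name and all variables in it are made up) and assumes plain row-major strides for A (lda = K), B (ldb = N) and C (ldc = N); the transpose flags and leading dimensions in the diff differ from it because they depend on how this repo actually lays out h_b.

// row_major_hgemm_demo.cu -- illustrative sketch only, not part of utils.h.
// Assumes plain row-major storage: A is MxK (lda = K), B is KxN (ldb = N),
// C is MxN (ldc = N). cuBLAS is column-major, and the memory of a row-major
// matrix X is identical to the memory of column-major X^T, so computing
//   C^T (NxM) = B^T (NxK) * A^T (KxM)
// with no transpose flags leaves row-major C in d_c directly.
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_fp16.h>
#include <cublas_v2.h>

int main() {
    const int M = 2, N = 3, K = 4;
    std::vector<half> h_a(M * K), h_b(K * N), h_c(M * N);
    for (int i = 0; i < M * K; i++) h_a[i] = __float2half((rand() % 200 - 100) * 0.01f); // -1 ~ 1
    for (int i = 0; i < K * N; i++) h_b[i] = __float2half((rand() % 200 - 100) * 0.01f);

    half *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, M * K * sizeof(half));
    cudaMalloc(&d_b, K * N * sizeof(half));
    cudaMalloc(&d_c, M * N * sizeof(half));
    cudaMemcpy(d_a, h_a.data(), M * K * sizeof(half), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b.data(), K * N * sizeof(half), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    half alpha = __float2half(1.0f), beta = __float2half(0.0f);

    // Operands swapped (B first, A second) and m/n swapped (N, M, K):
    // from cuBLAS's column-major view d_b already holds B^T and d_a holds A^T.
    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                N, M, K,
                &alpha,
                d_b, N,   // B^T: N x K, ld = N
                d_a, K,   // A^T: K x M, ld = K
                &beta,
                d_c, N);  // C^T: N x M, ld = N  (== row-major C)

    cudaMemcpy(h_c.data(), d_c, M * N * sizeof(half), cudaMemcpyDeviceToHost);
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) printf("%7.3f ", __half2float(h_c[i * N + j]));
        printf("\n");
    }

    cublasDestroy(handle);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}

The sketch builds with `nvcc row_major_hgemm_demo.cu -lcublas`; the printed matrix can be checked against a CPU loop over h_a and h_b, just as cpuF16F16Gemm is used as the reference in testF16F16GemmMaxError.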