@@ -1,5 +1,7 @@
+#include <libtorchaudio/cuda_utils.h>
 #include <libtorchaudio/utils.h>
 #include <torch/csrc/stable/library.h>
+#include <torch/csrc/stable/macros.h>
 #include <torch/headeronly/core/Dispatch_v2.h>
 #include <torch/headeronly/core/ScalarType.h>

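The new libtorchaudio/cuda_utils.h header is not part of this hunk, so the sketch below is only a hypothetical illustration of the stream helpers it is assumed to provide (getCurrentCUDAStream, getStreamFromPool, setCurrentCUDAStream, synchronize), written as thin wrappers over c10::cuda that hand back raw cudaStream_t handles, which is consistent with the stream being passed directly to cudaMemcpyAsync further down. The real header may instead route through the stable C shim; treat every definition here as an assumption.

// Hypothetical sketch only -- not the actual contents of libtorchaudio/cuda_utils.h.
// Assumes the helpers return raw cudaStream_t handles and forward to c10::cuda.
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>

namespace libtorchaudio {
namespace cuda {

inline cudaStream_t getCurrentCUDAStream(c10::DeviceIndex device_index) {
  // Current ambient PyTorch stream for the given device, as a raw handle.
  return c10::cuda::getCurrentCUDAStream(device_index).stream();
}

inline cudaStream_t getStreamFromPool(
    bool high_priority,
    c10::DeviceIndex device_index) {
  // Borrow a side stream from PyTorch's stream pool, e.g. for device-to-host copies.
  return c10::cuda::getStreamFromPool(high_priority, device_index).stream();
}

inline void setCurrentCUDAStream(
    cudaStream_t stream,
    c10::DeviceIndex device_index) {
  // Re-wrap the raw handle so c10 can install it as the current stream.
  c10::cuda::setCurrentCUDAStream(
      c10::cuda::getStreamFromExternal(stream, device_index));
}

inline void synchronize(cudaStream_t stream, c10::DeviceIndex /*device_index*/) {
  // Block the host until all work queued on this stream has drained.
  cudaStreamSynchronize(stream);
}

} // namespace cuda
} // namespace libtorchaudio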
@@ -119,8 +121,9 @@ void forced_align_impl(
     const Tensor& targets,
     const int64_t blank,
     Tensor& paths) {
-  auto defaultStream = at::cuda::getCurrentCUDAStream();
-  auto cpuDataTranferStream = at::cuda::getStreamFromPool();
+  auto device_index = logProbs.get_device_index();
+  auto defaultStream = libtorchaudio::cuda::getCurrentCUDAStream(device_index);
+  auto cpuDataTranferStream = libtorchaudio::cuda::getStreamFromPool(false, device_index);
   const scalar_t kNegInfinity = -std::numeric_limits<scalar_t>::infinity();
   using target_t = typename std::
       conditional<target_scalar_type == ScalarType::Int, int, int64_t>::type;
@@ -204,29 +207,29 @@ void forced_align_impl(
             backPtrBufferLen,
             torchaudio::packed_accessor32<scalar_t, 2>(alphas),
             torchaudio::packed_accessor32<int8_t, 2>(backPtrBuffer));
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
+    STD_CUDA_KERNEL_LAUNCH_CHECK();
     ++backPtrBufferLen;
     if (backPtrBufferLen == kBackPtrBufferSize || t == T - 1) {
-      cpuDataTranferStream.synchronize();
+      libtorchaudio::cuda::synchronize(cpuDataTranferStream, device_index);
       // GPU -> GPU copy
       bufferCopy = torch::stable::clone(backPtrBuffer);
       STD_TORCH_CHECK(bufferCopy.is_contiguous(), "unexpected fail, need to implement stable::Tensor::contiguous()")
-      defaultStream.synchronize();
-      at::cuda::setCurrentCUDAStream(cpuDataTranferStream);
+      libtorchaudio::cuda::synchronize(defaultStream, device_index);
+      libtorchaudio::cuda::setCurrentCUDAStream(cpuDataTranferStream, device_index);
       // Copy ASYNC from GPU to CPU
       int64_t offset =
           static_cast<int64_t>(t + 1 - backPtrBufferLen) * S * sizeof(int8_t);
-      C10_CUDA_CHECK(cudaMemcpyAsync(
+      STD_CUDA_CHECK(cudaMemcpyAsync(
           static_cast<int8_t*>(backPtrCpu.data_ptr()) + offset,
           bufferCopy.data_ptr(),
           backPtrBufferLen * S * sizeof(int8_t),
           cudaMemcpyDeviceToHost,
           cpuDataTranferStream));
-      at::cuda::setCurrentCUDAStream(defaultStream);
+      libtorchaudio::cuda::setCurrentCUDAStream(defaultStream, device_index);
       backPtrBufferLen = 0;
     }
   }
-  cpuDataTranferStream.synchronize();
+  libtorchaudio::cuda::synchronize(cpuDataTranferStream, device_index);
   auto alphasCpu = torchaudio::stable::cpu(alphas);
   auto alphasCpu_a = torchaudio::accessor<scalar_t, 2>(alphasCpu);
   int curIdxOffset = ((T - 1) % 2);
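STD_CUDA_CHECK and STD_CUDA_KERNEL_LAUNCH_CHECK replace the c10 error-checking macros, but their definitions are not shown in this diff. A plausible shape for them, assuming STD_TORCH_CHECK behaves like TORCH_CHECK with a condition followed by a message, is sketched below; the names err__ and the exact expansion are hypothetical.

// Hypothetical definitions -- the actual macros live elsewhere in this PR.
// Assumes STD_TORCH_CHECK(cond, msg) reports failures through the stable ABI.
#define STD_CUDA_CHECK(expr)                                          \
  do {                                                                \
    cudaError_t err__ = (expr);                                       \
    STD_TORCH_CHECK(err__ == cudaSuccess, cudaGetErrorString(err__)); \
  } while (0)

// A kernel launch returns nothing, so query the last recorded runtime error.
#define STD_CUDA_KERNEL_LAUNCH_CHECK() STD_CUDA_CHECK(cudaGetLastError())

Either way, the intended effect matches the C10 macros they replace: surface launch failures and async memcpy errors as loud errors instead of letting them pass silently.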
|