diff --git a/torchaudio/csrc/ffmpeg/buffer.cpp b/torchaudio/csrc/ffmpeg/buffer.cpp index d7fbd6acdd..5fd948b7c5 100644 --- a/torchaudio/csrc/ffmpeg/buffer.cpp +++ b/torchaudio/csrc/ffmpeg/buffer.cpp @@ -2,6 +2,10 @@ #include #include +#ifdef USE_CUDA +#include +#endif + namespace torchaudio { namespace ffmpeg { @@ -213,6 +217,94 @@ torch::Tensor convert_yuv420p(AVFrame* pFrame) { return t.permute({0, 3, 1, 2}); // NCHW } +torch::Tensor convert_nv12_cpu(AVFrame* pFrame) { + int width = pFrame->width; + int height = pFrame->height; + + auto options = torch::TensorOptions() + .dtype(torch::kUInt8) + .layout(torch::kStrided) + .device(torch::kCPU); + + torch::Tensor y = torch::empty({1, height, width, 1}, options); + { + uint8_t* tgt = y.data_ptr(); + uint8_t* src = pFrame->data[0]; + int linesize = pFrame->linesize[0]; + for (int h = 0; h < height; ++h) { + memcpy(tgt, src, width); + tgt += width; + src += linesize; + } + } + torch::Tensor uv = torch::empty({1, height / 2, width / 2, 2}, options); + { + uint8_t* tgt = uv.data_ptr(); + uint8_t* src = pFrame->data[1]; + int linesize = pFrame->linesize[1]; + for (int h = 0; h < height / 2; ++h) { + memcpy(tgt, src, width); + tgt += width; + src += linesize; + } + } + // Upsample width and height + uv = uv.repeat_interleave(2, -2).repeat_interleave(2, -3); + torch::Tensor t = torch::cat({y, uv}, -1); + return t.permute({0, 3, 1, 2}); // NCHW +} + +#ifdef USE_CUDA +torch::Tensor convert_nv12_cuda(AVFrame* pFrame) { + int width = pFrame->width; + int height = pFrame->height; + + auto options = torch::TensorOptions() + .dtype(torch::kUInt8) + .layout(torch::kStrided) + .device(torch::kCUDA); + + torch::Tensor y = torch::empty({1, height, width, 1}, options); + { + uint8_t* tgt = y.data_ptr(); + CUdeviceptr src = (CUdeviceptr)pFrame->data[0]; + int linesize = pFrame->linesize[0]; + if (cudaSuccess != + cudaMemcpy2D( + (void*)tgt, + width, + (const void*)src, + linesize, + width, + height, + cudaMemcpyDeviceToDevice)) { + throw std::runtime_error("Failed to copy Y plane to Cuda tensor."); + } + } + torch::Tensor uv = torch::empty({1, height / 2, width / 2, 2}, options); + { + uint8_t* tgt = uv.data_ptr(); + CUdeviceptr src = (CUdeviceptr)pFrame->data[1]; + int linesize = pFrame->linesize[1]; + if (cudaSuccess != + cudaMemcpy2D( + (void*)tgt, + width, + (const void*)src, + linesize, + width, + height / 2, + cudaMemcpyDeviceToDevice)) { + throw std::runtime_error("Failed to copy UV plane to Cuda tensor."); + } + } + // Upsample width and height + uv = uv.repeat_interleave(2, -2).repeat_interleave(2, -3); + torch::Tensor t = torch::cat({y, uv}, -1); + return t.permute({0, 3, 1, 2}); // NCHW +} +#endif + torch::Tensor convert_image_tensor(AVFrame* pFrame) { // ref: // https://ffmpeg.org/doxygen/4.1/filtering__video_8c_source.html#l00179 @@ -240,9 +332,35 @@ torch::Tensor convert_image_tensor(AVFrame* pFrame) { break; case AV_PIX_FMT_YUV420P: return convert_yuv420p(pFrame); + case AV_PIX_FMT_NV12: + return convert_nv12_cpu(pFrame); +#ifdef USE_CUDA + case AV_PIX_FMT_CUDA: { + AVHWFramesContext* hwctx = + (AVHWFramesContext*)pFrame->hw_frames_ctx->data; + AVPixelFormat sw_format = hwctx->sw_format; + // cuvid decoder (nvdec frontend of ffmpeg) only supports the following + // output formats + // https://github.com/FFmpeg/FFmpeg/blob/072101bd52f7f092ee976f4e6e41c19812ad32fd/libavcodec/cuviddec.c#L1121-L1124 + switch (sw_format) { + case AV_PIX_FMT_NV12: + return convert_nv12_cuda(pFrame); + case AV_PIX_FMT_P010: + case AV_PIX_FMT_P016: + throw std::runtime_error( + "Unsupported video format found in CUDA HW: " + + std::string(av_get_pix_fmt_name(sw_format))); + default: + throw std::runtime_error( + "Unexpected video format found in CUDA HW: " + + std::string(av_get_pix_fmt_name(sw_format))); + } + } +#endif default: throw std::runtime_error( - "Unexpected format: " + std::string(av_get_pix_fmt_name(format))); + "Unexpected video format: " + + std::string(av_get_pix_fmt_name(format))); } torch::Tensor t;