diff --git a/torchaudio/csrc/ffmpeg/decoder.cpp b/torchaudio/csrc/ffmpeg/decoder.cpp
index 999b57da8b0..7614a8280ea 100644
--- a/torchaudio/csrc/ffmpeg/decoder.cpp
+++ b/torchaudio/csrc/ffmpeg/decoder.cpp
@@ -6,6 +6,127 @@ namespace ffmpeg {
 ////////////////////////////////////////////////////////////////////////////////
 // Decoder
 ////////////////////////////////////////////////////////////////////////////////
+namespace {
+AVCodecContextPtr get_decode_context(
+    enum AVCodecID codec_id,
+    const c10::optional<std::string>& decoder_name) {
+  const AVCodec* pCodec = !decoder_name.has_value()
+      ? avcodec_find_decoder(codec_id)
+      : avcodec_find_decoder_by_name(decoder_name.value().c_str());
+
+  if (!pCodec) {
+    std::stringstream ss;
+    if (!decoder_name.has_value()) {
+      ss << "Unsupported codec: \"" << avcodec_get_name(codec_id) << "\", ("
+         << codec_id << ").";
+    } else {
+      ss << "Unsupported codec: \"" << decoder_name.value() << "\".";
+    }
+    throw std::runtime_error(ss.str());
+  }
+
+  AVCodecContext* pCodecContext = avcodec_alloc_context3(pCodec);
+  if (!pCodecContext) {
+    throw std::runtime_error("Failed to allocate CodecContext.");
+  }
+  return AVCodecContextPtr(pCodecContext);
+}
+
+#ifdef USE_CUDA
+enum AVPixelFormat get_hw_format(
+    AVCodecContext* ctx,
+    const enum AVPixelFormat* pix_fmts) {
+  const enum AVPixelFormat* p = nullptr;
+  AVPixelFormat pix_fmt = *static_cast<AVPixelFormat*>(ctx->opaque);
+  for (p = pix_fmts; *p != AV_PIX_FMT_NONE; p++) {
+    if (*p == pix_fmt) {
+      return *p;
+    }
+  }
+  TORCH_WARN("Failed to get HW surface format.");
+  return AV_PIX_FMT_NONE;
+}
+
+const AVCodecHWConfig* get_cuda_config(const AVCodec* pCodec) {
+  for (int i = 0;; ++i) {
+    const AVCodecHWConfig* config = avcodec_get_hw_config(pCodec, i);
+    if (!config) {
+      break;
+    }
+    if (config->device_type == AV_HWDEVICE_TYPE_CUDA &&
+        config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) {
+      return config;
+    }
+  }
+  std::stringstream ss;
+  ss << "CUDA device was requested, but the codec \"" << pCodec->name
+     << "\" is not supported.";
+  throw std::runtime_error(ss.str());
+}
+#endif
+
+void init_codec_context(
+    AVCodecContext* pCodecContext,
+    AVCodecParameters* pParams,
+    const OptionDict& decoder_option,
+    const torch::Device& device,
+    AVBufferRefPtr& pHWBufferRef) {
+  int ret = avcodec_parameters_to_context(pCodecContext, pParams);
+  if (ret < 0) {
+    throw std::runtime_error(
+        "Failed to set CodecContext parameter: " + av_err2string(ret));
+  }
+
+#ifdef USE_CUDA
+  // Enable HW Acceleration
+  if (device.type() == c10::DeviceType::CUDA) {
+    const AVCodecHWConfig* config = get_cuda_config(pCodecContext->codec);
+    // TODO: check how to log
+    // C10_LOG << "Decoder " << pCodec->name << " supports device " <<
+    // av_hwdevice_get_type_name(config->device_type);
+
+    // https://www.ffmpeg.org/doxygen/trunk/hw__decode_8c_source.html#l00221
+    // 1. Set HW pixel format (config->pix_fmt) to opaque pointer.
+    // NOTE(review): a function-local `static thread_local` is initialized only
+    // on the first call per thread; later calls reuse that first pix_fmt.
+    static thread_local AVPixelFormat pix_fmt = config->pix_fmt;
+    pCodecContext->opaque = static_cast<void*>(&pix_fmt);
+    // 2. Set pCodecContext->get_format call back function which
+    // will retrieve the HW pixel format from opaque pointer.
+    pCodecContext->get_format = get_hw_format;
+    // 3. Create HW device context and set to pCodecContext.
+    AVBufferRef* hw_device_ctx = nullptr;
+    ret = av_hwdevice_ctx_create(
+        &hw_device_ctx,
+        AV_HWDEVICE_TYPE_CUDA,
+        std::to_string(device.index()).c_str(),
+        nullptr,
+        0);
+    if (ret < 0) {
+      throw std::runtime_error(
+          "Failed to create CUDA device context: " + av_err2string(ret));
+    }
+    assert(hw_device_ctx);
+    pCodecContext->hw_device_ctx = av_buffer_ref(hw_device_ctx);
+    pHWBufferRef.reset(hw_device_ctx);
+  }
+#endif
+
+  AVDictionary* opts = get_option_dict(decoder_option);
+  ret = avcodec_open2(pCodecContext, pCodecContext->codec, &opts);
+  clean_up_dict(opts);
+
+  if (ret < 0) {
+    throw std::runtime_error(
+        "Failed to initialize CodecContext: " + av_err2string(ret));
+  }
+
+  if (pParams->codec_type == AVMEDIA_TYPE_AUDIO && !pParams->channel_layout)
+    pParams->channel_layout =
+        av_get_default_channel_layout(pCodecContext->channels);
+}
+} // namespace
+
 Decoder::Decoder(
     AVCodecParameters* pParam,
     const c10::optional<std::string>& decoder_name,
@@ -13,12 +134,7 @@ Decoder::Decoder(
     const torch::Device& device)
     : pCodecContext(get_decode_context(pParam->codec_id, decoder_name)) {
   init_codec_context(
-      pCodecContext,
-      pParam,
-      decoder_name,
-      decoder_option,
-      device,
-      pHWBufferRef);
+      pCodecContext, pParam, decoder_option, device, pHWBufferRef);
 }
 
 int Decoder::process_packet(AVPacket* pPacket) {
diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.cpp b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
index 32a9fd8d472..045fbb080f7 100644
--- a/torchaudio/csrc/ffmpeg/ffmpeg.cpp
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.cpp
@@ -35,19 +35,6 @@ void clean_up_dict(AVDictionary* p) {
   }
 }
 
-namespace {
-
-// https://github.com/FFmpeg/FFmpeg/blob/4e6debe1df7d53f3f59b37449b82265d5c08a172/doc/APIchanges#L252-L260
-// Starting from libavformat 59 (ffmpeg 5),
-// AVInputFormat is const and related functions expect constant.
-#if LIBAVFORMAT_VERSION_MAJOR >= 59
-#define AVINPUT_FORMAT_CONST const
-#else
-#define AVINPUT_FORMAT_CONST
-#endif
-
-} // namespace
-
 ////////////////////////////////////////////////////////////////////////////////
 // AVFormatContext
 ////////////////////////////////////////////////////////////////////////////////
@@ -55,45 +42,6 @@ void AVFormatContextDeleter::operator()(AVFormatContext* p) {
   avformat_close_input(&p);
 };
 
-AVFormatContextPtr get_input_format_context(
-    const std::string& src,
-    const c10::optional<std::string>& device,
-    const OptionDict& option,
-    AVIOContext* io_ctx) {
-  AVFormatContext* pFormat = avformat_alloc_context();
-  if (!pFormat) {
-    throw std::runtime_error("Failed to allocate AVFormatContext.");
-  }
-  if (io_ctx) {
-    pFormat->pb = io_ctx;
-  }
-
-  auto* pInput = [&]() -> AVINPUT_FORMAT_CONST AVInputFormat* {
-    if (device.has_value()) {
-      std::string device_str = device.value();
-      AVINPUT_FORMAT_CONST AVInputFormat* p =
-          av_find_input_format(device_str.c_str());
-      if (!p) {
-        std::ostringstream msg;
-        msg << "Unsupported device/format: \"" << device_str << "\"";
-        throw std::runtime_error(msg.str());
-      }
-      return p;
-    }
-    return nullptr;
-  }();
-
-  AVDictionary* opt = get_option_dict(option);
-  int ret = avformat_open_input(&pFormat, src.c_str(), pInput, &opt);
-  clean_up_dict(opt);
-
-  if (ret < 0)
-    throw std::runtime_error(
-        "Failed to open the input \"" + src + "\" (" + av_err2string(ret) +
-        ").");
-  return AVFormatContextPtr(pFormat);
-}
-
 AVFormatContextPtr::AVFormatContextPtr(AVFormatContext* p)
     : Wrapper<AVFormatContext, AVFormatContextDeleter>(p) {}
 
@@ -162,136 +110,6 @@ void AVCodecContextDeleter::operator()(AVCodecContext* p) {
   avcodec_free_context(&p);
 };
 
-namespace {
-const AVCodec* get_decode_codec(
-    enum AVCodecID codec_id,
-    const c10::optional<std::string>& decoder_name) {
-  const AVCodec* pCodec = !decoder_name.has_value()
-      ? avcodec_find_decoder(codec_id)
-      : avcodec_find_decoder_by_name(decoder_name.value().c_str());
-
-  if (!pCodec) {
-    std::stringstream ss;
-    if (!decoder_name.has_value()) {
-      ss << "Unsupported codec: \"" << avcodec_get_name(codec_id) << "\", ("
-         << codec_id << ").";
-    } else {
-      ss << "Unsupported codec: \"" << decoder_name.value() << "\".";
-    }
-    throw std::runtime_error(ss.str());
-  }
-  return pCodec;
-}
-
-} // namespace
-
-AVCodecContextPtr get_decode_context(
-    enum AVCodecID codec_id,
-    const c10::optional<std::string>& decoder_name) {
-  const AVCodec* pCodec = get_decode_codec(codec_id, decoder_name);
-
-  AVCodecContext* pCodecContext = avcodec_alloc_context3(pCodec);
-  if (!pCodecContext) {
-    throw std::runtime_error("Failed to allocate CodecContext.");
-  }
-  return AVCodecContextPtr(pCodecContext);
-}
-
-#ifdef USE_CUDA
-enum AVPixelFormat get_hw_format(
-    AVCodecContext* ctx,
-    const enum AVPixelFormat* pix_fmts) {
-  const enum AVPixelFormat* p = nullptr;
-  AVPixelFormat pix_fmt = *static_cast<AVPixelFormat*>(ctx->opaque);
-  for (p = pix_fmts; *p != -1; p++) {
-    if (*p == pix_fmt) {
-      return *p;
-    }
-  }
-  TORCH_WARN("Failed to get HW surface format.");
-  return AV_PIX_FMT_NONE;
-}
-
-const AVCodecHWConfig* get_cuda_config(const AVCodec* pCodec) {
-  for (int i = 0;; ++i) {
-    const AVCodecHWConfig* config = avcodec_get_hw_config(pCodec, i);
-    if (!config) {
-      break;
-    }
-    if (config->device_type == AV_HWDEVICE_TYPE_CUDA &&
-        config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) {
-      return config;
-    }
-  }
-  std::stringstream ss;
-  ss << "CUDA device was requested, but the codec \"" << pCodec->name
-     << "\" is not supported.";
-  throw std::runtime_error(ss.str());
-}
-#endif
-
-void init_codec_context(
-    AVCodecContext* pCodecContext,
-    AVCodecParameters* pParams,
-    const c10::optional<std::string>& decoder_name,
-    const OptionDict& decoder_option,
-    const torch::Device& device,
-    AVBufferRefPtr& pHWBufferRef) {
-  const AVCodec* pCodec = get_decode_codec(pParams->codec_id, decoder_name);
-
-  int ret = avcodec_parameters_to_context(pCodecContext, pParams);
-  if (ret < 0) {
-    throw std::runtime_error(
-        "Failed to set CodecContext parameter: " + av_err2string(ret));
-  }
-
-#ifdef USE_CUDA
-  // Enable HW Acceleration
-  if (device.type() == c10::DeviceType::CUDA) {
-    const AVCodecHWConfig* config = get_cuda_config(pCodec);
-    // TODO: check how to log
-    // C10_LOG << "Decoder " << pCodec->name << " supports device " <<
-    // av_hwdevice_get_type_name(config->device_type);
-
-    // https://www.ffmpeg.org/doxygen/trunk/hw__decode_8c_source.html#l00221
-    // 1. Set HW pixel format (config->pix_fmt) to opaue pointer.
-    static thread_local AVPixelFormat pix_fmt = config->pix_fmt;
-    pCodecContext->opaque = static_cast<void*>(&pix_fmt);
-    // 2. Set pCodecContext->get_format call back function which
-    // will retrieve the HW pixel format from opaque pointer.
-    pCodecContext->get_format = get_hw_format;
-    // 3. Create HW device context and set to pCodecContext.
-    AVBufferRef* hw_device_ctx = nullptr;
-    ret = av_hwdevice_ctx_create(
-        &hw_device_ctx,
-        AV_HWDEVICE_TYPE_CUDA,
-        std::to_string(device.index()).c_str(),
-        nullptr,
-        0);
-    if (ret < 0) {
-      throw std::runtime_error(
-          "Failed to create CUDA device context: " + av_err2string(ret));
-    }
-    assert(hw_device_ctx);
-    pCodecContext->hw_device_ctx = av_buffer_ref(hw_device_ctx);
-    pHWBufferRef.reset(hw_device_ctx);
-  }
-#endif
-
-  AVDictionary* opts = get_option_dict(decoder_option);
-  ret = avcodec_open2(pCodecContext, pCodec, &opts);
-  clean_up_dict(opts);
-
-  if (ret < 0) {
-    throw std::runtime_error(
-        "Failed to initialize CodecContext: " + av_err2string(ret));
-  }
-
-  if (pParams->codec_type == AVMEDIA_TYPE_AUDIO && !pParams->channel_layout)
-    pParams->channel_layout =
-        av_get_default_channel_layout(pCodecContext->channels);
-}
-
 AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p)
     : Wrapper<AVCodecContext, AVCodecContextDeleter>(p) {}
 
diff --git a/torchaudio/csrc/ffmpeg/ffmpeg.h b/torchaudio/csrc/ffmpeg/ffmpeg.h
index cede6134eb3..6ecab63673b 100644
--- a/torchaudio/csrc/ffmpeg/ffmpeg.h
+++ b/torchaudio/csrc/ffmpeg/ffmpeg.h
@@ -27,6 +27,15 @@ namespace ffmpeg {
 
 using OptionDict = std::map<std::string, std::string>;
 
+// https://github.com/FFmpeg/FFmpeg/blob/4e6debe1df7d53f3f59b37449b82265d5c08a172/doc/APIchanges#L252-L260
+// Starting from libavformat 59 (ffmpeg 5),
+// AVInputFormat is const and related functions expect constant.
+#if LIBAVFORMAT_VERSION_MAJOR >= 59
+#define AVFORMAT_CONST const
+#else
+#define AVFORMAT_CONST
+#endif
+
 // Replacement of av_err2str, which causes
 // `error: taking address of temporary array`
 // https://github.com/joncampbell123/composite-video-simulator/issues/5
@@ -84,13 +93,6 @@ struct AVFormatContextPtr
   explicit AVFormatContextPtr(AVFormatContext* p);
 };
 
-// create format context for reading media
-AVFormatContextPtr get_input_format_context(
-    const std::string& src,
-    const c10::optional<std::string>& device,
-    const OptionDict& option,
-    AVIOContext* io_ctx = nullptr);
-
 ////////////////////////////////////////////////////////////////////////////////
 // AVIO
 ////////////////////////////////////////////////////////////////////////////////
@@ -166,20 +168,6 @@ struct AVCodecContextPtr
   explicit AVCodecContextPtr(AVCodecContext* p);
 };
 
-// Allocate codec context from either decoder name or ID
-AVCodecContextPtr get_decode_context(
-    enum AVCodecID codec_id,
-    const c10::optional<std::string>& decoder);
-
-// Initialize codec context with the parameters
-void init_codec_context(
-    AVCodecContext* pCodecContext,
-    AVCodecParameters* pParams,
-    const c10::optional<std::string>& decoder_name,
-    const OptionDict& decoder_option,
-    const torch::Device& device,
-    AVBufferRefPtr& pHWBufferRef);
-
 ////////////////////////////////////////////////////////////////////////////////
 // AVFilterGraph
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp
index ba168dc5c01..f3270a917d6 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp
+++ b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp
@@ -55,6 +55,45 @@ OutInfo convert(OutputStreamInfo osi) {
 }
 } // namespace
 
+AVFormatContextPtr get_input_format_context(
+    const std::string& src,
+    const c10::optional<std::string>& device,
+    const OptionDict& option,
+    AVIOContext* io_ctx) {
+  AVFormatContext* pFormat = avformat_alloc_context();
+  if (!pFormat) {
+    throw std::runtime_error("Failed to allocate AVFormatContext.");
+  }
+  if (io_ctx) {
+    pFormat->pb = io_ctx;
+  }
+
+  auto* pInput = [&]() -> AVFORMAT_CONST AVInputFormat* {
+    if (device.has_value()) {
+      std::string device_str = device.value();
+      AVFORMAT_CONST AVInputFormat* p =
+          av_find_input_format(device_str.c_str());
+      if (!p) {
+        std::ostringstream msg;
+        msg << "Unsupported device/format: \"" << device_str << "\"";
+        throw std::runtime_error(msg.str());
+      }
+      return p;
+    }
+    return nullptr;
+  }();
+
+  AVDictionary* opt = get_option_dict(option);
+  int ret = avformat_open_input(&pFormat, src.c_str(), pInput, &opt);
+  clean_up_dict(opt);
+
+  if (ret < 0)
+    throw std::runtime_error(
+        "Failed to open the input \"" + src + "\" (" + av_err2string(ret) +
+        ").");
+  return AVFormatContextPtr(pFormat);
+}
+
 StreamReaderBinding::StreamReaderBinding(AVFormatContextPtr&& p)
     : StreamReader(std::move(p)) {}
 
diff --git a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h
index 8ef67ec2bd3..fada6417f7c 100644
--- a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h
+++ b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h
@@ -5,6 +5,13 @@
 namespace torchaudio {
 namespace ffmpeg {
 
+// create format context for reading media
+AVFormatContextPtr get_input_format_context(
+    const std::string& src,
+    const c10::optional<std::string>& device,
+    const OptionDict& option,
+    AVIOContext* io_ctx = nullptr);
+
 // Because TorchScript requires c10::Dict type to pass dict,
 // while PyBind11 requires std::map type to pass dict,
 // we duplicate the return tuple.