Skip to content

Commit

Permalink
Support specifying decoder and its options
Browse files Browse the repository at this point in the history
This commit adds support for specifying a decoder in Streamer's add-stream methods.
This is roughly equivalent to `ffmpeg`'s `-c:v foo` and `-c:a foo` options.

This makes it possible to override the decoder codec and/or to specify
options for the decoder.

This change makes it possible to select an NVIDIA NVDEC decoder for supported
formats, which uses dedicated hardware to decode the video.
  • Loading branch information
mthrok committed Apr 14, 2022
1 parent 7972be9 commit 63275e5
Show file tree
Hide file tree
Showing 10 changed files with 140 additions and 36 deletions.
6 changes: 5 additions & 1 deletion torchaudio/csrc/ffmpeg/decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@ namespace ffmpeg {
////////////////////////////////////////////////////////////////////////////////
// Decoder
////////////////////////////////////////////////////////////////////////////////
Decoder::Decoder(AVCodecParameters* pParam) : pCodecContext(pParam) {}
// Construct a Decoder for a stream described by `pParam`.
//
// All three arguments are forwarded as-is to the constructor of the
// `pCodecContext` member:
//   pParam         - codec parameters of the source stream.
//   decoder_name   - name of the decoder to use.
//   decoder_option - string-to-string options for the decoder.
Decoder::Decoder(
AVCodecParameters* pParam,
std::string decoder_name,
std::map<std::string, std::string> decoder_option)
: pCodecContext(pParam, decoder_name, decoder_option) {}

int Decoder::process_packet(AVPacket* pPacket) {
return avcodec_send_packet(pCodecContext, pPacket);
Expand Down
5 changes: 4 additions & 1 deletion torchaudio/csrc/ffmpeg/decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@ class Decoder {

public:
// Constructed from codec parameters, a decoder name, and decoder options
Decoder(AVCodecParameters* pParam);
Decoder(
AVCodecParameters* pParam,
std::string decoder_name,
std::map<std::string, std::string> decoder_option);
// Custom destructor to clean up the resources
~Decoder() = default;
// Non-copyable
Expand Down
45 changes: 36 additions & 9 deletions torchaudio/csrc/ffmpeg/ffmpeg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,22 @@ void AVCodecContextDeleter::operator()(AVCodecContext* p) {
};

namespace {
AVCodecContext* get_codec_context(AVCodecParameters* pParams) {
const AVCodec* pCodec = avcodec_find_decoder(pParams->codec_id);
AVCodecContext* get_codec_context(
enum AVCodecID codec_id,
std::string decoder_name) {
const AVCodec* pCodec = decoder_name.empty()
? avcodec_find_decoder(codec_id)
: avcodec_find_decoder_by_name(decoder_name.c_str());

if (!pCodec) {
throw std::runtime_error("Unknown codec.");
std::stringstream ss;
if (decoder_name.empty()) {
ss << "Unsupported codec: \"" << avcodec_get_name(codec_id) << "\", ("
<< codec_id << ").";
} else {
ss << "Unsupported codec: \"" << decoder_name << "\".";
}
throw std::runtime_error(ss.str());
}

AVCodecContext* pCodecContext = avcodec_alloc_context3(pCodec);
Expand All @@ -167,27 +178,43 @@ AVCodecContext* get_codec_context(AVCodecParameters* pParams) {

// Configure and open `pCodecContext` so it is ready for decoding.
//
// Steps, in order:
//   1. Look up the decoder: by `decoder_name` when it is non-empty, otherwise
//      by `pParams->codec_id`.
//   2. Copy the stream parameters from `pParams` into the context.
//   3. Open the codec, passing `decoder_option` converted to an AVDictionary.
//   4. Reject any option keys that clean_up_dict reports after the open call.
//   5. For audio streams with no channel layout set, write a default layout
//      (derived from the context's channel count) back into `pParams`.
//
// Throws std::runtime_error when the parameters cannot be applied, when the
// codec cannot be opened, or when unexpected decoder options remain.
// Note that `pParams` is modified in step 5; it is not a read-only input.
void init_codec_context(
AVCodecContext* pCodecContext,
AVCodecParameters* pParams) {
const AVCodec* pCodec = avcodec_find_decoder(pParams->codec_id);
AVCodecParameters* pParams,
std::string decoder_name,
std::map<std::string, std::string> decoder_option) {
const AVCodec* pCodec = decoder_name.empty()
? avcodec_find_decoder(pParams->codec_id)
: avcodec_find_decoder_by_name(decoder_name.c_str());

// No need to check if pCodec is null as it's been already checked in
// get_codec_context

if (avcodec_parameters_to_context(pCodecContext, pParams) < 0) {
throw std::runtime_error("Failed to set CodecContext parameter.");
}

if (avcodec_open2(pCodecContext, pCodec, NULL) < 0) {
// NOTE(review): when avcodec_open2 fails, the throw below is reached before
// clean_up_dict(opts) runs; presumably the dictionary built by
// get_option_dict is then never released -- confirm and free it on the
// error path as well.
AVDictionary* opts = get_option_dict(decoder_option);
if (avcodec_open2(pCodecContext, pCodec, &opts) < 0) {
throw std::runtime_error("Failed to initialize CodecContext.");
}
// Any keys reported here are treated as a user error rather than being
// silently ignored.
auto unused_keys = clean_up_dict(opts);
if (unused_keys.size()) {
throw std::runtime_error(
"Unexpected decoder options: " + join(unused_keys));
}

// Audio only: fill in a missing channel layout on the stream parameters.
if (pParams->codec_type == AVMEDIA_TYPE_AUDIO && !pParams->channel_layout)
pParams->channel_layout =
av_get_default_channel_layout(pCodecContext->channels);
}
} // namespace

// Build a codec context for decoding the stream described by `pParam`.
//
// The context is created by get_codec_context (selected by `decoder_name`
// when non-empty, otherwise by `pParam->codec_id`) and passed to the Wrapper
// base; it is then configured and opened by init_codec_context using
// `decoder_option`. Exceptions thrown by either helper propagate to the
// caller.
AVCodecContextPtr::AVCodecContextPtr(AVCodecParameters* pParam)
AVCodecContextPtr::AVCodecContextPtr(
AVCodecParameters* pParam,
std::string decoder_name,
std::map<std::string, std::string> decoder_option)
: Wrapper<AVCodecContext, AVCodecContextDeleter>(
get_codec_context(pParam)) {
init_codec_context(ptr.get(), pParam);
get_codec_context(pParam->codec_id, decoder_name)) {
init_codec_context(ptr.get(), pParam, decoder_name, decoder_option);
}
////////////////////////////////////////////////////////////////////////////////
// AVFilterGraph
Expand Down
5 changes: 4 additions & 1 deletion torchaudio/csrc/ffmpeg/ffmpeg.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,10 @@ struct AVCodecContextDeleter {
};
struct AVCodecContextPtr
: public Wrapper<AVCodecContext, AVCodecContextDeleter> {
AVCodecContextPtr(AVCodecParameters* pParam);
AVCodecContextPtr(
AVCodecParameters* pParam,
std::string decoder,
std::map<std::string, std::string> decoder_option);
};

////////////////////////////////////////////////////////////////////////////////
Expand Down
36 changes: 26 additions & 10 deletions torchaudio/csrc/ffmpeg/prototype.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ namespace ffmpeg {

namespace {

using OptionDict = c10::Dict<std::string, std::string>;

std::map<std::string, std::string> convert_dict(
const c10::optional<c10::Dict<std::string, std::string>>& option) {
const c10::optional<OptionDict>& option) {
std::map<std::string, std::string> opts;
if (option) {
for (auto& it : option.value()) {
Expand All @@ -23,7 +25,7 @@ struct StreamerHolder : torch::CustomClassHolder {
StreamerHolder(
const std::string& src,
c10::optional<std::string> device,
c10::optional<c10::Dict<std::string, std::string>> option)
c10::optional<OptionDict> option)
: s(src, device.value_or(""), convert_dict(option)) {}
};

Expand All @@ -32,7 +34,7 @@ using S = c10::intrusive_ptr<StreamerHolder>;
S init(
const std::string& src,
c10::optional<std::string> device,
c10::optional<c10::Dict<std::string, std::string>> option) {
c10::optional<OptionDict> option) {
return c10::make_intrusive<StreamerHolder>(src, device, option);
}

Expand Down Expand Up @@ -231,7 +233,7 @@ void add_basic_audio_stream(
const c10::optional<int64_t>& sample_rate,
const c10::optional<c10::ScalarType>& dtype) {
std::string filter_desc = get_afilter_desc(sample_rate, dtype);
s->s.add_audio_stream(i, frames_per_chunk, num_chunks, filter_desc);
s->s.add_audio_stream(i, frames_per_chunk, num_chunks, filter_desc, "", {});
}

void add_basic_video_stream(
Expand All @@ -244,27 +246,41 @@ void add_basic_video_stream(
const c10::optional<int64_t>& height,
const c10::optional<std::string>& format) {
std::string filter_desc = get_vfilter_desc(frame_rate, width, height, format);
s->s.add_video_stream(i, frames_per_chunk, num_chunks, filter_desc);
s->s.add_video_stream(i, frames_per_chunk, num_chunks, filter_desc, "", {});
}

// Binding-level entry point for adding an output audio stream.
//
// Forwards to add_audio_stream on the streamer held by `s`. The optional
// arguments are normalized first: a missing `filter_desc` or `decoder`
// becomes an empty string, and `decoder_options` is converted to a std::map
// by convert_dict.
void add_audio_stream(
S s,
int64_t i,
int64_t frames_per_chunk,
int64_t num_chunks,
const c10::optional<std::string>& filter_desc) {
const c10::optional<std::string>& filter_desc,
const c10::optional<std::string>& decoder,
const c10::optional<OptionDict>& decoder_options) {
s->s.add_audio_stream(
i, frames_per_chunk, num_chunks, filter_desc.value_or(""));
i,
frames_per_chunk,
num_chunks,
filter_desc.value_or(""),
decoder.value_or(""),
convert_dict(decoder_options));
}

// Binding-level entry point for adding an output video stream.
//
// Forwards to add_video_stream on the streamer held by `s`. The optional
// arguments are normalized first: a missing `filter_desc` or `decoder`
// becomes an empty string, and `decoder_options` is converted to a std::map
// by convert_dict.
void add_video_stream(
S s,
int64_t i,
int64_t frames_per_chunk,
int64_t num_chunks,
const c10::optional<std::string>& filter_desc) {
const c10::optional<std::string>& filter_desc,
const c10::optional<std::string>& decoder,
const c10::optional<OptionDict>& decoder_options) {
s->s.add_video_stream(
i, frames_per_chunk, num_chunks, filter_desc.value_or(""));
i,
frames_per_chunk,
num_chunks,
filter_desc.value_or(""),
decoder.value_or(""),
convert_dict(decoder_options));
}

void remove_stream(S s, int64_t i) {
Expand Down Expand Up @@ -308,7 +324,7 @@ std::tuple<c10::optional<torch::Tensor>, int64_t> load(const std::string& src) {
int i = s.find_best_audio_stream();
auto sinfo = s.get_src_stream_info(i);
int64_t sample_rate = static_cast<int64_t>(sinfo.sample_rate);
s.add_audio_stream(i, -1, -1, "");
s.add_audio_stream(i, -1, -1, "", "", {});
process_all_packets(s);
auto tensors = s.pop_chunks();
return std::make_tuple<>(tensors[0], sample_rate);
Expand Down
7 changes: 5 additions & 2 deletions torchaudio/csrc/ffmpeg/stream_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@ namespace ffmpeg {

using KeyType = StreamProcessor::KeyType;

StreamProcessor::StreamProcessor(AVCodecParameters* codecpar)
: decoder(codecpar) {}
// Construct a StreamProcessor for the stream described by `codecpar`.
//
// The codec parameters, decoder name, and decoder options are forwarded
// as-is to the constructor of the `decoder` member.
StreamProcessor::StreamProcessor(
AVCodecParameters* codecpar,
std::string decoder_name,
std::map<std::string, std::string> decoder_option)
: decoder(codecpar, decoder_name, decoder_option) {}

////////////////////////////////////////////////////////////////////////////////
// Configurations
Expand Down
5 changes: 4 additions & 1 deletion torchaudio/csrc/ffmpeg/stream_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ class StreamProcessor {
std::map<KeyType, Sink> sinks;

public:
StreamProcessor(AVCodecParameters* codecpar);
StreamProcessor(
AVCodecParameters* codecpar,
std::string decoder_name,
std::map<std::string, std::string> decoder_option);
~StreamProcessor() = default;
// Non-copyable
StreamProcessor(const StreamProcessor&) = delete;
Expand Down
23 changes: 17 additions & 6 deletions torchaudio/csrc/ffmpeg/streamer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,39 +156,50 @@ void Streamer::add_audio_stream(
int i,
int frames_per_chunk,
int num_chunks,
std::string filter_desc) {
std::string filter_desc,
std::string decoder,
std::map<std::string, std::string> decoder_option) {
add_stream(
i,
AVMEDIA_TYPE_AUDIO,
frames_per_chunk,
num_chunks,
std::move(filter_desc));
std::move(filter_desc),
decoder,
decoder_option);
}

// Add an output video stream for source stream `i`.
//
// Thin wrapper around add_stream that fixes the media type to
// AVMEDIA_TYPE_VIDEO. `filter_desc` is moved into the call, while `decoder`
// and `decoder_option` are passed as copies.
void Streamer::add_video_stream(
int i,
int frames_per_chunk,
int num_chunks,
std::string filter_desc) {
std::string filter_desc,
std::string decoder,
std::map<std::string, std::string> decoder_option) {
add_stream(
i,
AVMEDIA_TYPE_VIDEO,
frames_per_chunk,
num_chunks,
std::move(filter_desc));
std::move(filter_desc),
decoder,
decoder_option);
}

void Streamer::add_stream(
int i,
AVMediaType media_type,
int frames_per_chunk,
int num_chunks,
std::string filter_desc) {
std::string filter_desc,
std::string decoder,
std::map<std::string, std::string> decoder_option) {
validate_src_stream_type(i, media_type);
AVStream* stream = pFormatContext->streams[i];
stream->discard = AVDISCARD_DEFAULT;
if (!processors[i])
processors[i] = std::make_unique<StreamProcessor>(stream->codecpar);
processors[i] = std::make_unique<StreamProcessor>(
stream->codecpar, decoder, decoder_option);
int key = processors[i]->add_stream(
stream->time_base,
stream->codecpar,
Expand Down
12 changes: 9 additions & 3 deletions torchaudio/csrc/ffmpeg/streamer.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,16 @@ class Streamer {
int i,
int frames_per_chunk,
int num_chunks,
std::string filter_desc);
std::string filter_desc,
std::string decoder,
std::map<std::string, std::string> decoder_option);
void add_video_stream(
int i,
int frames_per_chunk,
int num_chunks,
std::string filter_desc);
std::string filter_desc,
std::string decoder,
std::map<std::string, std::string> decoder_option);
void remove_stream(int i);

private:
Expand All @@ -80,7 +84,9 @@ class Streamer {
AVMediaType media_type,
int frames_per_chunk,
int num_chunks,
std::string filter_desc);
std::string filter_desc,
std::string decoder,
std::map<std::string, std::string> decoder_option);

public:
//////////////////////////////////////////////////////////////////////////////
Expand Down
32 changes: 30 additions & 2 deletions torchaudio/prototype/io/streamer.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,8 @@ def add_audio_stream(
buffer_chunk_size: int = 3,
stream_index: Optional[int] = None,
filter_desc: Optional[str] = None,
decoder: Optional[str] = None,
decoder_options: Optional[Dict[str, str]] = None,
):
"""Add output audio stream
Expand All @@ -375,10 +377,22 @@ def add_audio_stream(
The list of available filters can be found at
https://ffmpeg.org/ffmpeg-filters.html
Note that complex filters are not supported.
decoder (str or None, optional): The name of the decoder to be used.
When provided, use the specified decoder instead of the default one.
decoder_options (dict or None, optional): Options passed to decoder.
Mapping from str to str.
"""
i = self.default_audio_stream if stream_index is None else stream_index
torch.ops.torchaudio.ffmpeg_streamer_add_audio_stream(
self._s, i, frames_per_chunk, buffer_chunk_size, filter_desc
self._s,
i,
frames_per_chunk,
buffer_chunk_size,
filter_desc,
decoder,
decoder_options,
)

def add_video_stream(
Expand All @@ -387,6 +401,8 @@ def add_video_stream(
buffer_chunk_size: int = 3,
stream_index: Optional[int] = None,
filter_desc: Optional[str] = None,
decoder: Optional[str] = None,
decoder_options: Optional[Dict[str, str]] = None,
):
"""Add output video stream
Expand All @@ -407,10 +423,22 @@ def add_video_stream(
The list of available filters can be found at
https://ffmpeg.org/ffmpeg-filters.html
Note that complex filters are not supported.
decoder (str or None, optional): The name of the decoder to be used.
When provided, use the specified decoder instead of the default one.
decoder_options (dict or None, optional): Options passed to decoder.
Mapping from str to str.
"""
i = self.default_video_stream if stream_index is None else stream_index
torch.ops.torchaudio.ffmpeg_streamer_add_video_stream(
self._s, i, frames_per_chunk, buffer_chunk_size, filter_desc
self._s,
i,
frames_per_chunk,
buffer_chunk_size,
filter_desc,
decoder,
decoder_options,
)

def remove_stream(self, i: int):
Expand Down

0 comments on commit 63275e5

Please sign in to comment.