diff --git a/test/torchaudio_unittest/io/stream_reader_test.py b/test/torchaudio_unittest/io/stream_reader_test.py index 8acca7f7219..4e05ba056f6 100644 --- a/test/torchaudio_unittest/io/stream_reader_test.py +++ b/test/torchaudio_unittest/io/stream_reader_test.py @@ -96,6 +96,8 @@ def test_src_info(self): codec_long_name="H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10", format="yuv420p", bit_rate=71925, + num_frames=325, + bits_per_sample=8, width=320, height=180, frame_rate=25.0, @@ -106,6 +108,8 @@ def test_src_info(self): codec_long_name="AAC (Advanced Audio Coding)", format="fltp", bit_rate=72093, + num_frames=103, + bits_per_sample=0, sample_rate=8000.0, num_channels=2, ), @@ -115,6 +119,8 @@ def test_src_info(self): codec_long_name="MOV text", format=None, bit_rate=None, + num_frames=None, + bits_per_sample=None, ), StreamReaderSourceVideoStream( media_type="video", @@ -122,6 +128,8 @@ def test_src_info(self): codec_long_name="H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10", format="yuv420p", bit_rate=128783, + num_frames=390, + bits_per_sample=8, width=480, height=270, frame_rate=29.97002997002997, @@ -132,6 +140,8 @@ def test_src_info(self): codec_long_name="AAC (Advanced Audio Coding)", format="fltp", bit_rate=128837, + num_frames=205, + bits_per_sample=0, sample_rate=16000.0, num_channels=2, ), @@ -141,6 +151,8 @@ def test_src_info(self): codec_long_name="MOV text", format=None, bit_rate=None, + num_frames=None, + bits_per_sample=None, ), ] output = [s.get_src_stream_info(i) for i in range(6)] diff --git a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp index 35b6cf50a55..dd5080c0fe9 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp +++ b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.cpp @@ -11,6 +11,8 @@ SrcInfo convert(SrcStreamInfo ssi) { ssi.codec_long_name, ssi.fmt_name, ssi.bit_rate, + ssi.num_frames, + ssi.bits_per_sample, ssi.sample_rate, ssi.num_channels, ssi.width, diff --git a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h index 356bc41c22d..7134b701570 100644 --- a/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h +++ b/torchaudio/csrc/ffmpeg/stream_reader_wrapper.h @@ -11,6 +11,8 @@ using SrcInfo = std::tuple< std::string, // codec long name std::string, // format name int64_t, // bit_rate + int64_t, // num_frames + int64_t, // bits_per_sample // Audio double, // sample_rate int64_t, // num_channels diff --git a/torchaudio/csrc/ffmpeg/streamer.cpp b/torchaudio/csrc/ffmpeg/streamer.cpp index 18b6037a18b..691c09635bf 100644 --- a/torchaudio/csrc/ffmpeg/streamer.cpp +++ b/torchaudio/csrc/ffmpeg/streamer.cpp @@ -75,6 +75,8 @@ SrcStreamInfo Streamer::get_src_stream_info(int i) const { SrcStreamInfo ret; ret.media_type = codecpar->codec_type; ret.bit_rate = codecpar->bit_rate; + ret.num_frames = stream->nb_frames; + ret.bits_per_sample = codecpar->bits_per_raw_sample; const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id); if (desc) { ret.codec_name = desc->name; diff --git a/torchaudio/csrc/ffmpeg/typedefs.h b/torchaudio/csrc/ffmpeg/typedefs.h index 02f3a3a0b0b..a1706f4f8dc 100644 --- a/torchaudio/csrc/ffmpeg/typedefs.h +++ b/torchaudio/csrc/ffmpeg/typedefs.h @@ -12,6 +12,8 @@ struct SrcStreamInfo { const char* codec_long_name = "N/A"; const char* fmt_name = "N/A"; int bit_rate = 0; + int64_t num_frames = 0; + int bits_per_sample = 0; // Audio double sample_rate = 0; int num_channels = 0; diff --git a/torchaudio/io/_stream_reader.py b/torchaudio/io/_stream_reader.py index 3bbf2a40640..5dde9f6fb02 100644 --- a/torchaudio/io/_stream_reader.py +++ b/torchaudio/io/_stream_reader.py @@ -55,6 +55,12 @@ class StreamReaderSourceStream: This is an estimated values based on the initial few frames of the stream. For container formats and variable bit rate, it can be 0. """ + num_frames: Optional[int] + """The number of frames in the stream""" + bits_per_sample: Optional[int] + """This is the number of valid bits in each output sample. + For compressed format, it can be 0. + """ @dataclass @@ -100,41 +106,59 @@ class StreamReaderSourceVideoStream(StreamReaderSourceStream): _CODEC_LONG = 2 _FORMAT = 3 _BIT_RATE = 4 +_NUM_FRAMES = 5 +_BPS = 6 # - AUDIO -_SAMPLE_RATE = 5 -_NUM_CHANNELS = 6 +_SAMPLE_RATE = 7 +_NUM_CHANNELS = 8 # - VIDEO -_WIDTH = 7 -_HEIGHT = 8 -_FRAME_RATE = 9 +_WIDTH = 9 +_HEIGHT = 10 +_FRAME_RATE = 11 def _parse_si(i): media_type = i[_MEDIA_TYPE] codec_name = i[_CODEC] codec_long_name = i[_CODEC_LONG] + fmt = i[_FORMAT] + bit_rate = i[_BIT_RATE] + num_frames = i[_NUM_FRAMES] + bps = i[_BPS] if media_type == "audio": return StreamReaderSourceAudioStream( - media_type, - codec_name, - codec_long_name, - i[_FORMAT], - i[_BIT_RATE], - i[_SAMPLE_RATE], - i[_NUM_CHANNELS], + media_type=media_type, + codec=codec_name, + codec_long_name=codec_long_name, + format=fmt, + bit_rate=bit_rate, + num_frames=num_frames, + bits_per_sample=bps, + sample_rate=i[_SAMPLE_RATE], + num_channels=i[_NUM_CHANNELS], ) if media_type == "video": return StreamReaderSourceVideoStream( - media_type, - codec_name, - codec_long_name, - i[_FORMAT], - i[_BIT_RATE], - i[_WIDTH], - i[_HEIGHT], - i[_FRAME_RATE], + media_type=media_type, + codec=codec_name, + codec_long_name=codec_long_name, + format=fmt, + bit_rate=bit_rate, + num_frames=num_frames, + bits_per_sample=bps, + width=i[_WIDTH], + height=i[_HEIGHT], + frame_rate=i[_FRAME_RATE], ) - return StreamReaderSourceStream(media_type, codec_name, codec_long_name, None, None) + return StreamReaderSourceStream( + media_type=media_type, + codec=codec_name, + codec_long_name=codec_long_name, + format=None, + bit_rate=None, + num_frames=None, + bits_per_sample=None, + ) @dataclass