meta-pytorch
diff --git a/‎src/torchcodec/_core/AVIOBytesContext.cpp‎
Lines changed: 68 additions & 1 deletion b/‎src/torchcodec/_core/AVIOBytesContext.cpp‎
Lines changed: 68 additions & 1 deletion
diff --git a/‎src/torchcodec/_core/AVIOBytesContext.h‎
Lines changed: 24 additions & 2 deletions b/‎src/torchcodec/_core/AVIOBytesContext.h‎
Lines changed: 24 additions & 2 deletions
diff --git a/‎src/torchcodec/_core/AVIOContextHolder.cpp‎
Lines changed: 8 additions & 3 deletions b/‎src/torchcodec/_core/AVIOContextHolder.cpp‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎src/torchcodec/_core/AVIOContextHolder.h‎
Lines changed: 4 additions & 7 deletions b/‎src/torchcodec/_core/AVIOContextHolder.h‎
Lines changed: 4 additions & 7 deletions
diff --git a/‎src/torchcodec/_core/AVIOFileLikeContext.cpp‎
Lines changed: 1 addition & 1 deletion b/‎src/torchcodec/_core/AVIOFileLikeContext.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/torchcodec/_core/CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion b/‎src/torchcodec/_core/CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/torchcodec/_core/Encoder.cpp‎
Lines changed: 57 additions & 17 deletions b/‎src/torchcodec/_core/Encoder.cpp‎
Lines changed: 57 additions & 17 deletions
diff --git a/‎src/torchcodec/_core/Encoder.h‎
Lines changed: 14 additions & 0 deletions b/‎src/torchcodec/_core/Encoder.h‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎src/torchcodec/_core/FFMPEGCommon.cpp‎
Lines changed: 23 additions & 0 deletions b/‎src/torchcodec/_core/FFMPEGCommon.cpp‎
Lines changed: 23 additions & 0 deletions
@@ -13,7 +13,7 @@ AVIOBytesContext::AVIOBytesContext(const void* data, int64_t dataSize)
     : dataContext_{static_cast<const uint8_t*>(data), dataSize, 0} {
   TORCH_CHECK(data != nullptr, "Video data buffer cannot be nullptr!");
   TORCH_CHECK(dataSize > 0, "Video data size must be positive");
-  createAVIOContext(&read, &seek, &dataContext_);
+  createAVIOContext(&read, nullptr, &seek, &dataContext_);
 }
 
 // The signature of this function is defined by FFMPEG.
@@ -67,4 +67,71 @@ int64_t AVIOBytesContext::seek(void* opaque, int64_t offset, int whence) {
   return ret;
 }
 
+// Allocates an uninitialized uint8 output tensor of INITIAL_TENSOR_SIZE
+// elements, sets the write position to 0, and registers the write and seek
+// callbacks (no read callback), with dataContext_ as their opaque state.
+AVIOToTensorContext::AVIOToTensorContext()
+    : dataContext_{
+          torch::empty(
+              {AVIOToTensorContext::INITIAL_TENSOR_SIZE},
+              {torch::kUInt8}),
+          0} {
+  createAVIOContext(nullptr, &write, &seek, &dataContext_);
+}
+
+// The signature of this function is defined by FFMPEG.
+// Write callback: copies the `buf_size` bytes of `buf` into the output tensor
+// at the current write position, then advances that position. When the bytes
+// would not fit, the output tensor is doubled as many times as needed.
+// Returns `buf_size` on success. Raises (through TORCH_CHECK) if `buf_size` or
+// the write position is invalid, or if growing the tensor would exceed
+// MAX_TENSOR_SIZE.
+int AVIOToTensorContext::write(void* opaque, const uint8_t* buf, int buf_size) {
+  auto dataContext = static_cast<DataContext*>(opaque);
+
+  // A negative size would wrap around to a huge size_t in memcpy() below.
+  TORCH_CHECK(buf_size >= 0, "buf_size must be non-negative, got ", buf_size);
+  // seek() sets `current` from a caller-provided offset, so make sure we never
+  // write before the start of the tensor. The upper bound also keeps the
+  // addition below from overflowing.
+  TORCH_CHECK(
+      dataContext->current >= 0 &&
+          dataContext->current <= AVIOToTensorContext::MAX_TENSOR_SIZE,
+      "Invalid write position in the output tensor: ",
+      dataContext->current);
+
+  const int64_t bufSize = static_cast<int64_t>(buf_size);
+  const int64_t requiredSize = dataContext->current + bufSize;
+
+  // Doubling once is not always enough (e.g. after a seek past the current
+  // end of the tensor), so keep doubling until the write fits. The tensor is
+  // created with INITIAL_TENSOR_SIZE elements, so each doubling makes
+  // progress, and the check below bounds the number of iterations.
+  while (requiredSize > dataContext->outputTensor.numel()) {
+    TORCH_CHECK(
+        dataContext->outputTensor.numel() * 2 <=
+            AVIOToTensorContext::MAX_TENSOR_SIZE,
+        "We tried to allocate an output encoded tensor larger than ",
+        AVIOToTensorContext::MAX_TENSOR_SIZE,
+        " bytes. If you think this should be supported, please report.");
+
+    // We double the size of the output tensor. Calling cat() may not be the
+    // most efficient, but it's simple. The first half of the result holds the
+    // bytes written so far; the second half is scratch space that later
+    // writes overwrite.
+    dataContext->outputTensor =
+        torch::cat({dataContext->outputTensor, dataContext->outputTensor});
+  }
+
+  uint8_t* outputTensorData = dataContext->outputTensor.data_ptr<uint8_t>();
+  std::memcpy(outputTensorData + dataContext->current, buf, bufSize);
+  dataContext->current += bufSize;
+  return buf_size;
+}
+
+// The signature of this function is defined by FFMPEG.
+// Note: This `seek()` implementation is very similar to that of
+// AVIOBytesContext. We could consider merging both classes, or do some kind of
+// refac, but this doesn't seem worth it ATM.
+//
+// Seek callback. Supported values of `whence`:
+//   - AVSEEK_SIZE: returns the number of elements of the output tensor.
+//   - SEEK_SET: moves the write position to `offset` and returns `offset`.
+// Any other `whence`, or an `offset` outside of [0, MAX_TENSOR_SIZE], returns
+// -1 and leaves the write position unchanged.
+int64_t AVIOToTensorContext::seek(void* opaque, int64_t offset, int whence) {
+  auto dataContext = static_cast<DataContext*>(opaque);
+  int64_t ret = -1;
+
+  switch (whence) {
+    case AVSEEK_SIZE:
+      // NOTE(review): numel() is the allocated size of the tensor, which can
+      // be larger than the number of bytes written so far. Confirm this is
+      // what FFmpeg should see as the stream size.
+      ret = dataContext->outputTensor.numel();
+      break;
+    case SEEK_SET:
+      // write() uses `current` as a byte offset into the output tensor, so a
+      // negative offset would make it write before the start of the buffer,
+      // and an offset above MAX_TENSOR_SIZE can never be written to.
+      if (offset >= 0 && offset <= AVIOToTensorContext::MAX_TENSOR_SIZE) {
+        dataContext->current = offset;
+        ret = offset;
+      }
+      break;
+    default:
+      break;
+  }
+
+  return ret;
+}
+
+// Returns the output tensor narrowed along dim 0 to its first `current`
+// elements, where `current` is the write position.
+// NOTE(review): seek() can move `current` backwards, so if the last operation
+// before this call was a backwards seek, the result may be shorter than what
+// was written. Confirm whether the supported formats can end that way.
+torch::Tensor AVIOToTensorContext::getOutputTensor() {
+  const int64_t numBytes = dataContext_.current;
+  return dataContext_.outputTensor.narrow(
+      /*dim=*/0, /*start=*/0, /*length=*/numBytes);
+}
+
 } // namespace facebook::torchcodec
@@ -6,12 +6,13 @@
 
 #pragma once
 
+#include <torch/types.h>
 #include "src/torchcodec/_core/AVIOContextHolder.h"
 
 namespace facebook::torchcodec {
 
-// Enables users to pass in the entire video as bytes. Our read and seek
-// functions then traverse the bytes in memory.
+// For Decoding: enables users to pass in the entire video or audio as bytes.
+// Our read and seek functions then traverse the bytes in memory.
 class AVIOBytesContext : public AVIOContextHolder {
  public:
   explicit AVIOBytesContext(const void* data, int64_t dataSize);
@@ -29,4 +30,25 @@ class AVIOBytesContext : public AVIOContextHolder {
   DataContext dataContext_;
 };
 
+// For Encoding: used to encode into an output uint8 (bytes) tensor.
+// The write() and seek() callbacks below are registered with the AVIOContext
+// and operate on dataContext_, which holds the output tensor.
+class AVIOToTensorContext : public AVIOContextHolder {
+ public:
+  explicit AVIOToTensorContext();
+  // Returns the output tensor narrowed to the current write position.
+  torch::Tensor getOutputTensor();
+
+ private:
+  // State passed to the callbacks through their `opaque` pointer.
+  struct DataContext {
+    // Buffer that receives the encoded bytes.
+    torch::Tensor outputTensor;
+    // Offset into outputTensor at which the next write() starts.
+    int64_t current;
+  };
+
+  // Number of elements of the output tensor at construction.
+  static constexpr int64_t INITIAL_TENSOR_SIZE = 10'000'000; // 10MB
+  // write() refuses to grow the output tensor beyond this size.
+  static constexpr int64_t MAX_TENSOR_SIZE = 320'000'000; // 320 MB
+  // Copies `buf_size` bytes from `buf` into the output tensor.
+  static int write(void* opaque, const uint8_t* buf, int buf_size);
+  // We need to expose seek() for some formats like mp3.
+  static int64_t seek(void* opaque, int64_t offset, int whence);
+
+  DataContext dataContext_;
+};
+
 } // namespace facebook::torchcodec
@@ -11,6 +11,7 @@ namespace facebook::torchcodec {
 
 void AVIOContextHolder::createAVIOContext(
     AVIOReadFunction read,
+    AVIOWriteFunction write,
     AVIOSeekFunction seek,
     void* heldData,
     int bufferSize) {
@@ -22,13 +23,17 @@ void AVIOContextHolder::createAVIOContext(
       buffer != nullptr,
       "Failed to allocate buffer of size " + std::to_string(bufferSize));
 
-  avioContext_.reset(avio_alloc_context(
+  TORCH_CHECK(
+      (seek != nullptr) && ((write != nullptr) ^ (read != nullptr)),
+      "seek method must be defined, and either write or read must be defined. "
+      "But not both!")
+  avioContext_.reset(avioAllocContext(
       buffer,
       bufferSize,
-      0,
+      /*write_flag=*/write != nullptr,
       heldData,
       read,
-      nullptr, // write function; not supported yet
+      write,
       seek));
 
   if (!avioContext_) {
 
@@ -19,9 +19,9 @@ namespace facebook::torchcodec {
 //      freed.
 //   2. It is a base class for AVIOContext specializations. When specializing a
 //      AVIOContext, we need to provide four things:
-//        1. A read callback function.
-//        2. A seek callback function.
-//        3. A write callback function. (Not supported yet; it's for encoding.)
+//        1. A read callback function, for decoding.
+//        2. A seek callback function, for decoding and encoding.
+//        3. A write callback function, for encoding.
 //        4. A pointer to some context object that has the same lifetime as the
 //           AVIOContext itself. This context object holds the custom state that
 //           tracks the custom behavior of reading, seeking and writing. It is
@@ -44,13 +44,10 @@ class AVIOContextHolder {
   // enforced by having a pure virtual methods, but we don't have any.)
   AVIOContextHolder() = default;
 
-  // These signatures are defined by FFmpeg.
-  using AVIOReadFunction = int (*)(void*, uint8_t*, int);
-  using AVIOSeekFunction = int64_t (*)(void*, int64_t, int);
-
   // Deriving classes should call this function in their constructor.
   void createAVIOContext(
       AVIOReadFunction read,
+      AVIOWriteFunction write,
       AVIOSeekFunction seek,
       void* heldData,
       int bufferSize = defaultBufferSize);
 
@@ -23,7 +23,7 @@ AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)
         py::hasattr(fileLike, "seek"),
         "File like object must implement a seek method.");
   }
-  createAVIOContext(&read, &seek, &fileLike_);
+  createAVIOContext(&read, nullptr, &seek, &fileLike_);
 }
 
 int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {
 
@@ -65,8 +65,9 @@ function(make_torchcodec_libraries
     set(decoder_library_name "libtorchcodec_decoder${ffmpeg_major_version}")
     set(decoder_sources
         AVIOContextHolder.cpp
+        AVIOBytesContext.cpp
         FFMPEGCommon.cpp
-	DeviceInterface.cpp
+        DeviceInterface.cpp
         SingleStreamDecoder.cpp
         # TODO: lib name should probably not be "*_decoder*" now that it also
         # contains an encoder
 
@@ -1,12 +1,24 @@
 #include <sstream>
 
+#include "src/torchcodec/_core/AVIOBytesContext.h"
 #include "src/torchcodec/_core/Encoder.h"
 #include "torch/types.h"
 
 namespace facebook::torchcodec {
 
 namespace {
 
+// Checks that the waveform is a 2-dimensional float32 tensor and returns it
+// unchanged, so that it can be used directly in a constructor's initializer
+// list. Raises (through TORCH_CHECK) otherwise.
+torch::Tensor validateWf(torch::Tensor wf) {
+  const auto wfDtype = wf.dtype();
+  TORCH_CHECK(
+      wfDtype == torch::kFloat32,
+      "waveform must have float32 dtype, got ",
+      wfDtype);
+
+  // TODO-ENCODING check contiguity of the input wf to ensure that it is indeed
+  // planar (fltp).
+  const auto numDims = wf.dim();
+  TORCH_CHECK(numDims == 2, "waveform must have 2 dimensions, got ", numDims);
+
+  return wf;
+}
+
 void validateSampleRate(const AVCodec& avCodec, int sampleRate) {
   if (avCodec.supported_samplerates == nullptr) {
     return;
@@ -80,38 +92,55 @@ AudioEncoder::AudioEncoder(
     int sampleRate,
     std::string_view fileName,
     std::optional<int64_t> bitRate)
-    : wf_(wf) {
-  TORCH_CHECK(
-      wf_.dtype() == torch::kFloat32,
-      "waveform must have float32 dtype, got ",
-      wf_.dtype());
-  // TODO-ENCODING check contiguity of the input wf to ensure that it is indeed
-  // planar (fltp).
-  TORCH_CHECK(
-      wf_.dim() == 2, "waveform must have 2 dimensions, got ", wf_.dim());
-
+    : wf_(validateWf(wf)) {
   setFFmpegLogLevel();
   AVFormatContext* avFormatContext = nullptr;
-  auto status = avformat_alloc_output_context2(
+  int status = avformat_alloc_output_context2(
       &avFormatContext, nullptr, nullptr, fileName.data());
+
   TORCH_CHECK(
       avFormatContext != nullptr,
       "Couldn't allocate AVFormatContext. ",
       "Check the desired extension? ",
       getFFMPEGErrorStringFromErrorCode(status));
   avFormatContext_.reset(avFormatContext);
 
-  // TODO-ENCODING: Should also support encoding into bytes (use
-  // AVIOBytesContext)
-  TORCH_CHECK(
-      !(avFormatContext->oformat->flags & AVFMT_NOFILE),
-      "AVFMT_NOFILE is set. We only support writing to a file.");
   status = avio_open(&avFormatContext_->pb, fileName.data(), AVIO_FLAG_WRITE);
   TORCH_CHECK(
       status >= 0,
       "avio_open failed: ",
       getFFMPEGErrorStringFromErrorCode(status));
 
+  initializeEncoder(sampleRate, bitRate);
+}
+
+// Builds an encoder whose output goes through the given AVIOToTensorContext
+// instead of a file.
+//   - wf: waveform to encode; validated by validateWf().
+//   - sampleRate, bitRate: forwarded to initializeEncoder().
+//   - formatName: passed to FFmpeg as the name of the output format.
+//   - avioContextHolder: must not be null; ownership is taken by the encoder.
+// Raises (through TORCH_CHECK) if avioContextHolder is null or if FFmpeg
+// cannot allocate an output context for `formatName`.
+AudioEncoder::AudioEncoder(
+    const torch::Tensor wf,
+    int sampleRate,
+    std::string_view formatName,
+    std::unique_ptr<AVIOToTensorContext> avioContextHolder,
+    std::optional<int64_t> bitRate)
+    : wf_(validateWf(wf)), avioContextHolder_(std::move(avioContextHolder)) {
+  // Fail early with a clear message rather than dereferencing a null holder
+  // below.
+  TORCH_CHECK(
+      avioContextHolder_ != nullptr, "avioContextHolder cannot be nullptr.");
+
+  setFFmpegLogLevel();
+
+  // A string_view is not guaranteed to be null-terminated, but FFmpeg expects
+  // a C string, so go through an owning std::string.
+  const std::string formatNameStr(formatName);
+
+  AVFormatContext* avFormatContext = nullptr;
+  int status = avformat_alloc_output_context2(
+      &avFormatContext, nullptr, formatNameStr.c_str(), nullptr);
+
+  TORCH_CHECK(
+      avFormatContext != nullptr,
+      "Couldn't allocate AVFormatContext. ",
+      "Check the desired extension? ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  avFormatContext_.reset(avFormatContext);
+
+  // Route FFmpeg's output through our custom AVIOContext, which remains owned
+  // by avioContextHolder_.
+  avFormatContext_->pb = avioContextHolder_->getAVIOContext();
+
+  initializeEncoder(sampleRate, bitRate);
+}
+
+void AudioEncoder::initializeEncoder(
+    int sampleRate,
+    std::optional<int64_t> bitRate) {
   // We use the AVFormatContext's default codec for that
   // specific format/container.
   const AVCodec* avCodec =
@@ -150,7 +179,7 @@ AudioEncoder::AudioEncoder(
 
   setDefaultChannelLayout(avCodecContext_, numChannels);
 
-  status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
+  int status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
   TORCH_CHECK(
       status == AVSUCCESS,
       "avcodec_open2 failed: ",
@@ -170,7 +199,18 @@ AudioEncoder::AudioEncoder(
   streamIndex_ = avStream->index;
 }
 
+// Runs encode() and returns the tensor produced by the AVIO context holder.
+// Raises (through TORCH_CHECK) if this encoder has no AVIO context holder.
+torch::Tensor AudioEncoder::encodeToTensor() {
+  const bool hasAvioContext = (avioContextHolder_ != nullptr);
+  TORCH_CHECK(
+      hasAvioContext, "Cannot encode to tensor, avio context doesn't exist.");
+
+  encode();
+  return avioContextHolder_->getOutputTensor();
+}
+
 void AudioEncoder::encode() {
+  // TODO-ENCODING: Need to check, but consecutive calls to encode() are
+  // probably invalid. We can address this once we (re)design the public and
+  // private encoding APIs.
   UniqueAVFrame avFrame(av_frame_alloc());
   TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
   //  Default to 256 like in torchaudio
 
@@ -1,5 +1,6 @@
 #pragma once
 #include <torch/types.h>
+#include "src/torchcodec/_core/AVIOBytesContext.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 
 namespace facebook::torchcodec {
@@ -21,9 +22,19 @@ class AudioEncoder {
       int sampleRate,
       std::string_view fileName,
       std::optional<int64_t> bitRate = std::nullopt);
+  AudioEncoder(
+      const torch::Tensor wf,
+      int sampleRate,
+      std::string_view formatName,
+      std::unique_ptr<AVIOToTensorContext> avioContextHolder,
+      std::optional<int64_t> bitRate = std::nullopt);
   void encode();
+  torch::Tensor encodeToTensor();
 
  private:
+  void initializeEncoder(
+      int sampleRate,
+      std::optional<int64_t> bitRate = std::nullopt);
   void encodeInnerLoop(
       AutoAVPacket& autoAVPacket,
       const UniqueAVFrame& srcAVFrame);
@@ -35,5 +46,8 @@ class AudioEncoder {
   UniqueSwrContext swrContext_;
 
   const torch::Tensor wf_;
+
+  // Stores the AVIOContext for the output tensor buffer.
+  std::unique_ptr<AVIOToTensorContext> avioContextHolder_;
 };
 } // namespace facebook::torchcodec
@@ -261,4 +261,27 @@ void setFFmpegLogLevel() {
   av_log_set_level(logLevel);
 }
 
+// Wrapper around avio_alloc_context() that accepts a write callback taking a
+// `const` buffer, and casts it to the older callback type when building
+// against FFmpeg versions that expect a non-const buffer. All other arguments
+// are forwarded unchanged.
+AVIOContext* avioAllocContext(
+    uint8_t* buffer,
+    int buffer_size,
+    int write_flag,
+    void* opaque,
+    AVIOReadFunction read_packet,
+    AVIOWriteFunction write_packet,
+    AVIOSeekFunction seek) {
+// The buf parameter of the write function is not const before FFmpeg 7.
+#if LIBAVFILTER_VERSION_MAJOR >= 10 // FFmpeg >= 7
+  auto writeCallback = write_packet;
+#else
+  auto writeCallback = reinterpret_cast<AVIOWriteFunctionOld>(write_packet);
+#endif
+
+  return avio_alloc_context(
+      buffer, buffer_size, write_flag, opaque, read_packet, writeCallback, seek);
+}
+
 } // namespace facebook::torchcodec
Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@ AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)`
`23`	`23`	`py::hasattr(fileLike, "seek"),`
`24`	`24`	`"File like object must implement a seek method.");`
`25`	`25`	`}`
`26`		`- createAVIOContext(&read, &seek, &fileLike_);`
	`26`	`+ createAVIOContext(&read, nullptr, &seek, &fileLike_);`
`27`	`27`	`}`
`28`	`28`
`29`	`29`	`int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {`