Skip to content

Commit

Permalink
avfilter/vf_overlay_videotoolbox: add fast code path for bgra overlay
Browse files Browse the repository at this point in the history
The previous implementation needed to convert both the main and overlay
frames to BGRA textures and then convert the result back to YUV.
These conversions are bandwidth-heavy.

Add a faster shader for the case where the overlay is in BGRA format, which
calculates the YUV values in the shader. This eliminates the need to
convert the main frame and does not require an extra copy of the
overlay frame, leading to a more than 100% performance improvement when
overlaying 10-bit 1080p HEVC inputs on M1 Max (190fps -> 407fps).

The RGB-to-YUV formula is currently hard-coded to a premultiplied
BT.709 matrix.
  • Loading branch information
gnattu committed Jul 4, 2024
1 parent 4edf5f8 commit 87e34ff
Showing 1 changed file with 224 additions and 50 deletions.
274 changes: 224 additions & 50 deletions debian/patches/0062-avfilter-add-vf_overlay_videotoolbox.patch
Original file line number Diff line number Diff line change
Expand Up @@ -120,26 +120,26 @@ Signed-off-by: Gnattu OC <gnattuoc@me.com>
create mode 100644 libavfilter/metal/vf_overlay_videotoolbox.metal
create mode 100644 libavfilter/vf_overlay_videotoolbox.m

diff --git a/configure b/configure
index 23066efa32..a7c349d126 100755
--- a/configure
+++ b/configure
@@ -3720,6 +3720,7 @@ overlay_qsv_filter_select="qsvvpp"
Index: FFmpeg/configure
===================================================================
--- FFmpeg.orig/configure
+++ FFmpeg/configure
@@ -3722,6 +3722,7 @@ overlay_qsv_filter_select="qsvvpp"
overlay_vaapi_filter_deps="vaapi VAProcPipelineCaps_blend_flags"
overlay_vulkan_filter_deps="vulkan spirv_compiler"
overlay_rkrga_filter_deps="rkrga"
+overlay_videotoolbox_filter_deps="metal corevideo coreimage videotoolbox"
owdenoise_filter_deps="gpl"
pad_opencl_filter_deps="opencl"
pan_filter_deps="swresample"
diff --git a/doc/filters.texi b/doc/filters.texi
index e0436a5755..bfb77562cb 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -19033,6 +19033,58 @@ See @ref{framesync}.

Index: FFmpeg/doc/filters.texi
===================================================================
--- FFmpeg.orig/doc/filters.texi
+++ FFmpeg/doc/filters.texi
@@ -18351,6 +18351,58 @@ See @ref{framesync}.
This filter also supports the @ref{framesync} options.

+@section overlay_videotoolbox
+
+Overlay one video on top of another.
Expand Down Expand Up @@ -193,13 +193,13 @@ index e0436a5755..bfb77562cb 100644
+@end itemize
+
@section owdenoise

Apply Overcomplete Wavelet denoiser.
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index f6c1d641d6..ea1389ab57 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -401,6 +401,9 @@ OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER) += vf_overlay_opencl.o opencl.o \
Index: FFmpeg/libavfilter/Makefile
===================================================================
--- FFmpeg.orig/libavfilter/Makefile
+++ FFmpeg/libavfilter/Makefile
@@ -403,6 +403,9 @@ OBJS-$(CONFIG_OVERLAY_OPENCL_FILTER)
OBJS-$(CONFIG_OVERLAY_QSV_FILTER) += vf_overlay_qsv.o framesync.o
OBJS-$(CONFIG_OVERLAY_VAAPI_FILTER) += vf_overlay_vaapi.o framesync.o vaapi_vpp.o
OBJS-$(CONFIG_OVERLAY_VULKAN_FILTER) += vf_overlay_vulkan.o vulkan.o vulkan_filter.o
Expand All @@ -209,43 +209,43 @@ index f6c1d641d6..ea1389ab57 100644
OBJS-$(CONFIG_OVERLAY_RKRGA_FILTER) += vf_overlay_rkrga.o framesync.o
OBJS-$(CONFIG_OWDENOISE_FILTER) += vf_owdenoise.o
OBJS-$(CONFIG_PAD_FILTER) += vf_pad.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index 149bf50997..ec9d975ecb 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -378,6 +378,7 @@ extern const AVFilter ff_vf_overlay_vaapi;
Index: FFmpeg/libavfilter/allfilters.c
===================================================================
--- FFmpeg.orig/libavfilter/allfilters.c
+++ FFmpeg/libavfilter/allfilters.c
@@ -380,6 +380,7 @@ extern const AVFilter ff_vf_overlay_vaap
extern const AVFilter ff_vf_overlay_vulkan;
extern const AVFilter ff_vf_overlay_cuda;
extern const AVFilter ff_vf_overlay_rkrga;
+extern const AVFilter ff_vf_overlay_videotoolbox;
extern const AVFilter ff_vf_owdenoise;
extern const AVFilter ff_vf_pad;
extern const AVFilter ff_vf_pad_opencl;
diff --git a/libavfilter/metal/utils.h b/libavfilter/metal/utils.h
index 7350d42a35..d79c79751c 100644
--- a/libavfilter/metal/utils.h
+++ b/libavfilter/metal/utils.h
@@ -55,5 +55,4 @@ CVMetalTextureRef ff_metal_texture_from_pixbuf(void *avclass,
Index: FFmpeg/libavfilter/metal/utils.h
===================================================================
--- FFmpeg.orig/libavfilter/metal/utils.h
+++ FFmpeg/libavfilter/metal/utils.h
@@ -55,5 +55,4 @@ CVMetalTextureRef ff_metal_texture_from_
int plane,
MTLPixelFormat format)
API_AVAILABLE(macos(10.11), ios(8.0));
-
#endif /* AVFILTER_METAL_UTILS_H */
diff --git a/libavfilter/metal/utils.m b/libavfilter/metal/utils.m
index f365d3ceea..db5c5f6f10 100644
--- a/libavfilter/metal/utils.m
+++ b/libavfilter/metal/utils.m
@@ -55,6 +55,9 @@ CVMetalTextureRef ff_metal_texture_from_pixbuf(void *ctx,
Index: FFmpeg/libavfilter/metal/utils.m
===================================================================
--- FFmpeg.orig/libavfilter/metal/utils.m
+++ FFmpeg/libavfilter/metal/utils.m
@@ -55,6 +55,9 @@ CVMetalTextureRef ff_metal_texture_from_
{
CVMetalTextureRef tex = NULL;
CVReturn ret;
+ bool is_planer = CVPixelBufferIsPlanar(pixbuf);
+ size_t width = is_planer ? CVPixelBufferGetWidthOfPlane(pixbuf, plane) : CVPixelBufferGetWidth(pixbuf);
+ size_t height = is_planer ? CVPixelBufferGetHeightOfPlane(pixbuf, plane) : CVPixelBufferGetHeight(pixbuf);

ret = CVMetalTextureCacheCreateTextureFromImage(
NULL,
@@ -62,8 +65,8 @@ CVMetalTextureRef ff_metal_texture_from_pixbuf(void *ctx,
@@ -62,8 +65,8 @@ CVMetalTextureRef ff_metal_texture_from_
pixbuf,
NULL,
format,
Expand All @@ -256,12 +256,11 @@ index f365d3ceea..db5c5f6f10 100644
plane,
&tex
);
diff --git a/libavfilter/metal/vf_overlay_videotoolbox.metal b/libavfilter/metal/vf_overlay_videotoolbox.metal
new file mode 100644
index 0000000000..936e57e03e
Index: FFmpeg/libavfilter/metal/vf_overlay_videotoolbox.metal
===================================================================
--- /dev/null
+++ b/libavfilter/metal/vf_overlay_videotoolbox.metal
@@ -0,0 +1,58 @@
+++ FFmpeg/libavfilter/metal/vf_overlay_videotoolbox.metal
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2024 Gnattu OC <gnattuoc@me.com>
+ *
Expand Down Expand Up @@ -320,16 +319,52 @@ index 0000000000..936e57e03e
+ dest.write(result_color, gid);
+ }
+}
Index: libavfilter/vf_overlay_videotoolbox.m
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
+
+/*
+ * Blend shader for a separated YUV main input (one Y texture plus one
+ * two-channel UV texture) and a BGRA overlay ("mask").
+ *
+ * Each thread handles one luma pixel at `gid`. Chroma is addressed at
+ * `gid >> 1`, so one UV sample is shared by a 2x2 block of luma pixels.
+ * To keep the chroma output deterministic, only the thread sitting on the
+ * even/even luma position of that block writes the UV sample; the other
+ * threads of the block write luma only.
+ *
+ * Pixels outside the overlay rectangle (placed at params.x_position /
+ * params.y_position, sized like the mask texture) are copied through
+ * unchanged.
+ */
+kernel void blend_shader_bgra_overlay(
+    texture2d<float, access::read>  source_y  [[ texture(0) ]],
+    texture2d<float, access::read>  source_uv [[ texture(1) ]],
+    texture2d<float, access::read>  mask      [[ texture(2) ]],
+    texture2d<float, access::write> dest_y    [[ texture(3) ]],
+    texture2d<float, access::write> dest_uv   [[ texture(4) ]],
+    constant mtlBlendParams&        params    [[ buffer(5) ]],
+    uint2                           gid       [[ thread_position_in_grid ]])
+{
+    const auto mask_size = uint2(mask.get_width(),
+                                 mask.get_height());
+    const auto loc_overlay = uint2(params.x_position, params.y_position);
+    const auto loc_uv = gid >> 1;
+
+    // Up to four threads share loc_uv. Letting all of them write would race
+    // (each would blend with its own mask pixel), so exactly one thread per
+    // 2x2 block owns the chroma write.
+    const bool owns_chroma = ((gid.x | gid.y) & 1u) == 0u;
+
+    const bool outside_overlay = gid.x < loc_overlay.x ||
+                                 gid.y < loc_overlay.y ||
+                                 gid.x >= mask_size.x + loc_overlay.x ||
+                                 gid.y >= mask_size.y + loc_overlay.y;
+
+    // Both branches need the main-frame samples, so read them once.
+    const float4 source_color_y = source_y.read(gid);
+    const float4 source_color_uv = source_uv.read(loc_uv);
+
+    if (outside_overlay) {
+        // Pass-through copy of the main frame.
+        dest_y.write(source_color_y, gid);
+        if (owns_chroma)
+            dest_uv.write(source_color_uv, loc_uv);
+    } else {
+        const float4 mask_color = mask.read(gid - loc_overlay);
+
+        // Hard-coded RGB -> YUV conversion (BT.709 coefficients with a
+        // 0.0625 luma offset and a 0.5 chroma offset).
+        const float y_overlay =  0.183 * mask_color.r + 0.614 * mask_color.g + 0.062 * mask_color.b + 0.0625f;
+        const float u_overlay = -0.101 * mask_color.r - 0.339 * mask_color.g + 0.439 * mask_color.b + 0.5f;
+        const float v_overlay =  0.439 * mask_color.r - 0.399 * mask_color.g - 0.040 * mask_color.b + 0.5f;
+
+        const float alpha_color = mask_color.a;
+        const float3 main_color = float3(source_color_y.x, source_color_uv.x, source_color_uv.y);
+        const float3 overlay_color = float3(y_overlay, u_overlay, v_overlay);
+
+        // Linear blend of main and overlay weighted by the overlay alpha.
+        const float3 result_color = main_color * (1.0f - alpha_color) + (overlay_color * alpha_color);
+
+        dest_y.write(float4(result_color.x, 0.0f, 0.0f, 1.0f), gid);
+        if (owns_chroma)
+            dest_uv.write(float4(result_color.y, result_color.z, 0.0f, 1.0f), loc_uv);
+    }
+}
Index: FFmpeg/libavfilter/vf_overlay_videotoolbox.m
===================================================================
diff --git a/libavfilter/vf_overlay_videotoolbox.m b/libavfilter/vf_overlay_videotoolbox.m
new file mode 100644
--- /dev/null (revision 913e5ef1730481306c9607c554aea3043ea0ecd4)
+++ b/libavfilter/vf_overlay_videotoolbox.m (revision 913e5ef1730481306c9607c554aea3043ea0ecd4)
@@ -0,0 +1,609 @@
--- /dev/null
+++ FFmpeg/libavfilter/vf_overlay_videotoolbox.m
@@ -0,0 +1,749 @@
+/*
+ * Copyright (C) 2024 Gnattu OC <gnattuoc@me.com>
+ *
Expand Down Expand Up @@ -463,6 +498,43 @@ new file mode 100644
+ ff_objc_release(&buffer);
+}
+
+static void call_kernel_bgra_overlay(AVFilterContext *avctx,
+                                     id<MTLTexture> dst_y,
+                                     id<MTLTexture> dst_uv,
+                                     id<MTLTexture> main_y,
+                                     id<MTLTexture> main_uv,
+                                     id<MTLTexture> overlay,
+                                     uint x_position,
+                                     uint y_position) API_AVAILABLE(macos(10.11), ios(9.0))
+{
+    /*
+     * Encodes one compute pass of the filter's current Metal pipeline that
+     * blends `overlay` onto the (main_y, main_uv) planes at
+     * (x_position, y_position), writes the result into (dst_y, dst_uv),
+     * and blocks until the GPU has finished.
+     */
+    OverlayVideoToolboxContext *ctx = avctx->priv;
+
+    // The command buffer and the encoder come back auto-released by default;
+    // CFBridgingRetain gives us an explicit, C-style reference that is
+    // dropped again with ff_objc_release() at the end of this function.
+    id<MTLCommandBuffer> cmd_buffer = CFBridgingRetain(ctx->mtl_queue.commandBuffer);
+    id<MTLComputeCommandEncoder> compute_encoder = CFBridgingRetain(cmd_buffer.computeCommandEncoder);
+
+    // Publish the overlay position to the shader through the shared
+    // parameter buffer.
+    MtlBlendParams *blend_params = (MtlBlendParams *)ctx->mtl_params_buffer.contents;
+    *blend_params = (MtlBlendParams) {
+        .x_position = x_position,
+        .y_position = y_position,
+    };
+
+    // Inputs: texture slots 0..2.
+    [compute_encoder setTexture: main_y  atIndex: 0];
+    [compute_encoder setTexture: main_uv atIndex: 1];
+    [compute_encoder setTexture: overlay atIndex: 2];
+    // Outputs: texture slots 3..4.
+    [compute_encoder setTexture: dst_y   atIndex: 3];
+    [compute_encoder setTexture: dst_uv  atIndex: 4];
+    // Blend parameters: buffer slot 5.
+    [compute_encoder setBuffer: ctx->mtl_params_buffer offset: 0 atIndex: 5];
+
+    // The dispatch is sized from the destination luma texture.
+    ff_metal_compute_encoder_dispatch(ctx->mtl_device, ctx->mtl_pipeline, compute_encoder, dst_y.width, dst_y.height);
+    [compute_encoder endEncoding];
+
+    // Submit and wait synchronously for the result.
+    [cmd_buffer commit];
+    [cmd_buffer waitUntilCompleted];
+
+    ff_objc_release(&compute_encoder);
+    ff_objc_release(&cmd_buffer);
+}
+
+// Copies and/or converts one pixel buffer to another.
+// This transparently handles pixel format and color spaces, and will do a conversion if needed.
+static int transfer_pixel_buffer(OverlayVideoToolboxContext *ctx, CVPixelBufferRef source, CVPixelBufferRef destination)
Expand Down Expand Up @@ -623,6 +695,87 @@ new file mode 100644
+ return ff_filter_frame(outlink, output);
+}
+
+/**
+ * Framesync event callback for the BGRA-overlay fast path.
+ *
+ * Fetches the current main and overlay frames, allocates an output frame and
+ * blends the overlay onto the main frame's Y/UV planes with
+ * call_kernel_bgra_overlay(). When no overlay frame is available the main
+ * frame is transferred to the output unchanged.
+ *
+ * @param fs framesync context; fs->parent is the filter context
+ * @return the result of ff_filter_frame() on success, a negative AVERROR
+ *         code on failure. On failure the output frame and any Metal
+ *         textures created so far are released.
+ */
+static int overlay_vt_blend_bgra_overlay(FFFrameSync *fs) API_AVAILABLE(macos(10.11), ios(9.0))
+{
+    AVFilterContext *avctx = fs->parent;
+    OverlayVideoToolboxContext *ctx = avctx->priv;
+    AVFilterLink *outlink = avctx->outputs[0];
+    AVFilterLink *inlink_main = avctx->inputs[0];
+    AVFrame *input_main, *input_overlay;
+    AVFrame *output = NULL;
+    AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink_main->hw_frames_ctx->data;
+    const AVPixFmtDescriptor *in_main_desc;
+
+    // Initialised to NULL so the failure path can tell which ones exist.
+    CVMetalTextureRef main_y = NULL, main_uv = NULL, dst_y = NULL, dst_uv = NULL, overlay = NULL;
+    id<MTLTexture> tex_main_y, tex_main_uv, tex_overlay, tex_dst_y, tex_dst_uv;
+
+    MTLPixelFormat mtl_format_y, mtl_format_uv;
+    int ret;
+
+    in_main_desc = av_pix_fmt_desc_get(frames_ctx->sw_format);
+    if (!in_main_desc)
+        return AVERROR_BUG;
+
+    // read main and overlay frames from inputs
+    ret = ff_framesync_get_frame(fs, 0, &input_main, 0);
+    if (ret < 0)
+        return ret;
+    ret = ff_framesync_get_frame(fs, 1, &input_overlay, 0);
+    if (ret < 0)
+        return ret;
+
+    if (!input_main)
+        return AVERROR_BUG;
+
+    output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+    if (!output)
+        return AVERROR(ENOMEM);
+
+    // From here on `output` is owned by this function until it is handed to
+    // ff_filter_frame(); every error exit must go through `fail`.
+    ret = av_frame_copy_props(output, input_main);
+    if (ret < 0)
+        goto fail;
+
+    if (!input_overlay) {
+        // No overlay available: pass the main frame through.
+        ret = transfer_pixel_buffer(ctx, (CVPixelBufferRef)input_main->data[3], (CVPixelBufferRef)output->data[3]);
+        if (ret < 0)
+            goto fail;
+        return ff_filter_frame(outlink, output);
+    }
+
+    // Pick 16-bit texture formats when the main format needs more than 8 bits per component.
+    mtl_format_y = (in_main_desc->comp[0].depth + in_main_desc->comp[0].shift) > 8 ? MTLPixelFormatR16Unorm : MTLPixelFormatR8Unorm;
+    mtl_format_uv = (in_main_desc->comp[0].depth + in_main_desc->comp[0].shift) > 8 ? MTLPixelFormatRG16Unorm : MTLPixelFormatRG8Unorm;
+    main_y = ff_metal_texture_from_pixbuf(avctx, ctx->texture_cache, (CVPixelBufferRef)input_main->data[3], 0, mtl_format_y);
+    main_uv = ff_metal_texture_from_pixbuf(avctx, ctx->texture_cache, (CVPixelBufferRef)input_main->data[3], 1, mtl_format_uv);
+    overlay = ff_metal_texture_from_pixbuf(avctx, ctx->texture_cache, (CVPixelBufferRef)input_overlay->data[3], 0, MTLPixelFormatBGRA8Unorm);
+    dst_y = ff_metal_texture_from_pixbuf(avctx, ctx->texture_cache, (CVPixelBufferRef)output->data[3], 0, mtl_format_y);
+    dst_uv = ff_metal_texture_from_pixbuf(avctx, ctx->texture_cache, (CVPixelBufferRef)output->data[3], 1, mtl_format_uv);
+
+    if (!overlay || !main_y || !main_uv || !dst_y || !dst_uv) {
+        ret = AVERROR(ENOSYS);
+        goto fail;
+    }
+
+    tex_main_y = CVMetalTextureGetTexture(main_y);
+    tex_main_uv = CVMetalTextureGetTexture(main_uv);
+    tex_overlay = CVMetalTextureGetTexture(overlay);
+    tex_dst_y = CVMetalTextureGetTexture(dst_y);
+    tex_dst_uv = CVMetalTextureGetTexture(dst_uv);
+
+    call_kernel_bgra_overlay(avctx,
+                             tex_dst_y, tex_dst_uv,
+                             tex_main_y, tex_main_uv,
+                             tex_overlay,
+                             ctx->x_position, ctx->y_position);
+    CFRelease(main_y);
+    CFRelease(main_uv);
+    CFRelease(overlay);
+    CFRelease(dst_y);
+    CFRelease(dst_uv);
+
+    return ff_filter_frame(outlink, output);
+
+fail:
+    // CFRelease() must not be called with NULL, so release only what was created.
+    if (main_y)
+        CFRelease(main_y);
+    if (main_uv)
+        CFRelease(main_uv);
+    if (overlay)
+        CFRelease(overlay);
+    if (dst_y)
+        CFRelease(dst_y);
+    if (dst_uv)
+        CFRelease(dst_uv);
+    av_frame_free(&output);
+    return ret;
+}
+
+static av_cold void do_uninit(AVFilterContext *avctx) API_AVAILABLE(macos(10.11), ios(9.0))
+{
+ OverlayVideoToolboxContext *ctx = avctx->priv;
Expand Down Expand Up @@ -824,6 +977,28 @@ new file mode 100644
+ return AVERROR(ENOSYS);
+ }
+
+ // Use fast code path for BGRA overlay
+ if (overlay_frames->sw_format == AV_PIX_FMT_BGRA) {
+ NSError *err = nil;
+ ff_objc_release(&ctx->mtl_pipeline);
+ ff_objc_release(&ctx->mtl_function);
+ ctx->mtl_function = [ctx->mtl_library newFunctionWithName: @"blend_shader_bgra_overlay"];
+ if (!ctx->mtl_function) {
+ av_log(avctx, AV_LOG_ERROR, "Failed to create Metal function!\n");
+ overlay_videotoolbox_uninit(avctx);
+ return AVERROR_EXTERNAL;
+ }
+ ctx->mtl_pipeline = [ctx->mtl_device
+ newComputePipelineStateWithFunction: ctx->mtl_function
+ error: &err];
+ if (err) {
+ av_log(ctx, AV_LOG_ERROR, "Failed to create Metal compute pipeline: %s\n", err.description.UTF8String);
+ overlay_videotoolbox_uninit(avctx);
+ return AVERROR_EXTERNAL;
+ }
+ ctx->fs.on_event = &overlay_vt_blend_bgra_overlay;
+ }
+
+ ctx->device_ref = av_buffer_ref(main_frames->device_ref);
+ if (!ctx->device_ref) {
+ av_log(ctx, AV_LOG_ERROR, "A device reference create failed.\n");
Expand Down Expand Up @@ -939,4 +1114,3 @@ new file mode 100644
+ FILTER_OUTPUTS(overlay_videotoolbox_outputs),
+ .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};

0 comments on commit 87e34ff

Please sign in to comment.