diff --git a/src/torchcodec/_core/CudaDeviceInterface.cpp b/src/torchcodec/_core/CudaDeviceInterface.cpp
index 9bfea4e52..ce4394926 100644
--- a/src/torchcodec/_core/CudaDeviceInterface.cpp
+++ b/src/torchcodec/_core/CudaDeviceInterface.cpp
@@ -20,6 +20,13 @@ static bool g_cuda =
       return new CudaDeviceInterface(device);
     });
 
+// BT.709 full range color conversion matrix for YUV to RGB conversion.
+// See Note [YUV -> RGB Color Conversion, color space and color range] below.
+constexpr Npp32f bt709FullRangeColorTwist[3][4] = {
+    {1.0f, 0.0f, 1.5748f, 0.0f},
+    {1.0f, -0.187324273f, -0.468124273f, -128.0f},
+    {1.0f, 1.8556f, 0.0f, -128.0f}};
+
 // We reuse cuda contexts across VideoDecoder instances. This is because
 // creating a cuda context is expensive. The cache mechanism is as follows:
 // 1. There is a cache of size MAX_CONTEXTS_PER_GPU_IN_CACHE cuda contexts for
@@ -312,21 +319,54 @@ void CudaDeviceInterface::convertAVFrameToFrameOutput(
       static_cast<int>(getFFMPEGCompatibleDeviceIndex(device_)));
 
   NppiSize oSizeROI = {width, height};
 
-  Npp8u* input[2] = {avFrame->data[0], avFrame->data[1]};
+  Npp8u* yuvData[2] = {avFrame->data[0], avFrame->data[1]};
 
   NppStatus status;
+
+  // For background, see
+  // Note [YUV -> RGB Color Conversion, color space and color range]
   if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
-    status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx(
-        input,
-        avFrame->linesize[0],
-        static_cast<Npp8u*>(dst.data_ptr()),
-        dst.stride(0),
-        oSizeROI,
-        nppCtx);
+    if (avFrame->color_range == AVColorRange::AVCOL_RANGE_JPEG) {
+      // NPP provides a pre-defined color conversion function for BT.709 full
+      // range: nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx. But it doesn't closely
+      // match the results we get on CPU, so we use a custom color conversion
+      // matrix instead, which is more accurate. See the note mentioned above
+      // for details, and headaches.
+
+      int srcStep[2] = {avFrame->linesize[0], avFrame->linesize[1]};
+
+      status = nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx(
+          yuvData,
+          srcStep,
+          static_cast<Npp8u*>(dst.data_ptr()),
+          dst.stride(0),
+          oSizeROI,
+          bt709FullRangeColorTwist,
+          nppCtx);
+    } else {
+      // If not full range, we assume studio limited range.
+      // The color conversion matrix for BT.709 limited range should be:
+      // static const Npp32f bt709LimitedRangeColorTwist[3][4] = {
+      //     {1.16438356f, 0.0f, 1.79274107f, -16.0f},
+      //     {1.16438356f, -0.213248614f, -0.5329093290f, -128.0f},
+      //     {1.16438356f, 2.11240179f, 0.0f, -128.0f}
+      // };
+      // We get very close results to CPU with that, but using the pre-defined
+      // nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx seems to be even more accurate.
+      status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx(
+          yuvData,
+          avFrame->linesize[0],
+          static_cast<Npp8u*>(dst.data_ptr()),
+          dst.stride(0),
+          oSizeROI,
+          nppCtx);
+    }
   } else {
+    // TODO: we're assuming BT.601 color space (and probably limited range) by
+    // calling nppiNV12ToRGB_8u_P2C3R_Ctx. We should handle BT.601 full range,
+    // and other color spaces like BT.2020.
     status = nppiNV12ToRGB_8u_P2C3R_Ctx(
-        input,
+        yuvData,
         avFrame->linesize[0],
         static_cast<Npp8u*>(dst.data_ptr()),
         dst.stride(0),
@@ -362,3 +402,123 @@ std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
 }
 
 } // namespace facebook::torchcodec
+
+/* clang-format off */
+// Note: [YUV -> RGB Color Conversion, color space and color range]
+//
+// The frames we get from the decoder (FFmpeg decoder, or NVCUVID) are in YUV
+// format. We need to convert them to RGB. This note attempts to describe this
+// process. There may be some inaccuracies and approximations that experts will
+// notice, but our goal is only to provide a good enough understanding of the
+// process for torchcodec developers to implement and maintain it.
+// On CPU, filtergraph and swscale handle everything for us. With CUDA, we have
+// to do a lot of the heavy lifting ourselves.
+//
+// Color space and color range
+// ---------------------------
+// Two main characteristics of a frame affect the conversion process:
+// 1. Color space: This basically defines which YUV values correspond to which
+//    physical wavelengths. No need to go into details here, the point is that
+//    videos can come in different color spaces, the most common ones being
+//    BT.601 and BT.709, but there are others.
+//    In FFmpeg this is represented with AVColorSpace:
+//    https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#aff71a069509a1ad3ff54d53a1c894c85
+// 2. Color range: This defines the range of YUV values. There is:
+//    - the full range, also called PC range: AVCOL_RANGE_JPEG
+//    - and the "limited" range, also called studio or TV range: AVCOL_RANGE_MPEG
+//    https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#a3da0bf691418bc22c4bcbe6583ad589a
+//
+// Color space and color range are independent concepts, so we can have a
+// BT.709 video with full range, and another one with limited range. Same for
+// BT.601.
+//
+// In this note we'll focus on the full color range, with only a brief sketch
+// of the limited range at the end.
+//
+// Color conversion matrix
+// -----------------------
+// YUV -> RGB conversion is defined as the reverse process of RGB -> YUV, so
+// this is where we'll start.
+// At the core of an RGB -> YUV conversion are the "luma coefficients", which
+// are specific to a given color space and defined by the color space standard.
+// In FFmpeg they can be found here:
+// https://github.com/FFmpeg/FFmpeg/blob/7d606ef0ccf2946a4a21ab1ec23486cadc21864b/libavutil/csp.c#L46-L56
+//
+// For example, the BT.709 coefficients are: kr=0.2126, kg=0.7152, kb=0.0722
+// Coefficients must sum to 1.
+//
+// Conventionally Y is in [0, 1] range, and U and V are in [-0.5, 0.5] range
+// (that's mathematically; in practice they are represented in integer range).
+// The conversion is defined as:
+// https://en.wikipedia.org/wiki/YCbCr#R'G'B'_to_Y%E2%80%B2PbPr
+// Y = kr*R + kg*G + kb*B
+// U = (B - Y) * 0.5 / (1 - kb) = (B - Y) / u_scale where u_scale = 2 * (1 - kb)
+// V = (R - Y) * 0.5 / (1 - kr) = (R - Y) / v_scale where v_scale = 2 * (1 - kr)
+//
+// Putting all this into matrix form, we get:
+// [Y]   [kr              kg            kb            ] [R]
+// [U] = [-kr/u_scale     -kg/u_scale   (1-kb)/u_scale] [G]
+// [V]   [(1-kr)/v_scale  -kg/v_scale   -kb/v_scale   ] [B]
+//
+// Now, to convert YUV to RGB, we just need to invert this matrix:
+// ```py
+// import torch
+// kr, kg, kb = 0.2126, 0.7152, 0.0722  # BT.709 luma coefficients
+// u_scale = 2 * (1 - kb)
+// v_scale = 2 * (1 - kr)
+//
+// rgb_to_yuv = torch.tensor([
+//     [kr, kg, kb],
+//     [-kr/u_scale, -kg/u_scale, (1-kb)/u_scale],
+//     [(1-kr)/v_scale, -kg/v_scale, -kb/v_scale]
+// ])
+//
+// yuv_to_rgb_full = torch.linalg.inv(rgb_to_yuv)
+// print("YUV->RGB matrix (Full Range):")
+// print(yuv_to_rgb_full)
+// ```
+// And we get:
+// tensor([[ 1.0000e+00, -3.3142e-09,  1.5748e+00],
+//         [ 1.0000e+00, -1.8732e-01, -4.6812e-01],
+//         [ 1.0000e+00,  1.8556e+00,  4.6231e-09]])
+//
+// Which matches https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
+//
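+// As a quick sanity check (not part of the derivation above, just reusing
+// rgb_to_yuv and yuv_to_rgb_full from the snippet), we can verify that the
+// inversion round-trips and that an achromatic pixel (U = V = 0) stays grey:
+// ```py
+// # Continuing from the snippet above:
+// assert torch.allclose(rgb_to_yuv @ yuv_to_rgb_full, torch.eye(3), atol=1e-5)
+//
+// # An achromatic pixel (U = V = 0) must give R = G = B = Y:
+// grey = yuv_to_rgb_full @ torch.tensor([0.5, 0.0, 0.0])
+// assert torch.allclose(grey, torch.tensor([0.5, 0.5, 0.5]), atol=1e-5)
+// ```
+//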
+// Color conversion in NPP
+// -----------------------
+// https://docs.nvidia.com/cuda/npp/image_color_conversion.html
+//
+// NPP provides different ways to convert YUV to RGB:
+// - pre-defined color conversion functions like
+//   nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx and nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx,
+//   which are for BT.709 limited and full range, respectively.
+// - generic color conversion functions that accept a custom color conversion
+//   matrix, called ColorTwist, like nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx.
+//
+// We use the pre-defined functions or the color twist functions depending on
+// which one we find to be closer to the CPU results.
+//
+// The color twist functionality is *partially* described in a section named
+// "YUVToRGBColorTwist". Importantly:
+//
+// - The `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` function takes the YUV data
+//   and the color-conversion matrix as input. The function itself and the
+//   matrix assume different ranges for YUV values:
+//   - The **matrix coefficients** must assume that Y is in [0, 1] and U, V are
+//     in [-0.5, 0.5]. That's how we defined our matrix above.
+//   - The function `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` however expects
+//     all of the input Y, U, V to be in [0, 255]. That's how the data comes
+//     out of the decoder.
+//   - But *internally*, `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` needs U and
+//     V to be centered around 0, i.e. in [-128, 127]. So we need to apply a
+//     -128 offset to U and V. Y doesn't need to be offset. The offset can be
+//     applied by adding a 4th column to the matrix, as sketched below.
+//
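+// Concretely, the offset column can be appended to the 3x3 matrix from the
+// snippet above. A small sketch (the variable names are ours, just for
+// illustration):
+// ```py
+// # 4th column: offsets applied to the incoming Y, U, V values -- no offset
+// # for Y, and -128 to re-center U and V.
+// offset = torch.tensor([[0.0], [-128.0], [-128.0]])
+// yuv_to_rgb_full_3x4 = torch.cat([yuv_to_rgb_full, offset], dim=1)
+// print(yuv_to_rgb_full_3x4)
+// ```
+//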
+// So our conversion matrix becomes the following, with the new offset column:
+// tensor([[ 1.0000e+00, -3.3142e-09,  1.5748e+00,    0],
+//         [ 1.0000e+00, -1.8732e-01, -4.6812e-01, -128],
+//         [ 1.0000e+00,  1.8556e+00,  4.6231e-09, -128]])
+//
+// And that's what we need to pass for BT.709, full range.
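+//
+// For reference, the BT.709 limited (studio) range matrix quoted in the code
+// above can be derived the same way: in 8 bits, Y covers [16, 235] (219
+// steps) and U, V cover [16, 240] (224 steps), so each column of the
+// full-range matrix gets rescaled accordingly, and Y gets a -16 offset. A
+// small sketch, again continuing the snippet above:
+// ```py
+// scale = torch.tensor([255 / 219, 255 / 224, 255 / 224])
+// offset = torch.tensor([[-16.0], [-128.0], [-128.0]])
+// print(torch.cat([yuv_to_rgb_full * scale, offset], dim=1))
+// ```
+// which prints (up to rounding) the bt709LimitedRangeColorTwist values from
+// the code comment above. In practice we found the pre-defined
+// nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx to be slightly more accurate for that
+// case, so that's what we use.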
+/* clang-format on */
diff --git a/test/resources/bt709_full_range.mp4 b/test/resources/bt709_full_range.mp4
new file mode 100644
index 000000000..004028190
Binary files /dev/null and b/test/resources/bt709_full_range.mp4 differ
diff --git a/test/test_decoders.py b/test/test_decoders.py
index 6b54a1a52..d3f5ebc00 100644
--- a/test/test_decoders.py
+++ b/test/test_decoders.py
@@ -25,6 +25,8 @@
     all_supported_devices,
     assert_frames_equal,
     AV1_VIDEO,
+    BT709_FULL_RANGE,
+    cuda_version_used_for_building_torch,
     get_ffmpeg_major_version,
     H264_10BITS,
     H265_10BITS,
@@ -35,6 +37,7 @@
     NASA_AUDIO_MP3_44100,
     NASA_VIDEO,
     needs_cuda,
+    psnr,
     SINE_MONO_S16,
     SINE_MONO_S32,
     SINE_MONO_S32_44100,
@@ -1197,6 +1200,30 @@ def test_pts_to_dts_fallback(self, seek_mode):
         with pytest.raises(AssertionError, match="not equal"):
             torch.testing.assert_close(decoder[0], decoder[10])
 
+    @needs_cuda
+    @pytest.mark.parametrize("asset", (BT709_FULL_RANGE, NASA_VIDEO))
+    def test_full_and_studio_range_bt709_video(self, asset):
+        # Test ensuring result consistency between the CPU and GPU decoders on
+        # BT.709 videos: one with full color range, one with studio range.
+        # This is a non-regression test for times when we used to not support
+        # full range on GPU.
+        #
+        # NASA_VIDEO is a BT.709 studio range video, as can be confirmed with:
+        # ffprobe -v quiet -select_streams v:0 -show_entries
+        # stream=color_space,color_transfer,color_primaries,color_range -of
+        # default=noprint_wrappers=1 test/resources/nasa_13013.mp4
+        decoder_gpu = VideoDecoder(asset.path, device="cuda")
+        decoder_cpu = VideoDecoder(asset.path, device="cpu")
+
+        for frame_index in (0, 10, 20, 5):
+            gpu_frame = decoder_gpu.get_frame_at(frame_index).data.cpu()
+            cpu_frame = decoder_cpu.get_frame_at(frame_index).data
+
+            if cuda_version_used_for_building_torch() >= (12, 9):
+                torch.testing.assert_close(gpu_frame, cpu_frame, rtol=0, atol=2)
+            elif cuda_version_used_for_building_torch() == (12, 8):
+                assert psnr(gpu_frame, cpu_frame) > 20
+
     @needs_cuda
     def test_10bit_videos_cuda(self):
         # Assert that we raise proper error on different kinds of 10bit videos.
diff --git a/test/utils.py b/test/utils.py
index 588fdd6df..ed611cfda 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -37,6 +37,31 @@ def get_ffmpeg_major_version():
     return int(ffmpeg_version.split(".")[0])
 
 
+def cuda_version_used_for_building_torch() -> Optional[tuple[int, int]]:
+    # Return the CUDA version that was used to build PyTorch. That's not
+    # always the same as the CUDA version currently installed on the running
+    # machine, which is what we would ideally check. On the CI though, these
+    # are the same.
+    if torch.version.cuda is None:
+        return None
+    else:
+        return tuple(int(x) for x in torch.version.cuda.split("."))
+
+
+def psnr(a, b, max_val=255) -> float:
+    # Return the Peak Signal-to-Noise Ratio (PSNR) between two tensors a and
+    # b. The higher, the better.
+    # According to https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio,
+    # typical values for the PSNR in lossy image and video compression are
+    # between 30 and 50 dB. Acceptable values for wireless transmission
+    # quality loss are considered to be about 20 dB to 25 dB.
+    mse = torch.mean((a.float() - b.float()) ** 2)
+    if mse == 0:
+        return float("inf")
+    return 20 * torch.log10(max_val / torch.sqrt(mse)).item()
+
+
 # For use with decoded data frames. On CPU Linux, we expect exact, bit-for-bit
 # equality. On CUDA Linux, we expect a small tolerance.
 # On other platforms (e.g. MacOS), we also allow a small tolerance. FFmpeg does
@@ -637,3 +662,24 @@ def sample_format(self) -> str:
         },
     },
 )
+
+
+# This is a BT.709 full range video, generated with:
+# ffmpeg -f lavfi -i testsrc2=duration=1:size=1280x720:rate=30 \
+#     -c:v libx264 -pix_fmt yuv420p -color_primaries bt709 -color_trc bt709 \
+#     -colorspace bt709 -color_range pc bt709_full_range.mp4
+#
+# We can confirm the color space and color range with:
+# ffprobe -v quiet -select_streams v:0 -show_entries stream=color_space,color_transfer,color_primaries,color_range -of default=noprint_wrappers=1 test/resources/bt709_full_range.mp4
+# color_range=pc
+# color_space=bt709
+# color_transfer=bt709
+# color_primaries=bt709
+BT709_FULL_RANGE = TestVideo(
+    filename="bt709_full_range.mp4",
+    default_stream_index=0,
+    stream_infos={
+        0: TestVideoStreamInfo(width=1280, height=720, num_color_channels=3),
+    },
+    frames={0: {}},  # Not needed for now
+)