-
Notifications
You must be signed in to change notification settings - Fork 4.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[API Proposal]: VectorXXX<Half> support, and related various Hardware Intrinsics (F16C, AVX-512 FP16, and AdvSimd.Arm64)
#62416
Comments
Tagging subscribers to this area: @dotnet/area-system-numerics
Issue Details
Background and motivation
In Ivy Bridge and newer x86 processors, F16C is provided to convert between float and Half.
API Proposal
If you want to avoid using Vector64<Half>:
namespace System.Runtime.Intrinsics.X86
{
/// <summary>
/// Proposed intrinsics class for the x86 F16C extension, covering conversions between
/// vectors of <see cref="float"/> and vectors of <see cref="Half"/>.
/// API-proposal sketch: members are declarations only, without bodies.
/// This variant never uses a 64-bit Half vector; every Half vector is 128 bits wide.
/// </summary>
public abstract class F16C : Avx
{
/// <summary>Whether the F16C APIs can be used. Presumably follows the IsSupported convention of the other X86 intrinsics classes (confirm).</summary>
public static bool IsSupported { get; }
/// <summary>Half to float with a 128-bit result. NOTE(review): the result holds 4 floats, so presumably only the lower 4 Half elements of <paramref name="value"/> are read (confirm).</summary>
public static Vector128<float> ConvertToVector128Single(Vector128<Half> value);
/// <summary>Half to float with a 256-bit result: 8 Half elements in, 8 floats out.</summary>
public static Vector256<float> ConvertToVector256Single(Vector128<Half> value);
/// <summary>float to Half for 4 floats. The name does not state a rounding mode; presumably the current/default mode is used (confirm). NOTE(review): the upper half of the 128-bit result is not specified here.</summary>
public static Vector128<Half> ConvertToVector128Half(Vector128<float> value);
/// <summary>float to Half for 8 floats. The name does not state a rounding mode; presumably the current/default mode is used (confirm).</summary>
public static Vector128<Half> ConvertToVector128Half(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding to nearest even as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToEven(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding to nearest even as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToEven(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward zero as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToZero(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward zero as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToZero(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward negative infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToNegativeInfinity(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward negative infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToNegativeInfinity(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward positive infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToPositiveInfinity(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward positive infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToPositiveInfinity(Vector256<float> value);
}
}
If using Vector64<Half> is acceptable:
namespace System.Runtime.Intrinsics.X86
{
/// <summary>
/// Proposed intrinsics class for the x86 F16C extension, covering conversions between
/// vectors of <see cref="float"/> and vectors of <see cref="Half"/>.
/// API-proposal sketch: members are declarations only, without bodies.
/// This variant pairs each 128-bit float vector with a 64-bit Half vector (4 elements each)
/// and each 256-bit float vector with a 128-bit Half vector (8 elements each).
/// </summary>
public abstract class F16C : Avx
{
/// <summary>Whether the F16C APIs can be used. Presumably follows the IsSupported convention of the other X86 intrinsics classes (confirm).</summary>
public static bool IsSupported { get; }
/// <summary>Half to float: 4 Half elements in, 4 floats out.</summary>
public static Vector128<float> ConvertToVector128Single(Vector64<Half> value);
/// <summary>Half to float: 8 Half elements in, 8 floats out.</summary>
public static Vector256<float> ConvertToVector256Single(Vector128<Half> value);
/// <summary>float to Half for 4 floats. The name does not state a rounding mode; presumably the current/default mode is used (confirm).</summary>
public static Vector64<Half> ConvertToVector64Half(Vector128<float> value);
/// <summary>float to Half for 8 floats. The name does not state a rounding mode; presumably the current/default mode is used (confirm).</summary>
public static Vector128<Half> ConvertToVector128Half(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding to nearest even as named.</summary>
public static Vector64<Half> ConvertToVector64HalfRoundToEven(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding to nearest even as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToEven(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward zero as named.</summary>
public static Vector64<Half> ConvertToVector64HalfRoundToZero(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward zero as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToZero(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward negative infinity as named.</summary>
public static Vector64<Half> ConvertToVector64HalfRoundToNegativeInfinity(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward negative infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToNegativeInfinity(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward positive infinity as named.</summary>
public static Vector64<Half> ConvertToVector64HalfRoundToPositiveInfinity(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward positive infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToPositiveInfinity(Vector256<float> value);
}
}
And also:
namespace System.Runtime.Intrinsics
{
/// <summary>Proposed addition to the static Vector64 helper class (declaration-only sketch).</summary>
public static class Vector64
{
/// <summary>Returns <paramref name="vector"/> typed as a Vector64 of Half. Presumably a bitwise reinterpretation like the existing AsXxx helpers (confirm).</summary>
public static Vector64<Half> AsHalf<T>(this Vector64<T> vector) where T : struct;
}
/// <summary>Proposed addition to the static Vector128 helper class (declaration-only sketch).</summary>
public static class Vector128
{
/// <summary>Returns <paramref name="vector"/> typed as a Vector128 of Half. Presumably a bitwise reinterpretation like the existing AsXxx helpers (confirm).</summary>
public static Vector128<Half> AsHalf<T>(this Vector128<T> vector) where T : struct;
}
/// <summary>Proposed addition to the static Vector256 helper class (declaration-only sketch).</summary>
public static class Vector256
{
/// <summary>Returns <paramref name="vector"/> typed as a Vector256 of Half. Presumably a bitwise reinterpretation like the existing AsXxx helpers (confirm).</summary>
public static Vector256<Half> AsHalf<T>(this Vector256<T> vector) where T : struct;
}
} and
API Usage
This code is based on the Vector64<Half> variant of the proposal:
/// <summary>
/// Converts each <see cref="Half"/> in <paramref name="src"/> to <see cref="float"/> and
/// stores it at the same index in <paramref name="dst"/>.
/// </summary>
/// <param name="dst">Destination for the converted values.</param>
/// <param name="src">Half-precision values to convert.</param>
/// <remarks>
/// Only the first min(dst.Length, src.Length) elements are converted, so neither span is
/// accessed out of range. When F16C is unavailable, every element goes through the scalar loop.
/// </remarks>
public static void ConverHalfToSingle(Span<float> dst, Span<Half> src)
{
    ref float rdi = ref MemoryMarshal.GetReference(dst);
    ref Half rsi = ref MemoryMarshal.GetReference(src);
    // The sample originally read `buffer.Length`, but no `buffer` is in scope.
    // Clamp to the shorter span because the Unsafe accesses below are not bounds-checked.
    nint i = 0, length = Math.Min(dst.Length, src.Length);
    if (F16C.IsSupported)
    {
        // Main loop: 64 elements per iteration as two groups of four 8-element conversions.
        for (; i < length - 63; i += 64)
        {
            var ymm0 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 0)));
            var ymm1 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 8)));
            var ymm2 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 16)));
            var ymm3 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 24)));
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 0)) = ymm0;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 8)) = ymm1;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 16)) = ymm2;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 24)) = ymm3;
            ymm0 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 32)));
            ymm1 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 40)));
            ymm2 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 48)));
            ymm3 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 56)));
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 32)) = ymm0;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 40)) = ymm1;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 48)) = ymm2;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 56)) = ymm3;
        }
        // Remainder loop: 8 elements per iteration as two 4-element conversions.
        for (; i < length - 7; i += 8)
        {
            var xmm0 = F16C.ConvertToVector128Single(Unsafe.As<Half, Vector64<Half>>(ref Unsafe.Add(ref rsi, i + 0)));
            var xmm1 = F16C.ConvertToVector128Single(Unsafe.As<Half, Vector64<Half>>(ref Unsafe.Add(ref rsi, i + 4)));
            Unsafe.As<float, Vector128<float>>(ref Unsafe.Add(ref rdi, i + 0)) = xmm0;
            Unsafe.As<float, Vector128<float>>(ref Unsafe.Add(ref rdi, i + 4)) = xmm1;
        }
    }
    // Scalar tail (and full fallback when F16C is not supported).
    for (; i < length; i++)
    {
        Unsafe.Add(ref rdi, i) = (float)Unsafe.Add(ref rsi, i);
    }
}
Alternative Designs: No response
Risks: No response
|
Tagging subscribers to this area: @dotnet/area-system-runtime-intrinsics
Issue Details
Background and motivation
In Ivy Bridge and newer x86 processors, F16C is provided to convert between float and Half.
API Proposal
If you want to avoid using Vector64<Half>:
namespace System.Runtime.Intrinsics.X86
{
/// <summary>
/// Proposed intrinsics class for the x86 F16C extension, covering conversions between
/// vectors of <see cref="float"/> and vectors of <see cref="Half"/>.
/// API-proposal sketch: members are declarations only, without bodies.
/// This variant never uses a 64-bit Half vector; every Half vector is 128 bits wide.
/// </summary>
public abstract class F16C : Avx
{
/// <summary>Whether the F16C APIs can be used. Presumably follows the IsSupported convention of the other X86 intrinsics classes (confirm).</summary>
public static bool IsSupported { get; }
/// <summary>Half to float with a 128-bit result. NOTE(review): the result holds 4 floats, so presumably only the lower 4 Half elements of <paramref name="value"/> are read (confirm).</summary>
public static Vector128<float> ConvertToVector128Single(Vector128<Half> value);
/// <summary>Half to float with a 256-bit result: 8 Half elements in, 8 floats out.</summary>
public static Vector256<float> ConvertToVector256Single(Vector128<Half> value);
/// <summary>float to Half for 4 floats. The name does not state a rounding mode; presumably the current/default mode is used (confirm). NOTE(review): the upper half of the 128-bit result is not specified here.</summary>
public static Vector128<Half> ConvertToVector128Half(Vector128<float> value);
/// <summary>float to Half for 8 floats. The name does not state a rounding mode; presumably the current/default mode is used (confirm).</summary>
public static Vector128<Half> ConvertToVector128Half(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding to nearest even as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToEven(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding to nearest even as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToEven(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward zero as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToZero(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward zero as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToZero(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward negative infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToNegativeInfinity(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward negative infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToNegativeInfinity(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward positive infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToPositiveInfinity(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward positive infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToPositiveInfinity(Vector256<float> value);
}
}
If using Vector64<Half> is acceptable:
namespace System.Runtime.Intrinsics.X86
{
/// <summary>
/// Proposed intrinsics class for the x86 F16C extension, covering conversions between
/// vectors of <see cref="float"/> and vectors of <see cref="Half"/>.
/// API-proposal sketch: members are declarations only, without bodies.
/// This variant pairs each 128-bit float vector with a 64-bit Half vector (4 elements each)
/// and each 256-bit float vector with a 128-bit Half vector (8 elements each).
/// </summary>
public abstract class F16C : Avx
{
/// <summary>Whether the F16C APIs can be used. Presumably follows the IsSupported convention of the other X86 intrinsics classes (confirm).</summary>
public static bool IsSupported { get; }
/// <summary>Half to float: 4 Half elements in, 4 floats out.</summary>
public static Vector128<float> ConvertToVector128Single(Vector64<Half> value);
/// <summary>Half to float: 8 Half elements in, 8 floats out.</summary>
public static Vector256<float> ConvertToVector256Single(Vector128<Half> value);
/// <summary>float to Half for 4 floats. The name does not state a rounding mode; presumably the current/default mode is used (confirm).</summary>
public static Vector64<Half> ConvertToVector64Half(Vector128<float> value);
/// <summary>float to Half for 8 floats. The name does not state a rounding mode; presumably the current/default mode is used (confirm).</summary>
public static Vector128<Half> ConvertToVector128Half(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding to nearest even as named.</summary>
public static Vector64<Half> ConvertToVector64HalfRoundToEven(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding to nearest even as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToEven(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward zero as named.</summary>
public static Vector64<Half> ConvertToVector64HalfRoundToZero(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward zero as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToZero(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward negative infinity as named.</summary>
public static Vector64<Half> ConvertToVector64HalfRoundToNegativeInfinity(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward negative infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToNegativeInfinity(Vector256<float> value);
/// <summary>float to Half for 4 floats, rounding toward positive infinity as named.</summary>
public static Vector64<Half> ConvertToVector64HalfRoundToPositiveInfinity(Vector128<float> value);
/// <summary>float to Half for 8 floats, rounding toward positive infinity as named.</summary>
public static Vector128<Half> ConvertToVector128HalfRoundToPositiveInfinity(Vector256<float> value);
}
}
And also:
namespace System.Runtime.Intrinsics
{
/// <summary>Proposed addition to the static Vector64 helper class (declaration-only sketch).</summary>
public static class Vector64
{
/// <summary>Returns <paramref name="vector"/> typed as a Vector64 of Half. Presumably a bitwise reinterpretation like the existing AsXxx helpers (confirm).</summary>
public static Vector64<Half> AsHalf<T>(this Vector64<T> vector) where T : struct;
}
/// <summary>Proposed addition to the static Vector128 helper class (declaration-only sketch).</summary>
public static class Vector128
{
/// <summary>Returns <paramref name="vector"/> typed as a Vector128 of Half. Presumably a bitwise reinterpretation like the existing AsXxx helpers (confirm).</summary>
public static Vector128<Half> AsHalf<T>(this Vector128<T> vector) where T : struct;
}
/// <summary>Proposed addition to the static Vector256 helper class (declaration-only sketch).</summary>
public static class Vector256
{
/// <summary>Returns <paramref name="vector"/> typed as a Vector256 of Half. Presumably a bitwise reinterpretation like the existing AsXxx helpers (confirm).</summary>
public static Vector256<Half> AsHalf<T>(this Vector256<T> vector) where T : struct;
}
} and
API Usage
This code is based on the Vector64<Half> variant of the proposal:
/// <summary>
/// Converts each <see cref="Half"/> in <paramref name="src"/> to <see cref="float"/> and
/// stores it at the same index in <paramref name="dst"/>.
/// </summary>
/// <param name="dst">Destination for the converted values.</param>
/// <param name="src">Half-precision values to convert.</param>
/// <remarks>
/// Only the first min(dst.Length, src.Length) elements are converted, so neither span is
/// accessed out of range. When F16C is unavailable, every element goes through the scalar loop.
/// </remarks>
public static void ConverHalfToSingle(Span<float> dst, Span<Half> src)
{
    ref float rdi = ref MemoryMarshal.GetReference(dst);
    ref Half rsi = ref MemoryMarshal.GetReference(src);
    // The sample originally read `buffer.Length`, but no `buffer` is in scope.
    // Clamp to the shorter span because the Unsafe accesses below are not bounds-checked.
    nint i = 0, length = Math.Min(dst.Length, src.Length);
    if (F16C.IsSupported)
    {
        // Main loop: 64 elements per iteration as two groups of four 8-element conversions.
        for (; i < length - 63; i += 64)
        {
            var ymm0 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 0)));
            var ymm1 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 8)));
            var ymm2 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 16)));
            var ymm3 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 24)));
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 0)) = ymm0;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 8)) = ymm1;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 16)) = ymm2;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 24)) = ymm3;
            ymm0 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 32)));
            ymm1 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 40)));
            ymm2 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 48)));
            ymm3 = F16C.ConvertToVector256Single(Unsafe.As<Half, Vector128<Half>>(ref Unsafe.Add(ref rsi, i + 56)));
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 32)) = ymm0;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 40)) = ymm1;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 48)) = ymm2;
            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref rdi, i + 56)) = ymm3;
        }
        // Remainder loop: 8 elements per iteration as two 4-element conversions.
        for (; i < length - 7; i += 8)
        {
            var xmm0 = F16C.ConvertToVector128Single(Unsafe.As<Half, Vector64<Half>>(ref Unsafe.Add(ref rsi, i + 0)));
            var xmm1 = F16C.ConvertToVector128Single(Unsafe.As<Half, Vector64<Half>>(ref Unsafe.Add(ref rsi, i + 4)));
            Unsafe.As<float, Vector128<float>>(ref Unsafe.Add(ref rdi, i + 0)) = xmm0;
            Unsafe.As<float, Vector128<float>>(ref Unsafe.Add(ref rdi, i + 4)) = xmm1;
        }
    }
    // Scalar tail (and full fallback when F16C is not supported).
    for (; i < length; i++)
    {
        Unsafe.Add(ref rdi, i) = (float)Unsafe.Add(ref rsi, i);
    }
}
Alternative Designs: No response
Risks: No response
|
I have edited my comments, including adding Intrinsics for ARM. |
Worth noting that now with AVX512-FP16 all operations are now accelerated on XArch. |
I have an upcoming need for I'm adding color management to my image editing app (Paint.NET). Storing the canvas tiles at a higher precision (RGBA Float16 instead of BGRA The resizing and color transform steps are done on the CPU at Float32 precision with very high performance thanks to @saucecontrol's PhotoSauce.MagicScaler library. I then convert from RGBA Float32 back to BGRA32 on the CPU, which later on is then presented to the screen via Direct2D, converting to sRGB/scRGB with the Color Management effect, which operates at high precision (up to Float32) and renders into a Float16 swapchain. So the conversion goes from BGRA32 (CPU bitmap) --> RGBA Float32 (CPU intermediate buffers) -> BGRA32 (D2D bitmap) -> RGBA Float32 (effect intermediate texture(s)) -> RGBA Float16 (render target / swapchain). I'd like to be able to do BGRA32 (CPU bitmap) --> RGBA Float32 (CPU intermediate buffers) -> RGBA Float16 (D2D bitmap) -> RGBA Float32 (effect intermediate texture(s)) -> RGBA Float16 (render target / swapchain). I can use RGBA Float32 textures on the GPU today, but Float16 uses 1/2 the memory and PCI-E bandwidth, and won't lose any useful precision at this point in the rendering pipeline. For now I can P/Invoke a method in a native C/C++ DLL, but having native support in C# would be great. |
F16C
, AVX-512 FP16
, and AdvSimd.Arm64
)
Added proposal of |
Background and motivation
It's been a long time since .NET first added the Half type. But there's no support for hardware acceleration of conversion, which might be a common use case.
In Ivy Bridge and newer x86 processors, F16C is provided as a way to convert between float and Half. ARMv8-A also has a way to convert between them, like F16C.
In Sapphire Rapids and newer x86 processors, AVX-512 FP16 is provided as a way to perform arithmetic operations on Half values. So I think it would be great if .NET had support for hardware acceleration of conversion between them.
API Proposal
EDIT: The Vector64<Half> design is removed as it's avoided in x86.
EDIT: Added some arithmetic APIs including MultiplyAddEstimate, discussed in #98053.
Addition to Vector*
Vector(64|128|256|512)?<T> shouldn't throw any exceptions if T is Half.
F16C
AVX-512 FP16
Addition to AdvSimd.Arm64
Scalar variants are not included in favor of Half's explicit operator optimizations.
API Sample Usage
F16C
AdvSimd.Arm64
Alternative Designs
No response
Risks
No response
The text was updated successfully, but these errors were encountered: