Skip to content

Commit

Permalink
Implement HLSL pack/unpack math intrinsics (#5934)
Browse files Browse the repository at this point in the history
  • Loading branch information
fairywreath authored Dec 28, 2024
1 parent c4429bc commit 7a6de4a
Show file tree
Hide file tree
Showing 3 changed files with 536 additions and 0 deletions.
354 changes: 354 additions & 0 deletions source/slang/hlsl.meta.slang
Original file line number Diff line number Diff line change
Expand Up @@ -20806,3 +20806,357 @@ T workgroupUniformLoad<T>(__ref T v)
return v;
}
}

//
// Pack/Unpack Math Intrinsics
//
// These were introduced in SM 6.6 but requirements are dropped to SM 5.0 here
// to expose these intrinsics on targets that do not have SM 6.6 features.
//

//@hidden:
/// Extract the least-significant byte of `val` and zero-extend it to 16 bits.
[__readNone]
[ForceInline]
uint16_t __lsb_as_u16(uint32_t val)
{
    uint32_t lowByte = val & 0xFFU;
    return uint16_t(lowByte);
}

//@hidden:
/// Extract the least-significant byte of `val`, kept as a 32-bit value.
[__readNone]
[ForceInline]
uint32_t __lsb_as_u32(uint32_t val)
{
    uint32_t lowByte = val & 0xFFU;
    return lowByte;
}

//@hidden:
/// Reinterpret the least-significant byte of `val` as a signed 8-bit value.
[__readNone]
[ForceInline]
int8_t __lsb_as_s8(uint32_t val)
{
    uint32_t lowByte = val & 0xFFU;
    return int8_t(lowByte);
}

//@hidden:
/// Reinterpret the least-significant byte of `val` as signed, then
/// sign-extend it to 16 bits.
[__readNone]
[ForceInline]
int16_t __lsb_as_s16(uint32_t val)
{
    int8_t signedByte = int8_t(val & 0xFFU);
    return int16_t(signedByte);
}

//@hidden:
/// Reinterpret the least-significant byte of `val` as signed, then
/// sign-extend it to 32 bits.
[__readNone]
[ForceInline]
int32_t __lsb_as_s32(uint32_t val)
{
    int8_t signedByte = int8_t(val & 0xFFU);
    return int32_t(signedByte);
}

//@hidden:
/// Saturate `val` into the unsigned 8-bit range [0, 255].
/// The clamped result always fits in one byte, so no masking is needed
/// before the widening conversion.
[__readNone]
[ForceInline]
uint32_t __lsb_clamp_u8_as_u32(int32_t val)
{
    int32_t saturated = clamp(val, 0, 255);
    return uint32_t(saturated);
}

//@hidden:
/// Saturate `val` into the signed 8-bit range [-128, 127] and return its
/// two's-complement byte pattern in the low 8 bits of an unsigned value.
[__readNone]
[ForceInline]
uint32_t __lsb_clamp_s8_as_u32(int32_t val)
{
    int32_t saturated = clamp(val, -128, 127);
    // Masking keeps only the low byte of the (sign-extended) bit pattern.
    return uint32_t(saturated) & 0xFFU;
}

//@public:
/// Unpack 4 signed 8-bit values into a vector of 16 bit integers.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
int16_t4 unpack_s8s16(int8_t4_packed packed)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "unpack_s8s16";
    case spirv:
        return spirv_asm
        {
            %s8Vec = OpBitcast $$vector<int8_t, 4> $packed;
            result:$$vector<int16_t, 4> = OpSConvert %s8Vec
        };
    default:
        // Fallback: shift each byte into the low position and sign-extend.
        uint32_t bits = uint32_t(packed);
        int16_t b0 = __lsb_as_s16(bits);
        int16_t b1 = __lsb_as_s16(bits >> 8U);
        int16_t b2 = __lsb_as_s16(bits >> 16U);
        int16_t b3 = __lsb_as_s16(bits >> 24U);
        return int16_t4(b0, b1, b2, b3);
    }
}

//@public:
/// Unpack 4 unsigned 8-bit values into a vector of 16 bit integers.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
uint16_t4 unpack_u8u16(uint8_t4_packed packed)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "unpack_u8u16";
    case spirv:
        return spirv_asm
        {
            %u8Vec = OpBitcast $$vector<uint8_t, 4> $packed;
            result:$$vector<uint16_t, 4> = OpUConvert %u8Vec
        };
    default:
        // Fallback: shift each byte into the low position and zero-extend.
        uint32_t bits = uint32_t(packed);
        uint16_t b0 = __lsb_as_u16(bits);
        uint16_t b1 = __lsb_as_u16(bits >> 8U);
        uint16_t b2 = __lsb_as_u16(bits >> 16U);
        uint16_t b3 = __lsb_as_u16(bits >> 24U);
        return uint16_t4(b0, b1, b2, b3);
    }
}

//@public:
/// Unpack 4 signed 8-bit values into a vector of 32 bit integers.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
int32_t4 unpack_s8s32(int8_t4_packed packed)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "unpack_s8s32";
    case wgsl: __intrinsic_asm "unpack4xI8";
    case spirv:
        return spirv_asm
        {
            %s8Vec = OpBitcast $$vector<int8_t, 4> $packed;
            result:$$vector<int32_t, 4> = OpSConvert %s8Vec
        };
    default:
        // Fallback: shift each byte into the low position and sign-extend.
        uint32_t bits = uint32_t(packed);
        int32_t b0 = __lsb_as_s32(bits);
        int32_t b1 = __lsb_as_s32(bits >> 8U);
        int32_t b2 = __lsb_as_s32(bits >> 16U);
        int32_t b3 = __lsb_as_s32(bits >> 24U);
        return int32_t4(b0, b1, b2, b3);
    }
}

//@public:
/// Unpack 4 unsigned 8-bit values into a vector of 32 bit integers.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
uint32_t4 unpack_u8u32(uint8_t4_packed packed)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "unpack_u8u32";
    case wgsl: __intrinsic_asm "unpack4xU8";
    case spirv:
        return spirv_asm
        {
            %u8Vec = OpBitcast $$vector<uint8_t, 4> $packed;
            result:$$vector<uint32_t, 4> = OpUConvert %u8Vec
        };
    default:
        // Fallback: shift each byte into the low position and zero-extend.
        uint32_t bits = uint32_t(packed);
        uint32_t b0 = __lsb_as_u32(bits);
        uint32_t b1 = __lsb_as_u32(bits >> 8U);
        uint32_t b2 = __lsb_as_u32(bits >> 16U);
        uint32_t b3 = __lsb_as_u32(bits >> 24U);
        return uint32_t4(b0, b1, b2, b3);
    }
}

//@public:
/// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
uint8_t4_packed pack_u8(uint32_t4 unpackedValue)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "pack_u8";
    case wgsl: __intrinsic_asm "pack4xU8";
    default:
        // Fallback: keep the low byte of each lane and merge into one word.
        uint32_t b0 = __lsb_as_u32(unpackedValue.x);
        uint32_t b1 = __lsb_as_u32(unpackedValue.y);
        uint32_t b2 = __lsb_as_u32(unpackedValue.z);
        uint32_t b3 = __lsb_as_u32(unpackedValue.w);
        return uint8_t4_packed(b0 | (b1 << 8U) | (b2 << 16U) | (b3 << 24U));
    }
}

//@public:
/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
int8_t4_packed pack_s8(int32_t4 unpackedValue)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "pack_s8";
    case wgsl: __intrinsic_asm "pack4xI8";
    default:
        // Fallback: keep the low byte of each lane's two's-complement bit
        // pattern and merge into one word.
        uint32_t b0 = __lsb_as_u32(unpackedValue.x);
        uint32_t b1 = __lsb_as_u32(unpackedValue.y);
        uint32_t b2 = __lsb_as_u32(unpackedValue.z);
        uint32_t b3 = __lsb_as_u32(unpackedValue.w);
        return int8_t4_packed(b0 | (b1 << 8U) | (b2 << 16U) | (b3 << 24U));
    }
}

//@public:
/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
uint8_t4_packed pack_u8(uint16_t4 unpackedValue)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "pack_u8";
    default:
        // Fallback: keep the low byte of each lane and merge into one word.
        uint32_t b0 = __lsb_as_u32(unpackedValue.x);
        uint32_t b1 = __lsb_as_u32(unpackedValue.y);
        uint32_t b2 = __lsb_as_u32(unpackedValue.z);
        uint32_t b3 = __lsb_as_u32(unpackedValue.w);
        return uint8_t4_packed(b0 | (b1 << 8U) | (b2 << 16U) | (b3 << 24U));
    }
}

//@public:
/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
int8_t4_packed pack_s8(int16_t4 unpackedValue)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "pack_s8";
    default:
        // Fallback: keep the low byte of each lane's two's-complement bit
        // pattern and merge into one word.
        uint32_t b0 = __lsb_as_u32(unpackedValue.x);
        uint32_t b1 = __lsb_as_u32(unpackedValue.y);
        uint32_t b2 = __lsb_as_u32(unpackedValue.z);
        uint32_t b3 = __lsb_as_u32(unpackedValue.w);
        return int8_t4_packed(b0 | (b1 << 8U) | (b2 << 16U) | (b3 << 24U));
    }
}

//@public:
/// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers,
/// clamping each value to the range [0, 255] to ensure it fits within 8 bits.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
uint8_t4_packed pack_clamp_u8(int32_t4 unpackedValue)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "pack_clamp_u8";
    // Clamp the lower bound while still in the signed domain: WGSL's
    // vec4<u32>() conversion of a negative lane is modular (e.g. -1 becomes
    // 0xFFFFFFFF), which pack4xU8Clamp would then saturate to 255 instead of
    // the intended 0. max() with zero first makes the conversion exact;
    // pack4xU8Clamp still handles the upper bound for values > 255.
    case wgsl: __intrinsic_asm "pack4xU8Clamp(vec4<u32>(max($0, vec4<i32>())))";
    default:
        // Fallback: saturate each lane to [0, 255], then merge the bytes.
        return uint8_t4_packed
        (
            __lsb_clamp_u8_as_u32(unpackedValue.x)
            | (__lsb_clamp_u8_as_u32(unpackedValue.y) << 8U)
            | (__lsb_clamp_u8_as_u32(unpackedValue.z) << 16U)
            | (__lsb_clamp_u8_as_u32(unpackedValue.w) << 24U)
        );
    }
}

//@public:
/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers,
/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
int8_t4_packed pack_clamp_s8(int32_t4 unpackedValue)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "pack_clamp_s8";
    case wgsl: __intrinsic_asm "pack4xI8Clamp";
    default:
        // Fallback: saturate each lane to [-128, 127], then merge the bytes.
        uint32_t b0 = __lsb_clamp_s8_as_u32(unpackedValue.x);
        uint32_t b1 = __lsb_clamp_s8_as_u32(unpackedValue.y);
        uint32_t b2 = __lsb_clamp_s8_as_u32(unpackedValue.z);
        uint32_t b3 = __lsb_clamp_s8_as_u32(unpackedValue.w);
        return int8_t4_packed(b0 | (b1 << 8U) | (b2 << 16U) | (b3 << 24U));
    }
}

//@public:
/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers,
/// clamping each value to the range [0, 255] to ensure it fits within 8 bits.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
uint8_t4_packed pack_clamp_u8(int16_t4 unpackedValue)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "pack_clamp_u8";
    default:
        // Fallback: saturate each lane to [0, 255], then merge the bytes.
        uint32_t b0 = __lsb_clamp_u8_as_u32(unpackedValue.x);
        uint32_t b1 = __lsb_clamp_u8_as_u32(unpackedValue.y);
        uint32_t b2 = __lsb_clamp_u8_as_u32(unpackedValue.z);
        uint32_t b3 = __lsb_clamp_u8_as_u32(unpackedValue.w);
        return uint8_t4_packed(b0 | (b1 << 8U) | (b2 << 16U) | (b3 << 24U));
    }
}

//@public:
/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers,
/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits.
[__readNone]
[ForceInline]
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
int8_t4_packed pack_clamp_s8(int16_t4 unpackedValue)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "pack_clamp_s8";
    default:
        // Fallback: saturate each lane to [-128, 127], then merge the bytes.
        uint32_t b0 = __lsb_clamp_s8_as_u32(unpackedValue.x);
        uint32_t b1 = __lsb_clamp_s8_as_u32(unpackedValue.y);
        uint32_t b2 = __lsb_clamp_s8_as_u32(unpackedValue.z);
        uint32_t b3 = __lsb_clamp_s8_as_u32(unpackedValue.w);
        return int8_t4_packed(b0 | (b1 << 8U) | (b2 << 16U) | (b3 << 24U));
    }
}

Loading

0 comments on commit 7a6de4a

Please sign in to comment.