Skip to content

Commit ef793ac

Browse files
matborzyszkowskiigcbot
authored and committed
Add missing functions to cth_pre_release for CRI
.
1 parent dadaf68 commit ef793ac

1 file changed

Lines changed: 275 additions & 0 deletions

File tree

IGC/BiFModule/Languages/OpenCL/PreRelease/opencl_cth_pre_release.h

Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4191,3 +4191,278 @@ void intel_manageable_barrier_arrivewait(manageable_barrier_t* BData);
41914191
void intel_manageable_barrier_arrivedrop(manageable_barrier_t* BData);
41924192

41934193
void region_barrier(uint SubRegionSize);
4194+
4195+
// FP4 / INT4 conversion functions
4196+
// Int4
4197+
// Int4 -> Bfloat8:
4198+
uchar intel_convert_as_i4_bfloat8_as_uchar(char source);
4199+
uchar2 intel_convert_as_i42_bfloat82_as_uchar2(char2 source);
4200+
uchar3 intel_convert_as_i43_bfloat83_as_uchar3(char3 source);
4201+
uchar4 intel_convert_as_i44_bfloat84_as_uchar4(char4 source);
4202+
uchar8 intel_convert_as_i48_bfloat88_as_uchar8(char8 source);
4203+
uchar16 intel_convert_as_i416_bfloat816_as_uchar16(char16 source);
4204+
4205+
ushort intel_convert_as_i42_bfloat82_as_ushort_packed(char source);
4206+
ushort2 intel_convert_as_i44_bfloat84_as_ushort2_packed(char2 source);
4207+
ushort3 intel_convert_as_i46_bfloat86_as_ushort3_packed(char3 source);
4208+
ushort4 intel_convert_as_i48_bfloat88_as_ushort4_packed(char4 source);
4209+
ushort8 intel_convert_as_i416_bfloat816_as_ushort8_packed(char8 source);
4210+
ushort16 intel_convert_as_i432_bfloat832_as_ushort16_packed(char16 source);
4211+
4212+
// Int4 -> Hfloat8:
4213+
uchar intel_convert_as_i4_hfloat8_as_uchar(char source);
4214+
uchar2 intel_convert_as_i42_hfloat82_as_uchar2(char2 source);
4215+
uchar3 intel_convert_as_i43_hfloat83_as_uchar3(char3 source);
4216+
uchar4 intel_convert_as_i44_hfloat84_as_uchar4(char4 source);
4217+
uchar8 intel_convert_as_i48_hfloat88_as_uchar8(char8 source);
4218+
uchar16 intel_convert_as_i416_hfloat816_as_uchar16(char16 source);
4219+
4220+
ushort intel_convert_as_i42_hfloat82_as_ushort_packed(char source);
4221+
ushort2 intel_convert_as_i44_hfloat84_as_ushort2_packed(char2 source);
4222+
ushort3 intel_convert_as_i46_hfloat86_as_ushort3_packed(char3 source);
4223+
ushort4 intel_convert_as_i48_hfloat88_as_ushort4_packed(char4 source);
4224+
ushort8 intel_convert_as_i416_hfloat816_as_ushort8_packed(char8 source);
4225+
ushort16 intel_convert_as_i432_hfloat832_as_ushort16_packed(char16 source);
4226+
4227+
// Int4 -> Bfloat16:
4228+
ushort intel_convert_as_i4_bfloat16_as_ushort(char source);
4229+
ushort2 intel_convert_as_i42_bfloat162_as_ushort2(char2 source);
4230+
ushort3 intel_convert_as_i43_bfloat163_as_ushort3(char3 source);
4231+
ushort4 intel_convert_as_i44_bfloat164_as_ushort4(char4 source);
4232+
ushort8 intel_convert_as_i48_bfloat168_as_ushort8(char8 source);
4233+
ushort16 intel_convert_as_i416_bfloat1616_as_ushort16(char16 source);
4234+
4235+
uint intel_convert_as_i42_bfloat162_as_uint_packed(char source);
4236+
uint2 intel_convert_as_i44_bfloat164_as_uint2_packed(char2 source);
4237+
uint3 intel_convert_as_i46_bfloat166_as_uint3_packed(char3 source);
4238+
uint4 intel_convert_as_i48_bfloat168_as_uint4_packed(char4 source);
4239+
uint8 intel_convert_as_i416_bfloat1616_as_uint8_packed(char8 source);
4240+
uint16 intel_convert_as_i432_bfloat1632_as_uint16_packed(char16 source);
4241+
4242+
#ifdef cl_khr_fp16
4243+
// Int4 -> Half:
4244+
half intel_convert_as_i4_half(char source);
4245+
half2 intel_convert_as_i42_half2(char2 source);
4246+
half3 intel_convert_as_i43_half3(char3 source);
4247+
half4 intel_convert_as_i44_half4(char4 source);
4248+
half8 intel_convert_as_i48_half8(char8 source);
4249+
half16 intel_convert_as_i416_half16(char16 source);
4250+
4251+
uint intel_convert_as_i42_half2_as_uint_packed(char source);
4252+
uint2 intel_convert_as_i44_half4_as_uint2_packed(char2 source);
4253+
uint3 intel_convert_as_i46_half6_as_uint3_packed(char3 source);
4254+
uint4 intel_convert_as_i48_half8_as_uint4_packed(char4 source);
4255+
uint8 intel_convert_as_i416_half16_as_uint8_packed(char8 source);
4256+
uint16 intel_convert_as_i432_half32_as_uint16_packed(char16 source);
4257+
#endif // cl_khr_fp16
4258+
4259+
// e2m1
4260+
// e2m1 -> Bfloat8:
4261+
uchar intel_convert_as_e2m1_bfloat8_as_uchar(uchar source);
4262+
uchar2 intel_convert_as_e2m12_bfloat82_as_uchar2(uchar2 source);
4263+
uchar3 intel_convert_as_e2m13_bfloat83_as_uchar3(uchar3 source);
4264+
uchar4 intel_convert_as_e2m14_bfloat84_as_uchar4(uchar4 source);
4265+
uchar8 intel_convert_as_e2m18_bfloat88_as_uchar8(uchar8 source);
4266+
uchar16 intel_convert_as_e2m116_bfloat816_as_uchar16(uchar16 source);
4267+
4268+
ushort intel_convert_as_e2m12_bfloat82_as_ushort_packed(uchar source);
4269+
ushort2 intel_convert_as_e2m14_bfloat84_as_ushort2_packed(uchar2 source);
4270+
ushort3 intel_convert_as_e2m16_bfloat86_as_ushort3_packed(uchar3 source);
4271+
ushort4 intel_convert_as_e2m18_bfloat88_as_ushort4_packed(uchar4 source);
4272+
ushort8 intel_convert_as_e2m116_bfloat816_as_ushort8_packed(uchar8 source);
4273+
ushort16 intel_convert_as_e2m132_bfloat832_as_ushort16_packed(uchar16 source);
4274+
4275+
// e2m1 -> Hfloat8:
4276+
uchar intel_convert_as_e2m1_hfloat8_as_uchar(uchar source);
4277+
uchar2 intel_convert_as_e2m12_hfloat82_as_uchar2(uchar2 source);
4278+
uchar3 intel_convert_as_e2m13_hfloat83_as_uchar3(uchar3 source);
4279+
uchar4 intel_convert_as_e2m14_hfloat84_as_uchar4(uchar4 source);
4280+
uchar8 intel_convert_as_e2m18_hfloat88_as_uchar8(uchar8 source);
4281+
uchar16 intel_convert_as_e2m116_hfloat816_as_uchar16(uchar16 source);
4282+
4283+
ushort intel_convert_as_e2m12_hfloat82_as_ushort_packed(uchar source);
4284+
ushort2 intel_convert_as_e2m14_hfloat84_as_ushort2_packed(uchar2 source);
4285+
ushort3 intel_convert_as_e2m16_hfloat86_as_ushort3_packed(uchar3 source);
4286+
ushort4 intel_convert_as_e2m18_hfloat88_as_ushort4_packed(uchar4 source);
4287+
ushort8 intel_convert_as_e2m116_hfloat816_as_ushort8_packed(uchar8 source);
4288+
ushort16 intel_convert_as_e2m132_hfloat832_as_ushort16_packed(uchar16 source);
4289+
4290+
// e2m1 -> Bfloat16:
4291+
ushort intel_convert_as_e2m1_bfloat16_as_ushort(uchar source);
4292+
ushort2 intel_convert_as_e2m12_bfloat162_as_ushort2(uchar2 source);
4293+
ushort3 intel_convert_as_e2m13_bfloat163_as_ushort3(uchar3 source);
4294+
ushort4 intel_convert_as_e2m14_bfloat164_as_ushort4(uchar4 source);
4295+
ushort8 intel_convert_as_e2m18_bfloat168_as_ushort8(uchar8 source);
4296+
ushort16 intel_convert_as_e2m116_bfloat1616_as_ushort16(uchar16 source);
4297+
4298+
uint intel_convert_as_e2m12_bfloat162_as_uint_packed(uchar source);
4299+
uint2 intel_convert_as_e2m14_bfloat164_as_uint2_packed(uchar2 source);
4300+
uint3 intel_convert_as_e2m16_bfloat166_as_uint3_packed(uchar3 source);
4301+
uint4 intel_convert_as_e2m18_bfloat168_as_uint4_packed(uchar4 source);
4302+
uint8 intel_convert_as_e2m116_bfloat1616_as_uint8_packed(uchar8 source);
4303+
uint16 intel_convert_as_e2m132_bfloat1632_as_uint16_packed(uchar16 source);
4304+
4305+
#ifdef cl_khr_fp16
4306+
// e2m1 -> Half:
4307+
half intel_convert_as_e2m1_half(uchar source);
4308+
half2 intel_convert_as_e2m12_half2(uchar2 source);
4309+
half3 intel_convert_as_e2m13_half3(uchar3 source);
4310+
half4 intel_convert_as_e2m14_half4(uchar4 source);
4311+
half8 intel_convert_as_e2m18_half8(uchar8 source);
4312+
half16 intel_convert_as_e2m116_half16(uchar16 source);
4313+
4314+
uint intel_convert_as_e2m12_half2_as_uint_packed(uchar source);
4315+
uint2 intel_convert_as_e2m14_half4_as_uint2_packed(uchar2 source);
4316+
uint3 intel_convert_as_e2m16_half6_as_uint3_packed(uchar3 source);
4317+
uint4 intel_convert_as_e2m18_half8_as_uint4_packed(uchar4 source);
4318+
uint8 intel_convert_as_e2m116_half16_as_uint8_packed(uchar8 source);
4319+
uint16 intel_convert_as_e2m132_half32_as_uint16_packed(uchar16 source);
4320+
#endif // cl_khr_fp16
4321+
4322+
// lfsr (linear-feedback shift register): each overload takes a seed and a polynomial of the same type
4323+
uint __attribute__((overloadable)) intel_lfsr(uint seed, uint polynomial);
4324+
ushort2 __attribute__((overloadable)) intel_lfsr(ushort2 seed, ushort2 polynomial);
4325+
uchar4 __attribute__((overloadable)) intel_lfsr(uchar4 seed, uchar4 polynomial);
4326+
4327+
// dnscl (downscale)
4328+
// dnscl bf16 -> i4/fp4 (fp4 is e2m1); bf16 inputs are passed as short2
4329+
uint intel_downscale_as_bf16_i4_mode_0(short2 s0, short2 s1);
4330+
uint intel_downscale_as_bf16_i4_mode_1(short2 s0, short2 s1);
4331+
uint intel_downscale_as_bf16_i4_mode_2(short2 s0, short2 s1);
4332+
uint intel_downscale_as_bf16_i4_mode_3(short2 s0, short2 s1);
4333+
uint intel_downscale_as_bf16_e2m1_mode_0(short2 s0, short2 s1);
4334+
uint intel_downscale_as_bf16_e2m1_mode_1(short2 s0, short2 s1);
4335+
uint intel_downscale_as_bf16_e2m1_mode_2(short2 s0, short2 s1);
4336+
uint intel_downscale_as_bf16_e2m1_mode_3(short2 s0, short2 s1);
4337+
#ifdef cl_khr_fp16
4338+
// dnscl f16 -> i4/fp4 (fp4 is e2m1); f16 inputs are passed as half2
4339+
uint intel_downscale_i4_mode_0(half2 s0, half2 s1);
4340+
uint intel_downscale_i4_mode_1(half2 s0, half2 s1);
4341+
uint intel_downscale_i4_mode_2(half2 s0, half2 s1);
4342+
uint intel_downscale_i4_mode_3(half2 s0, half2 s1);
4343+
uint intel_downscale_e2m1_mode_0(half2 s0, half2 s1);
4344+
uint intel_downscale_e2m1_mode_1(half2 s0, half2 s1);
4345+
uint intel_downscale_e2m1_mode_2(half2 s0, half2 s1);
4346+
uint intel_downscale_e2m1_mode_3(half2 s0, half2 s1);
4347+
#endif // cl_khr_fp16
4348+
// dnscl bf16 -> i4/fp4 stochastic rounding
4349+
uint intel_downscale_as_bf16_i4_mode_0_srnd(short2 s0, short2 s1, ushort2 bias);
4350+
uint intel_downscale_as_bf16_i4_mode_1_srnd(short2 s0, short2 s1, ushort2 bias);
4351+
uint intel_downscale_as_bf16_i4_mode_2_srnd(short2 s0, short2 s1, ushort2 bias);
4352+
uint intel_downscale_as_bf16_i4_mode_3_srnd(short2 s0, short2 s1, ushort2 bias);
4353+
uint intel_downscale_as_bf16_e2m1_mode_0_srnd(short2 s0, short2 s1, ushort2 bias);
4354+
uint intel_downscale_as_bf16_e2m1_mode_1_srnd(short2 s0, short2 s1, ushort2 bias);
4355+
uint intel_downscale_as_bf16_e2m1_mode_2_srnd(short2 s0, short2 s1, ushort2 bias);
4356+
uint intel_downscale_as_bf16_e2m1_mode_3_srnd(short2 s0, short2 s1, ushort2 bias);
4357+
#ifdef cl_khr_fp16
4358+
// dnscl f16 -> i4/fp4 stochastic rounding
4359+
uint intel_downscale_i4_mode_0_srnd(half2 s0, half2 s1, ushort2 bias);
4360+
uint intel_downscale_i4_mode_1_srnd(half2 s0, half2 s1, ushort2 bias);
4361+
uint intel_downscale_i4_mode_2_srnd(half2 s0, half2 s1, ushort2 bias);
4362+
uint intel_downscale_i4_mode_3_srnd(half2 s0, half2 s1, ushort2 bias);
4363+
uint intel_downscale_e2m1_mode_0_srnd(half2 s0, half2 s1, ushort2 bias);
4364+
uint intel_downscale_e2m1_mode_1_srnd(half2 s0, half2 s1, ushort2 bias);
4365+
uint intel_downscale_e2m1_mode_2_srnd(half2 s0, half2 s1, ushort2 bias);
4366+
uint intel_downscale_e2m1_mode_3_srnd(half2 s0, half2 s1, ushort2 bias);
4367+
#endif // cl_khr_fp16
4368+
4369+
// bf16 precision, f32/bf16 acc
4370+
float8 __attribute__((overloadable)) intel_sub_group_bf16_bf16_scaled_matrix_mad_k16(
4371+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4372+
short8 __attribute__((overloadable)) intel_sub_group_bf16_bf16_scaled_matrix_mad_k16(
4373+
short8 a, int8 b, short8 acc, uchar scale_a, uchar scale_b);
4374+
float8 __attribute__((overloadable)) intel_sub_group_bf16_bf16_scaled_matrix_mad_k16_f32(
4375+
short8 a, int8 b, short8 acc, uchar scale_a, uchar scale_b);
4376+
short8 __attribute__((overloadable)) intel_sub_group_bf16_bf16_scaled_matrix_mad_k16_bf16(
4377+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4378+
// f16 precision, f32/f16 acc
4379+
float8 __attribute__((overloadable)) intel_sub_group_f16_f16_scaled_matrix_mad_k16(
4380+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4381+
#if defined(cl_khr_fp16)
4382+
half8 __attribute__((overloadable)) intel_sub_group_f16_f16_scaled_matrix_mad_k16(
4383+
short8 a, int8 b, half8 acc, uchar scale_a, uchar scale_b);
4384+
float8 __attribute__((overloadable)) intel_sub_group_f16_f16_scaled_matrix_mad_k16_f32(
4385+
short8 a, int8 b, half8 acc, uchar scale_a, uchar scale_b);
4386+
half8 __attribute__((overloadable)) intel_sub_group_f16_f16_scaled_matrix_mad_k16_f16(
4387+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4388+
#endif // cl_khr_fp16
4389+
// bf8/hf8 precision, f32/bf16 acc
4390+
float8 __attribute__((overloadable)) intel_sub_group_hf8_hf8_scaled_matrix_mad_k32(
4391+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4392+
short8 __attribute__((overloadable)) intel_sub_group_hf8_hf8_scaled_matrix_mad_k32(
4393+
short8 a, int8 b, short8 acc, uchar scale_a, uchar scale_b);
4394+
float8 __attribute__((overloadable)) intel_sub_group_hf8_hf8_scaled_matrix_mad_k32_f32(
4395+
short8 a, int8 b, short8 acc, uchar scale_a, uchar scale_b);
4396+
short8 __attribute__((overloadable)) intel_sub_group_hf8_hf8_scaled_matrix_mad_k32_bf16(
4397+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4398+
float8 __attribute__((overloadable)) intel_sub_group_bf8_hf8_scaled_matrix_mad_k32(
4399+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4400+
short8 __attribute__((overloadable)) intel_sub_group_bf8_hf8_scaled_matrix_mad_k32(
4401+
short8 a, int8 b, short8 acc, uchar scale_a, uchar scale_b);
4402+
float8 __attribute__((overloadable)) intel_sub_group_bf8_hf8_scaled_matrix_mad_k32_f32(
4403+
short8 a, int8 b, short8 acc, uchar scale_a, uchar scale_b);
4404+
short8 __attribute__((overloadable)) intel_sub_group_bf8_hf8_scaled_matrix_mad_k32_bf16(
4405+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4406+
float8 __attribute__((overloadable)) intel_sub_group_hf8_bf8_scaled_matrix_mad_k32(
4407+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4408+
short8 __attribute__((overloadable)) intel_sub_group_hf8_bf8_scaled_matrix_mad_k32(
4409+
short8 a, int8 b, short8 acc, uchar scale_a, uchar scale_b);
4410+
float8 __attribute__((overloadable)) intel_sub_group_hf8_bf8_scaled_matrix_mad_k32_f32(
4411+
short8 a, int8 b, short8 acc, uchar scale_a, uchar scale_b);
4412+
short8 __attribute__((overloadable)) intel_sub_group_hf8_bf8_scaled_matrix_mad_k32_bf16(
4413+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4414+
float8 __attribute__((overloadable)) intel_sub_group_bf8_bf8_scaled_matrix_mad_k32(
4415+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4416+
short8 __attribute__((overloadable)) intel_sub_group_bf8_bf8_scaled_matrix_mad_k32(
4417+
short8 a, int8 b, short8 acc, uchar scale_a, uchar scale_b);
4418+
float8 __attribute__((overloadable)) intel_sub_group_bf8_bf8_scaled_matrix_mad_k32_f32(
4419+
short8 a, int8 b, short8 acc, uchar scale_a, uchar scale_b);
4420+
short8 __attribute__((overloadable)) intel_sub_group_bf8_bf8_scaled_matrix_mad_k32_bf16(
4421+
short8 a, int8 b, float8 acc, uchar scale_a, uchar scale_b);
4422+
// fp4 (e2m1) precision, f32/bf16 acc; scale_a/scale_b are uchar2 here (uchar in the variants above)
4423+
float8 __attribute__((overloadable)) intel_sub_group_e2m1_e2m1_scaled_matrix_mad_k64(
4424+
short8 a, int8 b, float8 acc, uchar2 scale_a, uchar2 scale_b);
4425+
short8 __attribute__((overloadable)) intel_sub_group_e2m1_e2m1_scaled_matrix_mad_k64(
4426+
short8 a, int8 b, short8 acc, uchar2 scale_a, uchar2 scale_b);
4427+
float8 __attribute__((overloadable)) intel_sub_group_e2m1_e2m1_scaled_matrix_mad_k64_f32(
4428+
short8 a, int8 b, short8 acc, uchar2 scale_a, uchar2 scale_b);
4429+
short8 __attribute__((overloadable)) intel_sub_group_e2m1_e2m1_scaled_matrix_mad_k64_bf16(
4430+
short8 a, int8 b, float8 acc, uchar2 scale_a, uchar2 scale_b);
4431+
4432+
// intel_sub_group_e2m1_e2m1_matrix_mad_k64 (variants without scale_a/scale_b arguments)
4433+
// 8-element vectors: a is short8, acc and result are float8 or short8
4434+
float8 __attribute__((overloadable))
4435+
intel_sub_group_e2m1_e2m1_matrix_mad_k64(short8 a, int8 b, float8 acc);
4436+
short8 __attribute__((overloadable))
4437+
intel_sub_group_e2m1_e2m1_matrix_mad_k64(short8 a, int8 b, short8 acc);
4438+
float8 __attribute__((overloadable))
4439+
intel_sub_group_e2m1_e2m1_matrix_mad_k64_f32(short8 a, int8 b, short8 acc);
4440+
short8 __attribute__((overloadable))
4441+
intel_sub_group_e2m1_e2m1_matrix_mad_k64_bf16(short8 a, int8 b, float8 acc);
4442+
// 4-element vectors: a is short4, acc and result are float4 or short4
4443+
float4 __attribute__((overloadable))
4444+
intel_sub_group_e2m1_e2m1_matrix_mad_k64(short4 a, int8 b, float4 acc);
4445+
short4 __attribute__((overloadable))
4446+
intel_sub_group_e2m1_e2m1_matrix_mad_k64(short4 a, int8 b, short4 acc);
4447+
float4 __attribute__((overloadable))
4448+
intel_sub_group_e2m1_e2m1_matrix_mad_k64_f32(short4 a, int8 b, short4 acc);
4449+
short4 __attribute__((overloadable))
4450+
intel_sub_group_e2m1_e2m1_matrix_mad_k64_bf16(short4 a, int8 b, float4 acc);
4451+
// 2-element vectors: a is short2, acc and result are float2 or short2
4452+
float2 __attribute__((overloadable))
4453+
intel_sub_group_e2m1_e2m1_matrix_mad_k64(short2 a, int8 b, float2 acc);
4454+
short2 __attribute__((overloadable))
4455+
intel_sub_group_e2m1_e2m1_matrix_mad_k64(short2 a, int8 b, short2 acc);
4456+
float2 __attribute__((overloadable))
4457+
intel_sub_group_e2m1_e2m1_matrix_mad_k64_f32(short2 a, int8 b, short2 acc);
4458+
short2 __attribute__((overloadable))
4459+
intel_sub_group_e2m1_e2m1_matrix_mad_k64_bf16(short2 a, int8 b, float2 acc);
4460+
// scalar
4461+
float __attribute__((overloadable))
4462+
intel_sub_group_e2m1_e2m1_matrix_mad_k64(short a, int8 b, float acc);
4463+
short __attribute__((overloadable))
4464+
intel_sub_group_e2m1_e2m1_matrix_mad_k64(short a, int8 b, short acc);
4465+
float __attribute__((overloadable))
4466+
intel_sub_group_e2m1_e2m1_matrix_mad_k64_f32(short a, int8 b, short acc);
4467+
short __attribute__((overloadable))
4468+
intel_sub_group_e2m1_e2m1_matrix_mad_k64_bf16(short a, int8 b, float acc);

0 commit comments

Comments
 (0)