@@ -4191,3 +4191,278 @@ void intel_manageable_barrier_arrivewait(manageable_barrier_t* BData);
41914191void intel_manageable_barrier_arrivedrop (manageable_barrier_t * BData );
41924192
41934193void region_barrier (uint SubRegionSize );
4194+
4195+ // FP4 (e2m1) / INT4 conversion functions: 4-bit sources converted to bfloat8, hfloat8, bfloat16 and half
4196+ // Int4 sources are passed as char / charN
4197+ // Int4 -> Bfloat8: uchar results of the same vector width; "_packed" forms return ushort (names suggest two values per source char - TODO confirm)
4198+ uchar intel_convert_as_i4_bfloat8_as_uchar (char source );
4199+ uchar2 intel_convert_as_i42_bfloat82_as_uchar2 (char2 source );
4200+ uchar3 intel_convert_as_i43_bfloat83_as_uchar3 (char3 source );
4201+ uchar4 intel_convert_as_i44_bfloat84_as_uchar4 (char4 source );
4202+ uchar8 intel_convert_as_i48_bfloat88_as_uchar8 (char8 source );
4203+ uchar16 intel_convert_as_i416_bfloat816_as_uchar16 (char16 source );
4204+
4205+ ushort intel_convert_as_i42_bfloat82_as_ushort_packed (char source );
4206+ ushort2 intel_convert_as_i44_bfloat84_as_ushort2_packed (char2 source );
4207+ ushort3 intel_convert_as_i46_bfloat86_as_ushort3_packed (char3 source );
4208+ ushort4 intel_convert_as_i48_bfloat88_as_ushort4_packed (char4 source );
4209+ ushort8 intel_convert_as_i416_bfloat816_as_ushort8_packed (char8 source );
4210+ ushort16 intel_convert_as_i432_bfloat832_as_ushort16_packed (char16 source );
4211+
4212+ // Int4 -> Hfloat8: uchar results of the same vector width; "_packed" forms return ushort (names suggest two values per source char - TODO confirm)
4213+ uchar intel_convert_as_i4_hfloat8_as_uchar (char source );
4214+ uchar2 intel_convert_as_i42_hfloat82_as_uchar2 (char2 source );
4215+ uchar3 intel_convert_as_i43_hfloat83_as_uchar3 (char3 source );
4216+ uchar4 intel_convert_as_i44_hfloat84_as_uchar4 (char4 source );
4217+ uchar8 intel_convert_as_i48_hfloat88_as_uchar8 (char8 source );
4218+ uchar16 intel_convert_as_i416_hfloat816_as_uchar16 (char16 source );
4219+
4220+ ushort intel_convert_as_i42_hfloat82_as_ushort_packed (char source );
4221+ ushort2 intel_convert_as_i44_hfloat84_as_ushort2_packed (char2 source );
4222+ ushort3 intel_convert_as_i46_hfloat86_as_ushort3_packed (char3 source );
4223+ ushort4 intel_convert_as_i48_hfloat88_as_ushort4_packed (char4 source );
4224+ ushort8 intel_convert_as_i416_hfloat816_as_ushort8_packed (char8 source );
4225+ ushort16 intel_convert_as_i432_hfloat832_as_ushort16_packed (char16 source );
4226+
4227+ // Int4 -> Bfloat16: ushort results of the same vector width; "_packed" forms return uint (names suggest two values per source char - TODO confirm)
4228+ ushort intel_convert_as_i4_bfloat16_as_ushort (char source );
4229+ ushort2 intel_convert_as_i42_bfloat162_as_ushort2 (char2 source );
4230+ ushort3 intel_convert_as_i43_bfloat163_as_ushort3 (char3 source );
4231+ ushort4 intel_convert_as_i44_bfloat164_as_ushort4 (char4 source );
4232+ ushort8 intel_convert_as_i48_bfloat168_as_ushort8 (char8 source );
4233+ ushort16 intel_convert_as_i416_bfloat1616_as_ushort16 (char16 source );
4234+
4235+ uint intel_convert_as_i42_bfloat162_as_uint_packed (char source );
4236+ uint2 intel_convert_as_i44_bfloat164_as_uint2_packed (char2 source );
4237+ uint3 intel_convert_as_i46_bfloat166_as_uint3_packed (char3 source );
4238+ uint4 intel_convert_as_i48_bfloat168_as_uint4_packed (char4 source );
4239+ uint8 intel_convert_as_i416_bfloat1616_as_uint8_packed (char8 source );
4240+ uint16 intel_convert_as_i432_bfloat1632_as_uint16_packed (char16 source );
4241+
4242+ #ifdef cl_khr_fp16
4243+ // Int4 -> Half: declared only when cl_khr_fp16 is defined; half results of the same vector width, "_packed" forms return uint (names suggest two values per source char - TODO confirm)
4244+ half intel_convert_as_i4_half (char source );
4245+ half2 intel_convert_as_i42_half2 (char2 source );
4246+ half3 intel_convert_as_i43_half3 (char3 source );
4247+ half4 intel_convert_as_i44_half4 (char4 source );
4248+ half8 intel_convert_as_i48_half8 (char8 source );
4249+ half16 intel_convert_as_i416_half16 (char16 source );
4250+
4251+ uint intel_convert_as_i42_half2_as_uint_packed (char source );
4252+ uint2 intel_convert_as_i44_half4_as_uint2_packed (char2 source );
4253+ uint3 intel_convert_as_i46_half6_as_uint3_packed (char3 source );
4254+ uint4 intel_convert_as_i48_half8_as_uint4_packed (char4 source );
4255+ uint8 intel_convert_as_i416_half16_as_uint8_packed (char8 source );
4256+ uint16 intel_convert_as_i432_half32_as_uint16_packed (char16 source );
4257+ #endif // cl_khr_fp16
4258+
4259+ // e2m1 (FP4) sources are passed as uchar / ucharN
4260+ // e2m1 -> Bfloat8: uchar results of the same vector width; "_packed" forms return ushort (names suggest two values per source uchar - TODO confirm)
4261+ uchar intel_convert_as_e2m1_bfloat8_as_uchar (uchar source );
4262+ uchar2 intel_convert_as_e2m12_bfloat82_as_uchar2 (uchar2 source );
4263+ uchar3 intel_convert_as_e2m13_bfloat83_as_uchar3 (uchar3 source );
4264+ uchar4 intel_convert_as_e2m14_bfloat84_as_uchar4 (uchar4 source );
4265+ uchar8 intel_convert_as_e2m18_bfloat88_as_uchar8 (uchar8 source );
4266+ uchar16 intel_convert_as_e2m116_bfloat816_as_uchar16 (uchar16 source );
4267+
4268+ ushort intel_convert_as_e2m12_bfloat82_as_ushort_packed (uchar source );
4269+ ushort2 intel_convert_as_e2m14_bfloat84_as_ushort2_packed (uchar2 source );
4270+ ushort3 intel_convert_as_e2m16_bfloat86_as_ushort3_packed (uchar3 source );
4271+ ushort4 intel_convert_as_e2m18_bfloat88_as_ushort4_packed (uchar4 source );
4272+ ushort8 intel_convert_as_e2m116_bfloat816_as_ushort8_packed (uchar8 source );
4273+ ushort16 intel_convert_as_e2m132_bfloat832_as_ushort16_packed (uchar16 source );
4274+
4275+ // e2m1 -> Hfloat8: uchar results of the same vector width; "_packed" forms return ushort (names suggest two values per source uchar - TODO confirm)
4276+ uchar intel_convert_as_e2m1_hfloat8_as_uchar (uchar source );
4277+ uchar2 intel_convert_as_e2m12_hfloat82_as_uchar2 (uchar2 source );
4278+ uchar3 intel_convert_as_e2m13_hfloat83_as_uchar3 (uchar3 source );
4279+ uchar4 intel_convert_as_e2m14_hfloat84_as_uchar4 (uchar4 source );
4280+ uchar8 intel_convert_as_e2m18_hfloat88_as_uchar8 (uchar8 source );
4281+ uchar16 intel_convert_as_e2m116_hfloat816_as_uchar16 (uchar16 source );
4282+
4283+ ushort intel_convert_as_e2m12_hfloat82_as_ushort_packed (uchar source );
4284+ ushort2 intel_convert_as_e2m14_hfloat84_as_ushort2_packed (uchar2 source );
4285+ ushort3 intel_convert_as_e2m16_hfloat86_as_ushort3_packed (uchar3 source );
4286+ ushort4 intel_convert_as_e2m18_hfloat88_as_ushort4_packed (uchar4 source );
4287+ ushort8 intel_convert_as_e2m116_hfloat816_as_ushort8_packed (uchar8 source );
4288+ ushort16 intel_convert_as_e2m132_hfloat832_as_ushort16_packed (uchar16 source );
4289+
4290+ // e2m1 -> Bfloat16: ushort results of the same vector width; "_packed" forms return uint (names suggest two values per source uchar - TODO confirm)
4291+ ushort intel_convert_as_e2m1_bfloat16_as_ushort (uchar source );
4292+ ushort2 intel_convert_as_e2m12_bfloat162_as_ushort2 (uchar2 source );
4293+ ushort3 intel_convert_as_e2m13_bfloat163_as_ushort3 (uchar3 source );
4294+ ushort4 intel_convert_as_e2m14_bfloat164_as_ushort4 (uchar4 source );
4295+ ushort8 intel_convert_as_e2m18_bfloat168_as_ushort8 (uchar8 source );
4296+ ushort16 intel_convert_as_e2m116_bfloat1616_as_ushort16 (uchar16 source );
4297+
4298+ uint intel_convert_as_e2m12_bfloat162_as_uint_packed (uchar source );
4299+ uint2 intel_convert_as_e2m14_bfloat164_as_uint2_packed (uchar2 source );
4300+ uint3 intel_convert_as_e2m16_bfloat166_as_uint3_packed (uchar3 source );
4301+ uint4 intel_convert_as_e2m18_bfloat168_as_uint4_packed (uchar4 source );
4302+ uint8 intel_convert_as_e2m116_bfloat1616_as_uint8_packed (uchar8 source );
4303+ uint16 intel_convert_as_e2m132_bfloat1632_as_uint16_packed (uchar16 source );
4304+
4305+ #ifdef cl_khr_fp16
4306+ // e2m1 -> Half: declared only when cl_khr_fp16 is defined; half results of the same vector width, "_packed" forms return uint (names suggest two values per source uchar - TODO confirm)
4307+ half intel_convert_as_e2m1_half (uchar source );
4308+ half2 intel_convert_as_e2m12_half2 (uchar2 source );
4309+ half3 intel_convert_as_e2m13_half3 (uchar3 source );
4310+ half4 intel_convert_as_e2m14_half4 (uchar4 source );
4311+ half8 intel_convert_as_e2m18_half8 (uchar8 source );
4312+ half16 intel_convert_as_e2m116_half16 (uchar16 source );
4313+
4314+ uint intel_convert_as_e2m12_half2_as_uint_packed (uchar source );
4315+ uint2 intel_convert_as_e2m14_half4_as_uint2_packed (uchar2 source );
4316+ uint3 intel_convert_as_e2m16_half6_as_uint3_packed (uchar3 source );
4317+ uint4 intel_convert_as_e2m18_half8_as_uint4_packed (uchar4 source );
4318+ uint8 intel_convert_as_e2m116_half16_as_uint8_packed (uchar8 source );
4319+ uint16 intel_convert_as_e2m132_half32_as_uint16_packed (uchar16 source );
4320+ #endif // cl_khr_fp16
4321+
4322+ // lfsr: overloads on uint, ushort2 and uchar4, each taking a seed and a polynomial of the same type (NOTE(review): presumably a linear-feedback shift register step - confirm)
4323+ uint __attribute__((overloadable )) intel_lfsr (uint seed , uint polynomial );
4324+ ushort2 __attribute__((overloadable )) intel_lfsr (ushort2 seed , ushort2 polynomial );
4325+ uchar4 __attribute__((overloadable )) intel_lfsr (uchar4 seed , uchar4 polynomial );
4326+
4327+ // dnscl: downscale functions; each takes two 2-element sources, returns uint, and is declared once per mode 0..3
4328+ // dnscl bf16 -> i4/fp4 (e2m1): bf16 operands are passed as short2
4329+ uint intel_downscale_as_bf16_i4_mode_0 (short2 s0 , short2 s1 );
4330+ uint intel_downscale_as_bf16_i4_mode_1 (short2 s0 , short2 s1 );
4331+ uint intel_downscale_as_bf16_i4_mode_2 (short2 s0 , short2 s1 );
4332+ uint intel_downscale_as_bf16_i4_mode_3 (short2 s0 , short2 s1 );
4333+ uint intel_downscale_as_bf16_e2m1_mode_0 (short2 s0 , short2 s1 );
4334+ uint intel_downscale_as_bf16_e2m1_mode_1 (short2 s0 , short2 s1 );
4335+ uint intel_downscale_as_bf16_e2m1_mode_2 (short2 s0 , short2 s1 );
4336+ uint intel_downscale_as_bf16_e2m1_mode_3 (short2 s0 , short2 s1 );
4337+ #ifdef cl_khr_fp16
4338+ // dnscl f16 -> i4/fp4 (e2m1): half2 operands, declared only when cl_khr_fp16 is defined; one function per mode 0..3
4339+ uint intel_downscale_i4_mode_0 (half2 s0 , half2 s1 );
4340+ uint intel_downscale_i4_mode_1 (half2 s0 , half2 s1 );
4341+ uint intel_downscale_i4_mode_2 (half2 s0 , half2 s1 );
4342+ uint intel_downscale_i4_mode_3 (half2 s0 , half2 s1 );
4343+ uint intel_downscale_e2m1_mode_0 (half2 s0 , half2 s1 );
4344+ uint intel_downscale_e2m1_mode_1 (half2 s0 , half2 s1 );
4345+ uint intel_downscale_e2m1_mode_2 (half2 s0 , half2 s1 );
4346+ uint intel_downscale_e2m1_mode_3 (half2 s0 , half2 s1 );
4347+ #endif // cl_khr_fp16
4348+ // dnscl bf16 -> i4/fp4 (e2m1) with stochastic rounding: "_srnd" forms take short2 operands plus an additional ushort2 bias
4349+ uint intel_downscale_as_bf16_i4_mode_0_srnd (short2 s0 , short2 s1 , ushort2 bias );
4350+ uint intel_downscale_as_bf16_i4_mode_1_srnd (short2 s0 , short2 s1 , ushort2 bias );
4351+ uint intel_downscale_as_bf16_i4_mode_2_srnd (short2 s0 , short2 s1 , ushort2 bias );
4352+ uint intel_downscale_as_bf16_i4_mode_3_srnd (short2 s0 , short2 s1 , ushort2 bias );
4353+ uint intel_downscale_as_bf16_e2m1_mode_0_srnd (short2 s0 , short2 s1 , ushort2 bias );
4354+ uint intel_downscale_as_bf16_e2m1_mode_1_srnd (short2 s0 , short2 s1 , ushort2 bias );
4355+ uint intel_downscale_as_bf16_e2m1_mode_2_srnd (short2 s0 , short2 s1 , ushort2 bias );
4356+ uint intel_downscale_as_bf16_e2m1_mode_3_srnd (short2 s0 , short2 s1 , ushort2 bias );
4357+ #ifdef cl_khr_fp16
4358+ // dnscl f16 -> i4/fp4 (e2m1) with stochastic rounding: half2 operands plus a ushort2 bias; declared only when cl_khr_fp16 is defined
4359+ uint intel_downscale_i4_mode_0_srnd (half2 s0 , half2 s1 , ushort2 bias );
4360+ uint intel_downscale_i4_mode_1_srnd (half2 s0 , half2 s1 , ushort2 bias );
4361+ uint intel_downscale_i4_mode_2_srnd (half2 s0 , half2 s1 , ushort2 bias );
4362+ uint intel_downscale_i4_mode_3_srnd (half2 s0 , half2 s1 , ushort2 bias );
4363+ uint intel_downscale_e2m1_mode_0_srnd (half2 s0 , half2 s1 , ushort2 bias );
4364+ uint intel_downscale_e2m1_mode_1_srnd (half2 s0 , half2 s1 , ushort2 bias );
4365+ uint intel_downscale_e2m1_mode_2_srnd (half2 s0 , half2 s1 , ushort2 bias );
4366+ uint intel_downscale_e2m1_mode_3_srnd (half2 s0 , half2 s1 , ushort2 bias );
4367+ #endif // cl_khr_fp16
4368+
4369+ // bf16 precision, f32/bf16 acc: the plain overloads return the same type as acc (float8 or short8); the "_f32" / "_bf16" forms return float8 / short8 from the other acc type
4370+ float8 __attribute__((overloadable )) intel_sub_group_bf16_bf16_scaled_matrix_mad_k16 (
4371+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4372+ short8 __attribute__((overloadable )) intel_sub_group_bf16_bf16_scaled_matrix_mad_k16 (
4373+ short8 a , int8 b , short8 acc , uchar scale_a , uchar scale_b );
4374+ float8 __attribute__((overloadable )) intel_sub_group_bf16_bf16_scaled_matrix_mad_k16_f32 (
4375+ short8 a , int8 b , short8 acc , uchar scale_a , uchar scale_b );
4376+ short8 __attribute__((overloadable )) intel_sub_group_bf16_bf16_scaled_matrix_mad_k16_bf16 (
4377+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4378+ // f16 precision, f32/f16 acc: the float8-accumulator overload is declared unconditionally; the forms that use half8 are declared only when cl_khr_fp16 is defined
4379+ float8 __attribute__((overloadable )) intel_sub_group_f16_f16_scaled_matrix_mad_k16 (
4380+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4381+ #if defined(cl_khr_fp16 )
4382+ half8 __attribute__((overloadable )) intel_sub_group_f16_f16_scaled_matrix_mad_k16 (
4383+ short8 a , int8 b , half8 acc , uchar scale_a , uchar scale_b );
4384+ float8 __attribute__((overloadable )) intel_sub_group_f16_f16_scaled_matrix_mad_k16_f32 (
4385+ short8 a , int8 b , half8 acc , uchar scale_a , uchar scale_b );
4386+ half8 __attribute__((overloadable )) intel_sub_group_f16_f16_scaled_matrix_mad_k16_f16 (
4387+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4388+ #endif // cl_khr_fp16
4389+ // bf8/hf8 precision, f32/bf16 acc: k32 forms for all four hf8/bf8 operand pairings, each with plain, "_f32" and "_bf16" variants and uchar scales
4390+ float8 __attribute__((overloadable )) intel_sub_group_hf8_hf8_scaled_matrix_mad_k32 (
4391+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4392+ short8 __attribute__((overloadable )) intel_sub_group_hf8_hf8_scaled_matrix_mad_k32 (
4393+ short8 a , int8 b , short8 acc , uchar scale_a , uchar scale_b );
4394+ float8 __attribute__((overloadable )) intel_sub_group_hf8_hf8_scaled_matrix_mad_k32_f32 (
4395+ short8 a , int8 b , short8 acc , uchar scale_a , uchar scale_b );
4396+ short8 __attribute__((overloadable )) intel_sub_group_hf8_hf8_scaled_matrix_mad_k32_bf16 (
4397+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4398+ float8 __attribute__((overloadable )) intel_sub_group_bf8_hf8_scaled_matrix_mad_k32 (
4399+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4400+ short8 __attribute__((overloadable )) intel_sub_group_bf8_hf8_scaled_matrix_mad_k32 (
4401+ short8 a , int8 b , short8 acc , uchar scale_a , uchar scale_b );
4402+ float8 __attribute__((overloadable )) intel_sub_group_bf8_hf8_scaled_matrix_mad_k32_f32 (
4403+ short8 a , int8 b , short8 acc , uchar scale_a , uchar scale_b );
4404+ short8 __attribute__((overloadable )) intel_sub_group_bf8_hf8_scaled_matrix_mad_k32_bf16 (
4405+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4406+ float8 __attribute__((overloadable )) intel_sub_group_hf8_bf8_scaled_matrix_mad_k32 (
4407+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4408+ short8 __attribute__((overloadable )) intel_sub_group_hf8_bf8_scaled_matrix_mad_k32 (
4409+ short8 a , int8 b , short8 acc , uchar scale_a , uchar scale_b );
4410+ float8 __attribute__((overloadable )) intel_sub_group_hf8_bf8_scaled_matrix_mad_k32_f32 (
4411+ short8 a , int8 b , short8 acc , uchar scale_a , uchar scale_b );
4412+ short8 __attribute__((overloadable )) intel_sub_group_hf8_bf8_scaled_matrix_mad_k32_bf16 (
4413+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4414+ float8 __attribute__((overloadable )) intel_sub_group_bf8_bf8_scaled_matrix_mad_k32 (
4415+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4416+ short8 __attribute__((overloadable )) intel_sub_group_bf8_bf8_scaled_matrix_mad_k32 (
4417+ short8 a , int8 b , short8 acc , uchar scale_a , uchar scale_b );
4418+ float8 __attribute__((overloadable )) intel_sub_group_bf8_bf8_scaled_matrix_mad_k32_f32 (
4419+ short8 a , int8 b , short8 acc , uchar scale_a , uchar scale_b );
4420+ short8 __attribute__((overloadable )) intel_sub_group_bf8_bf8_scaled_matrix_mad_k32_bf16 (
4421+ short8 a , int8 b , float8 acc , uchar scale_a , uchar scale_b );
4422+ // fp4 (e2m1) precision, f32/bf16 acc: k64 forms with plain, "_f32" and "_bf16" variants; scale_a / scale_b are uchar2 in this group
4423+ float8 __attribute__((overloadable )) intel_sub_group_e2m1_e2m1_scaled_matrix_mad_k64 (
4424+ short8 a , int8 b , float8 acc , uchar2 scale_a , uchar2 scale_b );
4425+ short8 __attribute__((overloadable )) intel_sub_group_e2m1_e2m1_scaled_matrix_mad_k64 (
4426+ short8 a , int8 b , short8 acc , uchar2 scale_a , uchar2 scale_b );
4427+ float8 __attribute__((overloadable )) intel_sub_group_e2m1_e2m1_scaled_matrix_mad_k64_f32 (
4428+ short8 a , int8 b , short8 acc , uchar2 scale_a , uchar2 scale_b );
4429+ short8 __attribute__((overloadable )) intel_sub_group_e2m1_e2m1_scaled_matrix_mad_k64_bf16 (
4430+ short8 a , int8 b , float8 acc , uchar2 scale_a , uchar2 scale_b );
4431+
4432+ // intel_sub_group_e2m1_e2m1_matrix_mad_k64: declared without scale arguments; b is int8 in every overload, while a and acc vary in width (8, 4, 2, scalar)
4433+ // 8: a and acc are 8-element vectors (short8 with float8 or short8)
4434+ float8 __attribute__((overloadable ))
4435+ intel_sub_group_e2m1_e2m1_matrix_mad_k64 (short8 a , int8 b , float8 acc );
4436+ short8 __attribute__((overloadable ))
4437+ intel_sub_group_e2m1_e2m1_matrix_mad_k64 (short8 a , int8 b , short8 acc );
4438+ float8 __attribute__((overloadable ))
4439+ intel_sub_group_e2m1_e2m1_matrix_mad_k64_f32 (short8 a , int8 b , short8 acc );
4440+ short8 __attribute__((overloadable ))
4441+ intel_sub_group_e2m1_e2m1_matrix_mad_k64_bf16 (short8 a , int8 b , float8 acc );
4442+ // 4: a and acc are 4-element vectors (short4 with float4 or short4)
4443+ float4 __attribute__((overloadable ))
4444+ intel_sub_group_e2m1_e2m1_matrix_mad_k64 (short4 a , int8 b , float4 acc );
4445+ short4 __attribute__((overloadable ))
4446+ intel_sub_group_e2m1_e2m1_matrix_mad_k64 (short4 a , int8 b , short4 acc );
4447+ float4 __attribute__((overloadable ))
4448+ intel_sub_group_e2m1_e2m1_matrix_mad_k64_f32 (short4 a , int8 b , short4 acc );
4449+ short4 __attribute__((overloadable ))
4450+ intel_sub_group_e2m1_e2m1_matrix_mad_k64_bf16 (short4 a , int8 b , float4 acc );
4451+ // 2: a and acc are 2-element vectors (short2 with float2 or short2)
4452+ float2 __attribute__((overloadable ))
4453+ intel_sub_group_e2m1_e2m1_matrix_mad_k64 (short2 a , int8 b , float2 acc );
4454+ short2 __attribute__((overloadable ))
4455+ intel_sub_group_e2m1_e2m1_matrix_mad_k64 (short2 a , int8 b , short2 acc );
4456+ float2 __attribute__((overloadable ))
4457+ intel_sub_group_e2m1_e2m1_matrix_mad_k64_f32 (short2 a , int8 b , short2 acc );
4458+ short2 __attribute__((overloadable ))
4459+ intel_sub_group_e2m1_e2m1_matrix_mad_k64_bf16 (short2 a , int8 b , float2 acc );
4460+ // scalar: a and acc are scalars (short with float or short)
4461+ float __attribute__((overloadable ))
4462+ intel_sub_group_e2m1_e2m1_matrix_mad_k64 (short a , int8 b , float acc );
4463+ short __attribute__((overloadable ))
4464+ intel_sub_group_e2m1_e2m1_matrix_mad_k64 (short a , int8 b , short acc );
4465+ float __attribute__((overloadable ))
4466+ intel_sub_group_e2m1_e2m1_matrix_mad_k64_f32 (short a , int8 b , short acc );
4467+ short __attribute__((overloadable ))
4468+ intel_sub_group_e2m1_e2m1_matrix_mad_k64_bf16 (short a , int8 b , float acc );
0 commit comments