Compare commits
1 Commits
amd-stagin
...
V_MFMA
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c4de17627b |
@@ -133,35 +133,45 @@ void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
|
||||
}
|
||||
|
||||
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
|
||||
// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
|
||||
// CHECK-GFX908: [[s1:%[0-9]+]] = bitcast <2 x i16> %a to <2 x bfloat>
|
||||
// CHECK-GFX908-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %b to <2 x bfloat>
|
||||
// CHECK-GFX908-NEXT: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], <32 x float> %c, i32 0, i32 0, i32 0)
|
||||
void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
|
||||
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
|
||||
// CHECK-GFX908: [[s1:%[0-9]+]] = bitcast <2 x i16> %a to <2 x bfloat>
|
||||
// CHECK-GFX908-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %b to <2 x bfloat>
|
||||
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], <16 x float> %c, i32 0, i32 0, i32 0)
|
||||
void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
|
||||
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
|
||||
// CHECK-GFX908: [[s1:%[0-9]+]] = bitcast <2 x i16> %a to <2 x bfloat>
|
||||
// CHECK-GFX908-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %b to <2 x bfloat>
|
||||
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], <4 x float> %c, i32 0, i32 0, i32 0)
|
||||
void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
|
||||
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
|
||||
// CHECK-GFX908: [[s1:%[0-9]+]] = bitcast <2 x i16> %a to <2 x bfloat>
|
||||
// CHECK-GFX908-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %b to <2 x bfloat>
|
||||
// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], <16 x float> %c, i32 0, i32 0, i32 0)
|
||||
void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
|
||||
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
|
||||
// CHECK-GFX908: [[s1:%[0-9]+]] = bitcast <2 x i16> %a to <2 x bfloat>
|
||||
// CHECK-GFX908-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %b to <2 x bfloat>
|
||||
// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], <4 x float> %c, i32 0, i32 0, i32 0)
|
||||
void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
|
||||
@@ -172,35 +182,45 @@ void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
|
||||
#ifdef MFMA_GFX90A_TESTS
|
||||
|
||||
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
|
||||
// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
|
||||
// CHECK-GFX90A: [[s1:%[0-9]+]] = bitcast <4 x i16> %a to <4 x bfloat>
|
||||
// CHECK-GFX90A-NEXT: [[s2:%[0-9]+]] = bitcast <4 x i16> %b to <4 x bfloat>
|
||||
// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat> [[s1]], <4 x bfloat> [[s2]], <32 x float> %c, i32 0, i32 0, i32 0)
|
||||
void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
|
||||
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
|
||||
// CHECK-GFX90A: [[s1:%[0-9]+]] = bitcast <4 x i16> %a to <4 x bfloat>
|
||||
// CHECK-GFX90A-NEXT: [[s2:%[0-9]+]] = bitcast <4 x i16> %b to <4 x bfloat>
|
||||
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat> [[s1]], <4 x bfloat> [[s2]], <16 x float> %c, i32 0, i32 0, i32 0)
|
||||
void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
|
||||
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
|
||||
// CHECK-GFX90A: [[s1:%[0-9]+]] = bitcast <4 x i16> %a to <4 x bfloat>
|
||||
// CHECK-GFX90A-NEXT: [[s2:%[0-9]+]] = bitcast <4 x i16> %b to <4 x bfloat>
|
||||
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat> [[s1]], <4 x bfloat> [[s2]], <4 x float> %c, i32 0, i32 0, i32 0)
|
||||
void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
|
||||
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
|
||||
// CHECK-GFX90A: [[s1:%[0-9]+]] = bitcast <4 x i16> %a to <4 x bfloat>
|
||||
// CHECK-GFX90A-NEXT: [[s2:%[0-9]+]] = bitcast <4 x i16> %b to <4 x bfloat>
|
||||
// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat> [[s1]], <4 x bfloat> [[s2]], <16 x float> %c, i32 0, i32 0, i32 0)
|
||||
void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
|
||||
}
|
||||
|
||||
// CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
|
||||
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
|
||||
// CHECK-GFX90A: [[s1:%[0-9]+]] = bitcast <4 x i16> %a to <4 x bfloat>
|
||||
// CHECK-GFX90A-NEXT: [[s2:%[0-9]+]] = bitcast <4 x i16> %b to <4 x bfloat>
|
||||
// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat> [[s1]], <4 x bfloat> [[s2]], <4 x float> %c, i32 0, i32 0, i32 0)
|
||||
void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
|
||||
{
|
||||
*out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
|
||||
|
||||
@@ -3027,11 +3027,11 @@ def int_amdgcn_mfma_i32_16x16x4i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i
|
||||
def int_amdgcn_mfma_i32_4x4x4i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>;
|
||||
def int_amdgcn_mfma_i32_32x32x8i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
|
||||
def int_amdgcn_mfma_i32_16x16x16i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>;
|
||||
def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>;
|
||||
def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
|
||||
def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
|
||||
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
|
||||
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
|
||||
def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2bf16_ty>;
|
||||
def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2bf16_ty>;
|
||||
def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2bf16_ty>;
|
||||
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2bf16_ty>;
|
||||
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2bf16_ty>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// gfx90a intrinsics
|
||||
@@ -3043,11 +3043,11 @@ def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
|
||||
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
|
||||
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
|
||||
|
||||
def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
|
||||
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
|
||||
def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
|
||||
def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
|
||||
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
|
||||
def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4bf16_ty>;
|
||||
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4bf16_ty>;
|
||||
def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4bf16_ty>;
|
||||
def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4bf16_ty>;
|
||||
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4bf16_ty>;
|
||||
|
||||
// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA.
|
||||
// Three bits corresponding to the neg modifier applied to the respective
|
||||
|
||||
@@ -2549,8 +2549,11 @@ def VOP_V4F32_V4F16_V4F16_V4F32 : VOPProfile <[v4f32, v4f16, v4f16, v4f32]>;
|
||||
def VOP_V16F32_V4F16_V4F16_V16F32 : VOPProfile <[v16f32, v4f16, v4f16, v16f32]>;
|
||||
def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32f32, v4f16, v4f16, v32f32]>;
|
||||
def VOP_V4F32_V2I16_V2I16_V4F32 : VOPProfile <[v4f32, v2i16, v2i16, v4f32]>;
|
||||
def VOP_V4F32_V2BF16_V2BF16_V4F32 : VOPProfile <[v4f32, v2bf16, v2bf16, v4f32]>;
|
||||
def VOP_V16F32_V2I16_V2I16_V16F32 : VOPProfile <[v16f32, v2i16, v2i16, v16f32]>;
|
||||
def VOP_V16F32_V2BF16_V2BF16_V16F32 : VOPProfile <[v16f32, v2bf16, v2bf16, v16f32]>;
|
||||
def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32f32, v2i16, v2i16, v32f32]>;
|
||||
def VOP_V32F32_V2BF16_V2BF16_V32F32 : VOPProfile <[v32f32, v2bf16, v2bf16, v32f32]>;
|
||||
def VOP_V4I32_I32_I32_V4I32 : VOPProfile <[v4i32, i32, i32, v4i32]>;
|
||||
def VOP_V16I32_I32_I32_V16I32 : VOPProfile <[v16i32, i32, i32, v16i32]>;
|
||||
def VOP_V32I32_I32_I32_V32I32 : VOPProfile <[v32i32, i32, i32, v32i32]>;
|
||||
@@ -2562,8 +2565,11 @@ def VOP_V2F32_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, v2f32]>;
|
||||
def VOP_V2F32_V2F32_V2F32 : VOPProfile <[v2f32, v2f32, v2f32, untyped]>;
|
||||
def VOP_V2I32_V2I32_V2I32 : VOPProfile <[v2i32, v2i32, v2i32, untyped]>;
|
||||
def VOP_V4F32_V4I16_V4I16_V4F32 : VOPProfile <[v4f32, v4i16, v4i16, v4f32]>;
|
||||
def VOP_V4F32_V4BF16_V4BF16_V4F32 : VOPProfile <[v4f32, v4bf16, v4bf16, v4f32]>;
|
||||
def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>;
|
||||
def VOP_V16F32_V4BF16_V4BF16_V16F32 : VOPProfile <[v16f32, v4bf16, v4bf16, v16f32]>;
|
||||
def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>;
|
||||
def VOP_V32F32_V4BF16_V4BF16_V32F32 : VOPProfile <[v32f32, v4bf16, v4bf16, v32f32]>;
|
||||
|
||||
def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>;
|
||||
def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>;
|
||||
|
||||
@@ -573,15 +573,15 @@ def VOPProfileMAI_F32_F32_X32 : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, A
|
||||
def VOPProfileMAI_I32_I32_X4 : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32, AISrc_128_b32, ADst_128>;
|
||||
def VOPProfileMAI_I32_I32_X16 : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32, AISrc_512_b32, ADst_512>;
|
||||
def VOPProfileMAI_I32_I32_X32 : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32, AISrc_1024_b32, ADst_1024>;
|
||||
def VOPProfileMAI_F32_V2I16_X4 : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32, AISrc_128_b32, ADst_128>;
|
||||
def VOPProfileMAI_F32_V2I16_X16 : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, AISrc_512_b32, ADst_512>;
|
||||
def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, AISrc_1024_b32, ADst_1024>;
|
||||
def VOPProfileMAI_F32_V2BF16_X4 : VOPProfileMAI<VOP_V4F32_V2BF16_V2BF16_V4F32,AISrc_128_b32, ADst_128>;
|
||||
def VOPProfileMAI_F32_V2BF16_X16 : VOPProfileMAI<VOP_V16F32_V2BF16_V2BF16_V16F32, AISrc_512_b32, ADst_512>;
|
||||
def VOPProfileMAI_F32_V2BF16_X32 : VOPProfileMAI<VOP_V32F32_V2BF16_V2BF16_V32F32, AISrc_1024_b32, ADst_1024>;
|
||||
def VOPProfileMAI_F32_V4F16_X4 : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4I16_X4 : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4BF16_X4 : VOPProfileMAI<VOP_V4F32_V4BF16_V4BF16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4BF16_X16 : VOPProfileMAI<VOP_V16F32_V4BF16_V4BF16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4BF16_X32 : VOPProfileMAI<VOP_V32F32_V4BF16_V4BF16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
|
||||
def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, AISrc_256_f64, ADst_256, AVSrc_64>;
|
||||
def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI<VOP_F64_F64_F64_F64, AISrc_64_f64, ADst_64, AVSrc_64>;
|
||||
def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, AISrc_128_b32, ADst_128, AVSrc_64>;
|
||||
@@ -597,15 +597,15 @@ def VOPProfileMAI_F32_F32_X32_VCD : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32,
|
||||
def VOPProfileMAI_I32_I32_X4_VCD : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32, VISrc_128_b32, VDst_128>;
|
||||
def VOPProfileMAI_I32_I32_X16_VCD : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32, VISrc_512_b32, VDst_512>;
|
||||
def VOPProfileMAI_I32_I32_X32_VCD : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32, VISrc_1024_b32, VDst_1024>;
|
||||
def VOPProfileMAI_F32_V2I16_X4_VCD : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32, VISrc_128_b32, VDst_128>;
|
||||
def VOPProfileMAI_F32_V2I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, VISrc_512_b32, VDst_512>;
|
||||
def VOPProfileMAI_F32_V2I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, VISrc_1024_b32, VDst_1024>;
|
||||
def VOPProfileMAI_F32_V2BF16_X4_VCD : VOPProfileMAI<VOP_V4F32_V2BF16_V2BF16_V4F32, VISrc_128_b32, VDst_128>;
|
||||
def VOPProfileMAI_F32_V2BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V2BF16_V2BF16_V16F32, VISrc_512_b32, VDst_512>;
|
||||
def VOPProfileMAI_F32_V2BF16_X32_VCD : VOPProfileMAI<VOP_V32F32_V2BF16_V2BF16_V32F32, VISrc_1024_b32, VDst_1024>;
|
||||
def VOPProfileMAI_F32_V4F16_X4_VCD : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4F16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4F16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4I16_X4_VCD : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4BF16_X4_VCD : VOPProfileMAI<VOP_V4F32_V4BF16_V4BF16_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4BF16_V4BF16_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
|
||||
def VOPProfileMAI_F32_V4BF16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4BF16_V4BF16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
|
||||
def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>;
|
||||
def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>;
|
||||
def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, VISrc_128_b32, VDst_128, AVSrc_64>;
|
||||
@@ -727,22 +727,22 @@ defm V_MFMA_I32_32X32X4I8 : MAIInst<"v_mfma_i32_32x32x4i8", "I32_I32_X32",
|
||||
let Predicates = [isGFX908orGFX90A] in {
|
||||
defm V_MFMA_I32_16X16X16I8 : MAIInst<"v_mfma_i32_16x16x16i8", "I32_I32_X4", int_amdgcn_mfma_i32_16x16x16i8>;
|
||||
defm V_MFMA_I32_32X32X8I8 : MAIInst<"v_mfma_i32_32x32x8i8", "I32_I32_X16", int_amdgcn_mfma_i32_32x32x8i8>;
|
||||
defm V_MFMA_F32_4X4X2BF16 : MAIInst<"v_mfma_f32_4x4x2bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_4x4x2bf16>;
|
||||
defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_16x16x2bf16>;
|
||||
defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4", int_amdgcn_mfma_f32_16x16x8bf16>;
|
||||
defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>;
|
||||
defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>;
|
||||
defm V_MFMA_F32_4X4X2BF16 : MAIInst<"v_mfma_f32_4x4x2bf16", "F32_V2BF16_X4", int_amdgcn_mfma_f32_4x4x2bf16>;
|
||||
defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2BF16_X16", int_amdgcn_mfma_f32_16x16x2bf16>;
|
||||
defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2BF16_X4", int_amdgcn_mfma_f32_16x16x8bf16>;
|
||||
defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2BF16_X32", int_amdgcn_mfma_f32_32x32x2bf16>;
|
||||
defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2BF16_X16", int_amdgcn_mfma_f32_32x32x4bf16>;
|
||||
}
|
||||
|
||||
} // End SubtargetPredicate = HasMAIInsts
|
||||
|
||||
let Predicates = [isGFX90APlus] in {
|
||||
let is_gfx940_xdl = 1 in {
|
||||
defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>;
|
||||
defm V_MFMA_F32_16X16X4BF16_1K : MAIInst<"v_mfma_f32_16x16x4bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_16x16x4bf16_1k>;
|
||||
defm V_MFMA_F32_4X4X4BF16_1K : MAIInst<"v_mfma_f32_4x4x4bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_4x4x4bf16_1k>;
|
||||
defm V_MFMA_F32_32X32X8BF16_1K : MAIInst<"v_mfma_f32_32x32x8bf16_1k", "F32_V4I16_X16", int_amdgcn_mfma_f32_32x32x8bf16_1k>;
|
||||
defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4", int_amdgcn_mfma_f32_16x16x16bf16_1k>;
|
||||
defm V_MFMA_F32_32X32X4BF16_1K : MAIInst<"v_mfma_f32_32x32x4bf16_1k", "F32_V4BF16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>;
|
||||
defm V_MFMA_F32_16X16X4BF16_1K : MAIInst<"v_mfma_f32_16x16x4bf16_1k", "F32_V4BF16_X16", int_amdgcn_mfma_f32_16x16x4bf16_1k>;
|
||||
defm V_MFMA_F32_4X4X4BF16_1K : MAIInst<"v_mfma_f32_4x4x4bf16_1k", "F32_V4BF16_X4", int_amdgcn_mfma_f32_4x4x4bf16_1k>;
|
||||
defm V_MFMA_F32_32X32X8BF16_1K : MAIInst<"v_mfma_f32_32x32x8bf16_1k", "F32_V4BF16_X16", int_amdgcn_mfma_f32_32x32x8bf16_1k>;
|
||||
defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4BF16_X4", int_amdgcn_mfma_f32_16x16x16bf16_1k>;
|
||||
}
|
||||
|
||||
let is_dgemm = 1 in {
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN %s
|
||||
|
||||
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
|
||||
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <32 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
|
||||
declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
|
||||
declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
|
||||
declare i32 @llvm.amdgcn.workitem.id.x()
|
||||
@@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%in.1 = load <32 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i64 1 to <4 x i16>
|
||||
%b = bitcast i64 2 to <4 x i16>
|
||||
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i64 1 to <4 x bfloat>
|
||||
%b = bitcast i64 2 to <4 x bfloat>
|
||||
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <32 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -117,9 +117,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%in.1 = load <16 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i64 1 to <4 x i16>
|
||||
%b = bitcast i64 2 to <4 x i16>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i64 1 to <4 x bfloat>
|
||||
%b = bitcast i64 2 to <4 x bfloat>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -147,9 +147,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%in.1 = load <4 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i64 1 to <4 x i16>
|
||||
%b = bitcast i64 2 to <4 x i16>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i64 1 to <4 x bfloat>
|
||||
%b = bitcast i64 2 to <4 x bfloat>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -194,9 +194,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%in.1 = load <16 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i64 1 to <4 x i16>
|
||||
%b = bitcast i64 2 to <4 x i16>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i64 1 to <4 x bfloat>
|
||||
%b = bitcast i64 2 to <4 x bfloat>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -225,9 +225,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
%in.1 = load <4 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i64 1 to <4 x i16>
|
||||
%b = bitcast i64 2 to <4 x i16>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i64 1 to <4 x bfloat>
|
||||
%b = bitcast i64 2 to <4 x bfloat>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
@@ -2,11 +2,11 @@
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
|
||||
|
||||
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
|
||||
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x bfloat>, <2 x bfloat>, <32 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x bfloat>, <2 x bfloat>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x bfloat>, <2 x bfloat>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x bfloat>, <2 x bfloat>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x bfloat>, <2 x bfloat>, <4 x float>, i32, i32, i32)
|
||||
declare i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
||||
; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16:
|
||||
@@ -55,9 +55,9 @@ declare i32 @llvm.amdgcn.workitem.id.x()
|
||||
define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
|
||||
bb:
|
||||
%in.1 = load <32 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i32 1 to <2 x i16>
|
||||
%b = bitcast i32 2 to <2 x i16>
|
||||
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i32 1 to <2 x bfloat>
|
||||
%b = bitcast i32 2 to <2 x bfloat>
|
||||
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <32 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -76,9 +76,9 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i32 1 to <2 x i16>
|
||||
%b = bitcast i32 2 to <2 x i16>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i32 1 to <2 x bfloat>
|
||||
%b = bitcast i32 2 to <2 x bfloat>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -97,9 +97,9 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) #0 {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i32 1 to <2 x i16>
|
||||
%b = bitcast i32 2 to <2 x i16>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i32 1 to <2 x bfloat>
|
||||
%b = bitcast i32 2 to <2 x bfloat>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -118,9 +118,9 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i32 1 to <2 x i16>
|
||||
%b = bitcast i32 2 to <2 x i16>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i32 1 to <2 x bfloat>
|
||||
%b = bitcast i32 2 to <2 x bfloat>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x bfloat> %a, <2 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -139,9 +139,9 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i32 1 to <2 x i16>
|
||||
%b = bitcast i32 2 to <2 x i16>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i32 1 to <2 x bfloat>
|
||||
%b = bitcast i32 2 to <2 x bfloat>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x bfloat> %a, <2 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s
|
||||
|
||||
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
|
||||
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <32 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
|
||||
declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
|
||||
declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
|
||||
declare i32 @llvm.amdgcn.workitem.id.x()
|
||||
@@ -23,9 +23,9 @@ declare i32 @llvm.amdgcn.workitem.id.x()
|
||||
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 {
|
||||
bb:
|
||||
%in.1 = load <32 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i64 1 to <4 x i16>
|
||||
%b = bitcast i64 2 to <4 x i16>
|
||||
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i64 1 to <4 x bfloat>
|
||||
%b = bitcast i64 2 to <4 x bfloat>
|
||||
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <32 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -42,9 +42,9 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i64 1 to <4 x i16>
|
||||
%b = bitcast i64 2 to <4 x i16>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i64 1 to <4 x bfloat>
|
||||
%b = bitcast i64 2 to <4 x bfloat>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -61,9 +61,9 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i64 1 to <4 x i16>
|
||||
%b = bitcast i64 2 to <4 x i16>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i64 1 to <4 x bfloat>
|
||||
%b = bitcast i64 2 to <4 x bfloat>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -80,9 +80,9 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i64 1 to <4 x i16>
|
||||
%b = bitcast i64 2 to <4 x i16>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i64 1 to <4 x bfloat>
|
||||
%b = bitcast i64 2 to <4 x bfloat>
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -99,9 +99,9 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i64 1 to <4 x i16>
|
||||
%b = bitcast i64 2 to <4 x i16>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
%a = bitcast i64 1 to <4 x bfloat>
|
||||
%b = bitcast i64 2 to <4 x bfloat>
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
|
||||
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
|
||||
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
|
||||
|
||||
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
|
||||
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
|
||||
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x bfloat>, <2 x bfloat>, <32 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x bfloat>, <2 x bfloat>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x bfloat>, <2 x bfloat>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x bfloat>, <2 x bfloat>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x bfloat>, <2 x bfloat>, <4 x float>, i32, i32, i32)
|
||||
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <32 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
|
||||
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
|
||||
declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
|
||||
declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
|
||||
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
|
||||
@@ -21,9 +21,9 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32
|
||||
define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%in.1 = load <32 x float>, ptr addrspace(1) %arg
|
||||
%a = bitcast i32 1 to <2 x i16>
|
||||
%b = bitcast i32 2 to <2 x i16>
|
||||
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
%a = bitcast i32 1 to <2 x bfloat>
|
||||
%b = bitcast i32 2 to <2 x bfloat>
|
||||
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <32 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
store <32 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -33,7 +33,7 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, ptr addrspace(1) %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -43,7 +43,7 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, ptr addrspace(1) %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -53,7 +53,7 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, ptr addrspace(1) %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x bfloat> undef, <2 x bfloat> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -63,7 +63,7 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, ptr addrspace(1) %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x bfloat> undef, <2 x bfloat> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -73,7 +73,7 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%in.1 = load <32 x float>, ptr addrspace(1) %arg
|
||||
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <32 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat> undef, <4 x bfloat> undef, <32 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
store <32 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -83,7 +83,7 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, ptr addrspace(1) %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat> undef, <4 x bfloat> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -93,7 +93,7 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, ptr addrspace(1) %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat> undef, <4 x bfloat> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -103,7 +103,7 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%in.1 = load <16 x float>, ptr addrspace(1) %arg
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat> undef, <4 x bfloat> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
store <16 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
@@ -113,7 +113,7 @@ bb:
|
||||
define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%in.1 = load <4 x float>, ptr addrspace(1) %arg
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat> undef, <4 x bfloat> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
|
||||
store <4 x float> %mai.1, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user