[AMDGPU] Fix operand types for the BF16 variants of V_MFMA instruction

2024-02-17 10:24:02 -05:00
8 changed files with 150 additions and 124 deletions
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl
@@ -133,35 +133,45 @@ void test_mfma_i32_16x16x16i8(global v4i* out, int a, int b, v4i c)
 }

 // CHECK-GFX908-LABEL: @test_mfma_f32_32x32x2bf16
-// CHECK-GFX908: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908: [[s1:%[0-9]+]] = bitcast <2 x i16> %a to <2 x bfloat>
+// CHECK-GFX908-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %b to <2 x bfloat>
+// CHECK-GFX908-NEXT: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], <32 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_32x32x2bf16(global v32f* out, v2s a, v2s b, v32f c)
 {
  *out = __builtin_amdgcn_mfma_f32_32x32x2bf16(a, b, c, 0, 0, 0);
 }

 // CHECK-GFX908-LABEL: @test_mfma_f32_16x16x2bf16
-// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908: [[s1:%[0-9]+]] = bitcast <2 x i16> %a to <2 x bfloat>
+// CHECK-GFX908-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %b to <2 x bfloat>
+// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], <16 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_16x16x2bf16(global v16f* out, v2s a, v2s b, v16f c)
 {
  *out = __builtin_amdgcn_mfma_f32_16x16x2bf16(a, b, c, 0, 0, 0);
 }

 // CHECK-GFX908-LABEL: @test_mfma_f32_4x4x2bf16
-// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908: [[s1:%[0-9]+]] = bitcast <2 x i16> %a to <2 x bfloat>
+// CHECK-GFX908-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %b to <2 x bfloat>
+// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], <4 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_4x4x2bf16(global v4f* out, v2s a, v2s b, v4f c)
 {
  *out = __builtin_amdgcn_mfma_f32_4x4x2bf16(a, b, c, 0, 0, 0);
 }

 // CHECK-GFX908-LABEL: @test_mfma_f32_32x32x4bf16
-// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908: [[s1:%[0-9]+]] = bitcast <2 x i16> %a to <2 x bfloat>
+// CHECK-GFX908-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %b to <2 x bfloat>
+// CHECK-GFX908: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], <16 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_32x32x4bf16(global v16f* out, v2s a, v2s b, v16f c)
 {
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, b, c, 0, 0, 0);
 }

 // CHECK-GFX908-LABEL: @test_mfma_f32_16x16x8bf16
-// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX908: [[s1:%[0-9]+]] = bitcast <2 x i16> %a to <2 x bfloat>
+// CHECK-GFX908-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %b to <2 x bfloat>
+// CHECK-GFX908: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], <4 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
 {
  *out = __builtin_amdgcn_mfma_f32_16x16x8bf16(a, b, c, 0, 0, 0);
@@ -172,35 +182,45 @@ void test_mfma_f32_16x16x8bf16(global v4f* out, v2s a, v2s b, v4f c)
 #ifdef MFMA_GFX90A_TESTS

 // CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x4bf16_1k
-// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX90A: [[s1:%[0-9]+]] = bitcast <4 x i16> %a to <4 x bfloat>
+// CHECK-GFX90A-NEXT: [[s2:%[0-9]+]] = bitcast <4 x i16> %b to <4 x bfloat>
+// CHECK-GFX90A: call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat> [[s1]], <4 x bfloat> [[s2]], <32 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_32x32x4bf16_1k(global v32f* out, v4s a, v4s b, v32f c)
 {
  *out = __builtin_amdgcn_mfma_f32_32x32x4bf16_1k(a, b, c, 0, 0, 0);
 }

 // CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x4bf16_1k
-// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX90A: [[s1:%[0-9]+]] = bitcast <4 x i16> %a to <4 x bfloat>
+// CHECK-GFX90A-NEXT: [[s2:%[0-9]+]] = bitcast <4 x i16> %b to <4 x bfloat>
+// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat> [[s1]], <4 x bfloat> [[s2]], <16 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_16x16x4bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
 {
  *out = __builtin_amdgcn_mfma_f32_16x16x4bf16_1k(a, b, c, 0, 0, 0);
 }

 // CHECK-GFX90A-LABEL: @test_mfma_f32_4x4x4bf16_1k
-// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX90A: [[s1:%[0-9]+]] = bitcast <4 x i16> %a to <4 x bfloat>
+// CHECK-GFX90A-NEXT: [[s2:%[0-9]+]] = bitcast <4 x i16> %b to <4 x bfloat>
+// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat> [[s1]], <4 x bfloat> [[s2]], <4 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_4x4x4bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
 {
  *out = __builtin_amdgcn_mfma_f32_4x4x4bf16_1k(a, b, c, 0, 0, 0);
 }

 // CHECK-GFX90A-LABEL: @test_mfma_f32_32x32x8bf16_1k
-// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX90A: [[s1:%[0-9]+]] = bitcast <4 x i16> %a to <4 x bfloat>
+// CHECK-GFX90A-NEXT: [[s2:%[0-9]+]] = bitcast <4 x i16> %b to <4 x bfloat>
+// CHECK-GFX90A: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat> [[s1]], <4 x bfloat> [[s2]], <16 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_32x32x8bf16_1k(global v16f* out, v4s a, v4s b, v16f c)
 {
  *out = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, b, c, 0, 0, 0);
 }

 // CHECK-GFX90A-LABEL: @test_mfma_f32_16x16x16bf16_1k
-// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %c, i32 0, i32 0, i32 0)
+// CHECK-GFX90A: [[s1:%[0-9]+]] = bitcast <4 x i16> %a to <4 x bfloat>
+// CHECK-GFX90A-NEXT: [[s2:%[0-9]+]] = bitcast <4 x i16> %b to <4 x bfloat>
+// CHECK-GFX90A: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat> [[s1]], <4 x bfloat> [[s2]], <4 x float> %c, i32 0, i32 0, i32 0)
 void test_mfma_f32_16x16x16bf16_1k(global v4f* out, v4s a, v4s b, v4f c)
 {
  *out = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k(a, b, c, 0, 0, 0);
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3027,11 +3027,11 @@ def int_amdgcn_mfma_i32_16x16x4i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i
 def int_amdgcn_mfma_i32_4x4x4i8     : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
 def int_amdgcn_mfma_i32_32x32x8i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
 def int_amdgcn_mfma_i32_16x16x16i8  : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
-def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>;
-def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
-def int_amdgcn_mfma_f32_4x4x2bf16   : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
-def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
-def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
+def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2bf16_ty>;
+def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2bf16_ty>;
+def int_amdgcn_mfma_f32_4x4x2bf16   : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2bf16_ty>;
+def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2bf16_ty>;
+def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2bf16_ty>;

 //===----------------------------------------------------------------------===//
 // gfx90a intrinsics
@@ -3043,11 +3043,11 @@ def int_amdgcn_flat_atomic_fadd   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_flat_atomic_fmin   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_flat_atomic_fmax   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

-def int_amdgcn_mfma_f32_32x32x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
-def int_amdgcn_mfma_f32_16x16x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
-def int_amdgcn_mfma_f32_4x4x4bf16_1k    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;
-def int_amdgcn_mfma_f32_32x32x8bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
-def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;
+def int_amdgcn_mfma_f32_32x32x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4bf16_ty>;
+def int_amdgcn_mfma_f32_16x16x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4bf16_ty>;
+def int_amdgcn_mfma_f32_4x4x4bf16_1k    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4bf16_ty>;
+def int_amdgcn_mfma_f32_32x32x8bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4bf16_ty>;
+def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4bf16_ty>;

 // Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA.
 //       Three bits corresponding to the neg modifier applied to the respective
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2549,8 +2549,11 @@ def VOP_V4F32_V4F16_V4F16_V4F32   : VOPProfile <[v4f32,  v4f16, v4f16, v4f32]>;
 def VOP_V16F32_V4F16_V4F16_V16F32 : VOPProfile <[v16f32, v4f16, v4f16, v16f32]>;
 def VOP_V32F32_V4F16_V4F16_V32F32 : VOPProfile <[v32f32, v4f16, v4f16, v32f32]>;
 def VOP_V4F32_V2I16_V2I16_V4F32   : VOPProfile <[v4f32,  v2i16, v2i16, v4f32]>;
+def VOP_V4F32_V2BF16_V2BF16_V4F32 : VOPProfile <[v4f32,  v2bf16, v2bf16, v4f32]>;
 def VOP_V16F32_V2I16_V2I16_V16F32 : VOPProfile <[v16f32, v2i16, v2i16, v16f32]>;
+def VOP_V16F32_V2BF16_V2BF16_V16F32 : VOPProfile <[v16f32, v2bf16, v2bf16, v16f32]>;
 def VOP_V32F32_V2I16_V2I16_V32F32 : VOPProfile <[v32f32, v2i16, v2i16, v32f32]>;
+def VOP_V32F32_V2BF16_V2BF16_V32F32 : VOPProfile <[v32f32, v2bf16, v2bf16, v32f32]>;
 def VOP_V4I32_I32_I32_V4I32       : VOPProfile <[v4i32,  i32,   i32,   v4i32]>;
 def VOP_V16I32_I32_I32_V16I32     : VOPProfile <[v16i32, i32,   i32,   v16i32]>;
 def VOP_V32I32_I32_I32_V32I32     : VOPProfile <[v32i32, i32,   i32,   v32i32]>;
@@ -2562,8 +2565,11 @@ def VOP_V2F32_V2F32_V2F32_V2F32   : VOPProfile <[v2f32,  v2f32, v2f32, v2f32]>;
 def VOP_V2F32_V2F32_V2F32         : VOPProfile <[v2f32,  v2f32, v2f32, untyped]>;
 def VOP_V2I32_V2I32_V2I32         : VOPProfile <[v2i32,  v2i32, v2i32, untyped]>;
 def VOP_V4F32_V4I16_V4I16_V4F32   : VOPProfile <[v4f32,  v4i16, v4i16, v4f32]>;
+def VOP_V4F32_V4BF16_V4BF16_V4F32   : VOPProfile <[v4f32,  v4bf16, v4bf16, v4f32]>;
 def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>;
+def VOP_V16F32_V4BF16_V4BF16_V16F32 : VOPProfile <[v16f32, v4bf16, v4bf16, v16f32]>;
 def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>;
+def VOP_V32F32_V4BF16_V4BF16_V32F32 : VOPProfile <[v32f32, v4bf16, v4bf16, v32f32]>;

 def VOP_V4I32_I64_I64_V4I32       : VOPProfile <[v4i32,  i64,   i64,   v4i32]>;
 def VOP_V16I32_I64_I64_V16I32     : VOPProfile <[v16i32, i64,   i64,   v16i32]>;
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -573,15 +573,15 @@ def VOPProfileMAI_F32_F32_X32   : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32,     A
 def VOPProfileMAI_I32_I32_X4    : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32,       AISrc_128_b32,  ADst_128>;
 def VOPProfileMAI_I32_I32_X16   : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32,     AISrc_512_b32,  ADst_512>;
 def VOPProfileMAI_I32_I32_X32   : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32,     AISrc_1024_b32, ADst_1024>;
-def VOPProfileMAI_F32_V2I16_X4  : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32,   AISrc_128_b32,  ADst_128>;
-def VOPProfileMAI_F32_V2I16_X16 : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, AISrc_512_b32,  ADst_512>;
-def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, AISrc_1024_b32, ADst_1024>;
+def VOPProfileMAI_F32_V2BF16_X4  : VOPProfileMAI<VOP_V4F32_V2BF16_V2BF16_V4F32,AISrc_128_b32,  ADst_128>;
+def VOPProfileMAI_F32_V2BF16_X16 : VOPProfileMAI<VOP_V16F32_V2BF16_V2BF16_V16F32, AISrc_512_b32,  ADst_512>;
+def VOPProfileMAI_F32_V2BF16_X32 : VOPProfileMAI<VOP_V32F32_V2BF16_V2BF16_V32F32, AISrc_1024_b32, ADst_1024>;
 def VOPProfileMAI_F32_V4F16_X4  : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32,   AISrc_128_b32,  ADst_128,  AVSrc_64>;
 def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32,  ADst_512,  AVSrc_64>;
 def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
-def VOPProfileMAI_F32_V4I16_X4  : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32,   AISrc_128_b32,  ADst_128,  AVSrc_64>;
-def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, AISrc_512_b32,  ADst_512,  AVSrc_64>;
-def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
+def VOPProfileMAI_F32_V4BF16_X4  : VOPProfileMAI<VOP_V4F32_V4BF16_V4BF16_V4F32,   AISrc_128_b32,  ADst_128,  AVSrc_64>;
+def VOPProfileMAI_F32_V4BF16_X16 : VOPProfileMAI<VOP_V16F32_V4BF16_V4BF16_V16F32, AISrc_512_b32,  ADst_512,  AVSrc_64>;
+def VOPProfileMAI_F32_V4BF16_X32 : VOPProfileMAI<VOP_V32F32_V4BF16_V4BF16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
 def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64,      AISrc_256_f64,  ADst_256,  AVSrc_64>;
 def VOPProfileMAI_F64_4X4X4F64   : VOPProfileMAI<VOP_F64_F64_F64_F64,          AISrc_64_f64,   ADst_64,   AVSrc_64>;
 def VOPProfileMAI_I32_I64_X16   : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32,       AISrc_128_b32,  ADst_128,  AVSrc_64>;
@@ -597,15 +597,15 @@ def VOPProfileMAI_F32_F32_X32_VCD    : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32,
 def VOPProfileMAI_I32_I32_X4_VCD     : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32,       VISrc_128_b32,  VDst_128>;
 def VOPProfileMAI_I32_I32_X16_VCD    : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32,     VISrc_512_b32,  VDst_512>;
 def VOPProfileMAI_I32_I32_X32_VCD    : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32,     VISrc_1024_b32, VDst_1024>;
-def VOPProfileMAI_F32_V2I16_X4_VCD   : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32,   VISrc_128_b32,  VDst_128>;
-def VOPProfileMAI_F32_V2I16_X16_VCD  : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, VISrc_512_b32,  VDst_512>;
-def VOPProfileMAI_F32_V2I16_X32_VCD  : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, VISrc_1024_b32, VDst_1024>;
+def VOPProfileMAI_F32_V2BF16_X4_VCD  : VOPProfileMAI<VOP_V4F32_V2BF16_V2BF16_V4F32, VISrc_128_b32,  VDst_128>;
+def VOPProfileMAI_F32_V2BF16_X16_VCD : VOPProfileMAI<VOP_V16F32_V2BF16_V2BF16_V16F32, VISrc_512_b32,  VDst_512>;
+def VOPProfileMAI_F32_V2BF16_X32_VCD  : VOPProfileMAI<VOP_V32F32_V2BF16_V2BF16_V32F32, VISrc_1024_b32, VDst_1024>;
 def VOPProfileMAI_F32_V4F16_X4_VCD   : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32,   VISrc_128_b32,  VDst_128,  AVSrc_64>;
 def VOPProfileMAI_F32_V4F16_X16_VCD  : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, VISrc_512_b32,  VDst_512,  AVSrc_64>;
 def VOPProfileMAI_F32_V4F16_X32_VCD  : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
-def VOPProfileMAI_F32_V4I16_X4_VCD   : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32,   VISrc_128_b32,  VDst_128,  AVSrc_64>;
-def VOPProfileMAI_F32_V4I16_X16_VCD  : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, VISrc_512_b32,  VDst_512,  AVSrc_64>;
-def VOPProfileMAI_F32_V4I16_X32_VCD  : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
+def VOPProfileMAI_F32_V4BF16_X4_VCD   : VOPProfileMAI<VOP_V4F32_V4BF16_V4BF16_V4F32,   VISrc_128_b32,  VDst_128,  AVSrc_64>;
+def VOPProfileMAI_F32_V4BF16_X16_VCD  : VOPProfileMAI<VOP_V16F32_V4BF16_V4BF16_V16F32, VISrc_512_b32,  VDst_512,  AVSrc_64>;
+def VOPProfileMAI_F32_V4BF16_X32_VCD  : VOPProfileMAI<VOP_V32F32_V4BF16_V4BF16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
 def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64,       VISrc_256_f64,  VDst_256,  AVSrc_64>;
 def VOPProfileMAI_F64_4X4X4F64_VCD   : VOPProfileMAI<VOP_F64_F64_F64_F64,           VISrc_64_f64,   VDst_64,   AVSrc_64>;
 def VOPProfileMAI_I32_I64_X16_VCD    : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32,       VISrc_128_b32,  VDst_128,  AVSrc_64>;
@@ -727,22 +727,22 @@ defm V_MFMA_I32_32X32X4I8   : MAIInst<"v_mfma_i32_32x32x4i8",   "I32_I32_X32",
 let Predicates = [isGFX908orGFX90A] in {
 defm V_MFMA_I32_16X16X16I8  : MAIInst<"v_mfma_i32_16x16x16i8",  "I32_I32_X4",    int_amdgcn_mfma_i32_16x16x16i8>;
 defm V_MFMA_I32_32X32X8I8   : MAIInst<"v_mfma_i32_32x32x8i8",   "I32_I32_X16",   int_amdgcn_mfma_i32_32x32x8i8>;
-defm V_MFMA_F32_4X4X2BF16   : MAIInst<"v_mfma_f32_4x4x2bf16",   "F32_V2I16_X4",  int_amdgcn_mfma_f32_4x4x2bf16>;
-defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_16x16x2bf16>;
-defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4",  int_amdgcn_mfma_f32_16x16x8bf16>;
-defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>;
-defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>;
+defm V_MFMA_F32_4X4X2BF16   : MAIInst<"v_mfma_f32_4x4x2bf16",   "F32_V2BF16_X4",  int_amdgcn_mfma_f32_4x4x2bf16>;
+defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2BF16_X16", int_amdgcn_mfma_f32_16x16x2bf16>;
+defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2BF16_X4",  int_amdgcn_mfma_f32_16x16x8bf16>;
+defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2BF16_X32", int_amdgcn_mfma_f32_32x32x2bf16>;
+defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2BF16_X16", int_amdgcn_mfma_f32_32x32x4bf16>;
 }

 } // End SubtargetPredicate = HasMAIInsts

 let Predicates = [isGFX90APlus] in {
  let is_gfx940_xdl = 1 in {
-  defm V_MFMA_F32_32X32X4BF16_1K  : MAIInst<"v_mfma_f32_32x32x4bf16_1k",  "F32_V4I16_X32",  int_amdgcn_mfma_f32_32x32x4bf16_1k>;
-  defm V_MFMA_F32_16X16X4BF16_1K  : MAIInst<"v_mfma_f32_16x16x4bf16_1k",  "F32_V4I16_X16",  int_amdgcn_mfma_f32_16x16x4bf16_1k>;
-  defm V_MFMA_F32_4X4X4BF16_1K    : MAIInst<"v_mfma_f32_4x4x4bf16_1k",    "F32_V4I16_X4",   int_amdgcn_mfma_f32_4x4x4bf16_1k>;
-  defm V_MFMA_F32_32X32X8BF16_1K  : MAIInst<"v_mfma_f32_32x32x8bf16_1k",  "F32_V4I16_X16",  int_amdgcn_mfma_f32_32x32x8bf16_1k>;
-  defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4",   int_amdgcn_mfma_f32_16x16x16bf16_1k>;
+  defm V_MFMA_F32_32X32X4BF16_1K  : MAIInst<"v_mfma_f32_32x32x4bf16_1k",  "F32_V4BF16_X32",  int_amdgcn_mfma_f32_32x32x4bf16_1k>;
+  defm V_MFMA_F32_16X16X4BF16_1K  : MAIInst<"v_mfma_f32_16x16x4bf16_1k",  "F32_V4BF16_X16",  int_amdgcn_mfma_f32_16x16x4bf16_1k>;
+  defm V_MFMA_F32_4X4X4BF16_1K    : MAIInst<"v_mfma_f32_4x4x4bf16_1k",    "F32_V4BF16_X4",   int_amdgcn_mfma_f32_4x4x4bf16_1k>;
+  defm V_MFMA_F32_32X32X8BF16_1K  : MAIInst<"v_mfma_f32_32x32x8bf16_1k",  "F32_V4BF16_X16",  int_amdgcn_mfma_f32_32x32x8bf16_1k>;
+  defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4BF16_X4",   int_amdgcn_mfma_f32_16x16x16bf16_1k>;
  }

  let is_dgemm = 1 in {
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN %s

-declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <32 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
 declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
 declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
 declare i32 @llvm.amdgcn.workitem.id.x()
@@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GCN-NEXT:    s_endpgm
 bb:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
-  %a = bitcast i64 1 to <4 x i16>
-  %b = bitcast i64 2 to <4 x i16>
-  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i64 1 to <4 x bfloat>
+  %b = bitcast i64 2 to <4 x bfloat>
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -117,9 +117,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
 ; GCN-NEXT:    s_endpgm
 bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
-  %a = bitcast i64 1 to <4 x i16>
-  %b = bitcast i64 2 to <4 x i16>
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i64 1 to <4 x bfloat>
+  %b = bitcast i64 2 to <4 x bfloat>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -147,9 +147,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GCN-NEXT:    s_endpgm
 bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
-  %a = bitcast i64 1 to <4 x i16>
-  %b = bitcast i64 2 to <4 x i16>
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i64 1 to <4 x bfloat>
+  %b = bitcast i64 2 to <4 x bfloat>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -194,9 +194,9 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
 ; GCN-NEXT:    s_endpgm
 bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
-  %a = bitcast i64 1 to <4 x i16>
-  %b = bitcast i64 2 to <4 x i16>
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i64 1 to <4 x bfloat>
+  %b = bitcast i64 2 to <4 x bfloat>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -225,9 +225,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GCN-NEXT:    s_endpgm
 bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
-  %a = bitcast i64 1 to <4 x i16>
-  %b = bitcast i64 2 to <4 x i16>
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i64 1 to <4 x bfloat>
+  %b = bitcast i64 2 to <4 x bfloat>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
@@ -2,11 +2,11 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s

-declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x bfloat>, <2 x bfloat>, <32 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x bfloat>, <2 x bfloat>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x bfloat>, <2 x bfloat>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x bfloat>, <2 x bfloat>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x bfloat>, <2 x bfloat>, <4 x float>, i32, i32, i32)
 declare i32 @llvm.amdgcn.workitem.id.x()

 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x2bf16:
@@ -55,9 +55,9 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
 bb:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
-  %a = bitcast i32 1 to <2 x i16>
-  %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i32 1 to <2 x bfloat>
+  %b = bitcast i32 2 to <2 x bfloat>
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -76,9 +76,9 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
 bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
-  %a = bitcast i32 1 to <2 x i16>
-  %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i32 1 to <2 x bfloat>
+  %b = bitcast i32 2 to <2 x bfloat>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -97,9 +97,9 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) #0 {
 bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
-  %a = bitcast i32 1 to <2 x i16>
-  %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i32 1 to <2 x bfloat>
+  %b = bitcast i32 2 to <2 x bfloat>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -118,9 +118,9 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
 bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
-  %a = bitcast i32 1 to <2 x i16>
-  %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> %a, <2 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i32 1 to <2 x bfloat>
+  %b = bitcast i32 2 to <2 x bfloat>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x bfloat> %a, <2 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -139,9 +139,9 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
 bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
-  %a = bitcast i32 1 to <2 x i16>
-  %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> %a, <2 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i32 1 to <2 x bfloat>
+  %b = bitcast i32 2 to <2 x bfloat>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x bfloat> %a, <2 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,11 +1,11 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s

-declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <32 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
 declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
 declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
 declare i32 @llvm.amdgcn.workitem.id.x()
@@ -23,9 +23,9 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #0 {
 bb:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
-  %a = bitcast i64 1 to <4 x i16>
-  %b = bitcast i64 2 to <4 x i16>
-  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i64 1 to <4 x bfloat>
+  %b = bitcast i64 2 to <4 x bfloat>
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -42,9 +42,9 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #0 {
 bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
-  %a = bitcast i64 1 to <4 x i16>
-  %b = bitcast i64 2 to <4 x i16>
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i64 1 to <4 x bfloat>
+  %b = bitcast i64 2 to <4 x bfloat>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -61,9 +61,9 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0 {
 bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
-  %a = bitcast i64 1 to <4 x i16>
-  %b = bitcast i64 2 to <4 x i16>
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i64 1 to <4 x bfloat>
+  %b = bitcast i64 2 to <4 x bfloat>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -80,9 +80,9 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #0 {
 bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
-  %a = bitcast i64 1 to <4 x i16>
-  %b = bitcast i64 2 to <4 x i16>
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i64 1 to <4 x bfloat>
+  %b = bitcast i64 2 to <4 x bfloat>
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -99,9 +99,9 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) #0 {
 bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
-  %a = bitcast i64 1 to <4 x i16>
-  %b = bitcast i64 2 to <4 x i16>
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
+  %a = bitcast i64 1 to <4 x bfloat>
+  %b = bitcast i64 2 to <4 x bfloat>
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat> %a, <4 x bfloat> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
--- a/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-bf16-vgpr-cd-select.ll
@@ -1,16 +1,16 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=GCN %s

-declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16>, <2 x i16>, <4 x float>, i32, i32, i32)
-declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
-declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
-declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x bfloat>, <2 x bfloat>, <32 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x bfloat>, <2 x bfloat>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x bfloat>, <2 x bfloat>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x bfloat>, <2 x bfloat>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x bfloat>, <2 x bfloat>, <4 x float>, i32, i32, i32)
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <32 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat>, <4 x bfloat>, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat>, <4 x bfloat>, <4 x float>, i32, i32, i32)
 declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
 declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
 declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
@@ -21,9 +21,9 @@ declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32
 define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) {
 bb:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
-  %a = bitcast i32 1 to <2 x i16>
-  %b = bitcast i32 2 to <2 x i16>
-  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %in.1, i32 0, i32 0, i32 0)
+  %a = bitcast i32 1 to <2 x bfloat>
+  %b = bitcast i32 2 to <2 x bfloat>
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <32 x float> %in.1, i32 0, i32 0, i32 0)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -33,7 +33,7 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) {
 bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -43,7 +43,7 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) {
 bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -53,7 +53,7 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) {
 bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x i16> undef, <2 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16(<2 x bfloat> undef, <2 x bfloat> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -63,7 +63,7 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) {
 bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x i16> undef, <2 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8bf16(<2 x bfloat> undef, <2 x bfloat> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -73,7 +73,7 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) {
 bb:
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
-  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <32 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x bfloat> undef, <4 x bfloat> undef, <32 x float> %in.1, i32 0, i32 0, i32 0)
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -83,7 +83,7 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) {
 bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x bfloat> undef, <4 x bfloat> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -93,7 +93,7 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) {
 bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x bfloat> undef, <4 x bfloat> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -103,7 +103,7 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) {
 bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
-  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> undef, <4 x i16> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x bfloat> undef, <4 x bfloat> undef, <16 x float> %in.1, i32 0, i32 0, i32 0)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }
@@ -113,7 +113,7 @@ bb:
 define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg) {
 bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
-  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> undef, <4 x i16> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x bfloat> undef, <4 x bfloat> undef, <4 x float> %in.1, i32 0, i32 0, i32 0)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
 }