[AMDGPU] Simplify dpp builtin handling #115090
base: users/rampitec/11-04-_amdgpu_allow_lane-op_lowering_for_illegal_types
Conversation
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite. This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-libcxx @llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-codegen

Author: Stanislav Mekhanoshin (rampitec)

Changes: DPP intrinsics can handle any type now, so there is no need to cast to integer. The caveat is that the intrinsics only handle backend-legal types; this does not work for i8, for example.

Full diff: https://github.com/llvm/llvm-project/pull/115090.diff

3 files affected:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 82770a75af23e4..7e3e6463799fb6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19193,37 +19193,24 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
     assert(Error == ASTContext::GE_None && "Should not codegen an error");
     llvm::Type *DataTy = ConvertType(E->getArg(0)->getType());
-    unsigned Size = DataTy->getPrimitiveSizeInBits();
-    llvm::Type *IntTy =
-        llvm::IntegerType::get(Builder.getContext(), std::max(Size, 32u));
     Function *F =
         CGM.getIntrinsic(BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp8
                              ? Intrinsic::amdgcn_mov_dpp8
                              : Intrinsic::amdgcn_update_dpp,
-                         IntTy);
+                         DataTy);
     assert(E->getNumArgs() == 5 || E->getNumArgs() == 6 ||
            E->getNumArgs() == 2);
     bool InsertOld = BuiltinID == AMDGPU::BI__builtin_amdgcn_mov_dpp;
     if (InsertOld)
-      Args.push_back(llvm::PoisonValue::get(IntTy));
-    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
+      Args.push_back(llvm::PoisonValue::get(DataTy));
+    Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, 0, E));
+    for (unsigned I = 1; I != E->getNumArgs(); ++I) {
       llvm::Value *V = EmitScalarOrConstFoldImmArg(ICEArguments, I, E);
-      if (I < (BuiltinID == AMDGPU::BI__builtin_amdgcn_update_dpp ? 2u : 1u) &&
-          Size < 32) {
-        if (!DataTy->isIntegerTy())
-          V = Builder.CreateBitCast(
-              V, llvm::IntegerType::get(Builder.getContext(), Size));
-        V = Builder.CreateZExtOrBitCast(V, IntTy);
-      }
       llvm::Type *ExpTy =
           F->getFunctionType()->getFunctionParamType(I + InsertOld);
       Args.push_back(Builder.CreateTruncOrBitCast(V, ExpTy));
     }
-    Value *V = Builder.CreateCall(F, Args);
-    if (Size < 32 && !DataTy->isIntegerTy())
-      V = Builder.CreateTrunc(
-          V, llvm::IntegerType::get(Builder.getContext(), Size));
-    return Builder.CreateTruncOrBitCast(V, DataTy);
+    return Builder.CreateCall(F, Args);
   }
   case AMDGPU::BI__builtin_amdgcn_permlane16:
   case AMDGPU::BI__builtin_amdgcn_permlanex16:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
index a4054cba236dd2..7e4ee6f4a942db 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -36,45 +36,37 @@ void test_mov_dpp8_long(global long* out, long a) {
 }

 // CHECK-LABEL: @test_mov_dpp8_float(
-// CHECK: %0 = bitcast float %a to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: store i32 %1,
+// CHECK: %0 = tail call{{.*}} float @llvm.amdgcn.mov.dpp8.f32(float %a, i32 1)
+// CHECK-NEXT: store float %0,
 void test_mov_dpp8_float(global float* out, float a) {
   *out = __builtin_amdgcn_mov_dpp8(a, 1);
 }

 // CHECK-LABEL: @test_mov_dpp8_double
-// CHECK: %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.mov.dpp8.i64(i64 %0, i32 1)
-// CHECK-NEXT: store i64 %1,
+// CHECK: %0 = tail call{{.*}} double @llvm.amdgcn.mov.dpp8.f64(double %x, i32 1)
+// CHECK-NEXT: store double %0,
 void test_mov_dpp8_double(double x, global double *p) {
   *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }

 // CHECK-LABEL: @test_mov_dpp8_short
-// CHECK: %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK: %0 = tail call{{.*}} i16 @llvm.amdgcn.mov.dpp8.i16(i16 %x, i32 1)
+// CHECK-NEXT: store i16 %0,
 void test_mov_dpp8_short(short x, global short *p) {
   *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }

 // CHECK-LABEL: @test_mov_dpp8_char
-// CHECK: %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK: %0 = tail call{{.*}} i8 @llvm.amdgcn.mov.dpp8.i8(i8 %x, i32 1)
+// CHECK-NEXT: store i8 %0,
 void test_mov_dpp8_char(char x, global char *p) {
   *p = __builtin_amdgcn_mov_dpp8(x, 1);
 }

 // CHECK-LABEL: @test_mov_dpp8_half
-// CHECK: %0 = load i16,
-// CHECK: %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.mov.dpp8.i32(i32 %1, i32 1)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK: %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.mov.dpp8.f16(half %0, i32 1)
+// CHECK-NEXT: store half %1,
 void test_mov_dpp8_half(half *x, global half *p) {
   *p = __builtin_amdgcn_mov_dpp8(*x, 1);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 269f20e2f53fe1..0c5995be5e098a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -117,45 +117,37 @@ void test_mov_dpp_long(long x, global long *p) {
 }

 // CHECK-LABEL: @test_mov_dpp_float
-// CHECK: %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK: %0 = tail call{{.*}} float @llvm.amdgcn.update.dpp.f32(float poison, float %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store float %0,
 void test_mov_dpp_float(float x, global float *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_mov_dpp_double
-// CHECK: %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 poison, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK: %0 = tail call{{.*}} double @llvm.amdgcn.update.dpp.f64(double poison, double %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store double %0,
 void test_mov_dpp_double(double x, global double *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_mov_dpp_short
-// CHECK: %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK: %0 = tail call{{.*}} i16 @llvm.amdgcn.update.dpp.i16(i16 poison, i16 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i16 %0,
 void test_mov_dpp_short(short x, global short *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_mov_dpp_char
-// CHECK: %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK: %0 = tail call{{.*}} i8 @llvm.amdgcn.update.dpp.i8(i8 poison, i8 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i8 %0,
 void test_mov_dpp_char(char x, global char *p) {
   *p = __builtin_amdgcn_mov_dpp(x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_mov_dpp_half
-// CHECK: %0 = load i16,
-// CHECK: %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK: %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.update.dpp.f16(half poison, half %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store half %1,
 void test_mov_dpp_half(half *x, global half *p) {
   *p = __builtin_amdgcn_mov_dpp(*x, 0x101, 0xf, 0xf, 0);
 }

@@ -175,45 +167,37 @@ void test_update_dpp_long(long x, global long *p) {
 }

 // CHECK-LABEL: @test_update_dpp_float
-// CHECK: %0 = bitcast float %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i32 %1,
+// CHECK: %0 = tail call{{.*}} float @llvm.amdgcn.update.dpp.f32(float %x, float %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store float %0,
 void test_update_dpp_float(float x, global float *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_update_dpp_double
-// CHECK: %0 = bitcast double %x to i64
-// CHECK-NEXT: %1 = tail call{{.*}} i64 @llvm.amdgcn.update.dpp.i64(i64 %0, i64 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: store i64 %1,
+// CHECK: %0 = tail call{{.*}} double @llvm.amdgcn.update.dpp.f64(double %x, double %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store double %0,
 void test_update_dpp_double(double x, global double *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_update_dpp_short
-// CHECK: %0 = zext i16 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i16
-// CHECK-NEXT: store i16 %2,
+// CHECK: %0 = tail call{{.*}} i16 @llvm.amdgcn.update.dpp.i16(i16 %x, i16 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i16 %0,
 void test_update_dpp_short(short x, global short *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_update_dpp_char
-// CHECK: %0 = zext i8 %x to i32
-// CHECK-NEXT: %1 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %0, i32 %0, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %2 = trunc i32 %1 to i8
-// CHECK-NEXT: store i8 %2,
+// CHECK: %0 = tail call{{.*}} i8 @llvm.amdgcn.update.dpp.i8(i8 %x, i8 %x, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store i8 %0,
 void test_update_dpp_char(char x, global char *p) {
   *p = __builtin_amdgcn_update_dpp(x, x, 0x101, 0xf, 0xf, 0);
 }

 // CHECK-LABEL: @test_update_dpp_half
-// CHECK: %0 = load i16,
-// CHECK: %1 = zext i16 %0 to i32
-// CHECK-NEXT: %2 = tail call{{.*}} i32 @llvm.amdgcn.update.dpp.i32(i32 %1, i32 %1, i32 257, i32 15, i32 15, i1 false)
-// CHECK-NEXT: %3 = trunc i32 %2 to i16
-// CHECK-NEXT: store i16 %3,
+// CHECK: %0 = load half,
+// CHECK-NEXT: %1 = tail call{{.*}} half @llvm.amdgcn.update.dpp.f16(half %0, half %0, i32 257, i32 15, i32 15, i1 false)
+// CHECK-NEXT: store half %1,
 void test_update_dpp_half(half *x, global half *p) {
   *p = __builtin_amdgcn_update_dpp(*x, *x, 0x101, 0xf, 0xf, 0);
 }
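In short, for a sub-dword type such as short, the builtin previously round-tripped through i32; it now calls the intrinsic directly on the native type. A condensed view of the test deltas above, in LLVM IR:

  ; before this patch:
  %0 = zext i16 %x to i32
  %1 = tail call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %0, i32 1)
  %2 = trunc i32 %1 to i16

  ; after this patch:
  %0 = tail call i16 @llvm.amdgcn.mov.dpp8.i16(i16 %x, i32 1)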
arsenm left a comment
Should also teach instcombine to fold bitcast + dpp.
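Presumably this refers to a fold along these lines (a hypothetical LLVM IR sketch, not part of this PR): with the intrinsic now overloaded on its data type, a bitcast feeding a dpp call together with the bitcast of its result could be folded into a direct call on the original type:

  ; before the fold:
  %b = bitcast float %x to i32
  %d = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 poison, i32 %b, i32 257, i32 15, i32 15, i1 false)
  %r = bitcast i32 %d to float

  ; after the fold:
  %r = tail call float @llvm.amdgcn.update.dpp.f32(float poison, float %x, i32 257, i32 15, i32 15, i1 false)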
It still needs the downstack change to handle i8: #114887
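For context, the updated char test shows the pattern that depends on that change: the builtin now emits an i8 intrinsic call, and since i8 is not a legal type in the backend, selecting it relies on the lane-op lowering for illegal types from the base PR:

  %0 = tail call i8 @llvm.amdgcn.mov.dpp8.i8(i8 %x, i32 1)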