- Notifications
You must be signed in to change notification settings - Fork 15.3k
[HIP] Perform implicit pointer cast when compiling HIP, not when -fcuda-is-device #165387
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[HIP] Perform implicit pointer cast when compiling HIP, not when -fcuda-is-device #165387
Conversation
dac020c to ed2f606 Compare | @llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-amdgpu Author: Juan Manuel Martinez Caamaño (jmmartinez) Changes[HIP] Perform implicit pointer cast when compiling device code, not when -fcuda-is-device When compiling HIP device code, we add implicit casts for the pointer When compiling for the host, apply the same casts for device or kernel functions, This patch changes the condition depending on -fcuda-is-device to depend Full diff: https://github.com/llvm/llvm-project/pull/165387.diff 2 Files Affected:
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index a50c27610dc96..1d1b0f5c75905 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6734,8 +6734,10 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, // If Arg is declared in the default address space and Param is declared // in a non-default address space, perform an implicit address space cast to // the parameter type. - if (getLangOpts().HIP && getLangOpts().CUDAIsDevice && FD && - FD->getBuiltinID()) { + FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda =*/true); + bool CallerIsDevice = Caller && (Caller->hasAttr<CUDAGlobalAttr>() || + Caller->hasAttr<CUDADeviceAttr>()); + if (getLangOpts().HIP && CallerIsDevice && FD && FD->getBuiltinID()) { for (unsigned Idx = 0; Idx < ArgExprs.size() && Idx < FD->param_size(); ++Idx) { ParmVarDecl *Param = FD->getParamDecl(Idx); diff --git a/clang/test/SemaHIP/amdgpu-gfx950-load-to-lds.hip b/clang/test/SemaHIP/amdgpu-gfx950-load-to-lds.hip index 366278f648939..b49c1866caa1c 100644 --- a/clang/test/SemaHIP/amdgpu-gfx950-load-to-lds.hip +++ b/clang/test/SemaHIP/amdgpu-gfx950-load-to-lds.hip @@ -1,7 +1,7 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -fsyntax-only -triple amdgcn -target-cpu gfx950 -verify=device %s -fcuda-is-device -// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify=host %s -// device-no-diagnostics +// RUN: %clang_cc1 -fsyntax-only -triple amdgcn -target-cpu gfx950 -verify %s -fcuda-is-device +// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify %s +// expected-no-diagnostics #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -20,11 +20,11 @@ __device__ void i_am_device(void* src, __amdgpu_buffer_rsrc_t rsrc, __shared__ v __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, dst, 12, vindex, voffset, soffset, 0, 0); __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, dst, 16, vindex, voffset, soffset, 0, 0); - __builtin_amdgcn_load_to_lds(src, dst, 1, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 2, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 4, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 12, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 16, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} + __builtin_amdgcn_load_to_lds(src, dst, 1, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 2, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 4, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 12, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 16, 0, 0); __builtin_amdgcn_global_load_lds(src, dst, 1, 0 , 0); __builtin_amdgcn_global_load_lds(src, dst, 2, 0 , 0); @@ -46,11 +46,11 @@ __global__ void i_am_kernel(void* src, __amdgpu_buffer_rsrc_t rsrc, __shared__ v __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, dst, 12, vindex, voffset, soffset, 0, 0); __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, dst, 16, vindex, voffset, soffset, 0, 0); - __builtin_amdgcn_load_to_lds(src, dst, 1, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 2, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 4, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 12, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 16, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} + __builtin_amdgcn_load_to_lds(src, dst, 1, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 2, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 4, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 12, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 16, 0, 0); __builtin_amdgcn_global_load_lds(src, dst, 1, 0 , 0); __builtin_amdgcn_global_load_lds(src, dst, 2, 0 , 0); |
…hen -fcuda-is-device When compiling HIP device code, we add implicit casts for the pointer arguments being passed to builtin calls. When compiling for the host, apply the same casts for __device__ or __kernel__ functions, since the device side of the source should still pass type checks. This patch changes the condition depending on -fcuda-is-device to depend on if the builtin's caller is marked as __device__ or __kernel__. stack-info: PR: llvm#165387, branch: users/jmmartinez/fix/load_lds_typesignature/1
clang/lib/Sema/SemaExpr.cpp Outdated
| if (getLangOpts().HIP && getLangOpts().CUDAIsDevice && FD && | ||
| FD->getBuiltinID()) { | ||
| FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda =*/true); | ||
| bool CallerIsDevice = Caller && (Caller->hasAttr<CUDAGlobalAttr>() || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When would Caller be null? Having the behavior changed based on a particular caller seems bad?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The caller can be null when the expression appears in a global variable assignment for example. It happens for some builtins used in constexpr assignments.
Having the behavior changed based on a particular caller seems bad?
Having this whole implicit cast for pointers when we compile for the device is already a big problem. I suspect these casts are masking some issue elsewhere.
I think the original intention of the author was to allow for the relaxed pointer address space casts on device code (so, the caller of the builtin is a function marked with device or global). But they used CUDAIsDevice instead. More on why we need the casts later.
Since when we compile we do semantic analysis of device and host code despite not generating code for both cases, the semantic analysis for __device__ functions must still validate even for host compilation. But today it is not the case, but it's not a problem since we mark several of our builtins as having a meaningless signature).
Why do we need these AS in the first place? I'm not 100% sure yet, but I found some cases that I'd expect to pass that fail without any cast. For example:
// fails with: error: cannot initialize a parameter of type '__shared__ void *' with an lvalue of type 'void *' __device__ void test_load_to_lds_u32(void* src, __shared__ void *dst) { __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0); }At the same time, I found some cases where I believe compilation should fail but having the implicit AS cast allows for it.
This case for example, shared is not marked as __shared__, while the builtin expects a __shared__ float*. But since we allow for the implicit casts it compiles.
__global__ void test_ds_fmin(float src, float *shared) { volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false); }| Ping ! |
69ab914 to 011550b Compare …hen -fcuda-is-device When compiling HIP device code, we add implicit casts for the pointer arguments being passed to builtin calls. When compiling for the host, apply the same casts for __device__ or __kernel__ functions, since the device side of the source should still pass type checks. This patch changes the condition depending on -fcuda-is-device to depend on if the builtin's caller is marked as __device__ or __kernel__. stack-info: PR: #165387, branch: users/jmmartinez/fix/load_lds_typesignature/1
🐧 Linux x64 Test Results
|
| Ping ! |
shiltian left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with one nit.
…hen -fcuda-is-device When compiling HIP device code, we add implicit casts for the pointer arguments being passed to builtin calls. When compiling for the host, apply the same casts for __device__ or __kernel__ functions, since the device side of the source should still pass type checks. This patch changes the condition depending on -fcuda-is-device to depend on if the builtin's caller is marked as __device__ or __kernel__. stack-info: PR: #165387, branch: users/jmmartinez/fix/load_lds_typesignature/1
011550b to cf96a53 Compare …da-is-device (llvm#165387) When compiling HIP device code, we add implicit casts for the pointer arguments passed to built-in calls. When compiling for the host, apply the same casts, since the device side of the source (device functions and kernels) should still pass type checks.
…da-is-device (llvm#165387) When compiling HIP device code, we add implicit casts for the pointer arguments passed to built-in calls. When compiling for the host, apply the same casts, since the device side of the source (device functions and kernels) should still pass type checks.
This tests show how type-checking is performed for `__builtin_amdgcn_load_to_lds`, but not for `__builtin_amdgcn_raw_ptr_buffer_load_lds`, `__builtin_amdgcn_struct_ptr_buffer_load_lds` and `__builtin_amdgcn_global_load_lds` since they are declared with the 't' attribute. Stacked on top of: #165387
…(#165388) This tests show how type-checking is performed for `__builtin_amdgcn_load_to_lds`, but not for `__builtin_amdgcn_raw_ptr_buffer_load_lds`, `__builtin_amdgcn_struct_ptr_buffer_load_lds` and `__builtin_amdgcn_global_load_lds` since they are declared with the 't' attribute. Stacked on top of: llvm/llvm-project#165387
…#165389) Allows for type checking depending on the builtin signature. Stacked on top of: llvm#165387 and llvm#165388
…ltins (#165389) Allows for type checking depending on the builtin signature. Stacked on top of: llvm/llvm-project#165387 and llvm/llvm-project#165388
When compiling HIP device code, we add implicit casts for the pointer
arguments being passed to builtin calls.
When compiling for the host, apply the same casts for
__device__or__kernel__functions,since the device side of the source should still pass type checks.