[HIP] Perform implicit pointer cast when compiling HIP, not when -fcuda-is-device #165387

jmmartinez · 2025-10-28T13:23:41Z

When compiling HIP device code, we add implicit casts for the pointer
arguments being passed to builtin calls.

When compiling for the host, apply the same casts for __device__ or __kernel__ functions,
since the device side of the source should still pass type checks.

llvmbot · 2025-10-28T13:24:15Z

@llvm/pr-subscribers-clang

@llvm/pr-subscribers-backend-amdgpu

Author: Juan Manuel Martinez Caamaño (jmmartinez)

Changes

[HIP] Perform implicit pointer cast when compiling device code, not when -fcuda-is-device

When compiling HIP device code, we add implicit casts for the pointer
arguments being passed to builtin calls.

When compiling for the host, apply the same casts for __device__ or __kernel__ functions,
since the device side of the source should still pass type checks.

This patch changes the condition depending on -fcuda-is-device to depend
on whether the builtin's caller is marked as __device__ or __kernel__.

Full diff: https://github.com/llvm/llvm-project/pull/165387.diff

2 Files Affected:

(modified) clang/lib/Sema/SemaExpr.cpp (+4-2)
(modified) clang/test/SemaHIP/amdgpu-gfx950-load-to-lds.hip (+13-13)

diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index a50c27610dc96..1d1b0f5c75905 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6734,8 +6734,10 @@ ExprResult Sema::BuildCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, // If Arg is declared in the default address space and Param is declared // in a non-default address space, perform an implicit address space cast to // the parameter type. - if (getLangOpts().HIP && getLangOpts().CUDAIsDevice && FD && - FD->getBuiltinID()) { + FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda =*/true); + bool CallerIsDevice = Caller && (Caller->hasAttr<CUDAGlobalAttr>() || + Caller->hasAttr<CUDADeviceAttr>()); + if (getLangOpts().HIP && CallerIsDevice && FD && FD->getBuiltinID()) { for (unsigned Idx = 0; Idx < ArgExprs.size() && Idx < FD->param_size(); ++Idx) { ParmVarDecl *Param = FD->getParamDecl(Idx); diff --git a/clang/test/SemaHIP/amdgpu-gfx950-load-to-lds.hip b/clang/test/SemaHIP/amdgpu-gfx950-load-to-lds.hip index 366278f648939..b49c1866caa1c 100644 --- a/clang/test/SemaHIP/amdgpu-gfx950-load-to-lds.hip +++ b/clang/test/SemaHIP/amdgpu-gfx950-load-to-lds.hip @@ -1,7 +1,7 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -fsyntax-only -triple amdgcn -target-cpu gfx950 -verify=device %s -fcuda-is-device -// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify=host %s -// device-no-diagnostics +// RUN: %clang_cc1 -fsyntax-only -triple amdgcn -target-cpu gfx950 -verify %s -fcuda-is-device +// RUN: %clang_cc1 -fsyntax-only -triple x86_64 -aux-triple amdgcn -verify %s +// expected-no-diagnostics #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -20,11 +20,11 @@ __device__ void i_am_device(void* src, __amdgpu_buffer_rsrc_t rsrc, __shared__ v __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, dst, 12, vindex, voffset, soffset, 0, 0); __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, dst, 16, 
vindex, voffset, soffset, 0, 0); - __builtin_amdgcn_load_to_lds(src, dst, 1, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 2, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 4, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 12, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 16, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} + __builtin_amdgcn_load_to_lds(src, dst, 1, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 2, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 4, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 12, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 16, 0, 0); __builtin_amdgcn_global_load_lds(src, dst, 1, 0 , 0); __builtin_amdgcn_global_load_lds(src, dst, 2, 0 , 0); @@ -46,11 +46,11 @@ __global__ void i_am_kernel(void* src, __amdgpu_buffer_rsrc_t rsrc, __shared__ v __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, dst, 12, vindex, voffset, soffset, 0, 0); __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, dst, 16, vindex, voffset, soffset, 0, 0); - __builtin_amdgcn_load_to_lds(src, dst, 1, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 2, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 4, 0, 
0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 12, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} - __builtin_amdgcn_load_to_lds(src, dst, 16, 0, 0); // host-error{{cannot initialize a parameter of type '__attribute__((address_space(3))) void *' with an lvalue of type 'void *'}} + __builtin_amdgcn_load_to_lds(src, dst, 1, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 2, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 4, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 12, 0, 0); + __builtin_amdgcn_load_to_lds(src, dst, 16, 0, 0); __builtin_amdgcn_global_load_lds(src, dst, 1, 0 , 0); __builtin_amdgcn_global_load_lds(src, dst, 2, 0 , 0);

…hen -fcuda-is-device When compiling HIP device code, we add implicit casts for the pointer arguments being passed to builtin calls. When compiling for the host, apply the same casts for __device__ or __kernel__ functions, since the device side of the source should still pass type checks. This patch changes the condition depending on -fcuda-is-device to depend on if the builtin's caller is marked as __device__ or __kernel__. stack-info: PR: llvm#165387, branch: users/jmmartinez/fix/load_lds_typesignature/1

arsenm · 2025-11-05T16:06:41Z

clang/lib/Sema/SemaExpr.cpp

- if (getLangOpts().HIP && getLangOpts().CUDAIsDevice && FD &&
- FD->getBuiltinID()) {
+ FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda =*/true);
+ bool CallerIsDevice = Caller && (Caller->hasAttr<CUDAGlobalAttr>() ||


When would Caller be null? Having the behavior changed based on a particular caller seems bad?

The caller can be null when the expression appears in a global variable assignment for example. It happens for some builtins used in constexpr assignments.

Having the behavior changed based on a particular caller seems bad?

Having this whole implicit cast for pointers when we compile for the device is already a big problem. I suspect these casts are masking some issue elsewhere.

I think the original intention of the author was to allow the relaxed pointer address space casts in device code (that is, when the caller of the builtin is a function marked with __device__ or __global__). But they used CUDAIsDevice instead. More on why we need the casts later.

Since we run semantic analysis on both device and host code in every compilation, even though we only generate code for one side, __device__ functions must still pass semantic analysis during host compilation. Today that is not the case, but it goes unnoticed because we mark several of our builtins as having a meaningless signature.

Why do we need these AS casts in the first place? I'm not 100% sure yet, but I found some cases that I'd expect to pass but that fail without any cast. For example:

// fails with: error: cannot initialize a parameter of type '__shared__ void *' with an lvalue of type 'void *'
__device__ void test_load_to_lds_u32(void* src, __shared__ void *dst) {
  __builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
}

At the same time, I found some cases where I believe compilation should fail, but the implicit AS cast lets them compile.

In this case, for example, shared is not marked as __shared__, while the builtin expects a __shared__ float*. But since we allow the implicit casts, it compiles.

__global__ void test_ds_fmin(float src, float *shared) { volatile float x = __builtin_amdgcn_ds_fminf(shared, src, 0, 0, false); }

jmmartinez · 2025-11-19T09:16:57Z

Ping !

clang/lib/Sema/SemaExpr.cpp

…hen -fcuda-is-device When compiling HIP device code, we add implicit casts for the pointer arguments being passed to builtin calls. When compiling for the host, apply the same casts for __device__ or __kernel__ functions, since the device side of the source should still pass type checks. This patch changes the condition depending on -fcuda-is-device to depend on if the builtin's caller is marked as __device__ or __kernel__. stack-info: PR: #165387, branch: users/jmmartinez/fix/load_lds_typesignature/1

github-actions · 2025-11-21T09:57:19Z

🐧 Linux x64 Test Results

111411 tests passed
4448 tests skipped

jmmartinez · 2025-11-25T16:24:14Z

Ping !

shiltian

LGTM with one nit.

clang/lib/Sema/SemaExpr.cpp

…hen -fcuda-is-device When compiling HIP device code, we add implicit casts for the pointer arguments being passed to builtin calls. When compiling for the host, apply the same casts for __device__ or __kernel__ functions, since the device side of the source should still pass type checks. This patch changes the condition depending on -fcuda-is-device to depend on if the builtin's caller is marked as __device__ or __kernel__. stack-info: PR: #165387, branch: users/jmmartinez/fix/load_lds_typesignature/1

…da-is-device (llvm#165387) When compiling HIP device code, we add implicit casts for the pointer arguments passed to built-in calls. When compiling for the host, apply the same casts, since the device side of the source (device functions and kernels) should still pass type checks.

These tests show how type-checking is performed for `__builtin_amdgcn_load_to_lds`, but not for `__builtin_amdgcn_raw_ptr_buffer_load_lds`, `__builtin_amdgcn_struct_ptr_buffer_load_lds` and `__builtin_amdgcn_global_load_lds` since they are declared with the 't' attribute. Stacked on top of: #165387

…(#165388) These tests show how type-checking is performed for `__builtin_amdgcn_load_to_lds`, but not for `__builtin_amdgcn_raw_ptr_buffer_load_lds`, `__builtin_amdgcn_struct_ptr_buffer_load_lds` and `__builtin_amdgcn_global_load_lds` since they are declared with the 't' attribute. Stacked on top of: llvm/llvm-project#165387

) Allows for type checking depending on the builtin signature. Stacked on top of: #165387 and #165388

…#165389) Allows for type checking depending on the builtin signature. Stacked on top of: llvm#165387 and llvm#165388

…ltins (#165389) Allows for type checking depending on the builtin signature. Stacked on top of: llvm/llvm-project#165387 and llvm/llvm-project#165388

jmmartinez force-pushed the users/jmmartinez/fix/load_lds_typesignature/1 branch from dac020c to ed2f606 Compare October 28, 2025 13:23

llvmbot added clang Clang issues not falling into any other category backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" labels Oct 28, 2025

jmmartinez self-assigned this Oct 28, 2025

jmmartinez requested review from ranapratap55, shiltian and yxsamliu October 28, 2025 13:27

This was referenced Oct 28, 2025

[NFC][HIP] Add __builtin_*_load_lds type check test cases #165388

Merged

[HIP][AMDGPU] Remove 't' from all __builtin_*_load_lds builtins #165389

Merged

jmmartinez requested review from arsenm, gandhi56 and gandhi56-zz October 30, 2025 12:54

arsenm reviewed Nov 5, 2025

View reviewed changes

jmmartinez requested a review from arsenm November 6, 2025 12:48

yxsamliu reviewed Nov 19, 2025

View reviewed changes

clang/lib/Sema/SemaExpr.cpp Outdated Show resolved Hide resolved

jmmartinez force-pushed the users/jmmartinez/fix/load_lds_typesignature/1 branch from 69ab914 to 011550b Compare November 21, 2025 09:14

jmmartinez requested a review from yxsamliu November 21, 2025 09:45

jmmartinez changed the title ~~[HIP] Perform implicit pointer cast when compiling device code, not when -fcuda-is-device~~ [HIP] Perform implicit pointer cast when compiling HIP, not when -fcuda-is-device Nov 21, 2025

shiltian approved these changes Nov 25, 2025

View reviewed changes

clang/lib/Sema/SemaExpr.cpp Outdated Show resolved Hide resolved

jmmartinez added 3 commits November 25, 2025 17:48

[Review] Drop the caller is device condition

c598f1b

[Review] Format comment

cf96a53

jmmartinez force-pushed the users/jmmartinez/fix/load_lds_typesignature/1 branch from 011550b to cf96a53 Compare November 25, 2025 16:48

jmmartinez merged commit 0a35f44 into main Nov 26, 2025
10 checks passed

jmmartinez deleted the users/jmmartinez/fix/load_lds_typesignature/1 branch November 26, 2025 13:03

jmmartinez added a commit that referenced this pull request Nov 28, 2025

[HIP][AMDGPU] Remove 't' from all __builtin_*_load_lds builtins (#165389

318236d

) Allows for type checking depending on the builtin signature. Stacked on top of: #165387 and #165388

aahrun pushed a commit to aahrun/llvm-project that referenced this pull request Dec 1, 2025

[HIP][AMDGPU] Remove 't' from all __builtin_*_load_lds builtins (llvm…

1365da7

…#165389) Allows for type checking depending on the builtin signature. Stacked on top of: llvm#165387 and llvm#165388

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[HIP] Perform implicit pointer cast when compiling HIP, not when -fcuda-is-device #165387

[HIP] Perform implicit pointer cast when compiling HIP, not when -fcuda-is-device #165387

Uh oh!

jmmartinez commented Oct 28, 2025 •

edited

Loading

llvmbot commented Oct 28, 2025 •

edited

Loading

arsenm Nov 5, 2025

jmmartinez Nov 6, 2025

jmmartinez commented Nov 19, 2025

Uh oh!

github-actions bot commented Nov 21, 2025 •

edited

Loading

jmmartinez commented Nov 25, 2025

shiltian left a comment

Uh oh!

Uh oh!

Labels

6 participants

[HIP] Perform implicit pointer cast when compiling HIP, not when -fcuda-is-device #165387

[HIP] Perform implicit pointer cast when compiling HIP, not when -fcuda-is-device #165387

Uh oh!

Conversation

jmmartinez commented Oct 28, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

llvmbot commented Oct 28, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

arsenm Nov 5, 2025

Choose a reason for hiding this comment

jmmartinez Nov 6, 2025

Choose a reason for hiding this comment

jmmartinez commented Nov 19, 2025

Uh oh!

github-actions bot commented Nov 21, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

🐧 Linux x64 Test Results

jmmartinez commented Nov 25, 2025

shiltian left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Labels

6 participants

jmmartinez commented Oct 28, 2025 •

edited

Loading

llvmbot commented Oct 28, 2025 •

edited

Loading

github-actions bot commented Nov 21, 2025 •

edited

Loading