- Notifications
You must be signed in to change notification settings - Fork 15.3k
[AMDGPU][Clang] Allow amdgpu-waves-per-eu attribute to lower target occupancy range #138284
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals | ||
| // REQUIRES: amdgpu-registered-target | ||
| // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device -emit-llvm -o - %s | FileCheck %s | ||
| | ||
| // COM: Most tests are in the OpenCL semastics, this is just a verification for HIP | ||
| | ||
| #define __global__ __attribute__((global)) | ||
| | ||
| //. | ||
| // CHECK: @__hip_cuid_ = addrspace(1) global i8 0 | ||
| // CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata" | ||
| //. | ||
| // CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone | ||
| // CHECK-LABEL: define {{[^@]+}}@_Z21kernel_waves_per_eu_0v | ||
| // CHECK-SAME: () #[[ATTR0:[0-9]+]] { | ||
| // CHECK-NEXT: entry: | ||
| // CHECK-NEXT: ret void | ||
| // | ||
| __global__ __attribute__((amdgpu_waves_per_eu(0))) void kernel_waves_per_eu_0() {} | ||
| | ||
| // Equivalent to kernel_waves_per_eu_0. | ||
| // CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone | ||
| // CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_0_0v | ||
| // CHECK-SAME: () #[[ATTR0]] { | ||
| // CHECK-NEXT: entry: | ||
| // CHECK-NEXT: ret void | ||
| // | ||
| __global__ __attribute__((amdgpu_waves_per_eu(0, 0))) void kernel_waves_per_eu_0_0() {} | ||
| | ||
| // CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone | ||
| // CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_0_4v | ||
| // CHECK-SAME: () #[[ATTR1:[0-9]+]] { | ||
| // CHECK-NEXT: entry: | ||
| // CHECK-NEXT: ret void | ||
| // | ||
| __global__ __attribute__((amdgpu_waves_per_eu(0, 4))) void kernel_waves_per_eu_0_4() {} | ||
| | ||
| // CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone | ||
| // CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_1_4v | ||
| // CHECK-SAME: () #[[ATTR2:[0-9]+]] { | ||
| // CHECK-NEXT: entry: | ||
| // CHECK-NEXT: ret void | ||
| // | ||
| __global__ __attribute__((amdgpu_waves_per_eu(1, 4))) void kernel_waves_per_eu_1_4() {} | ||
| //. | ||
| // CHECK: attributes #[[ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } | ||
| // CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="0,4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } | ||
| // CHECK: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="1,4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } | ||
| //. | ||
| // CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} | ||
| // CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} | ||
| // CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} | ||
| //. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -46,7 +46,6 @@ __attribute__((amdgpu_num_sgpr(4294967296))) kernel void kernel_num_sgpr_L() {} | |
| __attribute__((amdgpu_num_vgpr(4294967296))) kernel void kernel_num_vgpr_L() {} // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}} | ||
| | ||
| __attribute__((amdgpu_flat_work_group_size(0, 64))) kernel void kernel_flat_work_group_size_0_64() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: max must be 0 since min is 0}} | ||
| __attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: max must be 0 since min is 0}} | ||
| Contributor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing clang codegen test changes that show the new accepted values. This is still not emitting minimums of 0 though, so this is just losing a test? Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [0,4] is now a valid range (i.e., no minimum requested, at most 4) so I moved it below instead of deleting it. I also added some HIP codegen tests. | ||
| | ||
| __attribute__((amdgpu_flat_work_group_size(64, 32))) kernel void kernel_flat_work_group_size_64_32() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: min must not be greater than max}} | ||
| __attribute__((amdgpu_waves_per_eu(4, 2))) kernel void kernel_waves_per_eu_4_2() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: min must not be greater than max}} | ||
| | @@ -61,6 +60,7 @@ __attribute__((amdgpu_num_vgpr(0))) kernel void kernel_num_vgpr_0() {} | |
| | ||
| kernel __attribute__((amdgpu_flat_work_group_size(32, 64))) void kernel_flat_work_group_size_32_64() {} | ||
| kernel __attribute__((amdgpu_waves_per_eu(2))) void kernel_waves_per_eu_2() {} | ||
| kernel __attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {} | ||
| kernel __attribute__((amdgpu_waves_per_eu(2, 4))) void kernel_waves_per_eu_2_4() {} | ||
| kernel __attribute__((amdgpu_num_sgpr(32))) void kernel_num_sgpr_32() {} | ||
| kernel __attribute__((amdgpu_num_vgpr(64))) void kernel_num_vgpr_64() {} | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -57,5 +57,5 @@ entry: | |
| ret void | ||
| } | ||
| | ||
| attributes #0 = { "amdgpu-waves-per-eu"="1,1" } | ||
| attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" } | ||
| attributes #0 = { "amdgpu-waves-per-eu"="1" } | ||
| Contributor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why remove the maximum from the attributes? IIUC this test is supposed to show that the flat-wg-size can override an incompatible waves-per-eu attribute. If we remove the max, that property is no more verified. Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was done to keep the IR checks the same but yes it makes the test no longer really meaningful indeed. The problem is that the flat workgroup size limits (both currently and after this PR) the maximum achievable, but does not limit (only after this PR) the minimum. I can't showcase that by having an absent/maximum flat workgroup size range, since that doesn't restrict the maximum achievable occupancy. When I split this PR I will address that by having a narrower/smaller | ||
| attributes #1 = { "amdgpu-waves-per-eu"="1" "amdgpu-flat-work-group-size"="1,1024" } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should be
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are right, thanks for the catch.