llvm · lucas-rami · May 22, 2025 · May 22, 2025 · Jun 27, 2025 · dhruvachak
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -743,20 +743,21 @@ void CodeGenModule::handleAMDGPUWavesPerEUAttr(
  llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
  Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
- unsigned Max =
- Attr->getMax()
- ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
- : 0;
 
- if (Min != 0) {
- assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
-
- std::string AttrVal = llvm::utostr(Min);
- if (Max != 0)
- AttrVal = AttrVal + "," + llvm::utostr(Max);
- F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
- } else
- assert(Max == 0 && "Max must be zero");
+  // A missing or zero Max means that no maximum was requested, so (Min, 0)
+  // is emitted as just "Min", exactly like the single-argument form.
+  unsigned Max =
+      Attr->getMax()
+          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
+          : 0;
+  if (Max != 0) {
+    // Min may be 0 here: only the maximum is constrained ("0,Max").
+    assert(Min <= Max && "Min must be less than or equal to Max");
+    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
+    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
+  } else if (Min != 0) {
+    F->addFnAttr("amdgpu-waves-per-eu", llvm::utostr(Min));
+  } // Otherwise min,max = 0,0 (or just 0): nothing requested, no attribute.
 }
 
 std::unique_ptr<TargetCodeGenInfo>

diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -245,11 +245,6 @@ static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
  if (MaxExpr && !S.checkUInt32Argument(Attr, MaxExpr, Max, 1))
  return true;
 
- if (Min == 0 && Max != 0) {
- S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
- << &Attr << 0;
- return true;
- }
  if (Max != 0 && Min > Max) {
  S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
  << &Attr << 1;

diff --git a/clang/test/CodeGenHIP/amdgpu-waves-per-eu.hip b/clang/test/CodeGenHIP/amdgpu-waves-per-eu.hip
@@ -0,0 +1,53 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device -emit-llvm -o - %s | FileCheck %s
+
+// COM: Most tests use OpenCL semantics; this is just a verification for HIP.
+
+#define __global__ __attribute__((global))
+
+//.
+// CHECK: @__hip_cuid_ = addrspace(1) global i8 0
+// CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata"
+//.
+// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@_Z21kernel_waves_per_eu_0v
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+__global__ __attribute__((amdgpu_waves_per_eu(0))) void kernel_waves_per_eu_0() {}
+
+// Equivalent to kernel_waves_per_eu_0.
+// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_0_0v
+// CHECK-SAME: () #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+__global__ __attribute__((amdgpu_waves_per_eu(0, 0))) void kernel_waves_per_eu_0_0() {}
+
+// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_0_4v
+// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+__global__ __attribute__((amdgpu_waves_per_eu(0, 4))) void kernel_waves_per_eu_0_4() {}
+
+// CHECK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@_Z23kernel_waves_per_eu_1_4v
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: ret void
+//
+__global__ __attribute__((amdgpu_waves_per_eu(1, 4))) void kernel_waves_per_eu_1_4() {}
+//.
+// CHECK: attributes #[[ATTR0]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// CHECK: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="0,4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// CHECK: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "amdgpu-waves-per-eu"="1,4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
+// CHECK: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+// CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+//.
diff --git a/clang/test/SemaOpenCL/amdgpu-attrs.cl b/clang/test/SemaOpenCL/amdgpu-attrs.cl
@@ -46,7 +46,6 @@ __attribute__((amdgpu_num_sgpr(4294967296))) kernel void kernel_num_sgpr_L() {}
 __attribute__((amdgpu_num_vgpr(4294967296))) kernel void kernel_num_vgpr_L() {} // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
 
 __attribute__((amdgpu_flat_work_group_size(0, 64))) kernel void kernel_flat_work_group_size_0_64() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: max must be 0 since min is 0}}
-__attribute__((amdgpu_waves_per_eu(0, 4))) kernel void kernel_waves_per_eu_0_4() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: max must be 0 since min is 0}}
 
 __attribute__((amdgpu_flat_work_group_size(64, 32))) kernel void kernel_flat_work_group_size_64_32() {} // expected-error {{'amdgpu_flat_work_group_size' attribute argument is invalid: min must not be greater than max}}
 __attribute__((amdgpu_waves_per_eu(4, 2))) kernel void kernel_waves_per_eu_4_2() {} // expected-error {{'amdgpu_waves_per_eu' attribute argument is invalid: min must not be greater than max}}
@@ -61,6 +60,7 @@ __attribute__((amdgpu_num_vgpr(0))) kernel void kernel_num_vgpr_0() {}
 
 kernel __attribute__((amdgpu_flat_work_group_size(32, 64))) void kernel_flat_work_group_size_32_64() {}
 kernel __attribute__((amdgpu_waves_per_eu(2))) void kernel_waves_per_eu_2() {}
+kernel __attribute__((amdgpu_waves_per_eu(0, 4))) void kernel_waves_per_eu_0_4() {}
 kernel __attribute__((amdgpu_waves_per_eu(2, 4))) void kernel_waves_per_eu_2_4() {}
 kernel __attribute__((amdgpu_num_sgpr(32))) void kernel_num_sgpr_32() {}
 kernel __attribute__((amdgpu_num_vgpr(64))) void kernel_num_vgpr_64() {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -156,15 +156,15 @@ AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  }
 }
 
-std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
- const Function &F) const {
+std::pair<unsigned, unsigned>
+AMDGPUSubtarget::getFlatWorkGroupSizes(const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
- getDefaultFlatWorkGroupSize(F.getCallingConv());
+  getDefaultFlatWorkGroupSize(F.getCallingConv());
 
  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
- F, "amdgpu-flat-work-group-size", Default);
+  F, "amdgpu-flat-work-group-size", Default);
 
  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
@@ -186,23 +186,29 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
  // sizes limits the achievable maximum, and we aim to support enough waves per
  // EU so that we can concurrently execute all waves of a single workgroup of
  // maximum size on a CU.
- std::pair<unsigned, unsigned> Default = {
+ std::pair<unsigned, unsigned> WavesPerEU = {
  getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
  getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
- Default.first = std::min(Default.first, Default.second);
-
- // Make sure requested minimum is within the default range and lower than the
- // requested maximum. The latter must not violate target specification.
- if (RequestedWavesPerEU.first < Default.first ||
- RequestedWavesPerEU.first > Default.second ||
- RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
- RequestedWavesPerEU.second > getMaxWavesPerEU())
- return Default;
-
- // We cannot exceed maximum occupancy implied by flat workgroup size and LDS.
- RequestedWavesPerEU.second =
- std::min(RequestedWavesPerEU.second, Default.second);
- return RequestedWavesPerEU;
+ WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
+
+ // Requested minimum must not violate subtarget's specifications and be no
+ // greater than maximum.
+ if (RequestedWavesPerEU.first &&
+ (RequestedWavesPerEU.first < getMinWavesPerEU() ||
+ RequestedWavesPerEU.first > RequestedWavesPerEU.second))
+ return WavesPerEU;
+ // Requested maximum must not violate subtarget's specifications.
+ if (RequestedWavesPerEU.second > getMaxWavesPerEU())
+ return WavesPerEU;
+
+ // A requested maximum may limit both the final minimum and maximum, but
+ // not increase them. A requested minimum can either decrease or increase the
+ // default minimum as long as it doesn't exceed the maximum.
+ WavesPerEU.second = std::min(WavesPerEU.second, RequestedWavesPerEU.second);
+ if (RequestedWavesPerEU.first)
+ WavesPerEU.first = RequestedWavesPerEU.first;
+ WavesPerEU.first = std::min(WavesPerEU.first, WavesPerEU.second);
+ return WavesPerEU;
 }
 
 std::pair<unsigned, unsigned>
@@ -229,7 +235,7 @@ std::pair<unsigned, unsigned>
 AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
  unsigned LDSBytes, const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
- std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
+ std::pair<unsigned, unsigned> Default(0, getMaxWavesPerEU());
 
  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -128,7 +128,9 @@ class AMDGPUSubtarget {
  /// Returns the target minimum/maximum number of waves per EU. This is based
  /// on the minimum/maximum number of \p RequestedWavesPerEU and further
  /// limited by the maximum achievable occupancy derived from the range of \p
- /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
+ /// FlatWorkGroupSizes and number of \p LDSBytes per workgroup. A
+ /// minimum requested waves/EU value of 0 indicates an intent to not restrict
+ /// the minimum target occupancy.
  std::pair<unsigned, unsigned>
  getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
  std::pair<unsigned, unsigned> FlatWorkGroupSizes,

diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -225,3 +225,15 @@ entry:
  ret void
 }
 attributes #12 = {"amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,10" "amdgpu-lds-size"="16384"}
+
+; At most 2 waves per execution unit.
+; CHECK-LABEL: {{^}}empty_at_most_2:
+; CHECK: SGPRBlocks: 12
+; CHECK: VGPRBlocks: 21
+; CHECK: NumSGPRsForWavesPerEU: 102
+; CHECK: NumVGPRsForWavesPerEU: 85
+define amdgpu_kernel void @empty_at_most_2() #13 {
+entry:
+ ret void
+}
+attributes #13 = {"amdgpu-waves-per-eu"="0,2"}
diff --git a/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/default-flat-work-group-size-overrides-waves-per-eu.ll
@@ -57,5 +57,5 @@ entry:
  ret void
 }
 
-attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
-attributes #1 = { "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1024" }
+attributes #0 = { "amdgpu-waves-per-eu"="1" }
+attributes #1 = { "amdgpu-waves-per-eu"="1" "amdgpu-flat-work-group-size"="1,1024" }