Skip to content
16 changes: 16 additions & 0 deletions clang/include/clang/Basic/Attr.td
Original file line number Diff line number Diff line change
Expand Up @@ -1583,6 +1583,22 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr {
let SupportsNonconformingLambdaSyntax = 1;
}

// [[intel::min_work_groups_per_cu(N)]]: desired minimum number of resident
// work-groups per compute unit. Single expression argument (allows dependent
// expressions in templates, per the ExprArgument form). Device-compile only;
// silently ignored when compiling for the SYCL host. Only valid on functions,
// and misuse on other subjects is an error (ErrorDiag).
def SYCLIntelMinWorkGroupsPerComputeUnit : InheritableAttr {
let Spellings = [CXX11<"intel", "min_work_groups_per_cu">];
let Args = [ExprArgument<"Value">];
let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
let Subjects = SubjectList<[Function], ErrorDiag>;
let Documentation = [SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs];
}

// [[intel::max_work_groups_per_mp(N)]]: desired maximum number of work-groups
// per multiprocessor. Single expression argument; device-compile only and
// silently ignored on the SYCL host. Functions only (ErrorDiag otherwise).
// NOTE(review): the Documentation record name lacks the "Attr" suffix used by
// the sibling record (SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs) — consider
// renaming for consistency (requires a matching rename in AttrDocs.td).
def SYCLIntelMaxWorkGroupsPerMultiprocessor : InheritableAttr {
let Spellings = [CXX11<"intel", "max_work_groups_per_mp">];
let Args = [ExprArgument<"Value">];
let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
let Subjects = SubjectList<[Function], ErrorDiag>;
let Documentation = [SYCLIntelMaxWorkGroupsPerMultiprocessorDocs];
}

def SYCLIntelMaxGlobalWorkDim : InheritableAttr {
let Spellings = [CXX11<"intel", "max_global_work_dim">];
let Args = [ExprArgument<"Value">];
Expand Down
59 changes: 59 additions & 0 deletions clang/include/clang/Basic/AttrDocs.td
Original file line number Diff line number Diff line change
Expand Up @@ -3017,6 +3017,65 @@ In SYCL 2020 mode, the attribute is not propagated to the kernel.
}];
}

// User-facing documentation for [[intel::min_work_groups_per_cu]].
// Grammar fixed: "complies to" -> "corresponds to"; the directive name is set
// in RST literal markup so the leading dot renders correctly.
def SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs: Documentation {
let Category = DocCatFunction;
let Heading = "intel::min_work_groups_per_cu";
let Content = [{
Applies to a device function/lambda function. Indicates the desired minimum
number of resident work_groups per multiprocessor. It corresponds to the
``.minnctapersm`` PTX directive.

.. code-block:: c++

[[intel::min_work_groups_per_cu(2)]] void foo() {}

class Foo {
public:
[[intel::min_work_groups_per_cu(2)]] void operator()() const {}
};

template <int N>
class Functor {
public:
[[intel::min_work_groups_per_cu(N)]] void operator()() const {}
};

template <int N>
[[intel::min_work_groups_per_cu(N)]] void func() {}

}];
}

// User-facing documentation for [[intel::max_work_groups_per_mp]].
// Grammar fixed: "maximum number work_groups" -> "maximum number of
// work_groups"; "complies to" -> "corresponds to"; "Note, that the feature"
// -> "Note that this feature". Directive name set in RST literal markup.
def SYCLIntelMaxWorkGroupsPerMultiprocessorDocs: Documentation {
let Category = DocCatFunction;
let Heading = "intel::max_work_groups_per_mp";
let Content = [{
Applies to a device function/lambda function. Indicates the desired maximum
number of work_groups per cluster with which the application will ever launch.
It corresponds to the ``.maxclusterrank`` PTX directive. Note that this
feature requires SM_90 or higher.

.. code-block:: c++

[[intel::max_work_groups_per_mp(2)]] void foo() {}

class Foo {
public:
[[intel::max_work_groups_per_mp(2)]] void operator()() const {}
};

template <int N>
class Functor {
public:
[[intel::max_work_groups_per_mp(N)]] void operator()() const {}
};

template <int N>
[[intel::max_work_groups_per_mp(N)]] void func() {}

}];
}

def SYCLIntelMaxGlobalWorkDimAttrDocs : Documentation {
let Category = DocCatFunction;
let Heading = "intel::max_global_work_dim";
Expand Down
5 changes: 4 additions & 1 deletion clang/include/clang/Basic/DiagnosticSemaKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -12002,9 +12002,12 @@ def warn_sycl_kernel_return_type : Warning<
def err_sycl_special_type_num_init_method : Error<
"types with 'sycl_special_class' attribute must have one and only one '__init' "
"method defined">;
// Emitted when a launch-bounds-style SYCL attribute is applied while
// compiling for a non-NVPTX device target; the attribute is dropped.
def warn_launch_bounds_is_cuda_specific : Warning<
"%0 attribute ignored, only applicable when targeting Nvidia devices">,
InGroup<IgnoredAttributes>;

// Emitted when max_work_groups_per_mp / maxclusterrank is used on a CUDA
// arch below sm_90, which does not support the .maxclusterrank directive.
// (The diff residue showed both the old and the new spelling of the first
// string literal on adjacent lines; adjacent literals concatenate in
// TableGen, which would have doubled the message — only the quoted
// 'maxclusterrank' form is kept here.)
def warn_cuda_maxclusterrank_sm_90 : Warning<
"'maxclusterrank' requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;

def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must "
Expand Down
10 changes: 10 additions & 0 deletions clang/include/clang/Sema/Sema.h
Original file line number Diff line number Diff line change
Expand Up @@ -11432,6 +11432,16 @@ class Sema final {
SYCLIntelMaxGlobalWorkDimAttr *
MergeSYCLIntelMaxGlobalWorkDimAttr(Decl *D,
const SYCLIntelMaxGlobalWorkDimAttr &A);
/// Attach a [[intel::min_work_groups_per_cu]] attribute built from
/// expression \p E to declaration \p D.
void AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(
Decl *D, const AttributeCommonInfo &CI, Expr *E);
/// Merge a [[intel::min_work_groups_per_cu]] attribute from a previous
/// declaration onto \p D; returns the merged attribute, or null when the
/// attribute is not propagated (see mergeDeclAttribute in SemaDecl.cpp).
SYCLIntelMinWorkGroupsPerComputeUnitAttr *
MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(
Decl *D, const SYCLIntelMinWorkGroupsPerComputeUnitAttr &A);
/// Attach a [[intel::max_work_groups_per_mp]] attribute built from
/// expression \p E to declaration \p D.
void AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
Decl *D, const AttributeCommonInfo &CI, Expr *E);
/// Merge a [[intel::max_work_groups_per_mp]] attribute from a previous
/// declaration onto \p D; returns the merged attribute, or null when the
/// attribute is not propagated.
SYCLIntelMaxWorkGroupsPerMultiprocessorAttr *
MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
Decl *D, const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr &A);
void AddSYCLIntelBankWidthAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *E);
SYCLIntelBankWidthAttr *
Expand Down
18 changes: 18 additions & 0 deletions clang/lib/CodeGen/CodeGenFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,24 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
llvm::MDNode::get(Context, AttrMDArgs));
}

// Fold a constant-evaluated attribute argument into i32 constant metadata.
// NOTE(review): the cast<ConstantExpr> asserts that Sema already
// constant-folded the expression — confirm all accepted arguments reach
// CodeGen as ConstantExpr (templates instantiated, value-dependence resolved).
auto attrAsMDArg = [&](Expr *E) {
const auto *CE = cast<ConstantExpr>(E);
std::optional<llvm::APSInt> ArgVal = CE->getResultAsAPSInt();
return llvm::ConstantAsMetadata::get(
Builder.getInt32(ArgVal->getSExtValue()));
};

// Record [[intel::min_work_groups_per_cu(N)]] as function metadata so the
// backend can translate it into launch-bounds information.
if (const auto *A = FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
Fn->setMetadata("min_work_groups_per_cu",
llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())}));
}

// Record [[intel::max_work_groups_per_mp(N)]] likewise.
if (const auto *A =
FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
Fn->setMetadata("max_work_groups_per_mp",
llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())}));
}

if (const SYCLIntelMaxWorkGroupSizeAttr *A =
FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {

Expand Down
25 changes: 25 additions & 0 deletions clang/lib/CodeGen/Targets/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,31 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
// And kernel functions are not subject to inlining
F->addFnAttr(llvm::Attribute::NoInline);
}
// Translate SYCL work-group attributes into NVVM launch-bounds annotations.
if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
// maxntidx = X * Y * Z: the flattened upper bound on threads per block.
auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() *
(*MWGS->getYDimVal()).getExtValue() *
(*MWGS->getXDimVal()).getExtValue();
if (MaxThreads > 0)
addNVVMMetadata(F, "maxntidx", MaxThreads);

// Extract the constant-folded attribute argument as an unsigned value.
// NOTE(review): sibling code in CodeGenFunction.cpp uses
// std::optional<llvm::APSInt> for the same call; consider matching types.
auto attrValue = [&](Expr *E) {
const auto *CE = cast<ConstantExpr>(E);
std::optional<llvm::APInt> Val = CE->getResultAsAPSInt();
return Val->getZExtValue();
};

if (const auto *MWGPCU =
FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
// The value is guaranteed to be > 0, pass it to the metadata.
addNVVMMetadata(F, "minnctapersm", attrValue(MWGPCU->getValue()));

// NOTE(review): maxclusterrank is only emitted when BOTH
// max_work_group_size and min_work_groups_per_cu are present (this block
// is nested inside both checks) — confirm that dependency is intentional
// and enforced by Sema, otherwise a lone max_work_groups_per_mp is
// silently dropped here.
if (const auto *MWGPMP =
FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
// The value is guaranteed to be > 0, pass it to the metadata.
addNVVMMetadata(F, "maxclusterrank", attrValue(MWGPMP->getValue()));
}
}
}
}

// Perform special handling in CUDA mode.
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/Sema/SemaDecl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2999,6 +2999,12 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
NewAttr = S.MergeSYCLIntelInitiationIntervalAttr(D, *A);
else if (const auto *A = dyn_cast<SYCLWorkGroupSizeHintAttr>(Attr))
NewAttr = S.MergeSYCLWorkGroupSizeHintAttr(D, *A);
// Propagate the SYCL per-compute-unit / per-multiprocessor work-group
// attributes across redeclarations, mirroring the sibling SYCL attributes
// in this chain.
else if (const auto *A =
dyn_cast<SYCLIntelMinWorkGroupsPerComputeUnitAttr>(Attr))
NewAttr = S.MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(D, *A);
else if (const auto *A =
dyn_cast<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>(Attr))
NewAttr = S.MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(D, *A);
else if (const auto *A = dyn_cast<SYCLIntelMaxGlobalWorkDimAttr>(Attr))
NewAttr = S.MergeSYCLIntelMaxGlobalWorkDimAttr(D, *A);
else if (const auto *BTFA = dyn_cast<BTFDeclTagAttr>(Attr))
Expand Down
Loading