Skip to content
16 changes: 16 additions & 0 deletions clang/include/clang/Basic/Attr.td
Original file line number Diff line number Diff line change
Expand Up @@ -1583,6 +1583,22 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr {
let SupportsNonconformingLambdaSyntax = 1;
}

// [[intel::min_work_groups_per_cu(N)]]: desired minimum number of resident
// work-groups per compute unit. Single expression argument (allows dependent
// expressions in templates, per the ExprArgument form). Device-compile only;
// silently ignored when compiling for the SYCL host. Only valid on functions,
// and misuse on other subjects is an error (ErrorDiag).
def SYCLIntelMinWorkGroupsPerComputeUnit : InheritableAttr {
let Spellings = [CXX11<"intel", "min_work_groups_per_cu">];
let Args = [ExprArgument<"Value">];
let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
let Subjects = SubjectList<[Function], ErrorDiag>;
let Documentation = [SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs];
}

// [[intel::max_work_groups_per_mp(N)]]: desired maximum number of work-groups
// per multiprocessor. Single expression argument; device-compile only and
// silently ignored on the SYCL host. Functions only (ErrorDiag otherwise).
// NOTE(review): the Documentation record name lacks the "Attr" suffix used by
// the sibling record (SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs) — consider
// renaming for consistency (requires a matching rename in AttrDocs.td).
def SYCLIntelMaxWorkGroupsPerMultiprocessor : InheritableAttr {
let Spellings = [CXX11<"intel", "max_work_groups_per_mp">];
let Args = [ExprArgument<"Value">];
let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
let Subjects = SubjectList<[Function], ErrorDiag>;
let Documentation = [SYCLIntelMaxWorkGroupsPerMultiprocessorDocs];
}

def SYCLIntelMaxGlobalWorkDim : InheritableAttr {
let Spellings = [CXX11<"intel", "max_global_work_dim">];
let Args = [ExprArgument<"Value">];
Expand Down
59 changes: 59 additions & 0 deletions clang/include/clang/Basic/AttrDocs.td
Original file line number Diff line number Diff line change
Expand Up @@ -3017,6 +3017,65 @@ In SYCL 2020 mode, the attribute is not propagated to the kernel.
}];
}

// User-facing documentation for [[intel::min_work_groups_per_cu]].
// Grammar fixed: "complies to" -> "corresponds to"; the directive name is set
// in RST literal markup so the leading dot renders correctly.
def SYCLIntelMinWorkGroupsPerComputeUnitAttrDocs: Documentation {
let Category = DocCatFunction;
let Heading = "intel::min_work_groups_per_cu";
let Content = [{
Applies to a device function/lambda function. Indicates the desired minimum
number of resident work_groups per multiprocessor. It corresponds to the
``.minnctapersm`` PTX directive.

.. code-block:: c++

[[intel::min_work_groups_per_cu(2)]] void foo() {}

class Foo {
public:
[[intel::min_work_groups_per_cu(2)]] void operator()() const {}
};

template <int N>
class Functor {
public:
[[intel::min_work_groups_per_cu(N)]] void operator()() const {}
};

template <int N>
[[intel::min_work_groups_per_cu(N)]] void func() {}

}];
}

// User-facing documentation for [[intel::max_work_groups_per_mp]].
// Grammar fixed: "maximum number work_groups" -> "maximum number of
// work_groups"; "complies to" -> "corresponds to"; "Note, that the feature"
// -> "Note that this feature". Directive name set in RST literal markup.
def SYCLIntelMaxWorkGroupsPerMultiprocessorDocs: Documentation {
let Category = DocCatFunction;
let Heading = "intel::max_work_groups_per_mp";
let Content = [{
Applies to a device function/lambda function. Indicates the desired maximum
number of work_groups per cluster with which the application will ever launch.
It corresponds to the ``.maxclusterrank`` PTX directive. Note that this
feature requires SM_90 or higher.

.. code-block:: c++

[[intel::max_work_groups_per_mp(2)]] void foo() {}

class Foo {
public:
[[intel::max_work_groups_per_mp(2)]] void operator()() const {}
};

template <int N>
class Functor {
public:
[[intel::max_work_groups_per_mp(N)]] void operator()() const {}
};

template <int N>
[[intel::max_work_groups_per_mp(N)]] void func() {}

}];
}

def SYCLIntelMaxGlobalWorkDimAttrDocs : Documentation {
let Category = DocCatFunction;
let Heading = "intel::max_global_work_dim";
Expand Down
5 changes: 4 additions & 1 deletion clang/include/clang/Basic/DiagnosticSemaKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -12002,9 +12002,12 @@ def warn_sycl_kernel_return_type : Warning<
def err_sycl_special_type_num_init_method : Error<
"types with 'sycl_special_class' attribute must have one and only one '__init' "
"method defined">;
// Emitted when a launch-bounds-style SYCL attribute is applied while
// compiling for a non-NVPTX device target; the attribute is dropped.
def warn_launch_bounds_is_cuda_specific : Warning<
"%0 attribute ignored, only applicable when targeting Nvidia devices">,
InGroup<IgnoredAttributes>;

// Emitted when max_work_groups_per_mp / maxclusterrank is used on a CUDA
// arch below sm_90, which does not support the .maxclusterrank directive.
// (The diff residue showed both the old and the new spelling of the first
// string literal on adjacent lines; adjacent literals concatenate in
// TableGen, which would have doubled the message — only the quoted
// 'maxclusterrank' form is kept here.)
def warn_cuda_maxclusterrank_sm_90 : Warning<
"'maxclusterrank' requires sm_90 or higher, CUDA arch provided: %0, ignoring "
"%1 attribute">, InGroup<IgnoredAttributes>;

def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must "
Expand Down
10 changes: 10 additions & 0 deletions clang/include/clang/Sema/Sema.h
Original file line number Diff line number Diff line change
Expand Up @@ -11432,6 +11432,16 @@ class Sema final {
SYCLIntelMaxGlobalWorkDimAttr *
MergeSYCLIntelMaxGlobalWorkDimAttr(Decl *D,
const SYCLIntelMaxGlobalWorkDimAttr &A);
/// Attach a [[intel::min_work_groups_per_cu]] attribute built from
/// expression \p E to declaration \p D.
void AddSYCLIntelMinWorkGroupsPerComputeUnitAttr(
Decl *D, const AttributeCommonInfo &CI, Expr *E);
/// Merge a [[intel::min_work_groups_per_cu]] attribute from a previous
/// declaration onto \p D; returns the merged attribute, or null when the
/// attribute is not propagated (see mergeDeclAttribute in SemaDecl.cpp).
SYCLIntelMinWorkGroupsPerComputeUnitAttr *
MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(
Decl *D, const SYCLIntelMinWorkGroupsPerComputeUnitAttr &A);
/// Attach a [[intel::max_work_groups_per_mp]] attribute built from
/// expression \p E to declaration \p D.
void AddSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
Decl *D, const AttributeCommonInfo &CI, Expr *E);
/// Merge a [[intel::max_work_groups_per_mp]] attribute from a previous
/// declaration onto \p D; returns the merged attribute, or null when the
/// attribute is not propagated.
SYCLIntelMaxWorkGroupsPerMultiprocessorAttr *
MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(
Decl *D, const SYCLIntelMaxWorkGroupsPerMultiprocessorAttr &A);
void AddSYCLIntelBankWidthAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *E);
SYCLIntelBankWidthAttr *
Expand Down
18 changes: 18 additions & 0 deletions clang/lib/CodeGen/CodeGenFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,24 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
llvm::MDNode::get(Context, AttrMDArgs));
}

// Fold a constant-evaluated attribute argument into i32 constant metadata.
// NOTE(review): the cast<ConstantExpr> asserts that Sema already
// constant-folded the expression — confirm all accepted arguments reach
// CodeGen as ConstantExpr (templates instantiated, value-dependence resolved).
auto attrAsMDArg = [&](Expr *E) {
const auto *CE = cast<ConstantExpr>(E);
std::optional<llvm::APSInt> ArgVal = CE->getResultAsAPSInt();
return llvm::ConstantAsMetadata::get(
Builder.getInt32(ArgVal->getSExtValue()));
};

// Record [[intel::min_work_groups_per_cu(N)]] as function metadata so the
// backend can translate it into launch-bounds information.
if (const auto *A = FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
Fn->setMetadata("min_work_groups_per_cu",
llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())}));
}

// Record [[intel::max_work_groups_per_mp(N)]] likewise.
if (const auto *A =
FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
Fn->setMetadata("max_work_groups_per_mp",
llvm::MDNode::get(Context, {attrAsMDArg(A->getValue())}));
}

if (const SYCLIntelMaxWorkGroupSizeAttr *A =
FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {

Expand Down
25 changes: 25 additions & 0 deletions clang/lib/CodeGen/Targets/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,31 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
// And kernel functions are not subject to inlining
F->addFnAttr(llvm::Attribute::NoInline);
}
// Translate SYCL work-group attributes into NVVM launch-bounds annotations.
if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
// maxntidx = X * Y * Z: the flattened upper bound on threads per block.
auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() *
(*MWGS->getYDimVal()).getExtValue() *
(*MWGS->getXDimVal()).getExtValue();
if (MaxThreads > 0)
addNVVMMetadata(F, "maxntidx", MaxThreads);

// Extract the constant-folded attribute argument as an unsigned value.
// NOTE(review): sibling code in CodeGenFunction.cpp uses
// std::optional<llvm::APSInt> for the same call; consider matching types.
auto attrValue = [&](Expr *E) {
const auto *CE = cast<ConstantExpr>(E);
std::optional<llvm::APInt> Val = CE->getResultAsAPSInt();
return Val->getZExtValue();
};

if (const auto *MWGPCU =
FD->getAttr<SYCLIntelMinWorkGroupsPerComputeUnitAttr>()) {
// The value is guaranteed to be > 0, pass it to the metadata.
addNVVMMetadata(F, "minnctapersm", attrValue(MWGPCU->getValue()));

// NOTE(review): maxclusterrank is only emitted when BOTH
// max_work_group_size and min_work_groups_per_cu are present (this block
// is nested inside both checks) — confirm that dependency is intentional
// and enforced by Sema, otherwise a lone max_work_groups_per_mp is
// silently dropped here.
if (const auto *MWGPMP =
FD->getAttr<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>()) {
// The value is guaranteed to be > 0, pass it to the metadata.
addNVVMMetadata(F, "maxclusterrank", attrValue(MWGPMP->getValue()));
}
}
}
}

// Perform special handling in CUDA mode.
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/Sema/SemaDecl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2999,6 +2999,12 @@ static bool mergeDeclAttribute(Sema &S, NamedDecl *D,
NewAttr = S.MergeSYCLIntelInitiationIntervalAttr(D, *A);
else if (const auto *A = dyn_cast<SYCLWorkGroupSizeHintAttr>(Attr))
NewAttr = S.MergeSYCLWorkGroupSizeHintAttr(D, *A);
// Propagate the SYCL per-compute-unit / per-multiprocessor work-group
// attributes across redeclarations, mirroring the sibling SYCL attributes
// in this chain.
else if (const auto *A =
dyn_cast<SYCLIntelMinWorkGroupsPerComputeUnitAttr>(Attr))
NewAttr = S.MergeSYCLIntelMinWorkGroupsPerComputeUnitAttr(D, *A);
else if (const auto *A =
dyn_cast<SYCLIntelMaxWorkGroupsPerMultiprocessorAttr>(Attr))
NewAttr = S.MergeSYCLIntelMaxWorkGroupsPerMultiprocessorAttr(D, *A);
else if (const auto *A = dyn_cast<SYCLIntelMaxGlobalWorkDimAttr>(Attr))
NewAttr = S.MergeSYCLIntelMaxGlobalWorkDimAttr(D, *A);
else if (const auto *BTFA = dyn_cast<BTFDeclTagAttr>(Attr))
Expand Down
Loading