Skip to content

Commit b334921

Browse files
authored
[SYCL][Graph] Add support for handler-less graph submission (#20690)
- Adds `submit_graph_direct_with_event_impl` and `submit_graph_direct_without_event_impl` to the ABI which invoke a handler-less path for graph submission for `queue::ext_oneapi_graph` and the free function `execute_graph`. - Adjusts handler-less `submit_direct` utility to be more general: support submissions which may contain host task, move scheduler bypass logic to callback functor, and parameterize submission CGType. - Extends graph test coverage to cover identified gaps: recording handler-less graph submission and host task cases with dependencies.
1 parent 4b4d451 commit b334921

File tree

13 files changed

+353
-75
lines changed

13 files changed

+353
-75
lines changed

sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,8 @@ inline void execute_graph(handler &CGH,
450450
inline void execute_graph(queue Q, command_graph<graph_state::executable> &G,
451451
const sycl::detail::code_location &CodeLoc =
452452
sycl::detail::code_location::current()) {
453-
submit(std::move(Q), [&](handler &CGH) { execute_graph(CGH, G); }, CodeLoc);
453+
submit_graph_direct_without_event_impl(std::move(Q), G, /*DepEvents*/ {},
454+
CodeLoc);
454455
}
455456

456457
} // namespace ext::oneapi::experimental

sycl/include/sycl/queue.hpp

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,20 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
8181
const detail::KernelPropertyHolderStructTy &Props,
8282
const detail::code_location &CodeLoc, bool IsTopCodeLoc);
8383

84+
event __SYCL_EXPORT submit_graph_direct_with_event_impl(
85+
const queue &Queue,
86+
ext::oneapi::experimental::command_graph<
87+
ext::oneapi::experimental::graph_state::executable> &G,
88+
sycl::span<const event> DepEvents,
89+
const detail::code_location &CodeLoc = detail::code_location::current());
90+
91+
void __SYCL_EXPORT submit_graph_direct_without_event_impl(
92+
const queue &Queue,
93+
ext::oneapi::experimental::command_graph<
94+
ext::oneapi::experimental::graph_state::executable> &G,
95+
sycl::span<const event> DepEvents,
96+
const detail::code_location &CodeLoc = detail::code_location::current());
97+
8498
namespace detail {
8599
class queue_impl;
86100

@@ -3651,7 +3665,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
36513665
ext::oneapi::experimental::graph_state::executable>
36523666
Graph,
36533667
const detail::code_location &CodeLoc = detail::code_location::current()) {
3654-
return submit([&](handler &CGH) { CGH.ext_oneapi_graph(Graph); }, CodeLoc);
3668+
return submit_graph_direct_with_event_impl(*this, Graph, /*DepEvents*/ {},
3669+
CodeLoc);
36553670
}
36563671

36573672
/// Shortcut for executing a graph of commands with a single dependency.
@@ -3666,12 +3681,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
36663681
Graph,
36673682
event DepEvent,
36683683
const detail::code_location &CodeLoc = detail::code_location::current()) {
3669-
return submit(
3670-
[&](handler &CGH) {
3671-
CGH.depends_on(DepEvent);
3672-
CGH.ext_oneapi_graph(Graph);
3673-
},
3674-
CodeLoc);
3684+
return submit_graph_direct_with_event_impl(
3685+
*this, Graph, sycl::span<const event>(&DepEvent, 1), CodeLoc);
36753686
}
36763687

36773688
/// Shortcut for executing a graph of commands with multiple dependencies.
@@ -3686,12 +3697,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
36863697
Graph,
36873698
const std::vector<event> &DepEvents,
36883699
const detail::code_location &CodeLoc = detail::code_location::current()) {
3689-
return submit(
3690-
[&](handler &CGH) {
3691-
CGH.depends_on(DepEvents);
3692-
CGH.ext_oneapi_graph(Graph);
3693-
},
3694-
CodeLoc);
3700+
return submit_graph_direct_with_event_impl(*this, Graph, DepEvents,
3701+
CodeLoc);
36953702
}
36963703

36973704
/// Provides a hint to the runtime that previously issued commands to this

sycl/source/detail/graph/graph_impl.cpp

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,7 +1204,7 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
12041204
return SignalEvent;
12051205
}
12061206

1207-
EventImplPtr
1207+
std::pair<EventImplPtr, bool>
12081208
exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
12091209
sycl::detail::CG::StorageInitHelper CGData,
12101210
bool EventNeeded) {
@@ -1213,19 +1213,17 @@ exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
12131213
cleanupExecutionEvents(MSchedulerDependencies);
12141214
CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(),
12151215
MSchedulerDependencies.end());
1216-
12171216
bool IsCGDataSafeForSchedulerBypass =
12181217
detail::Scheduler::areEventsSafeForSchedulerBypass(
12191218
CGData.MEvents, Queue.getContextImpl()) &&
12201219
CGData.MRequirements.empty();
1220+
bool SkipScheduler = IsCGDataSafeForSchedulerBypass && !MContainsHostTask;
12211221

12221222
// This variable represents the returned event. It will always be nullptr if
12231223
// EventNeeded is false.
12241224
EventImplPtr SignalEvent;
1225-
12261225
if (!MContainsHostTask) {
1227-
bool SkipScheduler =
1228-
IsCGDataSafeForSchedulerBypass && MPartitions[0]->MRequirements.empty();
1226+
SkipScheduler = SkipScheduler && MPartitions[0]->MRequirements.empty();
12291227
if (SkipScheduler) {
12301228
SignalEvent = enqueuePartitionDirectly(MPartitions[0], Queue,
12311229
CGData.MEvents, EventNeeded);
@@ -1258,7 +1256,7 @@ exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
12581256
SignalEvent->setProfilingEnabled(MEnableProfiling);
12591257
}
12601258

1261-
return SignalEvent;
1259+
return {SignalEvent, SkipScheduler};
12621260
}
12631261

12641262
void exec_graph_impl::duplicateNodes() {

sycl/source/detail/graph/graph_impl.hpp

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -640,11 +640,14 @@ class exec_graph_impl {
640640
/// @param CGData Command-group data provided by the sycl::handler
641641
/// @param EventNeeded Whether an event signalling the completion of this
642642
/// operation needs to be returned.
643-
/// @return Returns an event if EventNeeded is true. Returns nullptr
644-
/// otherwise.
645-
EventImplPtr enqueue(sycl::detail::queue_impl &Queue,
646-
sycl::detail::CG::StorageInitHelper CGData,
647-
bool EventNeeded);
643+
/// @return Returns a pair of an event and a boolean indicating whether the
644+
/// scheduler was bypassed. If an event is required, then the first element of
645+
/// the pair is the event representing the execution of the graph. If no event
646+
/// is required, the first element is nullptr. The second element is true if
647+
/// the scheduler was bypassed, false otherwise.
648+
std::pair<EventImplPtr, bool>
649+
enqueue(sycl::detail::queue_impl &Queue,
650+
sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded);
648651

649652
/// Iterates through all the nodes in the graph to build the list of
650653
/// accessor requirements for the whole graph and for each partition.

sycl/source/detail/queue_impl.cpp

Lines changed: 106 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,16 @@ void queue_impl::addEvent(const detail::EventImplPtr &EventImpl) {
284284
}
285285
}
286286

287+
void queue_impl::addEventUnlocked(const detail::EventImplPtr &EventImpl) {
288+
if (!EventImpl)
289+
return;
290+
Command *Cmd = EventImpl->getCommand();
291+
if (Cmd != nullptr && EventImpl->getHandle() == nullptr) {
292+
std::weak_ptr<event_impl> EventWeakPtr{EventImpl};
293+
MEventsWeak.push_back(std::move(EventWeakPtr));
294+
}
295+
}
296+
287297
detail::EventImplPtr
288298
queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
289299
bool CallerNeedsEvent, const detail::code_location &Loc,
@@ -528,16 +538,23 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
528538
KData.validateAndSetKernelLaunchProperties(Props, hasCommandGraph(),
529539
getDeviceImpl());
530540

531-
auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &&CGData,
532-
bool SchedulerBypass) -> EventImplPtr {
541+
auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &&CGData)
542+
-> std::pair<EventImplPtr, bool> {
543+
bool SchedulerBypass =
544+
(CGData.MEvents.size() > 0
545+
? detail::Scheduler::areEventsSafeForSchedulerBypass(
546+
CGData.MEvents, getContextImpl())
547+
: true) &&
548+
!hasCommandGraph();
533549
if (SchedulerBypass) {
534550
// No need to copy/move the kernel function, so we set
535551
// the function pointer to the original function
536552
KData.setKernelFunc(HostKernel.getPtr());
537553

538-
return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
539-
CallerNeedsEvent, nullptr, nullptr,
540-
CodeLoc, IsTopCodeLoc);
554+
return {submit_kernel_scheduler_bypass(KData, CGData.MEvents,
555+
CallerNeedsEvent, nullptr, nullptr,
556+
CodeLoc, IsTopCodeLoc),
557+
/*SchedulerBypass*/ true};
541558
}
542559
std::unique_ptr<detail::CG> CommandGroup;
543560
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
@@ -565,57 +582,101 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
565582
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
566583

567584
if (auto GraphImpl = getCommandGraph(); GraphImpl) {
568-
return submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
569-
detail::CGType::Kernel);
585+
return {submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
586+
detail::CGType::Kernel),
587+
/*SchedulerBypass*/ false};
570588
}
571589

572-
return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
573-
*this, true);
590+
return {detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
591+
*this, true),
592+
/*SchedulerBypass*/ false};
574593
};
575594

576-
return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc);
595+
return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc,
596+
detail::CGType::Kernel,
597+
/*InsertBarrierForInOrderCommand*/ false);
598+
}
599+
600+
EventImplPtr queue_impl::submit_graph_direct_impl(
601+
std::shared_ptr<ext::oneapi::experimental::detail::exec_graph_impl>
602+
ExecGraph,
603+
bool CallerNeedsEvent, sycl::span<const event> DepEvents,
604+
[[maybe_unused]] const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
605+
bool EventNeeded = CallerNeedsEvent || ExecGraph->containsHostTask() ||
606+
!supportsDiscardingPiEvents();
607+
auto SubmitGraphFunc = [&](detail::CG::StorageInitHelper &&CGData)
608+
-> std::pair<EventImplPtr, bool> {
609+
if (auto ParentGraph = getCommandGraph(); ParentGraph) {
610+
std::unique_ptr<detail::CG> CommandGroup;
611+
{
612+
ext::oneapi::experimental::detail::graph_impl::ReadLock ExecLock(
613+
ExecGraph->MMutex);
614+
CGData.MRequirements = ExecGraph->getRequirements();
615+
}
616+
// Here we are using the CommandGroup without passing a CommandBuffer to
617+
// pass the exec_graph_impl and event dependencies. Since this subgraph
618+
// CG will not be executed this is fine.
619+
CommandGroup.reset(
620+
new sycl::detail::CGExecCommandBuffer(nullptr, ExecGraph, CGData));
621+
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
622+
return {submit_command_to_graph(*ParentGraph, std::move(CommandGroup),
623+
detail::CGType::ExecCommandBuffer),
624+
/*SchedulerBypass*/ false};
625+
} else {
626+
return ExecGraph->enqueue(*this, CGData, EventNeeded);
627+
}
628+
};
629+
// If the graph contains a host task, we may need to insert a barrier prior
630+
// to submission to ensure correct ordering with in-order queues.
631+
return submit_direct(CallerNeedsEvent, DepEvents, SubmitGraphFunc,
632+
detail::CGType::ExecCommandBuffer,
633+
ExecGraph->containsHostTask());
577634
}
578635

579636
template <typename SubmitCommandFuncType>
580-
detail::EventImplPtr
581-
queue_impl::submit_direct(bool CallerNeedsEvent,
582-
sycl::span<const event> DepEvents,
583-
SubmitCommandFuncType &SubmitCommandFunc) {
637+
detail::EventImplPtr queue_impl::submit_direct(
638+
bool CallerNeedsEvent, sycl::span<const event> DepEvents,
639+
SubmitCommandFuncType &SubmitCommandFunc, detail::CGType Type,
640+
bool InsertBarrierForInOrderCommand) {
584641
detail::CG::StorageInitHelper CGData;
585642
std::unique_lock<std::mutex> Lock(MMutex);
586-
587-
// Used by queue_empty() and getLastEvent()
588-
MEmpty.store(false, std::memory_order_release);
643+
const bool inOrder = isInOrder();
589644

590645
// Sync with an external event
591646
std::optional<event> ExternalEvent = popExternalEvent();
592647
if (ExternalEvent) {
593648
registerEventDependency</*LockQueue*/ false>(
594649
getSyclObjImpl(*ExternalEvent), CGData.MEvents, this, getContextImpl(),
595650
getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
596-
detail::CGType::Kernel);
651+
Type);
597652
}
598653

599654
auto &Deps = hasCommandGraph() ? MExtGraphDeps : MDefaultGraphDeps;
600655

601656
// Sync with the last event for in order queue
602657
EventImplPtr &LastEvent = Deps.LastEventPtr;
603-
if (isInOrder() && LastEvent) {
658+
if (inOrder && LastEvent) {
604659
registerEventDependency</*LockQueue*/ false>(
605660
LastEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
606-
hasCommandGraph() ? getCommandGraph().get() : nullptr,
607-
detail::CGType::Kernel);
661+
hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
662+
} else if (inOrder && !MEmpty.load(std::memory_order_acquire) &&
663+
InsertBarrierForInOrderCommand) {
664+
// A barrier is injected to ensure ordering with prior commands
665+
auto ResEvent = insertHelperBarrier();
666+
registerEventDependency</*LockQueue*/ false>(
667+
ResEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
668+
hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
608669
}
609670

610671
for (event e : DepEvents) {
611672
registerEventDependency</*LockQueue*/ false>(
612673
getSyclObjImpl(e), CGData.MEvents, this, getContextImpl(),
613674
getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
614-
detail::CGType::Kernel);
675+
Type);
615676
}
616677

617678
// Barrier and un-enqueued commands synchronization for out or order queue
618-
if (!isInOrder()) {
679+
if (!inOrder) {
619680
MMissedCleanupRequests.unset(
620681
[&](MissedCleanupRequestsType &MissedCleanupRequests) {
621682
for (auto &UpdatedGraph : MissedCleanupRequests)
@@ -628,32 +689,30 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
628689
}
629690
}
630691

631-
bool SchedulerBypass =
632-
(CGData.MEvents.size() > 0
633-
? detail::Scheduler::areEventsSafeForSchedulerBypass(
634-
CGData.MEvents, getContextImpl())
635-
: true) &&
636-
!hasCommandGraph();
692+
// Used by queue_empty() and getLastEvent()
693+
MEmpty.store(false, std::memory_order_release);
694+
695+
auto [EventImpl, SchedulerBypass] = SubmitCommandFunc(std::move(CGData));
637696

638697
// Synchronize with the "no last event mode", used by the handler-based
639698
// kernel submit path
640-
MNoLastEventMode.store(isInOrder() && SchedulerBypass,
641-
std::memory_order_relaxed);
642-
643-
EventImplPtr EventImpl =
644-
SubmitCommandFunc(std::move(CGData), SchedulerBypass);
699+
MNoLastEventMode.store(inOrder && SchedulerBypass, std::memory_order_relaxed);
645700

646701
// Sync with the last event for in order queue. For scheduler-bypass flow,
647702
// the ordering is done at the layers below the SYCL runtime,
648703
// but for the scheduler-based flow, it needs to be done here, as the
649704
// scheduler handles host task submissions.
650-
if (isInOrder()) {
705+
if (inOrder) {
651706
LastEvent = SchedulerBypass ? nullptr : EventImpl;
652707
}
653708

654-
// Barrier and un-enqueued commands synchronization for out or order queue
655-
if (!isInOrder() && !EventImpl->isEnqueued()) {
656-
Deps.UnenqueuedCmdEvents.push_back(EventImpl);
709+
// Barrier and un-enqueued commands synchronization for out or order queue.
710+
// The event must also be stored for future wait calls.
711+
if (!inOrder) {
712+
if (!EventImpl->isEnqueued()) {
713+
Deps.UnenqueuedCmdEvents.push_back(EventImpl);
714+
}
715+
addEventUnlocked(EventImpl);
657716
}
658717

659718
return CallerNeedsEvent ? std::move(EventImpl) : nullptr;
@@ -1104,6 +1163,15 @@ void queue_impl::verifyProps(const property_list &Props) const {
11041163
CheckPropertiesWithData);
11051164
}
11061165

1166+
EventImplPtr queue_impl::insertHelperBarrier() {
1167+
auto ResEvent = detail::event_impl::create_device_event(*this);
1168+
ur_event_handle_t UREvent = nullptr;
1169+
getAdapter().call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
1170+
getHandleRef(), 0, nullptr, &UREvent);
1171+
ResEvent->setHandle(UREvent);
1172+
return ResEvent;
1173+
}
1174+
11071175
} // namespace detail
11081176
} // namespace _V1
11091177
} // namespace sycl

0 commit comments

Comments
 (0)