@@ -284,6 +284,16 @@ void queue_impl::addEvent(const detail::EventImplPtr &EventImpl) {
   }
 }
 
+void queue_impl::addEventUnlocked(const detail::EventImplPtr &EventImpl) {
+  if (!EventImpl)
+    return;
+  Command *Cmd = EventImpl->getCommand();
+  if (Cmd != nullptr && EventImpl->getHandle() == nullptr) {
+    std::weak_ptr<event_impl> EventWeakPtr{EventImpl};
+    MEventsWeak.push_back(std::move(EventWeakPtr));
+  }
+}
+
 detail::EventImplPtr
 queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
                         bool CallerNeedsEvent, const detail::code_location &Loc,
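Note on `addEventUnlocked`: it mirrors the existing `addEvent`, but assumes the caller already holds `MMutex` (as `submit_direct` below does via `std::unique_lock`). A minimal sketch of the intended contract; the caller name `trackDirectSubmission` is hypothetical, access control is ignored for illustration, and this assumes `addEvent` itself locks the non-recursive `MMutex` internally:

```cpp
// Sketch only: the locking contract assumed by addEventUnlocked.
void trackDirectSubmission(queue_impl &Queue,
                           const detail::EventImplPtr &Ev) {
  std::unique_lock<std::mutex> Lock(Queue.MMutex); // caller owns the lock
  Queue.addEventUnlocked(Ev); // records a weak_ptr without re-locking
  // Calling Queue.addEvent(Ev) here instead would self-deadlock, since
  // std::mutex is not recursive.
}
```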
@@ -528,16 +538,23 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
   KData.validateAndSetKernelLaunchProperties(Props, hasCommandGraph(),
                                              getDeviceImpl());
 
-  auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &&CGData,
-                              bool SchedulerBypass) -> EventImplPtr {
+  auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &&CGData)
+      -> std::pair<EventImplPtr, bool> {
+    bool SchedulerBypass =
+        (CGData.MEvents.size() > 0
+             ? detail::Scheduler::areEventsSafeForSchedulerBypass(
+                   CGData.MEvents, getContextImpl())
+             : true) &&
+        !hasCommandGraph();
     if (SchedulerBypass) {
       // No need to copy/move the kernel function, so we set
       // the function pointer to the original function
       KData.setKernelFunc(HostKernel.getPtr());
 
-      return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
-                                            CallerNeedsEvent, nullptr, nullptr,
-                                            CodeLoc, IsTopCodeLoc);
+      return {submit_kernel_scheduler_bypass(KData, CGData.MEvents,
+                                             CallerNeedsEvent, nullptr, nullptr,
+                                             CodeLoc, IsTopCodeLoc),
+              /*SchedulerBypass*/ true};
     }
     std::unique_ptr<detail::CG> CommandGroup;
     std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
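The scheduler-bypass decision now lives inside `SubmitKernelFunc` and is reported back through the second element of the returned pair, rather than being computed in `submit_direct` and passed in. Restated as a standalone predicate (a sketch; the free-function form and its name are illustrative, assuming `MEvents` is a `std::vector<detail::EventImplPtr>`):

```cpp
// Bypass is only legal when every dependency event can be waited on
// outside the scheduler and no command graph is currently recording.
bool canBypassScheduler(const std::vector<detail::EventImplPtr> &DepEvents,
                        detail::context_impl &Ctx, bool HasCommandGraph) {
  const bool EventsSafe =
      DepEvents.empty() ||
      detail::Scheduler::areEventsSafeForSchedulerBypass(DepEvents, Ctx);
  return EventsSafe && !HasCommandGraph;
}
```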
@@ -565,57 +582,101 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
     CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
 
     if (auto GraphImpl = getCommandGraph(); GraphImpl) {
-      return submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
-                                     detail::CGType::Kernel);
+      return {submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
+                                      detail::CGType::Kernel),
+              /*SchedulerBypass*/ false};
     }
 
-    return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
-                                                  *this, true);
+    return {detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
+                                                   *this, true),
+            /*SchedulerBypass*/ false};
   };
 
-  return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc);
+  return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc,
+                       detail::CGType::Kernel,
+                       /*InsertBarrierForInOrderCommand*/ false);
+}
+
+EventImplPtr queue_impl::submit_graph_direct_impl(
+    std::shared_ptr<ext::oneapi::experimental::detail::exec_graph_impl>
+        ExecGraph,
+    bool CallerNeedsEvent, sycl::span<const event> DepEvents,
+    [[maybe_unused]] const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
+  bool EventNeeded = CallerNeedsEvent || ExecGraph->containsHostTask() ||
+                     !supportsDiscardingPiEvents();
+  auto SubmitGraphFunc = [&](detail::CG::StorageInitHelper &&CGData)
+      -> std::pair<EventImplPtr, bool> {
+    if (auto ParentGraph = getCommandGraph(); ParentGraph) {
+      std::unique_ptr<detail::CG> CommandGroup;
+      {
+        ext::oneapi::experimental::detail::graph_impl::ReadLock ExecLock(
+            ExecGraph->MMutex);
+        CGData.MRequirements = ExecGraph->getRequirements();
+      }
+      // Here we are using the CommandGroup without passing a CommandBuffer to
+      // pass the exec_graph_impl and event dependencies. Since this subgraph
+      // CG will not be executed this is fine.
+      CommandGroup.reset(
+          new sycl::detail::CGExecCommandBuffer(nullptr, ExecGraph, CGData));
+      CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
+      return {submit_command_to_graph(*ParentGraph, std::move(CommandGroup),
+                                      detail::CGType::ExecCommandBuffer),
+              /*SchedulerBypass*/ false};
+    } else {
+      return ExecGraph->enqueue(*this, CGData, EventNeeded);
+    }
+  };
+  // If the graph contains a host task, we may need to insert a barrier prior
+  // to submission to ensure correct ordering with in-order queues.
+  return submit_direct(CallerNeedsEvent, DepEvents, SubmitGraphFunc,
+                       detail::CGType::ExecCommandBuffer,
+                       ExecGraph->containsHostTask());
 }
 
 template <typename SubmitCommandFuncType>
-detail::EventImplPtr
-queue_impl::submit_direct(bool CallerNeedsEvent,
-                          sycl::span<const event> DepEvents,
-                          SubmitCommandFuncType &SubmitCommandFunc) {
+detail::EventImplPtr queue_impl::submit_direct(
+    bool CallerNeedsEvent, sycl::span<const event> DepEvents,
+    SubmitCommandFuncType &SubmitCommandFunc, detail::CGType Type,
+    bool InsertBarrierForInOrderCommand) {
   detail::CG::StorageInitHelper CGData;
   std::unique_lock<std::mutex> Lock(MMutex);
-
-  // Used by queue_empty() and getLastEvent()
-  MEmpty.store(false, std::memory_order_release);
+  const bool inOrder = isInOrder();
 
   // Sync with an external event
   std::optional<event> ExternalEvent = popExternalEvent();
   if (ExternalEvent) {
     registerEventDependency</*LockQueue*/ false>(
         getSyclObjImpl(*ExternalEvent), CGData.MEvents, this, getContextImpl(),
         getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
-        detail::CGType::Kernel);
+        Type);
   }
 
   auto &Deps = hasCommandGraph() ? MExtGraphDeps : MDefaultGraphDeps;
 
   // Sync with the last event for in order queue
   EventImplPtr &LastEvent = Deps.LastEventPtr;
-  if (isInOrder() && LastEvent) {
+  if (inOrder && LastEvent) {
     registerEventDependency</*LockQueue*/ false>(
         LastEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
-        hasCommandGraph() ? getCommandGraph().get() : nullptr,
-        detail::CGType::Kernel);
+        hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
+  } else if (inOrder && !MEmpty.load(std::memory_order_acquire) &&
+             InsertBarrierForInOrderCommand) {
+    // A barrier is injected to ensure ordering with prior commands
+    auto ResEvent = insertHelperBarrier();
+    registerEventDependency</*LockQueue*/ false>(
+        ResEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
+        hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
   }
 
   for (event e : DepEvents) {
     registerEventDependency</*LockQueue*/ false>(
         getSyclObjImpl(e), CGData.MEvents, this, getContextImpl(),
         getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
-        detail::CGType::Kernel);
+        Type);
   }
 
   // Barrier and un-enqueued commands synchronization for out-of-order queue
-  if (!isInOrder()) {
+  if (!inOrder) {
     MMissedCleanupRequests.unset(
         [&](MissedCleanupRequestsType &MissedCleanupRequests) {
           for (auto &UpdatedGraph : MissedCleanupRequests)
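The new `else if` branch above covers in-order queues that have prior work but no tracked last event (for example, after a scheduler-bypass submission in no-last-event mode): a helper barrier event is created and registered as the dependency instead. A user-level scenario that should exercise this path, sketched against the `sycl_ext_oneapi_graph` extension API (the kernel and host-task bodies are illustrative):

```cpp
#include <sycl/sycl.hpp>
namespace syclex = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Q{sycl::property::queue::in_order{}};

  syclex::command_graph Graph{Q.get_context(), Q.get_device()};
  Graph.add([&](sycl::handler &CGH) {
    CGH.host_task([] { /* host-side work */ });
  });
  auto Exec = Graph.finalize();

  Q.single_task([] {});     // queue becomes non-empty; the bypassed kernel
                            // leaves no last event to chain on
  Q.ext_oneapi_graph(Exec); // containsHostTask() => a helper barrier
                            // orders the graph after the kernel
  Q.wait();
}
```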
@@ -628,32 +689,30 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
     }
   }
 
-  bool SchedulerBypass =
-      (CGData.MEvents.size() > 0
-           ? detail::Scheduler::areEventsSafeForSchedulerBypass(
-                 CGData.MEvents, getContextImpl())
-           : true) &&
-      !hasCommandGraph();
+  // Used by queue_empty() and getLastEvent()
+  MEmpty.store(false, std::memory_order_release);
+
+  auto [EventImpl, SchedulerBypass] = SubmitCommandFunc(std::move(CGData));
 
   // Synchronize with the "no last event mode", used by the handler-based
   // kernel submit path
-  MNoLastEventMode.store(isInOrder() && SchedulerBypass,
-                         std::memory_order_relaxed);
-
-  EventImplPtr EventImpl =
-      SubmitCommandFunc(std::move(CGData), SchedulerBypass);
+  MNoLastEventMode.store(inOrder && SchedulerBypass, std::memory_order_relaxed);
 
   // Sync with the last event for in order queue. For scheduler-bypass flow,
   // the ordering is done at the layers below the SYCL runtime,
   // but for the scheduler-based flow, it needs to be done here, as the
   // scheduler handles host task submissions.
-  if (isInOrder()) {
+  if (inOrder) {
     LastEvent = SchedulerBypass ? nullptr : EventImpl;
   }
 
-  // Barrier and un-enqueued commands synchronization for out-of-order queue
-  if (!isInOrder() && !EventImpl->isEnqueued()) {
-    Deps.UnenqueuedCmdEvents.push_back(EventImpl);
+  // Barrier and un-enqueued commands synchronization for out-of-order queue.
+  // The event must also be stored for future wait calls.
+  if (!inOrder) {
+    if (!EventImpl->isEnqueued()) {
+      Deps.UnenqueuedCmdEvents.push_back(EventImpl);
+    }
+    addEventUnlocked(EventImpl);
   }
 
   return CallerNeedsEvent ? std::move(EventImpl) : nullptr;
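Two ordering details in `submit_direct` are worth noting: `MEmpty` is now read (acquire) for the barrier decision before it is set to `false` (release), so the first command on a fresh queue never triggers a helper barrier; and out-of-order direct submissions are now recorded via `addEventUnlocked` so a later `queue::wait()` can find them. A reduced model of the emptiness protocol (a sketch; the helper name is hypothetical and the `LastEvent` condition is omitted):

```cpp
#include <atomic>

std::atomic<bool> MEmpty{true}; // models queue_impl::MEmpty

// Mirrors the order of operations in submit_direct: decide first, mark after.
bool needsHelperBarrier(bool InOrder, bool InsertBarrierForInOrderCommand) {
  const bool WasEmpty = MEmpty.load(std::memory_order_acquire);
  const bool Needs = InOrder && !WasEmpty && InsertBarrierForInOrderCommand;
  MEmpty.store(false, std::memory_order_release); // queue is non-empty now
  return Needs;
}
```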
@@ -1104,6 +1163,15 @@ void queue_impl::verifyProps(const property_list &Props) const {
                                                 CheckPropertiesWithData);
 }
 
+EventImplPtr queue_impl::insertHelperBarrier() {
+  auto ResEvent = detail::event_impl::create_device_event(*this);
+  ur_event_handle_t UREvent = nullptr;
+  getAdapter().call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
+      getHandleRef(), 0, nullptr, &UREvent);
+  ResEvent->setHandle(UREvent);
+  return ResEvent;
+}
+
 } // namespace detail
 } // namespace _V1
 } // namespace sycl