// Copyright Epic Games, Inc. All Rights Reserved. #include "D3D12Submission.h" #include "D3D12RHIPrivate.h" #include "HAL/Runnable.h" #include "HAL/RunnableThread.h" #include "IRenderCaptureProvider.h" #include "Stats/ThreadIdleStats.h" #ifndef D3D12_PLATFORM_SUPPORTS_BLOCKING_FENCES #define D3D12_PLATFORM_SUPPORTS_BLOCKING_FENCES 1 #endif // These defines control which threads are enabled in the GPU submission pipeline. #define D3D12_USE_SUBMISSION_THREAD (1) #define D3D12_USE_INTERRUPT_THREAD (1 && D3D12_PLATFORM_SUPPORTS_BLOCKING_FENCES) static TAutoConsoleVariable CVarRHIUseSubmissionThread( TEXT("rhi.UseSubmissionThread"), 2, TEXT("Whether to enable the RHI submission thread.\n") TEXT(" 0: No\n") TEXT(" 1: Yes, but not when running with multi-gpu.\n") TEXT(" 2: Yes, always\n"), ECVF_ReadOnly); DECLARE_CYCLE_STAT(TEXT("Submit"), STAT_D3D12Submit, STATGROUP_D3D12RHI); DECLARE_CYCLE_STAT(TEXT("GPU Total Time [All Queues]"), STAT_RHI_GPUTotalTime, STATGROUP_D3D12RHI); DECLARE_CYCLE_STAT(TEXT("GPU Total Time [Hardware Timer]"), STAT_RHI_GPUTotalTimeHW, STATGROUP_D3D12RHI); DECLARE_CYCLE_STAT(TEXT("GPU Total Time [Graphics]"), STAT_RHI_GPUTotalTimeGraphics, STATGROUP_D3D12RHI); DECLARE_CYCLE_STAT(TEXT("GPU Total Time [Async Compute]"), STAT_RHI_GPUTotalTimeAsyncCompute, STATGROUP_D3D12RHI); DECLARE_CYCLE_STAT(TEXT("GPU Total Time [Copy]"), STAT_RHI_GPUTotalTimeCopy, STATGROUP_D3D12RHI); DECLARE_STATS_GROUP(TEXT("D3D12RHIPipeline"), STATGROUP_D3D12RHIPipeline, STATCAT_Advanced); DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU IA Vertices" ), STAT_D3D12RHI_IAVertices , STATGROUP_D3D12RHIPipeline); DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU IA Primitives" ), STAT_D3D12RHI_IAPrimitives , STATGROUP_D3D12RHIPipeline); DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU VS Invocations"), STAT_D3D12RHI_VSInvocations, STATGROUP_D3D12RHIPipeline); DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU GS Invocations"), STAT_D3D12RHI_GSInvocations, STATGROUP_D3D12RHIPipeline); 
DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU GS Primitives" ), STAT_D3D12RHI_GSPrimitives , STATGROUP_D3D12RHIPipeline); DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU C Invocations" ), STAT_D3D12RHI_CInvocations , STATGROUP_D3D12RHIPipeline); DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU C Primitives" ), STAT_D3D12RHI_CPrimitives , STATGROUP_D3D12RHIPipeline); DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU PS Invocations"), STAT_D3D12RHI_PSInvocations, STATGROUP_D3D12RHIPipeline); DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU HS Invocations"), STAT_D3D12RHI_HSInvocations, STATGROUP_D3D12RHIPipeline); DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU DS Invocations"), STAT_D3D12RHI_DSInvocations, STATGROUP_D3D12RHIPipeline); DECLARE_DWORD_ACCUMULATOR_STAT(TEXT("GPU CS Invocations"), STAT_D3D12RHI_CSInvocations, STATGROUP_D3D12RHIPipeline); static float GD3D12SubmissionTimeout = 5.0; static FAutoConsoleVariableRef CVarD3D12SubmissionTimeout( TEXT("r.D3D12.SubmissionTimeout"), GD3D12SubmissionTimeout, TEXT("The maximum time, in seconds, that a submitted GPU command list is allowed to take before the RHI reports a GPU hang"), ECVF_RenderThreadSafe); static int32 GD3D12SubmissionMaxExecuteBatchSizeDirect = std::numeric_limits::max(); static FAutoConsoleVariableRef CVarD3D12SubmissionMaxExecuteBatchSizeDirect( TEXT("r.D3D12.Submission.MaxExecuteBatchSize.Direct"), GD3D12SubmissionMaxExecuteBatchSizeDirect, TEXT("The maximum number of command lists to pass to a single ExecuteCommandLists invocation for direct queues\n") TEXT("The valid range is 1 to INT_MAX inclusive. 
Values less than 1 will be clamped to 1"), ECVF_RenderThreadSafe); static int32 GD3D12SubmissionMaxExecuteBatchSizeCopy = std::numeric_limits::max(); static FAutoConsoleVariableRef CVarD3D12SubmissionMaxExecuteBatchSizeCopy( TEXT("r.D3D12.Submission.MaxExecuteBatchSize.Copy"), GD3D12SubmissionMaxExecuteBatchSizeCopy, TEXT("The maximum number of command lists to pass to a single ExecuteCommandLists invocation for copy queues\n") TEXT("The valid range is 1 to INT_MAX inclusive. Values less than 1 will be clamped to 1"), ECVF_RenderThreadSafe); static int32 GD3D12SubmissionMaxExecuteBatchSizeAsync = std::numeric_limits::max(); static FAutoConsoleVariableRef CVarD3D12SubmissionMaxExecuteBatchSizeAsync( TEXT("r.D3D12.Submission.MaxExecuteBatchSize.Async"), GD3D12SubmissionMaxExecuteBatchSizeAsync, TEXT("The maximum number of command lists to pass to a single ExecuteCommandLists invocation for async queues\n") TEXT("The valid range is 1 to INT_MAX inclusive. Values less than 1 will be clamped to 1"), ECVF_RenderThreadSafe); static std::atomic GGPUCrashDetected = false; class FD3D12Thread final : private FRunnable { public: typedef FD3D12DynamicRHI::FProcessResult(FD3D12DynamicRHI::*FQueueFunc)(); FD3D12Thread(TCHAR const* Name, EThreadPriority Priority, FD3D12DynamicRHI* RHI, FQueueFunc QueueFunc) : RHI(RHI) , QueueFunc(QueueFunc) , Event(CreateEvent(nullptr, false, false, nullptr)) , Thread(FRunnableThread::Create(this, Name, 0, Priority)) {} virtual ~FD3D12Thread() { bExit = true; SetEvent(Event); Thread->WaitForCompletion(); delete Thread; CloseHandle(Event); } void Kick() const { SetEvent(Event); } void Join() const { Thread->WaitForCompletion(); } uint32 GetThreadID() const { return Thread->GetThreadID(); } private: virtual uint32 Run() override { while (!bExit) { // Process the queue until no more progress is made FD3D12DynamicRHI::FProcessResult Result; do { Result = (RHI->*QueueFunc)(); } while (EnumHasAllFlags(Result.Status, 
FD3D12DynamicRHI::EQueueStatus::Processed)); WaitForSingleObject(Event, Result.WaitTimeout); } // Drain any remaining work in the queue while (EnumHasAllFlags((RHI->*QueueFunc)().Status, FD3D12DynamicRHI::EQueueStatus::Pending)) {} return 0; } FD3D12DynamicRHI* RHI; FQueueFunc QueueFunc; bool bExit = false; public: // Can't use FEvent here since we need to be able to get the underlying HANDLE // for the ID3D12Fences to signal via ID3D12Fence::SetEventOnCompletion(). HANDLE const Event; private: FRunnableThread* Thread = nullptr; }; void FD3D12DynamicRHI::InitializeSubmissionPipe() { if (FPlatformProcess::SupportsMultithreading()) { #if D3D12_USE_INTERRUPT_THREAD InterruptThread = new FD3D12Thread(TEXT("RHIInterruptThread"), TPri_Highest, this, &FD3D12DynamicRHI::ProcessInterruptQueue); #endif #if D3D12_USE_SUBMISSION_THREAD bool bUseSubmissionThread = false; switch (CVarRHIUseSubmissionThread.GetValueOnAnyThread()) { case 1: bUseSubmissionThread = FRHIGPUMask::All().HasSingleIndex(); break; case 2: bUseSubmissionThread = true; break; } // Currently RenderDoc can't make programmatic captures when we use a submission thread. 
bUseSubmissionThread &= !IRenderCaptureProvider::IsAvailable() || IRenderCaptureProvider::Get().CanSupportSubmissionThread(); if (bUseSubmissionThread) { SubmissionThread = new FD3D12Thread(TEXT("RHISubmissionThread"), TPri_Highest, this, &FD3D12DynamicRHI::ProcessSubmissionQueue); } #endif } // Initialize the timing structs in each queue, and the engine GPU profilers { TArray Payloads; #if RHI_NEW_GPU_PROFILER TArray ProfilerQueues; #endif ForEachQueue([&](FD3D12Queue& Queue) { FD3D12Payload* Payload = Payloads.Emplace_GetRef(new FD3D12Payload(Queue)); Payload->Timing = CurrentTimingPerQueue.CreateNew(Queue); #if RHI_NEW_GPU_PROFILER ProfilerQueues.Add(Queue.GetProfilerQueue()); #endif }); #if RHI_NEW_GPU_PROFILER UE::RHI::GPUProfiler::InitializeQueues(ProfilerQueues); #endif SubmitPayloads(MoveTemp(Payloads)); } } void FD3D12DynamicRHI::ShutdownSubmissionPipe() { delete SubmissionThread; SubmissionThread = nullptr; delete InterruptThread; InterruptThread = nullptr; if (EopTask) { ProcessInterruptQueueUntil(EopTask); EopTask = nullptr; } } // A finalized set of command payloads. This type is used to implement the RHI command list submission API. 
struct FD3D12FinalizedCommands : public IRHIPlatformCommandList, public TArray {}; void FD3D12DynamicRHI::RHIFinalizeContext(FRHIFinalizeContextArgs&& Args, TRHIPipelineArray& Output) { auto FinalizeContext = [&](FD3D12CommandContext* CmdContext, FD3D12FinalizedCommands& Result) { CmdContext->Finalize(Result); if (!CmdContext->IsDefaultContext()) { CmdContext->ClearState(); CmdContext->GetParentDevice()->ReleaseContext(CmdContext); } }; for(IRHIComputeContext* Context : Args.Contexts) { FD3D12FinalizedCommands Result; ERHIPipeline Pipeline = Context->GetPipeline(); FD3D12CommandContextBase* CmdContextBase = static_cast(Context); if (FD3D12CommandContextRedirector* Redirector = CmdContextBase->AsRedirector()) { for (uint32 GPUIndex : Redirector->GetPhysicalGPUMask()) FinalizeContext(Redirector->GetSingleDeviceContext(GPUIndex), Result); if (!Redirector->bIsDefaultContext) { delete Redirector; } } else { FD3D12CommandContext* CmdContext = static_cast(CmdContextBase); FinalizeContext(CmdContext, Result); } Output[Pipeline] = Result.Num() ? 
new FD3D12FinalizedCommands(MoveTemp(Result)) : nullptr; } } void FD3D12DynamicRHI::RHISubmitCommandLists(FRHISubmitCommandListsArgs&& Args) { SubmitCommands(MakeArrayView(reinterpret_cast(Args.CommandLists.GetData()), Args.CommandLists.Num())); } void FD3D12DynamicRHI::SubmitCommands(TConstArrayView Commands) { SCOPED_NAMED_EVENT_TEXT("CommandList_Submit", FColor::Magenta); #if RHI_NEW_GPU_PROFILER TArray AllPayloads; for (FD3D12FinalizedCommands* Payloads : Commands) { #if WITH_RHI_BREADCRUMBS TSharedPtr BreadcrumbAllocators {}; if (Payloads->BreadcrumbAllocators.Num()) { BreadcrumbAllocators = MakeShared(MoveTemp(Payloads->BreadcrumbAllocators)); } for (FD3D12Payload* Payload : *Payloads) { Payload->BreadcrumbRange = Payloads->BreadcrumbRange; if (BreadcrumbAllocators.IsValid()) { check(!Payload->BreadcrumbAllocators.IsValid()); Payload->BreadcrumbAllocators = BreadcrumbAllocators; } } #endif AllPayloads.Append(MoveTemp(*Payloads)); delete Payloads; } SubmitPayloads(MoveTemp(AllPayloads)); #else TArray AllPayloads; #if WITH_RHI_BREADCRUMBS TArray> BreadcrumbAllocators; #endif for (FD3D12FinalizedCommands* Payloads : Commands) { #if WITH_RHI_BREADCRUMBS for (FD3D12Payload* Payload : *Payloads) { Payload->BreadcrumbRange = Payloads->BreadcrumbRange; } #endif AllPayloads.Append(MoveTemp(static_cast&>(*Payloads))); #if WITH_RHI_BREADCRUMBS BreadcrumbAllocators.Append(MoveTemp(Payloads->BreadcrumbAllocators)); #endif delete Payloads; } SubmitPayloads(MoveTemp(AllPayloads)); #if WITH_RHI_BREADCRUMBS // Enqueue the breadcrumb allocator references for cleanup once all prior payloads have completed on the GPU. 
DeferredDelete([Array = MoveTemp(BreadcrumbAllocators)]() {}); #endif #endif } void FD3D12DynamicRHI::SubmitPayloads(TArray&& Payloads) { if (Payloads.Num()) { PendingPayloadsForSubmission.Enqueue(new TArray(MoveTemp(Payloads))); } if (SubmissionThread) { SubmissionThread->Kick(); } else { // Since we're processing directly on the calling thread, we need to take a scope lock. // Multiple engine threads might be calling Submit(). { FScopeLock Lock(&SubmissionCS); // Process the submission queue until no further progress is being made. while (EnumHasAnyFlags(ProcessSubmissionQueue().Status, EQueueStatus::Processed)) {} } } // Use this opportunity to pump the interrupt queue ProcessInterruptQueueUntil(nullptr); } static int32 GetMaxExecuteBatchSize(ED3D12QueueType QueueType) { switch (QueueType) { case ED3D12QueueType::Direct: return std::max(1, GD3D12SubmissionMaxExecuteBatchSizeDirect); case ED3D12QueueType::Copy: return std::max(1, GD3D12SubmissionMaxExecuteBatchSizeCopy); case ED3D12QueueType::Async: return std::max(1, GD3D12SubmissionMaxExecuteBatchSizeAsync); default: // Need to add new queue type and CVar checkNoEntry(); return std::numeric_limits::max(); } } FD3D12DynamicRHI::FProcessResult FD3D12DynamicRHI::ProcessSubmissionQueue() { SCOPED_NAMED_EVENT_TEXT("SubmissionQueue_Process", FColor::Turquoise); SCOPE_CYCLE_COUNTER(STAT_D3D12Submit); LLM_SCOPE_BYNAME(TEXT("RHIMisc/ProcessSubmissionQueue")); FD3D12Queue::FPayloadArray PayloadsToHandDown; FProcessResult Result; auto FlushPayloads = [&PayloadsToHandDown, &Result, DynamicRHI = this](int32 MinPayloadsToFlush = 1) { if (PayloadsToHandDown.Num() >= MinPayloadsToFlush) { Result.Status |= EQueueStatus::Processed; DynamicRHI->FlushBatchedPayloads(PayloadsToHandDown); } }; bool bProgress; bool bKickInterruptThread = false; do { bProgress = false; Result.Status = EQueueStatus::None; // Push all pending payloads into the ordered per-device, per-pipe pending queues { TArray* Array; while 
(PendingPayloadsForSubmission.Dequeue(Array)) { for (FD3D12Payload* Payload : *Array) { Payload->Queue.PendingSubmission.Enqueue(Payload); } delete Array; } } // // Fence values for FD3D12SyncPoint are determined on the submission thread, // where each queue has a monotonically incrementing fence value. // // We might receive work that waits on a sync point which has not yet been submitted // to the queue that will signal it, so we need to delay processing of those // payloads until the fence value is known. // // Process all queues (across all devices and adapters) to flush work. // Any sync point waits where the fence value is unknown will be left in the // appropriate queue, to be processed the next time commands are submitted. ForEachQueue([&](FD3D12Queue& CurrentQueue) { while (true) { { FD3D12Payload* Payload = CurrentQueue.PendingSubmission.Peek(); if (!Payload) return; // Accumulate the list of fences to await, and their maximum values while (Payload->SyncPointsToWait.Index < Payload->SyncPointsToWait.Num()) { FD3D12SyncPointRef& SyncPoint = Payload->SyncPointsToWait[Payload->SyncPointsToWait.Index]; if (!SyncPoint->ResolvedFence.IsSet()) { // Need to wait on a sync point, but the fence value has not been resolved yet // (no other payloads have signaled the sync point yet). // Skip processing this queue, and move on to the next. We will retry later when // further work is submitted, which may contain the sync point we need. Result.Status |= EQueueStatus::Pending; return; } Payload->AddQueueFenceWait( SyncPoint->ResolvedFence->Fence, SyncPoint->ResolvedFence->Value ); Payload->SyncPointsToWait.Index++; bProgress = true; } // All necessary sync points have been resolved. 
Payload->SyncPointsToWait = {}; CurrentQueue.PendingSubmission.Pop(); bProgress = true; check(!CurrentQueue.PayloadToSubmit); CurrentQueue.PayloadToSubmit = Payload; Result.Status |= EQueueStatus::Processed; bKickInterruptThread = true; // // Now we generate any required barrier command lists. These may require // executing on a different queue (e.g. graphics-only transitions required // before async compute work), so we gather potential work across all // queues for this device. // auto AccumulateQueries = [&](FD3D12CommandList* CommandList) { FD3D12Queue& TargetQueue = CommandList->Device->GetQueue(CommandList->QueueType); const uint32 MaxBatchSize = GetMaxExecuteBatchSize(TargetQueue.QueueType); // Occlusion + Pipeline Stats Queries TargetQueue.BatchedObjects.OcclusionQueries.Append(MoveTemp(CommandList->State.OcclusionQueries)); TargetQueue.BatchedObjects.PipelineStatsQueries.Append(MoveTemp(CommandList->State.PipelineStatsQueries)); #if RHI_NEW_GPU_PROFILER TargetQueue.BatchedObjects.TimestampQueries.Append(MoveTemp(CommandList->State.TimestampQueries)); #else // Timestamp Queries if (CommandList->State.BeginTimestamp) { // Keep only the first Begin() in the batch if (TargetQueue.NumCommandListsInBatch++ == 0) { TargetQueue.BatchedObjects.TimestampQueries.Emplace(MoveTemp(CommandList->State.BeginTimestamp)); } else { // Remove the previous End() timestamp, to join the range together. 
check(TargetQueue.BatchedObjects.TimestampQueries.Last().Type == ED3D12QueryType::CommandListEnd); TargetQueue.BatchedObjects.TimestampQueries.RemoveAt(TargetQueue.BatchedObjects.TimestampQueries.Num() - 1); } TargetQueue.BatchedObjects.TimestampQueries.Append(MoveTemp(CommandList->State.TimestampQueries)); TargetQueue.BatchedObjects.TimestampQueries.Emplace(MoveTemp(CommandList->State.EndTimestamp)); if (TargetQueue.NumCommandListsInBatch >= MaxBatchSize) { // Start a new batch TargetQueue.NumCommandListsInBatch = 0; } } else { // No begin timestamp means timestamps aren't supported on this queue check(CommandList->State.TimestampQueries.IsEmpty()); check(!CommandList->State.EndTimestamp); } #endif }; for (int32 Index = 0; Index < Payload->CommandListsToExecute.Num(); Index++) { FD3D12CommandList* CurrentCommandList = Payload->CommandListsToExecute[Index]; AccumulateQueries(CurrentCommandList); } } FlushPayloads(FD3D12Queue::MaxBatchedPayloads); // Now submit the original payload CurrentQueue.FinalizePayload(false, PayloadsToHandDown); FlushPayloads(FD3D12Queue::MaxBatchedPayloads); } }); } while (bProgress); FlushPayloads(); if (InterruptThread && bKickInterruptThread) { InterruptThread->Kick(); } return Result; } uint64 FD3D12Queue::FinalizePayload(bool bRequiresSignal, FPayloadArray& PayloadsToHandDown) { TRACE_CPUPROFILER_EVENT_SCOPE(ExecuteCommandList); LLM_SCOPE_BYNAME(TEXT("RHIMisc/ExecuteCommandLists")); check(PayloadToSubmit && this == &PayloadToSubmit->Queue); check(PayloadToSubmit->SyncPointsToWait.Num() == 0); NumCommandListsInBatch = 0; BarrierTimestamps.CloseAndReset(PayloadToSubmit->BatchedObjects.QueryRanges); // Gather query ranges from this payload, grouping by heap pointer if (BatchedObjects.QueryRanges.Num()) { for (auto& [Heap, Ranges] : PayloadToSubmit->BatchedObjects.QueryRanges) { BatchedObjects.QueryRanges.FindOrAdd(Heap).Append(MoveTemp(Ranges)); } PayloadToSubmit->BatchedObjects.QueryRanges.Reset(); } else { BatchedObjects.QueryRanges = 
MoveTemp(PayloadToSubmit->BatchedObjects.QueryRanges); } check(PayloadToSubmit->BatchedObjects.IsEmpty()); if (!BatchedObjects.IsEmpty()) { // Always resolve queries if we're switching the Timing struct, // since we need to gather the timestamps for that frame. bool bResolveQueries = PayloadToSubmit->Timing.IsSet(); if (!bResolveQueries) { // If this payload will signal a CPU-visible sync point, we need to resolve queries. // This makes sure that the query data has reached the CPU before the sync point the CPU is waiting on is signaled. for (FD3D12SyncPoint* SyncPoint : PayloadToSubmit->SyncPointsToSignal) { if (SyncPoint->GetType() == ED3D12SyncPointType::GPUAndCPU) { bResolveQueries = true; break; } } } if (bResolveQueries) { { FD3D12CommandList* ResolveCommandList = nullptr; // We've got queries to resolve. Allocate a command list. auto GetResolveCommandList = [&]() -> FD3D12CommandList* { if (ResolveCommandList) return ResolveCommandList; if (!BarrierAllocator) BarrierAllocator = Device->ObtainCommandAllocator(QueueType); return ResolveCommandList = Device->ObtainCommandList(BarrierAllocator, nullptr, nullptr); }; // Ranges are grouped by heap pointer. for (auto& [Heap, Ranges] : BatchedObjects.QueryRanges) { { #if ENABLE_RESIDENCY_MANAGEMENT TArray> ResidencyHandles; ResidencyHandles.Add(&Heap->GetHeapResidencyHandle()); ResidencyHandles.Append(Heap->GetResultBuffer()->GetResidencyHandles()); GetResolveCommandList()->AddToResidencySet(ResidencyHandles); #endif // ENABLE_RESIDENCY_MANAGEMENT } if (Heap->GetD3DQueryHeap()) { // Sort the ranges into ascending order so we can merge adjacent ones, // to reduce the number of ResolveQueryData calls we need to make. Ranges.Sort(); for (int32 Index = 0; Index < Ranges.Num(); ) { FD3D12QueryRange Range = Ranges[Index++]; while (Index < Ranges.Num() && Ranges[Index].Start == Range.End) { // Ranges are contiguous. Extend. 
Range.End = Ranges[Index++].End; } GetResolveCommandList()->GraphicsCommandList()->ResolveQueryData( Heap->GetD3DQueryHeap(), Heap->QueryType, Range.Start, Range.End - Range.Start, Heap->GetResultBuffer()->GetResource(), Range.Start * Heap->GetResultSize() ); } } } if (ResolveCommandList) { ResolveCommandList->Close(); PayloadToSubmit->CommandListsToExecute.Add(ResolveCommandList); } } // Move all the batched objects in this queue into the payload, so they get passed down the pipe. PayloadToSubmit->BatchedObjects = MoveTemp(BatchedObjects); } } if (BarrierAllocator) { PayloadToSubmit->AllocatorsToRelease.Add(BarrierAllocator); BarrierAllocator = nullptr; } // Keep the latest fence value in the submitted payload. // The interrupt thread uses this to determine when work has completed. uint64 NextCompletionValue = Fence.NextCompletionValue; // Set the fence/value pair into any sync points we need to signal. for (FD3D12SyncPointRef& SyncPoint : PayloadToSubmit->SyncPointsToSignal) { check(!SyncPoint->ResolvedFence.IsSet()); SyncPoint->ResolvedFence.Emplace(Fence, NextCompletionValue); } PayloadToSubmit->CompletionFenceValue = NextCompletionValue; PayloadToSubmit->bAlwaysSignal |= bRequiresSignal; if (PayloadToSubmit->RequiresQueueFenceSignal()) { ++Fence.NextCompletionValue; } PayloadsToHandDown.Add(PayloadToSubmit); PayloadToSubmit = nullptr; return NextCompletionValue; } void FD3D12DynamicRHI::UpdateReservedResources(FD3D12Payload* Payload) { FD3D12Queue& Queue = Payload->Queue; // On some devices, some queues cannot perform tile remapping operations. // We can work around this limitation by running the remapping in lockstep on another queue: // - tile mapping queue waits for commands on this queue to finish // - tile mapping queue performs the commit/decommit operations // - this queue waits for tile mapping queue to finish // The extra sync is not required when the current queue is capable of the remapping operations. 
ID3D12CommandQueue* TileMappingQueue = (Queue.bSupportsTileMapping ? Queue.D3DCommandQueue : Queue.Device->TileMappingQueue).GetReference(); FD3D12Fence& TileMappingFence = Queue.Device->TileMappingFence; const bool bCrossQueueSyncRequired = TileMappingQueue != Queue.D3DCommandQueue.GetReference(); if (bCrossQueueSyncRequired) { // tile mapping queue waits for commands on this queue to finish Queue.D3DCommandQueue->Signal(TileMappingFence.D3DFence, ++TileMappingFence.LastSignaledValue); TileMappingQueue->Wait(TileMappingFence.D3DFence, TileMappingFence.LastSignaledValue); } for (const FD3D12CommitReservedResourceDesc& CommitDesc : Payload->ReservedResourcesToCommit) { checkf(CommitDesc.Resource, TEXT("FD3D12CommitReservedResourceDesc::Resource must be set")); CommitDesc.Resource->CommitReservedResource(TileMappingQueue, CommitDesc.CommitSizeInBytes); } if (bCrossQueueSyncRequired) { // this queue waits for tile mapping operations to finish TileMappingQueue->Signal(TileMappingFence.D3DFence, ++TileMappingFence.LastSignaledValue); Queue.D3DCommandQueue->Wait(TileMappingFence.D3DFence, TileMappingFence.LastSignaledValue); } } void FD3D12DynamicRHI::FlushBatchedPayloads(FD3D12Queue::FPayloadArray& PayloadsToSubmit) { uint32 FirstPayload = 0, LastPayload = 0; auto Wait = [this](FD3D12Payload* Payload) { FD3D12Queue& Queue = Payload->Queue; // Wait for queue fences for (auto& [LocalFence, Value] : Payload->QueueFencesToWait) { #if RHI_NEW_GPU_PROFILER Payload->EventStream.Emplace( FPlatformTime::Cycles64() , Value , LocalFence.OwnerQueue->GetProfilerQueue() ); #endif VERIFYD3D12RESULT(Queue.D3DCommandQueue->Wait(LocalFence.D3DFence, Value)); } // Wait for manual fences for (auto& [LocalFence, Value] : Payload->ManualFencesToWait) { VERIFYD3D12RESULT(Queue.D3DCommandQueue->Wait(LocalFence, Value)); } }; auto Flush = [&]() { if (FirstPayload == LastPayload) return; FD3D12Queue& Queue = PayloadsToSubmit[FirstPayload]->Queue; // Build SOA layout needed to call 
ExecuteCommandLists(). TArray> CommandLists; TArray> D3DCommandLists; #if ENABLE_RESIDENCY_MANAGEMENT TArray> ResidencySets; #endif uint64 Time = FPlatformTime::Cycles64(); // Accumulate the command lists from the payload for (uint32 Index = FirstPayload; Index < LastPayload; ++Index) { FD3D12Payload* Payload = PayloadsToSubmit[Index]; check(&Payload->Queue == &Queue); for (FD3D12CommandList* CommandList : Payload->CommandListsToExecute) { check(CommandList->IsClosed()); #if RHI_NEW_GPU_PROFILER CommandList->FlushProfilerEvents(Payload->EventStream, Time); #endif // RHI_NEW_GPU_PROFILER D3DCommandLists.Add(CommandList->Interfaces.CommandList); #if ENABLE_RESIDENCY_MANAGEMENT ResidencySets.Add(CommandList->CloseResidencySet()); #endif } CommandLists.Append(MoveTemp(Payload->CommandListsToExecute)); } const int32 MaxBatchSize = GetMaxExecuteBatchSize(Queue.QueueType); const int32 NumCommandLists = D3DCommandLists.Num(); for (int32 DispatchNum, Offset = 0; Offset < NumCommandLists; Offset += DispatchNum) { DispatchNum = FMath::Min(NumCommandLists - Offset, MaxBatchSize); extern int32 GD3D12MaxCommandsPerCommandList; if (GD3D12MaxCommandsPerCommandList > 0) { // Limit the dispatch group based on the total number of commands each command list contains, so that we // don't submit more than approx "GD3D12MaxCommandsPerCommandList" commands per call to ExecuteCommandLists(). 
int32 Index = 0; for (int32 NumCommands = 0; Index < DispatchNum && NumCommands < GD3D12MaxCommandsPerCommandList; ++Index) { NumCommands += CommandLists[Offset + Index]->State.NumCommands; } DispatchNum = Index; } INC_DWORD_STAT(STAT_D3D12ExecutedCommandListBatches); INC_DWORD_STAT_BY(STAT_D3D12ExecutedCommandLists, DispatchNum); Queue.ExecuteCommandLists( MakeArrayView(&D3DCommandLists[Offset], DispatchNum) #if ENABLE_RESIDENCY_MANAGEMENT , MakeArrayView(&ResidencySets[Offset], DispatchNum) #endif ); #if LOG_EXECUTE_COMMAND_LISTS LogExecuteCommandLists(DispatchNum, &D3DCommandLists[Offset]); #endif } // Release the FD3D12CommandList instances back to the parent device object pool. for (FD3D12CommandList* CommandList : CommandLists) { CommandList->Device->ReleaseCommandList(CommandList); } FirstPayload = LastPayload; }; auto Signal = [this](FD3D12Payload* Payload) { FD3D12Queue& Queue = Payload->Queue; // Signal any manual fences for (auto& [ManualFence, Value] : Payload->ManualFencesToSignal) { VERIFYD3D12RESULT(Queue.D3DCommandQueue->Signal(ManualFence, Value)); } // Signal the queue fence if (Payload->RequiresQueueFenceSignal()) { check(Queue.Fence.LastSignaledValue < Payload->CompletionFenceValue); #if RHI_NEW_GPU_PROFILER Payload->EventStream.Emplace( FPlatformTime::Cycles64() , Payload->CompletionFenceValue ); #endif VERIFYD3D12RESULT(Queue.D3DCommandQueue->Signal(Queue.Fence.D3DFence, Payload->CompletionFenceValue)); Queue.Fence.LastSignaledValue.store(Payload->CompletionFenceValue, std::memory_order_release); } #if RHI_NEW_GPU_PROFILER if (Payload->EndFrameEvent.IsSet()) { Payload->EndFrameEvent->CPUTimestamp = FPlatformTime::Cycles64(); Payload->EventStream.Emplace(*Payload->EndFrameEvent); } #endif // Submission of this payload is completed. Signal the submission event if one was provided. 
if (Payload->SubmissionEvent) { Payload->SubmissionEvent->DispatchSubsequents(); } }; FD3D12Queue* PrevQueue = nullptr; for (FD3D12Payload* Payload : PayloadsToSubmit) { if (PrevQueue != &Payload->Queue) { Flush(); PrevQueue = &Payload->Queue; } Payload->Queue.PendingInterrupt.Enqueue(Payload); #if RHI_NEW_GPU_PROFILER if (Payload->Timing.IsSet()) { Flush(); if (FD3D12Timing* LocalTiming = *Payload->Timing) { // Calibrate the GPU timestamp / clock, if the queue type supports calibration. if (Payload->Queue.QueueType != ED3D12QueueType::Copy || Payload->Queue.Device->GetParentAdapter()->AreCopyQueueTimestampQueriesSupported()) { SCOPED_NAMED_EVENT(CalibrateClocks, FColor::Red); VERIFYD3D12RESULT(Payload->Queue.D3DCommandQueue->GetClockCalibration(&LocalTiming->GPUTimestamp, &LocalTiming->CPUTimestamp)); VERIFYD3D12RESULT(Payload->Queue.D3DCommandQueue->GetTimestampFrequency(&LocalTiming->GPUFrequency)); QueryPerformanceFrequency(reinterpret_cast(&LocalTiming->CPUFrequency)); } } } #endif // RHI_NEW_GPU_PROFILER if (Payload->HasWaitWork()) { Flush(); Wait(Payload); } if (Payload->HasUpdateReservedResourcesWork()) { Flush(); UpdateReservedResources(Payload); } if (Payload->HasPreExecuteWork()) { Flush(); Payload->PreExecute(); } LastPayload++; if (Payload->HasSignalWork()) { Flush(); Signal(Payload); } } Flush(); for (FD3D12Payload* Payload : PayloadsToSubmit) { // Only set this bool to true once we'll never touch the payload again on this thread. // This is because the bool hands ownership to the interrupt thread, which might delete the payload. Payload->bSubmitted = true; } PayloadsToSubmit.Reset(); } void FD3D12PayloadBase::AddQueueFenceWait(FD3D12Fence& InFence, uint64 InValue) { for (auto& [Fence, Value] : QueueFencesToWait) { if (&Fence == &InFence) { Value = FMath::Max(Value, InValue); return; } } QueueFencesToWait.Add({ InFence, InValue }); } void FD3D12SyncPoint::Wait() const { checkf(GraphEvent, TEXT("This sync point was not created with a CPU event. 
Cannot wait for completion on the CPU."));

	if (!GraphEvent->IsComplete())
	{
		// Block the calling thread until the graph event is signaled by the interrupt thread.
		SCOPED_NAMED_EVENT_TEXT("SyncPoint_Wait", FColor::Turquoise);
		FD3D12DynamicRHI::GetD3DRHI()->ProcessInterruptQueueUntil(GraphEvent);
	}

	check(GraphEvent->IsComplete());
}

// NOTE(review): several template-argument lists in this section (e.g. on TGuardValue,
// TNumericLimits::Max(), static_cast of Query.Target, TArray of event streams) look like
// they were stripped by a copy/merge — diff against source control to confirm the file text.

// Processes GPU-completion work on behalf of the calling thread.
//
// If a dedicated interrupt thread exists, this simply blocks on GraphEvent and lets that
// thread do the processing. Otherwise the calling thread performs the interrupt work itself:
//  - GraphEvent != nullptr: process until the event is signaled (retrying the lock if
//    another thread currently owns the interrupt queue).
//  - GraphEvent == nullptr: process until no further progress is made — but only if the
//    lock can be acquired without blocking — then return.
void FD3D12DynamicRHI::ProcessInterruptQueueUntil(FGraphEvent* GraphEvent)
{
	if (InterruptThread)
	{
		if (GraphEvent && !GraphEvent->IsComplete())
		{
			GraphEvent->Wait();
		}
	}
	else
	{
		// Use the current thread to process the interrupt queue until the sync point we're waiting for is signaled.
		// If GraphEvent is nullptr, process the queue until no further progress is made (assuming we can acquire the lock), then return.
		if (!GraphEvent || !GraphEvent->IsComplete())
		{
			// If we're waiting for a sync point, accumulate the idle time
			UE::Stats::FThreadIdleStats::FScopeIdle IdleScope(/* bIgnore = */GraphEvent == nullptr);

		Retry:
			if (InterruptCS.TryLock())
			{
				// Mark this thread as the acting interrupt thread while the lock is held
				// (restored on scope exit; read by IsInInterruptThread()).
				TGuardValue Guard(InterruptThreadID, FPlatformTLS::GetCurrentThreadId());

				FProcessResult Result;
				do
				{
					Result = ProcessInterruptQueue();
				}
				// If we have a sync point, keep processing until the sync point is signaled.
				// Otherwise, process until no more progress is being made.
				while (GraphEvent ? !GraphEvent->IsComplete() : EnumHasAllFlags(Result.Status, EQueueStatus::Processed));

				InterruptCS.Unlock();
			}
			else if (GraphEvent && !GraphEvent->IsComplete())
			{
				// Failed to get the lock. Another thread is processing the interrupt queue. Try again...
				FPlatformProcess::SleepNoStats(0);
				goto Retry;
			}
		}
	}
}

// Hands the current thread over to GPU-crash processing. This function never returns:
// GGPUCrashDetected is published, and crash handling ultimately ends in
// TerminateOnGPUCrash() from within ProcessInterruptQueue().
void FD3D12DynamicRHI::ProcessInterruptQueueOnGPUCrash()
{
	// This function will not return.

	// We know this function was called due to a GPU crash, so let the thread know.
	GGPUCrashDetected.store(true, std::memory_order_release);

	if (InterruptThread)
	{
		// Since we have an interrupt thread, allow it to process the GPU crash.
		// This is necessary so it can retrieve all the active payloads for resolving breadcrumbs.
		InterruptThread->Kick();

		// Wait for the interrupt thread to exit (which will never happen).
		InterruptThread->Join();
	}
	else
	{
		// If we have no interrupt thread, assume ownership on the current thread
		// (or block forever on the scope lock if multiple threads enter this function).
		FScopeLock Lock(&InterruptCS);
		TGuardValue Guard(InterruptThreadID, FPlatformTLS::GetCurrentThreadId());

		while (true)
		{
			ProcessInterruptQueue();
		}
	}
}

// Returns true if the calling thread is currently acting as the interrupt thread
// (either the dedicated thread, or a thread that has taken the interrupt lock).
bool FD3D12DynamicRHI::IsInInterruptThread() const
{
	uint32 ThisThreadID = FPlatformTLS::GetCurrentThreadId();

	// If we don't have a dedicated interrupt thread, the thread currently acting
	// as the interrupt thread is tracked via the InterruptThreadID field.
	if (InterruptThread)
	{
		return ThisThreadID == InterruptThread->GetThreadID();
	}
	else
	{
		return ThisThreadID == InterruptThreadID;
	}
}

// Field-wise accumulation of D3D12 pipeline statistics; used to sum the per-queue
// stats before publishing them in ProcessTimestamps().
D3D12_QUERY_DATA_PIPELINE_STATISTICS& operator += (D3D12_QUERY_DATA_PIPELINE_STATISTICS& LHS, D3D12_QUERY_DATA_PIPELINE_STATISTICS const& RHS)
{
	LHS.IAVertices    += RHS.IAVertices;
	LHS.IAPrimitives  += RHS.IAPrimitives;
	LHS.VSInvocations += RHS.VSInvocations;
	LHS.GSInvocations += RHS.GSInvocations;
	LHS.GSPrimitives  += RHS.GSPrimitives;
	LHS.CInvocations  += RHS.CInvocations;
	LHS.CPrimitives   += RHS.CPrimitives;
	LHS.PSInvocations += RHS.PSInvocations;
	LHS.HSInvocations += RHS.HSInvocations;
	LHS.DSInvocations += RHS.DSInvocations;
	LHS.CSInvocations += RHS.CSInvocations;
	return LHS;
}

// Polls each queue's fence for completed payloads and retires them: resolves their
// query results, signals their sync points' CPU events, and deletes the payloads.
// Also detects device removal (-> TerminateOnGPUCrash) and hung submissions
// (-> HandleGpuTimeout after GD3D12SubmissionTimeout seconds).
//
// Returns a result describing whether work is still pending / was processed, plus the
// maximum wait timeout the interrupt thread should use before re-checking for a hang.
//
// NOTE(review): every caller visible in this file holds InterruptCS (or is the sole
// crash-handling thread) before calling this — presumably the dedicated interrupt
// thread does the equivalent; confirm before adding new call sites.
FD3D12DynamicRHI::FProcessResult FD3D12DynamicRHI::ProcessInterruptQueue()
{
	SCOPED_NAMED_EVENT_TEXT("InterruptQueue_Process", FColor::Yellow);
	LLM_SCOPE_BYNAME(TEXT("RHIMisc/ProcessInterruptQueue"));

	// Timer that clamps each tick to prevent false positive GPU timeouts
	// when a debugger is attached and the process is broken.
	struct FTimer
	{
		uint64 Elapsed;
		uint64 Last;

		FTimer()
			: Elapsed(0)
			, Last(FPlatformTime::Cycles64())
		{}

		void Tick()
		{
			// Each tick contributes at most one second to Elapsed, so time spent
			// halted in a debugger does not count towards the hang timeout.
			static const uint64 MaxDeltaCycles = uint64(1.0 / FPlatformTime::GetSecondsPerCycle64()); // 1 second
			uint64 Current = FPlatformTime::Cycles64();
			Elapsed += FMath::Min(MaxDeltaCycles, Current - Last);
			Last = Current;
		}
	} static Timer;
	Timer.Tick();

	auto CheckForDeviceRemoved = [this](FD3D12Queue& Queue)
	{
		// If we get an error code here, we can't pass it directly to VERIFYD3D12RESULT, because that expects DXGI_ERROR_DEVICE_REMOVED,
		// DXGI_ERROR_DEVICE_RESET etc. and wants to obtain the reason code itself by calling GetDeviceRemovedReason (again).
		HRESULT DeviceRemovedReason = Queue.Device->GetDevice()->GetDeviceRemovedReason();
		if (DeviceRemovedReason != S_OK)
		{
			TerminateOnGPUCrash();
		}
	};

	FProcessResult Result;
	ForEachQueue([&](FD3D12Queue& CurrentQueue)
	{
		// Retire payloads in submission order; stop at the first one the GPU hasn't finished.
		while (FD3D12Payload* Payload = CurrentQueue.PendingInterrupt.Peek())
		{
			if (!Payload->bSubmitted)
				break;

			// Check for GPU completion
			uint64 CompletedFenceValue = CurrentQueue.Fence.D3DFence->GetCompletedValue();
			uint64 LastSignaledFenceValue = CurrentQueue.Fence.LastSignaledValue.load(std::memory_order_acquire);

			// If the GPU crashes or hangs, the driver will signal all fences to UINT64_MAX.
			if (CompletedFenceValue == UINT64_MAX)
			{
				CheckForDeviceRemoved(CurrentQueue);
			}

			if (CompletedFenceValue < Payload->CompletionFenceValue)
			{
				// Command list batch has not yet completed on this queue.
				// Ask the driver to wake this thread again when the required value is reached.
				if (InterruptThread && !CurrentQueue.Fence.bInterruptAwaited)
				{
					SCOPED_NAMED_EVENT_TEXT("SetEventOnCompletion", FColor::Red);
					VERIFYD3D12RESULT(CurrentQueue.Fence.D3DFence->SetEventOnCompletion(Payload->CompletionFenceValue, InterruptThread->Event));
					CurrentQueue.Fence.bInterruptAwaited = true;
				}

				// Skip processing this queue and move on to the next.
				Result.Status |= EQueueStatus::Pending;

				// Detect a hung GPU
				if (!Payload->SubmissionTime.IsSet() && LastSignaledFenceValue >= Payload->CompletionFenceValue)
				{
					//
					// Keep track of the first time we've checked for completion on the interrupt thread.
					// We set this here to avoid false positives when a debugger is attached. If we'd set this on the submission thread, it
					// is possible for the title to be paused by the debugger after the time is set but before the payload has reached the GPU.
					//
					Payload->SubmissionTime = Timer.Elapsed;
				}

				// SubmissionTime == Max() means the timeout for this payload was already reported.
				if (Payload->SubmissionTime.IsSet() && Payload->SubmissionTime != TNumericLimits::Max())
				{
					static const double CyclesPerSecond = 1.0 / FPlatformTime::GetSecondsPerCycle64();
					const uint64 TimeoutCycles = FMath::TruncToInt64(GD3D12SubmissionTimeout * CyclesPerSecond);

					uint64 ElapsedCycles = Timer.Elapsed - Payload->SubmissionTime.GetValue();
					if (ElapsedCycles > TimeoutCycles)
					{
						// The last submission on this pipe did not complete within the timeout period. Assume the GPU has hung.
						HandleGpuTimeout(Payload, ElapsedCycles * FPlatformTime::GetSecondsPerCycle64());

						// Set to int max to indicate we've already reported the timeout for this payload.
						Payload->SubmissionTime = TNumericLimits::Max();
					}
					else
					{
						// Adjust the event wait timeout to cause the interrupt thread to wake automatically when
						// the timeout for this payload is reached, assuming it hasn't been woken by the GPU already.
						uint64 RemainingCycles = TimeoutCycles - ElapsedCycles;
						uint32 RemainingMilliseconds = FMath::TruncToInt(RemainingCycles * FPlatformTime::GetSecondsPerCycle64() * 1000.0);
						Result.WaitTimeout = FMath::Min(Result.WaitTimeout, RemainingMilliseconds);
					}
				}

				break;
			}

			// At this point, the current command list has completed on the GPU.
			CurrentQueue.Fence.bInterruptAwaited = false;
			CurrentQueue.PendingInterrupt.Pop();
			Result.Status |= EQueueStatus::Processed;

			// Resolve query results
			{
				for (FD3D12QueryLocation& Query : Payload->BatchedObjects.OcclusionQueries)
				{
					check(Query.Target);
					Query.CopyResultTo(Query.Target);
				}

				for (FD3D12QueryLocation& Query : Payload->BatchedObjects.PipelineStatsQueries)
				{
					if (Query.Target)
					{
						Query.CopyResultTo(Query.Target);
					}
					else
					{
						// Pipeline stats queries without targets are the ones that surround whole command lists.
						CurrentQueue.Timing->PipelineStats += Query.GetResult();
					}
				}

				if (Payload->BatchedObjects.TimestampQueries.Num())
				{
					// Some timestamp queries report in microseconds
					const double MicrosecondsScale = 1000000.0 / CurrentQueue.Device->GetTimestampFrequency(CurrentQueue.QueueType);

					for (FD3D12QueryLocation& Query : Payload->BatchedObjects.TimestampQueries)
					{
						if (Query.Target)
						{
							Query.CopyResultTo(Query.Target);
						}

						switch (Query.Type)
						{
						case ED3D12QueryType::TimestampMicroseconds:
						case ED3D12QueryType::TimestampRaw:
							check(Query.Target);
							if (Query.Type == ED3D12QueryType::TimestampMicroseconds)
							{
								// Convert to microseconds
								*static_cast(Query.Target) = FPlatformMath::TruncToInt(double(*static_cast(Query.Target)) * MicrosecondsScale);
							}
							break;

#if RHI_NEW_GPU_PROFILER
						case ED3D12QueryType::ProfilerTimestampTOP:
						case ED3D12QueryType::ProfilerTimestampBOP:
						{
							// Convert from GPU timestamp to CPU timestamp (relative to FPlatformTime::Cycles64())
							uint64& Target = *static_cast(Query.Target);
							uint64 GPUDelta = Target - CurrentQueue.Timing->GPUTimestamp;
							uint64 CPUDelta = (GPUDelta * CurrentQueue.Timing->CPUFrequency) / CurrentQueue.Timing->GPUFrequency;
							Target = CPUDelta + CurrentQueue.Timing->CPUTimestamp;
						}
						break;
#else
						case ED3D12QueryType::CommandListBegin:
						case ED3D12QueryType::CommandListEnd:
						case ED3D12QueryType::IdleBegin:
						case ED3D12QueryType::IdleEnd:
							check(CurrentQueue.Timing);
							CurrentQueue.Timing->Timestamps.Add(Query.GetResult());
							break;
#endif
						}
					}
				}
			}

#if RHI_NEW_GPU_PROFILER
			if (!Payload->EventStream.IsEmpty())
			{
				check(CurrentQueue.Timing);
				CurrentQueue.Timing->EventStream.Append(MoveTemp(Payload->EventStream));
			}
#endif

			if (Payload->Timing.IsSet())
			{
				// Switch the new timing struct into the queue. This redirects timestamp results to separate each frame's work.
				CurrentQueue.Timing = Payload->Timing.GetValue();
			}

			// Signal the CPU events of all sync points associated with this batch.
			for (FD3D12SyncPointRef& SyncPoint : Payload->SyncPointsToSignal)
			{
				if (SyncPoint->GraphEvent)
				{
					SyncPoint->GraphEvent->DispatchSubsequents();
				}
			}

			// We're done with this payload now.
			// GPU resources the payload is holding a reference to will be cleaned up here.
			// E.g. command list allocators, which get recycled on the parent device.
			delete Payload;
		}

		CheckForDeviceRemoved(CurrentQueue);
	});

	if (GGPUCrashDetected.load(std::memory_order_relaxed))
	{
		// If this was set by ProcessInterruptQueueOnGPUCrash, we know a crash was detected, so process it immediately. We can't always rely on
		// queue processing to catch it, as GetDeviceRemovedReason sometimes returns S_OK despite an earlier API call having reported a lost device.
		TerminateOnGPUCrash();
	}

	return Result;
}

FD3D12PayloadBase::FD3D12PayloadBase(FD3D12Queue& Queue)
	: Queue(Queue)
#if RHI_NEW_GPU_PROFILER
	, EventStream(Queue.GetProfilerQueue())
#endif
{}

// Returns this payload's command allocators to the owning device for recycling.
FD3D12PayloadBase::~FD3D12PayloadBase()
{
	for (FD3D12CommandAllocator* Allocator : AllocatorsToRelease)
	{
		Queue.Device->ReleaseCommandAllocator(Allocator);
	}
}

// Invokes the optional pre-execute callback with the D3D command queue, just before
// this payload's command lists are executed.
void FD3D12PayloadBase::PreExecute()
{
	if (PreExecuteCallback)
	{
		PreExecuteCallback(Queue.D3DCommandQueue);
	}
}

#ifndef D3D12_PREFER_QUERIES_FOR_GPU_TIME
	#define D3D12_PREFER_QUERIES_FOR_GPU_TIME 0
#endif

static TAutoConsoleVariable CVarGPUTimeFromTimestamps(
	TEXT("r.D3D12.GPUTimeFromTimestamps"),
	D3D12_PREFER_QUERIES_FOR_GPU_TIME,
	TEXT("Prefer timestamps instead of GetHardwareGPUFrameTime to compute GPU frame time"),
	ECVF_RenderThreadSafe);

// Consumes the per-queue timing results for a completed frame and publishes the GPU
// frame time and pipeline-statistics counters.
//
// Legacy (non RHI_NEW_GPU_PROFILER) path: the begin/end timestamps of all queues are
// merged in a single min-heap-style sweep. BusyPipes counts how many pipes are inside
// a busy interval at the current sweep position; UnionBusyCycles accumulates wall time
// where at least one pipe was busy (drives "GPU Total Time"), while each pipe's own
// busy time is accumulated into its BusyCycles (drives the per-queue stats).
void FD3D12DynamicRHI::ProcessTimestamps(FD3D12TimingArray const& TimingPerQueue)
{
#if RHI_NEW_GPU_PROFILER
	{
		// Hand each queue's event stream to the central profiler.
		TArray> Streams;
		for (auto const& Timing : TimingPerQueue)
		{
			Streams.Add(MoveTemp(Timing->EventStream));
		}
		UE::RHI::GPUProfiler::ProcessEvents(Streams);
	}
#else
	// The total number of cycles where at least one GPU pipe was busy during the frame.
	uint64 UnionBusyCycles = 0;

	int32 BusyPipes = 0;
	uint64 LastMinCycles = 0;
	bool bFirst = true;

	// Process the time ranges from each pipe.
	while (true)
	{
		// Find the next minimum timestamp
		FD3D12Timing* NextMin = nullptr;
		for (auto const& Current : TimingPerQueue)
		{
			if (Current->HasMoreTimestamps() && (!NextMin || Current->GetCurrentTimestamp() < NextMin->GetCurrentTimestamp()))
			{
				NextMin = Current.Get();
			}
		}

		if (!NextMin)
			break; // No more timestamps to process

		if (!bFirst)
		{
			if (BusyPipes > 0 && NextMin->GetCurrentTimestamp() > LastMinCycles)
			{
				// Accumulate the union busy time across all pipes
				UnionBusyCycles += NextMin->GetCurrentTimestamp() - LastMinCycles;
			}

			if (!NextMin->IsStartingWork())
			{
				// Accumulate the busy time for this pipe specifically.
				NextMin->BusyCycles += NextMin->GetCurrentTimestamp() - NextMin->GetPreviousTimestamp();
			}
		}

		LastMinCycles = NextMin->GetCurrentTimestamp();

		// A "start" timestamp opens a busy interval on this pipe; an "end" closes it.
		BusyPipes += NextMin->IsStartingWork() ? 1 : -1;
		check(BusyPipes >= 0);

		NextMin->AdvanceTimestamp();
		bFirst = false;
	}

	check(BusyPipes == 0);
#endif

	// Sum the pipeline statistics across all queues and publish them.
	D3D12_QUERY_DATA_PIPELINE_STATISTICS PipelineStats{};
	for (auto const& Current : TimingPerQueue)
	{
		PipelineStats += Current->PipelineStats;
	}

	SET_DWORD_STAT(STAT_D3D12RHI_IAVertices   , PipelineStats.IAVertices   );
	SET_DWORD_STAT(STAT_D3D12RHI_IAPrimitives , PipelineStats.IAPrimitives );
	SET_DWORD_STAT(STAT_D3D12RHI_VSInvocations, PipelineStats.VSInvocations);
	SET_DWORD_STAT(STAT_D3D12RHI_GSInvocations, PipelineStats.GSInvocations);
	SET_DWORD_STAT(STAT_D3D12RHI_GSPrimitives , PipelineStats.GSPrimitives );
	SET_DWORD_STAT(STAT_D3D12RHI_CInvocations , PipelineStats.CInvocations );
	SET_DWORD_STAT(STAT_D3D12RHI_CPrimitives  , PipelineStats.CPrimitives  );
	SET_DWORD_STAT(STAT_D3D12RHI_PSInvocations, PipelineStats.PSInvocations);
	SET_DWORD_STAT(STAT_D3D12RHI_HSInvocations, PipelineStats.HSInvocations);
	SET_DWORD_STAT(STAT_D3D12RHI_DSInvocations, PipelineStats.DSInvocations);
	SET_DWORD_STAT(STAT_D3D12RHI_CSInvocations, PipelineStats.CSInvocations);

#if RHI_NEW_GPU_PROFILER == 0
	// @todo mgpu - how to handle multiple devices / queues with potentially different timestamp frequencies?
	FD3D12Device* Device = GetAdapter().GetDevice(0);
	double Frequency = Device->GetTimestampFrequency(ED3D12QueueType::Direct);
	const double Scale64 = 1.0 / (Frequency * FPlatformTime::GetSecondsPerCycle64());

	// Update the global GPU frame time stats
	SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTime, FPlatformMath::TruncToInt(double(UnionBusyCycles) * Scale64));

	// Prefer the hardware frame-time source unless r.D3D12.GPUTimeFromTimestamps forces timestamps.
	double HardwareGPUTime = 0.0;
	if (GetHardwareGPUFrameTime(HardwareGPUTime) && CVarGPUTimeFromTimestamps.GetValueOnAnyThread() == 0)
	{
		SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTimeHW, HardwareGPUTime);
		GRHIGPUFrameTimeHistory.PushFrameCycles(1.0 / FPlatformTime::GetSecondsPerCycle64(), HardwareGPUTime);
	}
	else
	{
		SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTimeHW, 0);
		GRHIGPUFrameTimeHistory.PushFrameCycles(Frequency, UnionBusyCycles);
	}

	// Publish per-queue busy time under the matching stat for that queue type.
	for (auto const& Current : TimingPerQueue)
	{
		switch (Current->Queue.QueueType)
		{
		case ED3D12QueueType::Direct: SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTimeGraphics    , FPlatformMath::TruncToInt(double(Current->BusyCycles) * Scale64)); break;
		case ED3D12QueueType::Async : SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTimeAsyncCompute, FPlatformMath::TruncToInt(double(Current->BusyCycles) * Scale64)); break;
		case ED3D12QueueType::Copy  : SET_CYCLE_COUNTER(STAT_RHI_GPUTotalTimeCopy        , FPlatformMath::TruncToInt(double(Current->BusyCycles) * Scale64)); break;
		}
	}
#endif
}