diff --git a/cmake/XglCompileDefinitions.cmake b/cmake/XglCompileDefinitions.cmake index 8ddc2268..7a345008 100644 --- a/cmake/XglCompileDefinitions.cmake +++ b/cmake/XglCompileDefinitions.cmake @@ -119,6 +119,9 @@ macro(xgl_set_compile_definitions) endif() #endif +#if VKI_RAY_TRACING +#endif + if (XGL_ENABLE_GCOV) target_compile_definitions(xgl PRIVATE ICD_ENABLE_GCOV) endif() @@ -132,6 +135,9 @@ macro(xgl_set_compile_definitions) #if VKI_RAY_TRACING #endif +#if VKI_RAY_TRACING +#endif + #if VKI_RAY_TRACING #endif diff --git a/cmake/XglVersions.cmake b/cmake/XglVersions.cmake index 02029eeb..4f65ae21 100644 --- a/cmake/XglVersions.cmake +++ b/cmake/XglVersions.cmake @@ -28,7 +28,7 @@ include_guard() # This will become the value of PAL_CLIENT_INTERFACE_MAJOR_VERSION. It describes the version of the PAL interface # that the ICD supports. PAL uses this value to enable backwards-compatibility for older interface versions. # It must be updated on each PAL promotion after handling all of the interface changes described in palLib.h. -set(ICD_PAL_CLIENT_MAJOR_VERSION "867") +set(ICD_PAL_CLIENT_MAJOR_VERSION "878") # This will become the value of GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION if ICD_GPUOPEN_DEVMODE_BUILD=1. # It describes the interface version of the gpuopen shared module (part of PAL) that the ICD supports. diff --git a/icd/CMakeLists.txt b/icd/CMakeLists.txt index 2da6c892..3eefab43 100644 --- a/icd/CMakeLists.txt +++ b/icd/CMakeLists.txt @@ -169,6 +169,7 @@ if (VKI_RAY_TRACING) api/raytrace/vk_ray_tracing_pipeline.cpp api/raytrace/ray_tracing_device.cpp api/vk_deferred_operation.cpp + api/appopt/bvh_batch_layer.cpp ) endif() #endif diff --git a/icd/Loader/LunarG/Lnx/amd-icd.json b/icd/Loader/LunarG/Lnx/amd-icd.json index c6aed269..f7817f69 100644 --- a/icd/Loader/LunarG/Lnx/amd-icd.json +++ b/icd/Loader/LunarG/Lnx/amd-icd.json @@ -2,13 +2,13 @@ "file_format_version": "1.0.0", "ICD": { "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.280" + "api_version": "1.3.285" }, "layer": { "name": "VK_LAYER_AMD_switchable_graphics_@ISABITS@", "type": "GLOBAL", "library_path": "@AMDVLK_INSTALL_PATH@/amdvlk@ISABITS@.so", - "api_version": "1.3.280", + "api_version": "1.3.285", "implementation_version": "1", "description": "AMD switchable graphics layer", "functions": { diff --git a/icd/api/app_profile.cpp b/icd/api/app_profile.cpp index cf466325..c3013a5f 100644 --- a/icd/api/app_profile.cpp +++ b/icd/api/app_profile.cpp @@ -100,6 +100,12 @@ struct AppProfilePattern AppProfilePatternEntry entries[16]; }; +// define PatternEnd + +constexpr AppProfilePatternEntry PatternEnd = {}; + +// Section START AppProfilePatternEntry for all Games + constexpr AppProfilePatternEntry AppNameDoom = { PatternAppNameLower, @@ -600,6 +606,24 @@ constexpr AppProfilePatternEntry AppNameX4Foundations "x4" }; +constexpr AppProfilePatternEntry AppNameHaloInfiniteLauncher +{ + PatternAppNameLower, + "haloinfinite.exe" +}; + +constexpr AppProfilePatternEntry AppNameTf2Win64 +{ + PatternAppNameLower, + "tf_win64.exe" +}; + +constexpr AppProfilePatternEntry AppNameTf2Linux64 +{ + PatternAppNameLower, + "tf_linux64" +}; + constexpr AppProfilePatternEntry AppNameX4Engine { PatternEngineNameLower, @@ -732,9 +756,36 @@ constexpr AppProfilePatternEntry AppEngineQuanticDream "quantic dream engine" }; -constexpr AppProfilePatternEntry PatternEnd = {}; +constexpr AppProfilePatternEntry AppNameEnshrouded = +{ + PatternAppNameLower, + "enshrouded" +}; + +constexpr AppProfilePatternEntry AppEngineHolistic = 
+{ + PatternEngineNameLower, + "holistic" +}; + +constexpr AppProfilePatternEntry AppNameWindowKill = +{ + PatternAppNameLower, + "windowkill" +}; + +constexpr AppProfilePatternEntry AppEngineGodot = +{ + PatternEngineNameLower, + "godot engine" +}; + +// Section END of AppProfilePatternEntry for all games // This is a table of patterns. The first matching pattern in this table will be returned. +// Note: If an app gets detected by both app name and engine name, +// whatever comes first in this array will be the chosen app profile in ScanApplicationProfile(). +// This should get fixed so not as to get bitten by the order here! AppProfilePattern AppPatternTable[] = { { @@ -800,14 +851,6 @@ AppProfilePattern AppPatternTable[] = } }, - { - AppProfile::IdTechEngine, - { - AppEngineIdTech, - PatternEnd - } - }, - { AppProfile::Dota2, { @@ -1375,6 +1418,32 @@ AppProfilePattern AppPatternTable[] = } }, + { + AppProfile::DxvkHaloInfiniteLauncher, + { + AppNameHaloInfiniteLauncher, + AppEngineDXVK, + PatternEnd + } + }, + + { + AppProfile::DxvkTf2, + { + AppNameTf2Win64, + AppEngineDXVK, + PatternEnd + } + }, + + { + AppProfile::DxvkTf2, + { + AppNameTf2Linux64, + AppEngineDXVK, + PatternEnd + } + }, { AppProfile::MetalGearSolid5, { @@ -1466,6 +1535,23 @@ AppProfilePattern AppPatternTable[] = } }, + { + AppProfile::Enshrouded, + { + AppNameEnshrouded, + AppEngineHolistic, + PatternEnd + } + }, + + { + AppProfile::HolisticEngine, + { + AppEngineHolistic, + PatternEnd + } + }, + { AppProfile::Zink, { @@ -1496,6 +1582,23 @@ AppProfilePattern AppPatternTable[] = AppEngineDXVK, PatternEnd } + }, + + { + AppProfile::IdTechEngine, + { + AppEngineIdTech, + PatternEnd + } + }, + + { + AppProfile::WindowKill, + { + AppNameWindowKill, + AppEngineGodot, + PatternEnd + } } }; diff --git a/icd/api/app_shader_optimizer.cpp b/icd/api/app_shader_optimizer.cpp index 475b230f..94fbe86b 100644 --- a/icd/api/app_shader_optimizer.cpp +++ b/icd/api/app_shader_optimizer.cpp @@ -1154,12 +1154,22 @@ void ShaderOptimizer::BuildAppProfile() if ((m_settings.pipelineProfileIgnoresAppProfile == false) && (pMemory != nullptr)) { memset(pMemory, 0, newSize); + BuildAppProfileGeneric(); { BuildAppProfileLlpc(); } } } +// ===================================================================================================================== +void ShaderOptimizer::BuildAppProfileGeneric() +{ + const AppProfile appProfile = m_pDevice->GetAppProfile(); + const Pal::GpuType gpuType = m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->PalProperties().gpuType; + + uint32 i = 0; +} + // ===================================================================================================================== void ShaderOptimizer::BuildAppProfileLlpc() { diff --git a/icd/api/appopt/bvh_batch_layer.cpp b/icd/api/appopt/bvh_batch_layer.cpp new file mode 100644 index 00000000..9fc0d5d7 --- /dev/null +++ b/icd/api/appopt/bvh_batch_layer.cpp @@ -0,0 +1,818 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file bvh_batch_layer.cpp +* @brief Implementation of bvh batch layer. +*********************************************************************************************************************** +*/ + +#if VKI_RAY_TRACING + +#include + +#include "bvh_batch_layer.h" +#include "vk_cmdbuffer.h" +#include "raytrace/ray_tracing_device.h" +#include "palVectorImpl.h" + +namespace vk +{ + +// ===================================================================================================================== +BvhBatchLayer::BvhBatchLayer( + Device* pDevice) + : + m_pInstance(pDevice->VkInstance()), + m_emptyStateCount(0), + m_pEmptyStateStack() +{ +} + +// ===================================================================================================================== +BvhBatchLayer::~BvhBatchLayer() +{ + for (uint32_t stateIdx = 0; stateIdx < m_emptyStateCount; ++stateIdx) + { + m_pEmptyStateStack[stateIdx]->DestroyState(); + } +} + +// ===================================================================================================================== +VkResult BvhBatchLayer::Init( + Device* pDevice) +{ + VkResult result = VK_SUCCESS; + + if (pDevice->GetRuntimeSettings().batchBvhBuilds == BatchBvhModeImplicitAndLog) + { + const char* pRootDir = pDevice->PalDevice(DefaultDeviceIndex)->GetDebugFilePath(); + + if (pRootDir != nullptr) + { + char absPath[1024] = {}; + Util::Snprintf(absPath, sizeof(absPath), "%s/%s", pRootDir, "BvhBatchLog.txt"); + + if (result == VK_SUCCESS) + { + result = PalToVkResult(m_logFile.Open(absPath, Util::FileAccessMode::FileAccessAppend)); + } + + if (result == VK_SUCCESS) + { + result = PalToVkResult(m_logFile.Printf("|--------------BEGIN RUN--------------\n")); + } + } + else + { + // AMD_DEBUG_DIR must be set for logging + result = VK_ERROR_UNKNOWN; + } + } + + return result; +} + +// ===================================================================================================================== +VkResult BvhBatchLayer::CreateLayer( + Device* pDevice, + BvhBatchLayer** ppLayer) +{ + VkResult result = VK_SUCCESS; + BvhBatchLayer* pLayer = nullptr; + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + + if 
((settings.batchBvhBuilds == BatchBvhModeImplicit) || (settings.batchBvhBuilds == BatchBvhModeImplicitAndLog)) + { + void* pMem = pDevice->VkInstance()->AllocMem(sizeof(BvhBatchLayer), VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); + + if (pMem != nullptr) + { + pLayer = VK_PLACEMENT_NEW(pMem) BvhBatchLayer(pDevice); + } + else + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } + + if (result == VK_SUCCESS) + { + result = pLayer->Init(pDevice); + } + + if (result == VK_SUCCESS) + { + *ppLayer = pLayer; + } + + return result; +} + +// ===================================================================================================================== +void BvhBatchLayer::DestroyLayer() +{ + m_logFile.Printf("|--------------END RUN--------------\n"); + m_logFile.Close(); + + Instance* pInstance = VkInstance(); + Util::Destructor(this); + pInstance->FreeMem(this); +} + +// ===================================================================================================================== +void BvhBatchLayer::VLog( + const char* pFormat, + va_list argList) +{ + VK_ASSERT(LoggingEnabled()); + + Util::MutexAuto lock(&m_mutex); + + Util::Result printResult = m_logFile.VPrintf(pFormat, argList); + VK_ASSERT(printResult == Util::Result::Success); +} + +// ===================================================================================================================== +BvhBatchState* BvhBatchLayer::CreateState( + CmdBuffer* pCmdBuffer) +{ + // Try to reuse a previously freed state + BvhBatchState* pState = PopEmptyState(); + + if (pState != nullptr) + { + pState->Log("Reusing a stashed BvhBatchState.\n"); + } + else + { + // Allocate a new state if no previously freed states were available + void* pMem = m_pInstance->AllocMem(sizeof(BvhBatchState), VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + pState = (pMem != nullptr) ? 
(VK_PLACEMENT_NEW(pMem) BvhBatchState(this)) : nullptr; + } + + // Link this state to the given cmd buffer + pCmdBuffer->SetBvhBatchState(pState); + + VK_ASSERT(pState != nullptr); + return pState; +} + +// ===================================================================================================================== +bool BvhBatchLayer::PushEmptyState( + BvhBatchState* pState) +{ + bool success = false; + + Util::MutexAuto lock(&m_mutex); + + if (m_emptyStateCount < VK_ARRAY_SIZE(m_pEmptyStateStack)) + { + m_pEmptyStateStack[m_emptyStateCount] = pState; + m_emptyStateCount++; + + success = true; + } + + return success; +} + +// ===================================================================================================================== +BvhBatchState* BvhBatchLayer::PopEmptyState() +{ + BvhBatchState* pState = nullptr; + + Util::MutexAuto lock(&m_mutex); + + if (m_emptyStateCount > 0) + { + m_emptyStateCount--; + pState = m_pEmptyStateStack[m_emptyStateCount]; + } + + return pState; +} + +// ===================================================================================================================== +BvhBatchState::BvhBatchState( + BvhBatchLayer* pLayer) + : + m_type(BvhBatchType::Undefined), + m_pCmdBuffer(nullptr), + m_pLayer(pLayer), + m_geomInfos(pLayer->VkInstance()->Allocator()), + m_rangeInfosOrMaxPrimCounts(pLayer->VkInstance()->Allocator()), + m_indirectVirtAddrs(pLayer->VkInstance()->Allocator()), + m_indirectStrides(pLayer->VkInstance()->Allocator()), + m_infoCount(0), + m_allocations(pLayer->VkInstance()->Allocator()) +{ + Log("Allocating a new BvhBatchState.\n"); +} + +// ===================================================================================================================== +BvhBatchState::~BvhBatchState() +{ +} + +// ===================================================================================================================== +void BvhBatchState::Log( + const char* pFormat, + ...) 
+{ + if (m_pLayer->LoggingEnabled()) + { + char prependedStr[21] = {}; + Util::Snprintf(prependedStr, sizeof(prependedStr), "|-- 0x%" PRIx64 " - ", this); + + va_list argList = {}; + m_pLayer->VLog(prependedStr, argList); + + va_start(argList, pFormat); + m_pLayer->VLog(pFormat, argList); + va_end(argList); + } +} + +// ===================================================================================================================== +void BvhBatchState::DestroyState() +{ + Log("Freeing a BvhBatchState.\n"); + Util::Destructor(this); + m_pLayer->VkInstance()->FreeMem(this); +} + +// ===================================================================================================================== +void BvhBatchState::Reset() +{ + for (auto pMem : m_allocations) + { + m_pLayer->VkInstance()->FreeMem(pMem); + } + + m_type = BvhBatchType::Undefined; + m_allocations.Clear(); + m_geomInfos.Clear(); + m_rangeInfosOrMaxPrimCounts.Clear(); + m_indirectVirtAddrs.Clear(); + m_indirectStrides.Clear(); + m_infoCount = 0; + + // Unlink this state from the cmd buffer + m_pCmdBuffer->SetBvhBatchState(nullptr); + m_pCmdBuffer = nullptr; + + // Try to stash this now empty state to be reused later + if (m_pLayer->PushEmptyState(this)) + { + Log("Stashing a BvhBatchState during reset.\n"); + } + else + { + DestroyState(); + } +} + +// ===================================================================================================================== +template +bool BvhBatchState::EnqueueBvhBuild( + CmdBuffer* pCmdBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts) +{ + static_assert(batchType != BvhBatchType::Undefined, "Invalid batch type provided to EnqueueBvhBuild via template."); + + // Ensure the batch type in the state matches + if ((m_type != batchType) && (m_type != BvhBatchType::Undefined)) + { + Flush(); + } + + // Determine how much memory the hard copy needs + size_t memSize = GetHardCopyMemSize(infoCount, pInfos); + + // Allocate memory for the hard copy + void* pMem = m_pLayer->VkInstance()->AllocMem(memSize, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + // Hard copy given data + if (pMem != nullptr) + { + if (m_infoCount == 0) + { + m_pCmdBuffer = pCmdBuffer; + } + else if (m_pCmdBuffer != pCmdBuffer) + { + // CmdBuffer pointer shouldn't change when pending infos are present + VK_NEVER_CALLED(); + Flush(); + } + + Log("Enqueueing %u BVH build infos (batchType - %u).\n", infoCount, batchType); + HardCopyBuildInfos( + infoCount, + pInfos, + ppBuildRangeInfos, + pIndirectDeviceAddresses, + pIndirectStrides, + ppMaxPrimitiveCounts, + pMem, + memSize); + } + else + { + // Failed to allocate memory + VK_NEVER_CALLED(); + } + + return (pMem != nullptr); +} + +// ===================================================================================================================== +void BvhBatchState::Flush() +{ + if (m_infoCount > 0) + { + BvhBatchLayer* pLayer = m_pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + + VK_ASSERT(m_type != BvhBatchType::Undefined); + + if (m_type == BvhBatchType::Direct) + { + Log("Flushing a direct build batch (infoCount - %u).\n", m_infoCount); + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdBuildAccelerationStructuresKHR)( + reinterpret_cast(ApiCmdBuffer::FromObject(m_pCmdBuffer)), + m_infoCount, + m_geomInfos.Data(), + 
reinterpret_cast(m_rangeInfosOrMaxPrimCounts.Data())); + } + else + { + Log("Flushing an indirect build batch (infoCount - %u).\n", m_infoCount); + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdBuildAccelerationStructuresIndirectKHR)( + reinterpret_cast(ApiCmdBuffer::FromObject(m_pCmdBuffer)), + m_infoCount, + m_geomInfos.Data(), + m_indirectVirtAddrs.Data(), + m_indirectStrides.Data(), + reinterpret_cast(m_rangeInfosOrMaxPrimCounts.Data())); + } + + Reset(); + } +} + +// ===================================================================================================================== +void BvhBatchState::TryFlush( + VkFlags64 srcStageMask) +{ + constexpr VkFlags64 TargetSrcStages = + VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR | + VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT; + + if ((srcStageMask & TargetSrcStages) != 0) + { + Log("Flushing via barrier or event (srcStageMask - %llu).\n", srcStageMask); + Flush(); + } +} + +// ===================================================================================================================== +void BvhBatchState::TryFlush( + uint32_t depInfoCount, + const VkDependencyInfo* pDependencyInfos) +{ + VkFlags64 globalSrcMask = 0u; + + for (uint32_t i = 0; i < depInfoCount; ++i) + { + const auto& dependencyInfo = pDependencyInfos[i]; + + for (uint32_t j = 0; j < dependencyInfo.memoryBarrierCount; j++) + { + globalSrcMask |= dependencyInfo.pMemoryBarriers[j].srcStageMask; + } + for (uint32_t j = 0; j < dependencyInfo.bufferMemoryBarrierCount; j++) + { + globalSrcMask |= dependencyInfo.pBufferMemoryBarriers[j].srcStageMask; + } + for (uint32_t j = 0; j < dependencyInfo.imageMemoryBarrierCount; j++) + { + globalSrcMask |= dependencyInfo.pImageMemoryBarriers[j].srcStageMask; + } + } + + TryFlush(globalSrcMask); +} + +// ===================================================================================================================== +template +size_t BvhBatchState::GetHardCopyMemSize( + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos) +{ + // Calculate total geometry structs and ptrs across all infos + size_t totalGeomCount = 0; + size_t totalGeomPtrCount = 0; + for (uint32_t infoIdx = 0; infoIdx < infoCount; ++infoIdx) + { + totalGeomCount += pInfos[infoIdx].geometryCount; + + if (pInfos[infoIdx].ppGeometries != nullptr) + { + totalGeomPtrCount += pInfos[infoIdx].geometryCount; + } + } + + // Memory size for pGeometries and ppGeometies + size_t memSize = + (totalGeomCount * sizeof(VkAccelerationStructureGeometryKHR)) + + (totalGeomPtrCount * sizeof(void*)); + + // Memory size for ppBuildRangeInfos or ppMaxPrimitiveCounts + if (batchType == BvhBatchType::Direct) + { + memSize += (totalGeomCount * sizeof(VkAccelerationStructureBuildRangeInfoKHR)); + } + else + { + memSize += (totalGeomCount * sizeof(uint32_t*)); + } + + // Report the memory size required + return memSize; +} + +// ===================================================================================================================== +template +void BvhBatchState::HardCopyBuildInfos( + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts, + void* pMem, + size_t memSize) +{ + m_allocations.PushBack(pMem); + + for (uint32_t infoIdx = 0; infoIdx < infoCount; ++infoIdx) + { + 
VkAccelerationStructureBuildGeometryInfoKHR geomInfoDst = pInfos[infoIdx]; + + // Per spec, pNext must be NULL + VK_ASSERT(geomInfoDst.pNext == nullptr); + + const size_t geometrySize = geomInfoDst.geometryCount * sizeof(VkAccelerationStructureGeometryKHR); + const size_t geometryPtrSize = geomInfoDst.geometryCount * sizeof(void*); + + if (geomInfoDst.ppGeometries != nullptr) + { + // Array of Goemetry pointers + VkAccelerationStructureGeometryKHR** ppGeometries = + static_cast(pMem); + + // Geometry descs follow the pointers + VkAccelerationStructureGeometryKHR* pGeometries = + static_cast(Util::VoidPtrInc(pMem, geometryPtrSize)); + + // Copy each geometry info and its new pointer into the internal allocation + for (uint32 i = 0; i < geomInfoDst.geometryCount; i++) + { + pGeometries[i] = *geomInfoDst.ppGeometries[i]; + ppGeometries[i] = &pGeometries[i]; + } + + // Apply the local copy + geomInfoDst.ppGeometries = + static_cast(pMem); + + // Increment the data pointer for the following copy + pMem = Util::VoidPtrInc(pMem, geometrySize + geometryPtrSize); + } + else + { + // Copy original geometry info into the internal allocation + memcpy(pMem, geomInfoDst.pGeometries, geometrySize); + + // Apply the local copy + geomInfoDst.pGeometries = + static_cast(pMem); + + // Increment the data pointer for the following copy + pMem = Util::VoidPtrInc(pMem, geometrySize); + } + + m_type = batchType; + m_geomInfos.PushBack(geomInfoDst); + m_infoCount++; + + if (batchType == BvhBatchType::Direct) + { + // Copy BuildRangeInfos into internal allocation + const size_t rangeInfoSize = geomInfoDst.geometryCount * sizeof(VkAccelerationStructureBuildRangeInfoKHR); + memcpy(pMem, ppBuildRangeInfos[infoIdx], rangeInfoSize); + + m_rangeInfosOrMaxPrimCounts.PushBack(pMem); + + // Increment the data pointer for the following copy + pMem = Util::VoidPtrInc(pMem, rangeInfoSize); + } + else + { + // Copy MaxPrimitiveCounts into internal allocation + const size_t maxPrimCountsSize = geomInfoDst.geometryCount * sizeof(uint32_t); + memcpy(pMem, ppMaxPrimitiveCounts[infoIdx], maxPrimCountsSize); + + m_rangeInfosOrMaxPrimCounts.PushBack(pMem); + + // Increment the data pointer for the following copy + pMem = Util::VoidPtrInc(pMem, maxPrimCountsSize); + + m_indirectVirtAddrs.PushBack(pIndirectDeviceAddresses[infoIdx]); + m_indirectStrides.PushBack(pIndirectStrides[infoIdx]); + } + } + + // Ensure that we did not overallocate nor underallocate + VK_ASSERT((reinterpret_cast(pMem) - reinterpret_cast(m_allocations.Back())) == memSize); +} + +namespace entry +{ + +namespace bvhBatchLayer +{ + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdBuildAccelerationStructuresKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) +{ + bool queued = false; + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState == nullptr) + { + pState = pLayer->CreateState(pCmdBuffer); + } + + if (pState != nullptr) + { + queued = pState->EnqueueBvhBuild( + pCmdBuffer, + infoCount, + pInfos, + ppBuildRangeInfos, + nullptr, + nullptr, + nullptr); + + if (queued == false) + { + // State exists, but we were not able to enqueue. 
Flush any valid contents in the batch. + pState->Flush(); + } + } + + if (queued == false) + { + // We were not able to batch. Add directly to cmd buffer. + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdBuildAccelerationStructuresKHR)( + commandBuffer, + infoCount, + pInfos, + ppBuildRangeInfos); + } +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdBuildAccelerationStructuresIndirectKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts) +{ + bool queued = false; + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState == nullptr) + { + pState = pLayer->CreateState(pCmdBuffer); + } + + if (pState != nullptr) + { + queued = pState->EnqueueBvhBuild( + pCmdBuffer, + infoCount, + pInfos, + nullptr, + pIndirectDeviceAddresses, + pIndirectStrides, + ppMaxPrimitiveCounts); + + if (queued == false) + { + // State exists, but we were not able to enqueue. Flush any valid contents in the batch. + pState->Flush(); + } + } + + if (queued == false) + { + // We were not able to batch. Add directly to cmd buffer. + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdBuildAccelerationStructuresIndirectKHR)( + commandBuffer, + infoCount, + pInfos, + pIndirectDeviceAddresses, + pIndirectStrides, + ppMaxPrimitiveCounts); + } +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdPipelineBarrier( + VkCommandBuffer commandBuffer, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags dstStageMask, + VkDependencyFlags dependencyFlags, + uint32_t memoryBarrierCount, + const VkMemoryBarrier* pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier* pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier* pImageMemoryBarriers) +{ + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState != nullptr) + { + pState->TryFlush(srcStageMask); + } + + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdPipelineBarrier)( + commandBuffer, + srcStageMask, + dstStageMask, + dependencyFlags, + memoryBarrierCount, + pMemoryBarriers, + bufferMemoryBarrierCount, + pBufferMemoryBarriers, + imageMemoryBarrierCount, + pImageMemoryBarriers); +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdPipelineBarrier2( + VkCommandBuffer commandBuffer, + const VkDependencyInfoKHR* pDependencyInfo) +{ + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState != nullptr) + { + pState->TryFlush(1, pDependencyInfo); + } + + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdPipelineBarrier2)(commandBuffer, pDependencyInfo); +} + +// 
===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdWaitEvents( + VkCommandBuffer commandBuffer, + uint32_t eventCount, + const VkEvent* pEvents, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags dstStageMask, + uint32_t memoryBarrierCount, + const VkMemoryBarrier* pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier* pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier* pImageMemoryBarriers) +{ + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState != nullptr) + { + pState->TryFlush(srcStageMask); + } + + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdWaitEvents)( + commandBuffer, + eventCount, + pEvents, + srcStageMask, + dstStageMask, + memoryBarrierCount, + pMemoryBarriers, + bufferMemoryBarrierCount, + pBufferMemoryBarriers, + imageMemoryBarrierCount, + pImageMemoryBarriers); +} + +// ===================================================================================================================== +VKAPI_ATTR void VKAPI_CALL vkCmdWaitEvents2( + VkCommandBuffer commandBuffer, + uint32_t eventCount, + const VkEvent* pEvents, + const VkDependencyInfoKHR* pDependencyInfos) +{ + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState != nullptr) + { + pState->TryFlush(eventCount, pDependencyInfos); + } + + BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkCmdWaitEvents2)(commandBuffer, eventCount, pEvents, pDependencyInfos); +} + +// ===================================================================================================================== +VKAPI_ATTR VkResult VKAPI_CALL vkEndCommandBuffer( + VkCommandBuffer commandBuffer) +{ + CmdBuffer* pCmdBuffer = ApiCmdBuffer::ObjectFromHandle(commandBuffer); + BvhBatchLayer* pLayer = pCmdBuffer->VkDevice()->RayTrace()->GetBvhBatchLayer(); + BvhBatchState* pState = pCmdBuffer->GetBvhBatchState(); + + if (pState != nullptr) + { + pState->Log("Flushing via vkEndCommandBuffer\n"); + pState->Flush(); + } + + return BVH_BATCH_LAYER_CALL_NEXT_LAYER(vkEndCommandBuffer)(commandBuffer); +} + +} // namespace bvhBatchLayer + +} // namespace entry + +// ===================================================================================================================== +void BvhBatchLayer::OverrideDispatchTable( + DispatchTable* pDispatchTable) +{ + // Save current device dispatch table to use as the next layer. 
+ m_nextLayer = *pDispatchTable; + + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdBuildAccelerationStructuresKHR); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdBuildAccelerationStructuresIndirectKHR); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdPipelineBarrier); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdPipelineBarrier2); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdWaitEvents); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkCmdWaitEvents2); + BVH_BATCH_LAYER_OVERRIDE_ENTRY(vkEndCommandBuffer); +} + +} // namespace vk + +#endif diff --git a/icd/api/appopt/bvh_batch_layer.h b/icd/api/appopt/bvh_batch_layer.h new file mode 100644 index 00000000..aa648302 --- /dev/null +++ b/icd/api/appopt/bvh_batch_layer.h @@ -0,0 +1,160 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + **********************************************************************************************************************/ +/** +*********************************************************************************************************************** +* @file bvh_batch_layer.h +* @brief Declaration of bvh batch layer +*********************************************************************************************************************** +*/ + +#if VKI_RAY_TRACING +#ifndef __BVH_BATCH_LAYER_H +#define __BVH_BATCH_LAYER_H + +#pragma once +#include "opt_layer.h" +#include "vk_alloccb.h" +#include "vk_cmdbuffer.h" +#include "palVector.h" +#include "palMutex.h" +#include "palFile.h" + +namespace vk +{ + +enum class BvhBatchType : uint32 +{ + Undefined, + Direct, + Indirect +}; + +class BvhBatchLayer; + +class BvhBatchState +{ +public: + BvhBatchState(BvhBatchLayer* pLayer); + ~BvhBatchState(); + + void Log(const char* pFormat, ...); + + template + bool EnqueueBvhBuild( + CmdBuffer* pCmdBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts); + + void Reset(); + void Flush(); + void TryFlush(VkFlags64 srcStageMask); + void TryFlush(uint32_t depInfoCount, const VkDependencyInfo* pDependencyInfos); + void DestroyState(); + +private: + template + size_t GetHardCopyMemSize( + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos); + + template + void HardCopyBuildInfos( + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts, + void* pMem, + size_t memSize); + + typedef Util::Vector GeometryInfoList; + typedef Util::Vector VoidPtrList; + typedef Util::Vector VirtAddrList; + typedef Util::Vector StrideList; + + BvhBatchType m_type; + CmdBuffer* m_pCmdBuffer; + BvhBatchLayer* m_pLayer; + GeometryInfoList m_geomInfos; + VoidPtrList m_rangeInfosOrMaxPrimCounts; + VirtAddrList m_indirectVirtAddrs; + StrideList m_indirectStrides; + uint32_t m_infoCount; + VoidPtrList m_allocations; +}; + +class BvhBatchLayer final : public OptLayer +{ +public: + ~BvhBatchLayer(); + + static VkResult CreateLayer(Device* pDevice, BvhBatchLayer** ppLayer); + void DestroyLayer(); + + virtual void OverrideDispatchTable(DispatchTable* pDispatchTable) override; + + void VLog(const char* pFormat, va_list argList); + + BvhBatchState* CreateState(CmdBuffer* pCmdBuffer); + bool PushEmptyState(BvhBatchState* pState); + BvhBatchState* PopEmptyState(); + + Instance* VkInstance() { return m_pInstance; } + bool LoggingEnabled() { return m_logFile.IsOpen(); } + +private: + PAL_DISALLOW_COPY_AND_ASSIGN(BvhBatchLayer); + + BvhBatchLayer(Device* pDevice); + + VkResult Init(Device* pDevice); + + Instance* m_pInstance; + Util::Mutex m_mutex; + + uint32_t m_emptyStateCount; + BvhBatchState* m_pEmptyStateStack[16]; + + Util::File m_logFile; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#define BVH_BATCH_LAYER_OVERRIDE_ALIAS(entry_name, func_name) \ + pDispatchTable->OverrideEntryPoints()->entry_name = vk::entry::bvhBatchLayer::func_name; + +#define 
BVH_BATCH_LAYER_OVERRIDE_ENTRY(entry_name) BVH_BATCH_LAYER_OVERRIDE_ALIAS(entry_name, entry_name) + +#define BVH_BATCH_LAYER_CALL_NEXT_LAYER(entry_name) \ + pLayer->GetNextLayer()->GetEntryPoints().entry_name + +} // namespace vk + +#endif /* __BVH_BATCH_LAYER_H */ +#endif /* VKI_RAY_TRACING */ diff --git a/icd/api/appopt/shader_profiles/llpc/gfxIp10_3/generic/StrangeBrigade/profile.json b/icd/api/appopt/shader_profiles/llpc/gfxIp10_3/generic/StrangeBrigade/profile.json new file mode 100644 index 00000000..c43cb320 --- /dev/null +++ b/icd/api/appopt/shader_profiles/llpc/gfxIp10_3/generic/StrangeBrigade/profile.json @@ -0,0 +1,3 @@ +{ + "entries": [] +} \ No newline at end of file diff --git a/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi31/Enshrouded/profile.json b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi31/Enshrouded/profile.json new file mode 100644 index 00000000..d584db1f --- /dev/null +++ b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi31/Enshrouded/profile.json @@ -0,0 +1,56 @@ +{ + "entries": [ + { + "pattern": { + "shaderOnly": true, + "cs": { + "codeHash": "0xcf8eb50df001f7cc ce615bacc3823464" + } + }, + "action": { + "cs": { + "threadGroupSwizzleMode": "_16x16" + } + } + }, + { + "pattern": { + "shaderOnly": true, + "cs": { + "codeHash": "0x4eb6c36d1b5fab73 110e3e5875ad5038" + } + }, + "action": { + "cs": { + "disableCodeSinking": true + } + } + }, + { + "pattern": { + "ps": { + "codeHash": "0x3c706d601cf4803e 107f065dcad03a0b" + } + }, + "action": { + "ps": { + "waveSize": 32 + } + } + }, + { + "pattern": { + "shaderOnly": true, + "cs": { + "codeHash": "0xaa8891d44ef6d284 ebf339f1b47fe1d1" + } + }, + "action": { + "cs": { + "wgpMode": 2, + "threadGroupSwizzleMode": "_16x16" + } + } + } + ] +} \ No newline at end of file diff --git a/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi32/Enshrouded/profile.json b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi32/Enshrouded/profile.json new file mode 100644 index 00000000..58c05d67 --- /dev/null +++ b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/Navi32/Enshrouded/profile.json @@ -0,0 +1,42 @@ +{ + "entries": [ + { + "pattern": { + "shaderOnly": true, + "cs": { + "codeHash": "0x4eb6c36d1b5fab73 110e3e5875ad5038" + } + }, + "action": { + "cs": { + "disableCodeSinking": true + } + } + }, + { + "pattern": { + "ps": { + "codeHash": "0x3c706d601cf4803e 107f065dcad03a0b" + } + }, + "action": { + "ps": { + "waveSize": 32 + } + } + }, + { + "pattern": { + "shaderOnly": true, + "cs": { + "codeHash": "0xcf8eb50df001f7cc ce615bacc3823464" + } + }, + "action": { + "cs": { + "threadGroupSwizzleMode": "_16x16" + } + } + } + ] +} \ No newline at end of file diff --git a/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/generic/StrangeBrigade/profile.json b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/generic/StrangeBrigade/profile.json new file mode 100644 index 00000000..c43cb320 --- /dev/null +++ b/icd/api/appopt/shader_profiles/llpc/gfxIp11_0/generic/StrangeBrigade/profile.json @@ -0,0 +1,3 @@ +{ + "entries": [] +} \ No newline at end of file diff --git a/icd/api/compiler_solution_llpc.cpp b/icd/api/compiler_solution_llpc.cpp index 15be0b29..8f886284 100644 --- a/icd/api/compiler_solution_llpc.cpp +++ b/icd/api/compiler_solution_llpc.cpp @@ -37,6 +37,8 @@ #include +using namespace std::chrono_literals; + namespace vk { @@ -846,7 +848,7 @@ void LlpcHelperThreadProvider::WaitForTasks() { while (m_pDeferredWorkload->completedInstances < m_pDeferredWorkload->totalInstances) { - m_pDeferredWorkload->event.Wait(Util::fseconds { 
1.0f }); + m_pDeferredWorkload->event.Wait(1s); } } @@ -1020,6 +1022,11 @@ VkResult CompilerSolutionLlpc::CreateLlpcCompiler( llpcOptions[numOptions++] = "-enable-pipeline-dump"; } + if (settings.enableImageMsaaLoadOpt) + { + llpcOptions[numOptions++] = "-mattr=-msaa-load-dst-sel-bug"; + } + optionLength = Util::Snprintf(pOptionBuffer, bufSize, "-pipeline-dump-dir=%s", settings.pipelineDumpDir); ++optionLength; llpcOptions[numOptions++] = pOptionBuffer; diff --git a/icd/api/debug_printf.cpp b/icd/api/debug_printf.cpp index 6fd013ac..866ce8fa 100644 --- a/icd/api/debug_printf.cpp +++ b/icd/api/debug_printf.cpp @@ -37,6 +37,7 @@ #include using namespace vk; +using namespace std::chrono_literals; //===================================================================================================================== DebugPrintf::DebugPrintf( @@ -120,13 +121,18 @@ void DebugPrintf::BindPipeline( Pal::BufferViewInfo srdInfo = {}; srdInfo.gpuAddr = m_printfMemory.GpuVirtAddr(deviceIdx); - srdInfo.range = m_printfMemory.Size(); + srdInfo.range = m_printfMemory.Size(); + pDevice->PalDevice(deviceIdx)->CreateUntypedBufferViewSrds(1, &srdInfo, pTable); + m_frame = 1; + const Pal::uint32* pEntry = reinterpret_cast(&tableVa); + pCmdBuffer->CmdSetUserData(static_cast(bindPoint), userDataOffset, 1, pEntry); m_parsedFormatStrings.Reset(); + for (auto it = pPipeline->GetFormatStrings()->Begin(); it.Get() != nullptr; it.Next()) { bool found = true; @@ -214,7 +220,7 @@ Pal::Result DebugPrintf::PostQueueProcess( while (true) { palResult = pDevice->PalDevice(DefaultDeviceIndex)->WaitForSemaphores( - 1, palSemaphores, waitValues, 0, std::chrono::nanoseconds {1000000llu}); + 1, palSemaphores, waitValues, 0, 1ms); decodeOffset = ProcessDebugPrintfBuffer(pDevice, deviceIdx, decodeOffset, &file); if ((PalToVkResult(palResult) <= 0) || (loopIndex++ > 1000)) diff --git a/icd/api/include/app_profile.h b/icd/api/include/app_profile.h index 0c2f8811..27496109 100644 --- a/icd/api/include/app_profile.h +++ b/icd/api/include/app_profile.h @@ -94,7 +94,6 @@ enum class AppProfile : uint32_t EvilGenius2, // Evil Genius 2 KnockoutCity, // Knockout City SkyGold, // Sky Gold by NetEase - IdTechEngine, // id Tech Engine (Default) Feral3DEngine, // Feral3D Engine (Default) StrangeEngine, // Strange Engine (Default) SedpEngine, // Serious Engine (Default) @@ -112,6 +111,9 @@ enum class AppProfile : uint32_t SniperElite5, // Sniper Elite 5 by Rebellion SeriousSamVrTheLastHope, // Serious Sam VR The Last Hope by Croteam BaldursGate3, // Baldur's Gate by Larian Studios + Enshrouded, // Enshrouded by Keen Games + HolisticEngine, // Holistic Engine by Keen Games + IdTechEngine, // id Tech Engine (Default) #if VKI_RAY_TRACING ControlDX12, // VKD3D Control Ultimate Edition RayTracingWeekends, // RayTracingInVulkan demo @@ -122,6 +124,9 @@ enum class AppProfile : uint32_t DxvkGodOfWar, // DXVK God of War ELEX2, // ELEX II X4Foundations, // X4: Foundations by Egosoft + DxvkHaloInfiniteLauncher,// DXVK Halo Infinite Launcher (Don't Confuse it with VKD3D + // Halo Infinite Game) + DxvkTf2, // DXVK Team Fortress 2 MetalGearSolid5, // Metal Gear Solid5 : The Phantom Pain MetalGearSolid5Online, // Metal Gear Solid5 : The Phantom Pain Online YamagiQuakeII, // Yamagi Quake II @@ -142,6 +147,7 @@ enum class AppProfile : uint32_t Enscape, // Enscape by Chaos Vkd3dEngine, // vkd3d-proton for steam games DXVK, // DXVK + WindowKill, // Windowkill by torcado }; struct ProfileSettings diff --git a/icd/api/include/app_shader_optimizer.h 
b/icd/api/include/app_shader_optimizer.h index f3a1fc7b..4ac850b5 100644 --- a/icd/api/include/app_shader_optimizer.h +++ b/icd/api/include/app_shader_optimizer.h @@ -201,6 +201,7 @@ class ShaderOptimizer void BuildTuningProfile(); void BuildAppProfile(); + void BuildAppProfileGeneric(); void BuildAppProfileLlpc(); diff --git a/icd/api/include/compiler_solution.h b/icd/api/include/compiler_solution.h index 2ad93eb0..377401cc 100644 --- a/icd/api/include/compiler_solution.h +++ b/icd/api/include/compiler_solution.h @@ -365,6 +365,7 @@ class CompilerSolution virtual Vkgc::BinaryData ExtractPalElfBinary(const Vkgc::BinaryData& shaderBinary) = 0; static void DisableNggCulling(Vkgc::NggState* pNggState); + static const char* GetShaderStageName(ShaderStage shaderStage); #if VKI_RAY_TRACING static void UpdateRayTracingFunctionNames( @@ -400,7 +401,6 @@ class CompilerSolution PipelineBinaryCache* m_pBinaryCache; // Internal pipeline binary cache // NOTE: It is owned by PipelineCompiler. PipelineCompileCacheMatrix m_gplCacheMatrix; // Graphics pipeline compile statistic info - static const char* GetShaderStageName(ShaderStage shaderStage); static const char* GetGraphicsLibraryName(GraphicsLibraryType libraryType); private: diff --git a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h index 6d09e280..38468f26 100644 --- a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h +++ b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_core.h @@ -69,7 +69,7 @@ extern "C" { #define VK_API_VERSION_1_0 VK_MAKE_API_VERSION(0, 1, 0, 0)// Patch version should always be set to 0 // Version of this file -#define VK_HEADER_VERSION 280 +#define VK_HEADER_VERSION 285 // Complete version of this file #define VK_HEADER_VERSION_COMPLETE VK_MAKE_API_VERSION(0, 1, 3, VK_HEADER_VERSION) @@ -1046,6 +1046,8 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTENDED_SPARSE_ADDRESS_SPACE_PROPERTIES_NV = 1000492001, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MUTABLE_DESCRIPTOR_TYPE_FEATURES_EXT = 1000351000, VK_STRUCTURE_TYPE_MUTABLE_DESCRIPTOR_TYPE_CREATE_INFO_EXT = 1000351002, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LEGACY_VERTEX_ATTRIBUTES_FEATURES_EXT = 1000495000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_LEGACY_VERTEX_ATTRIBUTES_PROPERTIES_EXT = 1000495001, VK_STRUCTURE_TYPE_LAYER_SETTINGS_CREATE_INFO_EXT = 1000496000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_BUILTINS_FEATURES_ARM = 1000497000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_CORE_BUILTINS_PROPERTIES_ARM = 1000497001, @@ -1112,6 +1114,9 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAW_ACCESS_CHAINS_FEATURES_NV = 1000555000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT16_VECTOR_FEATURES_NV = 1000563000, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_RAY_TRACING_VALIDATION_FEATURES_NV = 1000568000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_ALIGNMENT_CONTROL_FEATURES_MESA = 1000575000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_ALIGNMENT_CONTROL_PROPERTIES_MESA = 1000575001, + VK_STRUCTURE_TYPE_IMAGE_ALIGNMENT_CONTROL_CREATE_INFO_MESA = 1000575002, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTERS_FEATURES, VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETER_FEATURES = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_DRAW_PARAMETERS_FEATURES, VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, @@ -1676,7 +1681,7 @@ typedef enum VkFormat { VK_FORMAT_PVRTC1_4BPP_SRGB_BLOCK_IMG = 
1000054005, VK_FORMAT_PVRTC2_2BPP_SRGB_BLOCK_IMG = 1000054006, VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG = 1000054007, - VK_FORMAT_R16G16_S10_5_NV = 1000464000, + VK_FORMAT_R16G16_SFIXED5_NV = 1000464000, VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR = 1000470000, VK_FORMAT_A8_UNORM_KHR = 1000470001, VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT = VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK, @@ -1733,6 +1738,7 @@ typedef enum VkFormat { VK_FORMAT_G16_B16R16_2PLANE_444_UNORM_EXT = VK_FORMAT_G16_B16R16_2PLANE_444_UNORM, VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT = VK_FORMAT_A4R4G4B4_UNORM_PACK16, VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT = VK_FORMAT_A4B4G4R4_UNORM_PACK16, + VK_FORMAT_R16G16_S10_5_NV = VK_FORMAT_R16G16_SFIXED5_NV, VK_FORMAT_MAX_ENUM = 0x7FFFFFFF } VkFormat; @@ -11109,6 +11115,7 @@ typedef VkFlags64 VkPipelineCreateFlagBits2KHR; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_DISABLE_OPTIMIZATION_BIT_KHR = 0x00000001ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_ALLOW_DERIVATIVES_BIT_KHR = 0x00000002ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_DERIVATIVE_BIT_KHR = 0x00000004ULL; +static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_ENABLE_LEGACY_DITHERING_BIT_EXT = 0x400000000ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHR = 0x00000008ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_DISPATCH_BASE_BIT_KHR = 0x00000010ULL; static const VkPipelineCreateFlagBits2KHR VK_PIPELINE_CREATE_2_DEFER_COMPILE_BIT_NV = 0x00000020ULL; @@ -18502,7 +18509,7 @@ VKAPI_ATTR void VKAPI_CALL vkCmdOpticalFlowExecuteNV( // VK_EXT_legacy_dithering is a preprocessor guard. Do not pass it to API calls. #define VK_EXT_legacy_dithering 1 -#define VK_EXT_LEGACY_DITHERING_SPEC_VERSION 1 +#define VK_EXT_LEGACY_DITHERING_SPEC_VERSION 2 #define VK_EXT_LEGACY_DITHERING_EXTENSION_NAME "VK_EXT_legacy_dithering" typedef struct VkPhysicalDeviceLegacyDitheringFeaturesEXT { VkStructureType sType; @@ -18727,6 +18734,24 @@ typedef struct VkPhysicalDeviceExtendedSparseAddressSpacePropertiesNV { #define VK_EXT_MUTABLE_DESCRIPTOR_TYPE_EXTENSION_NAME "VK_EXT_mutable_descriptor_type" +// VK_EXT_legacy_vertex_attributes is a preprocessor guard. Do not pass it to API calls. +#define VK_EXT_legacy_vertex_attributes 1 +#define VK_EXT_LEGACY_VERTEX_ATTRIBUTES_SPEC_VERSION 1 +#define VK_EXT_LEGACY_VERTEX_ATTRIBUTES_EXTENSION_NAME "VK_EXT_legacy_vertex_attributes" +typedef struct VkPhysicalDeviceLegacyVertexAttributesFeaturesEXT { + VkStructureType sType; + void* pNext; + VkBool32 legacyVertexAttributes; +} VkPhysicalDeviceLegacyVertexAttributesFeaturesEXT; + +typedef struct VkPhysicalDeviceLegacyVertexAttributesPropertiesEXT { + VkStructureType sType; + void* pNext; + VkBool32 nativeUnalignedPerformance; +} VkPhysicalDeviceLegacyVertexAttributesPropertiesEXT; + + + // VK_EXT_layer_settings is a preprocessor guard. Do not pass it to API calls. #define VK_EXT_layer_settings 1 #define VK_EXT_LAYER_SETTINGS_SPEC_VERSION 2 @@ -19147,6 +19172,30 @@ typedef struct VkPhysicalDeviceRayTracingValidationFeaturesNV { +// VK_MESA_image_alignment_control is a preprocessor guard. Do not pass it to API calls. 
+#define VK_MESA_image_alignment_control 1 +#define VK_MESA_IMAGE_ALIGNMENT_CONTROL_SPEC_VERSION 1 +#define VK_MESA_IMAGE_ALIGNMENT_CONTROL_EXTENSION_NAME "VK_MESA_image_alignment_control" +typedef struct VkPhysicalDeviceImageAlignmentControlFeaturesMESA { + VkStructureType sType; + void* pNext; + VkBool32 imageAlignmentControl; +} VkPhysicalDeviceImageAlignmentControlFeaturesMESA; + +typedef struct VkPhysicalDeviceImageAlignmentControlPropertiesMESA { + VkStructureType sType; + void* pNext; + uint32_t supportedImageAlignmentMask; +} VkPhysicalDeviceImageAlignmentControlPropertiesMESA; + +typedef struct VkImageAlignmentControlCreateInfoMESA { + VkStructureType sType; + const void* pNext; + uint32_t maximumRequestedAlignment; +} VkImageAlignmentControlCreateInfoMESA; + + + // VK_KHR_acceleration_structure is a preprocessor guard. Do not pass it to API calls. #define VK_KHR_acceleration_structure 1 #define VK_KHR_ACCELERATION_STRUCTURE_SPEC_VERSION 13 diff --git a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_metal.h b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_metal.h index e6f7bf7a..89a55749 100644 --- a/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_metal.h +++ b/icd/api/include/khronos/sdk-1.3/vulkan/vulkan_metal.h @@ -52,28 +52,28 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateMetalSurfaceEXT( #define VK_EXT_metal_objects 1 #ifdef __OBJC__ @protocol MTLDevice; -typedef id MTLDevice_id; +typedef __unsafe_unretained id MTLDevice_id; #else typedef void* MTLDevice_id; #endif #ifdef __OBJC__ @protocol MTLCommandQueue; -typedef id MTLCommandQueue_id; +typedef __unsafe_unretained id MTLCommandQueue_id; #else typedef void* MTLCommandQueue_id; #endif #ifdef __OBJC__ @protocol MTLBuffer; -typedef id MTLBuffer_id; +typedef __unsafe_unretained id MTLBuffer_id; #else typedef void* MTLBuffer_id; #endif #ifdef __OBJC__ @protocol MTLTexture; -typedef id MTLTexture_id; +typedef __unsafe_unretained id MTLTexture_id; #else typedef void* MTLTexture_id; #endif @@ -81,12 +81,12 @@ typedef void* MTLTexture_id; typedef struct __IOSurface* IOSurfaceRef; #ifdef __OBJC__ @protocol MTLSharedEvent; -typedef id MTLSharedEvent_id; +typedef __unsafe_unretained id MTLSharedEvent_id; #else typedef void* MTLSharedEvent_id; #endif -#define VK_EXT_METAL_OBJECTS_SPEC_VERSION 1 +#define VK_EXT_METAL_OBJECTS_SPEC_VERSION 2 #define VK_EXT_METAL_OBJECTS_EXTENSION_NAME "VK_EXT_metal_objects" typedef enum VkExportMetalObjectTypeFlagBitsEXT { diff --git a/icd/api/include/khronos/vulkan.h b/icd/api/include/khronos/vulkan.h index 030be791..11faf725 100644 --- a/icd/api/include/khronos/vulkan.h +++ b/icd/api/include/khronos/vulkan.h @@ -65,6 +65,9 @@ #include "devext/vk_amd_shader_texel_buffer_explicit_format.h" #endif +#if VKI_RAY_TRACING +#endif + #define VK_FORMAT_A1B5G5R5_UNORM_PACK16 VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR #define VK_FORMAT_BEGIN_RANGE VK_FORMAT_UNDEFINED diff --git a/icd/api/include/vk_cmdbuffer.h b/icd/api/include/vk_cmdbuffer.h index 192d051a..b74a3f69 100644 --- a/icd/api/include/vk_cmdbuffer.h +++ b/icd/api/include/vk_cmdbuffer.h @@ -51,9 +51,6 @@ #include "renderpass/renderpass_builder.h" -#if VKI_RAY_TRACING -#endif - #include "debug_printf.h" #include "palCmdBuffer.h" #include "palDequeImpl.h" @@ -95,6 +92,7 @@ class QueryPool; #if VKI_RAY_TRACING class RayTracingPipeline; class AccelerationStructureQueryPool; +class BvhBatchState; #endif constexpr uint8_t DefaultStencilOpValue = 1; @@ -1458,6 +1456,10 @@ class CmdBuffer const Pal::IGpuMemory& cpsMem) const; bool HasRayTracing() const { return 
m_flags.hasRayTracing; } + + BvhBatchState* GetBvhBatchState() const { return m_pBvhBatchState; } + + void SetBvhBatchState(BvhBatchState* pBvhBatchState) { m_pBvhBatchState = pBvhBatchState; } #endif template @@ -1598,7 +1600,7 @@ class CmdBuffer void RPLoadOpClearColor(uint32_t count, const RPLoadOpClearInfo* pClears); void RPLoadOpClearDepthStencil(uint32_t count, const RPLoadOpClearInfo* pClears); void RPBindTargets(const RPBindTargetsInfo& targets); - void RPSyncPostLoadOpColorClear(); + void RPSyncPostLoadOpColorClear(uint32_t count, const RPLoadOpClearInfo* pClears); void BindTargets(); @@ -1930,7 +1932,8 @@ class CmdBuffer #else uint32_t reserved4 : 1; #endif - uint32_t reserved : 14; + uint32_t offsetMode : 1; + uint32_t reserved : 13; }; }; @@ -1979,6 +1982,7 @@ class CmdBuffer bool m_reverseThreadGroupState; #if VKI_RAY_TRACING Util::Vector m_scratchVidMemList; // Ray-tracing scratch memory + BvhBatchState* m_pBvhBatchState; uint64 m_maxCpsMemSize; // max ray sorting memory requested diff --git a/icd/api/include/vk_conv.h b/icd/api/include/vk_conv.h index d49c1d36..5a66cba6 100755 --- a/icd/api/include/vk_conv.h +++ b/icd/api/include/vk_conv.h @@ -4077,20 +4077,20 @@ const char* VkResultName(VkResult result); inline std::chrono::nanoseconds Uint64ToChronoNano(uint64_t nanoSeconds) { - const uint64_t maxNano = static_cast(std::chrono::nanoseconds::max().count()); - return std::chrono::nanoseconds { Util::Min(nanoSeconds, maxNano) }; + constexpr uint64_t MaxNanos = uint64_t(std::chrono::nanoseconds::max().count()); + return std::chrono::nanoseconds{ Util::Min(nanoSeconds, MaxNanos) }; } inline std::chrono::milliseconds Uint64ToChronoMilli(uint64_t milliSeconds) { - const uint64_t maxMilli = static_cast(std::chrono::milliseconds::max().count()); - return std::chrono::milliseconds { Util::Min(milliSeconds, maxMilli) }; + constexpr uint64_t MaxMillis = uint64_t(std::chrono::milliseconds::max().count()); + return std::chrono::milliseconds{ Util::Min(milliSeconds, MaxMillis) }; } inline std::chrono::seconds Uint64ToChronoSeconds(uint64_t seconds) { - const uint64_t maxSeconds = static_cast(std::chrono::seconds::max().count()); - return std::chrono::seconds { Util::Min(seconds, maxSeconds) }; + constexpr uint64_t MaxSeconds = uint64_t(std::chrono::seconds::max().count()); + return std::chrono::seconds{ Util::Min(seconds, MaxSeconds) }; } } // namespace vk diff --git a/icd/api/include/vk_device.h b/icd/api/include/vk_device.h index 7024ca43..1ebee441 100644 --- a/icd/api/include/vk_device.h +++ b/icd/api/include/vk_device.h @@ -167,8 +167,9 @@ class Device uint32 primitivesGeneratedQuery : 1; uint32 reserved1 : 1; uint32 reserved2 : 1; + uint32 robustVertexBufferExtend : 1; - uint32 reserved : 12; + uint32 reserved : 11; }; uint32 u32All; @@ -959,7 +960,7 @@ class Device // This is from device create info, VkDevicePrivateDataCreateInfoEXT uint32 m_privateDataSlotRequestCount; - volatile uint64 m_nextPrivateDataSlot; + uint64 m_nextPrivateDataSlot; size_t m_privateDataSize; Util::RWLock m_privateDataRWLock; diff --git a/icd/api/include/vk_extensions.h b/icd/api/include/vk_extensions.h index c116faed..443ef9ed 100644 --- a/icd/api/include/vk_extensions.h +++ b/icd/api/include/vk_extensions.h @@ -424,6 +424,8 @@ class DeviceExtensions final : public Extensions EXT_PRIVATE_DATA, EXT_PROVOKING_VERTEX, EXT_QUEUE_FAMILY_FOREIGN, +#if VKI_RAY_TRACING +#endif EXT_ROBUSTNESS2, EXT_SAMPLER_FILTER_MINMAX, EXT_SAMPLE_LOCATIONS, diff --git a/icd/api/include/vk_image.h b/icd/api/include/vk_image.h 
index 1b6534c1..85100d90 100644 --- a/icd/api/include/vk_image.h +++ b/icd/api/include/vk_image.h @@ -299,7 +299,8 @@ class Image final : public NonDispatchable uint32_t sampleLocsCompatDepth : 1; // VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT uint32_t isProtected : 1; // VK_IMAGE_CREATE_PROTECTED_BIT uint32_t treatAsSrgb : 1; // True if this image is to be interpreted as SRGB where possible - uint32_t reserved : 15; + uint32_t reserved1 : 1; + uint32_t reserved : 14; }; uint32_t u32All; }; @@ -334,7 +335,8 @@ class Image final : public NonDispatchable uint32_t externallyShareable : 1; // True if the backing memory of this image may be shared externally. uint32_t externalD3DHandle : 1; // True if image is backed by a D3D11 image uint32_t externalPinnedHost : 1; // True if image backing memory is compatible with pinned sysmem. - uint32_t reserved : 28; + uint32_t reserved1 : 1; + uint32_t reserved : 27; }; uint32_t u32All; }; diff --git a/icd/api/include/vk_indirect_commands_layout.h b/icd/api/include/vk_indirect_commands_layout.h index 211c0d10..9c58ef80 100644 --- a/icd/api/include/vk_indirect_commands_layout.h +++ b/icd/api/include/vk_indirect_commands_layout.h @@ -84,18 +84,13 @@ class IndirectCommandsLayout final : public NonDispatchable struct { - Pal::QueueType palQueueType; - Pal::EngineType palEngineType; - VkShaderStageFlags validShaderStages; - uint32_t palImageLayoutFlag; - VkQueueFamilyProperties properties; + Pal::QueueType palQueueType; + Pal::EngineType palEngineType; + VkShaderStageFlags validShaderStages; + uint32_t palImageLayoutFlag; + VkQueueFamilyProperties properties; } m_queueFamilies[Queue::MaxQueueFamilies]; // List of indices for compute engines that aren't exclusive. diff --git a/icd/api/include/vk_queue.h b/icd/api/include/vk_queue.h index ac3f5215..87add35f 100644 --- a/icd/api/include/vk_queue.h +++ b/icd/api/include/vk_queue.h @@ -211,7 +211,8 @@ class Queue enum { - MaxQueueFamilies = Pal::QueueTypeCount, // Maximum number of queue families + MaxQueueFamilies = Pal::QueueTypeCount // Maximum number of queue families + , MaxQueuesPerFamily = 8, // Maximum number of queues per family MaxMultiQueues = 4, diff --git a/icd/api/include/vk_swapchain.h b/icd/api/include/vk_swapchain.h index f04ae65c..946d89dd 100644 --- a/icd/api/include/vk_swapchain.h +++ b/icd/api/include/vk_swapchain.h @@ -157,10 +157,11 @@ class SwapChain final : public NonDispatchable bool IsFullscreenOrEfsePresent() const; Pal::IGpuMemory* UpdatePresentInfo( - uint32_t deviceIdx, - uint32_t imageIndex, - Pal::PresentSwapChainInfo* pPresentInfo, - const Pal::FlipStatusFlags& flipFlags); + uint32_t deviceIdx, + uint32_t imageIndex, + Pal::PresentSwapChainInfo* pPresentInfo, + const Pal::FlipStatusFlags& flipFlags, + const Pal::PerSourceFrameMetadataControl& metadataFlags); bool BuildPostProcessingCommands( Pal::ICmdBuffer* pCmdBuf, @@ -187,6 +188,7 @@ class SwapChain final : public NonDispatchable const VkHdrMetadataEXT* pMetadata); void MarkAsDeprecated( + bool releaseResources, const VkAllocationCallbacks* pAllocator); uint32_t GetVidPnSourceId() const diff --git a/icd/api/pipeline_binary_cache.cpp b/icd/api/pipeline_binary_cache.cpp index b157a3b2..bd438d7e 100644 --- a/icd/api/pipeline_binary_cache.cpp +++ b/icd/api/pipeline_binary_cache.cpp @@ -696,9 +696,7 @@ Util::Result PipelineBinaryCache::InjectBinariesFromDirectory( if ((fileCount > 0u) && (result == Util::Result::Success)) { - char* pFileNameBuffer = nullptr; - Util::Span> fileNames; - Util::Span fileNameBuffer; + char* 
pFileNameBuffer = nullptr; // Allocate space for pFileNames and pFileNameBuffer Util::StringView* pFileNames = static_cast*>( @@ -720,11 +718,11 @@ Util::Result PipelineBinaryCache::InjectBinariesFromDirectory( } } + Util::Span> fileNames(pFileNames, fileCount); + Util::Span fileNameBuffer(pFileNameBuffer, fileNameBufferSize); + if (result == Util::Result::Success) { - fileNames = Util::Span>(pFileNames, fileCount); - fileNameBuffer = Util::Span(pFileNameBuffer, fileNameBufferSize); - // Populate fileNames and fileNameBuffer. result = Util::GetFileNamesInDir(settings.devModeElfReplacementDirectory, fileNames, fileNameBuffer); @@ -1086,11 +1084,8 @@ VkResult PipelineBinaryCache::InitArchiveLayers( { if (totalSize >= settings.pipelineCacheDefaultLocationLimitation) { - const uint64 sec = oldestTime.time_since_epoch().count() + - settings.thresholdOfCleanUpCache; - Util::RemoveFilesOfDirOlderThan( - pCachePath, Util::SecondsSinceEpoch { Uint64ToChronoSeconds(sec) }); + pCachePath, oldestTime + Uint64ToChronoSeconds(settings.thresholdOfCleanUpCache)); } } } diff --git a/icd/api/pipeline_compiler.cpp b/icd/api/pipeline_compiler.cpp index 99083a4b..07c5e1a4 100644 --- a/icd/api/pipeline_compiler.cpp +++ b/icd/api/pipeline_compiler.cpp @@ -2233,7 +2233,7 @@ void PipelineCompiler::BuildPipelineShaderInfo( // but we want to force wavesize to wave32 internally depending on settings and shader stage. // We override any wavesize forced via shader opts also here. // NOTE: If the app uses subgroup size then wavesize forced here might get overriden later based on - // subgroupsize. To avoid this beahvior, DeprecateWave64Reporting must be set as well in settings. + // subgroupsize. To avoid this behavior, DeprecateWave64Reporting must be set as well in settings. pShaderInfoOut->options.waveSize = ShouldForceWave32(static_cast(stage), pDevice->GetRuntimeSettings().deprecateWave64) ? 32 : pShaderInfoOut->options.waveSize; @@ -2330,6 +2330,8 @@ static void BuildPipelineShadersInfo( const GraphicsPipelineShaderStageInfo* pShaderInfo, GraphicsPipelineBinaryCreateInfo* pCreateInfo) { + const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); + if (pCreateInfo->pipelineInfo.options.enableRelocatableShaderElf) { CompilerSolution::DisableNggCulling(&pCreateInfo->pipelineInfo.nggState); @@ -2353,7 +2355,7 @@ static void BuildPipelineShadersInfo( (pShaderInfo->stages[stage].codeHash.lower != 0) || (pShaderInfo->stages[stage].codeHash.upper != 0))) { - GraphicsLibraryType gplType = GetGraphicsLibraryType(static_cast(stage)); + GraphicsLibraryType gplType = GetGraphicsLibraryType(static_cast(stage)); PipelineCompiler::BuildPipelineShaderInfo(pDevice, &pShaderInfo->stages[stage], @@ -2362,6 +2364,26 @@ static void BuildPipelineShadersInfo( pCreateInfo->pPipelineProfileKey, &pCreateInfo->pipelineInfo.nggState ); + + if ((stage == ShaderStage::ShaderStageFragment) && + (ppShaderInfoOut[stage]->options.allowReZ == true) && settings.disableDepthOnlyReZ) + { + bool usesDepthOnlyAttachments = true; + + for (uint32_t i = 0; i < Pal::MaxColorTargets; ++i) + { + if (pCreateInfo->pipelineInfo.cbState.target[i].channelWriteMask != 0) + { + usesDepthOnlyAttachments = false; + break; + } + } + + if (usesDepthOnlyAttachments) + { + ppShaderInfoOut[stage]->options.allowReZ = false; + } + } } } @@ -2375,8 +2397,8 @@ static void BuildPipelineShadersInfo( // details can be found in PipelineCompiler::ConvertGraphicsPipelineInfo(). 
// PS: For standard gfx pipeline, GraphicsPipelineBuildInfo::enableUberFetchShader is never set as TRUE with default // panel setting because VII and PRS are always available at the same time. - if (pDevice->GetRuntimeSettings().enableUberFetchShader || - pDevice->GetRuntimeSettings().enableEarlyCompile || + if (settings.enableUberFetchShader || + settings.enableEarlyCompile || (((pCreateInfo->libFlags & VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) == 0) && ((pCreateInfo->libFlags & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) != 0)) || (IsDynamicStateEnabled(dynamicStateFlags, DynamicStatesInternal::VertexInput) == true) @@ -2939,6 +2961,9 @@ VkResult PipelineCompiler::ConvertGraphicsPipelineInfo( &pCreateInfo->pipelineInfo.options ); + pCreateInfo->pipelineInfo.useSoftwareVertexBufferDescriptors = + pDevice->GetEnabledFeatures().robustVertexBufferExtend; + } uint64_t dynamicStateFlags = 0; @@ -3787,7 +3812,7 @@ VkResult PipelineCompiler::ConvertRayTracingPipelineInfo( static_assert(RaytracingContinuations == static_cast(Vkgc::LlpcRaytracingMode::Continuations)); pCreateInfo->pipelineInfo.mode = static_cast(settings.llpcRaytracingMode); - static_assert(CpsFlagStackInGlobalMem == Vkgc::CpsFlagStackInGlobalMem); + static_assert(CpsFlagStackInGlobalMem == static_cast(Vkgc::CpsFlagStackInGlobalMem)); pCreateInfo->pipelineInfo.cpsFlags = settings.cpsFlags; pCreateInfo->pipelineInfo.isReplay = isReplay; @@ -3828,6 +3853,9 @@ VkResult PipelineCompiler::ConvertRayTracingPipelineInfo( tempBufferSize += sizeof(BinaryData) * pIn->pLibraryInfo->libraryCount; } + const auto& gpurtOptions = pDevice->RayTrace()->GetGpurtOptions(); + tempBufferSize += gpurtOptions.size() * sizeof(Vkgc::GpurtOption); + // We can't have a pipeline with 0 shader stages VK_ASSERT(tempBufferSize > 0); @@ -4023,6 +4051,17 @@ VkResult PipelineCompiler::ConvertRayTracingPipelineInfo( pSummaries[i] = summary; } } + + if (gpurtOptions.size() > 0) + { + Vkgc::GpurtOption* pGpurtOptions = reinterpret_cast( + VoidPtrInc(pCreateInfo->pTempBuffer, tempBufferOffset)); + size_t gpurtOptionsSize = sizeof(Vkgc::GpurtOption) * gpurtOptions.size(); + tempBufferOffset += gpurtOptionsSize; + pCreateInfo->pipelineInfo.pGpurtOptions = pGpurtOptions; + pCreateInfo->pipelineInfo.gpurtOptionCount = gpurtOptions.size(); + memcpy(pGpurtOptions, gpurtOptions.Data(), gpurtOptionsSize); + } } } diff --git a/icd/api/raytrace/ray_tracing_device.cpp b/icd/api/raytrace/ray_tracing_device.cpp index 17b93902..be8958be 100644 --- a/icd/api/raytrace/ray_tracing_device.cpp +++ b/icd/api/raytrace/ray_tracing_device.cpp @@ -34,7 +34,9 @@ #include "sqtt/sqtt_layer.h" #include "sqtt/sqtt_rgp_annotations.h" #include "palAutoBuffer.h" +#include "palVectorImpl.h" #include "gpurt/gpurtLib.h" +#include "g_gpurtOptions.h" #if ICD_GPUOPEN_DEVMODE_BUILD #include "devmode/devmode_mgr.h" @@ -48,7 +50,9 @@ RayTracingDevice::RayTracingDevice( Device* pDevice) : m_pDevice(pDevice), + m_gpurtOptions(pDevice->VkInstance()->Allocator()), m_cmdContext(), + m_pBvhBatchLayer(nullptr), m_accelStructTrackerResources() { @@ -73,6 +77,7 @@ VkResult RayTracingDevice::Init() } CreateGpuRtDeviceSettings(&m_gpurtDeviceSettings); + CollectGpurtOptions(&m_gpurtOptions); for (uint32_t deviceIdx = 0; (result == VK_SUCCESS) && (deviceIdx < m_pDevice->NumPalDevices()); ++deviceIdx) { @@ -99,7 +104,6 @@ VkResult RayTracingDevice::Init() initInfo.pAccelStructTracker = GetAccelStructTracker(deviceIdx); initInfo.accelStructTrackerGpuAddr = 
GetAccelStructTrackerGpuVa(deviceIdx); - initInfo.deviceSettings.gpuDebugFlags = m_pDevice->GetRuntimeSettings().rtGpuDebugFlags; initInfo.deviceSettings.emulatedRtIpLevel = Pal::RayTracingIpLevel::None; switch (m_pDevice->GetRuntimeSettings().emulatedRtIpLevel) { @@ -136,17 +140,24 @@ VkResult RayTracingDevice::Init() callbacks.pfnFreeGpuMem = &RayTracingDevice::ClientFreeGpuMem; callbacks.pfnClientGetTemporaryGpuMemory = &RayTracingDevice::ClientGetTemporaryGpuMemory; - Pal::Result palResult = GpuRt::CreateDevice(initInfo, callbacks, pMemory, &m_pGpuRtDevice[deviceIdx]); + result = PalToVkResult(GpuRt::CreateDevice(initInfo, callbacks, pMemory, &m_pGpuRtDevice[deviceIdx])); - if (palResult != Pal::Result::Success) + if (result == VK_SUCCESS) { - m_pDevice->VkInstance()->FreeMem(pMemory); + result = BvhBatchLayer::CreateLayer(m_pDevice, &m_pBvhBatchLayer); + } + if (result != VK_SUCCESS) + { VK_NEVER_CALLED(); - result = VK_ERROR_INITIALIZATION_FAILED; - } + m_pDevice->VkInstance()->FreeMem(pMemory); + if (m_pBvhBatchLayer != nullptr) + { + m_pBvhBatchLayer->DestroyLayer(); + } + } } } @@ -249,10 +260,47 @@ void RayTracingDevice::CreateGpuRtDeviceSettings( m_profileRayFlags = TraceRayProfileFlagsToRayFlag(settings); m_profileMaxIterations = TraceRayProfileMaxIterationsToMaxIterations(settings); - pDeviceSettings->gpuDebugFlags = settings.gpuRtGpuDebugFlags; + pDeviceSettings->gpuDebugFlags = settings.rtGpuDebugFlags; pDeviceSettings->enableRemapScratchBuffer = settings.enableRemapScratchBuffer; pDeviceSettings->enableEarlyPairCompression = settings.enableEarlyPairCompression; pDeviceSettings->trianglePairingSearchRadius = settings.trianglePairingSearchRadius; + + pDeviceSettings->enableMergedEncodeBuild = settings.enableMergedEncodeBuild; + pDeviceSettings->enableMergedEncodeUpdate = settings.enableMergedEncodeUpdate; +} + +// ===================================================================================================================== +void RayTracingDevice::CollectGpurtOptions( + GpurtOptions* const pGpurtOptions + ) const +{ + const uint32_t optionCount = sizeof(GpuRt::OptionDefaults) / sizeof(GpuRt::OptionDefaults[0]); + + // Set up option defaults so that it won't break when a newly added option has non-zero default. + Util::HashMap optionMap(optionCount, pGpurtOptions->GetAllocator()); + optionMap.Init(); + for (uint32_t i = 0; i < optionCount; i++) + { + // We should not have duplicated option defaults. 
+ VK_ASSERT(optionMap.FindKey(GpuRt::OptionDefaults[i].nameHash) == nullptr); + optionMap.Insert(GpuRt::OptionDefaults[i].nameHash, GpuRt::OptionDefaults[i].value); + } + + auto& settings = m_pDevice->GetRuntimeSettings(); + + uint32_t threadTraceEnabled = 0; + if (settings.rtEmitRayTracingShaderDataToken || + m_pDevice->VkInstance()->PalPlatform()->IsRaytracingShaderDataTokenRequested()) + { + threadTraceEnabled = 1; + } + *optionMap.FindKey(GpuRt::ThreadTraceEnabledOptionNameHash) = threadTraceEnabled; + + pGpurtOptions->Clear(); + for (auto it = optionMap.Begin(); it.Get() != nullptr; it.Next()) + { + pGpurtOptions->PushBack({ it.Get()->key, it.Get()->value }); + } } // ===================================================================================================================== @@ -297,6 +345,11 @@ void RayTracingDevice::Destroy() m_pDevice->VkInstance()->FreeMem(m_accelStructTrackerResources[0].pMem); } + if (m_pBvhBatchLayer != nullptr) + { + m_pBvhBatchLayer->DestroyLayer(); + } + Util::Destructor(this); m_pDevice->VkInstance()->FreeMem(this); @@ -724,11 +777,10 @@ Pal::Result RayTracingDevice::ClientCreateInternalComputePipeline( void** ppResultMemory) ///< [out] (Optional) Result PAL pipeline memory, ///< if different from obj { - uint64_t spvPassMask = - static_cast(initInfo.pClientUserData)->GetRuntimeSettings().rtInternalPipelineSpvPassMask; - vk::Device* pDevice = static_cast(initInfo.pClientUserData); + vk::Device* pDevice = static_cast(initInfo.pClientUserData); const auto& settings = pDevice->GetRuntimeSettings(); + uint64_t spvPassMask = settings.rtInternalPipelineSpvPassMask; uint64_t shaderTypeMask = 1ull << static_cast(buildInfo.shaderType); bool useSpvPass = (shaderTypeMask & spvPassMask); diff --git a/icd/api/raytrace/ray_tracing_device.h b/icd/api/raytrace/ray_tracing_device.h index d9a86636..8829dae4 100644 --- a/icd/api/raytrace/ray_tracing_device.h +++ b/icd/api/raytrace/ray_tracing_device.h @@ -31,6 +31,9 @@ #include "khronos/vulkan.h" #include "vk_defines.h" +#include "appopt/bvh_batch_layer.h" + +#include "vkgcDefs.h" namespace vk { @@ -39,6 +42,7 @@ class Device; class Queue; class InternalMemory; class CmdBuffer; +class PalAllocator; // Device-level structure for managing state related to ray-tracing. Instantiated as part of a VkDevice. 
class RayTracingDevice @@ -61,6 +65,8 @@ class RayTracingDevice uint32_t srd[BufferViewDwords]; }; + typedef Util::Vector GpurtOptions; + RayTracingDevice(Device* pDevice); ~RayTracingDevice(); @@ -70,6 +76,7 @@ class RayTracingDevice void CreateGpuRtDeviceSettings(GpuRt::DeviceSettings* pDeviceSettings); GpuRt::IDevice* GpuRt(uint32_t deviceIdx) { return m_pGpuRtDevice[deviceIdx]; } const GpuRt::DeviceSettings& DeviceSettings() const { return m_gpurtDeviceSettings; } + const GpurtOptions& GetGpurtOptions() const { return m_gpurtOptions; } Pal::Result InitCmdContext(uint32_t deviceIdx); CmdContext* GetCmdContext(uint32_t deviceIdx) { return &m_cmdContext[deviceIdx]; } @@ -84,6 +91,8 @@ class RayTracingDevice uint64_t GetAccelerationStructureUUID(const Pal::DeviceProperties& palProps); + BvhBatchLayer* GetBvhBatchLayer() { return m_pBvhBatchLayer; } + uint32_t GetProfileRayFlags() const { return m_profileRayFlags; } uint32_t GetProfileMaxIterations() const { return m_profileMaxIterations; } @@ -122,6 +131,7 @@ class RayTracingDevice GpuRt::IDevice* m_pGpuRtDevice[MaxPalDevices]; GpuRt::DeviceSettings m_gpurtDeviceSettings; + GpurtOptions m_gpurtOptions; uint32_t m_profileRayFlags; // Ray flag override for profiling uint32_t m_profileMaxIterations; // Max traversal iterations @@ -193,6 +203,10 @@ class RayTracingDevice const VkStridedDeviceAddressRegionKHR* pHitSbt, GpuRt::RtDispatchInfo* pDispatchInfo) const; + void CollectGpurtOptions(GpurtOptions* const pGpurtOptions) const; + + BvhBatchLayer* m_pBvhBatchLayer; + AccelStructTrackerResources m_accelStructTrackerResources[MaxPalDevices]; }; diff --git a/icd/api/raytrace/vk_acceleration_structure.cpp b/icd/api/raytrace/vk_acceleration_structure.cpp index 7930c3b1..3295d5c7 100644 --- a/icd/api/raytrace/vk_acceleration_structure.cpp +++ b/icd/api/raytrace/vk_acceleration_structure.cpp @@ -174,12 +174,13 @@ VkResult AccelerationStructure::ConvertBuildInputsKHR( uint32_t deviceIndex, const VkAccelerationStructureBuildGeometryInfoKHR& info, const VkAccelerationStructureBuildRangeInfoKHR* pBuildRangeInfos, + const uint32_t* pMaxPrimitiveCounts, GeometryConvertHelper* pHelper, GpuRt::AccelStructBuildInputs* pInputs) { VkResult result = VK_SUCCESS; - pHelper->pMaxPrimitiveCounts = nullptr; + pHelper->pMaxPrimitiveCounts = pMaxPrimitiveCounts; pHelper->pBuildRangeInfos = pBuildRangeInfos; pInputs->type = ConvertAccelerationStructureType(info.type); pInputs->flags = ConvertAccelerationStructureFlags(info.mode, info.flags); @@ -202,7 +203,9 @@ VkResult AccelerationStructure::ConvertBuildInputsKHR( if (pInstanceGeom->geometryType == VK_GEOMETRY_TYPE_INSTANCES_KHR) { - pInputs->inputElemCount = (pBuildRangeInfos != nullptr) ? pBuildRangeInfos->primitiveCount : 1; + pInputs->inputElemCount = (pBuildRangeInfos != nullptr) ? + pBuildRangeInfos->primitiveCount : + pMaxPrimitiveCounts[0]; pInputs->inputElemLayout = pInstanceGeom->geometry.instances.arrayOfPointers ? 
GpuRt::InputElementLayout::ArrayOfPointers : GpuRt::InputElementLayout::Array; diff --git a/icd/api/raytrace/vk_acceleration_structure.h b/icd/api/raytrace/vk_acceleration_structure.h index 93702db0..a245a7e6 100644 --- a/icd/api/raytrace/vk_acceleration_structure.h +++ b/icd/api/raytrace/vk_acceleration_structure.h @@ -75,6 +75,7 @@ class AccelerationStructure final : public NonDispatchablepipePoints[pBarrier->pipePointCount] = Pal::HwPipeBottom; pBarrier->pipePointCount++; - pBarrier->srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + pBarrier->implicitSrcCacheMask |= Pal::CoherColorTarget | Pal::CoherDepthStencilTarget; } } // ===================================================================================================================== static void ConvertImplicitSyncs( - RPBarrierInfo* pBarrier, + RPBarrierInfo* pBarrier, const RuntimeSettings& settings) { pBarrier->implicitSrcCacheMask = 0; @@ -1015,17 +1014,15 @@ static void ConvertImplicitSyncs( pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_BLIT_BIT_KHR; pBarrier->implicitSrcCacheMask |= Pal::CoherResolveSrc; + pBarrier->implicitDstCacheMask |= Pal::CoherResolveDst; } - if (pBarrier->flags.implicitExternalOutgoing && - (pBarrier->pipePointCount < (MaxHwPipePoints - 1)) && - settings.implicitExternalSynchronization) + if (pBarrier->flags.implicitExternalOutgoing && settings.implicitExternalSynchronization) { pBarrier->srcStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; pBarrier->dstStageMask |= VK_PIPELINE_STAGE_2_BLIT_BIT_KHR; - pBarrier->srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + pBarrier->implicitSrcCacheMask |= Pal::CoherColorTarget | Pal::CoherDepthStencilTarget; } } @@ -1055,59 +1052,77 @@ void RenderPassBuilder::PostProcessSyncPoint( pSyncPoint->barrier.flags.needsGlobalTransition = 1; } - // The barrier is active if it does any waiting or global cache synchronization or attachment transitions - if ((pSyncPoint->barrier.pipePointCount > 0) || - (pSyncPoint->barrier.flags.needsGlobalTransition) || - (pSyncPoint->transitions.NumElements() > 0)) - { - pSyncPoint->flags.active = 1; + bool hasChangingLayout = false; + bool isTransitioningOutOfUndefined = false; - if (pSyncPoint->barrier.dstStageMask == 0) + if (pSyncPoint->transitions.NumElements() > 0) + { + for (auto it = pSyncPoint->transitions.Begin(); it.Get() != nullptr; it.Next()) { - if (pSyncPoint->flags.top && (pSyncPoint->transitions.NumElements() > 0)) + RPTransitionInfo* info = it.Get(); + + if (info->prevLayout.layout == VK_IMAGE_LAYOUT_UNDEFINED) { - // If a transition occurs when entering a subpass (top == 1), it must be synced before the - // attachment is accessed. If we're leaving the subpass, chances are there's another barrier down - // the line that will sync the image correctly. - pSyncPoint->barrier.dstStageMask = AllShaderStages; + isTransitioningOutOfUndefined = true; } - else + + if ((info->prevLayout.layout != info->nextLayout.layout) || + (info->prevStencilLayout.layout != info->nextStencilLayout.layout)) { - // BOTTOM_OF_PIPE in dst mask is effectively NONE. - pSyncPoint->barrier.dstStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; + hasChangingLayout = true; } - } - // If srcSubpass for this barrier is VK_SUBPASS_EXTERNAL, srcStageMask is TOP_OF_PIPE and srcAccessMask is - // 0 then this syncTop barrier might be doing a metadata Init with a layout transition out of undefined - // layout. 
Set a flag here that can be tested later to set the srcStageMask correctly. - const bool needsFixForMetaDataInit = - ((pSyncPoint->flags.top) && - (pSyncPoint->barrier.flags.explicitExternalIncoming) && - (pSyncPoint->barrier.srcStageMask == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR) && - (pSyncPoint->barrier.srcAccessMask == 0)); + if (hasChangingLayout || isTransitioningOutOfUndefined) + { + break; + } + } + } - if ((pSyncPoint->barrier.srcStageMask == 0) || needsFixForMetaDataInit) + // If srcSubpass for this barrier is VK_SUBPASS_EXTERNAL, srcStageMask is TOP_OF_PIPE and srcAccessMask is + // 0 then this syncTop barrier might be doing a metadata Init with a layout transition out of undefined + // layout. Set a flag here that can be tested later to set the srcStageMask correctly. + const bool needsFixForMetaDataInit = + ((pSyncPoint->flags.top) && + (pSyncPoint->barrier.flags.explicitExternalIncoming) && + (pSyncPoint->barrier.srcStageMask == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR) && + (pSyncPoint->barrier.srcAccessMask == 0)); + + // Set the dstStageMask to non-zero only if layout is changing. If the layout is not changing and if + // dstStageMask is 0, then it's quite likely that this is an empty barrier that can be skipped. + if ((pSyncPoint->barrier.dstStageMask == 0) && hasChangingLayout) + { + if (pSyncPoint->flags.top) { - if (pSyncPoint->transitions.NumElements() > 0) - { - // RPBarrierInfo consists of one set of src/dst stage masks which currently applies to each - // transition in RPSyncPoint(). PAL now supports specifying src/dst stage masks for each individual - // image transition. Since with this change we will loop over each transition to check for - // undefined 'prev' layout, there might be some cases where we add unnecessary stalls for at least - // some transitions. - for (auto it = pSyncPoint->transitions.Begin(); it.Get() != nullptr; it.Next()) - { - RPTransitionInfo* info = it.Get(); + // If a transition occurs when entering a subpass (top == 1), it must be synced before the + // attachment is accessed. If we're leaving the subpass, chances are there's another barrier down + // the line that will sync the image correctly. + pSyncPoint->barrier.dstStageMask = AllShaderStages; + } + else + { + // BOTTOM_OF_PIPE in dst mask is effectively NONE. + pSyncPoint->barrier.dstStageMask = VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT_KHR; + } + } - if (info->prevLayout.layout == VK_IMAGE_LAYOUT_UNDEFINED) - { - pSyncPoint->barrier.srcStageMask |= pSyncPoint->barrier.dstStageMask; - } - } - } + if ((pSyncPoint->barrier.srcStageMask == 0) || needsFixForMetaDataInit) + { + if (isTransitioningOutOfUndefined && hasChangingLayout) + { + pSyncPoint->barrier.srcStageMask |= pSyncPoint->barrier.dstStageMask; } } + + const bool stageMasksNotEmpty = (((pSyncPoint->barrier.srcStageMask == 0) && + (pSyncPoint->barrier.dstStageMask == 0)) == false); + + // The barrier is active if it does any waiting or global cache synchronization or attachment transitions + if ((pSyncPoint->barrier.flags.needsGlobalTransition) || + ((pSyncPoint->transitions.NumElements() > 0) && stageMasksNotEmpty)) + { + pSyncPoint->flags.active = 1; + } } else { @@ -1300,10 +1315,15 @@ Pal::Result RenderPassBuilder::TrackAttachmentUsage( WaitForResolves(pSync); } - // Detect if an automatic layout transition is needed and insert one to the given sync point if so. Note that - // these happen before load ops are triggered (below). 
- if ((pAttachment->prevReferenceLayout != layout) || - ((pStencilLayout != nullptr) && (pAttachment->prevReferenceStencilLayout != *pStencilLayout))) + // We want to include all transitions if acquire-release barrier interface is used. If not, then detect if an + // automatic layout transition is needed and insert one to the given sync point if so. Note that these happen + // before load ops are triggered (below). + const bool shouldIncludeTransition = + (m_pDevice->VkPhysicalDevice(DefaultDeviceIndex)->GetRuntimeSettings().useAcquireReleaseInterface) ? true : + ((pAttachment->prevReferenceLayout != layout) || + ((pStencilLayout != nullptr) && (pAttachment->prevReferenceStencilLayout != *pStencilLayout))); + + if (shouldIncludeTransition) { RPTransitionInfo transition = {}; diff --git a/icd/api/sqtt/sqtt_rgp_annotations.h b/icd/api/sqtt/sqtt_rgp_annotations.h index 7773b752..dcc1d146 100644 --- a/icd/api/sqtt/sqtt_rgp_annotations.h +++ b/icd/api/sqtt/sqtt_rgp_annotations.h @@ -40,7 +40,7 @@ constexpr uint32_t RgpSqttInstrumentationSpecVersion = 1; // RGP SQTT Instrumentation Specification version for Vulkan-specific tables -constexpr uint32_t RgpSqttInstrumentationApiVersion = 0; +constexpr uint32_t RgpSqttInstrumentationApiVersion = 4; #if defined(BIGENDIAN_CPU) || defined(__BIG_ENDIAN__) static_assert(false, "The bitfields in this header match the RGP format specification with the assumption that " diff --git a/icd/api/strings/extensions.txt b/icd/api/strings/extensions.txt index 86ba3001..458a16c4 100644 --- a/icd/api/strings/extensions.txt +++ b/icd/api/strings/extensions.txt @@ -218,3 +218,5 @@ VK_KHR_dynamic_rendering_local_read VK_KHR_vertex_attribute_divisor VK_EXT_frame_boundary VK_EXT_image_compression_control +#if VKI_RAY_TRACING +#endif diff --git a/icd/api/vk_buffer_view.cpp b/icd/api/vk_buffer_view.cpp index 858ae2fc..f1196a96 100644 --- a/icd/api/vk_buffer_view.cpp +++ b/icd/api/vk_buffer_view.cpp @@ -112,9 +112,9 @@ void BufferView::BuildSrd( Pal::BufferViewInfo info = {}; const RuntimeSettings& settings = pDevice->GetRuntimeSettings(); - info.swizzledFormat = VkToPalFormat(format, settings); - info.stride = Pal::Formats::BytesPerPixel(info.swizzledFormat.format); - info.range = bufferRange; + info.swizzledFormat = VkToPalFormat(format, settings); + info.stride = Pal::Formats::BytesPerPixel(info.swizzledFormat.format); + info.range = bufferRange; // Bypass Mall read/write if no alloc policy is set for SRDs if (Util::TestAnyFlagSet(settings.mallNoAllocResourcePolicy, MallNoAllocBufferViewSrds)) diff --git a/icd/api/vk_cmdbuffer.cpp b/icd/api/vk_cmdbuffer.cpp index 8170a40f..d3885915 100644 --- a/icd/api/vk_cmdbuffer.cpp +++ b/icd/api/vk_cmdbuffer.cpp @@ -600,6 +600,7 @@ CmdBuffer::CmdBuffer( m_reverseThreadGroupState(false) #if VKI_RAY_TRACING , m_scratchVidMemList(pDevice->VkInstance()->Allocator()) + , m_pBvhBatchState() , m_maxCpsMemSize(0) , m_patchCpsList { @@ -630,6 +631,7 @@ CmdBuffer::CmdBuffer( m_flags.disableResetReleaseResources = settings.disableResetReleaseResources; m_flags.subpassLoadOpClearsBoundAttachments = settings.subpassLoadOpClearsBoundAttachments; m_flags.preBindDefaultState = settings.preBindDefaultState; + m_flags.offsetMode = pDevice->GetEnabledFeatures().robustVertexBufferExtend; Pal::DeviceProperties info; m_pDevice->PalDevice(DefaultDeviceIndex)->GetProperties(&info); @@ -637,14 +639,14 @@ CmdBuffer::CmdBuffer( m_flags.useBackupBuffer = false; memset(m_pBackupPalCmdBuffers, 0, sizeof(Pal::ICmdBuffer*) * MaxPalDevices); - // If 
supportReleaseAcquireInterface is true, the ASIC provides new barrier interface CmdReleaseThenAcquire() - // designed for Acquire/Release-based driver. This flag is currently enabled for gfx9 and above. - // If supportSplitReleaseAcquire is true, the ASIC provides split CmdRelease() and CmdAcquire() to express barrier, - // and CmdReleaseThenAcquire() is still valid. This flag is currently enabled for gfx10 and above. - m_flags.useReleaseAcquire = info.gfxipProperties.flags.supportReleaseAcquireInterface && - settings.useAcquireReleaseInterface; - m_flags.useSplitReleaseAcquire = m_flags.useReleaseAcquire && - info.gfxipProperties.flags.supportSplitReleaseAcquire; + // If supportReleaseAcquireInterface is true, the ASIC provides new barrier interface CmdReleaseThenAcquire() + // designed for Acquire/Release-based driver. This flag is currently enabled for gfx9 and above. + // If supportSplitReleaseAcquire is true, the ASIC provides split CmdRelease() and CmdAcquire() to express barrier, + // and CmdReleaseThenAcquire() is still valid. This flag is currently enabled for gfx10 and above. + m_flags.useReleaseAcquire = info.gfxipProperties.flags.supportReleaseAcquireInterface && + settings.useAcquireReleaseInterface; + m_flags.useSplitReleaseAcquire = m_flags.useReleaseAcquire && + info.gfxipProperties.flags.supportSplitReleaseAcquire; } // ===================================================================================================================== @@ -1827,9 +1829,6 @@ void CmdBuffer::ResetState() m_flags.hasConditionalRendering = false; -#if VKI_RAY_TRACING -#endif - m_debugPrintf.Reset(m_pDevice); if (m_allGpuState.pDescBufBinding != nullptr) { @@ -1868,6 +1867,14 @@ VkResult CmdBuffer::Reset(VkCommandBufferResetFlags flags) #if VKI_RAY_TRACING FreeRayTracingScratchVidMemory(); FreePatchCpsList(); + + if (m_pBvhBatchState != nullptr) + { + // Called here (outside of the BvhBatchLayer because Reset can be triggered + // either directly on the command buffer or across the whole command pool. + m_pBvhBatchState->Log("Resetting via command buffer reset.\n"); + m_pBvhBatchState->Reset(); + } #endif result = PalToVkResult(PalCmdBufferReset(releaseResources)); @@ -2353,6 +2360,14 @@ VkResult CmdBuffer::Destroy(void) FreeRayTracingScratchVidMemory(); FreePatchCpsList(); + if (m_pBvhBatchState != nullptr) + { + // Called here (outside of the BvhBatchLayer because Destroy can be triggered + // either directly on the command buffer or across the whole command pool. + m_pBvhBatchState->Log("Resetting via command buffer destroy.\n"); + m_pBvhBatchState->Reset(); + } + #endif m_debugPrintf.Reset(m_pDevice); @@ -2982,7 +2997,7 @@ void CmdBuffer::BindVertexBuffersUpdateBindingRange( { pBinding->range = pSizes[inputIdx]; - if (offset != 0) + if ((offset != 0) && (m_flags.offsetMode == false)) { padVertexBuffers = true; } @@ -3052,7 +3067,30 @@ void CmdBuffer::BindVertexBuffers( pSizes, pStrides); - PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers(firstBinding, lowBindingCount, pBinding); + Pal::VertexBufferViews bufferViews = + { + .firstBuffer = firstBinding, + .bufferCount = lowBindingCount, + .offsetMode = (m_flags.offsetMode == 1) ? 
true : false + }; + Pal::VertexBufferView vertexViews[Pal::MaxVertexBuffers] = {}; + + if (m_flags.offsetMode) + { + for (uint32_t idx = 0; idx < lowBindingCount; idx++) + { + vertexViews[idx].gpuva = pBinding[idx].gpuAddr; + vertexViews[idx].sizeInBytes = pBinding[idx].range; + vertexViews[idx].strideInBytes = pBinding[idx].stride; + } + bufferViews.pVertexBufferViews = vertexViews; + } + else + { + bufferViews.pBufferViewInfos = pBinding; + } + + PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers(bufferViews); } } @@ -3115,9 +3153,31 @@ void CmdBuffer::UpdateVertexBufferStrides( if (firstChanged <= lastChanged) { - PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers( - firstChanged, (lastChanged - firstChanged) + 1, - &PerGpuState(deviceIdx)->vbBindings[firstChanged]); + Pal::VertexBufferViews bufferViews = + { + .firstBuffer = firstChanged, + .bufferCount = (lastChanged - firstChanged) + 1, + .offsetMode = (m_flags.offsetMode == 1) ? true : false + }; + Pal::VertexBufferView vertexViews[Pal::MaxVertexBuffers] = {}; + auto pBinding = &PerGpuState(deviceIdx)->vbBindings[firstChanged]; + + if (m_flags.offsetMode) + { + for (uint32_t idx = 0; idx < (lastChanged - firstChanged + 1); idx++) + { + vertexViews[idx].gpuva = pBinding[idx].gpuAddr; + vertexViews[idx].sizeInBytes = pBinding[idx].range; + vertexViews[idx].strideInBytes = pBinding[idx].stride; + } + bufferViews.pVertexBufferViews = vertexViews; + } + else + { + bufferViews.pBufferViewInfos = pBinding; + } + + PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers(bufferViews); } } while (deviceGroup.IterateNext()); @@ -3480,16 +3540,69 @@ void CmdBuffer::ExecuteIndirect( const VkGeneratedCommandsInfoNV* pInfo) { IndirectCommandsLayout* pLayout = IndirectCommandsLayout::ObjectFromHandle(pInfo->indirectCommandsLayout); + IndirectCommandsInfo info = pLayout->GetIndirectCommandsInfo(); + + uint64_t barrierCmd = 0; + + if ((info.actionType == IndirectCommandsActionType::Draw) || + (info.actionType == IndirectCommandsActionType::DrawIndexed)) + { + const bool indexed = (info.actionType == IndirectCommandsActionType::DrawIndexed); + barrierCmd = (indexed ? 
DbgBarrierDrawIndexed : DbgBarrierDrawNonIndexed) | DbgBarrierDrawIndirect; + + DbgBarrierPreCmd(barrierCmd); + + ValidateGraphicsStates(); + } + else if (info.actionType == IndirectCommandsActionType::Dispatch) + { + barrierCmd = DbgBarrierDispatchIndirect; + + DbgBarrierPreCmd(barrierCmd); + + if (PalPipelineBindingOwnedBy(Pal::PipelineBindPoint::Compute, PipelineBindCompute) == false) + { + RebindPipeline(); + } + } + else if (info.actionType == IndirectCommandsActionType::MeshTask) + { + barrierCmd = DbgBarrierDrawMeshTasksIndirect; + + DbgBarrierPreCmd(barrierCmd); + + ValidateGraphicsStates(); + } + else + { + VK_NEVER_CALLED(); + } + + VK_ASSERT(pInfo->streamCount == 1); + + const Buffer* pArgumentBuffer = Buffer::ObjectFromHandle(pInfo->pStreams[0].buffer); + const uint64_t argumentOffset = pInfo->pStreams[0].offset; + + const Buffer* pCountBuffer = Buffer::ObjectFromHandle(pInfo->sequencesCountBuffer); + const uint64_t countOffset = pInfo->sequencesCountOffset; + + const uint32_t maxCount = pInfo->sequencesCount; utils::IterateMask deviceGroup(m_curDeviceMask); do { const uint32_t deviceIdx = deviceGroup.Index(); - pLayout->BindPreprocessBuffer(pInfo->preprocessBuffer, - pInfo->preprocessOffset, - deviceIdx); + + PalCmdBuffer(deviceIdx)->CmdExecuteIndirectCmds( + *pLayout->PalIndirectCmdGenerator(deviceIdx), + pArgumentBuffer->GpuVirtAddr(deviceIdx) + argumentOffset, + maxCount, + (pCountBuffer == nullptr) ? 0 : pCountBuffer->GpuVirtAddr(deviceIdx) + countOffset); + } while (deviceGroup.IterateNext()); + + DbgBarrierPostCmd(barrierCmd); } // ===================================================================================================================== @@ -7733,33 +7846,99 @@ void CmdBuffer::RPEndSubpass() // ===================================================================================================================== // Handles post-clear synchronization for load-op color clears when not auto-syncing. -void CmdBuffer::RPSyncPostLoadOpColorClear() +void CmdBuffer::RPSyncPostLoadOpColorClear( + uint32_t colorClearCount, + const RPLoadOpClearInfo* pClears) { - static const Pal::BarrierTransition transition = + if (m_flags.useReleaseAcquire) { - Pal::CoherClear, - Pal::CoherColorTarget, - {} - }; + VK_ASSERT(colorClearCount > 0); - static const Pal::HwPipePoint PipePoint = Pal::HwPipePostBlt; - static const Pal::BarrierInfo Barrier = - { - Pal::HwPipePreRasterization, // waitPoint - 1, // pipePointWaitCount - &PipePoint, // pPipePoints - 0, // gpuEventWaitCount - nullptr, // ppGpuEvents - 0, // rangeCheckedTargetWaitCount - nullptr, // ppTargets - 1, // transitionCount - &transition, // pTransitions - 0, // globalSrcCacheMask - 0, // globalDstCacheMask - RgpBarrierExternalRenderPassSync // reason - }; + VirtualStackFrame virtStack(m_pStackAllocator); + + Pal::AcquireReleaseInfo barrierInfo = {}; + + barrierInfo.reason = RgpBarrierExternalRenderPassSync; + + Pal::ImgBarrier* pPalTransitions = (colorClearCount != 0) ? + virtStack.AllocArray(colorClearCount) : + nullptr; + const Image** ppImages = (colorClearCount != 0) ? 
+ virtStack.AllocArray(colorClearCount) : + nullptr; + + for (uint32_t i = 0; i < colorClearCount; ++i) + { + const RPLoadOpClearInfo& clear = pClears[i]; + + const Framebuffer::Attachment& attachment = m_allGpuState.pFramebuffer->GetAttachment(clear.attachment); + + VK_ASSERT(pPalTransitions != nullptr); + VK_ASSERT(ppImages != nullptr); + + for (uint32_t sr = 0; sr < attachment.subresRangeCount; ++sr) + { + const uint32_t plane = attachment.subresRange[sr].startSubres.plane; + + const Pal::ImageLayout oldLayout = RPGetAttachmentLayout(clear.attachment, plane); + + const Pal::ImageLayout newLayout = { VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, 1 }; + + ppImages[barrierInfo.imageBarrierCount] = attachment.pImage; + + pPalTransitions[barrierInfo.imageBarrierCount].srcStageMask = Pal::PipelineStageBlt; + pPalTransitions[barrierInfo.imageBarrierCount].dstStageMask = Pal::PipelineStageEarlyDsTarget; + pPalTransitions[barrierInfo.imageBarrierCount].srcAccessMask = Pal::CoherClear; + pPalTransitions[barrierInfo.imageBarrierCount].dstAccessMask = Pal::CoherColorTarget; + // We set the pImage to nullptr by default here. But, this will be computed correctly later for + // each device including DefaultDeviceIndex based on the deviceId. + pPalTransitions[barrierInfo.imageBarrierCount].pImage = nullptr; + pPalTransitions[barrierInfo.imageBarrierCount].oldLayout = oldLayout; + pPalTransitions[barrierInfo.imageBarrierCount].newLayout = newLayout; + pPalTransitions[barrierInfo.imageBarrierCount].subresRange = attachment.subresRange[sr]; - PalCmdBarrier(Barrier, GetRpDeviceMask()); + barrierInfo.imageBarrierCount++; + } + } + + barrierInfo.pImageBarriers = pPalTransitions; + + PalCmdReleaseThenAcquire( + &barrierInfo, + nullptr, + nullptr, + pPalTransitions, + ppImages, + GetRpDeviceMask()); + } + else + { + static const Pal::BarrierTransition transition = + { + Pal::CoherClear, + Pal::CoherColorTarget, + {} + }; + + static const Pal::HwPipePoint PipePoint = Pal::HwPipePostBlt; + static const Pal::BarrierInfo Barrier = + { + Pal::HwPipePreRasterization, // waitPoint + 1, // pipePointWaitCount + &PipePoint, // pPipePoints + 0, // gpuEventWaitCount + nullptr, // ppGpuEvents + 0, // rangeCheckedTargetWaitCount + nullptr, // ppTargets + 1, // transitionCount + &transition, // pTransitions + 0, // globalSrcCacheMask + 0, // globalDstCacheMask + RgpBarrierExternalRenderPassSync // reason + }; + + PalCmdBarrier(Barrier, GetRpDeviceMask()); + } } // ===================================================================================================================== @@ -7799,14 +7978,15 @@ void CmdBuffer::RPBeginSubpass() if (subpasses[i].begin.loadOps.colorClearCount > 0) { RPLoadOpClearColor(subpasses[i].begin.loadOps.colorClearCount, - subpasses[i].begin.loadOps.pColorClears); + subpasses[i].begin.loadOps.pColorClears); } } // If we are manually pre-syncing color clears, we must post-sync also if (subpasses[0].begin.syncTop.barrier.flags.preColorClearSync) { - RPSyncPostLoadOpColorClear(); + RPSyncPostLoadOpColorClear(subpasses[0].begin.loadOps.colorClearCount, + subpasses[0].begin.loadOps.pColorClears); } for (uint32_t i = 0; i < subpassCount; ++i) @@ -7815,7 +7995,7 @@ void CmdBuffer::RPBeginSubpass() if (subpasses[i].begin.loadOps.dsClearCount > 0) { RPLoadOpClearDepthStencil(subpasses[i].begin.loadOps.dsClearCount, - subpasses[i].begin.loadOps.pDsClears); + subpasses[i].begin.loadOps.pDsClears); } } @@ -7838,7 +8018,7 @@ void CmdBuffer::RPBeginSubpass() // If we are manually pre-syncing color clears, we must 
post-sync also if (subpass.begin.syncTop.barrier.flags.preColorClearSync) { - RPSyncPostLoadOpColorClear(); + RPSyncPostLoadOpColorClear(subpass.begin.loadOps.colorClearCount, subpass.begin.loadOps.pColorClears); } // Execute any depth-stencil clear load operations @@ -8051,11 +8231,10 @@ void CmdBuffer::RPSyncPoint( pVirtStack->AllocArray(maxTransitionCount) : nullptr; - const bool isDstStageNotBottomOfPipe = (dstStageMask != Pal::PipelineStageBottomOfPipe); - // Construct global memory dependency to synchronize caches (subpass dependencies + implicit synchronization) if (rpBarrier.flags.needsGlobalTransition) { + Pal::BarrierTransition globalTransition = { }; m_pDevice->GetBarrierPolicy().ApplyBarrierCacheFlags( @@ -8082,16 +8261,6 @@ void CmdBuffer::RPSyncPoint( Pal::BarrierTransition imageTransition = { }; - m_pDevice->GetBarrierPolicy().ApplyBarrierCacheFlags( - rpBarrier.srcAccessMask, - rpBarrier.dstAccessMask, - VK_IMAGE_LAYOUT_GENERAL, - VK_IMAGE_LAYOUT_GENERAL, - &imageTransition); - - uint32_t srcAccessMask = imageTransition.srcCacheMask | rpBarrier.implicitSrcCacheMask; - uint32_t dstAccessMask = imageTransition.dstCacheMask | rpBarrier.implicitDstCacheMask; - for (uint32_t sr = 0; sr < attachment.subresRangeCount; ++sr) { const uint32_t plane = attachment.subresRange[sr].startSubres.plane; @@ -8107,50 +8276,55 @@ void CmdBuffer::RPSyncPoint( tr.attachment, plane); - if ((oldLayout.usages != newLayout.usages) || - (oldLayout.engines != newLayout.engines) || - ((srcAccessMask != dstAccessMask) && settings.rpBarrierCheckAccessMasks)) - { - VK_ASSERT(acquireReleaseInfo.imageBarrierCount < maxTransitionCount); - - ppImages[acquireReleaseInfo.imageBarrierCount] = attachment.pImage; - - pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcStageMask = srcStageMask; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstStageMask = dstStageMask; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcAccessMask = srcAccessMask; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstAccessMask = dstAccessMask; - // We set the pImage to nullptr by default here. But, this will be computed correctly later for - // each device including DefaultDeviceIndex based on the deviceId. - pPalTransitions[acquireReleaseInfo.imageBarrierCount].pImage = nullptr; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].oldLayout = oldLayout; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].newLayout = newLayout; - pPalTransitions[acquireReleaseInfo.imageBarrierCount].subresRange = attachment.subresRange[sr]; + m_pDevice->GetBarrierPolicy().ApplyBarrierCacheFlags( + rpBarrier.srcAccessMask, + rpBarrier.dstAccessMask, + ((plane == 1) ? tr.prevStencilLayout.layout : tr.prevLayout.layout), + ((plane == 1) ? 
tr.nextStencilLayout.layout : tr.nextLayout.layout), + &imageTransition); + + uint32_t srcAccessMask = imageTransition.srcCacheMask | rpBarrier.implicitSrcCacheMask; + uint32_t dstAccessMask = imageTransition.dstCacheMask | rpBarrier.implicitDstCacheMask; + + VK_ASSERT(acquireReleaseInfo.imageBarrierCount < maxTransitionCount); + + ppImages[acquireReleaseInfo.imageBarrierCount] = attachment.pImage; + + pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcStageMask = srcStageMask; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstStageMask = dstStageMask; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].srcAccessMask = srcAccessMask; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].dstAccessMask = dstAccessMask; + // We set the pImage to nullptr by default here. But, this will be computed correctly later for + // each device including DefaultDeviceIndex based on the deviceId. + pPalTransitions[acquireReleaseInfo.imageBarrierCount].pImage = nullptr; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].oldLayout = oldLayout; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].newLayout = newLayout; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].subresRange = attachment.subresRange[sr]; - const Pal::MsaaQuadSamplePattern* pQuadSamplePattern = nullptr; + const Pal::MsaaQuadSamplePattern* pQuadSamplePattern = nullptr; - if (attachment.pImage->IsSampleLocationsCompatibleDepth() && - tr.flags.isInitialLayoutTransition) - { - VK_ASSERT(attachment.pImage->HasDepth()); + if (attachment.pImage->IsSampleLocationsCompatibleDepth() && + tr.flags.isInitialLayoutTransition) + { + VK_ASSERT(attachment.pImage->HasDepth()); - // Use the provided sample locations for this attachment if this is its - // initial layout transition - pQuadSamplePattern = - &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; - } - else - { - // Otherwise, use the subpass' sample locations - uint32_t subpass = m_renderPassInstance.subpass; - pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; - } + // Use the provided sample locations for this attachment if this is its + // initial layout transition + pQuadSamplePattern = + &m_renderPassInstance.pAttachments[tr.attachment].initialSamplePattern.locations; + } + else + { + // Otherwise, use the subpass' sample locations + uint32_t subpass = m_renderPassInstance.subpass; + pQuadSamplePattern = &m_renderPassInstance.pSamplePatterns[subpass].locations; + } - pPalTransitions[acquireReleaseInfo.imageBarrierCount].pQuadSamplePattern = pQuadSamplePattern; + pPalTransitions[acquireReleaseInfo.imageBarrierCount].pQuadSamplePattern = pQuadSamplePattern; - RPSetAttachmentLayout(tr.attachment, plane, newLayout); + RPSetAttachmentLayout(tr.attachment, plane, newLayout); - acquireReleaseInfo.imageBarrierCount++; - } + acquireReleaseInfo.imageBarrierCount++; } } @@ -8170,12 +8344,13 @@ void CmdBuffer::RPSyncPoint( acquireReleaseInfo.dstGlobalAccessMask = 0; } + const bool stageMasksNotEmpty = (((srcStageMask == 0) && (dstStageMask == 0)) == false); + // We do not require a dumb transition here in acquire/release interface because unlike Legacy barriers, // PAL flushes caches even if only the global barriers are passed-in without any image/buffer memory barriers. 
// Execute the barrier if it actually did anything - if ((acquireReleaseInfo.dstGlobalStageMask != Pal::PipelineStageBottomOfPipe) || - ((acquireReleaseInfo.imageBarrierCount > 0) && isDstStageNotBottomOfPipe)) + if (stageMasksNotEmpty) { PalCmdReleaseThenAcquire( &acquireReleaseInfo, @@ -9304,7 +9479,7 @@ void CmdBuffer::PushDescriptorSetKHR( case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: default: VK_ASSERT(!"Unexpected descriptor type"); break; @@ -9862,9 +10037,31 @@ void CmdBuffer::SetVertexInput( if (firstChanged <= lastChanged) { - PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers( - firstChanged, (lastChanged - firstChanged) + 1, - &PerGpuState(deviceIdx)->vbBindings[firstChanged]); + Pal::VertexBufferViews bufferViews = + { + .firstBuffer = firstChanged, + .bufferCount = (lastChanged - firstChanged) + 1, + .offsetMode = (m_flags.offsetMode == 1) ? true : false + }; + Pal::VertexBufferView vertexViews[Pal::MaxVertexBuffers] = {}; + auto pBinding = &PerGpuState(deviceIdx)->vbBindings[firstChanged]; + + if (m_flags.offsetMode) + { + for (uint32_t idx = 0; idx < (lastChanged - firstChanged + 1); idx++) + { + vertexViews[idx].gpuva = pBinding[idx].gpuAddr; + vertexViews[idx].sizeInBytes = pBinding[idx].range; + vertexViews[idx].strideInBytes = pBinding[idx].stride; + } + bufferViews.pVertexBufferViews = vertexViews; + } + else + { + bufferViews.pBufferViewInfos = pBinding; + } + + PalCmdBuffer(deviceIdx)->CmdSetVertexBuffers(bufferViews); } if (vertexBufferCount != pBindState->dynamicBindInfo.gfxDynState.vertexBufferCount) @@ -10420,6 +10617,9 @@ void CmdBuffer::BuildAccelerationStructuresPerDevice( { const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); + Util::Vector m_gpurtInfos(VkInstance()->Allocator()); + Util::Vector m_convHelpers(VkInstance()->Allocator()); + for (uint32_t infoIdx = 0; infoIdx < infoCount; ++infoIdx) { const VkAccelerationStructureBuildGeometryInfoKHR* pInfo = &pInfos[infoIdx]; @@ -10447,6 +10647,7 @@ void CmdBuffer::BuildAccelerationStructuresPerDevice( deviceIndex, *pInfo, pBuildRangeInfos, + (ppMaxPrimitiveCounts != nullptr) ? ppMaxPrimitiveCounts[infoIdx] : nullptr, &helper, &info.inputs); @@ -10456,7 +10657,19 @@ void CmdBuffer::BuildAccelerationStructuresPerDevice( const bool forceRebuildBottomLevel = Util::TestAnyFlagSet(settings.forceRebuildForUpdates, ForceRebuildForUpdatesBottomLevel); - if (settings.ifhRayTracing) + // Skip all work depending on rtTossPoint setting and type of work. + const uint32 rtTossPoint = settings.rtTossPoint; + + const bool isUpdate = Util::TestAnyFlagSet(info.inputs.flags, GpuRt::AccelStructBuildFlagPerformUpdate); + + const bool tossWork = (((info.inputs.type == GpuRt::AccelStructType::TopLevel) && + (rtTossPoint >= RtTossPointTlas)) || + ((info.inputs.type == GpuRt::AccelStructType::BottomLevel) && + (rtTossPoint >= RtTossPointBlasBuild)) || + ((info.inputs.type == GpuRt::AccelStructType::BottomLevel) && + (rtTossPoint >= RtTossPointBlasUpdate) && isUpdate)); + + if (tossWork) { info.inputs.inputElemCount = 0; } @@ -10477,18 +10690,42 @@ void CmdBuffer::BuildAccelerationStructuresPerDevice( info.indirect.indirectGpuAddr = pIndirectDeviceAddresses[infoIdx]; info.indirect.indirectStride = pIndirectStrides[infoIdx]; - helper.pMaxPrimitiveCounts = ppMaxPrimitiveCounts[infoIdx]; } - DbgBarrierPreCmd((pInfo->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) ? 
- DbgBuildAccelerationStructureTLAS : DbgBuildAccelerationStructureBLAS); + if (settings.batchBvhBuilds == BatchBvhModeDisabled) + { + DbgBarrierPreCmd((pInfo->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) ? + DbgBuildAccelerationStructureTLAS : DbgBuildAccelerationStructureBLAS); - m_pDevice->RayTrace()->GpuRt(deviceIndex)->BuildAccelStruct( + m_pDevice->RayTrace()->GpuRt(deviceIndex)->BuildAccelStruct( PalCmdBuffer(deviceIndex), info); - DbgBarrierPostCmd((pInfo->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) ? - DbgBuildAccelerationStructureTLAS : DbgBuildAccelerationStructureBLAS); + DbgBarrierPostCmd((pInfo->type == VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR) ? + DbgBuildAccelerationStructureTLAS : DbgBuildAccelerationStructureBLAS); + } + else + { + m_gpurtInfos.PushBack(info); + m_convHelpers.PushBack(helper); + } + } + + if (m_gpurtInfos.IsEmpty() == false) + { + DbgBarrierPreCmd(DbgBuildAccelerationStructureTLAS | DbgBuildAccelerationStructureBLAS); + + VK_ASSERT(m_gpurtInfos.NumElements() == m_convHelpers.NumElements()); + for (uint32 i = 0; i < m_gpurtInfos.NumElements(); ++i) + { + m_gpurtInfos[i].inputs.pClientData = &m_convHelpers[i]; + } + + m_pDevice->RayTrace()->GpuRt(deviceIndex)->BuildAccelStructs( + PalCmdBuffer(deviceIndex), + m_gpurtInfos); + + DbgBarrierPostCmd(DbgBuildAccelerationStructureTLAS | DbgBuildAccelerationStructureBLAS); } } diff --git a/icd/api/vk_cmdbuffer_transfer.cpp b/icd/api/vk_cmdbuffer_transfer.cpp index 5536150f..9310d62a 100644 --- a/icd/api/vk_cmdbuffer_transfer.cpp +++ b/icd/api/vk_cmdbuffer_transfer.cpp @@ -419,7 +419,18 @@ void CmdBuffer::BlitImage( palCopyInfo.rotation = Pal::ImageRotation::Ccw0; palCopyInfo.pRegions = pPalRegions; - palCopyInfo.flags.dstAsSrgb = pDstImage->TreatAsSrgb(); + + // PAL does gamma correction whenever the destination is a SRGB image or treated as one. + // If the source image is an UNORM image that contains SRGB data, we need to set dstAsNorm + // so PAL doesn't end up doing gamma correction on values that are already in SRGB space. 
+ if (pSrcImage->TreatAsSrgb()) + { + palCopyInfo.flags.dstAsNorm = true; + } + else if (pDstImage->TreatAsSrgb()) + { + palCopyInfo.flags.dstAsSrgb = true; + } for (uint32_t regionIdx = 0; regionIdx < regionCount;) { @@ -802,9 +813,9 @@ void CmdBuffer::QueryCopy( // 64-bit values) Pal::BufferViewInfo bufferViewInfo = {}; - bufferViewInfo.range = destStride * queryCount; - bufferViewInfo.stride = 0; // Raw buffers have a zero byte stride - bufferViewInfo.swizzledFormat = Pal::UndefinedSwizzledFormat; + bufferViewInfo.range = destStride * queryCount; + bufferViewInfo.stride = 0; // Raw buffers have a zero byte stride + bufferViewInfo.swizzledFormat = Pal::UndefinedSwizzledFormat; // Set query count userData[queryCountOffset] = queryCount; diff --git a/icd/api/vk_compute_pipeline.cpp b/icd/api/vk_compute_pipeline.cpp index d2965fe0..34384780 100644 --- a/icd/api/vk_compute_pipeline.cpp +++ b/icd/api/vk_compute_pipeline.cpp @@ -33,6 +33,9 @@ #include "include/vk_pipeline_layout.h" #include "include/vk_memory.h" #include "include/vk_pipeline.h" +#if VKI_RAY_TRACING +#include "raytrace/ray_tracing_device.h" +#endif #include "palPipeline.h" #include "palPipelineAbi.h" @@ -152,7 +155,7 @@ VkResult ComputePipeline::CreatePipelineBinaries( bool shouldConvert = (pCreateInfo != nullptr) && (pDevice->GetRuntimeSettings().enablePipelineDump || - (shouldCompile && (pBinaryCreateInfo->pTempBuffer == nullptr))); + (shouldCompile && (pBinaryCreateInfo->pTempBuffer == nullptr))); VkResult convertResult = VK_ERROR_UNKNOWN; if (shouldConvert) @@ -226,7 +229,6 @@ VkResult ComputePipeline::CreatePipelineBinaries( // Add to any cache layer where missing if ((result == VK_SUCCESS) && storeBinaryToCache) - { pDevice->GetCompiler(deviceIdx)->CachePipelineBinary( &pCacheIds[deviceIdx], @@ -571,6 +573,7 @@ VkResult ComputePipeline::Create( static_cast(pipelineBinaries[DefaultDeviceIndex].pCode), pComputePipeline->GetFormatStrings()); } + } else { diff --git a/icd/api/vk_conv.cpp b/icd/api/vk_conv.cpp index d43cd44f..73869c24 100644 --- a/icd/api/vk_conv.cpp +++ b/icd/api/vk_conv.cpp @@ -750,6 +750,9 @@ const char* PalResultName( case Pal::Result::ErrorInvalidExternalHandle: resultName = "ErrorInvalidExternalHandle"; break; + case Pal::Result::ErrorIncompatibleDisplayMode: + resultName = "ErrorIncompatibleDisplayMode"; + break; default: VK_NOT_IMPLEMENTED; resultName = "??"; @@ -1127,6 +1130,7 @@ static uint32_t GetBufferSrdFormatInfo( bufferInfo.swizzledFormat = swizzledFormat; bufferInfo.range = UINT32_MAX; bufferInfo.stride = Pal::Formats::BytesPerPixel(swizzledFormat.format); + pPhysicalDevice->PalDevice()->CreateTypedBufferViewSrds(1, &bufferInfo, result); // NOTE: Until now, all buffer format info is stored the fourth DWORD of buffer SRD. 
please modify diff --git a/icd/api/vk_descriptor_buffer.cpp b/icd/api/vk_descriptor_buffer.cpp index c44c02ad..705fca54 100644 --- a/icd/api/vk_descriptor_buffer.cpp +++ b/icd/api/vk_descriptor_buffer.cpp @@ -258,7 +258,7 @@ VKAPI_ATTR void VKAPI_CALL vkGetDescriptorEXT( } break; } - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: default: diff --git a/icd/api/vk_descriptor_pool.cpp b/icd/api/vk_descriptor_pool.cpp index 567b1495..f693848a 100644 --- a/icd/api/vk_descriptor_pool.cpp +++ b/icd/api/vk_descriptor_pool.cpp @@ -517,6 +517,7 @@ VkResult DescriptorGpuMemHeap::Init( VkDescriptorPoolCreateFlags poolUsage = pCreateInfo->flags; uint32_t maxSets = pCreateInfo->maxSets; const VkDescriptorPoolSize* pTypeCount = pCreateInfo->pPoolSizes; + uint32_t maxInlineUniformBlockBindings = 0; m_numPalDevices = pDevice->NumPalDevices(); m_usage = poolUsage; @@ -540,6 +541,16 @@ VkResult DescriptorGpuMemHeap::Init( break; } + case VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO: + { + const VkDescriptorPoolInlineUniformBlockCreateInfo* pDescriptorPoolInlineUniformBlockCreateInfo = + reinterpret_cast(pHeader); + + maxInlineUniformBlockBindings = + pDescriptorPoolInlineUniformBlockCreateInfo->maxInlineUniformBlockBindings; + + break; + } default: break; @@ -551,6 +562,8 @@ VkResult DescriptorGpuMemHeap::Init( VkResult result = VK_SUCCESS; + m_gpuMemAddrAlignment = pDevice->GetProperties().descriptorSizes.alignmentInDwords * sizeof(uint32_t); + if (pDevice->GetRuntimeSettings().pipelineLayoutMode == PipelineLayoutAngle) { for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; ++i) @@ -561,6 +574,10 @@ VkResult DescriptorGpuMemHeap::Init( } else { + constexpr uint32_t InlineUniformGranularity = 4; + + m_gpuMemSize += ((m_gpuMemAddrAlignment - InlineUniformGranularity) * maxInlineUniformBlockBindings); + for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; ++i) { if (pTypeCount[i].type == VK_DESCRIPTOR_TYPE_MUTABLE_EXT) @@ -587,7 +604,7 @@ VkResult DescriptorGpuMemHeap::Init( } VK_ASSERT(maxSize > 0); - m_gpuMemSize += maxSize * sizeof(uint32_t) * pTypeCount[i].descriptorCount; + m_gpuMemSize += maxSize * pTypeCount[i].descriptorCount; } else { @@ -597,8 +614,6 @@ VkResult DescriptorGpuMemHeap::Init( } } - m_gpuMemAddrAlignment = pDevice->GetProperties().descriptorSizes.alignmentInDwords * sizeof(uint32_t); - if (oneShot == false) //DYNAMIC USAGE { // In case of dynamic descriptor pools we have to prepare our management structures. 
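Note on the descriptor-pool sizing change above: the pool now pre-pads its GPU memory size for inline uniform block bindings, because each binding may have to start at the device's SRD alignment while the block data itself is only dword-granular. The snippet below is a minimal standalone sketch of that worst-case padding computation, not driver code; the function name and parameter names are illustrative only.

#include <cstdint>

// Worst-case extra bytes a pool must reserve so that every inline uniform block
// binding can be rounded up to the descriptor SRD alignment.
static uint64_t ComputeInlineUniformPadding(
    uint32_t alignmentInDwords,              // e.g. descriptorSizes.alignmentInDwords from device properties
    uint32_t maxInlineUniformBlockBindings)  // e.g. VkDescriptorPoolInlineUniformBlockCreateInfo value
{
    const uint32_t alignmentInBytes         = alignmentInDwords * sizeof(uint32_t);
    const uint32_t inlineUniformGranularity = 4; // inline uniform data is addressed at dword granularity

    // Each binding can lose at most (alignment - granularity) bytes to alignment padding.
    return uint64_t(alignmentInBytes - inlineUniformGranularity) * maxInlineUniformBlockBindings;
}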
diff --git a/icd/api/vk_descriptor_set.cpp b/icd/api/vk_descriptor_set.cpp index e4be7758..14576653 100644 --- a/icd/api/vk_descriptor_set.cpp +++ b/icd/api/vk_descriptor_set.cpp @@ -399,8 +399,8 @@ void DescriptorUpdate::WriteBufferInfoDescriptors( (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)); // Setup and create SRD for storage buffer case - info.swizzledFormat = Pal::UndefinedSwizzledFormat; - info.stride = 0; // Raw buffers have a zero byte stride + info.swizzledFormat = Pal::UndefinedSwizzledFormat; + info.stride = 0; // Raw buffers have a zero byte stride Pal::IDevice* pPalDevice = pDevice->PalDevice(deviceIdx); @@ -468,6 +468,7 @@ void DescriptorUpdate::SetAccelerationDescriptorsBufferViewFlags( pBufferViewInfo->flags.bypassMallRead = 1; pBufferViewInfo->flags.bypassMallWrite = 1; } + } void DescriptorUpdate::WriteAccelerationStructureDescriptors( @@ -703,7 +704,7 @@ void DescriptorUpdate::WriteDescriptorSets( destBinding.dyn.dwArrayStride); break; - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: { VK_ASSERT(params.pNext != nullptr); VK_ASSERT(Util::IsPow2Aligned(params.dstArrayElement, 4)); @@ -837,7 +838,7 @@ void DescriptorUpdate::CopyDescriptorSets( // Just to a straight memcpy covering the entire range. memcpy(pDestAddr, pSrcAddr, srcBinding.dyn.dwArrayStride * sizeof(uint32_t) * count); } - else if (srcBinding.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + else if (srcBinding.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { VK_ASSERT(Util::IsPow2Aligned(params.srcArrayElement, 4)); VK_ASSERT(Util::IsPow2Aligned(params.dstArrayElement, 4)); diff --git a/icd/api/vk_descriptor_set_layout.cpp b/icd/api/vk_descriptor_set_layout.cpp index 234a0b35..ed561e55 100644 --- a/icd/api/vk_descriptor_set_layout.cpp +++ b/icd/api/vk_descriptor_set_layout.cpp @@ -184,7 +184,7 @@ uint32_t DescriptorSetLayout::GetSingleDescStaticSize( // as we pack the whole buffer SRD in the dynamic section (i.e. user data registers). size = 0; break; - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: size = 1; break; default: @@ -193,7 +193,7 @@ uint32_t DescriptorSetLayout::GetSingleDescStaticSize( break; } - VK_ASSERT((type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) || (Util::IsPow2Aligned(size, sizeof(uint32_t)))); + VK_ASSERT((type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) || (Util::IsPow2Aligned(size, sizeof(uint32_t)))); return size; } @@ -231,7 +231,7 @@ uint32_t DescriptorSetLayout::GetDescStaticSectionDwSize( size *= maxMultiPlaneCount; } - if (descriptorInfo->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + if (descriptorInfo->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { // A single binding corresponds to a whole uniform block, so handle it as one descriptor not array. size *= descriptorInfo->descriptorCount; @@ -250,7 +250,7 @@ uint32_t DescriptorSetLayout::GetDescStaticSectionDwSize( { const BindingInfo& bindingInfo = pSrcDescSetLayout->Binding(binding); - return (bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) ? + return (bindingInfo.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) ? 
bindingInfo.sta.dwSize : bindingInfo.sta.dwArrayStride; } @@ -339,7 +339,7 @@ void DescriptorSetLayout::ConvertBindingInfo( // Dword offset to this binding pBindingSectionInfo->dwOffset = Util::RoundUpToMultiple(pSectionInfo->dwSize, descAlignmentInDw); - if (pBindingInfo->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + if (pBindingInfo->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { // This allows access to inline uniform blocks using dwords offsets. // Vk(Write/Copy/Update)DescriptorSet use byte values, convert them to dword. diff --git a/icd/api/vk_descriptor_update_template.cpp b/icd/api/vk_descriptor_update_template.cpp index cc5e2c69..87cfba51 100644 --- a/icd/api/vk_descriptor_update_template.cpp +++ b/icd/api/vk_descriptor_update_template.cpp @@ -81,9 +81,9 @@ VkResult DescriptorUpdateTemplate::Create( VK_ASSERT((pCreateInfo->templateType != VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR) || ((dstBinding.info.descriptorType != VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) && (dstBinding.info.descriptorType != VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) && - (dstBinding.info.descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT))); + (dstBinding.info.descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK))); - if (dstBinding.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + if (dstBinding.info.descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) { // Convert dstArrayElement to dword VK_ASSERT(Util::IsPow2Aligned(srcEntry.dstArrayElement, 4)); @@ -182,7 +182,7 @@ DescriptorUpdateTemplate::PfnUpdateEntry DescriptorUpdateTemplate::GetUpdateEntr case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: pFunc = &UpdateEntryBuffer; break; - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: pFunc = &UpdateEntryInlineUniformBlock; break; #if VKI_RAY_TRACING diff --git a/icd/api/vk_device.cpp b/icd/api/vk_device.cpp index 98db2cc7..2e6cbc20 100644 --- a/icd/api/vk_device.cpp +++ b/icd/api/vk_device.cpp @@ -551,6 +551,9 @@ VkResult Device::Create( if (reinterpret_cast(pHeader)->robustBufferAccess2) { deviceFeatures.robustBufferAccessExtended = true; + { + deviceFeatures.robustVertexBufferExtend = true; + } } if (reinterpret_cast(pHeader)->robustImageAccess2) @@ -1478,6 +1481,13 @@ void Device::InitDispatchTable() m_pBarrierFilterLayer->OverrideDispatchTable(&m_dispatchTable); } +#if VKI_RAY_TRACING + if ((RayTrace() != nullptr) && (RayTrace()->GetBvhBatchLayer() != nullptr)) + { + RayTrace()->GetBvhBatchLayer()->OverrideDispatchTable(&m_dispatchTable); + } +#endif + #if VKI_GPU_DECOMPRESS if (m_pGpuDecoderLayer != nullptr) { @@ -3932,7 +3942,14 @@ void Device::GetAccelerationStructureBuildSizesKHR( const bool allowUpdate = inputs.flags & GpuRt::AccelStructBuildFlagAllowUpdate; - if (m_settings.ifhRayTracing) + const uint32 rtTossPoint = m_settings.rtTossPoint; + + // Skip all work depending on rtTossPoint setting and type of work. 
+ const bool tossWork = (((inputs.type == GpuRt::AccelStructType::TopLevel) && (rtTossPoint >= RtTossPointTlas)) || + ((inputs.type == GpuRt::AccelStructType::BottomLevel) && + (rtTossPoint >= RtTossPointBlasBuild))); + + if (tossWork) { inputs.inputElemCount = 0; } diff --git a/icd/api/vk_graphics_pipeline.cpp b/icd/api/vk_graphics_pipeline.cpp index 5fc1f618..82811670 100644 --- a/icd/api/vk_graphics_pipeline.cpp +++ b/icd/api/vk_graphics_pipeline.cpp @@ -35,6 +35,9 @@ #include "include/vk_render_pass.h" #include "include/vk_shader.h" #include "include/vk_cmdbuffer.h" +#if VKI_RAY_TRACING +#include "raytrace/ray_tracing_device.h" +#endif #include "palAutoBuffer.h" #include "palCmdBuffer.h" @@ -49,6 +52,7 @@ #include using namespace Util; +using namespace std::chrono_literals; namespace vk { @@ -115,7 +119,7 @@ VkResult GraphicsPipeline::CreatePipelineBinaries( bool shouldConvert = (pCreateInfo != nullptr) && (pDevice->GetRuntimeSettings().enablePipelineDump || - (shouldCompile && (deviceIdx == DefaultDeviceIndex))); + (shouldCompile && (deviceIdx == DefaultDeviceIndex))); VkResult convertResult = VK_ERROR_UNKNOWN; if (shouldConvert) @@ -1660,7 +1664,7 @@ VkResult GraphicsPipeline::Destroy( { if (m_deferWorkload.pEvent != nullptr) { - auto result = m_deferWorkload.pEvent->Wait(Util::fseconds{ 10 }); + auto result = m_deferWorkload.pEvent->Wait(10s); if (result == Util::Result::Success) { Util::Destructor(m_deferWorkload.pEvent); diff --git a/icd/api/vk_graphics_pipeline_library.cpp b/icd/api/vk_graphics_pipeline_library.cpp index 8ce4eb75..058a2530 100644 --- a/icd/api/vk_graphics_pipeline_library.cpp +++ b/icd/api/vk_graphics_pipeline_library.cpp @@ -337,9 +337,8 @@ VkResult GraphicsPipelineLibrary::CreatePartialPipelineBinary( uint32_t gplMask = 0; for (uint32_t i = 0; i < ShaderStage::ShaderStageGfxCount; ++i) { - if ((pShaderInfos[i]->pModuleData != nullptr) && - (pShaderStageInfo->stages[i].pModuleHandle != nullptr) && - pCompiler->IsValidShaderModule(pShaderStageInfo->stages[i].pModuleHandle) || + if (((pShaderInfos[i]->pModuleData != nullptr) && + pCompiler->IsValidShaderModule(pShaderStageInfo->stages[i].pModuleHandle)) || (pShaderStageInfo->stages[i].codeHash.lower != 0) || (pShaderStageInfo->stages[i].codeHash.upper != 0)) { diff --git a/icd/api/vk_image.cpp b/icd/api/vk_image.cpp index 2994ac89..db539869 100644 --- a/icd/api/vk_image.cpp +++ b/icd/api/vk_image.cpp @@ -409,68 +409,83 @@ void Image::ConvertImageCreateInfo( } } + const bool isZ24DsFormat = (settings.enableD24S8 && + ((pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT) || + (pCreateInfo->format == VK_FORMAT_X8_D24_UNORM_PACK32))); + + const bool isZ16DsFormat = ((pCreateInfo->format == VK_FORMAT_D16_UNORM) || + (pCreateInfo->format == VK_FORMAT_D16_UNORM_S8_UINT)); + + if (isZ24DsFormat) + { + pPalCreateInfo->usageFlags.depthAsZ24 = 1; + } + pPalCreateInfo->metadataMode = Pal::MetadataMode::Default; pPalCreateInfo->metadataTcCompatMode = Pal::MetadataTcCompatMode::Default; - // Don't force DCC to be enabled for performance reasons unless the image is larger than the minimum size set for - // compression, another performance optimization. 
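The tossWork predicate added above keys off the new RtTossPoint setting, whose ordered values are defined later in settings_xgl.json (Disabled < Traversal < Tlas < BlasUpdate < BlasBuild). A standalone sketch of the same gating decision; the enum mirrors the values from the settings file, while the function and type names here are illustrative rather than driver code:

#include <cstdint>

// Values mirror the RtTossPointEnums entries added in settings_xgl.json.
enum RtTossPoint : uint32_t
{
    RtTossPointDisabled   = 0, // ray tracing executes normally
    RtTossPointTraversal  = 1, // disable traversal
    RtTossPointTlas       = 2, // also disable TLAS build/update
    RtTossPointBlasUpdate = 3, // also disable BLAS update
    RtTossPointBlasBuild  = 4, // also disable BLAS build
};

enum class AccelStructType : uint32_t { TopLevel, BottomLevel };

// Returns true when the acceleration structure build should be tossed,
// i.e. its input element count forced to zero, as in the hunk above.
bool ShouldTossBuild(AccelStructType type, uint32_t rtTossPoint)
{
    return ((type == AccelStructType::TopLevel)    && (rtTossPoint >= RtTossPointTlas)) ||
           ((type == AccelStructType::BottomLevel) && (rtTossPoint >= RtTossPointBlasBuild));
}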
const Pal::GfxIpLevel gfxLevel = palProperties.gfxLevel; - if (((pPalCreateInfo->extent.width * pPalCreateInfo->extent.height) > - (settings.disableSmallSurfColorCompressionSize * settings.disableSmallSurfColorCompressionSize)) && - (Formats::IsColorFormat(createInfoFormat))) + { - const uint32_t forceEnableDccMask = settings.forceEnableDcc; + // Don't force DCC to be enabled for performance reasons unless the image is larger than the minimum size set for + // compression, another performance optimization. + if (((pPalCreateInfo->extent.width * pPalCreateInfo->extent.height) > + (settings.disableSmallSurfColorCompressionSize * settings.disableSmallSurfColorCompressionSize)) && + (Formats::IsColorFormat(createInfoFormat))) + { + const uint32_t forceEnableDccMask = settings.forceEnableDcc; - const uint32_t bpp = Pal::Formats::BitsPerPixel(pPalCreateInfo->swizzledFormat.format); - const bool isShaderStorage = (pCreateInfo->usage & VK_IMAGE_USAGE_STORAGE_BIT); + const uint32_t bpp = Pal::Formats::BitsPerPixel(pPalCreateInfo->swizzledFormat.format); + const bool isShaderStorage = (pCreateInfo->usage & VK_IMAGE_USAGE_STORAGE_BIT); - if (isShaderStorage && - ((forceEnableDccMask & ForceDccDefault) == 0) && - ((forceEnableDccMask & ForceDisableDcc) == 0)) - { - const bool isColorAttachment = (pCreateInfo->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT); + if (isShaderStorage && + ((forceEnableDccMask & ForceDccDefault) == 0) && + ((forceEnableDccMask & ForceDisableDcc) == 0)) + { + const bool isColorAttachment = (pCreateInfo->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT); - const bool is2DShaderStorageImage = (pCreateInfo->imageType & VK_IMAGE_TYPE_2D); - const bool is3DShaderStorageImage = (pCreateInfo->imageType & VK_IMAGE_TYPE_3D); + const bool is2DShaderStorageImage = (pCreateInfo->imageType & VK_IMAGE_TYPE_2D); + const bool is3DShaderStorageImage = (pCreateInfo->imageType & VK_IMAGE_TYPE_3D); - // Enable DCC beyond what PAL does by default for color attachments - const bool shouldForceDccForCA = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccForColorAttachments) && - isColorAttachment; - const bool shouldForceDccForNonCAShaderStorage = - Util::TestAnyFlagSet(forceEnableDccMask, ForceDccForNonColorAttachmentShaderStorage) && - (!isColorAttachment); + // Enable DCC beyond what PAL does by default for color attachments + const bool shouldForceDccForCA = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccForColorAttachments) && + isColorAttachment; + const bool shouldForceDccForNonCAShaderStorage = + Util::TestAnyFlagSet(forceEnableDccMask, ForceDccForNonColorAttachmentShaderStorage) && + (!isColorAttachment); - const bool shouldForceDccFor2D = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor2DShaderStorage) && - is2DShaderStorageImage; - const bool shouldForceDccFor3D = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor3DShaderStorage) && - is3DShaderStorageImage; + const bool shouldForceDccFor2D = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor2DShaderStorage) && + is2DShaderStorageImage; + const bool shouldForceDccFor3D = Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor3DShaderStorage) && + is3DShaderStorageImage; - const bool shouldForceDccFor32Bpp = - Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor32BppShaderStorage) && (bpp >= 32) && (bpp < 64); + const bool shouldForceDccFor32Bpp = + Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor32BppShaderStorage) && (bpp >= 32) && (bpp < 64); - const bool shouldForceDccFor64Bpp = - Util::TestAnyFlagSet(forceEnableDccMask, 
ForceDccFor64BppShaderStorage) && (bpp >= 64); + const bool shouldForceDccFor64Bpp = + Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor64BppShaderStorage) && (bpp >= 64); - const bool shouldForceDccForAllBpp = - ((Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor32BppShaderStorage) == false) && - (Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor64BppShaderStorage) == false)); + const bool shouldForceDccForAllBpp = + ((Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor32BppShaderStorage) == false) && + (Util::TestAnyFlagSet(forceEnableDccMask, ForceDccFor64BppShaderStorage) == false)); - // To force enable shader storage DCC, at least one of 2D/3D and one of CA/non-CA need to be set - if ((shouldForceDccFor2D || shouldForceDccFor3D) && - (shouldForceDccForCA || shouldForceDccForNonCAShaderStorage) && - (shouldForceDccFor32Bpp || shouldForceDccFor64Bpp || shouldForceDccForAllBpp)) - { - pPalCreateInfo->metadataMode = Pal::MetadataMode::ForceEnabled; + // To force enable shader storage DCC, at least one of 2D/3D and one of CA/non-CA need to be set + if ((shouldForceDccFor2D || shouldForceDccFor3D) && + (shouldForceDccForCA || shouldForceDccForNonCAShaderStorage) && + (shouldForceDccFor32Bpp || shouldForceDccFor64Bpp || shouldForceDccForAllBpp)) + { + pPalCreateInfo->metadataMode = Pal::MetadataMode::ForceEnabled; + } } - } - // This setting should only really be used for Vega20. - // Turn DCC on/off for identified cases where memory bandwidth is not the bottleneck to improve latency. - // PAL may do this implicitly, so specify force enabled instead of default. - if (settings.dccBitsPerPixelThreshold != UINT_MAX) - { - pPalCreateInfo->metadataMode = (bpp < settings.dccBitsPerPixelThreshold) ? - Pal::MetadataMode::Disabled : Pal::MetadataMode::ForceEnabled; + // This setting should only really be used for Vega20. + // Turn DCC on/off for identified cases where memory bandwidth is not the bottleneck to improve latency. + // PAL may do this implicitly, so specify force enabled instead of default. + if (settings.dccBitsPerPixelThreshold != UINT_MAX) + { + pPalCreateInfo->metadataMode = (bpp < settings.dccBitsPerPixelThreshold) ? + Pal::MetadataMode::Disabled : Pal::MetadataMode::ForceEnabled; + } } } @@ -484,13 +499,6 @@ void Image::ConvertImageCreateInfo( pPalCreateInfo->metadataMode = Pal::MetadataMode::Disabled; } - if (settings.enableD24S8 && - ((pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT) || - (pCreateInfo->format == VK_FORMAT_X8_D24_UNORM_PACK32))) - { - pPalCreateInfo->usageFlags.depthAsZ24 = 1; - } - // If DCC was disabled above, still attempt to use Fmask. 
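The re-scoped block above force-enables DCC for shader-storage images only when the forceEnableDcc mask agrees along three axes: image dimensionality (2D/3D), usage (color attachment vs. plain shader storage), and bits-per-pixel. A condensed standalone restatement of that decision; the flag names match the diff, but the bit values here are illustrative, not the driver's actual settings encoding:

#include <cstdint>

// Illustrative bit assignments; the real values come from the ForceEnableDcc setting.
constexpr uint32_t ForceDccFor2DShaderStorage                 = 0x01;
constexpr uint32_t ForceDccFor3DShaderStorage                 = 0x02;
constexpr uint32_t ForceDccForColorAttachments                = 0x04;
constexpr uint32_t ForceDccForNonColorAttachmentShaderStorage = 0x08;
constexpr uint32_t ForceDccFor32BppShaderStorage              = 0x10;
constexpr uint32_t ForceDccFor64BppShaderStorage              = 0x20;

bool ShouldForceDcc(uint32_t mask, bool is2D, bool is3D, bool isColorAttachment, uint32_t bpp)
{
    const bool forDimension = ((mask & ForceDccFor2DShaderStorage) && is2D) ||
                              ((mask & ForceDccFor3DShaderStorage) && is3D);
    const bool forUsage     = ((mask & ForceDccForColorAttachments) && isColorAttachment) ||
                              ((mask & ForceDccForNonColorAttachmentShaderStorage) && (isColorAttachment == false));
    const bool for32Bpp     = (mask & ForceDccFor32BppShaderStorage) && (bpp >= 32) && (bpp < 64);
    const bool for64Bpp     = (mask & ForceDccFor64BppShaderStorage) && (bpp >= 64);
    const bool forAnyBpp    = ((mask & (ForceDccFor32BppShaderStorage | ForceDccFor64BppShaderStorage)) == 0);

    // All three axes must be satisfied before metadataMode is forced to enabled.
    return forDimension && forUsage && (for32Bpp || for64Bpp || forAnyBpp);
}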
if ((pPalCreateInfo->samples > 1) && pPalCreateInfo->usageFlags.colorTarget && (pPalCreateInfo->metadataMode == Pal::MetadataMode::Disabled)) @@ -538,7 +546,7 @@ void Image::ConvertImageCreateInfo( if ((extStructs.pImageCompressionControl->sType == VK_STRUCTURE_TYPE_IMAGE_COMPRESSION_CONTROL_EXT) && (extStructs.pImageCompressionControl->flags == VK_IMAGE_COMPRESSION_DISABLED_EXT)) { - pPalCreateInfo->metadataMode = Pal::MetadataMode::Disabled; + pPalCreateInfo->metadataMode = Pal::MetadataMode::Disabled; pPalCreateInfo->metadataTcCompatMode = Pal::MetadataTcCompatMode::Disabled; } } diff --git a/icd/api/vk_indirect_commands_layout.cpp b/icd/api/vk_indirect_commands_layout.cpp index 0a84dd44..8fdc7582 100644 --- a/icd/api/vk_indirect_commands_layout.cpp +++ b/icd/api/vk_indirect_commands_layout.cpp @@ -51,6 +51,7 @@ VkResult IndirectCommandsLayout::Create( createInfo.pParams = &indirectParams[0]; Pal::IIndirectCmdGenerator* pGenerators[MaxPalDevices] = {}; + Pal::IGpuMemory* pGpuMemory[MaxPalDevices] = {}; const size_t apiSize = ObjectSize(pDevice); size_t totalSize = apiSize; @@ -154,12 +155,18 @@ VkResult IndirectCommandsLayout::Create( } } + if (result == VK_SUCCESS) + { + result = BindGpuMemory(pDevice, pAllocator, pGenerators, pGpuMemory); + } + if (result == VK_SUCCESS) { VK_PLACEMENT_NEW(pMemory) IndirectCommandsLayout( pDevice, info, pGenerators, + pGpuMemory, createInfo); *pLayout = IndirectCommandsLayout::HandleFromVoidPointer(pMemory); @@ -172,7 +179,8 @@ IndirectCommandsLayout::IndirectCommandsLayout( const Device* pDevice, const IndirectCommandsInfo& info, - Pal::IIndirectCmdGenerator** pPalGenerator, + Pal::IIndirectCmdGenerator** pGenerators, + Pal::IGpuMemory** pGpuMemory, const Pal::IndirectCmdGeneratorCreateInfo& palCreateInfo) : m_info(info), @@ -180,8 +188,8 @@ IndirectCommandsLayout::IndirectCommandsLayout( { for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) { - m_perGpu[deviceIdx].pGenerator = pPalGenerator[deviceIdx]; - m_perGpu[deviceIdx].preprocessBufferVirtAddr = 0; + m_perGpu[deviceIdx].pGenerator = pGenerators[deviceIdx]; + m_perGpu[deviceIdx].pGpuMemory = pGpuMemory[deviceIdx]; } } @@ -305,56 +313,125 @@ void IndirectCommandsLayout::CalculateMemoryRequirements( VkMemoryRequirements2* pMemoryRequirements ) const { - VK_ASSERT(m_perGpu[DefaultDeviceIndex].pGenerator != nullptr); + // Our CP packet solution has no preprocess step. GPU memory is not required.
+ pMemoryRequirements->memoryRequirements.size = 0; + pMemoryRequirements->memoryRequirements.alignment = 0; + pMemoryRequirements->memoryRequirements.memoryTypeBits = 0; + Pal::GpuMemoryRequirements memReqs = {}; - m_perGpu[DefaultDeviceIndex].pGenerator->GetGpuMemoryRequirements(&memReqs); + memReqs.flags.cpuAccess = 0; + memReqs.heaps[0] = Pal::GpuHeap::GpuHeapInvisible; + memReqs.heapCount = 1; -#if DEBUG - for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + for (uint32_t i = 0; i < memReqs.heapCount; ++i) { - VK_ASSERT(m_perGpu[deviceIdx].pGenerator != nullptr); + uint32_t typeIndexBits; - if (deviceIdx != DefaultDeviceIndex) + if (pDevice->GetVkTypeIndexBitsFromPalHeap(memReqs.heaps[i], &typeIndexBits)) { - Pal::GpuMemoryRequirements deviceReqs = {}; - m_perGpu[deviceIdx].pGenerator->GetGpuMemoryRequirements(&deviceReqs); - VK_ASSERT(memcmp(&memReqs, &deviceReqs, sizeof(deviceReqs)) == 0); + pMemoryRequirements->memoryRequirements.memoryTypeBits |= typeIndexBits; } } -#endif +} + +// ===================================================================================================================== +VkResult IndirectCommandsLayout::BindGpuMemory( + const Device* pDevice, + const VkAllocationCallbacks* pAllocator, + Pal::IIndirectCmdGenerator** pGenerators, + Pal::IGpuMemory** pGpuMemory) +{ + VkResult result = VK_SUCCESS; + Pal::Result palResult; - pMemoryRequirements->memoryRequirements.alignment = memReqs.alignment; - pMemoryRequirements->memoryRequirements.size = memReqs.size; + Pal::GpuMemoryRequirements memReqs[MaxPalDevices] = {}; + Pal::GpuMemoryCreateInfo memCreateInfos[MaxPalDevices] = {}; - pMemoryRequirements->memoryRequirements.memoryTypeBits = 0; + size_t totalSize = 0; - for (uint32_t i = 0; i < memReqs.heapCount; ++i) + void* pMemory = nullptr; + + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) { - uint32_t typeIndexBits; + pGenerators[deviceIdx]->GetGpuMemoryRequirements(&memReqs[deviceIdx]); - if (pDevice->GetVkTypeIndexBitsFromPalHeap(memReqs.heaps[i], &typeIndexBits)) + memCreateInfos[deviceIdx].size = memReqs[deviceIdx].size; + memCreateInfos[deviceIdx].alignment = memReqs[deviceIdx].alignment; + memCreateInfos[deviceIdx].priority = Pal::GpuMemPriority::Normal; + memCreateInfos[deviceIdx].heapCount = memReqs[deviceIdx].heapCount; + + for (uint32 i = 0; i < memReqs[deviceIdx].heapCount; ++i) { - pMemoryRequirements->memoryRequirements.memoryTypeBits |= typeIndexBits; + memCreateInfos[deviceIdx].heaps[i] = memReqs[deviceIdx].heaps[i]; + } + + const size_t size = pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(memCreateInfos[deviceIdx], + &palResult); + + if (palResult == Pal::Result::Success) + { + totalSize += size; + } + else + { + result = PalToVkResult(palResult); + break; } } -} -// ===================================================================================================================== -void IndirectCommandsLayout::BindPreprocessBuffer( - VkBuffer buffer, - VkDeviceSize memOffset, - uint32_t deviceIdx) -{ - Buffer* pBuffer = Buffer::ObjectFromHandle(buffer); - Pal::gpusize bufferVirtAddr = pBuffer->PalMemory(deviceIdx)->Desc().gpuVirtAddr + memOffset; + if (result == VK_SUCCESS) + { + pMemory = pAllocator->pfnAllocation(pAllocator->pUserData, + totalSize, + VK_DEFAULT_MEM_ALIGN, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + + if (pMemory == nullptr) + { + result = VK_ERROR_OUT_OF_HOST_MEMORY; + } + } - if (m_perGpu[deviceIdx].preprocessBufferVirtAddr != bufferVirtAddr) + if (result == 
VK_SUCCESS) { - Pal::Result palResult = m_perGpu[deviceIdx].pGenerator->BindGpuMemory(pBuffer->PalMemory(deviceIdx), - memOffset); - VK_ASSERT(palResult == Pal::Result::Success); - m_perGpu[deviceIdx].preprocessBufferVirtAddr = bufferVirtAddr; + void* pPalMemory = pMemory; + + for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++) + { + const size_t size = pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(memCreateInfos[deviceIdx], + &palResult); + + if (palResult == Pal::Result::Success) + { + palResult = pDevice->PalDevice(deviceIdx)->CreateGpuMemory(memCreateInfos[deviceIdx], + pPalMemory, + &pGpuMemory[deviceIdx]); + } + + if (palResult == Pal::Result::Success) + { + // Gpu memory binding for IndirectCmdGenerator to build SRD containing properties and parameter data. + palResult = pGenerators[deviceIdx]->BindGpuMemory(pGpuMemory[deviceIdx], 0); + } + else + { + result = PalToVkResult(palResult); + break; + } + + if (palResult == Pal::Result::Success) + { + pPalMemory = Util::VoidPtrInc(pPalMemory, size); + } + else + { + result = PalToVkResult(palResult); + break; + } + } } + + return result; } // ===================================================================================================================== @@ -368,8 +445,16 @@ VkResult IndirectCommandsLayout::Destroy( { m_perGpu[deviceIdx].pGenerator->Destroy(); } - // It's app's reponsibility to free the preprocess buffer. - m_perGpu[deviceIdx].preprocessBufferVirtAddr = 0; + + if (m_perGpu[deviceIdx].pGpuMemory != nullptr) + { + m_perGpu[deviceIdx].pGpuMemory->Destroy(); + } + } + + if (m_perGpu[DefaultDeviceIndex].pGpuMemory != nullptr) + { + pAllocator->pfnFree(pAllocator->pUserData, m_perGpu[DefaultDeviceIndex].pGpuMemory); } Util::Destructor(this); diff --git a/icd/api/vk_memory.cpp b/icd/api/vk_memory.cpp index 4b99eb59..f2fcf552 100644 --- a/icd/api/vk_memory.cpp +++ b/icd/api/vk_memory.cpp @@ -346,6 +346,7 @@ VkResult Memory::Create( if (pPinnedHostPtr == nullptr) { + vkResult = CreateGpuMemory( pDevice, pAllocator, diff --git a/icd/api/vk_physical_device.cpp b/icd/api/vk_physical_device.cpp index 75fdd9cf..e6d2e5f1 100644 --- a/icd/api/vk_physical_device.cpp +++ b/icd/api/vk_physical_device.cpp @@ -821,6 +821,9 @@ VkResult PhysicalDevice::Initialize() finalizeInfo.supportedFullScreenFrameMetadata.p2pCmdFlag = true; finalizeInfo.supportedFullScreenFrameMetadata.forceSwCfMode = true; finalizeInfo.supportedFullScreenFrameMetadata.postFrameTimerSubmission = true; + + // Need to set all 3 bits to 1 per KMD request. 
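The BindGpuMemory hunk above follows a two-pass pattern: first query each PAL device's GPU memory requirements and sum the host-side object sizes, then make a single host allocation and placement-create each device's IGpuMemory object at successive offsets before binding it to its command generator. A condensed view of that pattern, reusing only the calls visible in the hunk (per-call error handling elided for brevity):

// Pass 1: accumulate the host-side size needed for every device's IGpuMemory object.
size_t totalSize = 0;
for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++)
{
    pGenerators[deviceIdx]->GetGpuMemoryRequirements(&memReqs[deviceIdx]);
    memCreateInfos[deviceIdx].size      = memReqs[deviceIdx].size;
    memCreateInfos[deviceIdx].alignment = memReqs[deviceIdx].alignment;
    totalSize += pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(memCreateInfos[deviceIdx], &palResult);
}

// Pass 2: one host allocation, then placement-create and bind each device's GPU memory.
void* pPalMemory = pAllocator->pfnAllocation(
    pAllocator->pUserData, totalSize, VK_DEFAULT_MEM_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

for (uint32_t deviceIdx = 0; deviceIdx < pDevice->NumPalDevices(); deviceIdx++)
{
    const size_t size = pDevice->PalDevice(deviceIdx)->GetGpuMemorySize(memCreateInfos[deviceIdx], &palResult);
    pDevice->PalDevice(deviceIdx)->CreateGpuMemory(memCreateInfos[deviceIdx], pPalMemory, &pGpuMemory[deviceIdx]);
    pGenerators[deviceIdx]->BindGpuMemory(pGpuMemory[deviceIdx], 0);
    pPalMemory = Util::VoidPtrInc(pPalMemory, size);
}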
+ finalizeInfo.supportedFullScreenFrameMetadata.flipIntervalOverride = 7; } finalizeInfo.internalTexOptLevel = VkToPalTexFilterQuality(settings.vulkanTexFilterQuality); @@ -1323,6 +1326,7 @@ void PhysicalDevice::PopulateFormatProperties() } while (aspectMask != 0); } + } #if VKI_RAY_TRACING @@ -4098,11 +4102,17 @@ bool PhysicalDevice::RayTracingSupported() const } #endif +// ===================================================================================================================== static bool IsKhrCooperativeMatrixSupported( const PhysicalDevice* pPhysicalDevice) { - return ((pPhysicalDevice == nullptr) || - (pPhysicalDevice->PalProperties().gfxipProperties.flags.supportCooperativeMatrix)); + const bool hasHardwareSupport = + ((pPhysicalDevice == nullptr) || + (pPhysicalDevice->PalProperties().gfxipProperties.flags.supportCooperativeMatrix)); + + bool emulateSupport = false; + + return hasHardwareSupport || emulateSupport; } // ===================================================================================================================== @@ -4766,7 +4776,9 @@ void PhysicalDevice::PopulateQueueFamilies() pQueueFamilyProps->minImageTransferGranularity.depth = ((transferGranularityOverride >> 16) & 0xff); } - m_queueFamilyCount++; + { + m_queueFamilyCount++; + } } } @@ -7006,6 +7018,9 @@ size_t PhysicalDevice::GetFeatures2( break; } +#if VKI_RAY_TRACING +#endif + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PAGEABLE_DEVICE_LOCAL_MEMORY_FEATURES_EXT: { auto* pExtInfo = reinterpret_cast(pHeader); @@ -8297,6 +8312,9 @@ void PhysicalDevice::GetDeviceProperties2( break; } +#if VKI_RAY_TRACING +#endif + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROVOKING_VERTEX_PROPERTIES_EXT: { auto* pProps = static_cast(pNext); diff --git a/icd/api/vk_pipeline_layout.cpp b/icd/api/vk_pipeline_layout.cpp index 677a63f9..d11fe4f1 100644 --- a/icd/api/vk_pipeline_layout.cpp +++ b/icd/api/vk_pipeline_layout.cpp @@ -39,6 +39,10 @@ #include "palMetroHash.h" #include "palVectorImpl.h" +#if VKI_RAY_TRACING +#include "raytrace/ray_tracing_device.h" +#endif + namespace vk { @@ -938,7 +942,7 @@ Vkgc::ResourceMappingNodeType PipelineLayout::MapLlpcResourceNodeType( case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: nodeType = Vkgc::ResourceMappingNodeType::DescriptorBufferCompact; break; - case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK: nodeType = Vkgc::ResourceMappingNodeType::InlineBuffer; break; case VK_DESCRIPTOR_TYPE_MUTABLE_EXT: @@ -1281,7 +1285,8 @@ VkResult PipelineLayout::BuildCompactSchemeLlpcPipelineMapping( Vkgc::ShaderStageVertexBit, userDataLayout.specConstBufVertexRegBase, MaxInternalSpecConstBuffSize, - Vkgc::SpecConstInternalBufferBindingId + ShaderStage::ShaderStageVertex, + static_cast(Vkgc::SpecConstInternalBufferBindingId) + + static_cast(ShaderStage::ShaderStageVertex), &pUserDataNodes[userDataNodeCount], &userDataNodeCount, &pResourceNodes[mappingNodeCount], @@ -1294,7 +1299,8 @@ VkResult PipelineLayout::BuildCompactSchemeLlpcPipelineMapping( Vkgc::ShaderStageFragmentBit, userDataLayout.specConstBufFragmentRegBase, MaxInternalSpecConstBuffSize, - Vkgc::SpecConstInternalBufferBindingId + ShaderStage::ShaderStageFragment, + static_cast(Vkgc::SpecConstInternalBufferBindingId) + + static_cast(ShaderStage::ShaderStageFragment), &pUserDataNodes[userDataNodeCount], &userDataNodeCount, &pResourceNodes[mappingNodeCount], diff --git a/icd/api/vk_query.cpp b/icd/api/vk_query.cpp index ccd84955..9822879d 100644 --- a/icd/api/vk_query.cpp +++ 
b/icd/api/vk_query.cpp @@ -559,7 +559,8 @@ VkResult QueryPoolWithStorageView::Initialize( m_pStorageView[deviceIdx] = Util::VoidPtrInc(pMemory, apiSize + (viewSize * deviceIdx)); - m_pDevice->PalDevice(deviceIdx)->CreateUntypedBufferViewSrds(1, &bufferViewInfo, m_pStorageView[deviceIdx]); + m_pDevice->PalDevice(deviceIdx)-> + CreateUntypedBufferViewSrds(1, &bufferViewInfo, m_pStorageView[deviceIdx]); } } else @@ -1008,7 +1009,7 @@ VkResult AccelerationStructureQueryPool::GetResults( { while (!ready) { - Util::SleepMs(0u); + Util::Sleep(std::chrono::milliseconds{ 0 }); value = GetAccelerationStructureQueryResults( m_queryType, diff --git a/icd/api/vk_queue.cpp b/icd/api/vk_queue.cpp index 71d28ffa..79196bd8 100644 --- a/icd/api/vk_queue.cpp +++ b/icd/api/vk_queue.cpp @@ -1152,9 +1152,6 @@ VkResult Queue::Submit( const void* pNext = submitInfo.pNext; -#if VKI_RAY_TRACING -#endif - while (pNext != nullptr) { const VkStructHeader* pHeader = static_cast(pNext); @@ -1505,7 +1502,9 @@ VkResult Queue::Submit( if (palResult == Pal::Result::Success) { - palResult = PalQueueSubmit(m_pDevice, PalTmzQueue(deviceIdx), palSubmitInfo); + { + palResult = PalQueueSubmit(m_pDevice, PalTmzQueue(deviceIdx), palSubmitInfo); + } } VK_ASSERT(palResult == Pal::Result::Success); @@ -1530,7 +1529,9 @@ VkResult Queue::Submit( if (palResult == Pal::Result::Success) { - palResult = PalQueueSubmit(m_pDevice, PalQueue(deviceIdx), palSubmitInfo); + { + palResult = PalQueueSubmit(m_pDevice, PalQueue(deviceIdx), palSubmitInfo); + } } VK_ASSERT(palResult == Pal::Result::Success); @@ -1632,8 +1633,6 @@ VkResult Queue::Submit( DebugPrintf::PostQueueSubmit(m_pDevice, this, pCmdBuffers, cmdBufferCount); -#if VKI_RAY_TRACING -#endif } } @@ -1986,18 +1985,10 @@ VkResult Queue::Present( pPresentRects[r] = VkToPalRect(rect2D); } presentInfo.rectangleCount = pVkRegion->rectangleCount; - presentInfo.pRectangles = pPresentRects; + presentInfo.pRectangles = pPresentRects; } } - // Fill in present information and obtain the PAL memory of the presentable image. - Pal::IGpuMemory* pGpuMemory = pSwapChain->UpdatePresentInfo(presentationDeviceIdx, - imageIndex, - &presentInfo, - m_flipStatus.flipFlags); - - CmdBufState* pCmdBufState = m_pCmdBufferRing->AcquireCmdBuffer(m_pDevice, presentationDeviceIdx); - // Ensure metadata is available before post processing. if (pSwapChain->GetFullscreenMgr() != nullptr) { @@ -2008,6 +1999,15 @@ VkResult Queue::Present( VK_ASSERT(palResult == Pal::Result::Success); } + // Fill in present information and obtain the PAL memory of the presentable image. + Pal::IGpuMemory* pGpuMemory = pSwapChain->UpdatePresentInfo(presentationDeviceIdx, + imageIndex, + &presentInfo, + m_flipStatus.flipFlags, + m_palFrameMetadataControl); + + CmdBufState* pCmdBufState = m_pCmdBufferRing->AcquireCmdBuffer(m_pDevice, presentationDeviceIdx); + // This must happen after the fullscreen manager has updated its overlay information and before the software // compositor has an opportunity to copy the presentable image in order to include the overlay itself. 
bool hasPostProcessing = BuildPostProcessCommands(presentationDeviceIdx, @@ -2124,9 +2124,6 @@ VkResult Queue::Present( } } -#if VKI_RAY_TRACING -#endif - return result; } diff --git a/icd/api/vk_swapchain.cpp b/icd/api/vk_swapchain.cpp index 5436a013..fce1dad2 100644 --- a/icd/api/vk_swapchain.cpp +++ b/icd/api/vk_swapchain.cpp @@ -52,6 +52,8 @@ #include +using namespace std::chrono_literals; + namespace vk { @@ -113,7 +115,7 @@ VkResult SwapChain::Create( // the old swapchain should be flaged as deprecated no matter whether the new swapchain is created successfully. if (pCreateInfo->oldSwapchain != VK_NULL_HANDLE) { - SwapChain::ObjectFromHandle(pCreateInfo->oldSwapchain)->MarkAsDeprecated(pAllocator); + SwapChain::ObjectFromHandle(pCreateInfo->oldSwapchain)->MarkAsDeprecated(true, pAllocator); } // Find the index of the device associated with the PAL screen and therefore, the PAL swap chain to be created @@ -672,6 +674,7 @@ VkResult SwapChain::SetupAutoStereo( // Destroy Vulkan swap chain. VkResult SwapChain::Destroy(const VkAllocationCallbacks* pAllocator) { + // Make sure the swapchain is idle and safe to be destroyed. if (m_pPalSwapChain != nullptr) { @@ -718,9 +721,9 @@ VkResult SwapChain::AcquireNextImage( const VkStructHeader* pAcquireInfo, uint32_t* pImageIndex) { - VkFence fence = VK_NULL_HANDLE; - VkSemaphore semaphore = VK_NULL_HANDLE; - uint64_t timeout = UINT64_MAX; + VkFence fence = VK_NULL_HANDLE; + VkSemaphore semaphore = VK_NULL_HANDLE; + std::chrono::nanoseconds timeout = std::chrono::nanoseconds::max(); const RuntimeSettings& settings = m_pDevice->GetRuntimeSettings(); @@ -740,7 +743,7 @@ VkResult SwapChain::AcquireNextImage( { semaphore = pVkAcquireNextImageInfoKHR->semaphore; fence = pVkAcquireNextImageInfoKHR->fence; - timeout = pVkAcquireNextImageInfoKHR->timeout; + timeout = Uint64ToChronoNano(pVkAcquireNextImageInfoKHR->timeout); Util::BitMaskScanForward(&presentationDeviceIdx, pVkAcquireNextImageInfoKHR->deviceMask); @@ -768,7 +771,7 @@ VkResult SwapChain::AcquireNextImage( if (result == VK_SUCCESS) { - acquireInfo.timeout = Uint64ToChronoNano(timeout); + acquireInfo.timeout = timeout; acquireInfo.pSemaphore = (pSemaphore != nullptr) ? pSemaphore->PalSemaphore(DefaultDeviceIndex) : nullptr; @@ -801,7 +804,7 @@ VkResult SwapChain::AcquireNextImage( result = VK_ERROR_OUT_OF_DATE_KHR; } - if ((timeout == 0) && (result == VK_TIMEOUT)) + if ((timeout == 0s) && (result == VK_TIMEOUT)) { result = VK_NOT_READY; } @@ -887,10 +890,11 @@ bool SwapChain::IsFullscreenOrEfsePresent() const // ===================================================================================================================== // Fills in the PAL swap chain present info with the appropriate image to present and returns its GPU memory. Pal::IGpuMemory* SwapChain::UpdatePresentInfo( - uint32_t deviceIdx, - uint32_t imageIndex, - Pal::PresentSwapChainInfo* pPresentInfo, - const Pal::FlipStatusFlags& flipFlags) + uint32_t deviceIdx, + uint32_t imageIndex, + Pal::PresentSwapChainInfo* pPresentInfo, + const Pal::FlipStatusFlags& flipFlags, + const Pal::PerSourceFrameMetadataControl& metadataFlags) { Pal::IGpuMemory* pSrcImageGpuMemory = nullptr; @@ -911,6 +915,7 @@ Pal::IGpuMemory* SwapChain::UpdatePresentInfo( ) { m_pFullscreenMgr->TryEnterExclusive(this); + } // Always fallback to windowed if FSE is not acquired to avoid missing presents. 
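The AcquireNextImage changes above carry the timeout as std::chrono::nanoseconds rather than a raw uint64_t, converting the Vulkan value with Uint64ToChronoNano and comparing against the 0s literal. A standalone illustration of why a saturating conversion is needed: UINT64_MAX ("wait forever") does not fit chrono's signed representation. The helper below is a sketch of the assumed behavior, not the driver's implementation:

#include <chrono>
#include <cstdint>
#include <cstdio>

// Saturate at the largest representable duration so UINT64_MAX does not overflow.
static std::chrono::nanoseconds Uint64ToChronoNanoSketch(uint64_t timeoutNs)
{
    constexpr uint64_t maxRep = static_cast<uint64_t>(std::chrono::nanoseconds::max().count());
    return std::chrono::nanoseconds{ (timeoutNs > maxRep) ? maxRep : timeoutNs };
}

int main()
{
    using namespace std::chrono_literals;

    const auto infinite = Uint64ToChronoNanoSketch(UINT64_MAX); // clamps to nanoseconds::max()
    const auto poll     = Uint64ToChronoNanoSketch(0);          // compares equal to 0s / 0ns

    std::printf("infinite wait saturated: %d, zero timeout: %d\n",
                infinite == std::chrono::nanoseconds::max(), poll == 0ns);
    return 0;
}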
@@ -1118,27 +1123,31 @@ bool SwapChain::IsSuboptimal(uint32_t deviceIdx) // ===================================================================================================================== void SwapChain::MarkAsDeprecated( + bool releaseResources, const VkAllocationCallbacks* pAllocator) { m_deprecated = true; - if (m_pPalSwapChain != nullptr) + if (releaseResources) { - m_pPalSwapChain->WaitIdle(); - - for (uint32_t i = 0; i < m_properties.imageCount; ++i) + if (m_pPalSwapChain != nullptr) { - // Remove memory references to presentable image memory and destroy the images and image memory. - Memory::ObjectFromHandle(m_properties.imageMemory[i])->Free(m_pDevice, pAllocator); - Image::ObjectFromHandle(m_properties.images[i])->Destroy(m_pDevice, pAllocator); - } + m_pPalSwapChain->WaitIdle(); - m_pPalSwapChain->Destroy(); + for (uint32_t i = 0; i < m_properties.imageCount; ++i) + { + // Remove memory references to presentable image memory and destroy the images and image memory. + Memory::ObjectFromHandle(m_properties.imageMemory[i])->Free(m_pDevice, pAllocator); + Image::ObjectFromHandle(m_properties.images[i])->Destroy(m_pDevice, pAllocator); + } - // Set to null to avoid double deleting when the actual object gets destroyed. - m_pPalSwapChain = nullptr; - } + m_pPalSwapChain->Destroy(); + + // Set to null to avoid double deleting when the actual object gets destroyed. + m_pPalSwapChain = nullptr; + } + } } // ===================================================================================================================== diff --git a/icd/api/vk_utils.cpp b/icd/api/vk_utils.cpp index 45605b97..097adbe2 100644 --- a/icd/api/vk_utils.cpp +++ b/icd/api/vk_utils.cpp @@ -75,7 +75,7 @@ void WaitIdleForDebugger( // Timeout the driver to give debuggers a chance to load all of the symbols if (debugTimeout != 0) { - Util::SleepMs(debugTimeout); + Util::Sleep(std::chrono::milliseconds{ debugTimeout }); } } } diff --git a/icd/res/ver.h b/icd/res/ver.h index 255caefd..6cfc716e 100644 --- a/icd/res/ver.h +++ b/icd/res/ver.h @@ -36,7 +36,7 @@ #define VERSION_MAJOR_STR MAKE_VERSION_STRING(VULKAN_ICD_MAJOR_VERSION) "\0" // Bump up after each promotion to mainline -#define VULKAN_ICD_BUILD_VERSION 304 +#define VULKAN_ICD_BUILD_VERSION 308 // String version is needed with leading zeros and extra termination (unicode) #define VERSION_NUMBER_MINOR VULKAN_ICD_BUILD_VERSION @@ -45,7 +45,7 @@ // These values specify the driver ID and driver info string #define VULKAN_DRIVER_ID VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR // "AMDOPEN" #define VULKAN_DRIVER_NAME_STR "AMD open-source driver" -#define VULKAN_DRIVER_INFO_STR "2024.Q2.1" +#define VULKAN_DRIVER_INFO_STR "2024.Q2.2" #define VULKAN_DRIVER_INFO_STR_LLPC "(LLPC)" // These values tell which version of the conformance test the driver is compliant against diff --git a/icd/settings/settings.cpp b/icd/settings/settings.cpp index 8679fb01..c5e0cd7a 100644 --- a/icd/settings/settings.cpp +++ b/icd/settings/settings.cpp @@ -176,7 +176,11 @@ void VulkanSettingsLoader::OverrideSettingsBySystemInfo() char executableName[PATH_MAX]; char executablePath[PATH_MAX]; utils::GetExecutableNameAndPath(executableName, executablePath); - sprintf(m_settings.pipelineDumpDir, "%s/%s", m_settings.pipelineDumpDir, executableName); + Util::Snprintf(m_settings.pipelineDumpDir, + sizeof(m_settings.pipelineDumpDir), + "%s/%s", + m_settings.pipelineDumpDir, + executableName); } MakeAbsolutePath(m_settings.pipelineDumpDir, sizeof(m_settings.pipelineDumpDir), @@ -809,7 +813,6 @@ VkResult 
VulkanSettingsLoader::OverrideProfiledSettings( #endif m_settings.enableUberFetchShader = true; - } if (appProfile == AppProfile::Source2Engine) @@ -823,7 +826,6 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.anisoThreshold = 1.0f; m_settings.disableMsaaStencilShaderRead = true; - } if (appProfile == AppProfile::Talos) @@ -1353,15 +1355,18 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( m_settings.forceDepthClampBasedOnZExport = true; } + if ((appProfile == AppProfile::DxvkHaloInfiniteLauncher) || + (appProfile == AppProfile::DxvkTf2) #ifndef ICD_X64_BUILD - if (appProfile == AppProfile::DXVK) + || (appProfile == AppProfile::DXVK) +#endif + ) { - // DXVK Tropic4/GTA4 page fault when GPL is enabled. + // DXVK Tropic4, GTA4, Halo Infinite Launcher page fault when GPL is enabled. // It looks incorrect pipeline layout is used. Force indirect can make optimized pipeline layout compatible // with fast-linked pipeline. m_settings.pipelineLayoutSchemeSelectionStrategy = PipelineLayoutSchemeSelectionStrategy::ForceIndirect; } -#endif if (appProfile == AppProfile::AshesOfTheSingularity) { @@ -1602,6 +1607,7 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( if (appProfile == AppProfile::Vkd3dEngine) { m_settings.exportNvComputeShaderDerivatives = true; + m_settings.exportNvDeviceGeneratedCommands = true; m_settings.exportImageCompressionControl = true; } @@ -1610,6 +1616,7 @@ VkResult VulkanSettingsLoader::OverrideProfiledSettings( { m_settings.disableSingleMipAnisoOverride = false; } + } return result; @@ -1812,7 +1819,6 @@ void VulkanSettingsLoader::ValidateSettings() { buildMode = BvhBuildModePLOC; } - m_settings.bvhBuildModeOverrideBlas = buildMode; m_settings.bvhBuildModeOverrideTlas = buildMode; } @@ -1867,6 +1873,12 @@ void VulkanSettingsLoader::ValidateSettings() m_settings.indirectCalleeIntersection = Util::Min(255U, m_settings.indirectCalleeIntersection); m_settings.indirectCalleeCallable = Util::Min(255U, m_settings.indirectCalleeCallable); m_settings.indirectCalleeTraceRays = Util::Min(255U, m_settings.indirectCalleeTraceRays); + + // Force invalid accel struct to skip traversal if toss point is traversal or greater + if (m_settings.rtTossPoint >= RtTossPointTraversal) + { + m_settings.forceInvalidAccelStruct = true; + } #endif // SkipDstCacheInv should not be enabled by default when acquire-release barrier interface is used, because PAL diff --git a/icd/settings/settings_xgl.json b/icd/settings/settings_xgl.json index 0f88fd4f..41ab5ff0 100644 --- a/icd/settings/settings_xgl.json +++ b/icd/settings/settings_xgl.json @@ -218,7 +218,7 @@ }, { "Name": "BvhBuildModeAuto", - "Value": 3, + "Value": 4, "Description": "Only for override builds. If set, falls back to regular build options." } ] @@ -1037,6 +1037,21 @@ "Scope": "Driver", "Type": "bool" }, + { + "Name": "EnableImageMsaaLoadOpt", + "Description": "Enable image MSAA load optimization on Gfx11.", + "Tags": [ + "Pipeline Options" + ], + "BuildTypes": [ + "VKI_BUILD_GFX11" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Scope": "Driver" + }, { "Name": "DisableLoopUnrolls", "Description": "Disable loop unrolls. This modifies the default pipeline state and can be overwritten by fine-grain override settings.", @@ -1445,7 +1460,7 @@ "SPIRV Options" ], "Defaults": { - "Default": false + "Default": true }, "Scope": "Driver", "Type": "bool", @@ -2720,7 +2735,7 @@ { "Name": "RtTraceRayCounterMode", "Type": "enum", - "Description": "Enable ray tracing counters. 
Written to the directory specified by RayTracingCapturePath. Press the RayTracingCaptureHotKey to dump when enabled.", + "Description": "Enable ray tracing counters. Written to the directory specified by RtDumpDir. Press the RtCaptureHotKey to dump when enabled.", "Scope": "Driver", "Tags": [ "Ray Tracing" @@ -3239,6 +3254,36 @@ "Name": "RtTriangleSplittingPriority", "Scope": "Driver" }, + { + "Name": "EnableMergedEncodeBuild", + "Description": "Enable merged encode and build dispatch.", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Scope": "Driver" + }, + { + "Name": "EnableMergedEncodeUpdate", + "Description": "Enable merged encode and update dispatch.", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Scope": "Driver" + }, { "Name": "RtEnableMortonCode30", "Description": "Enable Morton Code 30 bits", @@ -4191,44 +4236,6 @@ "Type": "bool", "Scope": "Driver" }, - { - "Name": "RtGpuDebugFlags", - "Description": "Gpu Debug flags for GPU RT Debug feature (asserts/printf)", - "Tags": [ - "Ray Tracing" - ], - "Defaults": { - "Default": "NoFlag" - }, - "ValidValues": { - "IsEnum": true, - "Name": "RtGpuDebugFlags", - "Values": [ - { - "Name": "NoFlag", - "Value": 0, - "Description": "Disable all gpu debug flags" - }, - { - "Name": "HostAssert", - "Value": 1, - "Description": "Enable Asserts" - }, - { - "Name": "HostPrint", - "Value": 2, - "Description": "Enable Prints" - }, - { - "Name": "ShaderHalt", - "Value": 4, - "Description": "Enable Halt shader" - } - ] - }, - "Type": "enum", - "Scope": "Driver" - }, { "Name": "EnableRemapScratchBuffer", "Description": "Enable Remapping BVH2 Data from ScratchBuffer to ResultBuffer", @@ -4528,6 +4535,47 @@ "Type": "bool", "Scope": "Driver" }, + { + "Name": "BatchBvhBuilds", + "Description": "Group BVH builds and updates based on explicit app-provided batches or our own implicit batches.", + "Tags": [ + "Ray Tracing" + ], + "BuildTypes": [ + "VKI_RAY_TRACING" + ], + "Defaults": { + "Default": "BatchBvhModeDisabled" + }, + "ValidValues": { + "IsEnum": true, + "Values": [ + { + "Name": "BatchBvhModeDisabled", + "Value": 0, + "Description": "Disables BVH batching" + }, + { + "Name": "BatchBvhModeExplicit", + "Value": 1, + "Description": "Relies on batching done by application." + }, + { + "Name": "BatchBvhModeImplicit", + "Value": 2, + "Description": "Enables our BvhBatchLayer for implicit BVH batching. Adds some overhead, but could be beneficial for apps written sub-optimally." + }, + { + "Name": "BatchBvhModeImplicitAndLog", + "Value": 3, + "Description": "Same as BatchBvhModeImplicit, but also logs layer activity to [AMD_DEBUG_DIR]/BvhBatchLog.txt. AMD_DEBUG_DIR must be set when this option is enabled (otherwise initialization will fail)." + } + ], + "Name": "BatchBvhModes" + }, + "Type": "enum", + "Scope": "Driver" + }, { "Name": "DbgBarrierPostCmdEnable", "Description": "Triggers a CmdBarrier call after any command in the given mask. The barrier behavior is controlled by the other DbgBarrierPost* settings in this category. Requires VK_ENABLE_DEBUG_BARRIERS=1 to take effect. 
0x8FFFFFFF: All commands (heavyweight option)", @@ -7385,6 +7433,18 @@ "Type": "bool", "Scope": "Driver" }, + { + "Description": "If true, disables ReZ for pipelines that only read/write depth", + "Tags": [ + "Optimization" + ], + "Defaults": { + "Default": false + }, + "Type": "bool", + "Name": "DisableDepthOnlyReZ", + "Scope": "Driver" + }, { "Name": "Ac01WaNotNeeded", "Description": "Allows use AC01 fast clears. Please also check setting: Ac01WaState.", @@ -8679,8 +8739,8 @@ "Scope": "Driver" }, { - "Name": "IFHRayTracing", - "Description": "Makes the driver effectively skip the BVH build by reducing prim count to 0.", + "Name": "RtTossPoint", + "Description": "Set toss point for raytracing.", "Tags": [ "Ray Tracing" ], @@ -8688,9 +8748,40 @@ "VKI_RAY_TRACING" ], "Defaults": { - "Default": false + "Default": "RtTossPointDisabled" }, - "Type": "bool", + "ValidValues": { + "IsEnum": true, + "Name": "RtTossPointEnums", + "Values": [ + { + "Name": "RtTossPointDisabled", + "Value": 0, + "Description": "No toss points, raytracing executes normally" + }, + { + "Name": "RtTossPointTraversal", + "Value": 1, + "Description": "Disable traversal" + }, + { + "Name": "RtTossPointTlas", + "Value": 2, + "Description": "Disable traversal, TLAS build/update" + }, + { + "Name": "RtTossPointBlasUpdate", + "Value": 3, + "Description": "Disable traversal, TLAS build/update, BLAS update" + }, + { + "Name": "RtTossPointBlasBuild", + "Value": 4, + "Description": "Disable traversal, TLAS build/update, BLAS update, BLAS build" + } + ] + }, + "Type": "enum", "Scope": "Driver" }, { @@ -8709,7 +8800,7 @@ "Scope": "Driver" }, { - "Name": "GpuRtGpuDebugFlags", + "Name": "RtGpuDebugFlags", "Description": "GPURT GPU debug flags", "Tags": [ "Ray Tracing"