Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

shader cache polishing #748

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 28 additions & 60 deletions include/nbl/asset/utils/IShaderCompiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted
{
system::path absolutePath = {};
std::string contents = {};
std::array<uint64_t, 4> hash = {}; // TODO: we're not yet using IFile::getPrecomputedHash(), so for builtins we can maybe use that in the future
core::blake3_hash_t hash = {}; // TODO: we're not yet using IFile::getPrecomputedHash(), so for builtins we can maybe use that in the future
// Could be used in the future for early rejection of cache hit
//nbl::system::IFileBase::time_point_t lastWriteTime = {};

Expand Down Expand Up @@ -183,9 +183,8 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted

public:
// Used to check compatibility of Caches before reading
constexpr static inline std::string_view VERSION = "1.0.0";
constexpr static inline std::string_view VERSION = "1.1.0";

using hash_t = std::array<uint64_t,4>;
static auto const SHADER_BUFFER_SIZE_BYTES = sizeof(uint64_t) / sizeof(uint8_t); // It's obviously 8

struct SEntry
Expand All @@ -196,11 +195,9 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted
{
public:
// Perf note: hashing while preprocessor lexing is likely to be slower than just hashing the whole array like this
inline SPreprocessingDependency(const system::path& _requestingSourceDir, const std::string_view& _identifier, const std::string_view& _contents, bool _standardInclude, std::array<uint64_t, 4> _hash) :
requestingSourceDir(_requestingSourceDir), identifier(_identifier), contents(_contents), standardInclude(_standardInclude), hash(_hash)
{
assert(!_contents.empty());
}
inline SPreprocessingDependency(const system::path& _requestingSourceDir, const std::string_view& _identifier, bool _standardInclude, core::blake3_hash_t _hash) :
requestingSourceDir(_requestingSourceDir), identifier(_identifier), standardInclude(_standardInclude), hash(_hash)
{}

inline SPreprocessingDependency(SPreprocessingDependency&) = default;
inline SPreprocessingDependency& operator=(SPreprocessingDependency&) = delete;
Expand All @@ -218,11 +215,8 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted
// path or identifier
system::path requestingSourceDir = "";
std::string identifier = "";
// file contents
// TODO: change to `core::vector<uint8_t>` a compressed blob of LZMA, and store all contents together in the `SEntry`
std::string contents = "";
// hash of the contents - used to check against a found_t
std::array<uint64_t, 4> hash = {};
core::blake3_hash_t hash = {};
// If true, then `getIncludeStandard` was used to find, otherwise `getIncludeRelative`
bool standardInclude = false;
};
Expand All @@ -248,6 +242,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted
private:
friend class SCompilerArgs;
friend class SEntry;
friend class CCache;
friend void to_json(nlohmann::json&, const SPreprocessorArgs&);
friend void from_json(const nlohmann::json&, SPreprocessorArgs&);

Expand All @@ -271,7 +266,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted
std::string sourceIdentifier;
std::vector<SMacroDefinition> extraDefines;
};
// TODO: SPreprocessorArgs could just be folded into `SCompilerArgs` to have less classes and operators
// TODO: SPreprocessorArgs could just be folded into `SCompilerArgs` to have fewer classes and operators
struct SCompilerArgs final
{
public:
Expand All @@ -290,6 +285,7 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted

private:
friend class SEntry;
friend class CCache;
friend void to_json(nlohmann::json&, const SCompilerArgs&);
friend void from_json(const nlohmann::json&, SCompilerArgs&);

Expand Down Expand Up @@ -351,33 +347,40 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted

// Now add the mainFileContents and produce both lookup and early equality rejection hashes
hashable.insert(hashable.end(), mainFileContents.begin(), mainFileContents.end());
hash = nbl::core::XXHash_256(hashable.data(), hashable.size());
lookupHash = hash[0];
for (auto i = 1u; i < 4; i++) {
core::hash_combine<uint64_t>(lookupHash, hash[i]);
}

core::blake3_hasher hasher;
hasher.update(hashable.data(), hashable.size());
hash = static_cast<core::blake3_hash_t>(hasher);
lookupHash = std::hash<core::blake3_hash_t>{}(hash);
}

// Needed to get the vector deserialization automatically
inline SEntry() {}

// Making the copy constructor deep-copy everything but the shader
inline SEntry(const SEntry& other)
: mainFileContents(other.mainFileContents), compilerArgs(other.compilerArgs), hash(other.hash), lookupHash(other.lookupHash),
dependencies(other.dependencies), cpuShader(other.cpuShader) {}
inline SEntry(const SEntry& other)
: mainFileContents(other.mainFileContents), compilerArgs(other.compilerArgs), hash(other.hash),
lookupHash(other.lookupHash), dependencies(other.dependencies), spirv(other.spirv),
uncompressedContentHash(other.uncompressedContentHash), uncompressedSize(other.uncompressedSize) {}

inline SEntry& operator=(SEntry& other) = delete;
inline SEntry(SEntry&& other) = default;
// Used for late initialization while looking up a cache, so as not to always initialize an entry even if caching was not requested
inline SEntry& operator=(SEntry&& other) = default;

void setContent(const asset::ICPUBuffer* uncompressedSpirvBuffer, dependency_container_t&& dependencies);

core::smart_refctd_ptr<ICPUShader> decompressShader() const;

// TODO: make some of these private
std::string mainFileContents;
SCompilerArgs compilerArgs;
std::array<uint64_t,4> hash;
core::blake3_hash_t hash;
size_t lookupHash;
dependency_container_t dependencies;
core::smart_refctd_ptr<asset::ICPUShader> cpuShader;
core::smart_refctd_ptr<asset::ICPUBuffer> spirv;
core::blake3_hash_t uncompressedContentHash;
size_t uncompressedSize;
};

inline void insert(SEntry&& entry)
Expand Down Expand Up @@ -429,48 +432,13 @@ class NBL_API2 IShaderCompiler : public core::IReferenceCounted

};

using EntrySet = core::unordered_multiset<SEntry, Hash, KeyEqual>;
using EntrySet = core::unordered_set<SEntry, Hash, KeyEqual>;
EntrySet m_container;

NBL_API2 EntrySet::const_iterator find_impl(const SEntry& mainFile, const CIncludeFinder* finder) const;
};

inline core::smart_refctd_ptr<ICPUShader> compileToSPIRV(const std::string_view code, const SCompilerOptions& options) const
{
CCache::SEntry entry;
std::vector<CCache::SEntry::SPreprocessingDependency> dependencies;
if (options.readCache || options.writeCache)
entry = std::move(CCache::SEntry(code, options));

if (options.readCache)
{
auto found = options.readCache->find_impl(entry, options.preprocessorOptions.includeFinder);
if (found != options.readCache->m_container.end())
{
if (options.writeCache)
{
CCache::SEntry writeEntry = *found;
options.writeCache->insert(std::move(writeEntry));
}
return found->cpuShader;
}
}

auto retVal = compileToSPIRV_impl(code, options, options.writeCache ? &dependencies : nullptr);
// compute the SPIR-V shader content hash
{
auto backingBuffer = retVal->getContent();
const_cast<ICPUBuffer*>(backingBuffer)->setContentHash(backingBuffer->computeContentHash());
}

if (options.writeCache)
{
entry.dependencies = std::move(dependencies);
entry.cpuShader = retVal;
options.writeCache->insert(std::move(entry));
}
return retVal;
}
core::smart_refctd_ptr<ICPUShader> compileToSPIRV(const std::string_view code, const SCompilerOptions& options) const;

inline core::smart_refctd_ptr<ICPUShader> compileToSPIRV(const char* code, const SCompilerOptions& options) const
{
Expand Down
120 changes: 109 additions & 11 deletions src/nbl/asset/utils/IShaderCompiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
#include <regex>
#include <iterator>

#include <lzma/C/LzmaEnc.h>
#include <lzma/C/LzmaDec.h>

using namespace nbl;
using namespace nbl::asset;

Expand All @@ -22,6 +25,42 @@ IShaderCompiler::IShaderCompiler(core::smart_refctd_ptr<system::ISystem>&& syste
m_defaultIncludeFinder = core::make_smart_refctd_ptr<CIncludeFinder>(core::smart_refctd_ptr(m_system));
}

// Compiles `code` to SPIR-V, going through the shader caches named in `options`.
//
// - If `options.readCache` is set and holds a matching entry, the cached shader is decompressed and
//   returned without compiling; when `options.writeCache` is also set the entry is copied into it.
// - Otherwise the source is compiled with `compileToSPIRV_impl`, the content hash of the resulting
//   SPIR-V buffer is computed, and (if `options.writeCache` is set) the result is inserted there.
//
// Returns the compiled (or cached) shader, or nullptr if no shader could be produced.
//
// NOTE: this definition must NOT be marked `inline`: the function is declared without `inline` in the
// header and is called from other translation units (e.g. through the inline `const char*` overload),
// so an `inline` definition living only in this .cpp may never be emitted and fail to link.
core::smart_refctd_ptr<ICPUShader> nbl::asset::IShaderCompiler::compileToSPIRV(const std::string_view code, const SCompilerOptions& options) const
{
	CCache::SEntry entry;
	std::vector<CCache::SEntry::SPreprocessingDependency> dependencies;
	// Late initialization: only build (and hash) the lookup entry when a cache is actually involved
	if (options.readCache || options.writeCache)
		entry = CCache::SEntry(code, options);

	if (options.readCache)
	{
		auto found = options.readCache->find_impl(entry, options.preprocessorOptions.includeFinder);
		if (found != options.readCache->m_container.end())
		{
			if (options.writeCache)
			{
				// Propagate the hit so the write cache ends up containing everything that was used
				CCache::SEntry writeEntry = *found;
				options.writeCache->insert(std::move(writeEntry));
			}
			return found->decompressShader();
		}
	}

	// Dependencies only need to be collected when the result is going to be cached
	auto retVal = compileToSPIRV_impl(code, options, options.writeCache ? &dependencies : nullptr);
	// NOTE(review): `compileToSPIRV_impl` is not visible here; guard in case it reports failure with a
	// null pointer, so we neither dereference it nor cache a failed compilation.
	if (!retVal)
		return nullptr;

	// compute the SPIR-V shader content hash
	{
		auto backingBuffer = retVal->getContent();
		const_cast<ICPUBuffer*>(backingBuffer)->setContentHash(backingBuffer->computeContentHash());
	}

	if (options.writeCache)
	{
		entry.setContent(retVal->getContent(), std::move(dependencies));
		options.writeCache->insert(std::move(entry));
	}
	return retVal;
}

std::string IShaderCompiler::preprocessShader(
system::IFile* sourcefile,
IShader::E_SHADER_STAGE stage,
Expand Down Expand Up @@ -116,7 +155,10 @@ auto IShaderCompiler::CIncludeFinder::getIncludeStandard(const system::path& req
retVal = std::move(contents);
else retVal = m_defaultFileSystemLoader->getInclude(requestingSourceDir.string(), includeName);

retVal.hash = nbl::core::XXHash_256((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t)));

core::blake3_hasher hasher;
hasher.update((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t)));
retVal.hash = static_cast<core::blake3_hash_t>(hasher);
return retVal;
}

Expand All @@ -129,7 +171,10 @@ auto IShaderCompiler::CIncludeFinder::getIncludeRelative(const system::path& req
if (auto contents = m_defaultFileSystemLoader->getInclude(requestingSourceDir.string(), includeName))
retVal = std::move(contents);
else retVal = std::move(trySearchPaths(includeName));
retVal.hash = nbl::core::XXHash_256((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t)));

core::blake3_hasher hasher;
hasher.update((uint8_t*)(retVal.contents.data()), retVal.contents.size() * (sizeof(char) / sizeof(uint8_t)));
retVal.hash = static_cast<core::blake3_hash_t>(hasher);
return retVal;
}

Expand Down Expand Up @@ -218,7 +263,10 @@ auto IShaderCompiler::CIncludeFinder::tryIncludeGenerators(const std::string& in

core::smart_refctd_ptr<asset::ICPUShader> IShaderCompiler::CCache::find(const SEntry& mainFile, const IShaderCompiler::CIncludeFinder* finder) const
{
return find_impl(mainFile, finder)->cpuShader;
const auto found = find_impl(mainFile, finder);
if (found==m_container.end())
return nullptr;
return found->decompressShader();
}

IShaderCompiler::CCache::EntrySet::const_iterator IShaderCompiler::CCache::find_impl(const SEntry& mainFile, const IShaderCompiler::CIncludeFinder* finder) const
Expand All @@ -238,7 +286,7 @@ IShaderCompiler::CCache::EntrySet::const_iterator IShaderCompiler::CCache::find_
else
header = finder->getIncludeRelative(dependency.requestingSourceDir, dependency.identifier);

if (header.hash != dependency.hash || header.contents != dependency.contents)
if (header.hash != dependency.hash)
{
allDependenciesMatch = false;
break;
Expand Down Expand Up @@ -267,10 +315,10 @@ core::smart_refctd_ptr<ICPUBuffer> IShaderCompiler::CCache::serialize() const
// We keep a copy of the offsets and the sizes of each shader. This is so that later on, when we add the shaders to the buffer after json creation
// (where the params array has been moved) we don't have to read the json to get the offsets again
offsets[i] = shaderBufferSize;
sizes[i] = entry.cpuShader->getContent()->getSize();
sizes[i] = entry.spirv->getSize();

// And add the params to the shader creation parameters array
shaderCreationParams.emplace_back(entry.cpuShader->getStage(), entry.cpuShader->getContentType(), entry.cpuShader->getFilepathHint(), sizes[i], shaderBufferSize);
shaderCreationParams.emplace_back(entry.compilerArgs.stage, entry.compilerArgs.preprocessorArgs.sourceIdentifier.data(), sizes[i], shaderBufferSize);
// Enlarge the shader buffer by the size of the current shader
shaderBufferSize += sizes[i];
i++;
Expand All @@ -294,7 +342,7 @@ core::smart_refctd_ptr<ICPUBuffer> IShaderCompiler::CCache::serialize() const
// Loop over entries again, adding each one's shader to the buffer.
i = 0u;
for (auto& entry : m_container) {
memcpy(retVal.data() + SHADER_BUFFER_SIZE_BYTES + offsets[i], entry.cpuShader->getContent()->getPointer(), sizes[i]);
memcpy(retVal.data() + SHADER_BUFFER_SIZE_BYTES + offsets[i], entry.spirv->getPointer(), sizes[i]);
i++;
}

Expand Down Expand Up @@ -324,7 +372,6 @@ core::smart_refctd_ptr<IShaderCompiler::CCache> IShaderCompiler::CCache::deseria
return nullptr;
}
}


// Now retrieve two vectors, one with the entries and one with the extra data to recreate the CPUShaders
std::vector<SEntry> entries;
Expand All @@ -337,13 +384,64 @@ core::smart_refctd_ptr<IShaderCompiler::CCache> IShaderCompiler::CCache::deseria
// Create buffer to hold the code
auto code = core::make_smart_refctd_ptr<ICPUBuffer>(shaderCreationParams[i].codeByteSize);
// Copy the shader bytecode into the buffer

memcpy(code->getPointer(), serializedCache.data() + SHADER_BUFFER_SIZE_BYTES + shaderCreationParams[i].offset, shaderCreationParams[i].codeByteSize);
code->setContentHash(code->computeContentHash());
// Create the ICPUShader
entries[i].cpuShader = core::make_smart_refctd_ptr<ICPUShader>(std::move(code), shaderCreationParams[i].stage, shaderCreationParams[i].contentType, std::move(shaderCreationParams[i].filepathHint));
entries[i].spirv = std::move(code);

retVal->insert(std::move(entries[i]));
}

return retVal;
}
}

// Allocation callback handed to the LZMA SDK through `ISzAlloc`.
// The allocator handle is not needed, so the parameter is left unnamed (instead of a `p = p;` self-assignment).
static void* SzAlloc(ISzAllocPtr /*p*/, size_t size) { return _NBL_ALIGNED_MALLOC(size, _NBL_SIMD_ALIGNMENT); }
// Deallocation callback handed to the LZMA SDK through `ISzAlloc`; releases memory obtained from `SzAlloc`.
// The allocator handle is not needed, so the parameter is left unnamed (instead of a `p = p;` self-assignment).
static void SzFree(ISzAllocPtr /*p*/, void* address) { _NBL_ALIGNED_FREE(address); }

// Stores the outcome of a compilation in this cache entry.
//
// - `uncompressedSpirvBuffer`: the SPIR-V produced by the compiler. Its content hash and size are
//   recorded so that `decompressShader` can recreate a buffer of the right size and hash.
// - `dependencies`: the preprocessing dependencies of the compilation, moved into the entry.
//
// The SPIR-V itself is kept LZMA-compressed in `spirv`, laid out as
// [LZMA_PROPS_SIZE bytes of encoder properties][compressed stream].
void nbl::asset::IShaderCompiler::CCache::SEntry::setContent(const asset::ICPUBuffer* uncompressedSpirvBuffer, dependency_container_t&& dependencies)
{
	// The parameter shadows the member of the same name: without `this->` the statement is a
	// self-move of the parameter and the entry would never receive its dependencies.
	this->dependencies = std::move(dependencies);
	uncompressedContentHash = uncompressedSpirvBuffer->getContentHash();
	uncompressedSize = uncompressedSpirvBuffer->getSize();

	size_t propsSize = LZMA_PROPS_SIZE;
	// Capacity offered to the encoder; `LzmaEncode` receives it by pointer and the value left in
	// `destLen` afterwards is what gets used as the compressed size below.
	size_t destLen = uncompressedSize + uncompressedSize / 3 + 128;
	// NOTE(review): a scratch vector is used because the compressed size is unknown up front and an
	// ICPUBuffer cannot be resized. Review suggestion (deferred by the author to the upcoming
	// "Lib Shader Stage" PR): malloc + realloc the storage and hand it to a custom-allocator CPU
	// buffer via the `core::adopt_memory` constructor overload, with an "allocator" that calls free()
	// on the adopted pointer, which would avoid the extra copy at the end of this function.
	std::vector<unsigned char> compressedSpirv(propsSize + destLen);

	CLzmaEncProps props;
	LzmaEncProps_Init(&props);
	props.dictSize = 1 << 16; // 64KB
	props.writeEndMark = 1;

	ISzAlloc alloc = { SzAlloc, SzFree };
	// Encoder properties go to the front of the scratch buffer, the compressed stream right after them
	[[maybe_unused]] const int res = LzmaEncode(
		compressedSpirv.data() + LZMA_PROPS_SIZE, &destLen,
		reinterpret_cast<const unsigned char*>(uncompressedSpirvBuffer->getPointer()), uncompressedSpirvBuffer->getSize(),
		&props, compressedSpirv.data(), &propsSize, props.writeEndMark,
		nullptr, &alloc, &alloc);

	assert(propsSize == LZMA_PROPS_SIZE);
	assert(res == SZ_OK);

	// Copy only the bytes actually produced into an exactly-sized buffer
	spirv = core::make_smart_refctd_ptr<ICPUBuffer>(propsSize + destLen);
	memcpy(spirv->getPointer(), compressedSpirv.data(), spirv->getSize());
}

// Recreates the shader of this cache entry by LZMA-decoding `spirv`
// (layout: [LZMA_PROPS_SIZE bytes of encoder properties][compressed stream]).
//
// The decoded buffer has `uncompressedSize` bytes and is tagged with `uncompressedContentHash`;
// stage and filepath hint come from `compilerArgs`.
//
// Returns the SPIR-V shader, or nullptr when the entry holds no usable compressed payload or the
// payload fails to decode to exactly `uncompressedSize` bytes.
core::smart_refctd_ptr<ICPUShader> nbl::asset::IShaderCompiler::CCache::SEntry::decompressShader() const
{
	// Nothing to decode if no content was ever set, and the size subtraction below must not underflow
	if (!spirv || spirv->getSize() < LZMA_PROPS_SIZE)
		return nullptr;

	auto uncompressedBuf = core::make_smart_refctd_ptr<ICPUBuffer>(uncompressedSize);
	uncompressedBuf->setContentHash(uncompressedContentHash);

	const auto* const compressed = reinterpret_cast<const unsigned char*>(spirv->getPointer());
	size_t dstSize = uncompressedBuf->getSize();
	size_t srcSize = spirv->getSize() - LZMA_PROPS_SIZE;
	ELzmaStatus status;
	ISzAlloc alloc = { SzAlloc, SzFree };
	const SRes res = LzmaDecode(
		reinterpret_cast<unsigned char*>(uncompressedBuf->getPointer()), &dstSize,
		compressed + LZMA_PROPS_SIZE, &srcSize,
		compressed, LZMA_PROPS_SIZE,
		LZMA_FINISH_ANY, &status, &alloc);
	assert(res == SZ_OK);
	// The assert vanishes in release builds; never hand out a shader backed by a partially decoded buffer
	if (res != SZ_OK || dstSize != uncompressedSize)
		return nullptr;

	return core::make_smart_refctd_ptr<asset::ICPUShader>(std::move(uncompressedBuf), compilerArgs.stage, IShader::E_CONTENT_TYPE::ECT_SPIRV, compilerArgs.preprocessorArgs.sourceIdentifier.data());
}
Loading