Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Remove more unnecessary conditional run layer calls #1713

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/liboslexec/backendllvm.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class BackendLLVM final : public OSOProcessorBase {
// Create llvm functions for OptiX callables
std::vector<llvm::Function*> build_llvm_optix_callables();
llvm::Function* build_llvm_fused_callable();
llvm::Function* build_check_layer_skip_stub();

/// Build up LLVM IR code for the given range [begin,end) or
/// opcodes, putting them (initially) into basic block bb (or the
Expand Down
35 changes: 35 additions & 0 deletions src/liboslexec/llvm_gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,40 @@ BackendLLVM::llvm_call_layer(int layer, bool unconditional)
// if it's run unconditionally.
// The code in the parent layer itself will set its 'executed' flag.

//
// WIP COMMENT:
// Do something a little more complicated in the conditional case.
//
// We set up a stub function osl_check_layer_skip_stub(parentlayer) { return false; }
//
// In addition to checking groupdata->run[parentlayer],
// we also call the stub "bool skip = osl_check_layer_skip_stub(parentlayer)".
// The conditional becomes if (!skip && !groupdata->run[parentlayer]).
//
// During optimization, we analyze each call to that stub function.
// For a given call, we can walk up llvm's dominator tree and search
// for prior calls to the stub function with the same layer argument.
// Finding one guarantees that the layer has already run by the time the
// current call is reached.
//
// So if we find a hit, we replace the call with the constant true:
// bool skip = osl_check_layer_skip_stub(parentlayer) -> bool skip = true;
// Then (!skip && !groupdata->run[parentlayer]) is known false and llvm can
// constant-fold the entire if-statement away.
//
// If we don't find a hit, llvm can still inline the stub and we codegen the
// original if-statement.
//
// Similarly, if we skip the optimization, we still generate the correct code,
// we just won't remove any unnecessary checks.
//

llvm::Value* skip = ll.constant_bool(false);
if (!unconditional) {
llvm::Value* args[]
= { ll.constant(layer), sg_ptr() };
skip = ll.call_function("osl_check_layer_skip_stub", args);
}

llvm::Value* args[]
= { sg_ptr(), groupdata_ptr(), userdata_base_ptr(),
output_base_ptr(), shadeindex(), m_llvm_interactive_params_ptr };
Expand All @@ -129,6 +163,7 @@ BackendLLVM::llvm_call_layer(int layer, bool unconditional)
if (!unconditional) {
llvm::Value* executed = ll.op_load(layerfield);
executed = ll.op_ne(executed, trueval);
executed = ll.op_and(ll.op_not(skip), executed);
then_block = ll.new_basic_block("");
after_block = ll.new_basic_block("");
ll.op_branch(executed, then_block, after_block);
Expand Down
38 changes: 38 additions & 0 deletions src/liboslexec/llvm_instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -990,6 +990,42 @@ BackendLLVM::build_llvm_init()
return ll.current_function();
}

/// Emit the IR definition of `osl_check_layer_skip_stub`, a function taking
/// (int, ShaderGlobals*) whose entire body is `return false`, and return the
/// newly created llvm::Function.
llvm::Function*
BackendLLVM::build_check_layer_skip_stub()
{
    static const char* const stub_name = "osl_check_layer_skip_stub";

    llvm::Function* stub_fn = ll.make_function(stub_name, false,
                                               ll.type_bool(),
                                               { ll.type_int(),
                                                 llvm_type_sg_ptr() },
                                               false);
    ll.current_function(stub_fn);

    // When debug info is being generated, attribute the stub to the source
    // file of the first op of layer 0's main code, at line 1.
    if (ll.debug_is_enabled()) {
        auto&& first_layer = group()[0];
        ustring src_file
            = first_layer->op(first_layer->maincodebegin()).sourcefile();
        ll.debug_push_function(stub_name, src_file, 1);
    }

    // Single basic block: unconditionally return false.
    ll.new_builder(ll.new_basic_block("check_layer_skip_stub-bb"));
    ll.op_return(ll.constant_bool(false));

    if (ll.debug_is_enabled())
        ll.debug_pop_function();

    ll.end_builder();
    return stub_fn;
}

// OptiX Callables:
// Builds three OptiX callables: an init wrapper, an entry layer wrapper,
// and a "fused" callable that wraps both and owns the groupdata params buffer.
Expand Down Expand Up @@ -1575,6 +1611,7 @@ BackendLLVM::run()
#ifdef OSL_LLVM_NO_BITCODE
OSL_ASSERT(!use_rs_bitcode());
ll.module(ll.new_module("llvm_ops"));

# if OSL_USE_OPTIX
if (use_optix()) {
// If the module is created from LLVM bitcode, the target and
Expand Down Expand Up @@ -1689,6 +1726,7 @@ BackendLLVM::run()
shadingsys().m_stat_empty_instances += nlayers - m_num_used_layers;

initialize_llvm_group();
build_check_layer_skip_stub();

// Generate the LLVM IR for each layer. Skip unused layers.
m_llvm_local_mem = 0;
Expand Down
101 changes: 99 additions & 2 deletions src/liboslexec/llvm_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
#include <llvm/Transforms/Scalar/GVN.h>
#include <llvm/Transforms/Utils.h>
#include <llvm/Transforms/Utils/UnifyFunctionExitNodes.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>

#include <llvm/Support/DynamicLibrary.h>

Expand All @@ -92,6 +93,7 @@
#include <llvm/Transforms/Utils/Cloning.h>
#include <llvm/Transforms/Utils/SymbolRewriter.h>


OSL_NAMESPACE_ENTER


Expand Down Expand Up @@ -1751,7 +1753,98 @@ LLVM_Util::InstallLazyFunctionCreator(void* (*P)(const std::string&))
exec->InstallLazyFunctionCreator(P);
}

namespace {

struct CheckLayerRemovalPass : public llvm::FunctionPass {
int m_calls_checked;
int m_calls_removed;

static char ID;
CheckLayerRemovalPass() : FunctionPass(ID) {}

bool doInitialization(llvm::Module &M) override {
m_calls_checked = 0;
m_calls_removed = 0;
return true;
}

bool runOnFunction(llvm::Function &F) override {
llvm::DominatorTree &dt = getAnalysis<llvm::DominatorTreeWrapperPass>().getDomTree();

llvm::ValueMap<llvm::BasicBlock*, llvm::DenseSet<int>> bblock_layer_lookup;

const std::string target_fn = "osl_check_layer_skip_stub";

// Find all of the stub calls and associate them with their basic block
for(auto I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
auto* call_inst = llvm::dyn_cast<llvm::CallInst>(&*I);
if (call_inst) {
llvm::Function* called = call_inst->getCalledFunction();
if (called && called->getName() == target_fn) {
llvm::Value *arg = call_inst->getArgOperand(0);
int layer = llvm::cast<llvm::ConstantInt>(arg)->getSExtValue();

llvm::BasicBlock* bblock = I->getParent();
bblock_layer_lookup[bblock].insert(layer);
m_calls_checked++;
}
}
}

if (bblock_layer_lookup.size() == 0)
return false;

// For each stub call, walk the dominator tree and look for a previous
// matching call.
std::unordered_set<llvm::CallInst*> delete_queue;
for(auto I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
auto* call_inst = llvm::dyn_cast<llvm::CallInst>(&*I);
if (call_inst) {
llvm::Function* called = call_inst->getCalledFunction();
if (called && called->getName() == target_fn) {
llvm::Value *arg = call_inst->getArgOperand(0);
int layer = llvm::cast<llvm::ConstantInt>(arg)->getSExtValue();

llvm::BasicBlock* bblock = I->getParent();
auto bbnode = dt.getNode(bblock)->getIDom();
while (bbnode) {
llvm::BasicBlock* candidate_bblock = bbnode->getBlock();
if (bblock_layer_lookup[candidate_bblock].count(layer) > 0) {
delete_queue.insert(call_inst);
break;
}
bbnode = bbnode->getIDom();
}
}
}
}

int count = delete_queue.size();
if (count == 0)
return false;

// Delete all the unnecessary stubs identified above

// WIP: This appears to be the cause of the performance regression
// (Not the dominator tree analysis above)
llvm::Value* fake = llvm::ConstantInt::get(F.getContext(), llvm::APInt(32, 1));
for (llvm::CallInst* inst : delete_queue) {
llvm::BasicBlock::iterator iterator(inst);
llvm::ReplaceInstWithValue(inst->getParent()->getInstList(), iterator, fake);
m_calls_removed++;
}
Comment on lines +1826 to +1835
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This appears to be where the performance hit originates from. It seems like the replacement and subsequent code-folding is pretty simple, so I'm confused why it is taking so much time compared to everything else that happens while codegen'ing a shader (+10-20%).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe the replacement causes llvm to recook something. Does the 20% vanish if you if (always_false_global) line 1833?


return true;
}

void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
AU.addRequired<llvm::DominatorTreeWrapperPass>();
}

};
}

char CheckLayerRemovalPass::ID = 0;

void
LLVM_Util::setup_optimization_passes(int optlevel, bool target_host)
Expand All @@ -1767,6 +1860,8 @@ LLVM_Util::setup_optimization_passes(int optlevel, bool target_host)

m_llvm_module_passes = new llvm::legacy::PassManager;
llvm::legacy::PassManager& mpm = (*m_llvm_module_passes);
// TODO: Add based on optlevel
mpm.add(new CheckLayerRemovalPass());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the new pass is being run on the entire module, I think that means that it's running on all of the shadeops functions. Would it be possible to detect that you are processing a library function (by function name, or perhaps a function attribute), and early-exit from CheckLayerRemovalPass::runOnFunction() in that case?

That might not buy much since module pruning should remove unused library functions, but maybe it will help in some cases.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That seems like a responsible safe-guard to put in, I'll add that.


llvm::TargetMachine* target_machine = nullptr;
if (target_host) {
Expand Down Expand Up @@ -5968,8 +6063,10 @@ LLVM_Util::bitcode_string(llvm::Module* module)
std::string s;
llvm::raw_string_ostream stream(s);

for (auto&& func : module->getFunctionList())
stream << func << '\n';
module->print(stream, nullptr);

// for (auto&& func : module->getFunctionList())
// stream << func << '\n';

return stream.str();
}
Expand Down
1 change: 0 additions & 1 deletion src/liboslexec/opstring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,6 @@ osl_printf(ShaderGlobals* sg, const char* format_str, ...)
sg->context->messagefmt("{}", s);
}


OSL_SHADEOP void
osl_error(ShaderGlobals* sg, const char* format_str, ...)
{
Expand Down