AcademySoftwareFoundation · chellmuth · Aug 18, 2023 · chellmuth · Aug 18, 2023 · aconty
diff --git a/src/liboslexec/backendllvm.h b/src/liboslexec/backendllvm.h
@@ -61,6 +61,7 @@ class BackendLLVM final : public OSOProcessorBase {
     // Create llvm functions for OptiX callables
     std::vector<llvm::Function*> build_llvm_optix_callables();
     llvm::Function* build_llvm_fused_callable();
+    llvm::Function* build_check_layer_skip_stub();
 
     /// Build up LLVM IR code for the given range [begin,end) or
     /// opcodes, putting them (initially) into basic block bb (or the

diff --git a/src/liboslexec/llvm_gen.cpp b/src/liboslexec/llvm_gen.cpp
@@ -118,6 +118,40 @@ BackendLLVM::llvm_call_layer(int layer, bool unconditional)
     // if it's run unconditionally.
     // The code in the parent layer itself will set its 'executed' flag.
 
+    //
+    // WIP COMMENT:
+    // Do something a little more complicated in the conditional case.
+    //
+    // We set up a stub function osl_check_layer_skip_stub(parentlayer) { return false; }
+    //
+    // In addition to checking groupdata->run[parentlayer],
+    // we also call the stub "bool skip = osl_check_layer_skip_stub(parentlayer)".
+    // The conditional becomes if (!skip && !groupdata->run[parentlayer]).
+    //
+    // During optimization, we analyze each call to that stub function.
+    // For a given call, we can walk up llvm's dominator tree and search
+    // for a prior call to the stub with the same layer argument. Finding one
+    // means the layer has already run by the time the current call executes.
+    //
+    // So if we find a hit, we replace the call with the constant true:
+    // bool skip = osl_check_layer_skip_stub(parentlayer) -> bool skip = true;
+    // Then (!skip && !groupdata->run[parentlayer]) is known false and llvm can
+    // constant-fold the entire if-statement away.
+    //
+    // If we don't find a hit, llvm can still inline the stub and we codegen the
+    // original if-statement.
+    //
+    // Similarly, if we skip the optimization, we still generate the correct code,
+    // we just won't remove any unnecessary checks.
+    //
+
+    llvm::Value* skip = ll.constant_bool(false);
+    if (!unconditional) {
+        llvm::Value* args[]
+            = { ll.constant(layer), sg_ptr() };
+        skip = ll.call_function("osl_check_layer_skip_stub", args);
+    }
+
     llvm::Value* args[]
         = { sg_ptr(),          groupdata_ptr(), userdata_base_ptr(),
             output_base_ptr(), shadeindex(),    m_llvm_interactive_params_ptr };
@@ -129,6 +163,7 @@ BackendLLVM::llvm_call_layer(int layer, bool unconditional)
     if (!unconditional) {
         llvm::Value* executed = ll.op_load(layerfield);
         executed              = ll.op_ne(executed, trueval);
+        executed = ll.op_and(ll.op_not(skip), executed);
         then_block            = ll.new_basic_block("");
         after_block           = ll.new_basic_block("");
         ll.op_branch(executed, then_block, after_block);

diff --git a/src/liboslexec/llvm_instance.cpp b/src/liboslexec/llvm_instance.cpp
@@ -990,6 +990,42 @@ BackendLLVM::build_llvm_init()
     return ll.current_function();
 }
 
+/// Emit the IR function osl_check_layer_skip_stub(int, <sg pointer>) whose
+/// entire body is `return false`, and hand back the llvm::Function for it.
+llvm::Function*
+BackendLLVM::build_check_layer_skip_stub()
+{
+    const char* const stub_name = "osl_check_layer_skip_stub";
+
+    // Declare the bool-returning stub taking (int, <sg pointer>) and make
+    // it ll's current function.
+    llvm::Function* fn = ll.make_function(stub_name, false, ll.type_bool(),
+                                          { ll.type_int(),
+                                            llvm_type_sg_ptr() },
+                                          false);
+    ll.current_function(fn);
+
+    // With debug info enabled, push a debug function entry that uses the
+    // source file recorded on the op at maincodebegin() of group()[0].
+    if (ll.debug_is_enabled()) {
+        ustring first_op_file
+            = group()[0]->op(group()[0]->maincodebegin()).sourcefile();
+        ll.debug_push_function(stub_name, first_op_file, 1);
+    }
+
+    // One basic block that only returns the constant false.
+    llvm::BasicBlock* body = ll.new_basic_block("check_layer_skip_stub-bb");
+    ll.new_builder(body);
+    ll.op_return(ll.constant_bool(false));
+
+    // Pop the debug function entry pushed above, if there was one.
+    if (ll.debug_is_enabled())
+        ll.debug_pop_function();
+
+    ll.end_builder();
+    return fn;
+}
+
 // OptiX Callables:
 //  Builds three OptiX callables: an init wrapper, an entry layer wrapper,
 //  and a "fused" callable that wraps both and owns the groupdata params buffer.
@@ -1575,6 +1611,7 @@ BackendLLVM::run()
 #ifdef OSL_LLVM_NO_BITCODE
         OSL_ASSERT(!use_rs_bitcode());
         ll.module(ll.new_module("llvm_ops"));
+
 #    if OSL_USE_OPTIX
         if (use_optix()) {
             // If the module is created from LLVM bitcode, the target and
@@ -1689,6 +1726,7 @@ BackendLLVM::run()
     shadingsys().m_stat_empty_instances += nlayers - m_num_used_layers;
 
     initialize_llvm_group();
+    build_check_layer_skip_stub();
 
     // Generate the LLVM IR for each layer.  Skip unused layers.
     m_llvm_local_mem          = 0;

diff --git a/src/liboslexec/llvm_util.cpp b/src/liboslexec/llvm_util.cpp
@@ -79,6 +79,7 @@
 #include <llvm/Transforms/Scalar/GVN.h>
 #include <llvm/Transforms/Utils.h>
 #include <llvm/Transforms/Utils/UnifyFunctionExitNodes.h>
+#include <llvm/Transforms/Utils/BasicBlockUtils.h>
 
 #include <llvm/Support/DynamicLibrary.h>
 
@@ -92,6 +93,7 @@
 #include <llvm/Transforms/Utils/Cloning.h>
 #include <llvm/Transforms/Utils/SymbolRewriter.h>
 
+
 OSL_NAMESPACE_ENTER
 
 
@@ -1751,7 +1753,98 @@ LLVM_Util::InstallLazyFunctionCreator(void* (*P)(const std::string&))
     exec->InstallLazyFunctionCreator(P);
 }
 
+namespace {
+
+/// FunctionPass that folds an osl_check_layer_skip_stub() call to `true` when a
+/// strictly dominating block already has a stub call for the same layer.
+struct CheckLayerRemovalPass : public llvm::FunctionPass {
+    int m_calls_checked = 0;  ///< stub calls seen since doInitialization
+    int m_calls_removed = 0;  ///< stub calls replaced by a constant
+
+    static char ID;
+    CheckLayerRemovalPass() : FunctionPass(ID) {}
+
+    /// Reset the counters.  Returns false: nothing in M is modified here.
+    bool doInitialization(llvm::Module& /*M*/) override {
+        m_calls_checked = 0;
+        m_calls_removed = 0;
+        return false;
+    }
 
+    /// If `inst` is a direct call to the stub whose first argument is a
+    /// constant integer, return the call and store that value in `layer`;
+    /// otherwise return nullptr.
+    static llvm::CallInst* as_stub_call(llvm::Instruction& inst, int& layer) {
+        auto* call = llvm::dyn_cast<llvm::CallInst>(&inst);
+        llvm::Function* callee = call ? call->getCalledFunction() : nullptr;
+        if (!callee || callee->getName() != "osl_check_layer_skip_stub")
+            return nullptr;
+        // dyn_cast (not cast): a non-constant argument is skipped, not asserted.
+        auto* idx = llvm::dyn_cast<llvm::ConstantInt>(call->getArgOperand(0));
+        if (!idx)
+            return nullptr;
+        layer = static_cast<int>(idx->getSExtValue());
+        return call;
+    }
+
+    /// Returns true iff at least one stub call in F was replaced.
+    bool runOnFunction(llvm::Function& F) override {
+        // Record, per basic block, which layers have a stub call in it.
+        llvm::ValueMap<llvm::BasicBlock*, llvm::DenseSet<int>> layers_in_block;
+        for (auto I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
+            int layer = 0;
+            if (llvm::CallInst* call = as_stub_call(*I, layer)) {
+                layers_in_block[call->getParent()].insert(layer);
+                m_calls_checked++;
+            }
+        }
+        if (layers_in_block.size() == 0)
+            return false;
+
+        llvm::DominatorTree& dt = getAnalysis<llvm::DominatorTreeWrapperPass>().getDomTree();
+
+        // A call is redundant when some strict dominator of its block holds
+        // a stub call for the same layer.
+        std::unordered_set<llvm::CallInst*> redundant;
+        for (auto I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
+            int layer = 0;
+            llvm::CallInst* call = as_stub_call(*I, layer);
+            if (!call)
+                continue;
+            // Blocks unreachable from entry have no dominator tree node.
+            auto* node = dt.getNode(call->getParent());
+            for (node = node ? node->getIDom() : nullptr; node;
+                 node = node->getIDom()) {
+                // find() rather than operator[]: lookups must not insert.
+                auto hit = layers_in_block.find(node->getBlock());
+                if (hit != layers_in_block.end() && hit->second.count(layer) > 0) {
+                    redundant.insert(call);
+                    break;
+                }
+            }
+        }
+        if (redundant.empty())
+            return false;
+
+        // Fold each redundant call to `true`.  The constant takes the call's
+        // own result type so the replacement is type-correct (a 32-bit
+        // constant is not, for a bool-returning stub), and RAUW + erase does
+        // not rely on BasicBlock::getInstList().
+        for (llvm::CallInst* call : redundant) {
+            call->replaceAllUsesWith(llvm::ConstantInt::get(call->getType(), 1));
+            call->eraseFromParent();
+            m_calls_removed++;
+        }
+        return true;
+    }
+
+    void getAnalysisUsage(llvm::AnalysisUsage& AU) const override {
+        AU.addRequired<llvm::DominatorTreeWrapperPass>();
+    }
+};
+}
+// Per LLVM's pass documentation, ID's address (not its value) identifies the pass.
+char CheckLayerRemovalPass::ID = 0;
 
 void
 LLVM_Util::setup_optimization_passes(int optlevel, bool target_host)
@@ -1767,6 +1860,8 @@ LLVM_Util::setup_optimization_passes(int optlevel, bool target_host)
 
     m_llvm_module_passes           = new llvm::legacy::PassManager;
     llvm::legacy::PassManager& mpm = (*m_llvm_module_passes);
+    // TODO: Add based on optlevel
+    mpm.add(new CheckLayerRemovalPass());
 
     llvm::TargetMachine* target_machine = nullptr;
     if (target_host) {
@@ -5968,8 +6063,10 @@ LLVM_Util::bitcode_string(llvm::Module* module)
     std::string s;
     llvm::raw_string_ostream stream(s);
 
-    for (auto&& func : module->getFunctionList())
-        stream << func << '\n';
+    module->print(stream, nullptr);
+
+    // for (auto&& func : module->getFunctionList())
+    //     stream << func << '\n';
 
     return stream.str();
 }

diff --git a/src/liboslexec/opstring.cpp b/src/liboslexec/opstring.cpp
@@ -175,7 +175,6 @@ osl_printf(ShaderGlobals* sg, const char* format_str, ...)
     sg->context->messagefmt("{}", s);
 }
 
-
 OSL_SHADEOP void
 osl_error(ShaderGlobals* sg, const char* format_str, ...)
 {