
[core] Implement get or create for lru cache #50347

Merged 4 commits on Feb 10, 2025
18 changes: 8 additions & 10 deletions src/ray/core_worker/core_worker.cc
@@ -2308,19 +2308,17 @@ json CoreWorker::OverrideRuntimeEnv(const json &child,
return result_runtime_env;
}

// TODO(hjiang): The current implementation is not ideal, since it acquires a
// global lock for all operations; this is acceptable for now since no
// heavyweight operation is involved (the overall scheduling overhead is of
// single-digit millisecond magnitude). A better solution is for the LRU cache
// to natively support sharding and a `GetOrCreate` API.
std::shared_ptr<rpc::RuntimeEnvInfo> CoreWorker::OverrideTaskOrActorRuntimeEnvInfo(
const std::string &serialized_runtime_env_info) const {
if (auto cached_runtime_env_info =
runtime_env_json_serialization_cache_.Get(serialized_runtime_env_info);
cached_runtime_env_info != nullptr) {
return cached_runtime_env_info;
}
auto factory = [this](const std::string &serialized_runtime_env_info) {
return OverrideTaskOrActorRuntimeEnvInfoImpl(serialized_runtime_env_info);
};
return runtime_env_json_serialization_cache_.GetOrCreate(serialized_runtime_env_info,
std::move(factory));
}

std::shared_ptr<rpc::RuntimeEnvInfo> CoreWorker::OverrideTaskOrActorRuntimeEnvInfoImpl(
const std::string &serialized_runtime_env_info) const {
// TODO(Catch-Bull,SongGuyang): task runtime env does not support the field
// eager_install yet; we will overwrite the field eager_install when it does.
std::shared_ptr<json> parent = nullptr;
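For context, the calling pattern this change adopts can be shown with a minimal, self-contained sketch. This `SimpleCache` is purely illustrative, not Ray's `SharedLruCache`: it has no LRU eviction and, unlike the PR's implementation, it holds the lock while the factory runs.

```cpp
#include <cassert>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

// Illustrative sketch only: a map-based cache exposing a GetOrCreate that
// takes a factory, mirroring the API shape this PR introduces.
template <typename Key, typename Val>
class SimpleCache {
 public:
  std::shared_ptr<Val> GetOrCreate(
      const Key &key, std::function<std::shared_ptr<Val>(const Key &)> factory) {
    std::lock_guard<std::mutex> lck(mu_);
    auto iter = map_.find(key);
    if (iter != map_.end()) {
      return iter->second;  // Cache hit: the factory is not invoked.
    }
    auto val = factory(key);  // Cache miss: create, insert, and return.
    map_.emplace(key, val);
    return val;
  }

 private:
  std::mutex mu_;
  std::unordered_map<Key, std::shared_ptr<Val>> map_;
};
```

With `GetOrCreate`, the caller no longer needs the racy check-then-create-then-insert sequence that `OverrideTaskOrActorRuntimeEnvInfo` previously performed by hand.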
5 changes: 5 additions & 0 deletions src/ray/core_worker/core_worker.h
@@ -1384,6 +1384,11 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler {
std::shared_ptr<rpc::RuntimeEnvInfo> OverrideTaskOrActorRuntimeEnvInfo(
const std::string &serialized_runtime_env_info) const;

// Used as the factory function for [OverrideTaskOrActorRuntimeEnvInfo] to
// create values in the LRU cache.
std::shared_ptr<rpc::RuntimeEnvInfo> OverrideTaskOrActorRuntimeEnvInfoImpl(
const std::string &serialized_runtime_env_info) const;

void BuildCommonTaskSpec(
TaskSpecBuilder &builder,
const JobID &job_id,
77 changes: 75 additions & 2 deletions src/ray/util/shared_lru.h
@@ -27,11 +27,11 @@
// // Check and consume `val`.
//
// TODO(hjiang):
// 1. Add a `GetOrCreate` interface, which takes factory function to creation value.
// 2. For thread-safe cache, add a sharded container wrapper to reduce lock contention.
// For thread-safe cache, add a sharded container wrapper to reduce lock contention.

#pragma once

#include <condition_variable>
#include <cstdint>
#include <list>
#include <memory>
@@ -194,6 +194,68 @@ class ThreadSafeSharedLruCache final {
return cache_.Get(std::forward<KeyLike>(key));
}

// Get or create a cached key-value pair.
//
// WARNING: Currently the factory must not throw exceptions.
// TODO(hjiang): [factory] should be a template parameter.
template <typename KeyLike>
std::shared_ptr<Val> GetOrCreate(
KeyLike &&key, std::function<std::shared_ptr<Val>(const Key &)> factory) {
Contributor: The factory function should be taken as a const lvalue, never moved.

std::shared_ptr<CreationToken> creation_token;

{
std::unique_lock lck(mu_);
auto cached_val = cache_.Get(key);
if (cached_val != nullptr) {
return cached_val;
}

auto creation_iter = ongoing_creation_.find(key);

// Another thread has requested the same key-value pair; simply wait for its
// completion.
if (creation_iter != ongoing_creation_.end()) {
creation_token = creation_iter->second;
++creation_token->count;
creation_token->cv.wait(lck, [creation_token = creation_token.get()]() {
return creation_token->val != nullptr;
});

// Creation finished.
--creation_token->count;
if (creation_token->count == 0) {
// [creation_iter] could be invalidated here due to new insertion/deletion.
ongoing_creation_.erase(key);
}
return creation_token->val;
}

// The current thread is the first to request this key-value pair; run the
// factory function.
creation_iter =
ongoing_creation_.emplace(key, std::make_shared<CreationToken>()).first;
creation_token = creation_iter->second;
creation_token->count = 1;
}

// Run the factory outside the critical section.
std::shared_ptr<Val> val = factory(key);

{
std::lock_guard lck(mu_);
cache_.Put(key, val);
Contributor: Forward the key here.

Contributor Author (dentiny): No, you cannot, because key is accessed later; if the passed-in key is an rvalue we would get invalidated access.

Contributor: Oh, gotcha, with the erase. Why not reorder so that this happens after the `if (new_count == 0) erase(key)`? The whole block is mutex-covered anyway.

Contributor Author (dentiny, Feb 10, 2025): Yes we can, but I usually don't do move/forward operations here, because they're error-prone. Suppose we add more logic to this code block in the future: it's easy to forget to move the forward semantics around. I would like to defer until we have clang-tidy integrated in our CI, which reports invalid usage like use-after-move (I heavily rely on clang-tidy to detect inefficiency and illegal access).

Contributor: As long as the move/forward is the last thing that happens it should be OK, and the person changing it afterwards should be aware; a lot of our code can break if people are not aware when adding more logic. But fine either way, micro-opt.

Contributor Author (dentiny): "and person changing after should be aware, a lot of our code can break if ppl are not aware when adding more logic" -- sigh, I have forgotten about it a few times myself, and I heavily rely on static analysis.


creation_token->val = val;
creation_token->cv.notify_all();
Contributor: Use a unique_lock to lock, and then notify after unlocking at the end, so the waiter doesn't fail the initial lock acquisition.

Contributor Author (dentiny): No, I strongly discourage notifying without the lock held.

Contributor:
https://en.cppreference.com/w/cpp/thread/condition_variable
https://en.cppreference.com/w/cpp/thread/condition_variable/notify_one
https://en.cppreference.com/w/cpp/thread/condition_variable/notify_all

All three cppreference examples here unlock before notifying, and point 3 on the first link notes you can notify after unlock. See the paragraph in the notify_one doc, which describes the pros and cons of both. Here we're not in a situation where the wait succeeding would cause the cv to be invalidated. Quoting:

"The notifying thread does not need to hold the lock on the same mutex as the one held by the waiting thread(s); in fact doing so is a pessimization, since the notified thread would immediately block again, waiting for the notifying thread to release the lock. However, some implementations (in particular many implementations of pthreads) recognize this situation and avoid this 'hurry up and wait' scenario by transferring the waiting thread from the condition variable's queue directly to the queue of the mutex within the notify call, without waking it up.

Notifying while under the lock may nevertheless be necessary when precise scheduling of events is required, e.g. if the waiting thread would exit the program if the condition is satisfied, causing destruction of the notifying thread's condition variable. A spurious wakeup after mutex unlock but before notify would result in notify called on a destroyed object."

Contributor Author (dentiny): Thanks for the link; checking the code again. I think we can unlock right before cv notification, but you would need to reorder the code and unlock right before the last line; TBH I don't see too much value.
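The unlock-then-notify reorder debated here can be sketched generically. The `Ready`, `Produce`, and `Consume` names below are hypothetical illustrations of the standard pattern, not the PR's code: mutate shared state under the lock, release the lock, then notify, so a woken waiter does not immediately block on a mutex the notifier still holds.

```cpp
#include <cassert>
#include <condition_variable>
#include <mutex>
#include <thread>

struct Ready {
  std::mutex mu;
  std::condition_variable cv;
  bool done = false;
};

inline void Produce(Ready &r) {
  {
    std::lock_guard<std::mutex> lck(r.mu);
    r.done = true;  // Mutate shared state under the lock.
  }                   // Unlock first...
  r.cv.notify_all();  // ...then notify.
}

inline bool Consume(Ready &r) {
  std::unique_lock<std::mutex> lck(r.mu);
  // The predicate handles both spurious wakeups and the case where Produce
  // already ran before we started waiting.
  r.cv.wait(lck, [&r] { return r.done; });
  return r.done;
}
```

Either ordering is correct here since the waiter's predicate is re-checked under the lock; the unlock-first variant just avoids the "hurry up and wait" contention the cppreference quote describes.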

int new_count = --creation_token->count;
if (new_count == 0) {
// [creation_iter] could be invalidated here due to new insertion/deletion.
Contributor (suggested change: delete the line `// [creation_iter] could be invalidated here due to new insertion/deletion.`): Note not necessary? creation_iter is not even in scope here.

Contributor Author (dentiny): Are you sure? creation_iter is just a few lines above.

Contributor Author (dentiny): The comment here is used to explain why we cannot access the value via the iterator created earlier.

Contributor: Ya, it's created in the block with the unique_lock, not this one.

Contributor Author (dentiny): It's created at L213, and still valid here.


ongoing_creation_.erase(key);
}
}

return val;
}

// Clear the cache.
void Clear() {
std::lock_guard lck(mu_);
@@ -204,8 +266,19 @@ class ThreadSafeSharedLruCache final {
size_t max_entries() const { return cache_.max_entries(); }

private:
struct CreationToken {
std::condition_variable cv;
// Nullptr indicates the creation is unfinished.
std::shared_ptr<Val> val;
// Count of threads involved in the ongoing creation.
int count = 0;
};

std::mutex mu_;
SharedLruCache<Key, Val> cache_;

// Ongoing creations, keyed by cache key.
absl::flat_hash_map<Key, std::shared_ptr<CreationToken>> ongoing_creation_;
Contributor: Why does CreationToken have to be boxed in a shared_ptr?

Contributor Author (dentiny): What's your suggestion? I don't think there's another way.

Contributor Author (dentiny): You cannot use a raw value, since the token is not copyable; you cannot use a unique pointer, because you don't know which thread is accessing the last reference.

Contributor: I think it might be possible to use unique_ptr, since the last thread accessing is the one erasing, and you can copy out val before; but ya, not worth it, this is safer.

Contributor Author (dentiny): A unique pointer is also not possible; inside the token there are a cond var and mutex, which are not copyable.

};

// Same interfaces as `SharedLruCache`, but all cached values are
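The remaining TODO at the top of this file mentions a sharded container wrapper to reduce lock contention. A rough sketch of that idea follows; the `ShardedCache` name and structure are illustrative assumptions (no LRU eviction, no creation tokens), not a proposed Ray API.

```cpp
#include <array>
#include <cassert>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

// Hash each key onto one of N independently locked shards so unrelated keys
// do not contend on a single global mutex.
template <typename Key, typename Val, size_t kNumShards = 8>
class ShardedCache {
 public:
  std::shared_ptr<Val> GetOrCreate(
      const Key &key, std::function<std::shared_ptr<Val>(const Key &)> factory) {
    Shard &shard = shards_[std::hash<Key>{}(key) % kNumShards];
    std::lock_guard<std::mutex> lck(shard.mu);  // Only this shard is locked.
    auto iter = shard.map.find(key);
    if (iter != shard.map.end()) return iter->second;
    auto val = factory(key);
    shard.map.emplace(key, val);
    return val;
  }

 private:
  struct Shard {
    std::mutex mu;
    std::unordered_map<Key, std::shared_ptr<Val>> map;
  };
  std::array<Shard, kNumShards> shards_;
};
```

Two threads touching keys in different shards never block each other; the worst case degenerates to the single-mutex behavior only when keys hash to the same shard.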
37 changes: 37 additions & 0 deletions src/ray/util/tests/shared_lru_test.cc
@@ -16,7 +16,9 @@

#include <gtest/gtest.h>

#include <future>
#include <string>
#include <thread>
#include <type_traits>

namespace ray::utils::container {
@@ -80,6 +82,41 @@ TEST(SharedLruCache, SameKeyTest) {
EXPECT_EQ(2, *val);
}

TEST(SharedLruCache, FactoryTest) {
using CacheType = ThreadSafeSharedLruCache<std::string, std::string>;

std::atomic<bool> invoked = {false};  // Used to check the factory is only invoked once.
auto factory = [&invoked](const std::string &key) -> std::shared_ptr<std::string> {
EXPECT_FALSE(invoked.exchange(true));
// Sleep for a while so multiple threads can kick in and get blocked.
std::this_thread::sleep_for(std::chrono::seconds(3));
return std::make_shared<std::string>(key);
};

CacheType cache{1};

constexpr size_t kFutureNum = 100;
std::vector<std::future<std::shared_ptr<std::string>>> futures;
futures.reserve(kFutureNum);

const std::string key = "key";
for (size_t idx = 0; idx < kFutureNum; ++idx) {
futures.emplace_back(std::async(std::launch::async, [&cache, &key, &factory]() {
return cache.GetOrCreate(key, factory);
}));
}
for (auto &fut : futures) {
auto val = fut.get();
ASSERT_NE(val, nullptr);
ASSERT_EQ(*val, key);
}

// After we're sure key-value pair exists in cache, make one more call.
auto cached_val = cache.GetOrCreate(key, factory);
ASSERT_NE(cached_val, nullptr);
ASSERT_EQ(*cached_val, key);
}

TEST(SharedLruConstCache, TypeAliasAssertion) {
static_assert(
std::is_same_v<SharedLruConstCache<int, int>, SharedLruCache<int, const int>>);