ggerganov · Sumandora · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -1785,6 +1785,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_cache_reuse = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
+    add_opt(common_arg(
+        {"--standby-timeout"}, "N",
+        string_format("seconds that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout),
+        [](common_params & params, int value) {
+            params.standby_timeout = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT"));
     add_opt(common_arg(
         {"--metrics"},
         string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),

diff --git a/common/common.h b/common/common.h
@@ -306,6 +306,7 @@ struct common_params {
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting
+    int32_t standby_timeout  = 0;          // seconds the server may wait without a new task before it terminates to save resources. If <= 0, never terminate automatically.
 
     std::string hostname      = "127.0.0.1";
     std::string public_path   = "";                                                                         // NOLINT

@@ -155,6 +155,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
+| `--standby-timeout N` | seconds that must pass since a request has been served, before the server stops automatically (default: 0)<br/>(env: LLAMA_ARG_STANDBY_TIMEOUT) |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
 | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
 | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |

@@ -29,6 +29,8 @@
 #include <thread>
 #include <unordered_map>
 #include <unordered_set>
+#include <chrono>
+#include <variant>
 
 using json = nlohmann::ordered_json;
 
@@ -1177,6 +1179,8 @@ struct server_queue {
     std::function<void(server_task)> callback_new_task;
     std::function<void(void)>        callback_update_slots;
 
+    int standby_timeout;
+
     // Add a new task to the end of the queue
     int post(server_task task, bool front = false) {
         std::unique_lock<std::mutex> lock(mutex_tasks);
@@ -1291,9 +1295,18 @@ struct server_queue {
                         QUE_DBG("%s", "terminate\n");
                         return;
                     }
-                    condition_tasks.wait(lock, [&]{
-                        return (!queue_tasks.empty() || !running);
-                    });
+                    const auto pred = [&] {
+                            return (!queue_tasks.empty() || !running);
+                    };
+                    if (standby_timeout > 0) {
+                        if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
+                            QUE_INF("%s", "stand-by timeout reached\n");
+                            running = false;
+                            break;
+                        }
+                    } else {
+                        condition_tasks.wait(lock, pred);
+                    }
                 }
             }
         }
@@ -1468,6 +1481,8 @@ struct server_context {
 
         n_ctx = llama_n_ctx(ctx);
 
+        queue_tasks.standby_timeout = params.standby_timeout;
+
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = !llama_add_eos_token(model);