Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

server: Add timeout to stop the server automatically when idling for too long. #10742

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
7 changes: 7 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1785,6 +1785,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_cache_reuse = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
add_opt(common_arg(
{"--standby-timeout"}, "N",
string_format("seconds that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout),
[](common_params & params, int value) {
params.standby_timeout = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT"));
add_opt(common_arg(
{"--metrics"},
string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ struct common_params {
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
int32_t standby_timeout = 0; // seconds that must pass since a request has been processed before server terminates in order to save resources. If -1, then never terminate automatically.

std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
Expand Down
1 change: 1 addition & 0 deletions examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ The project is under active development, and we are [looking for feedback and co
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--standby-timeout N` | seconds that must pass since a request has been served, before the server stops automatically (default: 0)<br/>(env: LLAMA_ARG_STANDBY_TIMEOUT) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
Expand Down
21 changes: 18 additions & 3 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <chrono>
#include <variant>

using json = nlohmann::ordered_json;

Expand Down Expand Up @@ -1177,6 +1179,8 @@ struct server_queue {
std::function<void(server_task)> callback_new_task;
std::function<void(void)> callback_update_slots;

int standby_timeout;

// Add a new task to the end of the queue
int post(server_task task, bool front = false) {
std::unique_lock<std::mutex> lock(mutex_tasks);
Expand Down Expand Up @@ -1291,9 +1295,18 @@ struct server_queue {
QUE_DBG("%s", "terminate\n");
return;
}
condition_tasks.wait(lock, [&]{
return (!queue_tasks.empty() || !running);
});
const auto pred = [&] {
return (!queue_tasks.empty() || !running);
};
if (standby_timeout > 0) {
if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
QUE_INF("%s", "stand-by timeout reached\n");
running = false;
break;
}
} else {
condition_tasks.wait(lock, pred);
}
}
}
}
Expand Down Expand Up @@ -1468,6 +1481,8 @@ struct server_context {

n_ctx = llama_n_ctx(ctx);

queue_tasks.standby_timeout = params.standby_timeout;

add_bos_token = llama_add_bos_token(model);
has_eos_token = !llama_add_eos_token(model);

Expand Down
Loading