Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

server: Add timeout to stop the server automatically when idling for too long. #10742

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
7 changes: 7 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1785,6 +1785,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_cache_reuse = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
// --standby-timeout N: stop the server automatically after N seconds without
// any task to process. The server's task loop only arms the timer for values
// greater than zero, so the help text spells out that the default (0) and
// negative values disable the feature instead of meaning "stop immediately".
add_opt(common_arg(
    {"--standby-timeout"}, "N",
    string_format("seconds that must pass since a request has been served, before the server stops automatically (default: %d, <= 0 = disabled)", params.standby_timeout),
    [](common_params & params, int value) {
        params.standby_timeout = value;
    }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT"));
add_opt(common_arg(
{"--metrics"},
string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,7 @@ struct common_params {
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
int32_t standby_timeout = 0; // seconds the server may sit idle (no queued task) before it shuts itself down to save resources; values <= 0 (including the default) disable automatic shutdown

std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
Expand Down
1 change: 1 addition & 0 deletions examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ The project is under active development, and we are [looking for feedback and co
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--standby-timeout N` | seconds that must pass since a request has been served, before the server stops automatically (default: 0, <= 0 = disabled)<br/>(env: LLAMA_ARG_STANDBY_TIMEOUT) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
Expand Down
37 changes: 29 additions & 8 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <chrono>
#include <variant>

using json = nlohmann::ordered_json;

Expand Down Expand Up @@ -1162,6 +1164,16 @@ struct server_metrics {
}
};

// Shutdown requested through an OS signal; `number` holds the signal number
// that was delivered to the signal handler.
struct termination_signal {
    int number;
};

// Tag type: shutdown requested because the stand-by (idle) timeout elapsed.
struct standby_timeout {};

// The reason handed to `shutdown_handler` when the server is asked to stop.
using shutdown_reason = std::variant<termination_signal, standby_timeout>;
Sumandora marked this conversation as resolved.
Show resolved Hide resolved

// Global shutdown callback, invoked with the reason for the shutdown. It is
// called from the signal handler and from the task queue loop when the
// stand-by timeout elapses; main() assigns it before starting the main loop.
std::function<void(shutdown_reason)> shutdown_handler;
Sumandora marked this conversation as resolved.
Show resolved Hide resolved

struct server_queue {
int id = 0;
bool running;
Expand Down Expand Up @@ -1258,7 +1270,7 @@ struct server_queue {
* - Check if multitask is finished
* - Update all slots
*/
void start_loop() {
void start_loop(int standby_timeout) {
Sumandora marked this conversation as resolved.
Show resolved Hide resolved
running = true;

while (true) {
Expand Down Expand Up @@ -1291,9 +1303,19 @@ struct server_queue {
QUE_DBG("%s", "terminate\n");
return;
}
condition_tasks.wait(lock, [&]{
return (!queue_tasks.empty() || !running);
});
const auto pred = [&] {
return (!queue_tasks.empty() || !running);
};
if (standby_timeout > 0) {
if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
lock.release()->unlock(); // unlock the unique_lock, before calling the shutdown_handler, as it tries to lock it
QUE_INF("%s", "stand-by timeout reached\n");
shutdown_handler(::standby_timeout{});
Sumandora marked this conversation as resolved.
Show resolved Hide resolved
break;
}
} else {
condition_tasks.wait(lock, pred);
}
}
}
}
Expand Down Expand Up @@ -2884,7 +2906,6 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
LOG_DBG("response: %s\n", res.body.c_str());
}

std::function<void(int)> shutdown_handler;
std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;

inline void signal_handler(int signal) {
Expand All @@ -2895,7 +2916,7 @@ inline void signal_handler(int signal) {
exit(1);
}

shutdown_handler(signal);
shutdown_handler(termination_signal{ signal });
}

int main(int argc, char ** argv) {
Expand Down Expand Up @@ -3935,13 +3956,13 @@ int main(int argc, char ** argv) {
ctx_server.queue_tasks.on_update_slots(std::bind(
&server_context::update_slots, &ctx_server));

shutdown_handler = [&](int) {
shutdown_handler = [&](shutdown_reason) {
ctx_server.queue_tasks.terminate();
};

LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);

ctx_server.queue_tasks.start_loop();
ctx_server.queue_tasks.start_loop(params.standby_timeout);

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
Expand Down
Loading