Skip to content

Commit

Permalink
Improve management of multiple slots
Browse files Browse the repository at this point in the history
The server now does a better job of picking the most appropriate slot
when servicing multiple independent completion sessions.
  • Loading branch information
jart committed Dec 14, 2024
1 parent 38677b5 commit a8fd4d2
Show file tree
Hide file tree
Showing 11 changed files with 201 additions and 36 deletions.
39 changes: 32 additions & 7 deletions llamafile/flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,15 @@ const char *FLAG_prompt = nullptr;
const char *FLAG_url_prefix = "";
const char *FLAG_www_root = "/zip/www";
double FLAG_token_rate = 1;
float FLAG_decay_growth = .01;
float FLAG_frequency_penalty = 0;
float FLAG_presence_penalty = 0;
float FLAG_reserve_tokens = .15;
float FLAG_temperature = .8;
float FLAG_top_p = .95;
int FLAG_batch = 256;
int FLAG_ctx_size = 8192;
int FLAG_decay_delay = 60 * 5;
int FLAG_flash_attn = false;
int FLAG_gpu = 0;
int FLAG_http_ibuf_size = 5 * 1024 * 1024;
Expand Down Expand Up @@ -396,13 +398,6 @@ void llamafile_get_flags(int argc, char **argv) {
continue;
}

if (!strcmp(flag, "-s") || !strcmp(flag, "--slots")) {
if (i == argc)
missing("--slots");
FLAG_slots = atoi(argv[i++]);
continue;
}

if (!strcmp(flag, "-m") || !strcmp(flag, "--model")) {
if (i == argc)
missing("--model");
Expand Down Expand Up @@ -482,6 +477,36 @@ void llamafile_get_flags(int argc, char **argv) {
continue;
}

//////////////////////////////////////////////////////////////////////
// resource management flags

if (!strcmp(flag, "-s") || !strcmp(flag, "--slots")) {
if (i == argc)
missing("--slots");
FLAG_slots = atoi(argv[i++]);
continue;
}

if (!strcmp(flag, "--decay-delay")) {
if (i == argc)
missing("--decay-delay");
int n = atoi(argv[i++]);
if (!(0 <= n && n <= 31536000))
error("--decay-delay INT must be between 1 and 31536000");
FLAG_decay_delay = n;
continue;
}

if (!strcmp(flag, "--decay-growth")) {
if (i == argc)
missing("--decay-growth");
float n = atof(argv[i++]);
if (!(isnormal(n) && n > 0))
error("--decay-growth FLOAT must be greater than 0");
FLAG_decay_growth = n;
continue;
}

//////////////////////////////////////////////////////////////////////
// cpu flags

Expand Down
4 changes: 3 additions & 1 deletion llamafile/llamafile.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,15 @@ extern const char *FLAG_prompt;
extern const char *FLAG_url_prefix;
extern const char *FLAG_www_root;
extern double FLAG_token_rate;
extern float FLAG_decay_growth;
extern float FLAG_frequency_penalty;
extern float FLAG_presence_penalty;
extern float FLAG_reserve_tokens;
extern float FLAG_temperature;
extern float FLAG_top_p;
extern int FLAG_batch;
extern int FLAG_ctx_size;
extern int FLAG_decay_delay;
extern int FLAG_flash_attn;
extern int FLAG_gpu;
extern int FLAG_gpu;
Expand All @@ -49,7 +52,6 @@ extern int FLAG_http_obuf_size;
extern int FLAG_keepalive;
extern int FLAG_main_gpu;
extern int FLAG_n_gpu_layers;
extern float FLAG_reserve_tokens;
extern int FLAG_slots;
extern int FLAG_split_mode;
extern int FLAG_threads;
Expand Down
20 changes: 17 additions & 3 deletions llamafile/server/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,18 @@ Client::transport()
}
}

if (effective_ip_ != client_ip_) {
char name[17];
snprintf(name,
sizeof(name),
"%hhu.%hhu.%hhu.%hhu",
effective_ip_ >> 24,
effective_ip_ >> 16,
effective_ip_ >> 8,
effective_ip_);
set_thread_name(name);
}

if (get_header("X-Priority") == "batch") {
worker_->deprioritize();
} else if (!effective_ip_trusted_) {
Expand Down Expand Up @@ -661,9 +673,10 @@ Client::dispatcher()
}

// get request-uri path
char method[9] = { 0 };
std::string_view p1 = path();
if (FLAG_verbose >= 2)
SLOG("request path %.*s", (int)p1.size(), p1.data());
WRITE64LE(method, msg_.method);
SLOG("%s %.*s", method, (int)p1.size(), p1.data());
if (!p1.starts_with(FLAG_url_prefix)) {
SLOG("path prefix mismatch");
return send_error(404);
Expand Down Expand Up @@ -779,7 +792,8 @@ Client::dispatcher()
return false;
}
}
SLOG("served %s", resolved_.c_str());
if (FLAG_verbose >= 1)
SLOG("served %s", resolved_.c_str());
cleanup();
return true;
}
Expand Down
41 changes: 38 additions & 3 deletions llamafile/server/main.1
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,26 @@ Specifies path of sqlite3 database.
.Pp
The default is
.Pa ~/.llamafile/llamafile.sqlite3
.It Fl ngl Ar N , Fl Fl gpu-layers Ar N , Fl Fl n-gpu-layers Ar N
Specifies number of layers to offload to GPU.
.Pp
This flag must be passed in order to use GPU on systems with NVIDIA or
AMD GPUs. If you're confident that you have enough VRAM, then you can
pass
.Fl ngl Ar 999
to enable full offloading, since this number is automatically down-tuned
to the number of layers the model has. If VRAM is limited, then
the
.Fl Fl verbose
flag may be passed to learn how many layers the model has, e.g. 35,
which can then be down-tuned until the out of memory error goes away.
.Pp
On Apple Silicon systems with Metal, GPU offloading is enabled by
default. Since these GPUs use unified memory, they're treated as having
a single layer; therefore, any value higher than 1 is treated as
1. You can pass
.Fl ngl Ar 0
to disable GPU offloading and run in CPU mode on Apple Metal systems.
.It Fl l Ar HOSTPORT , Fl Fl listen Ar HOSTPORT
Specifies the local [HOST:]PORT on which the HTTP server should listen.
By default this is 0.0.0.0:8080 which means llamafiler will bind to port
Expand All @@ -58,6 +78,16 @@ resources, and control how much completion parallelism can happen.
Please note that
.Fl Fl ctx-size
has a strong influence on how many slots can be created.
.It Fl Fl decay-delay Ar INT
Number of seconds a context window slot needs to be inactive before the
system starts to strongly consider giving it to other clients. The
default is 300, which is five minutes.
.It Fl Fl decay-growth Ar FLOAT
Sets the slot decay growth factor. Context window slots are assigned in
a least recently used fashion, based on the formula
.EQ
age + e sup {growth * (age - delay)}
.EN
.It Fl p Ar TEXT , Fl Fl prompt Ar TEXT , Fl Fl system-prompt Ar TEXT
Specifies system prompt. This value is passed along to the web frontend.
.It Fl Fl no-display-prompt
Expand All @@ -69,6 +99,11 @@ Specifies a URL prefix (subdirectory) under which the HTTP server will
make the API accessible, e.g. /lamafiler. Useful when running llamafiler
behind a reverse proxy such as NGINX or Redbean. By default, this is set
to / (root).
.It Fl Fl verbose
Enable logging of diagnostic information. This flag is useful for
learning more about the model and hardware. It can also be helpful for
troubleshooting errors. We currently recommend that this flag be avoided
in production since the llama.cpp logger may disrupt thread cancelation.
.It Fl w Ar N , Fl Fl workers Ar N
Number of HTTP client handling threads.
.It Fl Fl trust Ar CIDR
Expand Down Expand Up @@ -161,22 +196,22 @@ models do. If it's a base model, then the web ui will automatically use
completion mode only, without needing to specify this flag. This flag is
useful in cases where a prompt template is defined by the gguf, but it
is desirable for the chat interface to be disabled.
.It Fl Fl db-startup-sql
.It Fl Fl db-startup-sql Ar CODE
Specifies SQL code that should be executed whenever connecting to the
SQLite database. The default is the following code, which enables the
write-ahead log.
.Bd -literal -offset indent
PRAGMA journal_mode=WAL;
PRAGMA synchronous=NORMAL;
.Ed
.It Fl Fl reserve-tokens
.It Fl Fl reserve-tokens Ar N
Percent of context window to reserve for predicted tokens. When the
server runs out of context window, old chat messages will be forgotten
until this percent of the context is empty. The default is 15%. If this
is specified as a floating point number, e.g. 0.15, then it'll be
multiplied by 100 to get the percent.
.El
.Sh EXAMPLE
.Sh EXAMPLES
Here's an example of how you might start this server:
.Pp
.Dl "llamafiler -m all-MiniLM-L6-v2.F32.gguf"
Expand Down
40 changes: 37 additions & 3 deletions llamafile/server/main.1.asc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,23 @@ OOPPTTIIOONNSS

The default is _~_/_._l_l_a_m_a_f_i_l_e_/_l_l_a_m_a_f_i_l_e_._s_q_l_i_t_e_3

--nnggll _N, ----ggppuu--llaayyeerrss _N, ----nn--ggppuu--llaayyeerrss _N
Specifies number of layers to offload to GPU.

This flag must be passed in order to use GPU on systems with NVIDIA
or AMD GPUs. If you're confident that you have enough VRAM, then
you can pass --nnggll _9_9_9 to enable full offloading, since this number
is automatically down-tuned to the number of layers the model has.
If VRAM is limited, then the ----vveerrbboossee flag may be
passed to learn how many layers the model has, e.g. 35, which can
then be down-tuned until the out of memory error goes away.

On Apple Silicon systems with Metal, GPU offloading is enabled by
default. Since these GPUs use unified memory, they're treated as
having a single layer; therefore, using values higher than 1 will
be treated as 1. You can pass --nnggll _0 to disable GPU offloading and
run in CPU mode on Apple Metal systems.

--ll _H_O_S_T_P_O_R_T, ----lliisstteenn _H_O_S_T_P_O_R_T
Specifies the local [HOST:]PORT on which the HTTP server should
listen. By default this is 0.0.0.0:8080 which means llamafiler
Expand All @@ -63,6 +80,16 @@ OOPPTTIIOONNSS
parallelism can happen. Please note that ----ccttxx--ssiizzee has a strong
influence on how many slots can be created.

----ddeeccaayy--ddeellaayy _I_N_T
Number of seconds a context window slot needs to be inactive before
the system starts to strongly consider giving it to other clients.
The default is 300, which is five minutes.

----ddeeccaayy--ggrroowwtthh _F_L_O_A_T
Sets slot decay growth factor. Context window slots are assigned in
a least recently used fashion, based on the formula _a_g_e + _e^(_g_r_o_w_t_h
* (_a_g_e - _d_e_l_a_y))

--pp _T_E_X_T, ----pprroommpptt _T_E_X_T, ----ssyysstteemm--pprroommpptt _T_E_X_T
Specifies system prompt. This value is passed along to the web
frontend.
Expand All @@ -79,6 +106,13 @@ OOPPTTIIOONNSS
llamafiler behind a reverse proxy such as NGINX or Redbean. By
default, this is set to / (root).

----vveerrbboossee
Enable logging of diagnostic information. This flag is useful for
learning more about the model and hardware. It can also be helpful
for troubleshooting errors. We currently recommend that this flag
be avoided in production since the llama.cpp logger may disrupt
thread cancelation.

--ww _N, ----wwoorrkkeerrss _N
Number of HTTP client handling threads.

Expand Down Expand Up @@ -193,22 +227,22 @@ OOPPTTIIOONNSS
defined by the gguf, but it is desirable for the chat interface to
be disabled.

----ddbb--ssttaarrttuupp--ssqqll
----ddbb--ssttaarrttuupp--ssqqll _C_O_D_E
Specifies SQL code that should be executed whenever connecting to
the SQLite database. The default is the following code, which
enables the write-ahead log.

PRAGMA journal_mode=WAL;
PRAGMA synchronous=NORMAL;

----rreesseerrvvee--ttookkeennss
----rreesseerrvvee--ttookkeennss _N
Percent of context window to reserve for predicted tokens. When the
server runs out of context window, old chat messages will be
forgotten until this percent of the context is empty. The default
is 15%. If this is specified as a floating point number, e.g. 0.15,
then it'll be multiplied by 100 to get the percent.
EEXXAAMMPPLLEE
EEXXAAMMPPLLEESS
Here's an example of how you might start this server:

llamafiler -m all-MiniLM-L6-v2.F32.gguf
Expand Down
3 changes: 2 additions & 1 deletion llamafile/server/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ main(int argc, char* argv[])

// we must disable the llama.cpp logger
// otherwise pthread_cancel() will cause deadlocks
FLAG_log_disable = true;
if (!llamafile_has(argv, "--verbose"))
FLAG_log_disable = true;

// load model
llama_model_params mparams = {
Expand Down
19 changes: 12 additions & 7 deletions llamafile/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,19 @@ Server::accept(unsigned* out_ip)

// set name
char name[17];
int port = ntohs(clientaddr.sin_port);
unsigned ip = ntohl(clientaddr.sin_addr.s_addr);
snprintf(name,
sizeof(name),
"%hhu.%hhu.%hhu.%hhu",
ip >> 24,
ip >> 16,
ip >> 8,
ip);
if (ip == 0x7f000001) {
snprintf(name, sizeof(name), "%hu", port);
} else {
snprintf(name,
sizeof(name),
"%hhu.%hhu.%hhu.%hhu",
ip >> 24,
ip >> 16,
ip >> 8,
ip);
}
set_thread_name(name);

// keep sockets open
Expand Down
3 changes: 2 additions & 1 deletion llamafile/server/slot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,10 @@ Slot::describe_error(int err)
}
}

Slot::Slot(llama_model* model) : model_(model)
// Constructs a context window slot with the given identifier.
//
// dll_init() presumably puts the slot's intrusive list link into a
// detached (self-linked) state before it is added to any slot list —
// TODO confirm against dll_init's definition. The last-used timestamp
// is set to the current time so a freshly created slot counts as just
// touched by the least-recently-used slot selection described in the
// server documentation (age + e^(growth * (age - delay))).
Slot::Slot(int id, llama_model* model) : id_(id), model_(model)
{
dll_init(&elem_);
last_used_ = time(0);
}

Slot::~Slot()
Expand Down
5 changes: 4 additions & 1 deletion llamafile/server/slot.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#pragma once
#include <cosmo.h>
#include <ctime>
#include <functional>
#include <string>
#include <vector>
Expand Down Expand Up @@ -49,15 +50,17 @@ struct Slot

static const char* describe_error(int);

int id_;
Dll elem_;
time_t last_used_;
llama_model* model_;
clip_ctx* clip_ctx_ = nullptr;
llama_context* ctx_ = nullptr;
std::vector<Atom> history_;
std::string system_fingerprint_;

~Slot();
explicit Slot(llama_model*);
Slot(int, llama_model*);
int ctx_size() const;
int ctx_used() const;
bool start();
Expand Down
Loading

0 comments on commit a8fd4d2

Please sign in to comment.