Skip to content

Commit

Permalink
Improve management of multiple slots
Browse files Browse the repository at this point in the history
The server now does a better job of picking the most appropriate slot
when servicing multiple independent completion sessions.
  • Loading branch information
jart committed Dec 14, 2024
1 parent 38677b5 commit a8fd4d2
Show file tree
Hide file tree
Showing 11 changed files with 201 additions and 36 deletions.
39 changes: 32 additions & 7 deletions llamafile/flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,15 @@ const char *FLAG_prompt = nullptr;
const char *FLAG_url_prefix = "";
const char *FLAG_www_root = "/zip/www";
double FLAG_token_rate = 1;
float FLAG_decay_growth = .01;
float FLAG_frequency_penalty = 0;
float FLAG_presence_penalty = 0;
float FLAG_reserve_tokens = .15;
float FLAG_temperature = .8;
float FLAG_top_p = .95;
int FLAG_batch = 256;
int FLAG_ctx_size = 8192;
int FLAG_decay_delay = 60 * 5;
int FLAG_flash_attn = false;
int FLAG_gpu = 0;
int FLAG_http_ibuf_size = 5 * 1024 * 1024;
Expand Down Expand Up @@ -396,13 +398,6 @@ void llamafile_get_flags(int argc, char **argv) {
continue;
}

if (!strcmp(flag, "-s") || !strcmp(flag, "--slots")) {
if (i == argc)
missing("--slots");
FLAG_slots = atoi(argv[i++]);
continue;
}

if (!strcmp(flag, "-m") || !strcmp(flag, "--model")) {
if (i == argc)
missing("--model");
Expand Down Expand Up @@ -482,6 +477,36 @@ void llamafile_get_flags(int argc, char **argv) {
continue;
}

//////////////////////////////////////////////////////////////////////
// resource management flags

if (!strcmp(flag, "-s") || !strcmp(flag, "--slots")) {
if (i == argc)
missing("--slots");
FLAG_slots = atoi(argv[i++]);
continue;
}

if (!strcmp(flag, "--decay-delay")) {
if (i == argc)
missing("--decay-delay");
int n = atoi(argv[i++]);
if (!(0 <= n && n <= 31536000))
error("--decay-delay INT must be between 1 and 31536000");
FLAG_decay_delay = n;
continue;
}

if (!strcmp(flag, "--decay-growth")) {
if (i == argc)
missing("--decay-growth");
float n = atof(argv[i++]);
if (!(isnormal(n) && n > 0))
error("--decay-growth FLOAT must be greater than 0");
FLAG_decay_growth = n;
continue;
}

//////////////////////////////////////////////////////////////////////
// cpu flags

Expand Down
4 changes: 3 additions & 1 deletion llamafile/llamafile.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,15 @@ extern const char *FLAG_prompt;
extern const char *FLAG_url_prefix;
extern const char *FLAG_www_root;
extern double FLAG_token_rate;
extern float FLAG_decay_growth;
extern float FLAG_frequency_penalty;
extern float FLAG_presence_penalty;
extern float FLAG_reserve_tokens;
extern float FLAG_temperature;
extern float FLAG_top_p;
extern int FLAG_batch;
extern int FLAG_ctx_size;
extern int FLAG_decay_delay;
extern int FLAG_flash_attn;
extern int FLAG_gpu;
extern int FLAG_gpu;
Expand All @@ -49,7 +52,6 @@ extern int FLAG_http_obuf_size;
extern int FLAG_keepalive;
extern int FLAG_main_gpu;
extern int FLAG_n_gpu_layers;
extern float FLAG_reserve_tokens;
extern int FLAG_slots;
extern int FLAG_split_mode;
extern int FLAG_threads;
Expand Down
20 changes: 17 additions & 3 deletions llamafile/server/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,18 @@ Client::transport()
}
}

if (effective_ip_ != client_ip_) {
char name[17];
snprintf(name,
sizeof(name),
"%hhu.%hhu.%hhu.%hhu",
effective_ip_ >> 24,
effective_ip_ >> 16,
effective_ip_ >> 8,
effective_ip_);
set_thread_name(name);
}

if (get_header("X-Priority") == "batch") {
worker_->deprioritize();
} else if (!effective_ip_trusted_) {
Expand Down Expand Up @@ -661,9 +673,10 @@ Client::dispatcher()
}

// get request-uri path
char method[9] = { 0 };
std::string_view p1 = path();
if (FLAG_verbose >= 2)
SLOG("request path %.*s", (int)p1.size(), p1.data());
WRITE64LE(method, msg_.method);
SLOG("%s %.*s", method, (int)p1.size(), p1.data());
if (!p1.starts_with(FLAG_url_prefix)) {
SLOG("path prefix mismatch");
return send_error(404);
Expand Down Expand Up @@ -779,7 +792,8 @@ Client::dispatcher()
return false;
}
}
SLOG("served %s", resolved_.c_str());
if (FLAG_verbose >= 1)
SLOG("served %s", resolved_.c_str());
cleanup();
return true;
}
Expand Down
41 changes: 38 additions & 3 deletions llamafile/server/main.1
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,26 @@ Specifies path of sqlite3 database.
.Pp
The default is
.Pa ~/.llamafile/llamafile.sqlite3
.It Fl ngl Ar N , Fl Fl gpu-layers Ar N , Fl Fl n-gpu-layers Ar N
Specifies number of layers to offload to GPU.
.Pp
This flag must be passed in order to use GPU on systems with NVIDIA or
AMD GPUs. If you're confident that you have enough VRAM, then you can
pass
.Fl ngl Ar 999
to enable full offloading, since this number is automatically down-tuned
to the number of layers the model has. If VRAM is limited, then
the
.Fl Fl verbose
flag may be passed to learn how many layers the model has, e.g. 35,
which can then be down-tuned until the out of memory error goes away.
.Pp
On Apple Silicon systems with Metal, GPU offloading is enabled by
default. Since these GPUs use unified memory, they're treated as having
a single layer; therefore, any value higher than 1 is treated as
1. You can pass
.Fl ngl Ar 0
to disable GPU offloading and run in CPU mode on Apple Metal systems.
.It Fl l Ar HOSTPORT , Fl Fl listen Ar HOSTPORT
Specifies the local [HOST:]PORT on which the HTTP server should listen.
By default this is 0.0.0.0:8080 which means llamafiler will bind to port
Expand All @@ -58,6 +78,16 @@ resources, and control how much completion parallelism can happen.
Please note that
.Fl Fl ctx-size
has a strong influence on how many slots can be created.
.It Fl Fl decay-delay Ar INT
Number of seconds a context window slot needs to be inactive before the
system starts to strongly consider giving it to other clients. The
default is 300, which is five minutes.
.It Fl Fl decay-growth Ar FLOAT
Sets the slot decay growth factor. Context window slots are assigned in
a least recently used fashion, based on the formula
.EQ
age + e sup {growth * (age - delay)}
.EN
.It Fl p Ar TEXT , Fl Fl prompt Ar TEXT , Fl Fl system-prompt Ar TEXT
Specifies system prompt. This value is passed along to the web frontend.
.It Fl Fl no-display-prompt
Expand All @@ -69,6 +99,11 @@ Specifies a URL prefix (subdirectory) under which the HTTP server will
make the API accessible, e.g. /lamafiler. Useful when running llamafiler
behind a reverse proxy such as NGINX or Redbean. By default, this is set
to / (root).
.It Fl Fl verbose
Enable logging of diagnostic information. This flag is useful for
learning more about the model and hardware. It can also be helpful for
troubleshooting errors. We currently recommend that this flag be avoided
in production since the llama.cpp logger may disrupt thread cancelation.
.It Fl w Ar N , Fl Fl workers Ar N
Number of HTTP client handling threads.
.It Fl Fl trust Ar CIDR
Expand Down Expand Up @@ -161,22 +196,22 @@ models do. If it's a base model, then the web ui will automatically use
completion mode only, without needing to specify this flag. This flag is
useful in cases where a prompt template is defined by the gguf, but it
is desirable for the chat interface to be disabled.
.It Fl Fl db-startup-sql
.It Fl Fl db-startup-sql Ar CODE
Specifies SQL code that should be executed whenever connecting to the
SQLite database. The default is the following code, which enables the
write-ahead log.
.Bd -literal -offset indent
PRAGMA journal_mode=WAL;
PRAGMA synchronous=NORMAL;
.Ed
.It Fl Fl reserve-tokens
.It Fl Fl reserve-tokens Ar N
Percent of context window to reserve for predicted tokens. When the
server runs out of context window, old chat messages will be forgotten
until this percent of the context is empty. The default is 15%. If this
is specified as a floating point number, e.g. 0.15, then it'll be
multiplied by 100 to get the percent.
.El
.Sh EXAMPLE
.Sh EXAMPLES
Here's an example of how you might start this server:
.Pp
.Dl "llamafiler -m all-MiniLM-L6-v2.F32.gguf"
Expand Down
40 changes: 37 additions & 3 deletions llamafile/server/main.1.asc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,23 @@ OOPPTTIIOONNSS

The default is _~_/_._l_l_a_m_a_f_i_l_e_/_l_l_a_m_a_f_i_l_e_._s_q_l_i_t_e_3

--nnggll _N, ----ggppuu--llaayyeerrss _N, ----nn--ggppuu--llaayyeerrss _N
Specifies number of layers to offload to GPU.

This flag must be passed in order to use GPU on systems with NVIDIA
or AMD GPUs. If you're confident that you have enough VRAM, then
you can pass --nnggll _9_9_9 to enable full offloading, since this number
is automatically down-tuned to the number of layers the model has.
If VRAM is limited, then the ----vveerrbboossee flag may be
passed to learn how many layers the model has, e.g. 35, which can
then be down-tuned until the out of memory error goes away.

On Apple Silicon systems with Metal, GPU offloading is enabled by
default. Since these GPUs use unified memory, they're treated as
having a single layer; therefore, using values higher than 1 will
be treated as 1. You can pass --nnggll _0 to disable GPU offloading and
run in CPU mode on Apple Metal systems.

--ll _H_O_S_T_P_O_R_T, ----lliisstteenn _H_O_S_T_P_O_R_T
Specifies the local [HOST:]PORT on which the HTTP server should
listen. By default this is 0.0.0.0:8080 which means llamafiler
Expand All @@ -63,6 +80,16 @@ OOPPTTIIOONNSS
parallelism can happen. Please note that ----ccttxx--ssiizzee has a strong
influence on how many slots can be created.

----ddeeccaayy--ddeellaayy _I_N_T
Number of seconds a context window slot needs to be inactive before
the system starts to strongly consider giving it to other clients.
The default is 300, which is five minutes.

----ddeeccaayy--ggrroowwtthh _F_L_O_A_T
Sets slot decay growth factor. Context window slots are assigned in
a least recently used fashion, based on the formula _a_g_e + _e^(_g_r_o_w_t_h
* (_a_g_e - _d_e_l_a_y))

--pp _T_E_X_T, ----pprroommpptt _T_E_X_T, ----ssyysstteemm--pprroommpptt _T_E_X_T
Specifies system prompt. This value is passed along to the web
frontend.
Expand All @@ -79,6 +106,13 @@ OOPPTTIIOONNSS
llamafiler behind a reverse proxy such as NGINX or Redbean. By
default, this is set to / (root).

----vveerrbboossee
Enable logging of diagnostic information. This flag is useful for
learning more about the model and hardware. It can also be helpful
for troubleshooting errors. We currently recommend that this flag
be avoided in production since the llama.cpp logger may disrupt
thread cancelation.

--ww _N, ----wwoorrkkeerrss _N
Number of HTTP client handling threads.

Expand Down Expand Up @@ -193,22 +227,22 @@ OOPPTTIIOONNSS
defined by the gguf, but it is desirable for the chat interface to
be disabled.

----ddbb--ssttaarrttuupp--ssqqll
----ddbb--ssttaarrttuupp--ssqqll _C_O_D_E
Specifies SQL code that should be executed whenever connecting to
the SQLite database. The default is the following code, which
enables the write-ahead log.

PRAGMA journal_mode=WAL;
PRAGMA synchronous=NORMAL;

----rreesseerrvvee--ttookkeennss
----rreesseerrvvee--ttookkeennss _N
Percent of context window to reserve for predicted tokens. When the
server runs out of context window, old chat messages will be
forgotten until this percent of the context is empty. The default
is 15%. If this is specified as a floating point number, e.g. 0.15,
then it'll be multiplied by 100 to get the percent.
EEXXAAMMPPLLEE
EEXXAAMMPPLLEESS
Here's an example of how you might start this server:

llamafiler -m all-MiniLM-L6-v2.F32.gguf
Expand Down
3 changes: 2 additions & 1 deletion llamafile/server/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ main(int argc, char* argv[])

// we must disable the llama.cpp logger
// otherwise pthread_cancel() will cause deadlocks
FLAG_log_disable = true;
if (!llamafile_has(argv, "--verbose"))
FLAG_log_disable = true;

// load model
llama_model_params mparams = {
Expand Down
19 changes: 12 additions & 7 deletions llamafile/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,19 @@ Server::accept(unsigned* out_ip)

// set name
char name[17];
int port = ntohs(clientaddr.sin_port);
unsigned ip = ntohl(clientaddr.sin_addr.s_addr);
snprintf(name,
sizeof(name),
"%hhu.%hhu.%hhu.%hhu",
ip >> 24,
ip >> 16,
ip >> 8,
ip);
if (ip == 0x7f000001) {
snprintf(name, sizeof(name), "%hu", port);
} else {
snprintf(name,
sizeof(name),
"%hhu.%hhu.%hhu.%hhu",
ip >> 24,
ip >> 16,
ip >> 8,
ip);
}
set_thread_name(name);

// keep sockets open
Expand Down
3 changes: 2 additions & 1 deletion llamafile/server/slot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,10 @@ Slot::describe_error(int err)
}
}

Slot::Slot(llama_model* model) : model_(model)
// Constructs a context window slot with the given identifier.
//
// dll_init() presumably puts the slot's intrusive list link into a
// detached (self-linked) state before it is added to any slot list —
// TODO confirm against dll_init's definition. The last-used timestamp
// is set to the current time so a freshly created slot counts as just
// touched by the least-recently-used slot selection described in the
// server documentation (age + e^(growth * (age - delay))).
Slot::Slot(int id, llama_model* model) : id_(id), model_(model)
{
dll_init(&elem_);
last_used_ = time(0);
}

Slot::~Slot()
Expand Down
5 changes: 4 additions & 1 deletion llamafile/server/slot.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#pragma once
#include <cosmo.h>
#include <ctime>
#include <functional>
#include <string>
#include <vector>
Expand Down Expand Up @@ -49,15 +50,17 @@ struct Slot

static const char* describe_error(int);

int id_;
Dll elem_;
time_t last_used_;
llama_model* model_;
clip_ctx* clip_ctx_ = nullptr;
llama_context* ctx_ = nullptr;
std::vector<Atom> history_;
std::string system_fingerprint_;

~Slot();
explicit Slot(llama_model*);
Slot(int, llama_model*);
int ctx_size() const;
int ctx_used() const;
bool start();
Expand Down
Loading

0 comments on commit a8fd4d2

Please sign in to comment.