From 8233c18d77dc0e9aee5b916d6d3ddac4c1f2546c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 22 Dec 2024 16:20:20 +0200
Subject: [PATCH] llama : arch

---
 src/llama-arch.cpp  | 41 +++++++++++++++++++++
 src/llama-arch.h    | 89 +++------------------------------------------
 src/llama-impl.h    | 16 +-------
 src/llama-mmap.cpp  |  2 +
 src/llama-mmap.h    |  2 -
 src/llama-model.cpp | 43 ++++++++++++++++++++++
 src/llama-model.h   |  2 +
 src/llama-vocab.h   |  2 +-
 src/llama.cpp       | 21 +++++++++--
 9 files changed, 113 insertions(+), 105 deletions(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 7a2a193fd3872..1616039ad8bbe 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -1 +1,42 @@
 #include "llama-arch.h"
+
+#include "llama-impl.h"
+
+LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {}
+
+std::string LLM_KV::operator()(llm_kv kv) const {
+    return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+}
+
+std::string LLM_TN_IMPL::str() const {
+    if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
+        return "__missing__";
+    }
+
+    std::string name = ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid);
+
+    if (suffix != nullptr) {
+        name += ".";
+        name += suffix;
+    }
+
+    return name;
+}
+
+const char * llm_arch_name(llm_arch arch) {
+    auto it = LLM_ARCH_NAMES.find(arch);
+    if (it == LLM_ARCH_NAMES.end()) {
+        return "unknown";
+    }
+    return it->second;
+}
+
+llm_arch llm_arch_from_string(const std::string & name) {
+    for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
+        if (kv.second == name) {
+            return kv.first;
+        }
+    }
+
+    return LLM_ARCH_UNKNOWN;
+}
diff --git a/src/llama-arch.h b/src/llama-arch.h
index e2bdb295dfb0b..a68cbd262e427 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -1,7 +1,5 @@
 #pragma once
 
-#include "llama-impl.h"
-
 #include <map>
 
 //
@@ -375,13 +373,11 @@
 };
 
 struct LLM_KV {
-    LLM_KV(llm_arch arch) : arch(arch) {}
+    LLM_KV(llm_arch arch);
 
     llm_arch arch;
 
-    std::string operator()(llm_kv kv) const {
-        return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
-    }
+    std::string operator()(llm_kv kv) const;
 };
 
 enum llm_tensor {
@@ -1589,16 +1585,6 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "gigachat",          LLM_CHAT_TEMPLATE_GIGACHAT },
 };
 
-static llm_arch llm_arch_from_string(const std::string & name) {
-    for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
-        if (kv.second == name) {
-            return kv.first;
-        }
-    }
-
-    return LLM_ARCH_UNKNOWN;
-}
-
 // helper to handle gguf constants
 // usage:
 //
@@ -1615,20 +1601,7 @@ struct LLM_TN_IMPL {
     const int bid;
     const int xid;
 
-    std::string str() const {
-        if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
-            return "__missing__";
-        }
-
-        std::string name = ::format(LLM_TENSOR_NAMES.at(arch).at(tensor), bid, xid);
-
-        if (suffix != nullptr) {
-            name += ".";
-            name += suffix;
-        }
-
-        return name;
-    }
+    std::string str() const;
 
     operator std::string() const {
         return str();
@@ -1657,58 +1630,6 @@ struct LLM_TN {
     }
 };
 
-//
-// load LLaMA models
-//
-
-static const char * llama_model_arch_name(llm_arch arch) {
-    auto it = LLM_ARCH_NAMES.find(arch);
-    if (it == LLM_ARCH_NAMES.end()) {
-        return "unknown";
-    }
-    return it->second;
-}
-
-static std::string llama_model_ftype_name(llama_ftype ftype) {
-    if (ftype & LLAMA_FTYPE_GUESSED) {
-        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
-    }
-
-    switch (ftype) {
-        case LLAMA_FTYPE_ALL_F32:        return "all F32";
-        case LLAMA_FTYPE_MOSTLY_F16:     return "F16";
-        case LLAMA_FTYPE_MOSTLY_BF16:    return "BF16";
-        case LLAMA_FTYPE_MOSTLY_Q4_0:    return "Q4_0";
-        case LLAMA_FTYPE_MOSTLY_Q4_1:    return "Q4_1";
-        case LLAMA_FTYPE_MOSTLY_Q5_0:    return "Q5_0";
-        case LLAMA_FTYPE_MOSTLY_Q5_1:    return "Q5_1";
-        case LLAMA_FTYPE_MOSTLY_Q8_0:    return "Q8_0";
-        case LLAMA_FTYPE_MOSTLY_Q2_K:    return "Q2_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S:  return "Q2_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_S:  return "Q3_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_M:  return "Q3_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  return "Q3_K - Large";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_S:  return "Q4_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  return "Q4_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_S:  return "Q5_K - Small";
-        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  return "Q5_K - Medium";
-        case LLAMA_FTYPE_MOSTLY_Q6_K:    return "Q6_K";
-        case LLAMA_FTYPE_MOSTLY_TQ1_0:   return "TQ1_0 - 1.69 bpw ternary";
-        case LLAMA_FTYPE_MOSTLY_TQ2_0:   return "TQ2_0 - 2.06 bpw ternary";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  return "IQ2_XS - 2.3125 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_S:   return "IQ2_S - 2.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ2_M:   return "IQ2_M - 2.7 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  return "IQ3_XS - 3.3 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_S:   return "IQ1_S - 1.5625 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ1_M:   return "IQ1_M - 1.75 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  return "IQ4_NL - 4.5 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  return "IQ4_XS - 4.25 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_S:   return "IQ3_S - 3.4375 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_M:   return "IQ3_S mix - 3.66 bpw";
-
-        default: return "unknown, may not work";
-    }
-}
+const char * llm_arch_name(llm_arch arch);
+llm_arch llm_arch_from_string(const std::string & name);
diff --git a/src/llama-impl.h b/src/llama-impl.h
index 7a622f213a790..273897c08fae0 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -24,22 +24,8 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3)
 void llama_log_internal        (ggml_log_level level, const char * format, ...);
 void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
-// TODO: move to source
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
-static std::string format(const char * fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
+std::string format(const char * fmt, ...);
 
 #define LLAMA_LOG(...)       llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
 #define LLAMA_LOG_INFO(...)
 llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 1dcfdcd1896e4..2b9197bb8a84a 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -1 +1,3 @@
 #include "llama-mmap.h"
+
+
diff --git a/src/llama-mmap.h b/src/llama-mmap.h
index f091558e3b05b..a1b50b3ffa328 100644
--- a/src/llama-mmap.h
+++ b/src/llama-mmap.h
@@ -4,8 +4,6 @@
 
 #include "ggml.h"
 
-#include
-
 #ifdef __has_include
     #if __has_include(<unistd.h>)
         #include <unistd.h>
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 2364e7c9561bc..1c563b4c87e51 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1 +1,44 @@
 #include "llama-model.h"
+
+std::string llama_model_ftype_name(llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32:        return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16:     return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16:    return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0:    return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1:    return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0:    return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1:    return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0:    return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:    return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:  return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:  return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:  return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:  return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:  return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:    return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0:   return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0:   return "TQ2_0 - 2.06 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:   return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:   return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:   return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:   return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:   return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:   return "IQ3_S mix - 3.66 bpw";
+
+        default: return "unknown, may not work";
+    }
+}
diff --git a/src/llama-model.h b/src/llama-model.h
index f3bd79aa9715f..bf030e90b64e4 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -648,3 +648,5 @@ static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & b
 
     throw std::runtime_error(format("no suitable buffer type found"));
 }
+
+std::string llama_model_ftype_name(llama_ftype ftype);
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 2943c34804f89..834ad6ab8527a 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "llama-impl.h"
+#include "llama.h"
 
 #include <string>
 #include <vector>
diff --git a/src/llama.cpp b/src/llama.cpp
index 1ab22e6a4ab93..e7ab6f94bda5d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -59,6 +59,21 @@
 // helpers
 //
 
+std::string format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
     size_t start = 0;
@@ -16432,9 +16447,9 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
 
 int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
-            llama_model_arch_name(model->arch),
-            llama_model_type_name(model->type),
-            llama_model_ftype_name(model->ftype).c_str());
+            llm_arch_name(model->arch),                    // TODO: llama_model_arch_name(model)
+            llama_model_type_name(model->type),            // TODO: llama_model_type_name(model)
+            llama_model_ftype_name(model->ftype).c_str()); // TODO: llama_model_ftype_name(model)
 }
 
 uint64_t llama_model_size(const struct llama_model * model) {
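
A quick usage sketch, not part of the commit: after this change, llm_arch_name and llm_arch_from_string (llama-arch.h) and llama_model_ftype_name (llama-model.h) are ordinary out-of-line functions, so any translation unit that links the library can call them without pulling in llama-impl.h. The driver below is hypothetical (main() and the standalone build are assumptions); the function names and enum values come from the diff above.

    // hypothetical driver: assumes it is compiled alongside src/ and linked with libllama
    #include "llama-arch.h"   // llm_arch_name, llm_arch_from_string
    #include "llama-model.h"  // llama_model_ftype_name

    #include <cstdio>

    int main() {
        // string -> enum -> string round trip; unknown names yield LLM_ARCH_UNKNOWN
        const llm_arch arch = llm_arch_from_string("llama");
        std::printf("arch : %s\n", llm_arch_name(arch)); // prints "llama"

        // the GUESSED bit adds a " (guessed)" suffix via the recursive call
        const llama_ftype ftype = (enum llama_ftype) (LLAMA_FTYPE_MOSTLY_Q4_K_M | LLAMA_FTYPE_GUESSED);
        std::printf("ftype: %s\n", llama_model_ftype_name(ftype).c_str()); // "Q4_K - Medium (guessed)"

        return 0;
    }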