Commit
Merge branch 'main' of https://github.com/cgisky1980/ai00_rwkv_server
Showing 4 changed files with 8 additions and 64 deletions.
@@ -19,3 +19,4 @@ qdrant/
 .qdrant-initialized
 run.bat
 node_modules/
+.vscode/
This file was deleted.
This file was deleted.
@@ -1,8 +1,8 @@
 [model]
-path = "assets/models/RWKV-4-World-CHNtuned-3B-v1-20230625-ctx4096.st" # Path to the model.
-quant = [] # Layers to be quantized.
-token_chunk_size = 32 # Size of token chunk that is inferred at once. For high end GPUs, this could be 64 or 128 (faster).
-head_chunk_size = 8192 # DO NOT modify this if you don't know what you are doing.
-max_runtime_batch = 8 # The maximum batches that can be scheduled for inference at the same time.
-max_batch = 2 # The maximum batches that are cached on GPU.
-embed_layer = 2 # The (reversed) layer number whose output is used as embedding.
+path = "assets/models/RWKV-4-World-0.4B-v1-20230529-ctx4096.st" # Path to the model.
+quant = [] # Layers to be quantized.
+token_chunk_size = 32 # Size of token chunk that is inferred at once. For high end GPUs, this could be 64 or 128 (faster).
+head_chunk_size = 8192 # DO NOT modify this if you don't know what you are doing.
+max_runtime_batch = 8 # The maximum batches that can be scheduled for inference at the same time.
+max_batch = 2 # The maximum batches that are cached on GPU.
+embed_layer = 2 # The (reversed) layer number whose output is used as embedding.
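The only substantive change in this hunk is the default model path, which switches from the 3B CHNtuned checkpoint to the smaller 0.4B World checkpoint; every other key keeps its value. For readability, here is a sketch of the resulting [model] section as a standalone TOML fragment, reconstructed from the + side of the diff (the containing file's name is not shown in this view):

[model]
# Path to the model (switched to the 0.4B checkpoint in this merge).
path = "assets/models/RWKV-4-World-0.4B-v1-20230529-ctx4096.st"
# Layers to be quantized; an empty list means no layers are quantized.
quant = []
# Size of the token chunk inferred at once; 64 or 128 may be faster on high-end GPUs.
token_chunk_size = 32
# DO NOT modify this if you don't know what you are doing.
head_chunk_size = 8192
# Maximum batches that can be scheduled for inference at the same time.
max_runtime_batch = 8
# Maximum batches that are cached on GPU.
max_batch = 2
# The (reversed) layer number whose output is used as embedding.
embed_layer = 2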