Skip to content

Commit

Permalink
Fix quant.
Browse files Browse the repository at this point in the history
  • Loading branch information
cryscan committed Oct 17, 2023
1 parent ca11745 commit a3301ac
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 11 deletions.
2 changes: 1 addition & 1 deletion assets/configs/Config.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[model]
path = "assets/models/RWKV-4-World-0.4B-v1-20230529-ctx4096.st" # Path to the model.
quant = [] # Layers to be quantized.
quant = 0 # Number of layers (from the bottom) to be quantized; 0 disables quantization.
token_chunk_size = 32 # Size of token chunk that is inferred at once. For high end GPUs, this could be 64 or 128 (faster).
head_chunk_size = 8192 # DO NOT modify this if you don't know what you are doing.
max_runtime_batch = 8 # The maximum batches that can be scheduled for inference at the same time.
Expand Down
2 changes: 1 addition & 1 deletion src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ pub struct ModelConfig {
/// Path to the model.
pub path: PathBuf,
/// Specify the number of layers to be quantized.
pub quant: Vec<usize>,
pub quant: usize,
/// Maximum tokens to be processed in parallel at once.
pub token_chunk_size: usize,
/// The chunk size for each split of the head matrix.
Expand Down
13 changes: 4 additions & 9 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ pub struct ReloadRequest {
/// Path to the model.
pub model_path: PathBuf,
/// Specify the number of layers to be quantized.
pub quant: Vec<usize>,
pub quant: usize,
/// Maximum tokens to be processed in parallel at once.
pub token_chunk_size: usize,
/// The chunk size for each split of the head matrix.
Expand Down Expand Up @@ -209,14 +209,9 @@ where
head_chunk_size,
..
} = request;
let quant = if quant.is_empty() {
Quantization::None
} else {
let mut layers = LayerFlags::empty();
quant
.into_iter()
.for_each(|x| layers.insert(LayerFlags::from_layer(x as u64)));
Quantization::Int8(layers)
let quant = match quant {
0 => Quantization::None,
x => Quantization::Int8(LayerFlags::from_bits_retain((1 << x) - 1)),
};

let model: M = ModelBuilder::new(context, data)
Expand Down

0 comments on commit a3301ac

Please sign in to comment.