
Commit a9beab0

rebased and added other profile
1 parent: 72e9814

6 files changed: +12 −33 lines

Cargo.toml

Lines changed: 6 additions & 1 deletion
@@ -21,7 +21,12 @@ hf-hub = { version = "0.3.1", features = ["tokio"] }
 [profile.release]
 debug = 1
 incremental = true
+panic = "abort"
+
+[profile.release-opt]
+inherits = "release"
+debug = 0
+incremental = false
 lto = "fat"
 opt-level = 3
 codegen-units = 1
-panic = "abort"
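For context, `inherits = "release"` means `release-opt` starts from the `[profile.release]` values and overrides only the keys it lists, so the plain release profile becomes a quick, debug-friendly build while the heavy optimizations move to the new profile. The two profiles after this change resolve roughly to the following (a sketch assembled from the hunk above, not a separate file in the repo):

# Effective profiles implied by the hunk above (sketch, not a file in the repo)

[profile.release]          # default `cargo build --release`: fast to compile
debug = 1
incremental = true
panic = "abort"

[profile.release-opt]      # `cargo build --profile release-opt`: fully optimized
inherits = "release"       # picks up panic = "abort" from release
debug = 0
incremental = false
lto = "fat"
opt-level = 3
codegen-units = 1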

Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ COPY proto proto
 COPY benchmark benchmark
 COPY router router
 COPY launcher launcher
-RUN cargo build --release
+RUN cargo build --profile release-opt
 
 # Python builder
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
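One knock-on effect worth noting: Cargo writes a named profile's artifacts to target/<profile>/, so any later stage that copies binaries out of this builder has to point at target/release-opt/ instead of target/release/. A sketch of what that looks like (the binary name and destination are illustrative, not taken from this diff):

RUN cargo build --profile release-opt

# In a later stage, copy from the profile-named output directory.
# Binary name and destination path below are illustrative only.
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router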

Dockerfile_amd

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ COPY proto proto
 COPY benchmark benchmark
 COPY router router
 COPY launcher launcher
-RUN cargo build --release
+RUN cargo build --profile release-opt
 
 # Text Generation Inference base image for RoCm
 FROM rocm/dev-ubuntu-22.04:6.1.1_hip_update as base

Dockerfile_intel

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ COPY proto proto
 COPY benchmark benchmark
 COPY router router
 COPY launcher launcher
-RUN cargo build --release
+RUN cargo build --profile release-opt
 
 
 # Text Generation Inference base image for Intel

server/text_generation_server/models/flash_causal_lm.py

Lines changed: 3 additions & 4 deletions
@@ -766,8 +766,7 @@ def init_kv_cache(
         device: torch.device,
     ):
         self.kv_cache = []
-        if IS_CUDA_SYSTEM:
-            torch.cuda.empty_cache()
+        empty_cache()
 
         element_size = torch.tensor([], dtype=dtype).element_size()
         if SYSTEM == "xpu":
@@ -960,7 +959,6 @@ def tunableop_warmup(self, seqlen: int):
         input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device)
         position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device)
         slots = torch.arange(seqlen, dtype=torch.int64, device=self.device)
-        kv_cache = get_cache_manager().kv_cache
 
         # Dummy value, some models (starcoder2) don't accept `None`.
         input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device)
@@ -972,12 +970,13 @@ def tunableop_warmup(self, seqlen: int):
             cu_seqlen_prefill=torch.tensor(
                 [0, seqlen], device=self.device, dtype=torch.int32
             ),
-            kv_cache=get_cache_manager().kv_cache,
+            kv_cache=self.kv_cache,
             block_tables=None,
             input_lengths=input_lengths,
            slots=slots,
             max_s=seqlen,
             lm_head_indices=None,
+            prefill_cache_indices=None,
         )
 
     def forward(
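The hunk only shows the call site swapping the CUDA-only branch for a generic empty_cache(); where that helper lives and how the SYSTEM string is detected are not part of this diff. A minimal sketch of such a dispatcher, assuming it keys off the same SYSTEM value used in the context lines above:

# Hypothetical sketch of a hardware-agnostic empty_cache() helper; names and
# module location are assumptions, only the call site appears in this commit.
import torch

SYSTEM = "cuda"  # assumed to be detected elsewhere: "cuda", "rocm", "xpu", or "cpu"


def empty_cache():
    # Release cached allocator blocks on whichever accelerator is active,
    # instead of hard-coding torch.cuda.empty_cache() at every call site.
    if SYSTEM in ("cuda", "rocm"):
        # ROCm builds of PyTorch reuse the torch.cuda namespace.
        torch.cuda.empty_cache()
    elif SYSTEM == "xpu":
        # Assumes an XPU-enabled PyTorch build (e.g. intel_extension_for_pytorch).
        torch.xpu.empty_cache()
    # Plain CPU has no allocator cache to drop.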

server/text_generation_server/models/flash_mistral.py

Lines changed: 0 additions & 25 deletions
@@ -98,31 +98,6 @@ def get_layer_config(self, model) -> Tuple[int, int, int]:
             model.model.head_size,
         )
 
-    def tunableop_warmup(self, seqlen: int):
-        input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device)
-        position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device)
-        slots = torch.arange(seqlen, dtype=torch.int64, device=self.device)
-        kv_cache = get_cache_manager().kv_cache
-
-        # Dummy value, some models (starcoder2) don't accept `None`.
-        input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device)
-
-        # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
-        self.model.forward(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            cu_seqlen_prefill=torch.tensor(
-                [0, seqlen], device=self.device, dtype=torch.int32
-            ),
-            kv_cache=self.kv_cache,
-            block_tables=None,
-            input_lengths=input_lengths,
-            slots=slots,
-            max_s=seqlen,
-            lm_head_indices=None,
-            prefill_cache_indices=None,
-        )
-
 
 class FlashMistral(BaseFlashMistral):
     def __init__(
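With this override deleted, Mistral-style models fall back to the tunableop_warmup updated in flash_causal_lm.py above. A sketch of the resulting relationship (class bodies elided; that BaseFlashMistral subclasses FlashCausalLM is implied by the deletion rather than shown in the hunk):

# Sketch only: bodies elided, inheritance assumed from the context of this diff.
class FlashCausalLM:
    def tunableop_warmup(self, seqlen: int):
        # the version updated above: uses self.kv_cache and
        # passes prefill_cache_indices=None
        ...


class BaseFlashMistral(FlashCausalLM):
    # no tunableop_warmup override any more; the inherited one is used
    ...


class FlashMistral(BaseFlashMistral):
    ...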
