
Commit f208ba6

Fix HF_HUB_OFFLINE=1 for Gaudi backend (#3193)

* Fix `HF_HUB_OFFLINE=1` for Gaudi backend
* Fix HF cache default value in server.rs
* Format

1 parent 7253be3 · commit f208ba6
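Note (not part of the commit): `HF_HUB_OFFLINE=1` only helps if the model snapshot is already present in the local Hugging Face cache. A minimal pre-flight sketch, assuming `huggingface_hub` is installed and using an example repo id:

from huggingface_hub import snapshot_download

model_id = "meta-llama/Llama-3.1-8B-Instruct"  # example repo id, not from this commit

# local_files_only=True resolves the snapshot from the local cache and fails fast
# instead of reaching out to the Hub, which is what offline serving needs.
local_path = snapshot_download(
    model_id,
    allow_patterns=["*.safetensors", "*.json", "*.model"],
    local_files_only=True,
)
print(f"Weights cached at: {local_path}")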

File tree

4 files changed: +33 -20 lines changed


backends/gaudi/Makefile

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ PYTORCH_VERSION := 2.6.0
 .PHONY: image run-local-dev-container install-dependencies install-server install-router install-launcher local-dev-install

 image:
-	docker build -t tgi-gaudi -f ${root_dir}/Dockerfile_gaudi ${root_dir} --build-arg HABANA_VERSION=$(HABANA_VERSION) --build-arg PYTORCH_VERSION=$(PYTORCH_VERSION)
+	docker build --ulimit nofile=4096 -t tgi-gaudi -f ${root_dir}/Dockerfile_gaudi ${root_dir} --build-arg HABANA_VERSION=$(HABANA_VERSION) --build-arg PYTORCH_VERSION=$(PYTORCH_VERSION)

 run-local-dev-container:
 	docker run -it \

backends/gaudi/server/text_generation_server/models/causal_lm.py

Lines changed: 16 additions & 9 deletions

@@ -4,6 +4,7 @@
 from dataclasses import dataclass
 from functools import wraps
 import itertools
+import json
 import math
 import os
 import tempfile
@@ -17,15 +18,12 @@
 from opentelemetry import trace

 import text_generation_server.habana_quantization_env as hq_env
+from text_generation_server.utils import weight_files
 import habana_frameworks.torch as htorch
 from optimum.habana.utils import HabanaProfile
 from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES
 from text_generation_server.utils.chunks import concat_text_chunks
-from optimum.habana.checkpoint_utils import (
-    get_repo_root,
-    model_on_meta,
-    write_checkpoints_json,
-)
+from optimum.habana.checkpoint_utils import model_on_meta
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
@@ -708,15 +706,16 @@ def __init__(
         if hq_env.is_quantization_enabled:
             htorch.core.hpu_set_env()

+        # Get weight files
+        weight_files(model_id, revision=revision, extension=".safetensors")
+
         if world_size > 1:
             os.environ.setdefault(
                 "DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API", "1"
             )
             model = self.get_deepspeed_model(model_id, dtype, revision)
             model = hq_env.prepare_model_for_quantization(model)
         else:
-            get_repo_root(model_id)
-
             # Check support for rope scaling
             model_kwargs = {}
             config = AutoConfig.from_pretrained(model_id)
@@ -868,7 +867,6 @@ def get_deepspeed_model(
             with deepspeed.OnDevice(dtype=dtype, device="meta"):
                 model = AutoModelForCausalLM.from_config(config, torch_dtype=dtype)
         else:
-            get_repo_root(model_id, local_rank=os.getenv("LOCAL_RANK"))
             # TODO: revisit placement on CPU when auto-injection is possible
             with deepspeed.OnDevice(dtype=dtype, device="cpu"):
                 model = AutoModelForCausalLM.from_pretrained(
@@ -884,7 +882,16 @@ def get_deepspeed_model(
         if load_to_meta:
             # model loaded to meta is managed differently
             checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w")
-            write_checkpoints_json(model_id, local_rank, checkpoints_json)
+            checkpoint_files = [
+                str(f)
+                for f in weight_files(
+                    model_id, revision=revision, extension=".safetensors"
+                )
+            ]
+            data = {"type": "ds_model", "checkpoints": checkpoint_files, "version": 1.0}
+            json.dump(data, checkpoints_json)
+            checkpoints_json.flush()
+
             ds_inference_kwargs["checkpoint"] = checkpoints_json.name
         model = deepspeed.init_inference(model, **ds_inference_kwargs)
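The hunk above drops the `get_repo_root` / `write_checkpoints_json` helpers, which can reach out to the Hub, and instead resolves already-downloaded shards with `weight_files` and writes the DeepSpeed checkpoint manifest directly. A standalone sketch of that manifest step, assuming the safetensors shards are already cached locally and using an example repo id:

import json
import tempfile

from text_generation_server.utils import weight_files

model_id = "bigscience/bloom-560m"  # example repo id, not from this commit

# weight_files resolves the cached *.safetensors shards without network access.
checkpoint_files = [
    str(f) for f in weight_files(model_id, revision=None, extension=".safetensors")
]

# DeepSpeed receives the shard list through a small JSON manifest on disk;
# its path is later passed as ds_inference_kwargs["checkpoint"].
checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w")
json.dump(
    {"type": "ds_model", "checkpoints": checkpoint_files, "version": 1.0},
    checkpoints_json,
)
checkpoints_json.flush()
print(checkpoints_json.name, len(checkpoint_files))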

backends/gaudi/server/text_generation_server/models/vlm_causal_lm.py

Lines changed: 15 additions & 9 deletions

@@ -1,3 +1,4 @@
+import json
 import re
 import torch
 import os
@@ -12,6 +13,7 @@
 import copy
 from text_generation_server.models import Model
 from transformers import PreTrainedTokenizerBase
+from text_generation_server.utils import weight_files
 from text_generation_server.utils.tokens import batch_top_tokens
 from text_generation_server.pb import generate_pb2
 from text_generation_server.models.causal_lm import (
@@ -43,11 +45,7 @@
     AutoTokenizer,
     AutoConfig,
 )
-from optimum.habana.checkpoint_utils import (
-    get_repo_root,
-    model_on_meta,
-    write_checkpoints_json,
-)
+from optimum.habana.checkpoint_utils import model_on_meta

 from text_generation_server.utils.speculate import get_speculate
 from text_generation_server.models.types import (
@@ -840,15 +838,16 @@ def __init__(
         if hq_env.is_quantization_enabled:
             htorch.core.hpu_set_env()

+        # Get weight files
+        weight_files(model_id, revision=revision, extension=".safetensors")
+
         if world_size > 1:
             os.environ.setdefault(
                 "DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API", "1"
             )
             model = self.get_deepspeed_model(model_class, model_id, dtype, revision)
             model = hq_env.prepare_model_for_quantization(model)
         else:
-            get_repo_root(model_id)
-
             # Check support for rope scaling
             model_kwargs = {}
             config = AutoConfig.from_pretrained(model_id)
@@ -1000,7 +999,6 @@ def get_deepspeed_model(
             with deepspeed.OnDevice(dtype=dtype, device="meta"):
                 model = model_class.from_config(config, torch_dtype=dtype)
         else:
-            get_repo_root(model_id, local_rank=os.getenv("LOCAL_RANK"))
             # TODO: revisit placement on CPU when auto-injection is possible
             with deepspeed.OnDevice(dtype=dtype, device="cpu"):
                 model = model_class.from_pretrained(
@@ -1019,7 +1017,15 @@ def get_deepspeed_model(
         if load_to_meta:
             # model loaded to meta is managed differently
             checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w")
-            write_checkpoints_json(model_id, local_rank, checkpoints_json)
+            checkpoint_files = [
+                str(f)
+                for f in weight_files(
+                    model_id, revision=revision, extension=".safetensors"
+                )
+            ]
+            data = {"type": "ds_model", "checkpoints": checkpoint_files, "version": 1.0}
+            json.dump(data, checkpoints_json)
+            checkpoints_json.flush()
             ds_inference_kwargs["checkpoint"] = checkpoints_json.name
         model = deepspeed.init_inference(model, **ds_inference_kwargs)

router/src/server.rs

Lines changed: 1 addition & 1 deletion

@@ -1578,7 +1578,7 @@ pub async fn run(
         let cache = std::env::var("HUGGINGFACE_HUB_CACHE")
             .map_err(|_| ())
             .map(|cache_dir| Cache::new(cache_dir.into()))
-            .unwrap_or_else(|_| Cache::default());
+            .unwrap_or_else(|_| Cache::from_env());
         tracing::warn!("Offline mode active using cache defaults");
         Type::Cache(cache)
     } else {
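For context (an interpretation, not stated in the diff): `Cache::default()` uses a fixed default location, while `Cache::from_env()` from the hf-hub crate picks the cache directory up from the environment, so the router's offline path should land on the same cache the Python side populated via `HF_HOME` / `HUGGINGFACE_HUB_CACHE`. A small Python-side check of how that location is resolved in recent `huggingface_hub` versions, with an example path:

import os

os.environ["HF_HOME"] = "/data/hf-home"  # example path, not from this commit

# huggingface_hub resolves its cache constants at import time, so set the
# variable first; the hub cache then defaults to $HF_HOME/hub.
from huggingface_hub.constants import HF_HUB_CACHE

print(HF_HUB_CACHE)  # e.g. /data/hf-home/hub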
