diff --git a/medarc_rl/slurm_templates/one_node_rl.j2 b/medarc_rl/slurm_templates/one_node_rl.j2
index da06df9..2c34f48 100644
--- a/medarc_rl/slurm_templates/one_node_rl.j2
+++ b/medarc_rl/slurm_templates/one_node_rl.j2
@@ -35,6 +35,29 @@ export HF_HUB_OFFLINE={{ 1 if hf_hub_offline else 0 }}
 export HF_HOME="$HF_CACHE_DIR"
 export MEDARC_SINGLE_GPU={{ 1 if single_gpu else 0 }}
 
+# Resolve the group that owns the HF cache. The cache directory may not exist
+# yet, so walk up to the nearest existing ancestor and use its group.
+_d="$HF_CACHE_DIR"; while [ ! -e "$_d" ]; do _d="$(dirname "$_d")"; done
+HF_CACHE_GROUP="$(stat -c '%G' "$_d")"; unset _d
+
+# Exact, literal membership test: one group name per line, matched whole-line.
+# (grep -qw read the name as a regex and also matched it inside longer names.)
+if ! id -Gn | tr ' ' '\n' | grep -Fxq -- "$HF_CACHE_GROUP"; then
+  echo "ERROR: user is not a member of group $HF_CACHE_GROUP (required by $HF_CACHE_DIR)" >&2
+  exit 1
+fi
+
+# Re-run this script under sg so the cache group becomes the primary group.
+# sg accepts a single command string (run via /bin/sh) and ignores further
+# arguments, so single-quote $0 and every argument into one string.
+if [ "$(id -gn)" != "$HF_CACHE_GROUP" ]; then
+  _cmd=
+  for _a in "$0" "$@"; do
+    _cmd="$_cmd '$(printf '%s' "$_a" | sed "s/'/'\\\\''/g")'"
+  done
+  exec sg "$HF_CACHE_GROUP" -c "$_cmd"
+fi
+
 mkdir -p "$OUTPUT_DIR/slurm" "$OUTPUT_DIR/torchrun"
 
 if [ -f "$PROJECT_DIR/.env" ]; then
diff --git a/medarc_rl/slurm_templates/one_node_sft.j2 b/medarc_rl/slurm_templates/one_node_sft.j2
index 9a960ac..2d52855 100644
--- a/medarc_rl/slurm_templates/one_node_sft.j2
+++ b/medarc_rl/slurm_templates/one_node_sft.j2
@@ -34,6 +34,29 @@ export HF_CACHE_DIR="{{ hf_cache_dir }}"
 export HF_HUB_OFFLINE={{ 1 if hf_hub_offline else 0 }}
 export HF_HOME="$HF_CACHE_DIR"
 
+# Resolve the group that owns the HF cache. The cache directory may not exist
+# yet, so walk up to the nearest existing ancestor and use its group.
+_d="$HF_CACHE_DIR"; while [ ! -e "$_d" ]; do _d="$(dirname "$_d")"; done
+HF_CACHE_GROUP="$(stat -c '%G' "$_d")"; unset _d
+
+# Exact, literal membership test: one group name per line, matched whole-line.
+# (grep -qw read the name as a regex and also matched it inside longer names.)
+if ! id -Gn | tr ' ' '\n' | grep -Fxq -- "$HF_CACHE_GROUP"; then
+  echo "ERROR: user is not a member of group $HF_CACHE_GROUP (required by $HF_CACHE_DIR)" >&2
+  exit 1
+fi
+
+# Re-run this script under sg so the cache group becomes the primary group.
+# sg accepts a single command string (run via /bin/sh) and ignores further
+# arguments, so single-quote $0 and every argument into one string.
+if [ "$(id -gn)" != "$HF_CACHE_GROUP" ]; then
+  _cmd=
+  for _a in "$0" "$@"; do
+    _cmd="$_cmd '$(printf '%s' "$_a" | sed "s/'/'\\\\''/g")'"
+  done
+  exec sg "$HF_CACHE_GROUP" -c "$_cmd"
+fi
+
 mkdir -p "$OUTPUT_DIR/slurm" "$OUTPUT_DIR/torchrun"
 
 if [ -f "$PROJECT_DIR/.env" ]; then