fix: merging of model for multi-gpu (foundation-model-stack#158)
* only copy over if an adapter is found; fixes a problem with LoRA multi-GPU training

Signed-off-by: Anh-Uong <[email protected]>

* formatting and helpful comment

Signed-off-by: Anh-Uong <[email protected]>

---------

Signed-off-by: Anh-Uong <[email protected]>
anhuong authored May 15, 2024
1 parent 38c4f22 commit eba20f3
Showing 1 changed file with 10 additions and 6 deletions.
16 changes: 10 additions & 6 deletions build/launch_training.py
@@ -142,12 +142,16 @@ def main():
                 export_path,
             )

-            create_merged_model(
-                checkpoint_models=full_checkpoint_dir,
-                export_path=export_path,
-                base_model=model_args.model_name_or_path,
-                save_tokenizer=True,
-            )
+            # ensure checkpoint dir has correct files, important with multi-gpu tuning
+            if os.path.exists(
+                os.path.join(full_checkpoint_dir, "adapter_config.json")
+            ):
+                create_merged_model(
+                    checkpoint_models=full_checkpoint_dir,
+                    export_path=export_path,
+                    base_model=model_args.model_name_or_path,
+                    save_tokenizer=True,
+                )
     except Exception as e:  # pylint: disable=broad-except
         logging.error(traceback.format_exc())
         write_termination_log(
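The fix guards the merge on the presence of `adapter_config.json`, since in multi-GPU LoRA runs a checkpoint directory may not contain adapter files. A minimal sketch of that guard, using a hypothetical `should_merge` helper (not part of the repository) to show the same `os.path.exists` check in isolation:

```python
import os
import tempfile

def should_merge(checkpoint_dir: str) -> bool:
    """Return True only when the checkpoint dir holds a LoRA adapter config.

    Mirrors the guard added in this commit: merging is skipped for
    checkpoint dirs that lack adapter files.
    """
    return os.path.exists(os.path.join(checkpoint_dir, "adapter_config.json"))

if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as d:
        print(should_merge(d))  # directory has no adapter config yet
        open(os.path.join(d, "adapter_config.json"), "w").close()
        print(should_merge(d))  # adapter config now present
```

With this shape, a full-model checkpoint (no adapter files) simply skips `create_merged_model` instead of failing or merging against missing adapter weights.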
