fix(training): use 'rmse' key for total loss instead of 'rmse_e' for energy loss

Copilot · njzjz · Copilot · commit 7a2b41edd3c2 · 2025-09-20T18:07:16.000Z
Co-authored-by: njzjz <9496702+njzjz@users.noreply.github.com>
diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py
@@ -863,8 +863,8 @@ def log_loss_valid(_task_key="Default"):
                 if not self.multi_task:
                     train_results = log_loss_train(loss, more_loss)
                     # Check for NaN in total loss using CPU values from lcurve computation
-                    if self.rank == 0 and "rmse_e" in train_results:
-                        check_total_loss_nan(display_step_id, train_results["rmse_e"])
+                    if self.rank == 0 and "rmse" in train_results:
+                        check_total_loss_nan(display_step_id, train_results["rmse"])
                     valid_results = log_loss_valid()
                     if self.rank == 0:
                         log.info(
@@ -907,9 +907,9 @@ def log_loss_valid(_task_key="Default"):
                             )
                         valid_results[_key] = log_loss_valid(_task_key=_key)
                         # Check for NaN in total loss using CPU values from lcurve computation
-                        if self.rank == 0 and "rmse_e" in train_results[_key]:
+                        if self.rank == 0 and "rmse" in train_results[_key]:
                             check_total_loss_nan(
-                                display_step_id, train_results[_key]["rmse_e"]
+                                display_step_id, train_results[_key]["rmse"]
                             )
                         if self.rank == 0:
                             log.info(
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
@@ -953,8 +953,8 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
                 if not self.multi_task:
                     train_results = log_loss_train(loss, more_loss)
                     # Check for NaN in total loss using CPU values from lcurve computation
-                    if self.rank == 0 and "rmse_e" in train_results:
-                        check_total_loss_nan(display_step_id, train_results["rmse_e"])
+                    if self.rank == 0 and "rmse" in train_results:
+                        check_total_loss_nan(display_step_id, train_results["rmse"])
                     valid_results = log_loss_valid()
                     if self.rank == 0:
                         log.info(
@@ -1004,9 +1004,9 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
                                 )
                         valid_results[_key] = log_loss_valid(_task_key=_key)
                         # Check for NaN in total loss using CPU values from lcurve computation
-                        if self.rank == 0 and "rmse_e" in train_results[_key]:
+                        if self.rank == 0 and "rmse" in train_results[_key]:
                             check_total_loss_nan(
-                                display_step_id, train_results[_key]["rmse_e"]
+                                display_step_id, train_results[_key]["rmse"]
                             )
                         if self.rank == 0:
                             log.info(
diff --git a/deepmd/tf/train/trainer.py b/deepmd/tf/train/trainer.py
@@ -689,8 +689,8 @@ def valid_on_the_fly(
         current_lr = run_sess(self.sess, self.learning_rate)
 
         # Check for NaN in total loss before writing to file and saving checkpoint
-        # We check the main energy loss component that represents total training loss
-        check_total_loss_nan(cur_batch, train_results["rmse_e"])
+        # We check the total loss (the "rmse" entry), which represents the overall training loss
+        check_total_loss_nan(cur_batch, train_results["rmse"])
 
         if print_header:
             self.print_header(fp, train_results, valid_results)