
Commit 0852b7c

Copilot and njzjz committed
fix(training): optimize NaN detection based on feedback - use lcurve CPU values and fixed loss keys
Co-authored-by: njzjz <[email protected]>
1 parent 5a22dfc · commit 0852b7c

File tree

4 files changed: +603 -528 lines changed

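This commit moves the NaN check out of the per-step hot path and onto the logging path. Previously, `check_total_loss_nan(_step_id + 1, loss.item())` ran inside every training step, and `loss.item()` forces a device-to-host copy of the loss tensor, so each step paid a GPU synchronization. The check now reuses the CPU loss values already computed for the lcurve output and keys on the fixed `"rmse_e"` entry. The helper itself is not part of this diff; below is a minimal sketch of what it plausibly does, assuming it simply aborts training once the logged loss is no longer a finite number (the name and call signature come from the hunks; the body and exception type are hypothetical):

```python
import math


def check_total_loss_nan(step_id: int, total_loss: float) -> None:
    """Abort training when the total loss has become NaN.

    Hypothetical sketch; the real helper lives elsewhere in deepmd-kit
    and may differ in message wording and exception type.
    """
    if math.isnan(total_loss):
        raise FloatingPointError(
            f"Total loss is NaN at step {step_id}; aborting to avoid "
            "writing a corrupted checkpoint."
        )
```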

deepmd/pd/train/training.py

Lines changed: 8 additions & 2 deletions
@@ -781,8 +781,6 @@ def step(_step_id, task_key="Default") -> None:
                     label=label_dict,
                     task_key=task_key,
                 )
-                # Check for NaN in total loss before backward pass to prevent corrupted training
-                check_total_loss_nan(_step_id + 1, loss.item())

                 with nvprof_context(enable_profiling, "Backward pass"):
                     loss.backward()
@@ -864,6 +862,9 @@ def log_loss_valid(_task_key="Default"):

             if not self.multi_task:
                 train_results = log_loss_train(loss, more_loss)
+                # Check for NaN in total loss using CPU values from lcurve computation
+                if self.rank == 0 and "rmse_e" in train_results:
+                    check_total_loss_nan(display_step_id, train_results["rmse_e"])
                 valid_results = log_loss_valid()
                 if self.rank == 0:
                     log.info(
@@ -905,6 +906,11 @@ def log_loss_valid(_task_key="Default"):
                         loss, more_loss, _task_key=_key
                     )
                     valid_results[_key] = log_loss_valid(_task_key=_key)
+                    # Check for NaN in total loss using CPU values from lcurve computation
+                    if self.rank == 0 and "rmse_e" in train_results[_key]:
+                        check_total_loss_nan(
+                            display_step_id, train_results[_key]["rmse_e"]
+                        )
                 if self.rank == 0:
                     log.info(
                         format_training_message_per_task(

deepmd/pt/train/training.py

Lines changed: 8 additions & 6 deletions
@@ -764,8 +764,6 @@ def step(_step_id: int, task_key: str = "Default") -> None:
                 model_pred, loss, more_loss = self.wrapper(
                     **input_dict, cur_lr=pref_lr, label=label_dict, task_key=task_key
                 )
-                # Check for NaN in total loss before backward pass to prevent corrupted training
-                check_total_loss_nan(_step_id + 1, loss.item())
                 loss.backward()
                 if self.gradient_max_norm > 0.0:
                     torch.nn.utils.clip_grad_norm_(
@@ -817,8 +815,6 @@ def fake_model() -> dict:
                     int(input_dict["atype"].shape[-1]),
                     learning_rate=pref_lr,
                 )
-                # Check for NaN in total loss before continuing training
-                check_total_loss_nan(_step_id + 1, loss.item())
             elif isinstance(self.loss, DenoiseLoss):
                 KFOptWrapper = KFOptimizerWrapper(
                     self.wrapper,
@@ -845,8 +841,6 @@ def fake_model() -> dict:
                     input_dict["natoms"],
                     learning_rate=pref_lr,
                 )
-                # Check for NaN in total loss before continuing training
-                check_total_loss_nan(_step_id + 1, loss.item())
             else:
                 raise ValueError(f"Not supported optimizer type '{self.opt_type}'")

@@ -958,6 +952,9 @@ def log_loss_valid(_task_key: str = "Default") -> dict:

             if not self.multi_task:
                 train_results = log_loss_train(loss, more_loss)
+                # Check for NaN in total loss using CPU values from lcurve computation
+                if self.rank == 0 and "rmse_e" in train_results:
+                    check_total_loss_nan(display_step_id, train_results["rmse_e"])
                 valid_results = log_loss_valid()
                 if self.rank == 0:
                     log.info(
@@ -1006,6 +1003,11 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
                         loss, more_loss, _task_key=_key
                     )
                     valid_results[_key] = log_loss_valid(_task_key=_key)
+                    # Check for NaN in total loss using CPU values from lcurve computation
+                    if self.rank == 0 and "rmse_e" in train_results[_key]:
+                        check_total_loss_nan(
+                            display_step_id, train_results[_key]["rmse_e"]
+                        )
                 if self.rank == 0:
                     log.info(
                         format_training_message_per_task(
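The lines removed from `step` above are the hot-path cost this commit targets: calling `.item()` on a CUDA tensor blocks until every kernel feeding that tensor has finished, whereas the lcurve path already holds plain CPU floats. A minimal standalone demonstration of that implicit synchronization (not deepmd-kit code):

```python
import torch

# .item() copies a single scalar from device to host and blocks the
# Python thread until the GPU has finished computing the tensor, so
# calling it on the loss every step serializes the training loop.
if torch.cuda.is_available():
    loss = torch.randn((), device="cuda")
    value = loss.item()  # implicit device synchronization happens here
    print(f"loss on host: {value:.4f}")
else:
    print("CUDA not available; on CPU .item() involves no device sync.")
```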

deepmd/tf/train/trainer.py

Lines changed: 2 additions & 12 deletions
@@ -689,18 +689,8 @@ def valid_on_the_fly(
         current_lr = run_sess(self.sess, self.learning_rate)

         # Check for NaN in total loss before writing to file and saving checkpoint
-        # We check the main loss component that represents total training loss
-        if train_results:
-            # Look for the main loss key (typically the first loss component)
-            main_loss_key = next(iter(train_results.keys())) if train_results else None
-            if main_loss_key and main_loss_key in train_results:
-                check_total_loss_nan(cur_batch, train_results[main_loss_key])
-
-        if valid_results:
-            # Check validation loss as well for consistency
-            main_loss_key = next(iter(valid_results.keys())) if valid_results else None
-            if main_loss_key and main_loss_key in valid_results:
-                check_total_loss_nan(cur_batch, valid_results[main_loss_key])
+        # We check the main energy loss component that represents total training loss
+        check_total_loss_nan(cur_batch, train_results["rmse_e"])

         if print_header:
             self.print_header(fp, train_results, valid_results)
