deepmodeling
diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
Lines changed: 131 additions & 102 deletions
@@ -142,6 +142,28 @@ def get_additional_data_requirement(_model: Any) -> list[DataRequirementItem]:
 # ---------------------------------------------------------------------------
 
 
+def _remove_detach_nodes(gm: torch.fx.GraphModule) -> None:
+    """Remove ``aten.detach.default`` nodes from an FX graph in-place.
+
+    ``make_fx`` inserts these nodes for tensors saved by the ``autograd.grad`` pass
+    (``create_graph=True``).  They cut the gradient path to model parameters, so
+    second-order derivatives break (e.g. zero bias gradients in force training).
+
+    Parameters
+    ----------
+    gm : torch.fx.GraphModule
+        Traced module; edited in place, then linted and recompiled.
+    """
+    graph = gm.graph
+    for node in list(graph.nodes):  # snapshot: nodes are erased inside the loop
+        if node.op == "call_function" and node.target == torch.ops.aten.detach.default:
+            input_node = node.args[0]
+            node.replace_all_uses_with(input_node)  # users now read the input
+            graph.erase_node(node)
+    graph.lint()  # sanity-check the edited graph
+    gm.recompile()  # regenerate the module code from the edited graph
+
+
 def _trace_and_compile(
     model: torch.nn.Module,
     ext_coord: torch.Tensor,
@@ -157,7 +179,7 @@ def _trace_and_compile(
     Parameters
     ----------
     model : torch.nn.Module
-        The (uncompiled) model.  Temporarily set to eval mode for tracing.
+        The (uncompiled) model.
     ext_coord, ext_atype, nlist, mapping, fparam, aparam
         Sample tensors (already padded to the desired max_nall).
     compile_opts : dict
@@ -188,7 +210,7 @@ def fn(
         fparam: torch.Tensor | None,
         aparam: torch.Tensor | None,
     ) -> dict[str, torch.Tensor]:
-        extended_coord = extended_coord.detach().requires_grad_(True)
+        extended_coord = extended_coord.requires_grad_(True)
         return model.forward_lower(
             extended_coord,
             extended_atype,
@@ -203,13 +225,15 @@ def fn(
     # change at runtime, the caller catches the error and retraces.
     traced_lower = make_fx(fn)(ext_coord, ext_atype, nlist, mapping, fparam, aparam)
 
+    # make_fx inserts aten.detach.default for saved tensors used in the
+    # decomposed autograd.grad backward ops.  These detach nodes break
+    # second-order gradient flow (d(force)/d(params) for force training).
+    # Removing them restores correct higher-order derivatives.
+    _remove_detach_nodes(traced_lower)
+
     if not was_training:
         model.eval()
 
-    # The inductor backend does not propagate gradients through the
-    # make_fx-decomposed autograd.grad ops (second-order gradients for
-    # force training).  Use "aot_eager" which correctly preserves the
-    # gradient chain while still benefiting from make_fx decomposition.
     if "backend" not in compile_opts:
         compile_opts["backend"] = "aot_eager"
     compiled_lower = torch.compile(traced_lower, dynamic=False, **compile_opts)
@@ -839,10 +863,6 @@ def _make_sample(
         # torch.compile -------------------------------------------------------
         self.enable_compile = training_params.get("enable_compile", False)
         if self.enable_compile:
-            if self.multi_task:
-                raise ValueError(
-                    "torch.compile is not supported with multi-task training."
-                )
             compile_opts = training_params.get("compile_options", {})
             log.info("Compiling model with torch.compile (%s)", compile_opts)
             self._compile_model(compile_opts)
@@ -878,108 +898,117 @@ def _compile_model(self, compile_opts: dict[str, Any]) -> None:
             normalize_coord,
         )
 
-        model = self.model
-
-        # --- Estimate max_nall by sampling multiple batches ---
-        n_sample = 20
-        max_nall = 0
-        best_sample: (
-            tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, dict] | None
-        ) = None
-
-        for _ii in range(n_sample):
-            inp, _ = self.get_data(is_train=True)
-            coord = inp["coord"].detach()
-            atype = inp["atype"].detach()
-            box = inp.get("box")
-            if box is not None:
-                box = box.detach()
-
-            nframes, nloc = atype.shape[:2]
-            coord_np = coord.cpu().numpy().reshape(nframes, nloc, 3)
-            atype_np = atype.cpu().numpy()
-            box_np = box.cpu().numpy().reshape(nframes, 9) if box is not None else None
-
-            if box_np is not None:
-                coord_norm = normalize_coord(coord_np, box_np.reshape(nframes, 3, 3))
-            else:
-                coord_norm = coord_np
+        for task_key in self.model_keys:
+            model = self.wrapper.model[task_key]
+
+            # --- Estimate max_nall by sampling multiple batches ---
+            n_sample = 20
+            max_nall = 0
+            best_sample: (
+                tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, dict] | None
+            ) = None
+
+            for _ii in range(n_sample):
+                inp, _ = self.get_data(is_train=True, task_key=task_key)
+                coord = inp["coord"].detach()
+                atype = inp["atype"].detach()
+                box = inp.get("box")
+                if box is not None:
+                    box = box.detach()
+
+                nframes, nloc = atype.shape[:2]
+                coord_np = coord.cpu().numpy().reshape(nframes, nloc, 3)
+                atype_np = atype.cpu().numpy()
+                box_np = (
+                    box.cpu().numpy().reshape(nframes, 9) if box is not None else None
+                )
 
-            ext_coord_np, ext_atype_np, mapping_np = extend_coord_with_ghosts(
-                coord_norm, atype_np, box_np, model.get_rcut()
-            )
-            nlist_np = build_neighbor_list(
-                ext_coord_np,
-                ext_atype_np,
-                nloc,
-                model.get_rcut(),
-                model.get_sel(),
-                distinguish_types=False,
-            )
-            ext_coord_np = ext_coord_np.reshape(nframes, -1, 3)
-            nall = ext_coord_np.shape[1]
-            if nall > max_nall:
-                max_nall = nall
-                best_sample = (
+                if box_np is not None:
+                    coord_norm = normalize_coord(
+                        coord_np, box_np.reshape(nframes, 3, 3)
+                    )
+                else:
+                    coord_norm = coord_np
+
+                ext_coord_np, ext_atype_np, mapping_np = extend_coord_with_ghosts(
+                    coord_norm, atype_np, box_np, model.get_rcut()
+                )
+                nlist_np = build_neighbor_list(
                     ext_coord_np,
                     ext_atype_np,
-                    mapping_np,
-                    nlist_np,
                     nloc,
-                    inp,
+                    model.get_rcut(),
+                    model.get_sel(),
+                    distinguish_types=False,
                 )
+                ext_coord_np = ext_coord_np.reshape(nframes, -1, 3)
+                nall = ext_coord_np.shape[1]
+                if nall > max_nall:
+                    max_nall = nall
+                    best_sample = (
+                        ext_coord_np,
+                        ext_atype_np,
+                        mapping_np,
+                        nlist_np,
+                        nloc,
+                        inp,
+                    )
 
-        # Add 20 % margin and round up to a multiple of 8.
-        max_nall = ((int(max_nall * 1.2) + 7) // 8) * 8
-        log.info(
-            "Estimated max_nall=%d for compiled model (sampled %d batches).",
-            max_nall,
-            n_sample,
-        )
-
-        # --- Pad the largest sample to max_nall and trace ---
-        assert best_sample is not None
-        ext_coord_np, ext_atype_np, mapping_np, nlist_np, nloc, sample_input = (
-            best_sample
-        )
-        nframes = ext_coord_np.shape[0]
-        actual_nall = ext_coord_np.shape[1]
-        pad_n = max_nall - actual_nall
-
-        if pad_n > 0:
-            ext_coord_np = np.pad(ext_coord_np, ((0, 0), (0, pad_n), (0, 0)))
-            ext_atype_np = np.pad(ext_atype_np, ((0, 0), (0, pad_n)))
-            mapping_np = np.pad(mapping_np, ((0, 0), (0, pad_n)))
+            # Add 20 % margin and round up to a multiple of 8.
+            max_nall = ((int(max_nall * 1.2) + 7) // 8) * 8
+            log.info(
+                "Estimated max_nall=%d for compiled model "
+                "(task=%s, sampled %d batches).",
+                max_nall,
+                task_key,
+                n_sample,
+            )
 
-        ext_coord = torch.tensor(
-            ext_coord_np, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
-        )
-        ext_atype = torch.tensor(ext_atype_np, dtype=torch.int64, device=DEVICE)
-        nlist_t = torch.tensor(nlist_np, dtype=torch.int64, device=DEVICE)
-        mapping_t = torch.tensor(mapping_np, dtype=torch.int64, device=DEVICE)
-        fparam = sample_input.get("fparam")
-        aparam = sample_input.get("aparam")
+            # --- Pad the largest sample to max_nall and trace ---
+            assert best_sample is not None
+            ext_coord_np, ext_atype_np, mapping_np, nlist_np, nloc, sample_input = (
+                best_sample
+            )
+            nframes = ext_coord_np.shape[0]
+            actual_nall = ext_coord_np.shape[1]
+            pad_n = max_nall - actual_nall
 
-        compile_opts.pop("dynamic", None)  # always False for padded approach
+            if pad_n > 0:
+                ext_coord_np = np.pad(ext_coord_np, ((0, 0), (0, pad_n), (0, 0)))
+                ext_atype_np = np.pad(ext_atype_np, ((0, 0), (0, pad_n)))
+                mapping_np = np.pad(mapping_np, ((0, 0), (0, pad_n)))
 
-        compiled_lower = _trace_and_compile(
-            model,
-            ext_coord,
-            ext_atype,
-            nlist_t,
-            mapping_t,
-            fparam,
-            aparam,
-            compile_opts,
-        )
+            ext_coord = torch.tensor(
+                ext_coord_np, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
+            )
+            ext_atype = torch.tensor(ext_atype_np, dtype=torch.int64, device=DEVICE)
+            nlist_t = torch.tensor(nlist_np, dtype=torch.int64, device=DEVICE)
+            mapping_t = torch.tensor(mapping_np, dtype=torch.int64, device=DEVICE)
+            fparam = sample_input.get("fparam")
+            aparam = sample_input.get("aparam")
+
+            task_compile_opts = dict(compile_opts)
+            task_compile_opts.pop("dynamic", None)  # always False for padded approach
+
+            compiled_lower = _trace_and_compile(
+                model,
+                ext_coord,
+                ext_atype,
+                nlist_t,
+                mapping_t,
+                fparam,
+                aparam,
+                task_compile_opts,
+            )
 
-        self.wrapper.model["Default"] = _CompiledModel(
-            model, compiled_lower, max_nall, compile_opts
-        )
-        log.info(
-            "Model compiled with padded nall=%d (tracing_mode=real, dynamic=False).",
-            max_nall,
-        )
+            self.wrapper.model[task_key] = _CompiledModel(
+                model, compiled_lower, max_nall, task_compile_opts
+            )
+            log.info(
+                "Model compiled with padded nall=%d (task=%s, dynamic=False).",
+                max_nall,
+                task_key,
+            )
 
     # ------------------------------------------------------------------
     # Data helpers