Skip to content

Commit 8a9fe63

Browse files
author
Han Wang
committed
fix(pt2): move nlist padding inside traced fn and strip shape assertions
Move nlist padding (+1 column of -1s) inside the `fn` closure in both `make_model.forward_common_lower_exportable` and `SpinModel.forward_common_lower_exportable`, making it part of the traced graph. This fixes proxy tensor shape mismatches from make_fx and removes the need for external padding in deep_eval.py. Also apply `_strip_shape_assertions` unconditionally (not just spin models) to remove spurious torch.export guards like Ne(nnei, sum(sel)). Export tests that verify atomic virial now pass `do_atomic_virial=True` to `deserialize_to_file` so the exported model includes the correction.
1 parent 217a587 commit 8a9fe63

6 files changed

Lines changed: 110 additions & 78 deletions

File tree

deepmd/pt_expt/infer/deep_eval.py

Lines changed: 0 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -759,19 +759,6 @@ def _eval_model(
759759
# returning a dict just like the .pte module.
760760
# It also filters non-tensor args automatically, matching the
761761
# export-time signature where None args were excluded.
762-
# Pad nlist with extra -1 column so n_nnei > nnei, ensuring
763-
# format_nlist's compiled sort branch executes.
764-
nlist_t = torch.cat(
765-
[
766-
nlist_t,
767-
-torch.ones(
768-
(*nlist_t.shape[:2], 1),
769-
dtype=nlist_t.dtype,
770-
device=nlist_t.device,
771-
),
772-
],
773-
dim=-1,
774-
)
775762
model_ret = self._pt2_runner(
776763
ext_coord_t, ext_atype_t, nlist_t, mapping_t, fparam_t, aparam_t
777764
)
@@ -911,19 +898,6 @@ def _eval_model_spin(
911898

912899
# Call the model with spin (7 args)
913900
if self._is_pt2:
914-
# Pad nlist with extra -1 column so n_nnei > nnei, ensuring
915-
# format_nlist's compiled sort branch executes.
916-
nlist_t = torch.cat(
917-
[
918-
nlist_t,
919-
-torch.ones(
920-
(*nlist_t.shape[:2], 1),
921-
dtype=nlist_t.dtype,
922-
device=nlist_t.device,
923-
),
924-
],
925-
dim=-1,
926-
)
927901
model_ret = self._pt2_runner(
928902
ext_coord_t,
929903
ext_atype_t,

deepmd/pt_expt/model/make_model.py

Lines changed: 29 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -346,6 +346,21 @@ def fn(
346346
aparam: torch.Tensor | None,
347347
) -> dict[str, torch.Tensor]:
348348
extended_coord = extended_coord.detach().requires_grad_(True)
349+
# Pad nlist with one extra -1 column inside the traced function.
350+
# This ensures n_nnei > sum(sel), forcing the sort branch in
351+
# _format_nlist. The padding becomes part of the compiled graph,
352+
# so callers never need to pad externally.
353+
nlist = torch.cat(
354+
[
355+
nlist,
356+
-torch.ones(
357+
(*nlist.shape[:2], 1),
358+
dtype=nlist.dtype,
359+
device=nlist.device,
360+
),
361+
],
362+
dim=-1,
363+
)
349364
return model.forward_common_lower(
350365
extended_coord,
351366
extended_atype,
@@ -356,13 +371,19 @@ def fn(
356371
do_atomic_virial=do_atomic_virial,
357372
)
358373

359-
return make_fx(fn, **make_fx_kwargs)(
360-
extended_coord,
361-
extended_atype,
362-
nlist,
363-
mapping,
364-
fparam,
365-
aparam,
366-
)
374+
# Force format_nlist to always use the sort branch during tracing.
375+
model.need_sorted_nlist_for_lower = lambda: True
376+
try:
377+
traced = make_fx(fn, **make_fx_kwargs)(
378+
extended_coord,
379+
extended_atype,
380+
nlist,
381+
mapping,
382+
fparam,
383+
aparam,
384+
)
385+
finally:
386+
del model.need_sorted_nlist_for_lower
387+
return traced
367388

368389
return CM

deepmd/pt_expt/model/spin_model.py

Lines changed: 28 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,18 @@ def fn(
9696
aparam: torch.Tensor | None,
9797
) -> dict[str, torch.Tensor]:
9898
extended_coord = extended_coord.detach().requires_grad_(True)
99+
# Pad nlist inside traced function (see make_model.py for rationale).
100+
nlist = torch.cat(
101+
[
102+
nlist,
103+
-torch.ones(
104+
(*nlist.shape[:2], 1),
105+
dtype=nlist.dtype,
106+
device=nlist.device,
107+
),
108+
],
109+
dim=-1,
110+
)
99111
return model.forward_common_lower(
100112
extended_coord,
101113
extended_atype,
@@ -107,15 +119,22 @@ def fn(
107119
do_atomic_virial=do_atomic_virial,
108120
)
109121

110-
return make_fx(fn, **make_fx_kwargs)(
111-
extended_coord,
112-
extended_atype,
113-
extended_spin,
114-
nlist,
115-
mapping,
116-
fparam,
117-
aparam,
118-
)
122+
# Force format_nlist to always use the sort branch during tracing.
123+
backbone = model.backbone_model
124+
backbone.need_sorted_nlist_for_lower = lambda: True
125+
try:
126+
traced = make_fx(fn, **make_fx_kwargs)(
127+
extended_coord,
128+
extended_atype,
129+
extended_spin,
130+
nlist,
131+
mapping,
132+
fparam,
133+
aparam,
134+
)
135+
finally:
136+
del backbone.need_sorted_nlist_for_lower
137+
return traced
119138

120139
def forward_common_lower(
121140
self, *args: Any, **kwargs: Any

deepmd/pt_expt/utils/serialization.py

Lines changed: 20 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -17,23 +17,23 @@
1717

1818

1919
def _strip_shape_assertions(graph_module: torch.nn.Module) -> None:
20-
"""Remove shape-guard assertion nodes from a spin model's exported graph.
20+
"""Remove shape-guard assertion nodes from an exported graph.
2121
2222
``torch.export`` inserts ``aten._assert_scalar`` nodes for symbolic shape
23-
relationships discovered during tracing. For the spin model, the atom-
24-
doubling logic creates slice patterns that depend on ``(nall - nloc)``,
25-
producing guards like ``Ne(nall, nloc)``. These guards are spurious: the
26-
model computes correct results even when ``nall == nloc`` (NoPBC, no ghost
27-
atoms).
28-
29-
This function is **only called for spin models** (guarded by ``if is_spin``
30-
in ``_trace_and_export``). The assertion messages use opaque symbolic
31-
variable names (e.g. ``Ne(s22, s96)``) rather than human-readable names,
32-
so filtering by message content is not reliable. Since
23+
relationships discovered during tracing. These guards can be spurious:
24+
25+
* **Spin models**: atom-doubling logic creates slice patterns that depend
26+
on ``(nall - nloc)``, producing guards like ``Ne(nall, nloc)``.
27+
* **All models**: the nlist padding inside ``forward_common_lower_exportable``
28+
and the subsequent sort/truncate in ``_format_nlist`` can produce guards
29+
like ``Ne(nnei, sum(sel))``. These are spurious because the compiled
30+
graph handles any ``nnei >= sum(sel)`` correctly.
31+
32+
The assertion messages use opaque symbolic variable names (e.g.
33+
``Ne(s22, s96)``) rather than human-readable names, so filtering by
34+
message content is not reliable. Since
3335
``prefer_deferred_runtime_asserts_over_guards=True`` converts all shape
34-
guards into these deferred assertions, and the only shape relationships in
35-
the spin model involve nall/nloc, removing all of them is safe in this
36-
context.
36+
guards into these deferred assertions, removing all of them is safe.
3737
"""
3838
graph = graph_module.graph
3939
for node in list(graph.nodes):
@@ -141,10 +141,8 @@ def _make_sample_inputs(
141141
sel,
142142
distinguish_types=not mixed_types,
143143
)
144-
# Pad nlist with extra -1 columns so n_nnei > nnei at trace time.
145-
# This ensures format_nlist's distance-sort branch is traced into the
146-
# compiled graph, allowing the .pt2 model to handle variable-size
147-
# neighbor lists at runtime (e.g. LAMMPS rcut + skin).
144+
# Pad nlist so nnei > sum(sel) in the sample tensors.
145+
# This prevents torch.export from specializing nnei to sum(sel).
148146
nnei = sum(sel)
149147
n_pad = max(1, nnei // 4) # pad by ~25%, at least 1
150148
nlist = np.concatenate(
@@ -519,15 +517,10 @@ def _trace_and_export(
519517
prefer_deferred_runtime_asserts_over_guards=True,
520518
)
521519

522-
if is_spin:
523-
# torch.export re-introduces shape-guard assertions even when
524-
# the make_fx graph has none. The spin model's atom-doubling
525-
# logic creates slice patterns that depend on (nall - nloc),
526-
# producing guards like Ne(nall, nloc). These guards are
527-
# spurious: the model is correct when nall == nloc (NoPBC).
528-
# Strip them from the exported graph so the model can be
529-
# used with any valid nall >= nloc.
530-
_strip_shape_assertions(exported.graph_module)
520+
# torch.export inserts _assert_scalar guards for symbolic shape
521+
# relationships (e.g. Ne(nnei, sum(sel)), Ne(nall, nloc)). These
522+
# are spurious — the model handles any valid input shapes correctly.
523+
_strip_shape_assertions(exported.graph_module)
531524

532525
# 7. Move the exported program to the target device if needed.
533526
if target_device.type != "cpu":

source/tests/pt_expt/export_helpers.py

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,12 @@ def export_save_load_and_compare(
7070
strict=False,
7171
prefer_deferred_runtime_asserts_over_guards=True,
7272
)
73+
# Strip spurious shape-guard assertions (e.g. Ne(nnei, sum(sel)))
74+
from deepmd.pt_expt.utils.serialization import (
75+
_strip_shape_assertions,
76+
)
77+
78+
_strip_shape_assertions(exported.graph_module)
7379

7480
# 4. .pte save -> load round-trip
7581
with tempfile.NamedTemporaryFile(suffix=".pte") as f:
@@ -199,9 +205,22 @@ def model_forward_lower_export_round_trip(
199205
)
200206

201207
# 5. Symbolic trace + dynamic shapes + .pte round-trip
208+
# Pad nlist with extra -1 columns so nnei > sum(sel) in the sample.
209+
# This prevents torch.export from specializing nnei to sum(sel).
210+
nlist_padded = torch.cat(
211+
[
212+
nlist_t,
213+
-torch.ones(
214+
(*nlist_t.shape[:2], max(1, nlist_t.shape[2] // 4)),
215+
dtype=nlist_t.dtype,
216+
device=nlist_t.device,
217+
),
218+
],
219+
dim=-1,
220+
)
202221
inputs_2f = tuple(
203222
torch.cat([t, t], dim=0) if t is not None else None
204-
for t in (ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam)
223+
for t in (ext_coord, ext_atype, nlist_padded, mapping_t, fparam, aparam)
205224
)
206225
traced_sym = md_pt.forward_lower_exportable(
207226
inputs_2f[0],
@@ -221,6 +240,12 @@ def model_forward_lower_export_round_trip(
221240
strict=False,
222241
prefer_deferred_runtime_asserts_over_guards=True,
223242
)
243+
# Strip spurious shape-guard assertions (e.g. Ne(nnei, sum(sel)))
244+
from deepmd.pt_expt.utils.serialization import (
245+
_strip_shape_assertions,
246+
)
247+
248+
_strip_shape_assertions(exported_dyn.graph_module)
224249
with tempfile.NamedTemporaryFile(suffix=".pte") as f:
225250
torch.export.save(exported_dyn, f.name)
226251
loaded = torch.export.load(f.name).module()

source/tests/pt_expt/infer/test_deep_eval.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -63,11 +63,11 @@ def setUpClass(cls) -> None:
6363
cls.model = cls.model.to(torch.float64)
6464
cls.model.eval()
6565

66-
# Serialize and save to .pte
66+
# Serialize and save to .pte (with atomic virial for test_dynamic_shapes)
6767
cls.model_data = {"model": cls.model.serialize()}
6868
cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False)
6969
cls.tmpfile.close()
70-
deserialize_to_file(cls.tmpfile.name, cls.model_data)
70+
deserialize_to_file(cls.tmpfile.name, cls.model_data, do_atomic_virial=True)
7171

7272
# Create DeepPot for testing
7373
cls.dp = DeepPot(cls.tmpfile.name)
@@ -547,14 +547,14 @@ def setUpClass(cls) -> None:
547547
# compilation (tests/pt/__init__.py sets it to "cuda:9999999").
548548
torch.set_default_device(None)
549549
try:
550-
deserialize_to_file(cls.tmpfile.name, cls.model_data)
550+
deserialize_to_file(cls.tmpfile.name, cls.model_data, do_atomic_virial=True)
551551
finally:
552552
torch.set_default_device("cuda:9999999")
553553

554554
# Also save to .pte for cross-format comparison
555555
cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False)
556556
cls.pte_tmpfile.close()
557-
deserialize_to_file(cls.pte_tmpfile.name, cls.model_data)
557+
deserialize_to_file(cls.pte_tmpfile.name, cls.model_data, do_atomic_virial=True)
558558

559559
# Create DeepPot for .pt2
560560
cls.dp = DeepPot(cls.tmpfile.name)
@@ -1070,7 +1070,7 @@ def setUpClass(cls) -> None:
10701070
cls.model_data = {"model": cls.model.serialize()}
10711071
cls.tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False)
10721072
cls.tmpfile.close()
1073-
deserialize_to_file(cls.tmpfile.name, cls.model_data)
1073+
deserialize_to_file(cls.tmpfile.name, cls.model_data, do_atomic_virial=True)
10741074

10751075
cls.dp = DeepPot(cls.tmpfile.name)
10761076

@@ -1187,14 +1187,14 @@ def setUpClass(cls) -> None:
11871187
cls.tmpfile.close()
11881188
torch.set_default_device(None)
11891189
try:
1190-
deserialize_to_file(cls.tmpfile.name, cls.model_data)
1190+
deserialize_to_file(cls.tmpfile.name, cls.model_data, do_atomic_virial=True)
11911191
finally:
11921192
torch.set_default_device("cuda:9999999")
11931193

11941194
# Also save .pte for cross-format comparison
11951195
cls.pte_tmpfile = tempfile.NamedTemporaryFile(suffix=".pte", delete=False)
11961196
cls.pte_tmpfile.close()
1197-
deserialize_to_file(cls.pte_tmpfile.name, cls.model_data)
1197+
deserialize_to_file(cls.pte_tmpfile.name, cls.model_data, do_atomic_virial=True)
11981198

11991199
cls.dp = DeepPot(cls.tmpfile.name)
12001200
cls.dp_pte = DeepPot(cls.pte_tmpfile.name)

0 commit comments

Comments (0)