Commit ebbe957

deepcompile: Fix backward graph recompilation due to unbalanced forward/backward visits
In recent PyTorch AOT Autograd, having tensors that require grad among the inputs does not guarantee that a backward graph is compiled. If no output requires grad and no input requiring grad is mutated, aot_autograd skips backward compilation (see [1]). DeepCompile previously required backward compilation for every forward graph involving grad, relying solely on the presence of requires_grad tensors in the inputs. This mismatch caused unbalanced forward/backward visits, leaving graphs unvisited in `frames_needing_bwd`. The patched FunctionMeta then remained in effect during backward execution, causing graphs to recompile on every execution and raising exceptions in `frames_needing_bwd.remove`.

Fix by:
- Removing the `frames_needing_bwd` set and the `needs_backward` tracking
- Using the context manager `collect_backward_inputs()` to scope the patching of compiled functions to the forward pass only

This ensures FunctionMeta patching is only in effect during the forward pass and prevents unnecessary recompilation during backward passes.

References

[1] https://github.com/pytorch/pytorch/blob/aea31e0c306e2315bf6d84255e0dde7adf09762a/torch/_functorch/aot_autograd.py#L618

Signed-off-by: Junjie Mao <junjie.mao@linux.alibaba.com>
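For illustration only (not part of this commit), a minimal sketch of why the old input-based heuristic can disagree with AOT Autograd's decision, assuming a toy graph whose only output is detached:

import torch
import torch.utils._pytree as pytree

def old_heuristic(real_inputs):
    # DeepCompile's former check: any tensor input with requires_grad=True
    return pytree.tree_any(lambda x: x.requires_grad if torch.is_tensor(x) else False, real_inputs)

def graph_fn(t):
    # No output requires grad and no input requiring grad is mutated,
    # so AOT Autograd would skip backward compilation for such a graph.
    return (t * 2).detach()

x = torch.randn(4, requires_grad=True)
print(old_heuristic((x, )))        # True  -> old code expected a backward visit
print(graph_fn(x).requires_grad)   # False -> no backward graph is compiled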
1 parent 83d43c3 commit ebbe957

5 files changed: 23 additions & 23 deletions

deepspeed/compile/backend.py

Lines changed: 4 additions & 17 deletions
@@ -12,7 +12,6 @@
 from torch.fx import Graph, GraphModule
 
 try:
-    import torch.utils._pytree as pytree
     import torch._dynamo
     from functorch.compile import make_boxed_func
     from torch._functorch.aot_autograd import aot_module_simplified
@@ -28,7 +27,7 @@
 from .graph_param import DSGraphParamManager
 from .profilers import ProfilingResult
 from .profilers.graph_profile import MemoryProfilingInterpreter
-from .patch_compiled_func import patch_compiled_func, unpatch_compiled_func, get_backward_inputs
+from .patch_compiled_func import get_backward_inputs
 from .util import get_input_nodes, get_activation_node_names, get_index_by_graph_id, get_deepcompile_handle, log_rank0, is_backend_inductor
 from .partitioner import get_wrapped_partitioner
 from .inductor import register_custom_ops, patch_create_aot_dispatcher_function
@@ -47,9 +46,9 @@ class GraphOrder:
     def __init__(self):
         self.frames = OrderedDict()
 
-    def add_graph(self, graph_id: int, frame_id: int, needs_backward: bool):
+    def add_graph(self, graph_id: int, frame_id: int):
         if frame_id not in self.frames:
-            self.frames[frame_id] = (graph_id, needs_backward)
+            self.frames[frame_id] = (graph_id, )
 
     def get_graph_order(self) -> List[Tuple[int, bool]]:
         return list(self.frames.values())
@@ -60,7 +59,6 @@ def clear(self):
 
 graph_order_with_frame_id = GraphOrder()
 
-frames_needing_bwd = set()
 profiling_results: Dict[int, ProfilingResult] = {}
 opt_pass_times = []
 opt_passes = {}
@@ -225,9 +223,8 @@ def make_backend(backend, compile_config, compile_kwargs={}):
     def backend_fn(gm: GraphModule, real_inputs):
         graph_id = id(gm.graph)
 
-        needs_backward = pytree.tree_any(lambda x: x.requires_grad if torch.is_tensor(x) else False, real_inputs)
         frame_id = gm.meta["dynamo_compile_id"].frame_id
-        graph_order_with_frame_id.add_graph(graph_id, frame_id, needs_backward)
+        graph_order_with_frame_id.add_graph(graph_id, frame_id)
 
         graph_order = graph_order_with_frame_id.get_graph_order()
 
@@ -258,17 +255,11 @@ def backend_fn(gm: GraphModule, real_inputs):
         if graph_id not in profiling_results:
             profiling_results[graph_id] = ProfilingResult()
         profiling_results[graph_id].param_indices = param_indices
-        profiling_results[graph_id].needs_backward = needs_backward
 
         def make_fw_graph(gm, sample_inputs):
            time_start = time.time()
            graph_index = len(graph_order) - 1
 
-            if needs_backward:
-                if len(frames_needing_bwd) == 0:
-                    patch_compiled_func()
-                frames_needing_bwd.add(frame_id)
-
            # Try to get real_inputs from the list first, then from storage
            if fwd_real_inputs:
                real_inputs = fwd_real_inputs.pop(0)
@@ -347,10 +338,6 @@ def make_bw_graph(gm, sample_inputs):
            add_free_activations(graph_id, gm.graph,
                                 get_activation_node_names(gm.graph, param_nodes_bw, non_param_input_names))
 
-            frames_needing_bwd.remove(frame_id)
-            if len(frames_needing_bwd) == 0:
-                unpatch_compiled_func()
-
            log_rank0(
                f"Bwd end {graph_index} graph_id={graph_id} alloc_mem={get_accelerator().memory_allocated()} graph={gm.graph}",
                enable=debug_log)

deepspeed/compile/patch_compiled_func.py

Lines changed: 8 additions & 0 deletions
@@ -3,6 +3,7 @@
 
 # DeepSpeed Team
 
+from contextlib import contextmanager
 import torch
 from deepspeed.utils.torch import required_torch_version
 
@@ -89,5 +90,12 @@ def unpatch_compiled_func():
     torch.autograd.Function = original_grad_fn
 
 
+@contextmanager
+def collect_backward_inputs():
+    patch_compiled_func()
+    yield
+    unpatch_compiled_func()
+
+
 def get_backward_inputs():
     return backward_inputs
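A minimal usage sketch of the new context manager; the Linear module and inputs here are placeholders standing in for a DeepCompile-compiled module:

import torch
from deepspeed.compile.patch_compiled_func import collect_backward_inputs, get_backward_inputs

model = torch.nn.Linear(8, 8)   # stand-in for a DeepCompile-compiled module
x = torch.randn(2, 8)

# torch.autograd.Function is patched only inside this block, so the patched
# FunctionMeta is no longer in effect when backward graphs execute later.
with collect_backward_inputs():
    loss = model(x).sum()

loss.backward()
inputs_seen = get_backward_inputs()  # inputs captured from compiled autograd functions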

deepspeed/compile/profilers/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@
 class ProfilingResult:
     fwd_graph: Graph = None
     bwd_graph: Graph = None
-    needs_backward: bool = False
     fwd_mem: List[Tuple[str, int, int, int]] = field(default_factory=list)  # name, current_alloc, delta, peak
     bwd_mem: List[Tuple[str, int, int, int]] = field(default_factory=list)
     fwd_time: List[Tuple[str, int, int]] = field(default_factory=list)  # name, device_time, wall_time

deepspeed/compile/util.py

Lines changed: 1 addition & 1 deletion
@@ -469,7 +469,7 @@ def is_release_node(n: Node) -> bool:
 
 
 def get_index_by_graph_id(graph_order, target_graph_id):
-    for index, (graph_id, _) in enumerate(graph_order):
+    for index, (graph_id, ) in enumerate(graph_order):
         if graph_id == target_graph_id:
             return index
     return -1

deepspeed/runtime/engine.py

Lines changed: 10 additions & 4 deletions
@@ -11,6 +11,7 @@
 from collections import defaultdict, OrderedDict, deque
 from shutil import copyfile
 import gc
+from contextlib import nullcontext
 
 from torch.nn.modules import Module
 from torch.nn.parameter import Parameter
@@ -125,6 +126,7 @@
 from deepspeed.compile.util import is_deepcompile_supported, get_deepcompile_handle, deepcompile_backward_prologue
 from deepspeed.compile.backend import register_compile_pass, opt_passes
 from deepspeed.compile.passes import zero3_compile, prefetch, selective_gather, offload_adam_states
+from deepspeed.compile.patch_compiled_func import collect_backward_inputs
 from deepspeed.compile.init_z1 import init_z1
 from deepspeed.compile.init_z3 import init_z3
 from deepspeed.compile.init_sp import init_autosp
@@ -2350,11 +2352,15 @@ def forward(self, *inputs, **kwargs):
                 "DeepCompile is enabled but engine.compile() has not been called; executing without DeepCompile until compile() runs.",
                 ranks=[0])
 
-        if self.is_deepcompile_active() and hasattr(self, "launch_compile_passes"):
-            # We can't have this in forward prologue as the compiler compiles hooks including the forward prologue.
-            self.launch_compile_passes(self.global_steps)
+        if self.is_deepcompile_active():
+            collect_backward_input_ctx = collect_backward_inputs
+            if hasattr(self, "launch_compile_passes"):
+                # We can't have this in forward prologue as the compiler compiles hooks including the forward prologue.
+                self.launch_compile_passes(self.global_steps)
+        else:
+            collect_backward_input_ctx = nullcontext
 
-        with autocast_if_enabled(self):
+        with autocast_if_enabled(self), collect_backward_input_ctx():
             loss = self.module(*inputs, **kwargs)
 
         # Register output backward hooks