facebookincubator
diff --git a/examples/07_how_to_run_pt_model/classic_b2b_bmm_example.py b/examples/07_how_to_run_pt_model/classic_b2b_bmm_example.py
Lines changed: 37 additions & 6 deletions
diff --git a/examples/07_how_to_run_pt_model/classic_b2b_bmm_flow.md b/examples/07_how_to_run_pt_model/classic_b2b_bmm_flow.md
Lines changed: 136 additions & 0 deletions
@@ -141,15 +141,21 @@ def build_decomposed_b2b_bmm_graph(batch, seq_len, head_dim, dtype="float16"):
 # =============================================================================
 
 
-def run_pattern_matching_example():
+def run_pattern_matching_example(use_cutedsl=False):
     """Test: Decomposed ops auto-fused into classic_b2b_bmm by compiler pass.
 
     Builds an AIT graph from primitive ops (bmm_rcr, elementwise MUL/ADD/SIGMOID,
     bmm_rrr) and verifies that the fuse_b2b_bmm pass fuses them into a single
     classic_b2b_bmm kernel, producing results matching PyTorch.
+
+    Parameters
+    ----------
+    use_cutedsl : bool
+        If True, use CuTeDSL backend instead of CUTLASS C++ templates.
     """
+    backend_name = "CuTeDSL" if use_cutedsl else "CUTLASS C++"
     print("\n" + "=" * 60)
-    print("Pattern Matching Test: decomposed ops -> classic_b2b_bmm")
+    print(f"Pattern Matching Test: decomposed ops -> classic_b2b_bmm ({backend_name})")
     print("=" * 60)
 
     batch, seq_len, head_dim = 4, 128, 64
@@ -166,15 +172,19 @@ def run_pattern_matching_example():
     y_pt = pt_model(q_pt, k_pt, v_pt, bias_pt)
 
     # Build AIT graph from decomposed ops (NOT ops.classic_b2b_bmm)
-    target = _get_target(use_fp16_acc=False)
+    target = _get_target(use_fp16_acc=False, use_cutedsl_b2b_bmm=use_cutedsl)
     logging.getLogger("aitemplate").setLevel(logging.DEBUG)
 
     with target:
         Y = build_decomposed_b2b_bmm_graph(batch, seq_len, head_dim, dtype)
 
     # Compile - the fuse_b2b_bmm pass will fuse the decomposed graph
-    print("\nCompiling... (fuse_b2b_bmm pass will pattern-match and fuse)")
-    with compile_model(Y, target, "./tmp", "pattern_matched_b2b_bmm") as module:
+    workdir_suffix = "cutedsl" if use_cutedsl else "cutlass"
+    print(f"\nCompiling with {backend_name} backend...")
+    print("(fuse_b2b_bmm pass will pattern-match and fuse)")
+    with compile_model(
+        Y, target, "./tmp", f"pattern_matched_b2b_bmm_{workdir_suffix}"
+    ) as module:
         y_ait = torch.empty_like(y_pt)
         module.run_with_tensors(
             {"Q": q_pt, "K": k_pt, "V": v_pt, "Bias": bias_pt},
@@ -189,13 +199,34 @@ def run_pattern_matching_example():
 
 
 def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(description="AITemplate classic_b2b_bmm example")
+    parser.add_argument(
+        "--use-cutedsl",
+        action="store_true",
+        default=False,
+        help="Use CuTeDSL backend instead of CUTLASS C++ templates",
+    )
+    parser.add_argument(
+        "--both",
+        action="store_true",
+        default=False,
+        help="Run with both CUTLASS C++ and CuTeDSL backends",
+    )
+    args = parser.parse_args()
+
     print("=" * 60)
     print("AITemplate classic_b2b_bmm Pattern Matching Example")
     print("=" * 60)
     print("\nDemonstrates automatic fusion of decomposed attention ops")
     print("into classic_b2b_bmm via the fuse_b2b_bmm compiler pass.")
 
-    run_pattern_matching_example()
+    if args.both:
+        run_pattern_matching_example(use_cutedsl=False)
+        run_pattern_matching_example(use_cutedsl=True)
+    else:
+        run_pattern_matching_example(use_cutedsl=args.use_cutedsl)
 
     print("\n" + "=" * 60)
     print("All tests passed!")
 
@@ -0,0 +1,136 @@
+# AITemplate classic_b2b_bmm: Graph Optimization & Code Generation Flow
+
+## Overview
+
+This document describes the end-to-end compilation flow when decomposed
+attention ops are automatically fused into a single `classic_b2b_bmm` kernel.
+
+## Flow Diagram
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│  User Code: build_decomposed_b2b_bmm_graph()                   │
+│                                                                 │
+│  Q ──► bmm_rcr(Q,K) ──► MUL(α₀) ──► ADD(bias) ──► SIGMOID     │
+│                                                        │        │
+│                                                  MUL(α₁)       │
+│                                                        │        │
+│                                           bmm_rrr(score,V) ──► Y│
+└──────────────────────────┬──────────────────────────────────────┘
+                           │
+                           ▼
+┌──────────────────────────────────────────────────────────────────┐
+│  compile_model(Y, target, workdir, test_name)                    │
+│  [compiler.py]                                                   │
+│                                                                  │
+│  1. toposort(output_tensors)                                     │
+│  2. name_graph(sorted_graph)                                     │
+│  3. optimize_graph(sorted_graph)  ◄──────────────────────────┐   │
+│     │                                                        │   │
+│     ├─ constant_folding                                      │   │
+│     ├─ fuse_ops (elementwise fusions, etc.)                  │   │
+│     ├─ ★ fuse_b2b_bmm(sorted_graph) ◄───── PATTERN MATCH    │   │
+│     │   │                                                    │   │
+│     │   │  Matches chain:                                    │   │
+│     │   │    bmm_rcr → MUL(const) → ADD(tensor)             │   │
+│     │   │    → activation → [MUL(const)] → bmm_rrr          │   │
+│     │   │                                                    │   │
+│     │   │  Replaces with:                                    │   │
+│     │   │    classic_b2b_bmm(Q, K, V, bias)                  │   │
+│     │   │    α₀, α₁, epilogue baked into op attrs            │   │
+│     │   │                                                    │   │
+│     │   └─ Replaces up to 6 decomposed ops, removing 4+      │   │
+│     │      intermediate tensors                              │   │
+│     │                                                        │   │
+│     ├─ memory_planning(sorted_graph)                         │   │
+│     └─ other passes...                                       │   │
+│                                                                  │
+│  4. codegen(sorted_graph, workdir)                               │
+│     │                                                            │
+│     ├─ gen_function_src()                                        │
+│     │   For each op (including classic_b2b_bmm):                 │
+│     │   ┌────────────────────────────────────────────────────┐   │
+│     │   │ op.gen_function()                                  │   │
+│     │   │   → registry.get("cuda.classic_b2b_bmm.gen_function")│ │
+│     │   │   → Renders Jinja2 FUNC_TEMPLATE                   │  │
+│     │   │   → Writes <func_name>.cu                          │  │
+│     │   └────────────────────────────────────────────────────┘   │
+│     │                                                            │
+│     ├─ ModelContainerGenerator                                   │
+│     │   → func_decl(): function declarations                     │
+│     │   → func_call(): invocations in RunImpl()                  │
+│     │   → Writes model.cu, model_container.cu                    │
+│     │                                                            │
+│     └─ copy_headers_and_csrc_to_workdir()                        │
+│                                                                  │
+│  5. build(file_pairs, workdir, test_name)                        │
+│     │                                                            │
+│     ├─ gen_makefile()                                            │
+│     ├─ nvcc <func>.cu → <func>.obj                               │
+│     ├─ nvcc model.cu → model.obj                                 │
+│     └─ nvcc -shared *.obj → test.so                              │
+│                                                                  │
+│  6. Return Model(workdir)                                        │
+└──────────────────────────┬───────────────────────────────────────┘
+                           │
+                           ▼
+┌──────────────────────────────────────────────────────────────────┐
+│  Runtime: module.run_with_tensors(inputs, outputs)               │
+│  [model.py → Model class]                                        │
+│                                                                  │
+│  1. ctypes.CDLL loads test.so                                    │
+│  2. Sets input pointers + dynamic dims                           │
+│  3. Calls RunImpl(stream) in C++                                 │
+│     → Invokes classic_b2b_bmm_func(output, Q, K, V, bias,       │
+│         batch_size, num_heads, m0, k0, stream)                   │
+│     → Inside: instantiates B2bGemmBatched<...>, runs on GPU      │
+│  4. Returns output tensors                                       │
+└──────────────────────────────────────────────────────────────────┘
+```
+
+## Generated CUDA Code Structure
+
+The backend codegen (`backend/cuda/b2b_bmm/classic_b2b_bmm.py`) produces:
+
+### `<func_name>.cu` — Kernel Source
+```cpp
+#include "cutlass/cutlass.h"
+#include "classic_b2b_bmm/device/b2b_batched_gemm.h"
+
+// Hardcoded tile sizes
+constexpr int ThreadblockM = 64, ThreadblockK = 32;
+constexpr int WarpM = 16, WarpK = 32;
+constexpr int N0 = <seq_len>, N1 = <head_dim>;
+
+void <func_name>(void* output, void* query, void* key, void* value,
+                 void* bias, int64_t batch_size, int64_t num_heads,
+                 int64_t m0, int64_t k0, cudaStream_t stream) {
+    // Type aliases, epilogue ops, B2bGemmBatched instantiation
+    // Argument construction with batched/multi-head strides
+    // Initialize and execute
+}
+```
+
+### `model.cu` — Container
+```cpp
+class Model : public ModelBase<Model> {
+    void RunImpl(StreamType stream) {
+        // ... sets up pointers ...
+        <func_name>(output, Q, K, V, bias, batch, heads, m0, k0, stream);
+    }
+};
+```
+
+## Key Files
+
+| Component | File |
+|-----------|------|
+| Pattern matching | `compiler/transform/fuse_b2b_bmm.py` |
+| Op definition | `compiler/ops/b2b_bmm/classic_b2b_bmm.py` |
+| Base class | `compiler/ops/b2b_bmm/b2b_bmm_base.py` |
+| CUDA backend | `backend/cuda/b2b_bmm/classic_b2b_bmm.py` |
+| CUTLASS headers | `static/include/kernels/classic_b2b_bmm/` |
+| Compiler entry | `compiler/compiler.py` |
+| Code generation | `backend/codegen.py` |
+| Builder | `backend/builder.py` |
+| Runtime | `compiler/model.py` |