99from functools import cached_property
1010import numpy as np
1111
12+ # Mapping from ml_dtypes (non-native numpy) types to their torch equivalents.
13+ # Native numpy dtypes (float32, int32, …) are handled directly by torch.from_numpy
14+ # and do not need an entry here.
15+ # Populated lazily at first use to avoid importing torch/ml_dtypes at module load.
16+ _ML_DTYPE_TO_TORCH : dict | None = None
17+
18+
19+ def _ml_dtype_to_torch_map ():
20+ global _ML_DTYPE_TO_TORCH
21+ if _ML_DTYPE_TO_TORCH is None :
22+ import torch
23+ import ml_dtypes
24+
25+ _candidates = {
26+ ml_dtypes .bfloat16 : torch .bfloat16 ,
27+ }
28+ for attr in (
29+ "float8_e4m3fn" ,
30+ "float8_e5m2" ,
31+ "float8_e4m3fnuz" ,
32+ "float8_e5m2fnuz" ,
33+ ):
34+ ml_dt = getattr (ml_dtypes , attr , None )
35+ torch_dt = getattr (torch , attr , None )
36+ if ml_dt is not None and torch_dt is not None :
37+ _candidates [ml_dt ] = torch_dt
38+ _ML_DTYPE_TO_TORCH = {
39+ np .dtype (ml_dt ): torch_dt for ml_dt , torch_dt in _candidates .items ()
40+ }
41+ return _ML_DTYPE_TO_TORCH
42+
43+
# Same-width unsigned integer dtype for the ND reinterpret-view trick,
# keyed by itemsize in bytes.
_UINT_VIEW_DTYPE = {
    1: np.uint8,
    2: np.uint16,
    4: np.uint32,
    8: np.uint64,
}
51+
52+
def _array_to_torch(array: np.ndarray):
    """
    Convert a numpy array to a torch tensor, zero-copy.

    For native numpy dtypes (float32, float16, int32, …) torch.from_numpy is used
    directly (fastest path for these types).

    For ml_dtypes types (bfloat16, float8_*) that torch cannot consume via
    from_numpy: reinterpret as a same-width unsigned integer numpy view, wrap with
    from_numpy, then view as the target torch dtype. This is guaranteed zero-copy
    for all ranks.

    Args:
        array: Source array; its memory is shared with the returned tensor.

    Returns:
        torch.Tensor: A zero-copy tensor over the array's buffer.

    Raises:
        ImportError: If torch is not installed.
    """
    # Import torch up front so a missing installation raises the documented,
    # actionable message instead of a bare "No module named 'torch'".
    try:
        import torch
    except ImportError:
        raise ImportError(
            "torch is not installed. Please install it with 'pip install torch'"
        )

    torch_dtype = _ml_dtype_to_torch_map().get(array.dtype)
    if torch_dtype is None:
        # Native numpy dtype: torch.from_numpy handles it directly and fastest.
        return torch.from_numpy(array)

    # ml_dtype: reinterpret memory as a same-width uint, then view as the torch dtype.
    uint_dtype = _UINT_VIEW_DTYPE[array.dtype.itemsize]
    return torch.from_numpy(array.view(uint_dtype)).view(torch_dtype)
79+
1280
1381class Tensor (ABC ):
1482 """
1583 Tensor object backed by NPU or CPU memory.
1684
    The class provides common tensor operations such as creation,
1886 filling with values, and accessing data.
1987
2088 """
@@ -258,28 +326,33 @@ def to_torch(self):
258326 """
259327 Returns a torch tensor sharing the data in this tensor if possible.
260328
329+ Syncs from device first if the tensor is on the NPU.
330+
261331 Returns:
262332 torch.Tensor: A torch tensor containing the data.
263333
264334 Raises:
265335 ImportError: If torch is not installed.
266336 """
267- try :
268- import torch
269- from ml_dtypes import bfloat16
270- except ImportError :
271- raise ImportError (
272- "torch is not installed. Please install it with 'pip install torch'"
273- )
337+ return _array_to_torch (self .numpy ())
274338
275- array = self .numpy ()
339+ def torch_view (self ):
340+ """
341+ Returns a torch tensor sharing this buffer's host memory without syncing from device.
276342
277- if array . dtype == bfloat16 :
278- # reinterpret the same memory as int16, then view as torch.bfloat16
279- t_u16 = torch . from_numpy ( array . view ( np . uint16 ))
280- return t_u16 . view ( torch . bfloat16 )
343+ Unlike to_torch(), this does NOT sync from the NPU first. Marks the buffer as
344+ CPU-resident so that a subsequent .to("npu") call (or the NPU operator's implicit
345+ sync) will push the written data to device. Use this on write paths where the
346+ caller is about to overwrite the buffer contents.
281347
282- return torch .from_numpy (array )
348+ Returns:
349+ torch.Tensor: A zero-copy torch tensor view of the host-side buffer.
350+
351+ Raises:
352+ ImportError: If torch is not installed.
353+ """
354+ self .device = "cpu" # mark dirty so next to("npu") will actually sync
355+ return _array_to_torch (self .data )
283356
284357 @classmethod
285358 def from_torch (cls , torch_tensor , device = None , ** kwargs ):
@@ -297,13 +370,8 @@ def from_torch(cls, torch_tensor, device=None, **kwargs):
297370 Raises:
298371 ImportError: If torch is not installed.
299372 """
300- try :
301- import torch
302- from ml_dtypes import bfloat16
303- except ImportError :
304- raise ImportError (
305- "torch is not installed. Please install it with 'pip install torch'"
306- )
373+ import torch
374+ from ml_dtypes import bfloat16
307375
308376 # Detach (to drop grad) and ensure on CPU
309377 t = torch_tensor .detach ()
0 commit comments