Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions src/compressed_tensors/offload/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import contextlib
from collections.abc import Iterable
from typing import Literal
from typing import Literal, Optional

import torch
from compressed_tensors.distributed.utils import set_source_process
Expand Down Expand Up @@ -32,6 +32,7 @@
to_meta,
)
from compressed_tensors.utils.helpers import patch_attr
from torch._prims_common import DeviceLikeType


__all__ = [
Expand Down Expand Up @@ -145,8 +146,8 @@ def update_offload_parameter(module: torch.nn.Module, name: str, data: torch.Ten


def get_execution_device(
module: torch.nn.Module, default: torch.device | None = None
) -> torch.device | Literal["disk"]:
module: torch.nn.Module, default: Optional[DeviceLikeType] = None
) -> torch.device:
"""
Get the device which inputs should be moved to before module execution.

Expand All @@ -161,7 +162,7 @@ def get_execution_device(


def get_offloaded_device(
module: torch.nn.Module, default: torch.device | None = None
module: torch.nn.Module, default: Optional[DeviceLikeType] = None
) -> torch.device | Literal["disk"]:
"""
:param module: module to check
Expand Down Expand Up @@ -236,7 +237,7 @@ def register_offload_module(base: torch.nn.Module, name: str, module: torch.nn.M
@contextlib.contextmanager
def align_modules(
modules: torch.nn.Module | Iterable[torch.nn.Module],
execution_device: torch.device | None = None,
execution_device: Optional[DeviceLikeType] = None,
):
"""
Context manager for onloading modules to a device, and disabling onload and offload
Expand All @@ -253,7 +254,7 @@ def align_modules(

@contextlib.contextmanager
def align_module_device(
module: torch.nn.Module, execution_device: torch.device | None = None
module: torch.nn.Module, execution_device: Optional[DeviceLikeType] = None
):
"""
Context manager that moves a module's parameters to the specified execution device.
Expand Down Expand Up @@ -286,4 +287,4 @@ def align_module_device(
finally:
for name, param in module.named_parameters(recurse=False):
device = original_device[name]
move_module_tensor(module, name, device)
move_module_tensor(module, name, device)
10 changes: 5 additions & 5 deletions src/compressed_tensors/offload/cache/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import torch
import torch.distributed as dist
from compressed_tensors.utils import is_accelerator_type
from torch._prims_common import DeviceLikeType


class OffloadCache(MutableMapping, ABC):
Expand All @@ -32,8 +33,8 @@ class OffloadCache(MutableMapping, ABC):
info, see `compressed_tensors.offload::(disable_offloading|disable_onloading)`
"""

onload_device: torch.device | str
offload_device: torch.device | Literal["disk"]
onload_device: DeviceLikeType
offload_device: DeviceLikeType | Literal["disk"]
Comment on lines +36 to +37
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check how onload_device is used after being stored - does any code assume it's a torch.device?
rg -n -A2 'onload_device' src/compressed_tensors/offload/ --type py | head -60

Repository: vllm-project/compressed-tensors

Length of output: 4390


🏁 Script executed:

# Find where DeviceLikeType is defined and check its definition
rg -n "DeviceLikeType" src/compressed_tensors/ --type py -B2 -A2 | head -80

Repository: vllm-project/compressed-tensors

Length of output: 5353


🏁 Script executed:

# Check the send_tensors function signature
rg -n "def send_tensors" src/compressed_tensors/ --type py -A5

Repository: vllm-project/compressed-tensors

Length of output: 543


🏁 Script executed:

# Check the original type annotations in base.py before change
git show HEAD:src/compressed_tensors/offload/cache/base.py | head -60

Repository: vllm-project/compressed-tensors

Length of output: 2316


🏁 Script executed:

# Search for the get_execution_device function mentioned in review comment
rg -n "get_execution_device" src/compressed_tensors/ --type py -B2 -A5

Repository: vllm-project/compressed-tensors

Length of output: 7067


🏁 Script executed:

# Check what __init__.py line 149 actually contains (referenced in review)
rg -n "^" src/compressed_tensors/offload/__init__.py | head -200

Repository: vllm-project/compressed-tensors

Length of output: 7278


🏁 Script executed:

# Check all usages of onload_device to see if any depend on it being torch.device
rg -n "\.onload_device" src/compressed_tensors/ --type py -B1 -A1

Repository: vllm-project/compressed-tensors

Length of output: 2514


🏁 Script executed:

# Check torch.device constructor to confirm it accepts strings
python3 << 'EOF'
import torch
# Verify that torch.device can be called with string
try:
    d = torch.device("cuda:0")
    print(f"torch.device('cuda:0') works: {d}")
except Exception as e:
    print(f"Error: {e}")
EOF

Repository: vllm-project/compressed-tensors

Length of output: 192


🏁 Script executed:

# Check what AlignDevicesHook expects for execution_device
rg -n "AlignDevicesHook" src/compressed_tensors/ --type py -B2 -A10

Repository: vllm-project/compressed-tensors

Length of output: 6674


Type annotation mismatch: get_execution_device() return type is incorrect

The function get_execution_device() declares a return type of torch.device (line 141 in __init__.py), but it directly returns module._parameters.onload_device which is typed as DeviceLikeType and can be a string. In dispatch.py (line 56), strings are assigned directly: module._parameters.onload_device = onload_device where the parameter accepts torch.device | str.

While the code works in practice because callers accept DeviceLikeType, the type contract is violated. Normalizing to torch.device in the constructor resolves this:

🔧 Suggested fix
 def __init__(
     self,
     onload_device: torch.device | str,
     offload_device: torch.device | str | Literal["disk"] | None = None,
 ):
     super().__init__()
-    self.onload_device = onload_device
+    self.onload_device = torch.device(onload_device)
     self.offloaded_values = dict()
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/compressed_tensors/offload/cache/base.py` around lines 35 - 36,
get_execution_device's annotated return type (torch.device) doesn't match the
actual value returned (module._parameters.onload_device which can be a
string/DeviceLikeType); to fix, normalize and store a proper torch.device in the
object constructor so get_execution_device can safely return torch.device:
during initialization (where onload_device is accepted/assigned, e.g., the
constructor that sets module._parameters.onload_device and where dispatch.py
assigns module._parameters.onload_device = onload_device), convert the incoming
onload_device (string or torch.device) to torch.device (using
torch.device(onload_device) or equivalent) and assign that normalized
torch.device back to module._parameters.onload_device so get_execution_device
can keep its torch.device return type without type mismatch.


# global flags for disabling
offloading_disabled: ClassVar[bool] = False
Expand All @@ -47,8 +48,7 @@ class OffloadCache(MutableMapping, ABC):

@classmethod
def cls_from_device(
cls,
device: torch.device | str | Literal["disk"] | None = None,
cls, device: DeviceLikeType | Literal["disk"]
) -> type["OffloadCache"]:
"""
Get the subclass which implements offloading for the given `offload_device`.
Expand Down Expand Up @@ -274,4 +274,4 @@ def disable_onloading(cls):
yield
OffloadCache.onloading_disabled = restore_value
else:
yield
yield
5 changes: 4 additions & 1 deletion src/compressed_tensors/offload/cache/cpu.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Literal, Optional

import torch
from compressed_tensors.offload.cache.base import OffloadCache
from compressed_tensors.offload.cache.utils import catch_cpu_mem_error
from compressed_tensors.offload.utils import send_tensors
from torch._prims_common import DeviceLikeType


class CPUCache(OffloadCache):
Expand Down Expand Up @@ -45,4 +48,4 @@ def update_offload(self, offloaded: torch.Tensor, data: torch.Tensor | None):
:param data: new data to copy from
"""
if data is not None:
offloaded.copy_(data)
offloaded.copy_(data)
10 changes: 3 additions & 7 deletions src/compressed_tensors/offload/cache/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,12 @@
import torch
from compressed_tensors.offload.cache.base import OffloadCache
from compressed_tensors.offload.utils import send_tensors


if TYPE_CHECKING:
from torch._prims_common import DeviceLikeType
from torch._prims_common import DeviceLikeType


class DeviceCache(OffloadCache):
"""
Handles offloading and onloading tensors from/to device memory. Onloading
tensors is typically a no-op (except onload device has been modified).
Handles offloading and onloading tensors from/to device memory.
"""

def __init__(
Expand Down Expand Up @@ -57,4 +53,4 @@ def update_offload(self, offloaded: torch.Tensor, data: torch.Tensor | None):
:param data: new data to copy from
"""
if data is not None:
offloaded.copy_(data)
offloaded.copy_(data)
19 changes: 16 additions & 3 deletions src/compressed_tensors/offload/cache/disk.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from safetensors import safe_open
from safetensors.torch import save_file


if TYPE_CHECKING:
from torch._prims_common import DeviceLikeType

Expand Down Expand Up @@ -128,7 +127,7 @@ def update_offload(self, offloaded: torch.Tensor, data: torch.Tensor | None):
"""
Write new param data to file that already exists.

:param offloaded: meta tensors representating parameter to update
:param offloaded: meta tensor representing the parameter to update
:param data: new data
"""
# get weight info from index
Expand All @@ -153,6 +152,20 @@ def create_checkpoint_symlink(
weight_info: dict,
offload_dir: str | os.PathLike | None,
) -> None:
"""
Create a symlink to a checkpoint safetensors file. This symlink allows
individual tensor data to be modified and deleted without affecting
the original model checkpoint files.

When reading, the symlink redirects the read to the checkpoint file
When updating, the symlink is destroyed and a new file written to the same path
When deleting, the symlink (or new file) is destroyed

:param offloaded: meta tensor representing the parameter in the checkpoint
:param weight_info: info (typically from accelerate) pointing to checkpoint
:param offload_dir: offload directory in which to create the symlink
"""
assert offloaded.device.type == "meta"
assert (
is_source_process()
), "Must call on rank 0 to avoid id collisions between ranks"
Expand Down Expand Up @@ -191,4 +204,4 @@ def _get_safe_open_device(device: "DeviceLikeType") -> str:
index = device.index
return f"{device.type}:{index}"
else:
return device.type
return device.type
5 changes: 3 additions & 2 deletions src/compressed_tensors/offload/convert/from_accelerate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from compressed_tensors.offload.dispatch import dispatch_with_map
from compressed_tensors.offload.utils import to_tensor
from loguru import logger
from torch._prims_common import DeviceLikeType


if TYPE_CHECKING:
Expand Down Expand Up @@ -88,7 +89,7 @@ def remove_accelerate(model: torch.nn.Module) -> tuple["DeviceMap", str | None]:

def remove_accelerate_from_module(
module: torch.nn.Module,
) -> tuple[torch.device | None, torch.device | Literal["disk"] | None, str | None]:
) -> tuple[DeviceLikeType | None, DeviceLikeType | Literal["disk"] | None, str | None]:
"""
Remove accelerate offloading from a module, if present.
Absolutely no device movement occurs, and parameters/buffers pointers from state
Expand Down Expand Up @@ -236,4 +237,4 @@ def _unwrap_prefixed_dataset(weights_map, PrefixedDatasetType):
def _set_or_validate_offload(current: str | None, new: str) -> str:
if current not in (None, new):
raise ValueError("Expected all accelerate tensors to share offload")
return new
return new
5 changes: 3 additions & 2 deletions src/compressed_tensors/offload/dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from compressed_tensors.utils.binary_search import SearchFailureError, max_binary_search
from compressed_tensors.utils.helpers import deprecated
from loguru import logger
from torch._prims_common import DeviceLikeType
from tqdm import tqdm
from transformers import PreTrainedModel

Expand Down Expand Up @@ -104,7 +105,7 @@ def dispatch_with_map(


def get_device_map(
model: torch.nn.Module, default_device: torch.device = torch.device("cpu")
model: torch.nn.Module, default_device: DeviceLikeType = torch.device("cpu")
) -> DeviceMap:
"""
Get the device map of a CT-offloaded model
Expand Down Expand Up @@ -309,4 +310,4 @@ def _get_greedy_dispatch(
memory_remaining[device] -= size
break

return dispatch, memory_remaining
return dispatch, memory_remaining
11 changes: 6 additions & 5 deletions src/compressed_tensors/offload/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
from collections.abc import Container
from dataclasses import fields, is_dataclass
from itertools import chain
from typing import TypeVar
from typing import Optional, TypeVar

import torch
from compressed_tensors.utils.helpers import patch_attr
from loguru import logger
from torch._prims_common import DeviceLikeType


__all__ = [
Expand Down Expand Up @@ -66,7 +67,7 @@ def send_tensors(value: T, *args, **kwargs) -> T:


def get_module_device(
module: torch.nn.Module, default: torch.device | None = None
module: torch.nn.Module, default: Optional[DeviceLikeType] = None
) -> torch.device:
"""
Infer the device of a module using the first
Expand All @@ -80,7 +81,7 @@ def get_module_device(
if tensor is not None:
return tensor.device
elif default is not None:
return default
return torch.device(default)
else:
logger.warning(
f"Unable to get execution device of {module}, falling back to CPU",
Expand All @@ -92,7 +93,7 @@ def get_module_device(
def move_module_tensor(
module: torch.nn.Module,
name: str,
device: int | str | torch.device,
device: DeviceLikeType,
):
"""
Move a module's tensor to a new device
Expand Down Expand Up @@ -237,4 +238,4 @@ def as_single_threaded():
patch_attr(DistributedCPUCache, "offload", CPUCache.offload),
patch_attr(DistributedDiskCache, "offload", DiskCache.offload),
):
yield
yield
Loading