# Cargo.toml — manifest for the air-rs crate.
# Crate identity and registry metadata.
[package]
name = "air-rs"
version = "0.2.0"
authors = ["Sunay Hegde"]
edition = "2021"
description = "A high-performance, memory-fluid LLM inference engine."
[lib]
# Library target. Two artifact kinds are built from the same sources:
#   cdylib → shared library packaged as the Python wheel (.so, .pyd on Windows)
#   rlib   → regular Rust library; keeps `cargo test` working
name = "air_rs"
crate-type = ["cdylib", "rlib"]
[dependencies]
# Standard Async & Memory
anyhow = "1.0"
libc = "0.2"
memmap2 = "0.9"
# NOTE(review): per the tokio docs, the `tracing` feature is an unstable feature
# that only takes effect when the build passes `--cfg tokio_unstable` — confirm
# that flag is set (e.g. in .cargo/config.toml), otherwise this entry is inert.
tokio = { version = "1.37", features = ["full", "tracing"] }
# ML & CUDA
# candle-core is always included for tensor operations (CPU fallback).
# The "cuda" feature enables GPU acceleration via candle-core's cuda feature.
candle-core = "0.8.0"
candle-flash-attn = { version = "0.8.0", optional = true }
half = { version = "2.4", features = ["num-traits"] }
safetensors = "0.5.2"
# API & Utilities
# axum uses its default feature set; listing `features = ["default"]` is a no-op
# because default features are already enabled unless `default-features = false`.
axum = "0.7"
byteorder = "1.5"
dashmap = "6.1"
indicatif = "0.17"
rand = "0.8"
rayon = "1.10"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
thiserror = "2.0"
tokio-stream = "0.1"
toml = "0.8"
# §13 optional ARB performance deps (feature-gated, default = std-only)
# Enabled through the `arb-lockfree` and `arb-heap` features respectively.
crossbeam-channel = { version = "0.5", optional = true }
ordered-float = { version = "4.2", optional = true }
# Model Hub & Utilities
clap = { version = "4.5", features = ["derive"] }
dirs = "5.0"
env_logger = "0.11"
log = "0.4"
sha2 = "0.10"
tracing = "0.1"
tracing-subscriber = "0.3"
# Default features are disabled; only TLS and gzip support are opted into.
ureq = { version = "2.10", default-features = false, features = ["tls", "gzip"] }
uuid = { version = "1.10", features = ["v4"] }
[features]
# Nothing is enabled by default; every accelerator and binding is opt-in.
default = []
# GPU back-ends that forward to the matching candle-core feature.
cuda = ["candle-core/cuda"]
metal = ["candle-core/metal"]
# Declared with no dependencies of their own; presumably they gate
# crate-internal `cfg(feature = "...")` code paths — TODO confirm.
rocm = []
vulkan = []
# candle-flash-attn provides CUDA kernels, so enabling it also turns on this
# crate's `cuda` feature to keep `cfg(feature = "cuda")` code consistent.
# The optional dependency is referenced by its implicit feature name (no `dep:`
# prefix), so `--features candle-flash-attn` keeps working as before.
flash-attn = ["cuda", "candle-flash-attn"]
# Python bindings. Also referenced by implicit feature name, so a `pyo3`
# feature remains available alongside `python`.
python = ["pyo3"]
# §13 optional performance deps
# The `dep:` prefix means no implicit `ordered-float` / `crossbeam-channel`
# features are exposed; only the names below select them.
# Enable BinaryHeap-based O(log n) priority queue for waiting queue (W > 512)
arb-heap = ["dep:ordered-float"]
# Enable lock-free enqueue path via crossbeam channel (high-frequency HTTP)
arb-lockfree = ["dep:crossbeam-channel"]
# Optional PyO3 dependency, switched on by the `python` feature.
# NOTE(review): this sub-table of [dependencies] is declared after [features];
# valid TOML, but consider keeping it next to the other dependencies.
[dependencies.pyo3]
version = "0.21"
# abi3-py311       → build against the stable ABI, minimum Python 3.11
# extension-module → build as a Python extension module (per the PyO3 docs,
#                    libpython is not linked in this mode)
features = ["abi3-py311", "extension-module"]
optional = true
# Release profile tuned for maximum optimisation and a small binary.
[profile.release]
opt-level = 3
# Whole-program LTO with a single codegen unit.
lto = "fat"
codegen-units = 1
# smaller .so → faster dlopen
strip = "symbols"
# NOTE(review): with `panic = "abort"` a Rust panic terminates the whole
# process instead of unwinding; when built with the `python` feature that
# includes the host Python interpreter — confirm this is intended.
panic = "abort"
# Benchmark profile. Cargo's `bench` profile inherits from `release`;
# LTO is relaxed from "fat" to "thin" here.
[profile.bench]
lto = "thin"
opt-level = 3
[dev-dependencies]
# Benchmarking framework with its `html_reports` feature enabled;
# presumably drives the `throughput` bench target — verify against benches/.
criterion = { version = "0.5", features = ["html_reports"] }
# Benchmark target named `throughput`.
[[bench]]
name = "throughput"
# Disable the built-in libtest harness so the benchmark supplies its own entry point.
harness = false