// weights AWQ 4-bit for Mixtral 8x22B (141B params)
weights = params × bits ÷ 8
= 141 × 4.25 ÷ 8
= 74.91 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
74.91 ≤ 79.05 → FITS
headroom = 4.14 GB of weights budget left
// weights Q4_K_M for gpt-oss 120B (117B params)
weights = params × bits ÷ 8
= 117 × 4.5 ÷ 8
= 65.81 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
65.81 ≤ 79.05 → FITS
headroom = 13.24 GB of weights budget left
// weights Q5_K_M for Command R+ (104B params)
weights = params × bits ÷ 8
= 104 × 5.5 ÷ 8
= 71.50 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
71.50 ≤ 79.05 → FITS
headroom = 7.55 GB of weights budget left
// weights FP8/INT8 for Qwen 2.5 72B (72.7B params)
weights = params × bits ÷ 8
= 72.7 × 8 ÷ 8
= 72.70 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
72.70 ≤ 79.05 → FITS
headroom = 6.35 GB of weights budget left
// weights Q8_0 for Llama 3.3 70B (70.6B params)
weights = params × bits ÷ 8
= 70.6 × 8.5 ÷ 8
= 75.01 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
75.01 ≤ 79.05 → FITS
headroom = 4.04 GB of weights budget left
// weights Q8_0 for Mixtral 8x7B (46.7B params)
weights = params × bits ÷ 8
= 46.7 × 8.5 ÷ 8
= 49.62 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
49.62 ≤ 79.05 → FITS
headroom = 29.43 GB of weights budget left
// weights FP16/BF16 for Qwen 3.6 35B A3B (35B params)
weights = params × bits ÷ 8
= 35 × 16 ÷ 8
= 70.00 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
70.00 ≤ 79.05 → FITS
headroom = 9.05 GB of weights budget left
// weights FP16/BF16 for Yi 34B (34B params)
weights = params × bits ÷ 8
= 34 × 16 ÷ 8
= 68.00 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
68.00 ≤ 79.05 → FITS
headroom = 11.05 GB of weights budget left
// weights FP16/BF16 for Qwen 2.5 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 16 ÷ 8
= 65.00 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
65.00 ≤ 79.05 → FITS
headroom = 14.05 GB of weights budget left
// weights FP16/BF16 for Qwen 2.5 Coder 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 16 ÷ 8
= 65.00 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
65.00 ≤ 79.05 → FITS
headroom = 14.05 GB of weights budget left
// weights FP16/BF16 for Qwen3 30B A3B (30.5B params)
weights = params × bits ÷ 8
= 30.5 × 16 ÷ 8
= 61.00 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
61.00 ≤ 79.05 → FITS
headroom = 18.05 GB of weights budget left
// weights FP16/BF16 for Qwen 3.6 27B (27B params)
weights = params × bits ÷ 8
= 27 × 16 ÷ 8
= 54.00 GB
// budget on M3 Max 96 (96GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, Metal, allocator)
safety = 14.40 GB (15% of 96GB)
budget = vram − safety − kv − overhead
= 96 − 14.40 − 0.05 − 2.50
= 79.05 GB
// fit decision
54.00 ≤ 79.05 → FITS
headroom = 25.05 GB of weights budget left
// Sign in with GitHub to leave a comment. Threads live in the repo's Discussions tab.