// weights Q3_K_M for Command R+ (104B params)
weights = params × bits ÷ 8
= 104 × 3.44 ÷ 8
= 44.72 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
44.72 ≤ 51.85 → FITS
headroom = 7.13 GB of weights budget left

// weights Q5_K_M for Qwen 2.5 72B (72.7B params)
weights = params × bits ÷ 8
= 72.7 × 5.5 ÷ 8
= 49.98 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
49.98 ≤ 51.85 → FITS
headroom = 1.87 GB of weights budget left

// weights Q5_K_M for Llama 3.3 70B (70.6B params)
weights = params × bits ÷ 8
= 70.6 × 5.5 ÷ 8
= 48.54 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
48.54 ≤ 51.85 → FITS
headroom = 3.31 GB of weights budget left

// weights Q8_0 for Mixtral 8x7B (46.7B params)
weights = params × bits ÷ 8
= 46.7 × 8.5 ÷ 8
= 49.62 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
49.62 ≤ 51.85 → FITS
headroom = 2.23 GB of weights budget left

// weights Q8_0 for Qwen 3.6 35B A3B (35B params)
weights = params × bits ÷ 8
= 35 × 8.5 ÷ 8
= 37.19 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
37.19 ≤ 51.85 → FITS
headroom = 14.66 GB of weights budget left

// weights Q8_0 for Yi 34B (34B params)
weights = params × bits ÷ 8
= 34 × 8.5 ÷ 8
= 36.13 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
36.13 ≤ 51.85 → FITS
headroom = 15.72 GB of weights budget left

// weights Q8_0 for Qwen 2.5 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 8.5 ÷ 8
= 34.53 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
34.53 ≤ 51.85 → FITS
headroom = 17.32 GB of weights budget left

// weights Q8_0 for Qwen 2.5 Coder 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 8.5 ÷ 8
= 34.53 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
34.53 ≤ 51.85 → FITS
headroom = 17.32 GB of weights budget left

// weights Q8_0 for Qwen3 30B A3B (30.5B params)
weights = params × bits ÷ 8
= 30.5 × 8.5 ÷ 8
= 32.41 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
32.41 ≤ 51.85 → FITS
headroom = 19.44 GB of weights budget left

// weights Q8_0 for Qwen 3.6 27B (27B params)
weights = params × bits ÷ 8
= 27 × 8.5 ÷ 8
= 28.69 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
28.69 ≤ 51.85 → FITS
headroom = 23.16 GB of weights budget left

// weights Q8_0 for Gemma 4 26B A4B (26B params)
weights = params × bits ÷ 8
= 26 × 8.5 ÷ 8
= 27.63 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
27.63 ≤ 51.85 → FITS
headroom = 24.22 GB of weights budget left

// weights FP16/BF16 for Mistral Small 3 (24B params)
weights = params × bits ÷ 8
= 24 × 16 ÷ 8
= 48.00 GB
// budget on M2 Max 64 (64GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, metal, allocator)
safety = 9.60 GB (15% of 64GB)
budget = vram − safety − kv − overhead
= 64 − 9.60 − 0.05 − 2.50
= 51.85 GB
// fit decision
48.00 ≤ 51.85 → FITS
headroom = 3.85 GB of weights budget left

// Sign in with GitHub to leave a comment. Threads live in the repo's Discussions tab.