// weights Q4_K_M for DeepSeek V3 (671B params)
weights = params × bits ÷ 8
= 671 × 4.5 ÷ 8
= 377.44 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
377.44 ≤ 432.65 → FITS
headroom = 55.21 GB of weights budget left

// weights Q4_K_M for DeepSeek R1 (671B params)
weights = params × bits ÷ 8
= 671 × 4.5 ÷ 8
= 377.44 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
377.44 ≤ 432.65 → FITS
headroom = 55.21 GB of weights budget left

// weights FP8/INT8 for Llama 3.1 405B (405B params)
weights = params × bits ÷ 8
= 405 × 8 ÷ 8
= 405.00 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
405.00 ≤ 432.65 → FITS
headroom = 27.65 GB of weights budget left

// weights FP16/BF16 for Mixtral 8x22B (141B params)
weights = params × bits ÷ 8
= 141 × 16 ÷ 8
= 282.00 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
282.00 ≤ 432.65 → FITS
headroom = 150.65 GB of weights budget left

// weights FP16/BF16 for gpt-oss 120B (117B params)
weights = params × bits ÷ 8
= 117 × 16 ÷ 8
= 234.00 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
234.00 ≤ 432.65 → FITS
headroom = 198.65 GB of weights budget left

// weights FP16/BF16 for Command R+ (104B params)
weights = params × bits ÷ 8
= 104 × 16 ÷ 8
= 208.00 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
208.00 ≤ 432.65 → FITS
headroom = 224.65 GB of weights budget left

// weights FP16/BF16 for Qwen 2.5 72B (72.7B params)
weights = params × bits ÷ 8
= 72.7 × 16 ÷ 8
= 145.40 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
145.40 ≤ 432.65 → FITS
headroom = 287.25 GB of weights budget left

// weights FP16/BF16 for Llama 3.3 70B (70.6B params)
weights = params × bits ÷ 8
= 70.6 × 16 ÷ 8
= 141.20 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
141.20 ≤ 432.65 → FITS
headroom = 291.45 GB of weights budget left

// weights FP16/BF16 for Mixtral 8x7B (46.7B params)
weights = params × bits ÷ 8
= 46.7 × 16 ÷ 8
= 93.40 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
93.40 ≤ 432.65 → FITS
headroom = 339.25 GB of weights budget left

// weights FP16/BF16 for Qwen 3.6 35B A3B (35B params)
weights = params × bits ÷ 8
= 35 × 16 ÷ 8
= 70.00 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
70.00 ≤ 432.65 → FITS
headroom = 362.65 GB of weights budget left

// weights FP16/BF16 for Yi 34B (34B params)
weights = params × bits ÷ 8
= 34 × 16 ÷ 8
= 68.00 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
68.00 ≤ 432.65 → FITS
headroom = 364.65 GB of weights budget left

// weights FP16/BF16 for Qwen 2.5 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 16 ÷ 8
= 65.00 GB
// budget on M3 Ultra 512 (512GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 76.80 GB (15% of 512GB)
budget = vram − safety − kv − overhead
= 512 − 76.80 − 0.05 − 2.50
= 432.65 GB
// fit decision
65.00 ≤ 432.65 → FITS
headroom = 367.65 GB of weights budget left

// Sign in with GitHub to leave a comment. Threads live in the repo's Discussions tab.