// weights Q5_K_M for Mixtral 8x22B (141B params)
weights = params × bits ÷ 8
= 141 × 5.5 ÷ 8
= 96.94 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
96.94 ≤ 117.30 → FITS
headroom = 20.36 GB of weights budget left

// weights Q6_K for gpt-oss 120B (117B params)
weights = params × bits ÷ 8
= 117 × 6.56 ÷ 8
= 95.94 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
95.94 ≤ 117.30 → FITS
headroom = 21.36 GB of weights budget left

// weights Q8_0 for Command R+ (104B params)
weights = params × bits ÷ 8
= 104 × 8.5 ÷ 8
= 110.50 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
110.50 ≤ 117.30 → FITS
headroom = 6.80 GB of weights budget left

// weights Q8_0 for Qwen 2.5 72B (72.7B params)
weights = params × bits ÷ 8
= 72.7 × 8.5 ÷ 8
= 77.24 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
77.24 ≤ 117.30 → FITS
headroom = 40.05 GB of weights budget left

// weights Q8_0 for Llama 3.3 70B (70.6B params)
weights = params × bits ÷ 8
= 70.6 × 8.5 ÷ 8
= 75.01 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
75.01 ≤ 117.30 → FITS
headroom = 42.29 GB of weights budget left

// weights FP16/BF16 for Mixtral 8x7B (46.7B params)
weights = params × bits ÷ 8
= 46.7 × 16 ÷ 8
= 93.40 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
93.40 ≤ 117.30 → FITS
headroom = 23.90 GB of weights budget left

// weights FP16/BF16 for Qwen 3.6 35B A3B (35B params)
weights = params × bits ÷ 8
= 35 × 16 ÷ 8
= 70.00 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
70.00 ≤ 117.30 → FITS
headroom = 47.30 GB of weights budget left

// weights FP16/BF16 for Yi 34B (34B params)
weights = params × bits ÷ 8
= 34 × 16 ÷ 8
= 68.00 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
68.00 ≤ 117.30 → FITS
headroom = 49.30 GB of weights budget left

// weights FP16/BF16 for Qwen 2.5 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 16 ÷ 8
= 65.00 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
65.00 ≤ 117.30 → FITS
headroom = 52.30 GB of weights budget left

// weights FP16/BF16 for Qwen 2.5 Coder 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 16 ÷ 8
= 65.00 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
65.00 ≤ 117.30 → FITS
headroom = 52.30 GB of weights budget left

// weights FP16/BF16 for Qwen3 30B A3B (30.5B params)
weights = params × bits ÷ 8
= 30.5 × 16 ÷ 8
= 61.00 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
61.00 ≤ 117.30 → FITS
headroom = 56.30 GB of weights budget left

// weights FP16/BF16 for Qwen 3.6 27B (27B params)
weights = params × bits ÷ 8
= 27 × 16 ÷ 8
= 54.00 GB
// budget on H200 (141GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 21.15 GB (15% of 141GB)
budget = vram − safety − kv − overhead
= 141 − 21.15 − 0.05 − 2.50
= 117.30 GB
// fit decision
54.00 ≤ 117.30 → FITS
headroom = 63.30 GB of weights budget left

// Sign in with GitHub to leave a comment. Threads live in the repo's Discussions tab.