// weights Q8_0 for Mixtral 8x22B (141B params)
weights = params × bits ÷ 8
= 141 × 8.5 ÷ 8
= 149.81 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
149.81 ≤ 157.25 → FITS
headroom = 7.44 GB of weights budget left

// weights Q8_0 for gpt-oss 120B (117B params)
weights = params × bits ÷ 8
= 117 × 8.5 ÷ 8
= 124.31 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
124.31 ≤ 157.25 → FITS
headroom = 32.94 GB of weights budget left

// weights Q8_0 for Command R+ (104B params)
weights = params × bits ÷ 8
= 104 × 8.5 ÷ 8
= 110.50 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
110.50 ≤ 157.25 → FITS
headroom = 46.75 GB of weights budget left

// weights FP16/BF16 for Qwen 2.5 72B (72.7B params)
weights = params × bits ÷ 8
= 72.7 × 16 ÷ 8
= 145.40 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
145.40 ≤ 157.25 → FITS
headroom = 11.85 GB of weights budget left

// weights FP16/BF16 for Llama 3.3 70B (70.6B params)
weights = params × bits ÷ 8
= 70.6 × 16 ÷ 8
= 141.20 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
141.20 ≤ 157.25 → FITS
headroom = 16.05 GB of weights budget left

// weights FP16/BF16 for Mixtral 8x7B (46.7B params)
weights = params × bits ÷ 8
= 46.7 × 16 ÷ 8
= 93.40 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
93.40 ≤ 157.25 → FITS
headroom = 63.85 GB of weights budget left

// weights FP16/BF16 for Qwen 3.6 35B A3B (35B params)
weights = params × bits ÷ 8
= 35 × 16 ÷ 8
= 70.00 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
70.00 ≤ 157.25 → FITS
headroom = 87.25 GB of weights budget left

// weights FP16/BF16 for Yi 34B (34B params)
weights = params × bits ÷ 8
= 34 × 16 ÷ 8
= 68.00 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
68.00 ≤ 157.25 → FITS
headroom = 89.25 GB of weights budget left

// weights FP16/BF16 for Qwen 2.5 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 16 ÷ 8
= 65.00 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
65.00 ≤ 157.25 → FITS
headroom = 92.25 GB of weights budget left

// weights FP16/BF16 for Qwen 2.5 Coder 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 16 ÷ 8
= 65.00 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
65.00 ≤ 157.25 → FITS
headroom = 92.25 GB of weights budget left

// weights FP16/BF16 for Qwen3 30B A3B (30.5B params)
weights = params × bits ÷ 8
= 30.5 × 16 ÷ 8
= 61.00 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
61.00 ≤ 157.25 → FITS
headroom = 96.25 GB of weights budget left

// weights FP16/BF16 for Qwen 3.6 27B (27B params)
weights = params × bits ÷ 8
= 27 × 16 ÷ 8
= 54.00 GB
// budget on 2× H100 NVL (188GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.20 GB (15% of 188GB)
budget = vram − safety − kv − overhead
= 188 − 28.20 − 0.05 − 2.50
= 157.25 GB
// fit decision
54.00 ≤ 157.25 → FITS
headroom = 103.25 GB of weights budget left

// Sign in with GitHub to leave a comment. Threads live in the repo's Discussions tab.