// weights Q8_0 for Mixtral 8x22B (141B params)
weights = params × bits ÷ 8
= 141 × 8.5 ÷ 8
= 149.81 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
149.81 ≤ 160.65 → FITS
headroom = 10.84 GB of weights budget left

// weights Q8_0 for gpt-oss 120B (117B params)
weights = params × bits ÷ 8
= 117 × 8.5 ÷ 8
= 124.31 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
124.31 ≤ 160.65 → FITS
headroom = 36.34 GB of weights budget left

// weights Q8_0 for Command R+ (104B params)
weights = params × bits ÷ 8
= 104 × 8.5 ÷ 8
= 110.50 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
110.50 ≤ 160.65 → FITS
headroom = 50.15 GB of weights budget left

// weights FP16/BF16 for Qwen 2.5 72B (72.7B params)
weights = params × bits ÷ 8
= 72.7 × 16 ÷ 8
= 145.40 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
145.40 ≤ 160.65 → FITS
headroom = 15.25 GB of weights budget left

// weights FP16/BF16 for Llama 3.3 70B (70.6B params)
weights = params × bits ÷ 8
= 70.6 × 16 ÷ 8
= 141.20 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
141.20 ≤ 160.65 → FITS
headroom = 19.45 GB of weights budget left

// weights FP16/BF16 for Mixtral 8x7B (46.7B params)
weights = params × bits ÷ 8
= 46.7 × 16 ÷ 8
= 93.40 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
93.40 ≤ 160.65 → FITS
headroom = 67.25 GB of weights budget left

// weights FP16/BF16 for Qwen 3.6 35B A3B (35B params)
weights = params × bits ÷ 8
= 35 × 16 ÷ 8
= 70.00 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
70.00 ≤ 160.65 → FITS
headroom = 90.65 GB of weights budget left

// weights FP16/BF16 for Yi 34B (34B params)
weights = params × bits ÷ 8
= 34 × 16 ÷ 8
= 68.00 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
68.00 ≤ 160.65 → FITS
headroom = 92.65 GB of weights budget left

// weights FP16/BF16 for Qwen 2.5 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 16 ÷ 8
= 65.00 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
65.00 ≤ 160.65 → FITS
headroom = 95.65 GB of weights budget left

// weights FP16/BF16 for Qwen 2.5 Coder 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 16 ÷ 8
= 65.00 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
65.00 ≤ 160.65 → FITS
headroom = 95.65 GB of weights budget left

// weights FP16/BF16 for Qwen3 30B A3B (30.5B params)
weights = params × bits ÷ 8
= 30.5 × 16 ÷ 8
= 61.00 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
61.00 ≤ 160.65 → FITS
headroom = 99.65 GB of weights budget left

// weights FP16/BF16 for Qwen 3.6 27B (27B params)
weights = params × bits ÷ 8
= 27 × 16 ÷ 8
= 54.00 GB
// budget on B200 (192GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 28.80 GB (15% of 192GB)
budget = vram − safety − kv − overhead
= 192 − 28.80 − 0.05 − 2.50
= 160.65 GB
// fit decision
54.00 ≤ 160.65 → FITS
headroom = 106.65 GB of weights budget left

// Sign in with GitHub to leave a comment. Threads live in the repo's Discussions tab.