// weights Q3_K_M for Mixtral 8x22B (141B params)
weights = params × bits ÷ 8
= 141 × 3.44 ÷ 8
= 60.63 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
60.63 ≤ 65.45 → FITS
headroom = 4.82 GB of weights budget left

// weights AWQ 4-bit for gpt-oss 120B (117B params)
weights = params × bits ÷ 8
= 117 × 4.25 ÷ 8
= 62.16 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
62.16 ≤ 65.45 → FITS
headroom = 3.29 GB of weights budget left

// weights Q4_K_M for Command R+ (104B params)
weights = params × bits ÷ 8
= 104 × 4.5 ÷ 8
= 58.50 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
58.50 ≤ 65.45 → FITS
headroom = 6.95 GB of weights budget left

// weights Q6_K for Qwen 2.5 72B (72.7B params)
weights = params × bits ÷ 8
= 72.7 × 6.56 ÷ 8
= 59.61 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
59.61 ≤ 65.45 → FITS
headroom = 5.83 GB of weights budget left

// weights Q6_K for Llama 3.3 70B (70.6B params)
weights = params × bits ÷ 8
= 70.6 × 6.56 ÷ 8
= 57.89 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
57.89 ≤ 65.45 → FITS
headroom = 7.56 GB of weights budget left

// weights Q8_0 for Mixtral 8x7B (46.7B params)
weights = params × bits ÷ 8
= 46.7 × 8.5 ÷ 8
= 49.62 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
49.62 ≤ 65.45 → FITS
headroom = 15.83 GB of weights budget left

// weights Q8_0 for Qwen 3.6 35B A3B (35B params)
weights = params × bits ÷ 8
= 35 × 8.5 ÷ 8
= 37.19 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
37.19 ≤ 65.45 → FITS
headroom = 28.26 GB of weights budget left

// weights Q8_0 for Yi 34B (34B params)
weights = params × bits ÷ 8
= 34 × 8.5 ÷ 8
= 36.13 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
36.13 ≤ 65.45 → FITS
headroom = 29.32 GB of weights budget left

// weights Q8_0 for Qwen 2.5 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 8.5 ÷ 8
= 34.53 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
34.53 ≤ 65.45 → FITS
headroom = 30.92 GB of weights budget left

// weights Q8_0 for Qwen 2.5 Coder 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 8.5 ÷ 8
= 34.53 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
34.53 ≤ 65.45 → FITS
headroom = 30.92 GB of weights budget left

// weights FP16/BF16 for Qwen3 30B A3B (30.5B params)
weights = params × bits ÷ 8
= 30.5 × 16 ÷ 8
= 61.00 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
61.00 ≤ 65.45 → FITS
headroom = 4.45 GB of weights budget left

// weights FP16/BF16 for Qwen 3.6 27B (27B params)
weights = params × bits ÷ 8
= 27 × 16 ÷ 8
= 54.00 GB
// budget on H100 80GB (80GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.50 GB (runtime, cuda, allocator)
safety = 12.00 GB (15% of 80GB)
budget = vram − safety − kv − overhead
= 80 − 12.00 − 0.05 − 2.50
= 65.45 GB
// fit decision
54.00 ≤ 65.45 → FITS
headroom = 11.45 GB of weights budget left

// Sign in with GitHub to leave a comment. Threads live in the repo's Discussions tab.