// weights Q3_K_M for Qwen 2.5 72B (72.7B params)
weights = params × bits ÷ 8
= 72.7 × 3.44 ÷ 8
= 31.26 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
31.26 ≤ 38.55 → FITS
headroom = 7.29 GB of weights budget left

// weights Q3_K_M for Llama 3.3 70B (70.6B params)
weights = params × bits ÷ 8
= 70.6 × 3.44 ÷ 8
= 30.36 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
30.36 ≤ 38.55 → FITS
headroom = 8.19 GB of weights budget left

// weights Q5_K_M for Mixtral 8x7B (46.7B params)
weights = params × bits ÷ 8
= 46.7 × 5.5 ÷ 8
= 32.11 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
32.11 ≤ 38.55 → FITS
headroom = 6.44 GB of weights budget left

// weights FP8/INT8 for Qwen 3.6 35B A3B (35B params)
weights = params × bits ÷ 8
= 35 × 8 ÷ 8
= 35.00 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
35.00 ≤ 38.55 → FITS
headroom = 3.55 GB of weights budget left

// weights Q8_0 for Yi 34B (34B params)
weights = params × bits ÷ 8
= 34 × 8.5 ÷ 8
= 36.13 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
36.13 ≤ 38.55 → FITS
headroom = 2.42 GB of weights budget left

// weights Q8_0 for Qwen 2.5 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 8.5 ÷ 8
= 34.53 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
34.53 ≤ 38.55 → FITS
headroom = 4.02 GB of weights budget left

// weights Q8_0 for Qwen 2.5 Coder 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 8.5 ÷ 8
= 34.53 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
34.53 ≤ 38.55 → FITS
headroom = 4.02 GB of weights budget left

// weights Q8_0 for Qwen3 30B A3B (30.5B params)
weights = params × bits ÷ 8
= 30.5 × 8.5 ÷ 8
= 32.41 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
32.41 ≤ 38.55 → FITS
headroom = 6.14 GB of weights budget left

// weights Q8_0 for Qwen 3.6 27B (27B params)
weights = params × bits ÷ 8
= 27 × 8.5 ÷ 8
= 28.69 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
28.69 ≤ 38.55 → FITS
headroom = 9.86 GB of weights budget left

// weights Q8_0 for Gemma 4 26B A4B (26B params)
weights = params × bits ÷ 8
= 26 × 8.5 ÷ 8
= 27.63 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
27.63 ≤ 38.55 → FITS
headroom = 10.92 GB of weights budget left

// weights Q8_0 for Mistral Small 3 (24B params)
weights = params × bits ÷ 8
= 24 × 8.5 ÷ 8
= 25.50 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
25.50 ≤ 38.55 → FITS
headroom = 13.05 GB of weights budget left

// weights Q8_0 for gpt-oss 20B (20.9B params)
weights = params × bits ÷ 8
= 20.9 × 8.5 ÷ 8
= 22.21 GB
// budget on RTX 6000 Ada (48GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 2.20 GB (runtime, cuda, allocator)
safety = 7.20 GB (15% of 48GB)
budget = vram − safety − kv − overhead
= 48 − 7.20 − 0.05 − 2.20
= 38.55 GB
// fit decision
22.21 ≤ 38.55 → FITS
headroom = 16.34 GB of weights budget left

// Sign in with GitHub to leave a comment. Threads live in the repo's Discussions tab.