// weights Q3_K_M for Qwen 3.6 35B A3B (35B params)
weights = params × bits ÷ 8
= 35 × 3.44 ÷ 8
= 15.05 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
15.05 ≤ 18.75 → FITS
headroom = 3.70 GB of weights budget left

// weights Q3_K_M for Yi 34B (34B params)
weights = params × bits ÷ 8
= 34 × 3.44 ÷ 8
= 14.62 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
14.62 ≤ 18.75 → FITS
headroom = 4.13 GB of weights budget left

// weights AWQ 4-bit for Qwen 2.5 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 4.25 ÷ 8
= 17.27 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
17.27 ≤ 18.75 → FITS
headroom = 1.48 GB of weights budget left

// weights AWQ 4-bit for Qwen 2.5 Coder 32B (32.5B params)
weights = params × bits ÷ 8
= 32.5 × 4.25 ÷ 8
= 17.27 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
17.27 ≤ 18.75 → FITS
headroom = 1.48 GB of weights budget left

// weights Q4_K_M for Qwen3 30B A3B (30.5B params)
weights = params × bits ÷ 8
= 30.5 × 4.5 ÷ 8
= 17.16 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
17.16 ≤ 18.75 → FITS
headroom = 1.59 GB of weights budget left

// weights Q4_K_M for Qwen 3.6 27B (27B params)
weights = params × bits ÷ 8
= 27 × 4.5 ÷ 8
= 15.19 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
15.19 ≤ 18.75 → FITS
headroom = 3.56 GB of weights budget left

// weights Q5_K_M for Gemma 4 26B A4B (26B params)
weights = params × bits ÷ 8
= 26 × 5.5 ÷ 8
= 17.88 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
17.88 ≤ 18.75 → FITS
headroom = 0.87 GB of weights budget left

// weights Q5_K_M for Mistral Small 3 (24B params)
weights = params × bits ÷ 8
= 24 × 5.5 ÷ 8
= 16.50 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
16.50 ≤ 18.75 → FITS
headroom = 2.25 GB of weights budget left

// weights Q6_K for gpt-oss 20B (20.9B params)
weights = params × bits ÷ 8
= 20.9 × 6.56 ÷ 8
= 17.14 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
17.14 ≤ 18.75 → FITS
headroom = 1.61 GB of weights budget left

// weights Q8_0 for StarCoder2 15B (15B params)
weights = params × bits ÷ 8
= 15 × 8.5 ÷ 8
= 15.94 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
15.94 ≤ 18.75 → FITS
headroom = 2.81 GB of weights budget left

// weights Q8_0 for Phi-4 (14.7B params)
weights = params × bits ÷ 8
= 14.7 × 8.5 ÷ 8
= 15.62 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
15.62 ≤ 18.75 → FITS
headroom = 3.13 GB of weights budget left

// weights FP16/BF16 for Qwen 3.5 9B (9B params)
weights = params × bits ÷ 8
= 9 × 16 ÷ 8
= 18.00 GB
// budget on RTX 3090 Ti (24GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.60 GB (runtime, cuda, allocator)
safety = 3.60 GB (15% of 24GB)
budget = vram − safety − kv − overhead
= 24 − 3.60 − 0.05 − 1.60
= 18.75 GB
// fit decision
18.00 ≤ 18.75 → FITS
headroom = 0.75 GB of weights budget left

// sign in with github to leave a comment. threads live in the repo's discussions tab.