// weights Q3_K_M for Qwen 3.6 27B (27B params)
weights = params × bits ÷ 8
= 27 × 3.44 ÷ 8
= 11.61 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
11.61 ≤ 12.15 → FITS
headroom = 0.54 GB of weights budget left
// weights Q3_K_M for Gemma 4 26B A4B (26B params)
weights = params × bits ÷ 8
= 26 × 3.44 ÷ 8
= 11.18 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
11.18 ≤ 12.15 → FITS
headroom = 0.97 GB of weights budget left
// weights Q3_K_M for Mistral Small 3 (24B params)
weights = params × bits ÷ 8
= 24 × 3.44 ÷ 8
= 10.32 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
10.32 ≤ 12.15 → FITS
headroom = 1.83 GB of weights budget left
// weights Q4_K_M for gpt-oss 20B (20.9B params)
weights = params × bits ÷ 8
= 20.9 × 4.5 ÷ 8
= 11.76 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
11.76 ≤ 12.15 → FITS
headroom = 0.39 GB of weights budget left
// weights Q5_K_M for StarCoder2 15B (15B params)
weights = params × bits ÷ 8
= 15 × 5.5 ÷ 8
= 10.31 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
10.31 ≤ 12.15 → FITS
headroom = 1.84 GB of weights budget left
// weights Q5_K_M for Phi-4 (14.7B params)
weights = params × bits ÷ 8
= 14.7 × 5.5 ÷ 8
= 10.11 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
10.11 ≤ 12.15 → FITS
headroom = 2.04 GB of weights budget left
// weights Q8_0 for Qwen 3.5 9B (9B params)
weights = params × bits ÷ 8
= 9 × 8.5 ÷ 8
= 9.56 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
9.56 ≤ 12.15 → FITS
headroom = 2.59 GB of weights budget left
// weights Q8_0 for Gemma 2 9B (9B params)
weights = params × bits ÷ 8
= 9 × 8.5 ÷ 8
= 9.56 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
9.56 ≤ 12.15 → FITS
headroom = 2.59 GB of weights budget left
// weights Q8_0 for Llama 3.1 8B (8B params)
weights = params × bits ÷ 8
= 8 × 8.5 ÷ 8
= 8.50 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
8.50 ≤ 12.15 → FITS
headroom = 3.65 GB of weights budget left
// weights Q8_0 for Granite 8B Code (8B params)
weights = params × bits ÷ 8
= 8 × 8.5 ÷ 8
= 8.50 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
8.50 ≤ 12.15 → FITS
headroom = 3.65 GB of weights budget left
// weights Q8_0 for Mistral 7B v0.3 (7.2B params)
weights = params × bits ÷ 8
= 7.2 × 8.5 ÷ 8
= 7.65 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
7.65 ≤ 12.15 → FITS
headroom = 4.50 GB of weights budget left
// weights Q8_0 for Qwen 2.5 7B (7B params)
weights = params × bits ÷ 8
= 7 × 8.5 ÷ 8
= 7.44 GB
// budget on RTX 4070 Ti Super (16GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.40 GB (runtime, cuda, allocator)
safety = 2.40 GB (15% of 16GB)
budget = vram − safety − kv − overhead
= 16 − 2.40 − 0.05 − 1.40
= 12.15 GB
// fit decision
7.44 ≤ 12.15 → FITS
headroom = 4.71 GB of weights budget left
// sign in with github to leave a comment. threads live in the repo's discussions tab.