// weights Q4_K_M for Qwen 3.5 9B (9B params)
weights = params × bits ÷ 8
= 9 × 4.5 ÷ 8
= 5.06 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
5.06 ≤ 5.55 → FITS
headroom = 0.49 GB of weights budget left
// weights Q4_K_M for Gemma 2 9B (9B params)
weights = params × bits ÷ 8
= 9 × 4.5 ÷ 8
= 5.06 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
5.06 ≤ 5.55 → FITS
headroom = 0.49 GB of weights budget left
// weights Q4_K_M for Llama 3.1 8B (8B params)
weights = params × bits ÷ 8
= 8 × 4.5 ÷ 8
= 4.50 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
4.50 ≤ 5.55 → FITS
headroom = 1.05 GB of weights budget left
// weights Q4_K_M for Granite 8B Code (8B params)
weights = params × bits ÷ 8
= 8 × 4.5 ÷ 8
= 4.50 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
4.50 ≤ 5.55 → FITS
headroom = 1.05 GB of weights budget left
// weights Q5_K_M for Mistral 7B v0.3 (7.2B params)
weights = params × bits ÷ 8
= 7.2 × 5.5 ÷ 8
= 4.95 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
4.95 ≤ 5.55 → FITS
headroom = 0.60 GB of weights budget left
// weights Q5_K_M for Qwen 2.5 7B (7B params)
weights = params × bits ÷ 8
= 7 × 5.5 ÷ 8
= 4.81 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
4.81 ≤ 5.55 → FITS
headroom = 0.74 GB of weights budget left
// weights Q8_0 for Gemma 4 E4B (4B params)
weights = params × bits ÷ 8
= 4 × 8.5 ÷ 8
= 4.25 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
4.25 ≤ 5.55 → FITS
headroom = 1.30 GB of weights budget left
// weights Q8_0 for Phi-4 Mini (3.8B params)
weights = params × bits ÷ 8
= 3.8 × 8.5 ÷ 8
= 4.04 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
4.04 ≤ 5.55 → FITS
headroom = 1.51 GB of weights budget left
// weights Q8_0 for Llama 3.2 3B (3.21B params)
weights = params × bits ÷ 8
= 3.21 × 8.5 ÷ 8
= 3.41 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
3.41 ≤ 5.55 → FITS
headroom = 2.14 GB of weights budget left
// weights FP16/BF16 for Llama 3.2 1B (1.23B params)
weights = params × bits ÷ 8
= 1.23 × 16 ÷ 8
= 2.46 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
2.46 ≤ 5.55 → FITS
headroom = 3.09 GB of weights budget left
// weights Q4_K_M for Phi-4 (14.7B params)
weights = params × bits ÷ 8
= 14.7 × 4.5 ÷ 8
= 8.27 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
8.27 > 5.55 → OVER
overflow = 2.72 GB over budget
// weights Q4_K_M for StarCoder2 15B (15B params)
weights = params × bits ÷ 8
= 15 × 4.5 ÷ 8
= 8.44 GB
// budget on RTX 3070 (8GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.20 GB (runtime, cuda, allocator)
safety = 1.20 GB (15% of 8GB)
budget = vram − safety − kv − overhead
= 8 − 1.20 − 0.05 − 1.20
= 5.55 GB
// fit decision
8.44 > 5.55 → OVER
overflow = 2.89 GB over budget
// sign in with github to leave a comment. threads live in the repo's discussions tab.