// weights AWQ 4-bit for StarCoder2 15B (15B params)
weights = params × bits ÷ 8
= 15 × 4.25 ÷ 8
= 7.97 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
7.97 ≤ 8.85 → FITS
headroom = 0.88 GB of weights budget left

// weights Q4_K_M for Phi-4 (14.7B params)
weights = params × bits ÷ 8
= 14.7 × 4.5 ÷ 8
= 8.27 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
8.27 ≤ 8.85 → FITS
headroom = 0.58 GB of weights budget left

// weights Q6_K for Qwen 3.5 9B (9B params)
weights = params × bits ÷ 8
= 9 × 6.56 ÷ 8
= 7.38 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
7.38 ≤ 8.85 → FITS
headroom = 1.47 GB of weights budget left

// weights Q6_K for Gemma 2 9B (9B params)
weights = params × bits ÷ 8
= 9 × 6.56 ÷ 8
= 7.38 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
7.38 ≤ 8.85 → FITS
headroom = 1.47 GB of weights budget left

// weights FP8/INT8 for Llama 3.1 8B (8B params)
weights = params × bits ÷ 8
= 8 × 8 ÷ 8
= 8.00 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
8.00 ≤ 8.85 → FITS
headroom = 0.85 GB of weights budget left

// weights FP8/INT8 for Granite 8B Code (8B params)
weights = params × bits ÷ 8
= 8 × 8 ÷ 8
= 8.00 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
8.00 ≤ 8.85 → FITS
headroom = 0.85 GB of weights budget left

// weights Q8_0 for Mistral 7B v0.3 (7.2B params)
weights = params × bits ÷ 8
= 7.2 × 8.5 ÷ 8
= 7.65 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
7.65 ≤ 8.85 → FITS
headroom = 1.20 GB of weights budget left

// weights Q8_0 for Qwen 2.5 7B (7B params)
weights = params × bits ÷ 8
= 7 × 8.5 ÷ 8
= 7.44 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
7.44 ≤ 8.85 → FITS
headroom = 1.41 GB of weights budget left

// weights FP16/BF16 for Gemma 4 E4B (4B params)
weights = params × bits ÷ 8
= 4 × 16 ÷ 8
= 8.00 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
8.00 ≤ 8.85 → FITS
headroom = 0.85 GB of weights budget left

// weights FP16/BF16 for Phi-4 Mini (3.8B params)
weights = params × bits ÷ 8
= 3.8 × 16 ÷ 8
= 7.60 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
7.60 ≤ 8.85 → FITS
headroom = 1.25 GB of weights budget left

// weights FP16/BF16 for Llama 3.2 3B (3.21B params)
weights = params × bits ÷ 8
= 3.21 × 16 ÷ 8
= 6.42 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
6.42 ≤ 8.85 → FITS
headroom = 2.43 GB of weights budget left

// weights FP16/BF16 for Llama 3.2 1B (1.23B params)
weights = params × bits ÷ 8
= 1.23 × 16 ÷ 8
= 2.46 GB
// budget on RTX 5070 (12GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.30 GB (runtime, cuda, allocator)
safety = 1.80 GB (15% of 12GB)
budget = vram − safety − kv − overhead
= 12 − 1.80 − 0.05 − 1.30
= 8.85 GB
// fit decision
2.46 ≤ 8.85 → FITS
headroom = 6.39 GB of weights budget left

// sign in with GitHub to leave a comment. threads live in the repo's discussions tab.