// weights Q3_K_M for StarCoder2 15B (15B params)
weights = params × bits ÷ 8
= 15 × 3.44 ÷ 8
= 6.45 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
6.45 ≤ 7.20 → FITS
headroom = 0.75 GB of weights budget left

// weights Q3_K_M for Phi-4 (14.7B params)
weights = params × bits ÷ 8
= 14.7 × 3.44 ÷ 8
= 6.32 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
6.32 ≤ 7.20 → FITS
headroom = 0.88 GB of weights budget left

// weights Q5_K_M for Qwen 3.5 9B (9B params)
weights = params × bits ÷ 8
= 9 × 5.5 ÷ 8
= 6.19 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
6.19 ≤ 7.20 → FITS
headroom = 1.01 GB of weights budget left

// weights Q5_K_M for Gemma 2 9B (9B params)
weights = params × bits ÷ 8
= 9 × 5.5 ÷ 8
= 6.19 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
6.19 ≤ 7.20 → FITS
headroom = 1.01 GB of weights budget left

// weights Q6_K for Llama 3.1 8B (8B params)
weights = params × bits ÷ 8
= 8 × 6.56 ÷ 8
= 6.56 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
6.56 ≤ 7.20 → FITS
headroom = 0.64 GB of weights budget left

// weights Q6_K for Granite 8B Code (8B params)
weights = params × bits ÷ 8
= 8 × 6.56 ÷ 8
= 6.56 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
6.56 ≤ 7.20 → FITS
headroom = 0.64 GB of weights budget left

// weights Q6_K for Mistral 7B v0.3 (7.2B params)
weights = params × bits ÷ 8
= 7.2 × 6.56 ÷ 8
= 5.90 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
5.90 ≤ 7.20 → FITS
headroom = 1.30 GB of weights budget left

// weights Q6_K for Qwen 2.5 7B (7B params)
weights = params × bits ÷ 8
= 7 × 6.56 ÷ 8
= 5.74 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
5.74 ≤ 7.20 → FITS
headroom = 1.46 GB of weights budget left

// weights Q8_0 for Gemma 4 E4B (4B params)
weights = params × bits ÷ 8
= 4 × 8.5 ÷ 8
= 4.25 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
4.25 ≤ 7.20 → FITS
headroom = 2.95 GB of weights budget left

// weights Q8_0 for Phi-4 Mini (3.8B params)
weights = params × bits ÷ 8
= 3.8 × 8.5 ÷ 8
= 4.04 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
4.04 ≤ 7.20 → FITS
headroom = 3.16 GB of weights budget left

// weights FP16/BF16 for Llama 3.2 3B (3.21B params)
weights = params × bits ÷ 8
= 3.21 × 16 ÷ 8
= 6.42 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
6.42 ≤ 7.20 → FITS
headroom = 0.78 GB of weights budget left

// weights FP16/BF16 for Llama 3.2 1B (1.23B params)
weights = params × bits ÷ 8
= 1.23 × 16 ÷ 8
= 2.46 GB
// budget on RTX 3080 (10GB) at ctx 8K, conc 1, 15% safety
kv_cache = 0.05 GB (1× at ctx 8K)
overhead = 1.25 GB (runtime, cuda, allocator)
safety = 1.50 GB (15% of 10GB)
budget = vram − safety − kv − overhead
= 10 − 1.50 − 0.05 − 1.25
= 7.20 GB
// fit decision
2.46 ≤ 7.20 → FITS
headroom = 4.74 GB of weights budget left

// Sign in with GitHub to leave a comment. Threads live in the repo's Discussions tab.