{ "models": [ { "name": "Qwen/Qwen3", "profiles": [ { "name": "thinking", "command": "./llama-cli --model bartowski/Qwen_Qwen3-8B-GGUF/Qwen_Qwen3-8B-Q8_0.gguf --jinja --temp 0.6 --top-k 20 --top-p 0.95 --min-p 0 --ctx-size 40960 --predict 32768", "references": [ "https://qwen.readthedocs.io/en/latest/run_locally/llama.cpp.html#llama-cli", "https://huggingface.co/Qwen/Qwen3-235B-A22B#switching-between-thinking-and-non-thinking-mode" ], "notes": [ "qwen team suggests to set the --presence-penalty parameter between 0 and 2 to reduce endless repetitions and adds that a higher value may occasionally result in language mixing and a slight decrease in model performance." ] }, { "name": "non thinking", "command": "./llama-cli --model bartowski/Qwen_Qwen3-8B-GGUF/Qwen_Qwen3-8B-Q8_0.gguf --jinja --temp 0.7 --top-k 20 --top-p 0.8 --min-p 0 --ctx-size 40960 --predict 32768", "references": [ "https://qwen.readthedocs.io/en/latest/run_locally/llama.cpp.html#llama-cli", "https://huggingface.co/Qwen/Qwen3-235B-A22B#switching-between-thinking-and-non-thinking-mode" ], "notes": [ "qwen team suggests to set the --presence-penalty parameter between 0 and 2 to reduce endless repetitions and adds that a higher value may occasionally result in language mixing and a slight decrease in model performance." ] } ] }, { "name": "Qwen/QwQ", "profiles": [ { "name": "default", "command": "./llama-cli --model unsloth-QwQ-32B-GGUF/QwQ-32B-Q8_0.gguf --ctx-size 32768 --temp 0.6 --min-p 0.01 --top-k 40 --top-p 0.95", "references": [ "https://modelscope.cn/models/Qwen/QwQ-32B", "https://huggingface.co/Qwen/QwQ-32B", "https://huggingface.co/unsloth/QwQ-32B-GGUF", "https://docs.unsloth.ai/basics/tutorials-how-to-fine-tune-and-run-llms/qwq-32b-how-to-run-effectively" ], "notes": [ "unsloth team is using following parameters in their examples --repeat-penalty 1.1 --dry-multiplier 0.5 and recommends adding --samplers \"top_k;top_p;min_p;temperature;dry;typ_p;xtc\" to stop infinite repetitions" ] } ] }, { "name": "meta-llama/Llama-4", "profiles": [ { "name": "default", "command": "./llama-cli --model unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/Llama-4-Scout-17B-16E-Instruct-Q8_0.gguf --ctx-size 10485760 --temp 0.6 --min-p 0.01 --top-p 0.9", "references": [ "https://www.llama.com/docs/llama-everywhere/running-meta-llama-on-linux/", "https://docs.unsloth.ai/basics/llama-4-how-to-run-and-fine-tune" ], "notes": [ "Use -ot \".ffn_.*_exps.=CPU\" to offload all MoE layers to the CPU! This effectively allows you to fit all non MoE layers on 1 GPU, improving generation speeds. You can customize the regex expression to fit more layers if you have more GPU capacity." ] } ] }, { "name": "google/gemma-3", "profiles": [ { "name": "default", "command": "./llama-cli --model bartowski/google_gemma-3-27b-it-qat-GGUF/gemma-3-27b-it-Q8_0.gguf --ctx-size 131072 --temp 1.0 --repeat-penalty 1.0 --min-p 0.01 --top-k 64 --top-p 0.95", "references": [ "https://ollama.com/library/gemma3/blobs/3116c5225075", "https://docs.unsloth.ai/basics/gemma-3-how-to-run-and-fine-tune" ], "notes": [ "recommended is a Min_P of 0.00 (optional, but 0.01 works well, llama.cpp acording to unsloth team." 
          ]
        }
      ]
    },
    {
      "name": "meta-llama/Llama-4",
      "profiles": [
        {
          "name": "default",
          "command": "./llama-cli --model unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF/Llama-4-Scout-17B-16E-Instruct-Q8_0.gguf --ctx-size 10485760 --temp 0.6 --min-p 0.01 --top-p 0.9",
          "references": [
            "https://www.llama.com/docs/llama-everywhere/running-meta-llama-on-linux/",
            "https://docs.unsloth.ai/basics/llama-4-how-to-run-and-fine-tune"
          ],
          "notes": [
            "Use -ot \".ffn_.*_exps.=CPU\" to offload all MoE layers to the CPU. This effectively allows you to fit all non-MoE layers on a single GPU, improving generation speed. You can customize the regular expression to offload more layers if you have more GPU capacity."
          ]
        }
      ]
    },
    {
      "name": "google/gemma-3",
      "profiles": [
        {
          "name": "default",
          "command": "./llama-cli --model bartowski/google_gemma-3-27b-it-qat-GGUF/gemma-3-27b-it-Q8_0.gguf --ctx-size 131072 --temp 1.0 --repeat-penalty 1.0 --min-p 0.01 --top-k 64 --top-p 0.95",
          "references": [
            "https://ollama.com/library/gemma3/blobs/3116c5225075",
            "https://docs.unsloth.ai/basics/gemma-3-how-to-run-and-fine-tune"
          ],
          "notes": [
            "According to the unsloth team, the recommended --min-p for llama.cpp is 0.00 (optional, but 0.01 works well)."
          ]
        }
      ]
    },
    {
      "name": "microsoft/Phi-4-reasoning",
      "profiles": [
        {
          "name": "default",
          "command": "./llama-cli --model bartowski/microsoft_Phi-4-reasoning-plus-GGUF/microsoft_Phi-4-reasoning-plus-Q8_0.gguf --ctx-size 32768 --temp 0.8 --top-k 50 --top-p 0.95 --reasoning-format deepseek",
          "references": [
            "https://huggingface.co/microsoft/Phi-4-reasoning"
          ],
          "notes": [
            "For more complex queries, set --predict to 32768 to allow for a longer chain of thought (CoT)."
          ]
        }
      ]
    },
    {
      "name": "mistralai/Mistral-Small",
      "profiles": [
        {
          "name": "default",
          "command": "./llama-cli --model bartowski/mistralai_Mistral-Small-3.1-24B-Instruct-2503-GGUF/mistralai_Mistral-Small-3.1-24B-Instruct-2503-Q8_0.gguf --ctx-size 32768 --temp 0.15",
          "references": [
            "https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503"
          ]
        }
      ]
    },
    {
      "name": "mistralai/Devstral-Small-2505",
      "profiles": [
        {
          "name": "default",
          "command": "./llama-cli --model bartowski/mistralai_Devstral-Small-2505-GGUF/mistralai_Devstral-Small-2505-Q8_0.gguf --ctx-size 131072 --temp 0.15",
          "references": [
            "https://huggingface.co/mistralai/Devstral-Small-2505"
          ]
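          ,
          "notes": [
            "The model path is an assumption based on bartowski's usual GGUF naming for Devstral-Small-2505; verify the exact repository and file name on Hugging Face before running."
          ]
        }
      ]
    }
  ]
}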