-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrun_server.ps1
More file actions
124 lines (105 loc) · 4.88 KB
/
run_server.ps1
File metadata and controls
124 lines (105 loc) · 4.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Launch llama.cpp server with OpenAI-compatible API
# Edit LLAMA_EXE and MODEL_PATH before running.
#
# Configuration section. Keep exactly one $LLAMA_EXE and one $MODEL_PATH/$NGL
# pair uncommented; the commented-out lines are alternative presets.
#
# Standard llama.cpp b9118 -- has SWA hybrid attention cache fix (no turbo KV types)
$LLAMA_EXE = "C:\llama_cpp\llama-server.exe"
# Turboquant v0.1.1 -- has turbo4/turbo3 KV types but NOT the SWA fix
# $LLAMA_EXE = "C:\llama_cpp_turbo\llama-server.exe"
#
# Model presets. $MODEL_PATH is passed as --model and $NGL as --n-gpu-layers.
# Qwen3.6-35B-A3B MoE IQ3_XXS (12.8 GB) -- fastest gen (46 tok/s), best EvalPlus (92.7%) [RECOMMENDED]
$MODEL_PATH = "C:\llama_cpp\models\Qwen3.6-35B-A3B-UD-IQ3_XXS.gguf"
$NGL = 999
# Qwen3.6-35B-A3B MoE IQ4_XS (17.7 GB) -- slightly lower quality, slower (32 tok/s)
# $MODEL_PATH = "C:\llama_cpp\models\Qwen3.6-35B-A3B-UD-IQ4_XS.gguf"
# $NGL = 999
# Qwen3.5-122B-A10B MoE IQ2_XXS (36.6 GB) -- 10B active params; experts in RAM, backbone on GPU
# $MODEL_PATH = "C:\llama_cpp\models\Qwen3.5-122B-A10B-UD-IQ2_XXS.gguf"
# $NGL = 999
# Llama 4 Scout MoE IQ2_XXS (37.4 GB) -- 17B active params; experts in RAM, backbone on GPU
# Requires turboquant binary to support Llama 4 architecture.
# $MODEL_PATH = "C:\llama_cpp\models\Llama-4-Scout-17B-16E-Instruct-UD-IQ2_XXS.gguf"
# $NGL = 999
# Gemma 4 26B-A4B MoE IQ4_XS (13.6 GB) -- needs --jinja for tool calling
# $MODEL_PATH = "C:\llama_cpp\models\gemma-4-26B-A4B-it-UD-IQ4_XS.gguf"
# $NGL = 999
#
# Context size in tokens (passed as --ctx-size).
# 131072 (128K): n-cpu-moe=24, VRAM=9,190 MB peak [default, verified under 86K load]
# 262144 (256K): n-cpu-moe=31, VRAM=9,184 MB peak
# NOTE: --n-cpu-moe is hard-coded in the argument list further down; when you
# change $CTX here, update that value by hand as well.
$CTX = 131072
# Number of parallel request slots (1 is fine for single-user dev); passed as --parallel.
$PARALLEL = 1
# Extra command-line flags appended to the server arguments, as one
# space-separated string. Leave empty for none.
# Add --jinja when running Gemma 4 (required for its tool-calling chat template).
# $EXTRA_ARGS = "--jinja"
$EXTRA_ARGS = ""
# --- Log file config ---
# The server's stderr (which is where llama-server writes request/token/timing
# info) is written to $LOG_PATH and also shown in the terminal, so you can
# review it later.
# At startup the log is rotated if it exceeds $MAX_LOG_MB: the old file is kept
# as $LOG_OLD (a single backup; any previous backup is overwritten).
$LOG_PATH = Join-Path $PSScriptRoot "logs\llama-server-live.log"
# Derived from $LOG_PATH so the backup always stays next to the live log, even
# if the path above is edited.
$LOG_OLD = "$LOG_PATH.old"
$MAX_LOG_MB = 50
# Pre-flight checks: abort with exit code 1 when the server binary or the model
# file is missing.
#   -PathType Leaf  rejects a directory that happens to have the configured name
#   -LiteralPath    stops characters such as [ and ] in a path from being
#                   interpreted as wildcards
if (-not (Test-Path -LiteralPath $LLAMA_EXE -PathType Leaf)) {
    Write-Error "llama-server.exe not found at: $LLAMA_EXE"
    Write-Host "Download from: https://github.com/ggerganov/llama.cpp/releases"
    exit 1
}
if (-not (Test-Path -LiteralPath $MODEL_PATH -PathType Leaf)) {
    Write-Error "Model not found at: $MODEL_PATH"
    exit 1
}
# Ensure the log directory exists, then rotate the live log to the single
# .old backup when it has outgrown the configured size limit.
$logDir = Split-Path $LOG_PATH
$null = New-Item -ItemType Directory -Force -Path $logDir

# -and short-circuits, so the size is only read when the log file exists.
$needsRotation = (Test-Path $LOG_PATH) -and
    (((Get-Item $LOG_PATH).Length / 1MB) -gt $MAX_LOG_MB)

if ($needsRotation) {
    Write-Host "Log exceeded ${MAX_LOG_MB}MB -- rotating to .old"
    # -Force replaces any backup left over from an earlier rotation.
    Move-Item -Force $LOG_PATH $LOG_OLD
}
# Append a banner to the log that marks the start of this session and records
# the model and context size in use.
$sessionStart = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'
$banner = @"
================================================================================
llama-server session started: $sessionStart
Model: $MODEL_PATH
Context: $CTX tokens
================================================================================
"@
$banner | Add-Content -Path $LOG_PATH -Encoding UTF8
# Print a short startup summary to the terminal (one Write-Host per line,
# ending with a blank spacer line).
$summaryLines = @(
    "Starting llama-server on http://localhost:8081",
    " Model: $MODEL_PATH",
    " GPU layers: $NGL",
    " Context: $CTX",
    " Log: $LOG_PATH (rotates at ${MAX_LOG_MB}MB)",
    ""
)
foreach ($summaryLine in $summaryLines) {
    Write-Host $summaryLine
}
# Command-line arguments for llama-server, one array element per token.
# NOTE(review): "--host 0.0.0.0" presumably makes the server listen on every
# network interface, not just localhost as the startup message suggests --
# switch to 127.0.0.1 if only local access is intended.
$cmd = @(
    "--model", $MODEL_PATH,
    "--n-gpu-layers", $NGL,
    "--ctx-size", $CTX,
    "--parallel", $PARALLEL,
    "--port", "8081",
    "--host", "0.0.0.0",
    "-ctk", "q8_0", # standard llama.cpp: use q8_0 (turbo4/3 only in turboquant fork)
    "-ctv", "q8_0",
    "--no-mmap",
    "--mlock",
    "--n-cpu-moe", "24", # IQ3_XXS 128K: verified 2026-05-14, 9,190 MB peak under 86K load
    # Use 31 for 256K context (verified 2026-05-14, 9,184 MB peak)
    "--reasoning", "auto", # auto-detect from template; proxy injects /no_think on normal
    # turns and lets model think on error/planning turns
    "-b", "8192" # larger batch = faster prefill on long prompts
    # Test b=16384 with bench/test_batch_compact.py (use n-cpu-moe=26)
)
# Append the optional extra flags. Split on runs of whitespace and drop empty
# tokens: a doubled, leading or trailing space would otherwise add empty-string
# arguments, which Start-Process -ArgumentList refuses to accept.
if ($EXTRA_ARGS) {
    $cmd += @($EXTRA_ARGS -split '\s+' | Where-Object { $_ })
}
# Launch llama-server and stream its stderr to both the terminal and the log.
#
# Why not Start-Process -RedirectStandardError + Get-Content -Wait:
#   * Start-Process recreates the redirect target, so pointing it at $LOG_PATH
#     discards everything already in the log on every launch.
#   * Get-Content -Wait only hands control to the loop body when a new line
#     arrives, so a HasExited check inside it never runs once the server has
#     stopped writing -- the script would hang after the server exits.
#   * -ArgumentList joins tokens with spaces without quoting, which breaks
#     paths that contain spaces.
# Reading the redirected stderr pipe directly avoids all three: lines are
# appended to the log, and the loop ends when the server closes its stderr.

# Build the Windows command line: quote any token that contains whitespace and
# skip empty tokens.
$argLine = ($cmd | Where-Object { "$_" -ne "" } | ForEach-Object {
    $token = "$_"
    if ($token -match '\s') { '"{0}"' -f $token } else { $token }
}) -join ' '

$startInfo = New-Object System.Diagnostics.ProcessStartInfo
$startInfo.FileName = $LLAMA_EXE
$startInfo.Arguments = $argLine
$startInfo.UseShellExecute = $false          # required for stream redirection
$startInfo.RedirectStandardError = $true
$startInfo.CreateNoWindow = $false           # share this console; stdout stays visible
# Assumes llama-server emits UTF-8 on stderr -- TODO confirm for non-ASCII output.
$startInfo.StandardErrorEncoding = [System.Text.Encoding]::UTF8
# Use PowerShell's current location as the working directory.
$startInfo.WorkingDirectory = (Get-Location -PSProvider FileSystem).ProviderPath

try {
    $proc = [System.Diagnostics.Process]::Start($startInfo)
}
catch {
    Write-Error "Failed to start llama-server: $($_.Exception.Message)"
    exit 1
}

Write-Host "PID $($proc.Id) -- tailing log (Ctrl+C to stop)..."
Write-Host ""

# Open the log in append mode (UTF-8, no BOM) so existing content is preserved.
$utf8NoBom = New-Object System.Text.UTF8Encoding($false)
$logWriter = New-Object System.IO.StreamWriter -ArgumentList $LOG_PATH, $true, $utf8NoBom
$logWriter.AutoFlush = $true   # keep the file current for anyone reading it live
try {
    # ReadLine() returns $null at end-of-stream, i.e. when the server exits.
    while ($null -ne ($line = $proc.StandardError.ReadLine())) {
        Write-Host $line
        $logWriter.WriteLine($line)
    }
    $proc.WaitForExit()
    Write-Host "llama-server exited with code $($proc.ExitCode)"
}
finally {
    $logWriter.Dispose()
    # On Ctrl+C the server shares this console and should receive the signal
    # too; give it a moment to shut down before forcing it.
    if (-not $proc.HasExited) {
        if (-not $proc.WaitForExit(5000)) { $proc.Kill() }
    }
}