diff --git a/README.md b/README.md index 45d7897..750dee1 100644 --- a/README.md +++ b/README.md @@ -342,6 +342,23 @@ subagents: mode: parallel # off | sequential | parallel ``` +### Large (MoE) models on limited VRAM + local "turbo cache" + +Big Mixture-of-Experts coders (Qwen3-Coder-MoE, DeepSeek-MoE) don't have to fit entirely in VRAM — keep some layers on the GPU and the rest on the CPU via Ollama's `num_gpu`, which QodeX forwards verbatim: + +```yaml +providers: + ollama: + keepAlive: 30m # keep the model + its KV cache warm between turns — the local "turbo cache" + options: + num_gpu: 14 # layers on GPU; the rest run on CPU (lower = fits a bigger model) + num_ctx: 32768 # KV-cache size; QodeX already defaults this to the model's window +``` + +Not sure what `num_gpu` to use? `suggestGpuLayers({ modelSizeGB, vramBudgetGB, totalLayers })` (`src/llm/offload.ts`) turns a VRAM budget into a sensible value (e.g. a 48 GB MoE on a 12 GB GPU → keep ~14/64 layers on the GPU). + +Two things make local fast here: **`keep_alive`** keeps the model resident so there's no cold reload, and QodeX's **byte-stable prompt prefix** (hierarchical cache work above) means the engine's **KV prefix cache hits** instead of re-prefilling the whole context every turn — the local counterpart to Anthropic prompt caching. + Cloud providers are opt-in. Web-search keys are read from the environment, never the config file: ```bash diff --git a/src/config/defaults.ts b/src/config/defaults.ts index d73751b..ffe31a6 100644 --- a/src/config/defaults.ts +++ b/src/config/defaults.ts @@ -89,10 +89,16 @@ export interface QodexConfig { /** Ollama `keep_alive` — how long the model stays resident after a request. * Longer avoids a cold reload (and full prefill) between turns. Default '30m'. */ keepAlive?: string; - /** Extra Ollama runtime `options` merged into every request (num_ctx, num_batch, - * num_gpu, …). 
`num_ctx` defaults to the routed model's context window so long - * sessions aren't silently truncated by the server's default 2k/4k window. */ - options?: Record; + /** Extra Ollama runtime `options` merged verbatim into every request. Numbers, strings, + * and bools all pass through, so any llama.cpp/Ollama runtime flag works — including the + * ones that matter for large MoE coders on limited VRAM: + * - `num_gpu`: layers to keep on the GPU (the rest run on CPU). Lower it to fit a big + * MoE model in VRAM. See src/llm/offload.ts `suggestGpuLayers` for a sensible value. + * - `num_ctx`: defaults to the routed model's context window so long sessions aren't + * silently truncated by the server's 2k/4k default. Bigger num_ctx ⇒ bigger KV cache. + * `keep_alive` (above) keeps the model + its KV cache warm between turns — the local + * "turbo cache" that, with QodeX's byte-stable prompt prefix, avoids a full re-prefill. */ + options?: Record; /** Draft model for speculative decoding, if the local server supports it. Passed * through verbatim; servers that don't read it ignore it. */ draftModel?: string; diff --git a/src/llm/offload.ts b/src/llm/offload.ts new file mode 100644 index 0000000..d482163 --- /dev/null +++ b/src/llm/offload.ts @@ -0,0 +1,53 @@ +/** + * Local-engine offloading helpers — run a model that's bigger than your VRAM by keeping some + * layers on the CPU. Especially relevant for large Mixture-of-Experts (MoE) coders (Qwen3-Coder + * MoE, DeepSeek-MoE …): the bulk of the weights are expert FFN layers, so offloading a slice of + * layers to system RAM lets a 30–100B MoE run on a 12–24 GB GPU at usable speed. + * + * QodeX already forwards `providers.ollama.options` verbatim, so `num_gpu` (the number of layers + * to keep on the GPU; the rest run on CPU) Just Works. These PURE helpers turn a VRAM budget + + * model facts into a sensible `num_gpu`, so a setup wizard or the docs can suggest one instead of + * the user guessing. 
/** Caller-supplied facts used to size a GPU/CPU layer split. No hardware is probed. */
export interface OffloadInputs {
  /** On-disk size of the (quantized) weights, GB. */
  modelSizeGB: number;
  /** VRAM you're willing to give the model, GB (leave headroom for the desktop / other apps). */
  vramBudgetGB: number;
  /** Total transformer layers (blocks) in the model. */
  totalLayers: number;
  /** VRAM reserved for the KV cache + activations + overhead, GB. Default 1.5. */
  reserveGB?: number;
}

/** Result of {@link suggestGpuLayers}: how many layers to keep on the GPU. */
export interface OffloadPlan {
  /** Layers to keep on the GPU — feed as `options.num_gpu`. 0 = pure CPU; totalLayers = all-GPU. */
  numGpu: number;
  /** Fraction of layers on the GPU (0–1) — a quick "how offloaded am I" read. */
  gpuFraction: number;
  /** True when the whole model fits and no offloading is needed. */
  fitsFully: boolean;
}

/**
 * Suggest how many layers to keep on the GPU given a VRAM budget. PURE.
 *
 * The model size is spread evenly across the layers, and as many whole layers as fit into
 * `vramBudgetGB - reserveGB` are kept on the GPU. The result is always a finite integer in
 * `[0, total]`, where `total` is `totalLayers` floored and raised to at least 1.
 *
 * Degenerate inputs:
 * - `totalLayers` below 1, `NaN`, or infinite is treated as a single layer.
 * - A per-layer size that is zero, negative, or not finite (e.g. `modelSizeGB` of 0) yields
 *   an all-GPU plan.
 * - A budget that cannot be computed (`NaN` `vramBudgetGB` / `reserveGB`) yields a CPU-only
 *   plan instead of a `NaN` layer count.
 *
 * @param inp Model size, VRAM budget, layer count, and optional reserve.
 * @returns The suggested `num_gpu`, the GPU fraction, and whether the model fits fully.
 */
export function suggestGpuLayers(inp: OffloadInputs): OffloadPlan {
  // Math.max(1, NaN) is NaN, so a non-finite count must be handled before clamping.
  const total = Number.isFinite(inp.totalLayers) ? Math.max(1, Math.floor(inp.totalLayers)) : 1;
  const reserve = inp.reserveGB ?? 1.5;
  const perLayerGB = inp.modelSizeGB / total;
  const usable = inp.vramBudgetGB - reserve;
  if (perLayerGB <= 0 || !Number.isFinite(perLayerGB)) {
    return { numGpu: total, gpuFraction: 1, fitsFully: true };
  }
  const raw = Math.floor(usable / perLayerGB);
  // Math.min/Math.max propagate NaN; an unknown budget falls back to CPU-only.
  const numGpu = Number.isNaN(raw) ? 0 : Math.max(0, Math.min(total, raw));
  return { numGpu, gpuFraction: numGpu / total, fitsFully: numGpu >= total };
}

/**
 * One-line, human-readable summary of an offload plan (for `qodex setup` / docs). PURE.
 *
 * @param plan Plan to describe, typically produced by {@link suggestGpuLayers}.
 * @param totalLayers Layer count shown in the message; it is not re-validated against `plan`.
 * @returns A sentence for the full-fit, CPU-only, or partial-offload case.
 */
export function describeOffload(plan: OffloadPlan, totalLayers: number): string {
  if (plan.fitsFully) {
    return `Fits in VRAM — all ${totalLayers} layers on GPU (num_gpu: ${plan.numGpu}).`;
  }
  if (plan.numGpu === 0) {
    return `Too tight for GPU layers — running on CPU (num_gpu: 0). Expect slow generation.`;
  }
  const pct = Math.round(plan.gpuFraction * 100);
  return `Offload: keep ${plan.numGpu}/${totalLayers} layers (${pct}%) on GPU, the rest on CPU — set options.num_gpu: ${plan.numGpu}.`;
}
+ const p = suggestGpuLayers({ modelSizeGB: 48, vramBudgetGB: 12, totalLayers: 64 }); + expect(p.fitsFully).toBe(false); + expect(p.numGpu).toBe(14); + expect(p.gpuFraction).toBeCloseTo(14 / 64, 5); + }); + + it('falls back to CPU (num_gpu 0) when even one layer will not fit', () => { + const p = suggestGpuLayers({ modelSizeGB: 80, vramBudgetGB: 2, totalLayers: 80 }); // 0.5 usable, 1 GB/layer + expect(p.numGpu).toBe(0); + expect(p.fitsFully).toBe(false); + }); + + it('clamps and never returns a negative or >total layer count', () => { + expect(suggestGpuLayers({ modelSizeGB: 1000, vramBudgetGB: 0, totalLayers: 40 }).numGpu).toBe(0); + expect(suggestGpuLayers({ modelSizeGB: 0, vramBudgetGB: 24, totalLayers: 40 }).numGpu).toBe(40); // degenerate → all GPU + }); + + it('respects a custom reserve', () => { + const tight = suggestGpuLayers({ modelSizeGB: 32, vramBudgetGB: 16, totalLayers: 32, reserveGB: 8 }); // 8 usable, 1/layer + expect(tight.numGpu).toBe(8); + }); +}); + +describe('describeOffload — human summary', () => { + it('describes full-fit, partial offload, and cpu-only', () => { + expect(describeOffload({ numGpu: 32, gpuFraction: 1, fitsFully: true }, 32)).toMatch(/Fits in VRAM/); + expect(describeOffload({ numGpu: 14, gpuFraction: 14 / 64, fitsFully: false }, 64)).toMatch(/keep 14\/64 layers.*num_gpu: 14/); + expect(describeOffload({ numGpu: 0, gpuFraction: 0, fitsFully: false }, 80)).toMatch(/CPU.*slow/); + }); +});