QodeXcli · QodeXcli · Jun 30, 2026 · Jun 30, 2026
diff --git a/README.md b/README.md
@@ -342,6 +342,23 @@ subagents:
   mode: parallel            # off | sequential | parallel
 ```
 
+### Large (MoE) models on limited VRAM + local "turbo cache"
+
+Big Mixture-of-Experts coders (Qwen3-Coder-MoE, DeepSeek-MoE) don't have to fit entirely in VRAM — keep some layers on the GPU and the rest on the CPU via Ollama's `num_gpu`, which QodeX forwards verbatim:
+
+```yaml
+providers:
+  ollama:
+    keepAlive: 30m            # keep the model + its KV cache warm between turns — the local "turbo cache"
+    options:
+      num_gpu: 14             # layers on GPU; the rest run on CPU (lower = fits a bigger model)
+      num_ctx: 32768          # context window in tokens (this sizes the KV cache); QodeX already defaults this to the model's window
+```
+
+Not sure what `num_gpu` to use? `suggestGpuLayers({ modelSizeGB, vramBudgetGB, totalLayers })` (`src/llm/offload.ts`) turns a VRAM budget into a sensible value (e.g. a 48 GB MoE on a 12 GB GPU → keep ~14/64 layers on the GPU).
+
+Two things make local inference fast here: **`keepAlive`** (Ollama's `keep_alive`) keeps the model resident so there's no cold reload, and QodeX's **byte-stable prompt prefix** (hierarchical cache work above) means the engine's **KV prefix cache hits** instead of re-prefilling the whole context every turn — the local counterpart to Anthropic prompt caching.
+
 Cloud providers are opt-in. Web-search keys are read from the environment, never the config file:
 
 ```bash

diff --git a/src/config/defaults.ts b/src/config/defaults.ts
@@ -89,10 +89,16 @@ export interface QodexConfig {
       /** Ollama `keep_alive` — how long the model stays resident after a request.
        *  Longer avoids a cold reload (and full prefill) between turns. Default '30m'. */
       keepAlive?: string;
-      /** Extra Ollama runtime `options` merged into every request (num_ctx, num_batch,
-       *  num_gpu, …). `num_ctx` defaults to the routed model's context window so long
-       *  sessions aren't silently truncated by the server's default 2k/4k window. */
-      options?: Record<string, number>;
+      /** Extra Ollama runtime `options` merged verbatim into every request. Numbers, strings,
+       *  and bools all pass through, so any llama.cpp/Ollama runtime flag works — including the
+       *  ones that matter for large MoE coders on limited VRAM:
+       *    - `num_gpu`: layers to keep on the GPU (the rest run on CPU). Lower it to fit a big
+       *      MoE model in VRAM. See src/llm/offload.ts `suggestGpuLayers` for a sensible value.
+       *    - `num_ctx`: defaults to the routed model's context window so long sessions aren't
+       *      silently truncated by the server's 2k/4k default. Bigger num_ctx ⇒ bigger KV cache.
+       *  `keep_alive` (above) keeps the model + its KV cache warm between turns — the local
+       *  "turbo cache" that, with QodeX's byte-stable prompt prefix, avoids a full re-prefill. */
+      options?: Record<string, number | string | boolean>;
       /** Draft model for speculative decoding, if the local server supports it. Passed
        *  through verbatim; servers that don't read it ignore it. */
       draftModel?: string;

diff --git a/src/llm/offload.ts b/src/llm/offload.ts
@@ -0,0 +1,53 @@
+/**
+ * Local-engine offloading helpers — run a model that's bigger than your VRAM by keeping some
+ * layers on the CPU. Especially relevant for large Mixture-of-Experts (MoE) coders (Qwen3-Coder
+ * MoE, DeepSeek-MoE …): the bulk of the weights are expert FFN layers, so offloading a slice of
+ * layers to system RAM lets a 30–100B MoE run on a 12–24 GB GPU at usable speed.
+ *
+ * QodeX already forwards `providers.ollama.options` verbatim, so `num_gpu` (the number of layers
+ * to keep on the GPU; the rest run on CPU) Just Works. These PURE helpers turn a VRAM budget +
+ * model facts into a sensible `num_gpu`, so a setup wizard or the docs can suggest one instead of
+ * the user guessing. No I/O, no hardware probing — caller supplies the numbers.
+ */
+
+export interface OffloadInputs {
+  /** On-disk size of the (quantized) weights, GB. Treated as spread evenly across all layers. */
+  modelSizeGB: number;
+  /** VRAM you're willing to give the model, GB (leave headroom for the desktop / other apps). */
+  vramBudgetGB: number;
+  /** Total transformer layers (blocks) in the model. Floored, and treated as at least 1. */
+  totalLayers: number;
+  /** VRAM reserved for the KV cache + activations + overhead, GB; subtracted from the budget. Default 1.5. */
+  reserveGB?: number;
+}
+
+export interface OffloadPlan {
+  /** Layers to keep on the GPU — feed as `options.num_gpu`. 0 = pure CPU; totalLayers = all-GPU. */
+  numGpu: number;
+  /** Fraction of layers on the GPU (0–1), i.e. numGpu / layer count — a quick "how offloaded am I" read. */
+  gpuFraction: number;
+  /** True when the whole model fits (numGpu reaches the layer count) and no offloading is needed. */
+  fitsFully: boolean;
+}
+
+/**
+ * Suggest how many layers to keep on the GPU given a VRAM budget. PURE. Result is clamped to
+ * [0, total] and is never NaN/Infinity: a bad budget/reserve ⇒ 0, a bad totalLayers ⇒ 1 layer.
+ */
+export function suggestGpuLayers(inp: OffloadInputs): OffloadPlan {
+  const total = Number.isFinite(inp.totalLayers) ? Math.max(1, Math.floor(inp.totalLayers)) : 1;
+  const perLayerGB = inp.modelSizeGB / total; // weights assumed evenly spread across layers
+  // Degenerate model size (0, negative, NaN): nothing to budget for, so report a full fit.
+  if (perLayerGB <= 0 || !Number.isFinite(perLayerGB)) return { numGpu: total, gpuFraction: 1, fitsFully: true };
+  const raw = Math.floor((inp.vramBudgetGB - (inp.reserveGB ?? 1.5)) / perLayerGB);
+  const numGpu = Number.isNaN(raw) ? 0 : Math.max(0, Math.min(total, raw)); // NaN budget ⇒ CPU-only
+  return { numGpu, gpuFraction: numGpu / total, fitsFully: numGpu >= total };
+}
+
+/** Render an offload plan as a single human-readable sentence (for `qodex setup` / docs). PURE. */
+export function describeOffload(plan: OffloadPlan, totalLayers: number): string {
+  const { numGpu, gpuFraction, fitsFully } = plan;
+  if (fitsFully) return `Fits in VRAM — all ${totalLayers} layers on GPU (num_gpu: ${numGpu}).`;
+  if (numGpu !== 0) return `Offload: keep ${numGpu}/${totalLayers} layers (${Math.round(gpuFraction * 100)}%) on GPU, the rest on CPU — set options.num_gpu: ${numGpu}.`;
+  return `Too tight for GPU layers — running on CPU (num_gpu: 0). Expect slow generation.`;
+}
diff --git a/src/llm/providers/ollama.ts b/src/llm/providers/ollama.ts
@@ -6,8 +6,9 @@ import { computeThroughput } from '../cache-layout.js';
 export interface OllamaOptions {
   /** `keep_alive` — how long to keep the model resident. Default '30m'. */
   keepAlive?: string;
-  /** Extra runtime options merged into every request's `options` (num_ctx, num_batch, …). */
-  options?: Record<string, number>;
+  /** Extra runtime options merged into every request's `options` (num_ctx, num_batch, num_gpu,
+   *  … — numbers, strings, or bools so any llama.cpp/Ollama runtime flag passes through). */
+  options?: Record<string, number | string | boolean>;
   /** Draft model for speculative decoding, passed through if the server supports it. */
   draftModel?: string;
   /**

diff --git a/test/offload.test.ts b/test/offload.test.ts
@@ -0,0 +1,44 @@
+import { describe, it, expect } from 'vitest';
+import { suggestGpuLayers, describeOffload } from '../src/llm/offload.ts';
+
+describe('suggestGpuLayers — fit a big (MoE) model on limited VRAM', () => {
+  it('keeps ALL layers on GPU when the model fits', () => {
+    // 8 GB model, 24 GB VRAM (1.5 reserve → 22.5 usable), 32 layers (0.25 GB/layer) → 90 raw, clamped to 32.
+    const p = suggestGpuLayers({ modelSizeGB: 8, vramBudgetGB: 24, totalLayers: 32 });
+    expect(p.fitsFully).toBe(true);
+    expect(p.numGpu).toBe(32);
+    expect(p.gpuFraction).toBe(1);
+  });
+
+  it('offloads a slice to CPU when the model is bigger than VRAM', () => {
+    // 48 GB MoE, 12 GB VRAM (1.5 reserve → 10.5 usable), 64 layers (0.75 GB/layer) → 14 layers.
+    const p = suggestGpuLayers({ modelSizeGB: 48, vramBudgetGB: 12, totalLayers: 64 });
+    expect(p.fitsFully).toBe(false);
+    expect(p.numGpu).toBe(14);
+    expect(p.gpuFraction).toBeCloseTo(14 / 64, 5);
+  });
+
+  it('falls back to CPU (num_gpu 0) when even one layer will not fit', () => {
+    const p = suggestGpuLayers({ modelSizeGB: 80, vramBudgetGB: 2, totalLayers: 80 }); // 0.5 usable, 1 GB/layer
+    expect(p.numGpu).toBe(0);
+    expect(p.fitsFully).toBe(false);
+  });
+
+  it('clamps and never returns a negative or >total layer count', () => {
+    expect(suggestGpuLayers({ modelSizeGB: 1000, vramBudgetGB: 0, totalLayers: 40 }).numGpu).toBe(0); // -1.5 usable → negative raw, clamped to 0
+    expect(suggestGpuLayers({ modelSizeGB: 0, vramBudgetGB: 24, totalLayers: 40 }).numGpu).toBe(40); // degenerate (0 GB/layer) → all GPU
+  });
+
+  it('respects a custom reserve', () => {
+    const tight = suggestGpuLayers({ modelSizeGB: 32, vramBudgetGB: 16, totalLayers: 32, reserveGB: 8 }); // 8 usable, 1 GB/layer
+    expect(tight.numGpu).toBe(8);
+  });
+});
+
+describe('describeOffload — human summary', () => {
+  it('describes full-fit, partial offload, and cpu-only', () => { // one assertion per describeOffload branch
+    expect(describeOffload({ numGpu: 32, gpuFraction: 1, fitsFully: true }, 32)).toMatch(/Fits in VRAM/);
+    expect(describeOffload({ numGpu: 14, gpuFraction: 14 / 64, fitsFully: false }, 64)).toMatch(/keep 14\/64 layers.*num_gpu: 14/);
+    expect(describeOffload({ numGpu: 0, gpuFraction: 0, fitsFully: false }, 80)).toMatch(/CPU.*slow/);
+  });
+});