Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,23 @@ subagents:
mode: parallel # off | sequential | parallel
```

### Large (MoE) models on limited VRAM + local "turbo cache"

Big Mixture-of-Experts coders (Qwen3-Coder-MoE, DeepSeek-MoE) don't have to fit entirely in VRAM — keep some layers on the GPU and the rest on the CPU via Ollama's `num_gpu`, which QodeX forwards verbatim:

```yaml
providers:
  ollama:
    keepAlive: 30m # keep the model + its KV cache warm between turns — the local "turbo cache"
    options:
      num_gpu: 14 # layers on GPU; the rest run on CPU (lower = fits a bigger model)
      num_ctx: 32768 # KV-cache size; QodeX already defaults this to the model's window
```

Not sure what `num_gpu` to use? `suggestGpuLayers({ modelSizeGB, vramBudgetGB, totalLayers })` (`src/llm/offload.ts`) turns a VRAM budget into a sensible value (e.g. a 48 GB MoE on a 12 GB GPU → keep ~14/64 layers on the GPU).

Two things keep local inference fast here: **`keepAlive`** (Ollama's `keep_alive`) keeps the model resident so there's no cold reload, and QodeX's **byte-stable prompt prefix** (the hierarchical cache work above) means the engine's **KV prefix cache hits** instead of re-prefilling the whole context every turn — the local counterpart to Anthropic prompt caching.

Cloud providers are opt-in. Web-search keys are read from the environment, never the config file:

```bash
Expand Down
14 changes: 10 additions & 4 deletions src/config/defaults.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,16 @@ export interface QodexConfig {
/** Ollama `keep_alive` — how long the model stays resident after a request.
* Longer avoids a cold reload (and full prefill) between turns. Default '30m'. */
keepAlive?: string;
/** Extra Ollama runtime `options` merged into every request (num_ctx, num_batch,
* num_gpu, …). `num_ctx` defaults to the routed model's context window so long
* sessions aren't silently truncated by the server's default 2k/4k window. */
options?: Record<string, number>;
/** Extra Ollama runtime `options` merged verbatim into every request. Numbers, strings,
* and bools all pass through, so any llama.cpp/Ollama runtime flag works — including the
* ones that matter for large MoE coders on limited VRAM:
* - `num_gpu`: layers to keep on the GPU (the rest run on CPU). Lower it to fit a big
* MoE model in VRAM. See src/llm/offload.ts `suggestGpuLayers` for a sensible value.
* - `num_ctx`: defaults to the routed model's context window so long sessions aren't
* silently truncated by the server's 2k/4k default. Bigger num_ctx ⇒ bigger KV cache.
* `keep_alive` (above) keeps the model + its KV cache warm between turns — the local
* "turbo cache" that, with QodeX's byte-stable prompt prefix, avoids a full re-prefill. */
options?: Record<string, number | string | boolean>;
/** Draft model for speculative decoding, if the local server supports it. Passed
* through verbatim; servers that don't read it ignore it. */
draftModel?: string;
Expand Down
53 changes: 53 additions & 0 deletions src/llm/offload.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/**
* Local-engine offloading helpers — run a model that's bigger than your VRAM by keeping some
* layers on the CPU. Especially relevant for large Mixture-of-Experts (MoE) coders (Qwen3-Coder
* MoE, DeepSeek-MoE …): the bulk of the weights are expert FFN layers, so offloading a slice of
* layers to system RAM lets a 30–100B MoE run on a 12–24 GB GPU at usable speed.
*
* QodeX already forwards `providers.ollama.options` verbatim, so `num_gpu` (the number of layers
* to keep on the GPU; the rest run on CPU) Just Works. These PURE helpers turn a VRAM budget +
* model facts into a sensible `num_gpu`, so a setup wizard or the docs can suggest one instead of
* the user guessing. No I/O, no hardware probing — caller supplies the numbers.
*/

/**
 * Caller-supplied facts that `suggestGpuLayers` turns into a GPU layer count.
 * Nothing here is probed from hardware — every number comes from the caller.
 */
export interface OffloadInputs {
  /**
   * On-disk size of the (quantized) weights, GB. `suggestGpuLayers` spreads this evenly over
   * the layers to estimate the per-layer cost.
   */
  modelSizeGB: number;
  /**
   * VRAM you're willing to give the model, GB (leave headroom for the desktop / other apps).
   * `reserveGB` is subtracted from this before any layers are placed.
   */
  vramBudgetGB: number;
  /**
   * Total transformer layers (blocks) in the model. `suggestGpuLayers` floors fractional
   * values and treats anything below 1 as 1.
   */
  totalLayers: number;
  /** VRAM reserved for the KV cache + activations + overhead, GB. Default 1.5. */
  reserveGB?: number;
}

/** Result of `suggestGpuLayers`: how many layers stay on the GPU and how that compares to the total. */
export interface OffloadPlan {
  /** Layers to keep on the GPU — feed as `options.num_gpu`. 0 = pure CPU; totalLayers = all-GPU. */
  numGpu: number;
  /**
   * Fraction of layers on the GPU (0–1) — a quick "how offloaded am I" read.
   * Computed as `numGpu` divided by the layer count.
   */
  gpuFraction: number;
  /** True when the whole model fits and no offloading is needed (`numGpu` covers every layer). */
  fitsFully: boolean;
}

/**
 * Suggest how many layers to keep on the GPU given a VRAM budget. PURE.
 *
 * The model's size is spread evenly over its layers, `reserveGB` is subtracted from the
 * budget, and as many whole layers as fit in what remains are kept on the GPU.
 *
 * The result is always a finite integer in [0, total], even for junk input:
 * - `totalLayers` is floored and treated as at least 1; a non-finite value (NaN / ±Infinity)
 *   also collapses to 1 so it can never leak into `numGpu`.
 * - A non-finite `reserveGB` falls back to the 1.5 GB default.
 * - A zero, negative, or non-finite per-layer size is degenerate (size unknown) → all layers
 *   on the GPU.
 * - A budget that cannot be evaluated (e.g. NaN `vramBudgetGB`) yields 0 GPU layers — the
 *   conservative choice when we cannot show that anything fits.
 *
 * @param inp Model size, VRAM budget, layer count and optional reserve, all caller-supplied.
 * @returns The layer count to pass as `options.num_gpu`, the GPU fraction, and a full-fit flag.
 */
export function suggestGpuLayers(inp: OffloadInputs): OffloadPlan {
  // Math.max(1, NaN) is NaN, so a non-finite layer count must be screened out first.
  const total = Number.isFinite(inp.totalLayers) ? Math.max(1, Math.floor(inp.totalLayers)) : 1;
  const requestedReserve = inp.reserveGB ?? 1.5;
  const reserve = Number.isFinite(requestedReserve) ? requestedReserve : 1.5;
  const perLayerGB = inp.modelSizeGB / total;
  if (perLayerGB <= 0 || !Number.isFinite(perLayerGB)) {
    return { numGpu: total, gpuFraction: 1, fitsFully: true };
  }
  const usable = inp.vramBudgetGB - reserve;
  const raw = Math.floor(usable / perLayerGB);
  // Math.min / Math.max propagate NaN rather than clamping it; ±Infinity clamps correctly.
  const numGpu = Number.isNaN(raw) ? 0 : Math.max(0, Math.min(total, raw));
  return { numGpu, gpuFraction: numGpu / total, fitsFully: numGpu >= total };
}

/**
 * Render an offload plan as a single human-readable sentence (for `qodex setup` / docs). PURE.
 *
 * Three outcomes, checked in order: the model fits entirely, some layers stay on the GPU,
 * or no layer fits and everything runs on the CPU.
 *
 * @param plan Plan to describe.
 * @param totalLayers Layer count shown in the message.
 * @returns One sentence naming the `num_gpu` value to use.
 */
export function describeOffload(plan: OffloadPlan, totalLayers: number): string {
  const { numGpu, gpuFraction, fitsFully } = plan;

  if (fitsFully) {
    return `Fits in VRAM — all ${totalLayers} layers on GPU (num_gpu: ${numGpu}).`;
  }

  if (numGpu !== 0) {
    // Whole-number percentage of layers that stay on the GPU.
    const percentOnGpu = Math.round(gpuFraction * 100);
    return `Offload: keep ${numGpu}/${totalLayers} layers (${percentOnGpu}%) on GPU, the rest on CPU — set options.num_gpu: ${numGpu}.`;
  }

  return `Too tight for GPU layers — running on CPU (num_gpu: 0). Expect slow generation.`;
}
5 changes: 3 additions & 2 deletions src/llm/providers/ollama.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ import { computeThroughput } from '../cache-layout.js';
export interface OllamaOptions {
/** `keep_alive` — how long to keep the model resident. Default '30m'. */
keepAlive?: string;
/** Extra runtime options merged into every request's `options` (num_ctx, num_batch, …). */
options?: Record<string, number>;
/** Extra runtime options merged into every request's `options` (num_ctx, num_batch, num_gpu,
* … — numbers, strings, or bools so any llama.cpp/Ollama runtime flag passes through). */
options?: Record<string, number | string | boolean>;
/** Draft model for speculative decoding, passed through if the server supports it. */
draftModel?: string;
/**
Expand Down
44 changes: 44 additions & 0 deletions test/offload.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import { describe, it, expect } from 'vitest';
import { suggestGpuLayers, describeOffload } from '../src/llm/offload.ts';

// Covers the layer-count suggestion: full fit, partial offload, CPU fallback, clamping, custom reserve.
describe('suggestGpuLayers — fit a big (MoE) model on limited VRAM', () => {
  it('keeps ALL layers on GPU when the model fits', () => {
    // An 8 GB model against a 24 GB budget across 32 layers leaves plenty of room.
    const { fitsFully, numGpu, gpuFraction } = suggestGpuLayers({
      modelSizeGB: 8,
      vramBudgetGB: 24,
      totalLayers: 32,
    });
    expect(fitsFully).toBe(true);
    expect(numGpu).toBe(32);
    expect(gpuFraction).toBe(1);
  });

  it('offloads a slice to CPU when the model is bigger than VRAM', () => {
    // 12 GB budget minus the default 1.5 GB reserve = 10.5 GB usable;
    // 48 GB over 64 layers = 0.75 GB per layer, so 14 layers fit.
    const plan = suggestGpuLayers({ modelSizeGB: 48, vramBudgetGB: 12, totalLayers: 64 });
    expect(plan.fitsFully).toBe(false);
    expect(plan.numGpu).toBe(14);
    expect(plan.gpuFraction).toBeCloseTo(14 / 64, 5);
  });

  it('falls back to CPU (num_gpu 0) when even one layer will not fit', () => {
    // Only 0.5 GB is usable while each layer needs 1 GB.
    const plan = suggestGpuLayers({ modelSizeGB: 80, vramBudgetGB: 2, totalLayers: 80 });
    expect(plan.numGpu).toBe(0);
    expect(plan.fitsFully).toBe(false);
  });

  it('clamps and never returns a negative or >total layer count', () => {
    const noBudget = suggestGpuLayers({ modelSizeGB: 1000, vramBudgetGB: 0, totalLayers: 40 });
    expect(noBudget.numGpu).toBe(0);

    // A zero-size model is degenerate and is reported as all-GPU.
    const zeroSize = suggestGpuLayers({ modelSizeGB: 0, vramBudgetGB: 24, totalLayers: 40 });
    expect(zeroSize.numGpu).toBe(40);
  });

  it('respects a custom reserve', () => {
    // 16 GB budget minus an 8 GB reserve = 8 GB usable at 1 GB per layer.
    const { numGpu } = suggestGpuLayers({
      modelSizeGB: 32,
      vramBudgetGB: 16,
      totalLayers: 32,
      reserveGB: 8,
    });
    expect(numGpu).toBe(8);
  });
});

// Checks the one-line summary for each of the three plan outcomes.
describe('describeOffload — human summary', () => {
  it('describes full-fit, partial offload, and cpu-only', () => {
    const fullFit = describeOffload({ numGpu: 32, gpuFraction: 1, fitsFully: true }, 32);
    expect(fullFit).toMatch(/Fits in VRAM/);

    const partialOffload = describeOffload({ numGpu: 14, gpuFraction: 14 / 64, fitsFully: false }, 64);
    expect(partialOffload).toMatch(/keep 14\/64 layers.*num_gpu: 14/);

    const cpuOnly = describeOffload({ numGpu: 0, gpuFraction: 0, fitsFully: false }, 80);
    expect(cpuOnly).toMatch(/CPU.*slow/);
  });
});