Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
85 commits
Select commit Hold shift + click to select a range
b41e58e
refactor(auto-routing): move classifier core into contracts package
iscekic Jun 11, 2026
1fb85f5
feat(auto-routing): add tier, routing-table, decision and benchmark c…
iscekic Jun 11, 2026
39acfdb
feat(auto-routing): add benchmark-driven decision engine and KV routi…
iscekic Jun 11, 2026
bd83fdc
feat(auto-routing): return routing decisions from /decide
iscekic Jun 11, 2026
9621d62
fix(auto-routing): log unparseable routing table JSON before falling …
iscekic Jun 11, 2026
7af1b6d
feat(auto-routing-benchmark): scaffold benchmark worker with D1 schema
iscekic Jun 11, 2026
22de713
feat(auto-routing-benchmark): classifier golden dataset and grading
iscekic Jun 11, 2026
878e49b
style(auto-routing-benchmark): apply oxfmt formatting
iscekic Jun 11, 2026
662717c
feat(auto-routing-benchmark): decider golden dataset with determinist…
iscekic Jun 11, 2026
110cbd9
fix(auto-routing-benchmark): unambiguous whitespace instruction in of…
iscekic Jun 11, 2026
5ce8621
feat(auto-routing-benchmark): queue-driven benchmark runs with aggreg…
iscekic Jun 11, 2026
0c763ce
feat(auto-routing-benchmark): admin config, runs and routing-table en…
iscekic Jun 11, 2026
c749be2
feat(admin): proxy routes for auto-routing benchmark service
iscekic Jun 11, 2026
0e34c02
feat(admin): benchmark config, runs and routing table panel
iscekic Jun 11, 2026
fb084c3
fix(admin): stabilize benchmark runs polling interval dependencies
iscekic Jun 11, 2026
9f2d876
feat(web): internal token mint endpoint for auto-routing benchmark
iscekic Jun 11, 2026
7a31d4a
feat(auto-routing-benchmark): run decider cases through kilo CLI in a…
iscekic Jun 11, 2026
d0f13b0
feat(admin): benchmark user id config field
iscekic Jun 11, 2026
fdc6520
feat(gateway): add kilo-auto/efficient with blocking auto-routing dec…
iscekic Jun 11, 2026
813ea0e
chore(auto-routing): drop unused import in routing-table contracts
iscekic Jun 11, 2026
9b69edf
fix(auto-routing-benchmark): harden decider CLI parsing, grading and …
iscekic Jun 11, 2026
5ff4b08
fix(auto-routing-benchmark): warm up CLI container before concurrent …
iscekic Jun 11, 2026
06836cc
fix(auto-routing-benchmark): faster container turnover to avoid insta…
iscekic Jun 11, 2026
2faee13
fix(auto-routing-benchmark): address review findings
iscekic Jun 11, 2026
cac57b7
style(auto-routing-benchmark): format wrangler.jsonc
iscekic Jun 11, 2026
ccc9c9d
fix(auto-routing-benchmark): guard against double finish on spawn fai…
iscekic Jun 11, 2026
ba3b3be
fix(auto-routing): break contracts module cycle and keep response sch…
iscekic Jun 11, 2026
6776db0
chore(admin): drop unused import after schema move
iscekic Jun 11, 2026
c0320c7
feat(auto-routing): classifier model becomes an admin override over t…
iscekic Jun 11, 2026
7bb5048
feat(auto-routing): manual benchmark runs, classifier override, decid…
iscekic Jun 12, 2026
f3c0128
refactor(auto-routing): simplification pass
iscekic Jun 12, 2026
641f6ef
refactor(auto-routing-benchmark): use drizzle for all D1 access
iscekic Jun 12, 2026
2d2691f
refactor(auto-routing-benchmark): normalize D1 schema and adopt drizz…
iscekic Jun 12, 2026
86e2fdc
fix(auto-routing-benchmark): preserve null candidate cost and type dr…
iscekic Jun 12, 2026
0241d47
refactor(auto-routing-benchmark): make candidate cost non-null to mat…
iscekic Jun 12, 2026
8244676
feat(auto-routing): read-through KV cache backed by the benchmark ser…
iscekic Jun 12, 2026
36f32a7
fix(auto-routing): await read-through cache writes and surface origin…
iscekic Jun 12, 2026
aa14657
ci(workers): run worker predeploy scripts (D1 migrations) before deploy
iscekic Jun 12, 2026
82aef0b
fix(auto-routing-benchmark): reuse loaded run state in finalize and b…
iscekic Jun 12, 2026
4a7478b
refactor(auto-routing): share ttl cache, single-source schemas and dr…
iscekic Jun 12, 2026
a449c26
docs(gateway): drop stale keep-in-sync comment on DecideBaseParams
iscekic Jun 12, 2026
4caa4f8
feat(gateway): bill classifier cost to the user for kilo-auto/efficient
iscekic Jun 12, 2026
ec5dc3f
fix(gateway): fix type error and remove dead guard in classifier billing
iscekic Jun 12, 2026
0141b71
fix(auto-routing): apply decision reasoningEffort to efficient routing
iscekic Jun 12, 2026
6960e1a
feat(auto-routing): align kilo-auto/efficient catalog with balanced, …
iscekic Jun 12, 2026
debdd03
fix(admin): correct run-summaries colspan in benchmarks section
iscekic Jun 12, 2026
a016310
feat(admin): derive decider model API kinds from gateway provider def…
iscekic Jun 12, 2026
fc427e5
feat(auto-routing): drop default routing table; no table means no dec…
iscekic Jun 12, 2026
01e4bd9
fix(auto-routing): keep classifier override when benchmark origin is …
iscekic Jun 12, 2026
0828e47
docs(contracts): fix stale classifier-winner comment
iscekic Jun 12, 2026
71222ca
fix(benchmark): exclude no-cost-signal summaries from routing table r…
iscekic Jun 12, 2026
6f5fd38
test(benchmark): fix expected ranking order in no-cost-signal test
iscekic Jun 12, 2026
2cd53f9
feat(benchmark): remove fabricated default config; runs require a sav…
iscekic Jun 12, 2026
354054d
chore(benchmark): drop redundant case_results index, regenerate basel…
iscekic Jun 12, 2026
6aba145
docs(benchmark): fix stale KV comment in wrangler config
iscekic Jun 12, 2026
8955269
feat(auto-routing-benchmark): grade subtaskType and riskLevel, expand…
iscekic Jun 12, 2026
ae707f3
feat(auto-routing-benchmark): expand decider dataset to per-pair taxo…
iscekic Jun 12, 2026
adb49f5
feat(auto-routing): session-sticky decisions with switch-cost factor
iscekic Jun 12, 2026
a24dc4d
feat(auto-routing-benchmark): plumb switchCostFactor through config, …
iscekic Jun 12, 2026
1d424c5
Merge remote-tracking branch 'origin/main' into feat/auto-routing-eff…
iscekic Jun 12, 2026
3d50441
fix(ai-gateway): align efficient fallback with Qwen-for-all-APIs afte…
iscekic Jun 12, 2026
d922d92
refactor(auto-routing): drop per-candidate API-kind plumbing, validat…
iscekic Jun 12, 2026
427dcc2
fix(auto-routing): review-pass fixes
iscekic Jun 12, 2026
053373b
test(ai-gateway): add sticky field to decision fixture
iscekic Jun 12, 2026
b8a5892
feat(dev): move auto-routing workers into their own opt-in dev group
iscekic Jun 12, 2026
2f39419
fix(auto-routing): make the decider benchmark runnable in local dev
iscekic Jun 12, 2026
ae0cec5
fix(auto-routing): kill the whole CLI process tree on decider case ti…
iscekic Jun 12, 2026
4f04e0a
feat(auto-routing): benchmark repetitions, p95 latency, and classifie…
iscekic Jun 12, 2026
1eae06f
fix(auto-routing): correct case_results migration backfill and close …
iscekic Jun 13, 2026
7151256
feat(admin): benchmark repetitions, latency budget, and p95/timeout c…
iscekic Jun 13, 2026
17a8c01
fix(admin): correct runs-table colSpan and cover config form round-trip
iscekic Jun 13, 2026
1a5d858
chore(auto-routing): squash benchmark D1 migrations into one baseline
iscekic Jun 13, 2026
c9db589
Merge remote-tracking branch 'origin/main' into feat/auto-routing-eff…
iscekic Jun 13, 2026
9eaae60
test(ai-gateway): stop depending on removed morph model in API-kind t…
iscekic Jun 13, 2026
165240b
fix(auto-routing-benchmark): return 400 when starting a run without c…
iscekic Jun 15, 2026
0844c48
fix(auto-routing-benchmark): slice queue fan-out under sendBatch limit
iscekic Jun 15, 2026
0d9d6d2
fix(ai-gateway): suppress first-usage events for classifier overhead row
iscekic Jun 15, 2026
f00b619
fix(ai-gateway): bill classifier cost regardless of final-provider BYOK
iscekic Jun 15, 2026
a8d0cd7
fix(ai-gateway): make efficient classifier spend authenticated + exit…
iscekic Jun 15, 2026
188dfe7
fix(auto-routing): reject duplicate benchmark model ids at validation
iscekic Jun 15, 2026
4c0a18d
fix(auto-routing): reject model-experiment ids as decider candidates
iscekic Jun 15, 2026
75c762a
fix(auto-routing-benchmark): invalidate carried summaries on identity…
iscekic Jun 15, 2026
3166eff
fix(auto-routing-benchmark): one active run per kind + stale recovery
iscekic Jun 15, 2026
b19c57a
fix(auto-routing): harden benchmarks admin panel (a11y, overflow, dir…
iscekic Jun 15, 2026
637d695
docs(auto-routing): add ADR and benchmark service README
iscekic Jun 15, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/deploy-workers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ jobs:
with:
apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
workingDirectory: ${{ inputs.worker }}
# Workers that define a `predeploy` script (e.g. D1 migrations) run it
# right before deploy; all other workers are unaffected.
preCommands: |
if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi
command: deploy

detect-changes:
Expand Down Expand Up @@ -150,4 +154,8 @@ jobs:
with:
apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
workingDirectory: ${{ matrix.worker }}
# Workers that define a `predeploy` script (e.g. D1 migrations) run it
# right before deploy; all other workers are unaffected.
preCommands: |
if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi
command: deploy
3 changes: 3 additions & 0 deletions apps/web/.env.development.local.example
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ AUTO_TRIAGE_URL=http://localhost:8791
# @url auto-routing
AUTO_ROUTING_WORKER_URL=http://localhost:8810

# @url auto-routing-benchmark
AUTO_ROUTING_BENCHMARK_WORKER_URL=http://localhost:8814

# @url cloudflare-security-sync
SECURITY_SYNC_WORKER_URL=http://localhost:8812

Expand Down
158 changes: 158 additions & 0 deletions apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import { NextRequest } from 'next/server';
import type { User } from '@kilocode/db';
import {
getBenchmarkConfig,
updateBenchmarkConfig,
} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
import { getUserFromAuth } from '@/lib/user/server';
import { findExperimentReservedModelIds } from '@/lib/ai-gateway/experiments/reserved-ids';
import type { KiloExclusiveModel } from '@/lib/ai-gateway/providers/kilo-exclusive-model';
import type * as ModelsModule from '@/lib/ai-gateway/models';

// Auth helper is replaced with a bare mock; its resolved value is set in beforeEach.
jest.mock('@/lib/user/server', () => ({
getUserFromAuth: jest.fn(),
}));

// Both admin-client functions imported by the route are replaced with bare mocks
// so tests can program their results and assert on the arguments they receive.
jest.mock('@/lib/ai-gateway/auto-routing-benchmark-admin-client', () => ({
getBenchmarkConfig: jest.fn(),
updateBenchmarkConfig: jest.fn(),
}));

// Reserved-id lookup is mocked; tests choose which ids it reports as reserved.
jest.mock('@/lib/ai-gateway/experiments/reserved-ids', () => ({
findExperimentReservedModelIds: jest.fn(),
}));

// Stub the catalog so tests don't depend on any specific provider file.
// 'test-exclusive/alibaba-only' maps to the alibaba gateway (chat_completions only).
// Everything else in the module is passed through from the real implementation;
// only findKiloExclusiveModel is wrapped, and only the stub id is intercepted.
jest.mock('@/lib/ai-gateway/models', () => {
const actual = jest.requireActual<typeof ModelsModule>('@/lib/ai-gateway/models');
const stubModel: KiloExclusiveModel = {
public_id: 'test-exclusive/alibaba-only',
display_name: 'Test Alibaba-only',
description: 'stub for unit tests',
context_length: 8192,
max_completion_tokens: 4096,
status: 'public',
flags: [],
gateway: 'alibaba',
internal_id: 'stub-internal',
pricing: null,
exclusive_to: [],
inference_provider_restriction: [],
};
return {
...actual,
// Any id other than the stub falls back to the real catalog lookup.
findKiloExclusiveModel: (id: string) =>
id === 'test-exclusive/alibaba-only' ? stubModel : actual.findKiloExclusiveModel(id),
};
});

import { PUT } from './route';

// Typed handles on the mocked imports, used to program return values and assert calls.
const mockGetUserFromAuth = jest.mocked(getUserFromAuth);
const mockGetBenchmarkConfig = jest.mocked(getBenchmarkConfig);
const mockUpdateBenchmarkConfig = jest.mocked(updateBenchmarkConfig);
const mockFindExperimentReservedModelIds = jest.mocked(findExperimentReservedModelIds);

/**
 * Builds the admin user handed back by the mocked auth helper.
 *
 * Test-fixture boundary: only `id` and `google_user_email` are populated,
 * then the partial object is widened to `User`.
 */
function adminUserFixture(): User {
  const populatedFields = {
    id: 'admin_123',
    google_user_email: 'admin@kilocode.ai',
  } as Partial<User>;
  return populatedFields as User;
}

/**
 * Wraps `body` as the JSON payload of a PUT request addressed to the
 * benchmark-config admin route.
 */
function putRequest(body: unknown) {
  const url = 'http://localhost:3000/admin/api/auto-routing/benchmark-config';
  const init = {
    method: 'PUT',
    body: JSON.stringify(body),
    headers: { 'content-type': 'application/json' },
  };
  return new NextRequest(url, init);
}

// Baseline payload for the suite. The happy-path test sends it unchanged and
// expects a 200; rejection tests spread it and override `deciderModels`.
// It is also the config the mocked updateBenchmarkConfig resolves with.
const validConfig = {
classifierModels: ['google/gemini-2.5-flash-lite'],
deciderModels: [{ id: 'openai/gpt-5-mini', reasoningEffort: null }],
minAccuracy: 0.7,
switchCostFactor: 3,
maxConcurrency: 4,
benchmarkUserId: null,
classifierRepetitions: 1,
deciderRepetitions: 1,
classifierMaxP95LatencyMs: 1000,
updatedAt: null,
updatedBy: null,
};

describe('PUT /admin/api/auto-routing/benchmark-config', () => {
  // Decider entry also present in validConfig; rejection cases pair it with a bad one.
  const acceptedDecider = { id: 'openai/gpt-5-mini', reasoningEffort: null };

  // Issues the PUT with validConfig's decider list swapped for `deciderModels`.
  const putWithDeciders = (deciderModels: { id: string; reasoningEffort: null }[]) =>
    PUT(putRequest({ ...validConfig, deciderModels }));

  // Pulls the `error` string out of a JSON error response.
  const readError = async (res: { json(): Promise<unknown> }) => {
    const payload = (await res.json()) as { error: string };
    return payload.error;
  };

  // Default world for every test: authenticated admin, nothing reserved,
  // and the admin client echoing validConfig back with a 200.
  beforeEach(() => {
    jest.clearAllMocks();
    mockFindExperimentReservedModelIds.mockResolvedValue([]);
    mockGetBenchmarkConfig.mockResolvedValue({ status: 200, body: { config: null } });
    mockUpdateBenchmarkConfig.mockResolvedValue({
      status: 200,
      body: { config: validConfig },
    });
    mockGetUserFromAuth.mockResolvedValue({
      user: adminUserFixture(),
      authFailedResponse: null,
    });
  });

  it('forwards a config whose decider models all serve every gateway chat API', async () => {
    const res = await PUT(putRequest(validConfig));

    expect(res.status).toBe(200);
    expect(mockUpdateBenchmarkConfig).toHaveBeenCalledWith(validConfig, 'admin@kilocode.ai');
  });

  it('rejects with 400 listing decider models not servable on all gateway chat APIs', async () => {
    const res = await putWithDeciders([
      acceptedDecider,
      { id: 'test-exclusive/alibaba-only', reasoningEffort: null },
    ]);

    expect(res.status).toBe(400);
    const message = await readError(res);
    expect(message).toContain('test-exclusive/alibaba-only');
    expect(message).toContain('chat_completions');
    // The servable model must not appear as an offender ("id (supports: ...)").
    expect(message).not.toContain('openai/gpt-5-mini (');
    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
  });

  it('rejects decider models reserved by a model experiment (any status)', async () => {
    // Per .specs/model-experiments.md, ownership does not depend on status: a
    // public id with a draft/active/paused/completed experiment is reserved
    // for explicit user selection and must stay out of kilo-auto candidate sets.
    mockFindExperimentReservedModelIds.mockResolvedValue(['preview/experimental-model']);

    const res = await putWithDeciders([
      acceptedDecider,
      { id: 'preview/experimental-model', reasoningEffort: null },
    ]);

    expect(res.status).toBe(400);
    const message = await readError(res);
    expect(message).toContain('preview/experimental-model');
    expect(message).toContain('model-experiment');
    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
    // The reservation lookup receives exactly the decider model ids.
    expect(mockFindExperimentReservedModelIds).toHaveBeenCalledWith([
      'openai/gpt-5-mini',
      'preview/experimental-model',
    ]);
  });

  it('rejects a schema-invalid config with 400', async () => {
    const res = await PUT(putRequest({ classifierModels: 'oops' }));

    expect(res.status).toBe(400);
    expect(await res.json()).toEqual({ error: 'Invalid benchmark config' });
    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
  });
});
74 changes: 74 additions & 0 deletions apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import { BenchmarkConfigSchema } from '@kilocode/auto-routing-contracts';
import type { NextRequest } from 'next/server';
import { NextResponse } from 'next/server';
import {
getBenchmarkConfig,
updateBenchmarkConfig,
} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
import {
gatewayChatApisForModel,
modelServesAllGatewayChatApis,
} from '@/lib/ai-gateway/model-api-kinds';
import { findExperimentReservedModelIds } from '@/lib/ai-gateway/experiments/reserved-ids';
import { getUserFromAuth } from '@/lib/user/server';

/**
 * Admin-only read of the benchmark config.
 * Returns the auth failure response when the caller is not an admin; otherwise
 * relays the admin client's body and status as-is.
 */
export async function GET() {
  const auth = await getUserFromAuth({ adminOnly: true });
  if (auth.authFailedResponse) return auth.authFailedResponse;

  const upstream = await getBenchmarkConfig();
  return NextResponse.json(upstream.body, { status: upstream.status });
}

/**
 * Admin-only replacement of the benchmark config.
 *
 * Responds 400 when the body is not JSON, fails `BenchmarkConfigSchema`, names
 * a decider model reserved by a model experiment, or names a decider model
 * that is not servable on every gateway chat API. Otherwise passes the parsed
 * config and the admin's email to `updateBenchmarkConfig` and relays its body
 * and status.
 */
export async function PUT(request: NextRequest) {
  const { authFailedResponse, user } = await getUserFromAuth({ adminOnly: true });
  if (authFailedResponse) return authFailedResponse;

  let rawBody: unknown;
  try {
    rawBody = await request.json();
  } catch {
    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
  }

  const parsed = BenchmarkConfigSchema.safeParse(rawBody);
  if (!parsed.success) {
    return NextResponse.json({ error: 'Invalid benchmark config' }, { status: 400 });
  }

  // Model-experiment public ids are dedicated preview ids that users must
  // explicitly select; per .specs/model-experiments.md they must never enter
  // kilo-auto candidate sets, so they can't be saved as decider candidates
  // (the routing table feeds kilo-auto/efficient automatic selection). Checked
  // across all experiment statuses — ownership, not just routing membership.
  const deciderModelIds = parsed.data.deciderModels.map(m => m.id);
  const reservedExperimentIds = await findExperimentReservedModelIds(deciderModelIds);
  if (reservedExperimentIds.length > 0) {
    return NextResponse.json(
      {
        error: `Decider models must not be model-experiment public ids (reserved for explicit user selection): ${reservedExperimentIds.join(', ')}`,
      },
      { status: 400 }
    );
  }

  // Routing-table candidates carry no per-protocol metadata, so every decider
  // model must be servable on ALL gateway chat API kinds by the provider the
  // gateway would route it to. Each offender is reported with the API kinds it
  // does support ("none" when the list is empty).
  const unsupported = deciderModelIds
    .filter(id => !modelServesAllGatewayChatApis(id))
    .map(id => `${id} (supports: ${gatewayChatApisForModel(id).join(', ') || 'none'})`);
  if (unsupported.length > 0) {
    return NextResponse.json(
      {
        error: `Decider models must support all gateway chat APIs (chat_completions, responses, messages): ${unsupported.join('; ')}`,
      },
      { status: 400 }
    );
  }

  // Falls back to an empty string when no email is available on the user.
  const email = user?.google_user_email ?? '';
  const result = await updateBenchmarkConfig(parsed.data, email);
  return NextResponse.json(result.body, { status: result.status });
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { NextResponse } from 'next/server';
import { getBenchmarkRoutingTable } from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
import { getUserFromAuth } from '@/lib/user/server';

/**
 * Admin-only read of the benchmark routing table.
 * Non-admin callers get the auth failure response; otherwise the admin
 * client's body and status are returned unchanged.
 */
export async function GET() {
  const { authFailedResponse: denied } = await getUserFromAuth({ adminOnly: true });
  if (denied) {
    return denied;
  }

  const { body, status } = await getBenchmarkRoutingTable();
  return NextResponse.json(body, { status });
}
36 changes: 36 additions & 0 deletions apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import { StartBenchmarkRunRequestSchema } from '@kilocode/auto-routing-contracts';
import type { NextRequest } from 'next/server';
import { NextResponse } from 'next/server';
import {
listBenchmarkRuns,
startBenchmarkRun,
} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
import { getUserFromAuth } from '@/lib/user/server';

/**
 * Admin-only listing of benchmark runs.
 * Short-circuits with the auth failure response, else relays the admin
 * client's body and status.
 */
export async function GET() {
  const auth = await getUserFromAuth({ adminOnly: true });
  if (auth.authFailedResponse) return auth.authFailedResponse;

  const runs = await listBenchmarkRuns();
  const responseInit = { status: runs.status };
  return NextResponse.json(runs.body, responseInit);
}

/**
 * Admin-only trigger for a benchmark run.
 *
 * Responds 400 when the body is not JSON or fails
 * `StartBenchmarkRunRequestSchema`; otherwise calls `startBenchmarkRun` with
 * the parsed `kind` and `force` and relays its body and status.
 */
export async function POST(request: NextRequest) {
  const auth = await getUserFromAuth({ adminOnly: true });
  if (auth.authFailedResponse) return auth.authFailedResponse;

  let payload: unknown;
  try {
    payload = await request.json();
  } catch {
    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
  }

  const validation = StartBenchmarkRunRequestSchema.safeParse(payload);
  if (validation.success) {
    const { kind, force } = validation.data;
    const outcome = await startBenchmarkRun(kind, force);
    return NextResponse.json(outcome.body, { status: outcome.status });
  }

  return NextResponse.json({ error: 'Invalid start benchmark run request' }, { status: 400 });
}
Loading
Loading