Kilo-Org · iscekic · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/.github/workflows/deploy-workers.yml b/.github/workflows/deploy-workers.yml
@@ -49,6 +49,10 @@ jobs:
         with:
           apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
           workingDirectory: ${{ inputs.worker }}
+          # Workers that define a `predeploy` script (e.g. D1 migrations) run it
+          # right before deploy; all other workers are unaffected.
+          preCommands: |
+            if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi
           command: deploy
 
   detect-changes:
@@ -150,4 +154,8 @@ jobs:
         with:
           apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
           workingDirectory: ${{ matrix.worker }}
+          # Workers that define a `predeploy` script (e.g. D1 migrations) run it
+          # right before deploy; all other workers are unaffected.
+          preCommands: |
+            if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi
           command: deploy
diff --git a/apps/web/.env.development.local.example b/apps/web/.env.development.local.example
@@ -19,6 +19,9 @@ AUTO_TRIAGE_URL=http://localhost:8791
 # @url auto-routing
 AUTO_ROUTING_WORKER_URL=http://localhost:8810
 
+# @url auto-routing-benchmark
+AUTO_ROUTING_BENCHMARK_WORKER_URL=http://localhost:8814
+
 # @url cloudflare-security-sync
 SECURITY_SYNC_WORKER_URL=http://localhost:8812
 

diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
@@ -0,0 +1,158 @@
+import { NextRequest } from 'next/server';
+import type { User } from '@kilocode/db';
+import {
+  getBenchmarkConfig,
+  updateBenchmarkConfig,
+} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import { getUserFromAuth } from '@/lib/user/server';
+import { findExperimentReservedModelIds } from '@/lib/ai-gateway/experiments/reserved-ids';
+import type { KiloExclusiveModel } from '@/lib/ai-gateway/providers/kilo-exclusive-model';
+import type * as ModelsModule from '@/lib/ai-gateway/models';
+
+// Replace the auth helper with a bare jest.fn(); the suite's beforeEach decides
+// what it resolves to.
+jest.mock('@/lib/user/server', () => ({
+  getUserFromAuth: jest.fn(),
+}));
+
+// Replace the benchmark admin client functions with jest.fn() stubs so tests
+// can control their results and inspect how the route calls them.
+jest.mock('@/lib/ai-gateway/auto-routing-benchmark-admin-client', () => ({
+  getBenchmarkConfig: jest.fn(),
+  updateBenchmarkConfig: jest.fn(),
+}));
+
+// Replace the model-experiment reserved-id lookup with a jest.fn() stub.
+jest.mock('@/lib/ai-gateway/experiments/reserved-ids', () => ({
+  findExperimentReservedModelIds: jest.fn(),
+}));
+
+// Stub the catalog so tests don't depend on any specific provider file.
+// 'test-exclusive/alibaba-only' maps to the alibaba gateway (chat_completions only).
+jest.mock('@/lib/ai-gateway/models', () => {
+  // Start from the real module so every other export keeps its real behavior.
+  const actual = jest.requireActual<typeof ModelsModule>('@/lib/ai-gateway/models');
+  const stubModel: KiloExclusiveModel = {
+    public_id: 'test-exclusive/alibaba-only',
+    display_name: 'Test Alibaba-only',
+    description: 'stub for unit tests',
+    context_length: 8192,
+    max_completion_tokens: 4096,
+    status: 'public',
+    flags: [],
+    gateway: 'alibaba',
+    internal_id: 'stub-internal',
+    pricing: null,
+    exclusive_to: [],
+    inference_provider_restriction: [],
+  };
+  return {
+    ...actual,
+    // Only the stub id is intercepted; any other id falls through to the real lookup.
+    findKiloExclusiveModel: (id: string) =>
+      id === 'test-exclusive/alibaba-only' ? stubModel : actual.findKiloExclusiveModel(id),
+  };
+});
+
+import { PUT } from './route';
+
+// Typed handles to the jest.fn() stubs created by the jest.mock factories in
+// this file, so tests can set resolved values and assert on calls.
+const mockGetUserFromAuth = jest.mocked(getUserFromAuth);
+const mockGetBenchmarkConfig = jest.mocked(getBenchmarkConfig);
+const mockUpdateBenchmarkConfig = jest.mocked(updateBenchmarkConfig);
+const mockFindExperimentReservedModelIds = jest.mocked(findExperimentReservedModelIds);
+
+// Builds the admin user handed back by the mocked auth helper. This is a
+// deliberate test-fixture shortcut: only the fields the route actually reads
+// are populated, and the partial object is asserted to the full User type.
+function adminUserFixture(): User {
+  const routeFields: Partial<User> = {
+    id: 'admin_123',
+    google_user_email: 'admin@kilocode.ai',
+  };
+  return routeFields as User;
+}
+
+// Wraps an arbitrary value as the JSON body of a PUT request aimed at the
+// benchmark-config admin endpoint.
+function putRequest(body: unknown) {
+  const endpoint = 'http://localhost:3000/admin/api/auto-routing/benchmark-config';
+  const init = {
+    method: 'PUT',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify(body),
+  };
+  return new NextRequest(endpoint, init);
+}
+
+// Baseline benchmark config shared by the tests: it is sent as the PUT body,
+// returned by the mocked updateBenchmarkConfig, and spread into variants that
+// swap in different decider models.
+const validConfig = {
+  classifierModels: ['google/gemini-2.5-flash-lite'],
+  deciderModels: [{ id: 'openai/gpt-5-mini', reasoningEffort: null }],
+  minAccuracy: 0.7,
+  switchCostFactor: 3,
+  maxConcurrency: 4,
+  benchmarkUserId: null,
+  classifierRepetitions: 1,
+  deciderRepetitions: 1,
+  classifierMaxP95LatencyMs: 1000,
+  updatedAt: null,
+  updatedBy: null,
+};
+
+// Exercises the PUT handler's validation gates (schema, experiment-reserved
+// ids, gateway chat API coverage) and the forwarding of accepted configs.
+describe('PUT /admin/api/auto-routing/benchmark-config', () => {
+  beforeEach(() => {
+    // Default wiring for every test: authenticated admin, successful update,
+    // and no decider id reserved by an experiment. Tests override as needed.
+    jest.clearAllMocks();
+    mockGetUserFromAuth.mockResolvedValue({
+      user: adminUserFixture(),
+      authFailedResponse: null,
+    });
+    mockUpdateBenchmarkConfig.mockResolvedValue({
+      status: 200,
+      body: { config: validConfig },
+    });
+    mockGetBenchmarkConfig.mockResolvedValue({ status: 200, body: { config: null } });
+    mockFindExperimentReservedModelIds.mockResolvedValue([]);
+  });
+
+  it('forwards a config whose decider models all serve every gateway chat API', async () => {
+    const response = await PUT(putRequest(validConfig));
+    expect(response.status).toBe(200);
+    // The parsed config is passed through together with the admin's email.
+    expect(mockUpdateBenchmarkConfig).toHaveBeenCalledWith(validConfig, 'admin@kilocode.ai');
+  });
+
+  it('rejects with 400 listing decider models not servable on all gateway chat APIs', async () => {
+    const response = await PUT(
+      putRequest({
+        ...validConfig,
+        deciderModels: [
+          { id: 'openai/gpt-5-mini', reasoningEffort: null },
+          { id: 'test-exclusive/alibaba-only', reasoningEffort: null },
+        ],
+      })
+    );
+
+    expect(response.status).toBe(400);
+    const body = (await response.json()) as { error: string };
+    expect(body.error).toContain('test-exclusive/alibaba-only');
+    expect(body.error).toContain('chat_completions');
+    // Offending entries are rendered as "<id> (supports: ...)", so the id
+    // followed by " (" must be absent for the model that passed the check.
+    expect(body.error).not.toContain('openai/gpt-5-mini (');
+    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
+  });
+
+  it('rejects decider models reserved by a model experiment (any status)', async () => {
+    // Ownership is status-independent per .specs/model-experiments.md: a public
+    // id with a draft/active/paused/completed experiment is reserved for
+    // explicit user selection and must not enter kilo-auto candidate sets.
+    mockFindExperimentReservedModelIds.mockResolvedValue(['preview/experimental-model']);
+
+    const response = await PUT(
+      putRequest({
+        ...validConfig,
+        deciderModels: [
+          { id: 'openai/gpt-5-mini', reasoningEffort: null },
+          { id: 'preview/experimental-model', reasoningEffort: null },
+        ],
+      })
+    );
+
+    expect(response.status).toBe(400);
+    const body = (await response.json()) as { error: string };
+    expect(body.error).toContain('preview/experimental-model');
+    expect(body.error).toContain('model-experiment');
+    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
+    // The check runs against the decider model ids.
+    expect(mockFindExperimentReservedModelIds).toHaveBeenCalledWith([
+      'openai/gpt-5-mini',
+      'preview/experimental-model',
+    ]);
+  });
+
+  it('rejects a schema-invalid config with 400', async () => {
+    // classifierModels is a string here, whereas validConfig supplies an array.
+    const response = await PUT(putRequest({ classifierModels: 'oops' }));
+    expect(response.status).toBe(400);
+    await expect(response.json()).resolves.toEqual({ error: 'Invalid benchmark config' });
+    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
+  });
+});
diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
@@ -0,0 +1,74 @@
+import { BenchmarkConfigSchema } from '@kilocode/auto-routing-contracts';
+import type { NextRequest } from 'next/server';
+import { NextResponse } from 'next/server';
+import {
+  getBenchmarkConfig,
+  updateBenchmarkConfig,
+} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import {
+  gatewayChatApisForModel,
+  modelServesAllGatewayChatApis,
+} from '@/lib/ai-gateway/model-api-kinds';
+import { findExperimentReservedModelIds } from '@/lib/ai-gateway/experiments/reserved-ids';
+import { getUserFromAuth } from '@/lib/user/server';
+
+/**
+ * Admin-only read of the benchmark config.
+ *
+ * Returns the auth failure response when the admin check does not pass;
+ * otherwise relays the status and body reported by `getBenchmarkConfig`.
+ */
+export async function GET() {
+  const { authFailedResponse: denied } = await getUserFromAuth({ adminOnly: true });
+  if (denied) {
+    return denied;
+  }
+
+  const { body, status } = await getBenchmarkConfig();
+  return NextResponse.json(body, { status });
+}
+
+/**
+ * Admin-only update of the benchmark config.
+ *
+ * The request is rejected with a 400 and an `{ error }` body when, in order:
+ * the body is not JSON, it fails `BenchmarkConfigSchema`, a decider model id is
+ * reserved by a model experiment, or a decider model does not serve every
+ * gateway chat API. A config that clears all gates is forwarded to
+ * `updateBenchmarkConfig` with the admin's email, and that call's status and
+ * body are relayed to the caller.
+ */
+export async function PUT(request: NextRequest) {
+  const { authFailedResponse, user } = await getUserFromAuth({ adminOnly: true });
+  if (authFailedResponse) return authFailedResponse;
+
+  // Every validation failure below has the same response shape.
+  const badRequest = (error: string) => NextResponse.json({ error }, { status: 400 });
+
+  let payload: unknown;
+  try {
+    payload = await request.json();
+  } catch {
+    return badRequest('Invalid JSON body');
+  }
+
+  const validation = BenchmarkConfigSchema.safeParse(payload);
+  if (!validation.success) {
+    return badRequest('Invalid benchmark config');
+  }
+  const config = validation.data;
+  const deciderModelIds = config.deciderModels.map(model => model.id);
+
+  // Model-experiment public ids are dedicated preview ids that users must
+  // explicitly select; per .specs/model-experiments.md they must never enter
+  // kilo-auto candidate sets, so they can't be saved as decider candidates
+  // (the routing table feeds kilo-auto/efficient automatic selection). Checked
+  // across all experiment statuses — ownership, not just routing membership.
+  const reservedExperimentIds = await findExperimentReservedModelIds(deciderModelIds);
+  if (reservedExperimentIds.length > 0) {
+    return badRequest(
+      `Decider models must not be model-experiment public ids (reserved for explicit user selection): ${reservedExperimentIds.join(', ')}`
+    );
+  }
+
+  // Routing-table candidates carry no per-protocol metadata, so every decider
+  // model must be servable on ALL gateway chat API kinds by the provider the
+  // gateway would route it to.
+  const notServable = deciderModelIds.filter(id => !modelServesAllGatewayChatApis(id));
+  const unsupported = notServable.map(id => {
+    const supportedApis = gatewayChatApisForModel(id).join(', ') || 'none';
+    return `${id} (supports: ${supportedApis})`;
+  });
+  if (unsupported.length > 0) {
+    return badRequest(
+      `Decider models must support all gateway chat APIs (chat_completions, responses, messages): ${unsupported.join('; ')}`
+    );
+  }
+
+  // Falls back to an empty string when no email is available on the user.
+  const updatedBy = user?.google_user_email ?? '';
+  const { body, status } = await updateBenchmarkConfig(config, updatedBy);
+  return NextResponse.json(body, { status });
+}
diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts
@@ -0,0 +1,11 @@
+import { NextResponse } from 'next/server';
+import { getBenchmarkRoutingTable } from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import { getUserFromAuth } from '@/lib/user/server';
+
+/**
+ * Admin-only read of the benchmark routing table.
+ *
+ * Returns the auth failure response when the admin check does not pass;
+ * otherwise relays the status and body reported by `getBenchmarkRoutingTable`.
+ */
+export async function GET() {
+  const { authFailedResponse: denied } = await getUserFromAuth({ adminOnly: true });
+  if (denied) {
+    return denied;
+  }
+
+  const { body, status } = await getBenchmarkRoutingTable();
+  return NextResponse.json(body, { status });
+}
diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts
@@ -0,0 +1,36 @@
+import { StartBenchmarkRunRequestSchema } from '@kilocode/auto-routing-contracts';
+import type { NextRequest } from 'next/server';
+import { NextResponse } from 'next/server';
+import {
+  listBenchmarkRuns,
+  startBenchmarkRun,
+} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import { getUserFromAuth } from '@/lib/user/server';
+
+/**
+ * Admin-only listing of benchmark runs.
+ *
+ * Returns the auth failure response when the admin check does not pass;
+ * otherwise relays the status and body reported by `listBenchmarkRuns`.
+ */
+export async function GET() {
+  const { authFailedResponse: denied } = await getUserFromAuth({ adminOnly: true });
+  if (denied) {
+    return denied;
+  }
+
+  const { body, status } = await listBenchmarkRuns();
+  return NextResponse.json(body, { status });
+}
+
+/**
+ * Admin-only trigger for a benchmark run.
+ *
+ * Responds 400 with an `{ error }` body when the request body is not JSON or
+ * fails `StartBenchmarkRunRequestSchema`. Otherwise passes the parsed `kind`
+ * and `force` to `startBenchmarkRun` and relays its status and body.
+ */
+export async function POST(request: NextRequest) {
+  const { authFailedResponse: denied } = await getUserFromAuth({ adminOnly: true });
+  if (denied) {
+    return denied;
+  }
+
+  let payload: unknown;
+  try {
+    payload = await request.json();
+  } catch {
+    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
+  }
+
+  const validation = StartBenchmarkRunRequestSchema.safeParse(payload);
+  if (!validation.success) {
+    return NextResponse.json({ error: 'Invalid start benchmark run request' }, { status: 400 });
+  }
+
+  const { kind, force } = validation.data;
+  const { body, status } = await startBenchmarkRun(kind, force);
+  return NextResponse.json(body, { status });
+}