diff --git a/.github/workflows/deploy-workers.yml b/.github/workflows/deploy-workers.yml
index 1a247f091d..1e18146a03 100644
--- a/.github/workflows/deploy-workers.yml
+++ b/.github/workflows/deploy-workers.yml
@@ -49,6 +49,10 @@ jobs:
         with:
           apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
           workingDirectory: ${{ inputs.worker }}
+          # Workers that define a `predeploy` script (e.g. D1 migrations) run it
+          # right before deploy; all other workers are unaffected.
+          preCommands: |
+            if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi
           command: deploy
 
   detect-changes:
@@ -150,4 +154,8 @@ jobs:
         with:
           apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
           workingDirectory: ${{ matrix.worker }}
+          # Workers that define a `predeploy` script (e.g. D1 migrations) run it
+          # right before deploy; all other workers are unaffected.
+          preCommands: |
+            if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi
           command: deploy
diff --git a/apps/web/.env.development.local.example b/apps/web/.env.development.local.example
index c17511bfe9..ae816bb2c2 100644
--- a/apps/web/.env.development.local.example
+++ b/apps/web/.env.development.local.example
@@ -19,6 +19,9 @@ AUTO_TRIAGE_URL=http://localhost:8791
 # @url auto-routing
 AUTO_ROUTING_WORKER_URL=http://localhost:8810
 
+# @url auto-routing-benchmark
+AUTO_ROUTING_BENCHMARK_WORKER_URL=http://localhost:8814
+
 # @url cloudflare-security-sync
 SECURITY_SYNC_WORKER_URL=http://localhost:8812
 
diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
new file mode 100644
index 0000000000..e572c4e2a5
--- /dev/null
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts
@@ -0,0 +1,158 @@
+import { NextRequest } from 'next/server';
+import type { User } from '@kilocode/db';
+import {
+  getBenchmarkConfig,
+  updateBenchmarkConfig,
+} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import { getUserFromAuth } from '@/lib/user/server';
+import { findExperimentReservedModelIds } from '@/lib/ai-gateway/experiments/reserved-ids';
+import type { KiloExclusiveModel } from '@/lib/ai-gateway/providers/kilo-exclusive-model';
+import type * as ModelsModule from '@/lib/ai-gateway/models';
+
+jest.mock('@/lib/user/server', () => ({
+  getUserFromAuth: jest.fn(),
+}));
+
+jest.mock('@/lib/ai-gateway/auto-routing-benchmark-admin-client', () => ({
+  getBenchmarkConfig: jest.fn(),
+  updateBenchmarkConfig: jest.fn(),
+}));
+
+jest.mock('@/lib/ai-gateway/experiments/reserved-ids', () => ({
+  findExperimentReservedModelIds: jest.fn(),
+}));
+
+// Stub the catalog lookup so tests don't depend on any specific provider file.
+// Only 'test-exclusive/alibaba-only' is stubbed; it maps to the alibaba gateway (chat_completions only).
+jest.mock('@/lib/ai-gateway/models', () => {
+  const actual = jest.requireActual<typeof ModelsModule>('@/lib/ai-gateway/models');
+  const stubModel: KiloExclusiveModel = {
+    public_id: 'test-exclusive/alibaba-only',
+    display_name: 'Test Alibaba-only',
+    description: 'stub for unit tests',
+    context_length: 8192,
+    max_completion_tokens: 4096,
+    status: 'public',
+    flags: [],
+    gateway: 'alibaba',
+    internal_id: 'stub-internal',
+    pricing: null,
+    exclusive_to: [],
+    inference_provider_restriction: [],
+  };
+  return {
+    ...actual, // every export other than findKiloExclusiveModel keeps its real implementation
+    findKiloExclusiveModel: (id: string) =>
+      id === 'test-exclusive/alibaba-only' ? stubModel : actual.findKiloExclusiveModel(id),
+  };
+});
+
+import { PUT } from './route';
+
+const mockGetUserFromAuth = jest.mocked(getUserFromAuth);
+const mockGetBenchmarkConfig = jest.mocked(getBenchmarkConfig);
+const mockUpdateBenchmarkConfig = jest.mocked(updateBenchmarkConfig);
+const mockFindExperimentReservedModelIds = jest.mocked(findExperimentReservedModelIds);
+
+// Test-fixture boundary: a deliberately partial User cast to the full type; only the fields the route is expected to read are set.
+function adminUserFixture(): User {
+  return { id: 'admin_123', google_user_email: 'admin@kilocode.ai' } as Partial<User> as User;
+}
+
+// Builds a PUT NextRequest for the benchmark-config route with `body` serialized as JSON.
+function putRequest(body: unknown) {
+  const url = 'http://localhost:3000/admin/api/auto-routing/benchmark-config';
+  const headers = { 'content-type': 'application/json' };
+  const init = { method: 'PUT', body: JSON.stringify(body), headers };
+  return new NextRequest(url, init);
+}
+
+const validConfig = {
+  classifierModels: ['google/gemini-2.5-flash-lite'],
+  deciderModels: [{ id: 'openai/gpt-5-mini', reasoningEffort: null }], // overridden by the rejection tests
+  minAccuracy: 0.7,
+  switchCostFactor: 3,
+  maxConcurrency: 4,
+  benchmarkUserId: null,
+  classifierRepetitions: 1,
+  deciderRepetitions: 1,
+  classifierMaxP95LatencyMs: 1000,
+  updatedAt: null,
+  updatedBy: null,
+};
+
+describe('PUT /admin/api/auto-routing/benchmark-config', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+    mockGetUserFromAuth.mockResolvedValue({
+      user: adminUserFixture(),
+      authFailedResponse: null,
+    });
+    mockUpdateBenchmarkConfig.mockResolvedValue({
+      status: 200,
+      body: { config: validConfig },
+    });
+    mockGetBenchmarkConfig.mockResolvedValue({ status: 200, body: { config: null } }); // NOTE(review): the PUT handler does not appear to call getBenchmarkConfig — confirm this default is still needed
+    mockFindExperimentReservedModelIds.mockResolvedValue([]); // default: no decider id is reserved
+  });
+
+  it('forwards a config whose decider models all serve every gateway chat API', async () => {
+    const response = await PUT(putRequest(validConfig));
+    expect(response.status).toBe(200);
+    expect(mockUpdateBenchmarkConfig).toHaveBeenCalledWith(validConfig, 'admin@kilocode.ai');
+  });
+
+  it('rejects with 400 listing decider models not servable on all gateway chat APIs', async () => {
+    const response = await PUT(
+      putRequest({
+        ...validConfig,
+        deciderModels: [
+          { id: 'openai/gpt-5-mini', reasoningEffort: null },
+          { id: 'test-exclusive/alibaba-only', reasoningEffort: null },
+        ],
+      })
+    );
+
+    expect(response.status).toBe(400);
+    const body = (await response.json()) as { error: string };
+    expect(body.error).toContain('test-exclusive/alibaba-only');
+    expect(body.error).toContain('chat_completions');
+    expect(body.error).not.toContain('openai/gpt-5-mini ('); // "(" targets the per-model "id (supports: …)" entry, so a compliant model must not be listed
+    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
+  });
+
+  it('rejects decider models reserved by a model experiment (any status)', async () => {
+    // Ownership is status-independent per .specs/model-experiments.md: a public
+    // id with a draft/active/paused/completed experiment is reserved for
+    // explicit user selection and must not enter kilo-auto candidate sets.
+    mockFindExperimentReservedModelIds.mockResolvedValue(['preview/experimental-model']);
+
+    const response = await PUT(
+      putRequest({
+        ...validConfig,
+        deciderModels: [
+          { id: 'openai/gpt-5-mini', reasoningEffort: null },
+          { id: 'preview/experimental-model', reasoningEffort: null },
+        ],
+      })
+    );
+
+    expect(response.status).toBe(400);
+    const body = (await response.json()) as { error: string };
+    expect(body.error).toContain('preview/experimental-model');
+    expect(body.error).toContain('model-experiment');
+    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
+    // The reservation lookup receives every decider model id, in request order.
+    expect(mockFindExperimentReservedModelIds).toHaveBeenCalledWith([
+      'openai/gpt-5-mini',
+      'preview/experimental-model',
+    ]);
+  });
+
+  it('rejects a schema-invalid config with 400', async () => {
+    const response = await PUT(putRequest({ classifierModels: 'oops' }));
+    expect(response.status).toBe(400);
+    await expect(response.json()).resolves.toEqual({ error: 'Invalid benchmark config' });
+    expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled();
+  });
+});
diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
new file mode 100644
index 0000000000..d85f617353
--- /dev/null
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts
@@ -0,0 +1,74 @@
+import { BenchmarkConfigSchema } from '@kilocode/auto-routing-contracts';
+import type { NextRequest } from 'next/server';
+import { NextResponse } from 'next/server';
+import {
+  getBenchmarkConfig,
+  updateBenchmarkConfig,
+} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import {
+  gatewayChatApisForModel,
+  modelServesAllGatewayChatApis,
+} from '@/lib/ai-gateway/model-api-kinds';
+import { findExperimentReservedModelIds } from '@/lib/ai-gateway/experiments/reserved-ids';
+import { getUserFromAuth } from '@/lib/user/server';
+
+/** Admin-only read: relays the benchmark admin client's config response body and status. */
+export async function GET() {
+  const auth = await getUserFromAuth({ adminOnly: true });
+  if (auth.authFailedResponse) return auth.authFailedResponse;
+  const { body, status } = await getBenchmarkConfig();
+  return NextResponse.json(body, { status });
+}
+
+/** Admin-only write: validates the submitted benchmark config, then forwards it to the admin client. */
+export async function PUT(request: NextRequest) {
+  const auth = await getUserFromAuth({ adminOnly: true });
+  if (auth.authFailedResponse) return auth.authFailedResponse;
+
+  // A body that is not parseable JSON is rejected before any schema validation.
+  let payload: unknown;
+  try {
+    payload = await request.json();
+  } catch {
+    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
+  }
+
+  // Validate the shape against BenchmarkConfigSchema.
+  const validation = BenchmarkConfigSchema.safeParse(payload);
+  if (!validation.success) {
+    return NextResponse.json({ error: 'Invalid benchmark config' }, { status: 400 });
+  }
+
+  const config = validation.data;
+  const deciderIds = config.deciderModels.map(model => model.id);
+
+  // Model-experiment public ids are preview ids that users must select
+  // explicitly; per .specs/model-experiments.md they must never enter kilo-auto
+  // candidate sets, and the routing table feeds kilo-auto/efficient automatic
+  // selection. Every experiment status counts: this is about ownership.
+  const reserved = await findExperimentReservedModelIds(deciderIds);
+  if (reserved.length > 0) {
+    const error = `Decider models must not be model-experiment public ids (reserved for explicit user selection): ${reserved.join(', ')}`;
+    return NextResponse.json({ error }, { status: 400 });
+  }
+
+  // Routing-table candidates carry no per-protocol metadata, so each decider
+  // model has to be servable on ALL gateway chat API kinds by the provider the
+  // gateway would route it to.
+  const unsupported = deciderIds
+    .filter(id => !modelServesAllGatewayChatApis(id))
+    .map(id => {
+      const supported = gatewayChatApisForModel(id).join(', ') || 'none';
+      return `${id} (supports: ${supported})`;
+    });
+  if (unsupported.length > 0) {
+    const error = `Decider models must support all gateway chat APIs (chat_completions, responses, messages): ${unsupported.join('; ')}`;
+    return NextResponse.json({ error }, { status: 400 });
+  }
+
+  // Forward the authenticated admin's email alongside the validated config,
+  // substituting '' when no email is present.
+  const callerEmail = auth.user?.google_user_email ?? '';
+  const { body, status } = await updateBenchmarkConfig(config, callerEmail);
+  return NextResponse.json(body, { status });
+}
diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts
new file mode 100644
index 0000000000..26fdc8eef1
--- /dev/null
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts
@@ -0,0 +1,11 @@
+import { NextResponse } from 'next/server';
+import { getBenchmarkRoutingTable } from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import { getUserFromAuth } from '@/lib/user/server';
+
+/** Admin-only read: relays the benchmark routing table response body and status. */
+export async function GET() {
+  const auth = await getUserFromAuth({ adminOnly: true });
+  if (auth.authFailedResponse) return auth.authFailedResponse;
+  const { body, status } = await getBenchmarkRoutingTable();
+  return NextResponse.json(body, { status });
+}
diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts
new file mode 100644
index 0000000000..efbfebdde3
--- /dev/null
+++ b/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts
@@ -0,0 +1,36 @@
+import { StartBenchmarkRunRequestSchema } from '@kilocode/auto-routing-contracts';
+import type { NextRequest } from 'next/server';
+import { NextResponse } from 'next/server';
+import {
+  listBenchmarkRuns,
+  startBenchmarkRun,
+} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client';
+import { getUserFromAuth } from '@/lib/user/server';
+
+/** Admin-only read: relays the benchmark runs listing response body and status. */
+export async function GET() {
+  const auth = await getUserFromAuth({ adminOnly: true });
+  if (auth.authFailedResponse) return auth.authFailedResponse;
+  const { body, status } = await listBenchmarkRuns();
+  return NextResponse.json(body, { status });
+}
+
+/** Admin-only: validates the request body, then asks the admin client to start a benchmark run. */
+export async function POST(request: NextRequest) {
+  const auth = await getUserFromAuth({ adminOnly: true });
+  if (auth.authFailedResponse) return auth.authFailedResponse;
+
+  let payload: unknown;
+  try {
+    payload = await request.json();
+  } catch {
+    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
+  }
+
+  const validation = StartBenchmarkRunRequestSchema.safeParse(payload);
+  if (!validation.success) {
+    return NextResponse.json({ error: 'Invalid start benchmark run request' }, { status: 400 });
+  }
+  const { body, status } = await startBenchmarkRun(validation.data.kind, validation.data.force);
+  return NextResponse.json(body, { status });
+}
diff --git a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
index d893f27382..f6e262d43d 100644
--- a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
+++ b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx
@@ -11,7 +11,6 @@ import React, { useEffect, useMemo, useState, type ReactNode } from 'react';
 import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
 import { toast } from 'sonner';
 import { BarChart3, Clock3, DollarSign, HelpCircle, RefreshCw, Route, Save } from 'lucide-react';
-import * as z from 'zod';
 import { ModelCombobox, type ModelOption } from '@/components/shared/ModelCombobox';
 import { Badge } from '@/components/ui/badge';
 import { Button } from '@/components/ui/button';
@@ -31,6 +30,8 @@ import {
   type OpenRouterModelsResponse,
 } from '@/lib/organizations/organization-types';
 import { cn } from '@/lib/utils';
+import { BenchmarksSection } from './BenchmarksSection';
+import { parseAdminResponse } from './admin-fetch';
 
 const periods: Array<{ value: AutoRoutingAnalyticsPeriod; label: string }> = [
   { value: '1h', label: '1h' },
@@ -39,24 +40,6 @@ const periods: Array<{ value: AutoRoutingAnalyticsPeriod; label: string }> = [
   { value: '30d', label: '30d' },
 ];
 
-const AdminApiErrorSchema = z.object({ error: z.string().optional() });
-
-async function parseAdminResponse<T extends object>(
-  response: Response,
-  schema: z.ZodType<T>
-): Promise<T> {
-  const body: unknown = await response.json();
-  if (!response.ok) {
-    const parsedError = AdminApiErrorSchema.safeParse(body);
-    throw new Error(
-      parsedError.success && parsedError.data.error
-        ? parsedError.data.error
-        : `Request failed: ${response.status}`
-    );
-  }
-  return schema.parse(body);
-}
-
 async function fetchClassifierModel() {
   const response = await fetch('/admin/api/auto-routing/classifier-model');
   return parseAdminResponse<AutoRoutingClassifierModelResponse>(
@@ -65,7 +48,7 @@ async function fetchClassifierModel() {
   );
 }
 
-async function saveClassifierModel(model: string) {
+async function saveClassifierModel(model: string | null) {
   const response = await fetch('/admin/api/auto-routing/classifier-model', {
     method: 'PUT',
     headers: { 'content-type': 'application/json' },
@@ -397,10 +380,12 @@ export function AutoRoutingAdminContent() {
   });
 
   useEffect(() => {
-    if (classifierModelQuery.data?.model) {
-      setSelectedModel(classifierModelQuery.data.model);
+    const override = classifierModelQuery.data?.override;
+    const model = classifierModelQuery.data?.model;
+    if (model !== undefined) {
+      setSelectedModel(override ?? model);
     }
-  }, [classifierModelQuery.data?.model]);
+  }, [classifierModelQuery.data?.override, classifierModelQuery.data?.model]);
 
   const modelOptions = useMemo<ModelOption[]>(() => {
     return (
@@ -414,10 +399,14 @@ export function AutoRoutingAdminContent() {
 
   const saveMutation = useMutation({
     mutationFn: saveClassifierModel,
-    onSuccess: data => {
+    onSuccess: (data, model) => {
       queryClient.setQueryData(['auto-routing', 'classifier-model'], data);
-      setSelectedModel(data.model);
-      toast.success('Classifier model updated');
+      setSelectedModel(data.override ?? data.model);
+      if (model === null) {
+        toast.success('Override cleared — benchmark winner in effect');
+      } else {
+        toast.success('Classifier model override saved');
+      }
     },
     onError: error => {
       toast.error(error instanceof Error ? error.message : 'Failed to update classifier model');
@@ -432,10 +421,12 @@ export function AutoRoutingAdminContent() {
     classifierModelQuery.error instanceof Error ? classifierModelQuery.error.message : undefined;
   const openRouterModelsError =
     openRouterModelsQuery.error instanceof Error ? openRouterModelsQuery.error.message : undefined;
-  const currentModel = classifierModelQuery.data?.model ?? '';
-  const hasClassifierModelLoaded = classifierModelQuery.isSuccess && currentModel.length > 0;
+  const currentOverride = classifierModelQuery.data?.override ?? null;
+  const hasClassifierModelLoaded = classifierModelQuery.isSuccess;
   const hasModelChange =
-    hasClassifierModelLoaded && selectedModel.trim().length > 0 && selectedModel !== currentModel;
+    hasClassifierModelLoaded &&
+    selectedModel.trim().length > 0 &&
+    selectedModel !== (currentOverride ?? '');
   const summary = analyticsQuery.data?.summary;
   const totalRequests = summary?.totalRequests ?? 0;
   const { classifiedRate, cacheHitRate, fallbackRate } = summaryRates(summary);
@@ -472,32 +463,67 @@ export function AutoRoutingAdminContent() {
 
       <Card className="rounded-lg">
         <CardHeader className="flex flex-row items-center justify-between space-y-0 p-4 pb-2">
-          <CardTitle className="text-base">Classifier Model</CardTitle>
+          <CardTitle className="text-base">Classifier model override</CardTitle>
           <MetricHelp
-            label="Classifier Model"
-            description="The OpenRouter model used by the auto-routing classifier. Saving changes updates KV config, so the classifier can change without a redeploy."
+            label="Classifier model override"
+            description="When unset, the latest classifier benchmark winner is used. Setting an override bypasses the benchmark winner. Saving updates KV config without a redeploy."
           />
         </CardHeader>
-        <CardContent className="grid gap-4 p-4 pt-0 lg:grid-cols-[1fr_auto] lg:items-end">
-          <ModelCombobox
-            label="Model"
-            models={modelOptions}
-            value={selectedModel}
-            onValueChange={setSelectedModel}
-            isLoading={openRouterModelsQuery.isLoading || classifierModelQuery.isLoading}
-            error={classifierModelError ?? openRouterModelsError}
-            placeholder={classifierModelQuery.data?.defaultModel ?? 'Select classifier model'}
-            className="w-full"
-          />
-          <Button
-            type="button"
-            onClick={() => saveMutation.mutate(selectedModel)}
-            disabled={!hasModelChange || saveMutation.isPending}
-            className="w-full lg:w-auto"
-          >
-            <Save className="size-4" />
-            Save model
-          </Button>
+        <CardContent className="flex flex-col gap-4 p-4 pt-0">
+          <dl className="grid grid-cols-[auto_1fr] gap-x-4 gap-y-1 text-sm">
+            <dt className="text-muted-foreground">Effective model</dt>
+            <dd className="font-mono text-xs truncate">
+              {classifierModelQuery.data?.model ?? <Skeleton className="h-4 w-48" />}
+            </dd>
+            <dt className="text-muted-foreground">Override</dt>
+            <dd className="font-mono text-xs truncate">
+              {classifierModelQuery.isLoading ? (
+                <Skeleton className="h-4 w-48" />
+              ) : (
+                (classifierModelQuery.data?.override ?? 'none')
+              )}
+            </dd>
+            <dt className="text-muted-foreground">Benchmark winner</dt>
+            <dd className="font-mono text-xs truncate">
+              {classifierModelQuery.isLoading ? (
+                <Skeleton className="h-4 w-48" />
+              ) : (
+                (classifierModelQuery.data?.benchmarkWinner ?? 'not yet published')
+              )}
+            </dd>
+          </dl>
+          <div className="grid gap-4 lg:grid-cols-[1fr_auto_auto] lg:items-end">
+            <ModelCombobox
+              label="Set override"
+              models={modelOptions}
+              value={selectedModel}
+              onValueChange={setSelectedModel}
+              isLoading={openRouterModelsQuery.isLoading || classifierModelQuery.isLoading}
+              error={classifierModelError ?? openRouterModelsError}
+              placeholder={classifierModelQuery.data?.defaultModel ?? 'Select classifier model'}
+              className="w-full"
+            />
+            <Button
+              type="button"
+              onClick={() => saveMutation.mutate(selectedModel)}
+              disabled={!hasModelChange || saveMutation.isPending}
+              className="w-full lg:w-auto"
+            >
+              <Save className="size-4" />
+              Save override
+            </Button>
+            {currentOverride !== null ? (
+              <Button
+                type="button"
+                variant="outline"
+                onClick={() => saveMutation.mutate(null)}
+                disabled={saveMutation.isPending}
+                className="w-full lg:w-auto text-destructive hover:text-destructive"
+              >
+                Clear override
+              </Button>
+            ) : null}
+          </div>
         </CardContent>
       </Card>
 
@@ -600,6 +626,8 @@ export function AutoRoutingAdminContent() {
           />
         </>
       )}
+
+      <BenchmarksSection />
     </div>
   );
 }
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
new file mode 100644
index 0000000000..11a8a6a0e3
--- /dev/null
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts
@@ -0,0 +1,102 @@
+import { describe, expect, it } from '@jest/globals';
+import {
+  configToFormState,
+  formatAccuracy,
+  formatUsd,
+  formStateToConfig,
+} from './BenchmarksSection';
+
+describe('formatAccuracy', () => {
+  it('formats 0.8542 as 85.4%', () => {
+    expect(formatAccuracy(0.8542)).toBe('85.4%');
+  });
+
+  it('formats 1.0 as 100.0%', () => {
+    expect(formatAccuracy(1.0)).toBe('100.0%');
+  });
+
+  it('formats 0 as 0.0%', () => {
+    expect(formatAccuracy(0)).toBe('0.0%');
+  });
+
+  it('formats 0.5 as 50.0%', () => {
+    expect(formatAccuracy(0.5)).toBe('50.0%');
+  });
+
+  it('rounds to one decimal place', () => {
+    expect(formatAccuracy(0.9999)).toBe('100.0%'); // 99.99 rounds up to 100.0
+    expect(formatAccuracy(0.9994)).toBe('99.9%'); // 99.94 rounds down to 99.9
+  });
+});
+
+describe('formatUsd', () => {
+  it('returns em dash for null', () => {
+    expect(formatUsd(null)).toBe('—');
+  });
+
+  it('formats a small cost with 6 decimal places', () => {
+    expect(formatUsd(0.000123)).toBe('$0.000123');
+  });
+
+  it('trims trailing zeros', () => {
+    expect(formatUsd(0.1)).toBe('$0.1'); // the 6-dp form '0.100000' with its trailing zeros removed
+  });
+
+  it('formats zero as $0.0', () => {
+    expect(formatUsd(0)).toBe('$0.0'); // trimming still leaves one digit after the decimal point
+  });
+
+  it('formats a typical cost', () => {
+    expect(formatUsd(0.001234)).toBe('$0.001234');
+  });
+
+  it('formats a cost that fits exactly at 6dp', () => {
+    expect(formatUsd(0.000001)).toBe('$0.000001');
+  });
+});
+
+describe('configToFormState', () => {
+  it('yields defaults including classifierMaxP95LatencyMs "1000" when config is null', () => {
+    const state = configToFormState(null);
+    expect(state.classifierRepetitions).toBe(1);
+    expect(state.deciderRepetitions).toBe(1);
+    expect(state.classifierMaxP95LatencyMs).toBe('1000'); // the latency limit is a string in form state
+    expect(state.classifierModels).toBe(''); // classifier models are a single string in form state
+    expect(state.deciderModels).toEqual([]);
+  });
+});
+
+describe('formStateToConfig round-trip', () => {
+  const baseConfig = {
+    classifierModels: ['model-a', 'model-b'],
+    deciderModels: [{ id: 'model-c', reasoningEffort: null }],
+    minAccuracy: 0.8,
+    switchCostFactor: 3,
+    maxConcurrency: 4,
+    benchmarkUserId: 'user-123',
+    classifierRepetitions: 3,
+    deciderRepetitions: 2,
+    classifierMaxP95LatencyMs: 500,
+    updatedAt: null,
+    updatedBy: null,
+  };
+
+  it('preserves classifierRepetitions, deciderRepetitions, and classifierMaxP95LatencyMs', () => {
+    const state = configToFormState(baseConfig);
+    expect(state.classifierRepetitions).toBe(3);
+    expect(state.deciderRepetitions).toBe(2);
+    expect(state.classifierMaxP95LatencyMs).toBe('500'); // number becomes a string in form state
+
+    const result = formStateToConfig(state, baseConfig);
+    expect(result.classifierRepetitions).toBe(3);
+    expect(result.deciderRepetitions).toBe(2);
+    expect(result.classifierMaxP95LatencyMs).toBe(500); // and converts back to a number in the config
+  });
+
+  it('converts empty-string classifierMaxP95LatencyMs form value to null in config', () => {
+    const state = configToFormState(baseConfig);
+    const stateWithEmpty = { ...state, classifierMaxP95LatencyMs: '' }; // blank out only the latency field
+    const result = formStateToConfig(stateWithEmpty, baseConfig);
+    expect(result.classifierMaxP95LatencyMs).toBeNull();
+  });
+});
diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
new file mode 100644
index 0000000000..9bdfac18ba
--- /dev/null
+++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx
@@ -0,0 +1,984 @@
+'use client';
+
+import {
+  BenchmarkConfigResponseSchema,
+  BenchmarkRoutingTableResponseSchema,
+  BenchmarkRunsResponseSchema,
+  StartBenchmarkRunResponseSchema,
+  type BenchmarkConfig,
+  type BenchmarkKind,
+  type BenchmarkRoutingTableResponse,
+  type BenchmarkRun,
+  type BenchmarkModelSummary,
+  type ReasoningEffort,
+} from '@kilocode/auto-routing-contracts';
+import React, { useCallback, useEffect, useRef, useState } from 'react';
+import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query';
+import { toast } from 'sonner';
+import { ChevronDown, ChevronRight, Play, Plus, Save, Trash2 } from 'lucide-react';
+import { Badge } from '@/components/ui/badge';
+import { Button } from '@/components/ui/button';
+import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
+import { Checkbox } from '@/components/ui/checkbox';
+import { Input } from '@/components/ui/input';
+import { Label } from '@/components/ui/label';
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from '@/components/ui/select';
+import { Skeleton } from '@/components/ui/skeleton';
+import {
+  Table,
+  TableBody,
+  TableCell,
+  TableHead,
+  TableHeader,
+  TableRow,
+} from '@/components/ui/table';
+import { Textarea } from '@/components/ui/textarea';
+import { parseAdminResponse } from './admin-fetch';
+
+// ---------------------------------------------------------------------------
+// Pure helpers (exported for unit tests)
+// ---------------------------------------------------------------------------
+
+/**
+ * Renders a fractional accuracy as a percentage with one decimal place.
+ *
+ * @param n - Accuracy as a fraction (e.g. `0.8` for 80%).
+ * @returns `n * 100` fixed to one decimal and suffixed with `%`.
+ */
+export function formatAccuracy(n: number): string {
+  const percent = n * 100;
+  return percent.toFixed(1) + '%';
+}
+
+/**
+ * Formats a USD amount for display.
+ *
+ * The amount is fixed to six decimal places, then trailing zeros are trimmed
+ * while always keeping at least one digit after the decimal point
+ * (`1` becomes `$1.0`, `0.05` becomes `$0.05`).
+ *
+ * @param n - Amount in USD, or `null` when no amount is available.
+ * @returns The formatted amount prefixed with `$`, or `—` for `null`.
+ */
+export function formatUsd(n: number | null): string {
+  if (n === null) {
+    return '—';
+  }
+  // Keep the shortest fraction that precedes the trailing zeros; fall back to a
+  // single "0" when the whole fraction was zeros.
+  const amount = n
+    .toFixed(6)
+    .replace(/\.(\d*?)0*$/, (_match, kept: string) => `.${kept === '' ? '0' : kept}`);
+  return `$${amount}`;
+}
+
+// ---------------------------------------------------------------------------
+// Fetch helpers
+// ---------------------------------------------------------------------------
+
+/**
+ * GETs the benchmark config endpoint and passes the response to
+ * `parseAdminResponse` together with `BenchmarkConfigResponseSchema`.
+ */
+async function fetchBenchmarkConfig() {
+  return parseAdminResponse(
+    await fetch('/admin/api/auto-routing/benchmark-config'),
+    BenchmarkConfigResponseSchema
+  );
+}
+
+/**
+ * PUTs `config` as JSON to the benchmark config endpoint and passes the
+ * response to `parseAdminResponse` with `BenchmarkConfigResponseSchema`.
+ *
+ * @param config - Config to persist.
+ */
+async function saveBenchmarkConfig(config: BenchmarkConfig) {
+  const request: RequestInit = {
+    method: 'PUT',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify(config),
+  };
+  const response = await fetch('/admin/api/auto-routing/benchmark-config', request);
+  return parseAdminResponse(response, BenchmarkConfigResponseSchema);
+}
+
+/**
+ * GETs the benchmark runs endpoint and passes the response to
+ * `parseAdminResponse` together with `BenchmarkRunsResponseSchema`.
+ */
+async function fetchBenchmarkRuns() {
+  return parseAdminResponse(
+    await fetch('/admin/api/auto-routing/benchmark-runs'),
+    BenchmarkRunsResponseSchema
+  );
+}
+
+/**
+ * POSTs `{ kind, force }` as JSON to the benchmark runs endpoint and passes the
+ * response to `parseAdminResponse` with `StartBenchmarkRunResponseSchema`.
+ *
+ * @param kind - Which benchmark to start.
+ * @param force - Forwarded verbatim in the request body.
+ */
+async function startBenchmarkRun({ kind, force }: { kind: BenchmarkKind; force: boolean }) {
+  const request: RequestInit = {
+    method: 'POST',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify({ kind, force }),
+  };
+  const response = await fetch('/admin/api/auto-routing/benchmark-runs', request);
+  return parseAdminResponse(response, StartBenchmarkRunResponseSchema);
+}
+
+/**
+ * GETs the benchmark routing table endpoint and passes the response to
+ * `parseAdminResponse` together with `BenchmarkRoutingTableResponseSchema`.
+ */
+async function fetchBenchmarkRoutingTable() {
+  const routingTableResponse = await fetch('/admin/api/auto-routing/benchmark-routing-table');
+  const parsed = parseAdminResponse<BenchmarkRoutingTableResponse>(
+    routingTableResponse,
+    BenchmarkRoutingTableResponseSchema
+  );
+  return parsed;
+}
+
+// ---------------------------------------------------------------------------
+// Local form state type for decider model rows
+// ---------------------------------------------------------------------------
+
+/** One editable row of the decider-models table in the config form. */
+type DeciderModelRow = {
+  // Model identifier as typed; formStateToConfig trims it and drops rows whose id is blank.
+  id: string;
+  // Reasoning effort for this model; null is rendered as "None" in the row's select.
+  reasoningEffort: ReasoningEffort | null;
+};
+
+/**
+ * Maps a saved benchmark config onto the editor's form state.
+ *
+ * Classifier models are joined one per line, a missing `benchmarkUserId`
+ * becomes an empty string, and the latency cap is carried as a string (empty
+ * when the config has `null`).
+ *
+ * @param config - Saved config, or `null` when none has been saved yet.
+ * @returns The form state for `config`, or the built-in defaults for `null`.
+ */
+export function configToFormState(config: BenchmarkConfig | null): {
+  classifierModels: string;
+  deciderModels: DeciderModelRow[];
+  minAccuracy: number;
+  switchCostFactor: number;
+  maxConcurrency: number;
+  benchmarkUserId: string;
+  classifierRepetitions: number;
+  deciderRepetitions: number;
+  classifierMaxP95LatencyMs: string;
+} {
+  if (config !== null) {
+    const {
+      classifierModels,
+      deciderModels,
+      minAccuracy,
+      switchCostFactor,
+      maxConcurrency,
+      benchmarkUserId,
+      classifierRepetitions,
+      deciderRepetitions,
+      classifierMaxP95LatencyMs: latencyCap,
+    } = config;
+    return {
+      classifierModels: classifierModels.join('\n'),
+      deciderModels: deciderModels.map(({ id, reasoningEffort }) => ({
+        id,
+        reasoningEffort: reasoningEffort ?? null,
+      })),
+      minAccuracy,
+      switchCostFactor,
+      maxConcurrency,
+      benchmarkUserId: benchmarkUserId ?? '',
+      classifierRepetitions,
+      deciderRepetitions,
+      classifierMaxP95LatencyMs: latencyCap === null ? '' : String(latencyCap),
+    };
+  }
+
+  // Nothing saved yet: model lists and user id start empty, and the numeric
+  // fields start from fixed defaults until the admin saves a config.
+  return {
+    classifierModels: '',
+    deciderModels: [],
+    minAccuracy: 0.7,
+    switchCostFactor: 3,
+    maxConcurrency: 4,
+    benchmarkUserId: '',
+    classifierRepetitions: 1,
+    deciderRepetitions: 1,
+    classifierMaxP95LatencyMs: '1000',
+  };
+}
+
+/**
+ * Converts the editor's form state back into a `BenchmarkConfig` payload.
+ *
+ * - Classifier models: one per line, trimmed, blank lines dropped.
+ * - Decider models: ids trimmed, rows with a blank id dropped.
+ * - `benchmarkUserId`: trimmed; blank becomes `null`.
+ * - `classifierMaxP95LatencyMs`: blank, non-numeric, or zero becomes `null`;
+ *   any other number is truncated to an integer.
+ *
+ * @param state - Current form state (the shape produced by `configToFormState`).
+ * @param base - Config the form was loaded from, or `null`; only its
+ *   `updatedAt` / `updatedBy` are carried over.
+ * @returns The config object built from the form.
+ */
+export function formStateToConfig(
+  state: ReturnType<typeof configToFormState>,
+  base: BenchmarkConfig | null
+): BenchmarkConfig {
+  const classifierModels = state.classifierModels
+    .split('\n')
+    .map(s => s.trim())
+    .filter(s => s.length > 0);
+  const deciderModels = state.deciderModels
+    .filter(row => row.id.trim().length > 0)
+    .map(row => ({
+      id: row.id.trim(),
+      reasoningEffort: row.reasoningEffort ?? null,
+    }));
+  const benchmarkUserId = state.benchmarkUserId.trim();
+  const rawLatency = state.classifierMaxP95LatencyMs.trim();
+  // Number() rather than parseInt(): a number input may hand back exponent
+  // notation ("1e3"), which parseInt would silently read as 1 because it stops
+  // at the first non-digit character.
+  const parsedLatency = rawLatency.length > 0 ? Math.trunc(Number(rawLatency)) : NaN;
+  const classifierMaxP95LatencyMs =
+    Number.isFinite(parsedLatency) && parsedLatency !== 0 ? parsedLatency : null;
+  return {
+    classifierModels,
+    deciderModels,
+    minAccuracy: state.minAccuracy,
+    switchCostFactor: state.switchCostFactor,
+    maxConcurrency: state.maxConcurrency,
+    benchmarkUserId: benchmarkUserId.length > 0 ? benchmarkUserId : null,
+    classifierRepetitions: state.classifierRepetitions,
+    deciderRepetitions: state.deciderRepetitions,
+    classifierMaxP95LatencyMs,
+    updatedAt: base?.updatedAt ?? null,
+    updatedBy: base?.updatedBy ?? null,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Config editor sub-component
+// ---------------------------------------------------------------------------
+
+/**
+ * Card containing the editable benchmark config form.
+ *
+ * Keeps a local copy of the config as form state (via configToFormState) plus
+ * a `dirty` flag. While the form is pristine it re-syncs whenever the `config`
+ * prop changes; once edited it keeps the local values until the admin saves or
+ * clicks "Discard & reload".
+ *
+ * @param config - Config currently held by the parent, or null when none has been saved.
+ * @param onSaved - Called with the save response after a successful save.
+ */
+function BenchmarkConfigEditor({
+  config,
+  onSaved,
+}: {
+  config: BenchmarkConfig | null;
+  onSaved: (next: { config: BenchmarkConfig | null }) => void;
+}) {
+  const [form, setForm] = useState(() => configToFormState(config));
+  // Tracks unsaved local edits. A background config refetch (the runs list
+  // polls; the query also refetches on focus) must not silently overwrite
+  // in-progress edits, so the sync effect only resets the form while pristine.
+  const [dirty, setDirty] = useState(false);
+
+  // Any user edit goes through this so it marks the form dirty.
+  const updateForm = useCallback(
+    (
+      updater: (prev: ReturnType<typeof configToFormState>) => ReturnType<typeof configToFormState>
+    ) => {
+      setForm(updater);
+      setDirty(true);
+    },
+    []
+  );
+
+  // Sync from server config only on initial load / after a save — never while
+  // the admin has unsaved edits (that would discard their work).
+  useEffect(() => {
+    if (!dirty) setForm(configToFormState(config));
+  }, [config, dirty]);
+
+  // Discard local edits and reload the latest server config (explicit conflict
+  // recovery when a remote update arrived while editing).
+  const handleReload = useCallback(() => {
+    setForm(configToFormState(config));
+    setDirty(false);
+  }, [config]);
+
+  const saveMutation = useMutation({
+    mutationFn: saveBenchmarkConfig,
+    onSuccess: data => {
+      // The save is now the source of truth: clear dirty and re-sync so the
+      // next background refetch is free to update the form again.
+      setForm(configToFormState(data.config));
+      setDirty(false);
+      onSaved(data);
+      toast.success('Benchmark config saved');
+    },
+    onError: (error: unknown) => {
+      toast.error(error instanceof Error ? error.message : 'Failed to save benchmark config');
+    },
+  });
+
+  // Appends an empty decider row (blank id, no reasoning effort).
+  const handleAddDeciderRow = useCallback(() => {
+    updateForm(prev => ({
+      ...prev,
+      deciderModels: [...prev.deciderModels, { id: '', reasoningEffort: null }],
+    }));
+  }, [updateForm]);
+
+  // Removes the decider row at `index`.
+  const handleRemoveDeciderRow = useCallback(
+    (index: number) => {
+      updateForm(prev => ({
+        ...prev,
+        deciderModels: prev.deciderModels.filter((_, i) => i !== index),
+      }));
+    },
+    [updateForm]
+  );
+
+  // Merges `patch` into the decider row at `index`, leaving the other rows as they are.
+  const handleDeciderRowChange = useCallback(
+    (index: number, patch: Partial<DeciderModelRow>) => {
+      updateForm(prev => ({
+        ...prev,
+        deciderModels: prev.deciderModels.map((row, i) =>
+          i === index ? { ...row, ...patch } : row
+        ),
+      }));
+    },
+    [updateForm]
+  );
+
+  // Converts the current form state to a config (updatedAt/updatedBy are taken
+  // from the loaded config) and submits it through the save mutation.
+  const handleSave = useCallback(() => {
+    saveMutation.mutate(formStateToConfig(form, config));
+  }, [form, config, saveMutation]);
+
+  return (
+    <Card className="rounded-lg">
+      <CardHeader className="p-4 pb-2">
+        <CardTitle className="text-base">Benchmark Config</CardTitle>
+      </CardHeader>
+      <CardContent className="flex flex-col gap-4 p-4 pt-0">
+        {/* Classifier models */}
+        <div className="flex flex-col gap-1.5">
+          <Label htmlFor="benchmark-classifier-models" className="text-sm font-medium">
+            Classifier models (one per line)
+          </Label>
+          <Textarea
+            id="benchmark-classifier-models"
+            value={form.classifierModels}
+            onChange={e => updateForm(prev => ({ ...prev, classifierModels: e.target.value }))}
+            rows={4}
+            className="font-mono text-xs"
+            placeholder="openai/gpt-4o-mini"
+          />
+        </div>
+
+        {/* Decider models table */}
+        <div className="flex flex-col gap-1.5">
+          <Label className="text-sm font-medium">Decider models</Label>
+          <div className="rounded-md border">
+            <Table>
+              <TableHeader>
+                <TableRow>
+                  <TableHead>Model ID</TableHead>
+                  <TableHead className="w-36">Reasoning effort</TableHead>
+                  <TableHead className="w-12" />
+                </TableRow>
+              </TableHeader>
+              <TableBody>
+                {form.deciderModels.map((row, index) => (
+                  <TableRow key={index}>
+                    <TableCell className="py-2">
+                      <Input
+                        value={row.id}
+                        onChange={e => handleDeciderRowChange(index, { id: e.target.value })}
+                        className="h-8 font-mono text-xs"
+                        placeholder="openai/gpt-4o"
+                        aria-label={`Decider model ${index + 1} ID`}
+                      />
+                    </TableCell>
+                    <TableCell className="py-2">
+                      <Select
+                        value={row.reasoningEffort ?? 'none'}
+                        onValueChange={value =>
+                          handleDeciderRowChange(index, {
+                            reasoningEffort: value === 'none' ? null : (value as ReasoningEffort),
+                          })
+                        }
+                      >
+                        <SelectTrigger
+                          className="h-8 text-xs"
+                          aria-label={`Model ${index + 1} reasoning effort`}
+                        >
+                          <SelectValue />
+                        </SelectTrigger>
+                        <SelectContent>
+                          <SelectItem value="none">None</SelectItem>
+                          <SelectItem value="minimal">minimal</SelectItem>
+                          <SelectItem value="low">low</SelectItem>
+                          <SelectItem value="medium">medium</SelectItem>
+                          <SelectItem value="high">high</SelectItem>
+                        </SelectContent>
+                      </Select>
+                    </TableCell>
+                    <TableCell className="py-2">
+                      <Button
+                        type="button"
+                        variant="ghost"
+                        size="icon"
+                        className="h-8 w-8 text-destructive hover:text-destructive"
+                        onClick={() => handleRemoveDeciderRow(index)}
+                        aria-label={`Remove decider model ${index + 1}`}
+                      >
+                        <Trash2 className="size-3.5" />
+                      </Button>
+                    </TableCell>
+                  </TableRow>
+                ))}
+              </TableBody>
+            </Table>
+          </div>
+          <Button
+            type="button"
+            variant="outline"
+            size="sm"
+            className="w-fit"
+            onClick={handleAddDeciderRow}
+          >
+            <Plus className="size-3.5" />
+            Add model
+          </Button>
+        </div>
+
+        {/* Numeric inputs */}
+        <div className="grid gap-4 sm:grid-cols-2">
+          <div className="flex flex-col gap-1.5">
+            <Label htmlFor="benchmark-min-accuracy" className="text-sm font-medium">
+              Min accuracy (0–1)
+            </Label>
+            <Input
+              id="benchmark-min-accuracy"
+              type="number"
+              min={0}
+              max={1}
+              step={0.05}
+              value={form.minAccuracy}
+              onChange={e =>
+                updateForm(prev => ({ ...prev, minAccuracy: parseFloat(e.target.value) || 0 }))
+              }
+              className="h-8 w-40 tabular-nums"
+            />
+          </div>
+          <div className="flex flex-col gap-1.5">
+            <Label htmlFor="benchmark-switch-cost-factor" className="text-sm font-medium">
+              Switch cost factor (1–100)
+            </Label>
+            <Input
+              id="benchmark-switch-cost-factor"
+              type="number"
+              min={1}
+              max={100}
+              step={0.5}
+              value={form.switchCostFactor}
+              onChange={e =>
+                updateForm(prev => ({ ...prev, switchCostFactor: parseFloat(e.target.value) || 1 }))
+              }
+              className="h-8 w-40 tabular-nums"
+            />
+          </div>
+          <div className="flex flex-col gap-1.5">
+            <Label htmlFor="benchmark-max-concurrency" className="text-sm font-medium">
+              Max concurrency (1–16)
+            </Label>
+            <Input
+              id="benchmark-max-concurrency"
+              type="number"
+              min={1}
+              max={16}
+              step={1}
+              value={form.maxConcurrency}
+              onChange={e =>
+                updateForm(prev => ({ ...prev, maxConcurrency: parseInt(e.target.value, 10) || 1 }))
+              }
+              className="h-8 w-40 tabular-nums"
+            />
+          </div>
+          <div className="flex flex-col gap-1.5">
+            <Label htmlFor="benchmark-classifier-repetitions" className="text-sm font-medium">
+              Classifier repetitions (1–5)
+            </Label>
+            <Input
+              id="benchmark-classifier-repetitions"
+              type="number"
+              min={1}
+              max={5}
+              step={1}
+              value={form.classifierRepetitions}
+              onChange={e =>
+                updateForm(prev => ({
+                  ...prev,
+                  classifierRepetitions: parseInt(e.target.value, 10) || 1,
+                }))
+              }
+              className="h-8 w-40 tabular-nums"
+            />
+          </div>
+          <div className="flex flex-col gap-1.5">
+            <Label htmlFor="benchmark-decider-repetitions" className="text-sm font-medium">
+              Decider repetitions (1–5)
+            </Label>
+            <Input
+              id="benchmark-decider-repetitions"
+              type="number"
+              min={1}
+              max={5}
+              step={1}
+              value={form.deciderRepetitions}
+              onChange={e =>
+                updateForm(prev => ({
+                  ...prev,
+                  deciderRepetitions: parseInt(e.target.value, 10) || 1,
+                }))
+              }
+              className="h-8 w-40 tabular-nums"
+            />
+          </div>
+        </div>
+
+        {/* Benchmark user id */}
+        <div className="flex flex-col gap-1.5">
+          <Label htmlFor="benchmark-user-id" className="text-sm font-medium">
+            Benchmark user id
+          </Label>
+          <Input
+            id="benchmark-user-id"
+            value={form.benchmarkUserId}
+            onChange={e => updateForm(prev => ({ ...prev, benchmarkUserId: e.target.value }))}
+            className="h-8 font-mono text-xs"
+            placeholder="(unset)"
+          />
+          <p className="text-muted-foreground text-xs">
+            Kilo user the decider CLI runs bill to; decider runs fail until set.
+          </p>
+        </div>
+
+        {/* Classifier max p95 latency */}
+        <div className="flex flex-col gap-1.5">
+          <Label htmlFor="benchmark-classifier-max-p95-latency" className="text-sm font-medium">
+            Classifier max p95 latency (ms)
+          </Label>
+          <Input
+            id="benchmark-classifier-max-p95-latency"
+            type="number"
+            min={1}
+            step={1}
+            value={form.classifierMaxP95LatencyMs}
+            onChange={e =>
+              updateForm(prev => ({ ...prev, classifierMaxP95LatencyMs: e.target.value }))
+            }
+            className="h-8 w-40 tabular-nums"
+            placeholder="(no limit)"
+          />
+          <p className="text-muted-foreground text-xs">
+            Winner must classify under this p95 latency; empty disables the latency gate.
+          </p>
+        </div>
+
+        {/* Actions + metadata */}
+        <div className="flex flex-col gap-2">
+          <div className="flex flex-wrap items-center gap-2">
+            <Button type="button" onClick={handleSave} disabled={saveMutation.isPending}>
+              <Save className="size-4" />
+              Save config
+            </Button>
+            {dirty ? (
+              <>
+                <Button type="button" variant="outline" onClick={handleReload}>
+                  Discard &amp; reload
+                </Button>
+                <span className="text-muted-foreground text-xs">Unsaved changes</span>
+              </>
+            ) : null}
+          </div>
+          {config === null ? (
+            <p className="text-muted-foreground text-xs">
+              No config saved yet — runs cannot start until one is saved.
+            </p>
+          ) : config.updatedAt ? (
+            <p className="text-muted-foreground text-xs">
+              Last updated {config.updatedAt}
+              {config.updatedBy ? ` by ${config.updatedBy}` : ''}
+            </p>
+          ) : null}
+        </div>
+      </CardContent>
+    </Card>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Run summaries expandable table
+// ---------------------------------------------------------------------------
+
+// Sort rank per tier for decider summaries (lower sorts first); RunSummariesTable
+// treats any tier missing from this map as rank 3.
+const TIER_ORDER = { low: 0, medium: 1, high: 2, '*': 3 } as const;
+
+/**
+ * Detail row shown beneath an expanded run: the run's full error text (if any)
+ * followed by a per-model summary table, or a "No summaries" note.
+ *
+ * Decider runs get an extra Tier column and are ordered by tier rank
+ * (TIER_ORDER), then by accuracy descending; other runs keep their given order.
+ *
+ * @param run - Run whose summaries are rendered.
+ * @param id - DOM id placed on the detail cell.
+ */
+function RunSummariesTable({ run, id }: { run: BenchmarkRun; id: string }) {
+  const showTier = run.kind === 'decider';
+
+  // Tiers missing from TIER_ORDER share the last rank.
+  const rankOf = (summary: BenchmarkModelSummary) =>
+    TIER_ORDER[summary.tier as keyof typeof TIER_ORDER] ?? 3;
+  const formatMs = (ms: number | null) => (ms !== null ? `${ms.toFixed(0)} ms` : '—');
+
+  let rows: BenchmarkModelSummary[] = run.summaries;
+  if (showTier) {
+    // Sort a copy so the run object itself is left untouched.
+    rows = run.summaries.slice().sort((left, right) => {
+      const byTier = rankOf(left) - rankOf(right);
+      return byTier !== 0 ? byTier : right.accuracy - left.accuracy;
+    });
+  }
+
+  // Full error text (the collapsed row's Error cell is truncated).
+  const errorBanner = run.error ? (
+    <div className="border-destructive/40 bg-destructive/10 text-destructive mb-2 rounded-md border px-3 py-2 text-xs whitespace-pre-wrap break-words">
+      {run.error}
+    </div>
+  ) : null;
+
+  const summariesBody =
+    rows.length === 0 ? (
+      <p className="text-muted-foreground py-1 text-center text-xs">No summaries</p>
+    ) : (
+      <div className="overflow-x-auto">
+        <Table className="min-w-max">
+          <TableHeader>
+            <TableRow>
+              <TableHead className="text-xs">Model</TableHead>
+              {showTier ? <TableHead className="text-xs">Tier</TableHead> : null}
+              <TableHead className="text-right text-xs">Accuracy</TableHead>
+              <TableHead className="text-right text-xs">Avg cost</TableHead>
+              <TableHead className="text-right text-xs">Avg latency</TableHead>
+              <TableHead className="text-right text-xs">p50 latency</TableHead>
+              <TableHead className="text-right text-xs">p95 latency</TableHead>
+              <TableHead className="text-right text-xs">Cases</TableHead>
+              <TableHead className="text-right text-xs">Errors</TableHead>
+              <TableHead className="text-right text-xs">Timeouts</TableHead>
+            </TableRow>
+          </TableHeader>
+          <TableBody>
+            {rows.map((summary, position) => (
+              <TableRow key={`${summary.model}-${summary.tier}-${position}`}>
+                <TableCell className="max-w-56 truncate font-mono text-xs">
+                  {summary.model}
+                </TableCell>
+                {showTier ? (
+                  <TableCell className="text-xs capitalize">{summary.tier}</TableCell>
+                ) : null}
+                <TableCell className="text-right tabular-nums text-xs">
+                  {formatAccuracy(summary.accuracy)}
+                </TableCell>
+                <TableCell className="text-right tabular-nums text-xs">
+                  {formatUsd(summary.avgCostUsd)}
+                </TableCell>
+                <TableCell className="text-right tabular-nums text-xs">
+                  {summary.avgLatencyMs.toFixed(0)} ms
+                </TableCell>
+                <TableCell className="text-right tabular-nums text-xs">
+                  {formatMs(summary.p50LatencyMs)}
+                </TableCell>
+                <TableCell className="text-right tabular-nums text-xs">
+                  {formatMs(summary.p95LatencyMs)}
+                </TableCell>
+                <TableCell className="text-right tabular-nums text-xs">{summary.cases}</TableCell>
+                <TableCell className="text-right tabular-nums text-xs">{summary.errors}</TableCell>
+                <TableCell className="text-right tabular-nums text-xs">
+                  {summary.timeouts}
+                </TableCell>
+              </TableRow>
+            ))}
+          </TableBody>
+        </Table>
+      </div>
+    );
+
+  return (
+    <TableRow className="bg-muted/30">
+      <TableCell colSpan={6} id={id} className="px-4 py-2">
+        {errorBanner}
+        {summariesBody}
+      </TableCell>
+    </TableRow>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Runs table
+// ---------------------------------------------------------------------------
+
+/**
+ * Picks the badge variant for a run status: `completed` is `default`,
+ * `running` is `secondary`, and every other status is `destructive`.
+ *
+ * @param status - Status of the run being rendered.
+ * @returns The Badge variant name to use.
+ */
+function statusBadgeVariant(
+  status: BenchmarkRun['status']
+): 'default' | 'secondary' | 'destructive' {
+  switch (status) {
+    case 'completed':
+      return 'default';
+    case 'running':
+      return 'secondary';
+    default:
+      return 'destructive';
+  }
+}
+
+/**
+ * Table-body rows for the benchmark runs list.
+ *
+ * Renders one row per run plus, for each run the admin has expanded, a
+ * RunSummariesTable detail row. With no runs it renders a single "No runs yet"
+ * placeholder row.
+ *
+ * @param runs - Runs to list, rendered in the order given.
+ */
+function BenchmarkRunsTable({ runs }: { runs: BenchmarkRun[] }) {
+  // Ids of the runs whose detail row is currently open.
+  const [openRunIds, setOpenRunIds] = useState<Set<string>>(() => new Set());
+
+  const toggleExpand = useCallback((runId: string) => {
+    setOpenRunIds(current => {
+      const updated = new Set(current);
+      // delete() reports false when the id was absent, i.e. the row was collapsed.
+      if (!updated.delete(runId)) {
+        updated.add(runId);
+      }
+      return updated;
+    });
+  }, []);
+
+  const hasRuns = runs.length > 0;
+  if (!hasRuns) {
+    return (
+      <TableRow>
+        <TableCell colSpan={6} className="text-muted-foreground h-16 text-center">
+          No runs yet
+        </TableCell>
+      </TableRow>
+    );
+  }
+
+  // Row click is a mouse convenience; the button in the first cell is the
+  // accessible (keyboard/AT) control that owns aria-expanded.
+  const rows = runs.map(run => {
+    const isOpen = openRunIds.has(run.id);
+    const detailsId = `run-summaries-${run.id}`;
+    const Chevron = isOpen ? ChevronDown : ChevronRight;
+    const toggleLabel = `${isOpen ? 'Collapse' : 'Expand'} ${run.kind} run details`;
+
+    return (
+      <React.Fragment key={run.id}>
+        <TableRow className="cursor-pointer" onClick={() => toggleExpand(run.id)}>
+          <TableCell className="w-8 py-2">
+            <button
+              type="button"
+              onClick={event => {
+                // Keep the row's own click handler from toggling a second time.
+                event.stopPropagation();
+                toggleExpand(run.id);
+              }}
+              aria-expanded={isOpen}
+              aria-controls={isOpen ? detailsId : undefined}
+              aria-label={toggleLabel}
+              className="text-muted-foreground hover:text-foreground focus-visible:ring-ring inline-flex size-5 items-center justify-center rounded focus-visible:ring-2 focus-visible:outline-none"
+            >
+              <Chevron className="size-4" />
+            </button>
+          </TableCell>
+          <TableCell className="py-2 capitalize text-sm">{run.kind}</TableCell>
+          <TableCell className="py-2">
+            <Badge variant={statusBadgeVariant(run.status)} className="capitalize">
+              {run.status}
+            </Badge>
+          </TableCell>
+          <TableCell className="py-2 text-xs tabular-nums">{run.startedAt}</TableCell>
+          <TableCell className="py-2 text-xs tabular-nums">{run.completedAt ?? '—'}</TableCell>
+          <TableCell
+            className="py-2 text-xs text-destructive max-w-48 truncate"
+            title={run.error ?? undefined}
+          >
+            {run.error ?? ''}
+          </TableCell>
+        </TableRow>
+        {isOpen ? <RunSummariesTable run={run} id={detailsId} /> : null}
+      </React.Fragment>
+    );
+  });
+
+  return <>{rows}</>;
+}
+
+// ---------------------------------------------------------------------------
+// Routing table view
+// ---------------------------------------------------------------------------
+
+/**
+ * Read-only view of the published routing table: a metadata line (version,
+ * generation time, min accuracy, source) followed by one candidates table per
+ * tier in the order low, medium, high. Shows a placeholder message when the
+ * response carries no table.
+ *
+ * @param data - Routing-table response to render.
+ */
+function RoutingTableView({ data }: { data: BenchmarkRoutingTableResponse }) {
+  const table = data.table;
+  if (!table) {
+    return <p className="text-muted-foreground text-sm">No routing table published yet.</p>;
+  }
+
+  const tierNames = ['low', 'medium', 'high'] as const;
+
+  return (
+    <div className="flex flex-col gap-3">
+      <div className="text-muted-foreground text-xs flex flex-wrap gap-x-4 gap-y-1">
+        <span>
+          Version: <span className="font-mono">{table.version}</span>
+        </span>
+        <span>Generated: {table.generatedAt}</span>
+        <span>Min accuracy: {formatAccuracy(table.minAccuracy)}</span>
+        <span>
+          Source: <span className="capitalize">{table.source}</span>
+        </span>
+      </div>
+
+      {tierNames.map(tier => {
+        const candidates = table.tiers[tier];
+        return (
+          <div key={tier}>
+            <p className="text-sm font-medium capitalize mb-1.5">{tier} tier</p>
+            <div className="overflow-x-auto rounded-md border">
+              <Table className="min-w-max">
+                <TableHeader>
+                  <TableRow>
+                    <TableHead>Model</TableHead>
+                    <TableHead className="text-right">Accuracy</TableHead>
+                    <TableHead className="text-right">Avg cost</TableHead>
+                    <TableHead>Threshold</TableHead>
+                  </TableRow>
+                </TableHeader>
+                <TableBody>
+                  {candidates.map((candidate, position) => {
+                    const passes = candidate.meetsThreshold;
+                    return (
+                      <TableRow key={`${tier}-${candidate.model}-${position}`}>
+                        <TableCell className="max-w-56 truncate font-mono text-xs">
+                          {candidate.model}
+                        </TableCell>
+                        <TableCell className="text-right tabular-nums text-xs">
+                          {formatAccuracy(candidate.accuracy)}
+                        </TableCell>
+                        <TableCell className="text-right tabular-nums text-xs">
+                          {formatUsd(candidate.avgCostUsd)}
+                        </TableCell>
+                        <TableCell>
+                          <Badge variant={passes ? 'default' : 'secondary'}>
+                            {passes ? 'meets' : 'below'}
+                          </Badge>
+                        </TableCell>
+                      </TableRow>
+                    );
+                  })}
+                </TableBody>
+              </Table>
+            </div>
+          </div>
+        );
+      })}
+    </div>
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Main exported section component
+// ---------------------------------------------------------------------------
+
+/** Admin panel section: benchmark config editor, run controls, run history and routing table. */
+export function BenchmarksSection() {
+  const queryClient = useQueryClient();
+  const [forceRerun, setForceRerun] = useState(false);
+
+  const configQuery = useQuery({
+    queryKey: ['auto-routing', 'benchmark-config'],
+    queryFn: fetchBenchmarkConfig,
+  });
+
+  const runsQuery = useQuery({
+    queryKey: ['auto-routing', 'benchmark-runs'],
+    queryFn: fetchBenchmarkRuns,
+  });
+
+  const routingTableQuery = useQuery({
+    queryKey: ['auto-routing', 'benchmark-routing-table'],
+    queryFn: fetchBenchmarkRoutingTable,
+  });
+
+  // Poll runs every 30s while any run is 'running'
+  const hasRunningRun = runsQuery.data?.runs.some(r => r.status === 'running') ?? false;
+  const refetchRuns = runsQuery.refetch;
+  useEffect(() => {
+    if (!hasRunningRun) return;
+    const id = setInterval(() => {
+      void refetchRuns();
+    }, 30_000);
+    return () => clearInterval(id);
+  }, [hasRunningRun, refetchRuns]);
+
+  // A publish (a finished run, or a republish from existing results) changes the
+  // routing table / classifier winner, which live in their own query caches.
+  // Invalidate them, or they stay stale until a focus refetch or manual reload.
+  const invalidatePublishedData = useCallback(() => {
+    void queryClient.invalidateQueries({ queryKey: ['auto-routing', 'benchmark-routing-table'] });
+    void queryClient.invalidateQueries({ queryKey: ['auto-routing', 'benchmark-config'] });
+  }, [queryClient]);
+  // Runs publish on completion: invalidate on the running→terminal edge.
+  const prevHasRunningRun = useRef(hasRunningRun);
+  useEffect(() => {
+    if (prevHasRunningRun.current && !hasRunningRun) invalidatePublishedData();
+    prevHasRunningRun.current = hasRunningRun;
+  }, [hasRunningRun, invalidatePublishedData]);
+
+  const startRunMutation = useMutation({
+    mutationFn: startBenchmarkRun,
+    onSuccess: (data, variables) => {
+      const kindLabel = variables.kind === 'classifier' ? 'Classifier' : 'Decider';
+      if (data.enqueuedModels === 0) {
+        toast.success(`All models already have results — republished from existing data`);
+        invalidatePublishedData(); // no models enqueued: the edge above may never be observed
+      } else {
+        toast.success(
+          `${kindLabel} benchmark started — ${data.enqueuedModels} models enqueued, ${data.skippedModels.length} skipped`
+        );
+      }
+      void queryClient.invalidateQueries({ queryKey: ['auto-routing', 'benchmark-runs'] });
+    },
+    onError: (error: unknown) => {
+      toast.error(error instanceof Error ? error.message : 'Failed to start benchmark run');
+    },
+  });
+
+  const handleConfigSaved = useCallback(
+    (next: { config: BenchmarkConfig | null }) => {
+      queryClient.setQueryData(['auto-routing', 'benchmark-config'], next);
+    },
+    [queryClient]
+  );
+
+  const anyRunning = hasRunningRun || startRunMutation.isPending;
+
+  return (
+    <div className="flex flex-col gap-4">
+      <div>
+        <h2 className="text-lg font-semibold">Benchmarks</h2>
+        <p className="text-muted-foreground text-sm">
+          Benchmark configuration, runs, and published routing table.
+        </p>
+      </div>
+
+      {/* Config editor */}
+      {configQuery.isLoading ? (
+        <Card className="rounded-lg">
+          <CardContent className="p-4">
+            <Skeleton className="h-48 w-full" />
+          </CardContent>
+        </Card>
+      ) : configQuery.error ? (
+        <div className="border-destructive/40 bg-destructive/10 text-destructive rounded-md border px-3 py-2 text-sm">
+          {configQuery.error instanceof Error
+            ? configQuery.error.message
+            : 'Failed to load benchmark config'}
+        </div>
+      ) : configQuery.data ? (
+        <BenchmarkConfigEditor config={configQuery.data.config} onSaved={handleConfigSaved} />
+      ) : null}
+
+      {/* Run controls */}
+      <Card className="rounded-lg">
+        <CardHeader className="p-4 pb-2">
+          <CardTitle className="text-base">Run Benchmark</CardTitle>
+        </CardHeader>
+        <CardContent className="flex flex-col gap-3 p-4 pt-0">
+          <p className="text-muted-foreground text-xs">
+            Runs are triggered manually. Models with existing results are skipped unless "Re-run
+            models with existing results" is checked.
+          </p>
+          <div className="flex items-center gap-2">
+            <Checkbox
+              id="force-rerun"
+              checked={forceRerun}
+              onCheckedChange={checked => setForceRerun(checked === true)}
+            />
+            <Label htmlFor="force-rerun" className="text-sm font-normal cursor-pointer">
+              Re-run models with existing results
+            </Label>
+          </div>
+          <div className="flex flex-wrap gap-2">
+            <Button
+              type="button"
+              variant="outline"
+              disabled={anyRunning}
+              onClick={() => startRunMutation.mutate({ kind: 'classifier', force: forceRerun })}
+            >
+              <Play className="size-4" />
+              Run classifier benchmark
+            </Button>
+            <Button
+              type="button"
+              variant="outline"
+              disabled={anyRunning}
+              onClick={() => startRunMutation.mutate({ kind: 'decider', force: forceRerun })}
+            >
+              <Play className="size-4" />
+              Run decider benchmark
+            </Button>
+            {hasRunningRun ? (
+              <p className="text-muted-foreground self-center text-xs">
+                A benchmark is running — refreshing every 30 s
+              </p>
+            ) : null}
+          </div>
+        </CardContent>
+      </Card>
+
+      {/* Runs table */}
+      <Card className="rounded-lg">
+        <CardHeader className="p-4 pb-2">
+          <CardTitle className="text-base">Benchmark Runs</CardTitle>
+        </CardHeader>
+        <CardContent className="p-4 pt-0">
+          {runsQuery.isLoading ? (
+            <Skeleton className="h-24 w-full" />
+          ) : runsQuery.error ? (
+            <div className="border-destructive/40 bg-destructive/10 text-destructive rounded-md border px-3 py-2 text-sm">
+              {runsQuery.error instanceof Error
+                ? runsQuery.error.message
+                : 'Failed to load benchmark runs'}
+            </div>
+          ) : (
+            <Table>
+              <TableHeader>
+                <TableRow>
+                  <TableHead className="w-8" />
+                  <TableHead>Kind</TableHead>
+                  <TableHead>Status</TableHead>
+                  <TableHead>Started</TableHead>
+                  <TableHead>Completed</TableHead>
+                  <TableHead>Error</TableHead>
+                </TableRow>
+              </TableHeader>
+              <TableBody>
+                <BenchmarkRunsTable runs={runsQuery.data?.runs ?? []} />
+              </TableBody>
+            </Table>
+          )}
+        </CardContent>
+      </Card>
+
+      {/* Routing table */}
+      <Card className="rounded-lg">
+        <CardHeader className="p-4 pb-2">
+          <CardTitle className="text-base">Published Routing Table</CardTitle>
+        </CardHeader>
+        <CardContent className="p-4 pt-0">
+          {routingTableQuery.isLoading ? (
+            <Skeleton className="h-32 w-full" />
+          ) : routingTableQuery.error ? (
+            <div className="border-destructive/40 bg-destructive/10 text-destructive rounded-md border px-3 py-2 text-sm">
+              {routingTableQuery.error instanceof Error
+                ? routingTableQuery.error.message
+                : 'Failed to load routing table'}
+            </div>
+          ) : routingTableQuery.data ? (
+            <RoutingTableView data={routingTableQuery.data} />
+          ) : null}
+        </CardContent>
+      </Card>
+    </div>
+  );
+}
diff --git a/apps/web/src/app/admin/auto-routing/admin-fetch.ts b/apps/web/src/app/admin/auto-routing/admin-fetch.ts
new file mode 100644
index 0000000000..b29d538e09
--- /dev/null
+++ b/apps/web/src/app/admin/auto-routing/admin-fetch.ts
@@ -0,0 +1,33 @@
+import * as z from 'zod';
+
+const AdminApiErrorSchema = z.object({ error: z.string().optional() });
+
+/**
+ * Reads an admin API response and validates a successful body against `schema`.
+ *
+ * @param response - Fetch response to consume; its body is read exactly once.
+ * @param schema - Zod schema the JSON body must satisfy when `response.ok`.
+ * @returns The body as parsed by `schema`.
+ * @throws Error with the body's `error` string, or `Request failed: <status>` when the
+ *   error body has none or is not JSON, if the response is not OK. Errors from
+ *   `response.json()` (OK response) and `schema.parse` propagate unchanged.
+ */
+export async function parseAdminResponse<T extends object>(
+  response: Response,
+  schema: z.ZodType<T>
+): Promise<T> {
+  if (!response.ok) {
+    // Error responses are not guaranteed to be JSON (e.g. an HTML error page from
+    // a proxy). Treat an unparseable body as "no error message" so callers get
+    // `Request failed: <status>` instead of a JSON syntax error.
+    const errorBody: unknown = await response.json().catch(() => null);
+    const parsedError = AdminApiErrorSchema.safeParse(errorBody);
+    throw new Error(
+      parsedError.success && parsedError.data.error
+        ? parsedError.data.error
+        : `Request failed: ${response.status}`
+    );
+  }
+  const body: unknown = await response.json();
+  return schema.parse(body);
+}
diff --git a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts
new file mode 100644
index 0000000000..798af89524
--- /dev/null
+++ b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.test.ts
@@ -0,0 +1,87 @@
+import { NextRequest } from 'next/server';
+import { generateApiToken } from '@/lib/tokens';
+
+jest.mock('@/lib/config.server', () => ({
+  INTERNAL_API_SECRET: 'internal-secret',
+}));
+
+// Chainable drizzle query builder mock. `.limit()` resolves to the rows we set.
+const mockRows: unknown[] = [];
+jest.mock('@/lib/drizzle', () => ({
+  db: {
+    select: () => ({
+      from: () => ({
+        where: () => ({
+          limit: () => Promise.resolve(mockRows),
+        }),
+      }),
+    }),
+  },
+}));
+
+jest.mock('@/lib/tokens', () => ({
+  generateApiToken: jest.fn(() => 'minted-token'),
+}));
+
+import { POST } from './route';
+
+const mockGenerateApiToken = jest.mocked(generateApiToken);
+
+function createRequest(body: unknown, headers: Record<string, string> = {}) {
+  // JSON-encode the payload; caller-supplied headers override the default content type.
+  const url = 'http://localhost:3000/api/internal/auto-routing-benchmark/token';
+  const payload = JSON.stringify(body);
+  const mergedHeaders = { 'content-type': 'application/json', ...headers };
+  return new NextRequest(url, { method: 'POST', body: payload, headers: mergedHeaders });
+}
+
+describe('POST /api/internal/auto-routing-benchmark/token', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+    mockRows.length = 0;
+  });
+
+  it('returns 401 without the bearer secret', async () => {
+    mockRows.push({ id: 'user-1', api_token_pepper: 'pepper' });
+    const res = await POST(createRequest({ userId: 'user-1' }));
+    expect(res.status).toBe(401);
+    expect(mockGenerateApiToken).not.toHaveBeenCalled();
+  });
+
+  it('returns 401 with the wrong bearer secret', async () => {
+    const res = await POST(createRequest({ userId: 'user-1' }, { authorization: 'Bearer wrong' }));
+    expect(res.status).toBe(401);
+    // A rejected caller must never cause a token to be minted.
+    expect(mockGenerateApiToken).not.toHaveBeenCalled();
+  });
+
+  it('returns 400 for an invalid body', async () => {
+    const res = await POST(createRequest({}, { authorization: 'Bearer internal-secret' }));
+    expect(res.status).toBe(400);
+  });
+
+  it('returns 404 when the user does not exist', async () => {
+    const res = await POST(
+      createRequest({ userId: 'missing' }, { authorization: 'Bearer internal-secret' })
+    );
+    expect(res.status).toBe(404);
+    expect(mockGenerateApiToken).not.toHaveBeenCalled();
+  });
+
+  it('mints a 6h token for an existing user', async () => {
+    const user = { id: 'user-1', api_token_pepper: 'pepper' };
+    mockRows.push(user);
+    const res = await POST(
+      createRequest({ userId: 'user-1' }, { authorization: 'Bearer internal-secret' })
+    );
+    expect(res.status).toBe(200);
+    const json = (await res.json()) as { token: string; expiresAt: string };
+    expect(json.token).toBe('minted-token');
+    expect(typeof json.expiresAt).toBe('string');
+    expect(mockGenerateApiToken).toHaveBeenCalledWith(
+      user,
+      { tokenSource: 'auto-routing-benchmark' },
+      { expiresIn: 6 * 60 * 60 }
+    );
+  });
+});
diff --git a/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts
new file mode 100644
index 0000000000..ef9bf4f93e
--- /dev/null
+++ b/apps/web/src/app/api/internal/auto-routing-benchmark/token/route.ts
@@ -0,0 +1,87 @@
+/**
+ * Internal API: mint a short-lived user API token for the auto-routing
+ * decider benchmark.
+ *
+ * Called by:
+ * - services/auto-routing-benchmark — the decider benchmark runs each case
+ *   through the real `kilo` CLI inside a Cloudflare Container. The CLI
+ *   authenticates against the gateway with a user API token, so the worker
+ *   fetches a fresh, short-lived token for the configured benchmark user
+ *   once per queue message.
+ *
+ * Auth: shared internal secret over `Authorization: Bearer <secret>` — this
+ * is the exact header the benchmark worker sends
+ * (`Authorization: Bearer ${INTERNAL_API_SECRET_PROD}`), and
+ * INTERNAL_API_SECRET_PROD holds the same value as INTERNAL_API_SECRET here.
+ *
+ * The minted token is a full user API token (includes apiTokenPepper) so the
+ * gateway accepts it as a real user token; an internal-service token would be
+ * rejected by gateway pepper validation. It expires in 6 hours.
+ *
+ * URL: POST /api/internal/auto-routing-benchmark/token
+ */
+
+import type { NextRequest } from 'next/server';
+import { NextResponse } from 'next/server';
+import { timingSafeEqual } from '@kilocode/encryption';
+import { z } from 'zod';
+import { eq } from 'drizzle-orm';
+import { kilocode_users } from '@kilocode/db/schema';
+import { db } from '@/lib/drizzle';
+import { generateApiToken } from '@/lib/tokens';
+import { INTERNAL_API_SECRET } from '@/lib/config.server';
+
+const RequestSchema = z.object({ userId: z.string().min(1) });
+
+const SIX_HOURS_IN_SECONDS = 6 * 60 * 60;
+
+// Local bearer-token parser: the scheme match is case-insensitive (RFC 6750 §2.1).
+// Deliberately not imported from @kilocode/worker-utils, whose transitive `jose`
+// ESM import breaks under jest's CJS transform.
+function extractBearerToken(authHeader: string | null): string | null {
+  const header = authHeader?.trim() ?? '';
+  const scheme = header.slice(0, 7).toLowerCase();
+  const credentials = header.slice(7).trim();
+  return scheme === 'bearer ' && credentials ? credentials : null;
+}
+
+export async function POST(req: NextRequest) {
+  // 401 unless the secret is configured and timingSafeEqual accepts the caller's bearer token.
+  const token = extractBearerToken(req.headers.get('authorization'));
+  if (!INTERNAL_API_SECRET || !token || !timingSafeEqual(token, INTERNAL_API_SECRET)) {
+    return NextResponse.json({ error: 'Unauthorized' }, { status: 401 });
+  }
+  // The body must be JSON; anything req.json() cannot parse is a 400.
+  let body: unknown;
+  try {
+    body = await req.json();
+  } catch {
+    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
+  }
+  // Validate against RequestSchema; on failure the zod issues are returned with the 400.
+  const parsed = RequestSchema.safeParse(body);
+  if (!parsed.success) {
+    return NextResponse.json(
+      { error: 'Invalid request body', issues: parsed.error.issues },
+      { status: 400 }
+    );
+  }
+  // Look up the user the token is minted for; 404 when no row matches the id.
+  const [user] = await db
+    .select()
+    .from(kilocode_users)
+    .where(eq(kilocode_users.id, parsed.data.userId))
+    .limit(1);
+
+  if (!user) {
+    return NextResponse.json({ error: 'User not found' }, { status: 404 });
+  }
+  // NOTE(review): expiresAt is derived here from Date.now(), independently of the token itself.
+  const apiToken = generateApiToken(
+    user,
+    { tokenSource: 'auto-routing-benchmark' },
+    { expiresIn: SIX_HOURS_IN_SECONDS }
+  );
+  const expiresAt = new Date(Date.now() + SIX_HOURS_IN_SECONDS * 1000).toISOString();
+  return NextResponse.json({ token: apiToken, expiresAt });
+}
diff --git a/apps/web/src/app/api/openrouter/[...path]/route.test.ts b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
index 2fec32f537..bb7b22ded3 100644
--- a/apps/web/src/app/api/openrouter/[...path]/route.test.ts
+++ b/apps/web/src/app/api/openrouter/[...path]/route.test.ts
@@ -10,6 +10,9 @@ import { emitApiMetricsForResponse } from '@/lib/ai-gateway/o11y/api-metrics.ser
 import { accountForMicrodollarUsage } from '@/lib/ai-gateway/llm-proxy-helpers';
 import { redisClient } from '@/lib/redis';
 import type { Provider } from '@/lib/ai-gateway/providers/types';
+import { fetchEfficientAutoDecision } from '@/lib/ai-gateway/auto-routing-decision';
+import { logMicrodollarUsage } from '@/lib/ai-gateway/processUsage';
+import { applyResolvedAutoModel } from '@/lib/ai-gateway/auto-model/resolution';
 
 jest.mock('next/server', () => {
   return {
@@ -58,6 +61,21 @@ jest.mock('@/lib/ai-gateway/llm-proxy-helpers', () => {
     captureProxyError: jest.fn(),
   };
 });
+jest.mock('@/lib/ai-gateway/auto-routing-decision');
+jest.mock('@/lib/ai-gateway/processUsage', () => {
+  const actual = jest.requireActual('@/lib/ai-gateway/processUsage');
+  return {
+    ...(actual as Record<string, unknown>),
+    logMicrodollarUsage: jest.fn(),
+  };
+});
+jest.mock('@/lib/ai-gateway/auto-model/resolution', () => {
+  const actual = jest.requireActual('@/lib/ai-gateway/auto-model/resolution');
+  return {
+    ...(actual as Record<string, unknown>),
+    applyResolvedAutoModel: jest.fn(),
+  };
+});
 
 const mockedGetUserFromAuth = jest.mocked(getUserFromAuth);
 const mockedGetBalanceAndOrgSettings = jest.mocked(getBalanceAndOrgSettings);
@@ -69,6 +87,9 @@ const mockedEmitApiMetricsForResponse = jest.mocked(emitApiMetricsForResponse);
 const mockedAccountForMicrodollarUsage = jest.mocked(accountForMicrodollarUsage);
 const mockedRedisGet = jest.mocked(redisClient.get);
 const mockedRedisSet = jest.mocked(redisClient.set);
+const mockedFetchEfficientAutoDecision = jest.mocked(fetchEfficientAutoDecision);
+const mockedLogMicrodollarUsage = jest.mocked(logMicrodollarUsage);
+const mockedApplyResolvedAutoModel = jest.mocked(applyResolvedAutoModel);
 
 const provider = {
   id: 'openrouter',
@@ -388,3 +409,194 @@ describe('POST /api/openrouter/v1/chat/completions rules-engine actions', () =>
     expect(mockedUpstreamRequest.mock.calls[0]?.[0].body.model).toBe('openai/gpt-4o');
   });
 });
+
+describe('kilo-auto/efficient classifier billing', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+    setUserAuth();
+    mockedGetProvider.mockResolvedValue({
+      kind: 'provider',
+      provider,
+      userByok: null,
+      bypassAccessCheck: false,
+    });
+    mockedClassifyAbuse.mockResolvedValue(classifyResult(null));
+    mockedRedisGet.mockResolvedValue(null);
+    mockedRedisSet.mockResolvedValue('OK');
+    mockedGetOpenRouterModels.mockResolvedValue(new Set());
+    mockedUpstreamRequest.mockResolvedValue(
+      upstreamJsonResponse({ id: 'chatcmpl-1', model: 'anthropic/claude-haiku-4', choices: [] })
+    );
+    mockedEmitApiMetricsForResponse.mockReturnValue(undefined);
+    mockedAccountForMicrodollarUsage.mockReturnValue(undefined);
+    mockedLogMicrodollarUsage.mockResolvedValue(null);
+    // Mock applyResolvedAutoModel to resolve the virtual model and invoke the efficientDecision thunk
+    mockedApplyResolvedAutoModel.mockImplementation(async (opts, request) => {
+      if (opts.efficientDecision) await opts.efficientDecision();
+      request.body.model = 'anthropic/claude-haiku-4';
+      return { kind: 'ok', resolved: { model: 'anthropic/claude-haiku-4' } };
+    });
+    // after() accepts a Promise or a function; the billing path passes a Promise
+    const { after: mockedAfter } = jest.requireMock<{ after: jest.Mock }>('next/server');
+    mockedAfter.mockImplementation((_arg: unknown) => {
+      // no-op: the promise has already been started when passed to after()
+    });
+  });
+
+  it('bills classifier cost when cost > 0 and user is non-BYOK', async () => {
+    mockedFetchEfficientAutoDecision.mockResolvedValue({
+      decision: {
+        model: 'anthropic/claude-haiku-4',
+        tier: 'low',
+        source: 'benchmark',
+        tableVersion: 'v1',
+        sticky: false,
+      },
+      costUsd: 0.002,
+    });
+
+    const { POST } = await import('./route');
+    const response = await POST(makeRequest(makeBody('kilo-auto/efficient')) as never);
+
+    expect(response.status).toBe(200);
+    // Wait for after() callback to settle
+    await Promise.resolve();
+    await Promise.resolve();
+
+    expect(mockedLogMicrodollarUsage).toHaveBeenCalledTimes(1);
+    const [stats, ctx] = mockedLogMicrodollarUsage.mock.calls[0];
+    expect(stats.cost_mUsd).toBe(2000); // toMicrodollars(0.002)
+    expect(stats.model).toBe('auto-routing/classifier');
+    expect(stats.inputTokens).toBe(0);
+    expect(stats.outputTokens).toBe(0);
+    expect(ctx.requested_model).toBe('kilo-auto/efficient');
+    expect(ctx.user_byok).toBe(false);
+    // The internal classifier-overhead row must not carry a posthog distinct id,
+    // so it can't emit generic first_usage lifecycle events or be mistaken for
+    // the user's first model usage.
+    expect(ctx.posthog_distinct_id).toBeUndefined();
+  });
+
+  it('does not bill when classifier cost is 0 (cache hit)', async () => {
+    mockedFetchEfficientAutoDecision.mockResolvedValue({
+      decision: {
+        model: 'anthropic/claude-haiku-4',
+        tier: 'low',
+        source: 'benchmark' as const,
+        tableVersion: 'v1',
+        sticky: false,
+      },
+      costUsd: 0,
+    });
+
+    const { POST } = await import('./route');
+    await POST(makeRequest(makeBody('kilo-auto/efficient')) as never);
+
+    await Promise.resolve();
+    await Promise.resolve();
+
+    expect(mockedLogMicrodollarUsage).not.toHaveBeenCalled();
+  });
+
+  it('bills classifier cost even when the final inference is BYOK', async () => {
+    // The classifier runs on Kilo's OpenRouter credential regardless of the
+    // final provider, so its cost is owed even when the user is BYOK.
+    mockedGetProvider.mockResolvedValue({
+      kind: 'provider',
+      provider,
+      userByok: [{ decryptedAPIKey: 'byok-key', providerId: 'openai' }],
+      bypassAccessCheck: false,
+    });
+    mockedFetchEfficientAutoDecision.mockResolvedValue({
+      decision: {
+        model: 'anthropic/claude-haiku-4',
+        tier: 'low',
+        source: 'benchmark',
+        tableVersion: 'v1',
+        sticky: false,
+      },
+      costUsd: 0.002,
+    });
+
+    const { POST } = await import('./route');
+    await POST(makeRequest(makeBody('kilo-auto/efficient')) as never);
+
+    await Promise.resolve();
+    await Promise.resolve();
+
+    expect(mockedLogMicrodollarUsage).toHaveBeenCalledTimes(1);
+    const [stats, ctx] = mockedLogMicrodollarUsage.mock.calls[0];
+    expect(stats.cost_mUsd).toBe(2000);
+    expect(stats.model).toBe('auto-routing/classifier');
+    // The classifier row is always Kilo-funded, never BYOK.
+    expect(stats.is_byok).toBe(false);
+    expect(ctx.user_byok).toBe(false);
+  });
+
+  it('skips the paid classifier and does not bill for unauthenticated requests', async () => {
+    // Unauthenticated: efficient resolves to a paid model and is rejected, so
+    // the classifier must not run (no Kilo-funded spend with no user to bill).
+    mockedGetUserFromAuth.mockResolvedValue({
+      user: null,
+      authFailedResponse: new Response('unauthorized', { status: 401 }),
+      organizationId: undefined,
+    } as unknown as Awaited<ReturnType<typeof getUserFromAuth>>);
+
+    const { POST } = await import('./route');
+    await POST(makeRequest(makeBody('kilo-auto/efficient')) as never);
+
+    await Promise.resolve();
+    await Promise.resolve();
+
+    expect(mockedFetchEfficientAutoDecision).not.toHaveBeenCalled();
+    expect(mockedLogMicrodollarUsage).not.toHaveBeenCalled();
+  });
+
+  it('bills the classifier even when the request is rejected downstream (abuse block)', async () => {
+    // Exit-safe billing: the classifier already spent on Kilo's credential, so
+    // the row must persist even though the request is blocked before upstream.
+    mockedRedisGet.mockResolvedValue('block');
+    mockedClassifyAbuse.mockResolvedValue(classifyResult('block'));
+    mockedFetchEfficientAutoDecision.mockResolvedValue({
+      decision: {
+        model: 'anthropic/claude-haiku-4',
+        tier: 'low',
+        source: 'benchmark',
+        tableVersion: 'v1',
+        sticky: false,
+      },
+      costUsd: 0.003,
+    });
+
+    const { POST } = await import('./route');
+    const response = await POST(makeRequest(makeBody('kilo-auto/efficient')) as never);
+
+    expect(response.status).toBe(403);
+    expect(mockedUpstreamRequest).not.toHaveBeenCalled();
+    await Promise.resolve();
+    await Promise.resolve();
+
+    expect(mockedLogMicrodollarUsage).toHaveBeenCalledTimes(1);
+    const [stats] = mockedLogMicrodollarUsage.mock.calls[0];
+    expect(stats.model).toBe('auto-routing/classifier');
+    expect(stats.cost_mUsd).toBe(3000);
+  });
+
+  it('bills classifier cost even when decision is null but cost > 0', async () => {
+    mockedFetchEfficientAutoDecision.mockResolvedValue({
+      decision: null,
+      costUsd: 0.001,
+    });
+
+    const { POST } = await import('./route');
+    const response = await POST(makeRequest(makeBody('kilo-auto/efficient')) as never);
+
+    expect(response.status).toBe(200);
+    await Promise.resolve();
+    await Promise.resolve();
+
+    expect(mockedLogMicrodollarUsage).toHaveBeenCalledTimes(1);
+    const [stats] = mockedLogMicrodollarUsage.mock.calls[0];
+    expect(stats.cost_mUsd).toBe(1000); // toMicrodollars(0.001)
+  });
+});
diff --git a/apps/web/src/app/api/openrouter/[...path]/route.ts b/apps/web/src/app/api/openrouter/[...path]/route.ts
index 179f0f3116..d7c965a4db 100644
--- a/apps/web/src/app/api/openrouter/[...path]/route.ts
+++ b/apps/web/src/app/api/openrouter/[...path]/route.ts
@@ -1,6 +1,6 @@
-import { NextResponse, type NextResponse as NextResponseType } from 'next/server';
+import { after, NextResponse, type NextResponse as NextResponseType } from 'next/server';
 import { type NextRequest } from 'next/server';
-import { stripRequiredPrefix } from '@/lib/utils';
+import { stripRequiredPrefix, toMicrodollars } from '@/lib/utils';
 import { extractPromptInfo } from '@/lib/ai-gateway/extractPromptInfo';
 import { determineFallbackFeature } from '@/lib/ai-gateway/determineFallbackFeature';
 import {
@@ -89,9 +89,18 @@ import {
 import { normalizeModelId } from '@/lib/ai-gateway/model-utils';
 import { isForbiddenFreeModel } from '@/lib/ai-gateway/forbidden-free-models';
 import { isCloudflareIP } from '@/lib/cloudflare-ip';
-import { isKiloAutoModel, KILO_AUTO_FREE_MODEL } from '@/lib/ai-gateway/auto-model';
+import {
+  isKiloAutoModel,
+  KILO_AUTO_FREE_MODEL,
+  KILO_AUTO_EFFICIENT_MODEL,
+} from '@/lib/ai-gateway/auto-model';
 import { applyResolvedAutoModel } from '@/lib/ai-gateway/auto-model/resolution';
-import type { MicrodollarUsageContext } from '@/lib/ai-gateway/processUsage.types';
+import { fetchEfficientAutoDecision } from '@/lib/ai-gateway/auto-routing-decision';
+import type {
+  MicrodollarUsageContext,
+  MicrodollarUsageStats,
+} from '@/lib/ai-gateway/processUsage.types';
+import { logMicrodollarUsage } from '@/lib/ai-gateway/processUsage';
 import {
   getMaxTokens,
   hasMiddleOutTransform,
@@ -261,8 +270,37 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
   }
 
   let autoModel: string | null = null;
+  let classifierCostUsd = 0;
   if (isKiloAutoModel(requestedModelLowerCased)) {
     autoModel = requestedModelLowerCased;
+    const efficientDecision =
+      requestedModelLowerCased === KILO_AUTO_EFFICIENT_MODEL.id
+        ? async () => {
+            const { user, authFailedResponse } = await authPromise;
+            // The classifier is a paid call on Kilo's own credential. Skip it
+            // for unauthenticated requests: kilo-auto/efficient resolves to a
+            // paid model, so an unauthenticated caller is rejected downstream
+            // regardless, and a null decision simply falls back to balanced.
+            // This stops anonymous/abusive traffic from repeatedly spending
+            // Kilo-funded classification with no user to attribute it to.
+            if (!user || authFailedResponse) return null;
+            const result = await fetchEfficientAutoDecision({
+              apiKind: requestBodyParsed.kind,
+              body: requestBodyParsed.body,
+              requestedModel,
+              providerHints: mirrorProviderHints,
+              bodyBytes: Buffer.byteLength(requestBodyText),
+              userId: user.id,
+              sessionId: taskId ?? sessionHeader,
+              machineId: machineIdHeader,
+              clientRequestId,
+              mode: modeHeader,
+              userAgent: extractHeaderAndLimitLength(request, 'user-agent'),
+            });
+            classifierCostUsd = result?.costUsd ?? 0;
+            return result?.decision ?? null;
+          }
+        : undefined;
     const autoResult = await applyResolvedAutoModel(
       {
         model: requestedModelLowerCased,
@@ -271,6 +309,7 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
         sessionId: taskId ?? null,
         apiKind: requestBodyParsed.kind,
         clientIp: ipAddress ?? null,
+        efficientDecision,
       },
       requestBodyParsed,
       authPromise.then(res => res.user),
@@ -393,6 +432,91 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
     user = maybeUser;
   }
 
+  // Fraud/project headers are pure header parsing; resolve them here so the
+  // classifier-overhead billing below can be scheduled before any downstream
+  // rejection path runs.
+  const { fraudHeaders, projectId } = extractFraudAndProjectHeaders(request);
+
+  // Bill the classifier overhead as soon as the cost is known and we have an
+  // authenticated user — via after(), so the row is persisted even when the
+  // request is rejected downstream (abuse block, provider/api-kind rejection,
+  // balance/org checks, upstream 4xx, …). The classifier already ran on Kilo's
+  // OpenRouter credential during model resolution, so the cost is owed
+  // regardless of how this request ends. Anonymous requests never reach a
+  // positive classifier cost (the classifier is skipped for them above), so
+  // this only bills real users.
+  if (classifierCostUsd > 0 && !isAnonymousContext(user)) {
+    const priorMicrodollarUsage = user.microdollars_used;
+    after(
+      (async () => {
+        try {
+          const classifierStats: MicrodollarUsageStats = {
+            messageId: null,
+            model: 'auto-routing/classifier',
+            responseContent: '',
+            hasError: false,
+            inference_provider: null,
+            upstream_id: null,
+            finish_reason: null,
+            latency: null,
+            moderation_latency: null,
+            generation_time: null,
+            streamed: false,
+            cancelled: false,
+            status_code: 200,
+            cost_mUsd: toMicrodollars(classifierCostUsd),
+            inputTokens: 0,
+            outputTokens: 0,
+            cacheWriteTokens: 0,
+            cacheHitTokens: 0,
+            is_byok: false,
+          };
+          const classifierContext: MicrodollarUsageContext = {
+            api_kind: requestBodyParsed.kind,
+            kiloUserId: user.id,
+            fraudHeaders,
+            organizationId,
+            provider: 'openrouter',
+            requested_model: KILO_AUTO_EFFICIENT_MODEL.id,
+            promptInfo: {
+              system_prompt_prefix: '',
+              system_prompt_length: 0,
+              user_prompt_prefix: '',
+            },
+            max_tokens: null,
+            has_middle_out_transform: null,
+            isStreaming: false,
+            prior_microdollar_usage: priorMicrodollarUsage,
+            // No posthog_distinct_id: this internal overhead row must not emit
+            // the generic first_usage / first_microdollar_usage lifecycle
+            // events (those are gated on posthog_distinct_id in processUsage).
+            // Otherwise the classifier row could race the primary usage row and
+            // mis-attribute `auto-routing/classifier` as the user's first model.
+            // DB billing is unaffected — it keys on kiloUserId.
+            posthog_distinct_id: undefined,
+            project_id: projectId,
+            status_code: 200,
+            editor_name: extractHeaderAndLimitLength(request, 'x-kilocode-editorname'),
+            machine_id: machineIdHeader,
+            user_byok: false,
+            has_tools: false,
+            botId,
+            tokenSource,
+            feature,
+            session_id: taskId ?? sessionHeader ?? null,
+            mode: modeHeader,
+            auto_model: autoModel,
+            ttfb_ms: null,
+            clientRequestId,
+          };
+          await logMicrodollarUsage(classifierStats, classifierContext);
+        } catch (error) {
+          console.error('Failed to bill classifier cost for kilo-auto/efficient', error);
+        }
+      })()
+    );
+  }
+
   if (
     requestBodyParsed.kind === 'responses' &&
     (requestBodyParsed.body.store || requestBodyParsed.body.previous_response_id)
@@ -409,8 +533,6 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
     );
   }
 
-  // Use new shared helper for fraud & project headers
-  const { fraudHeaders, projectId } = extractFraudAndProjectHeaders(request);
   // Resolve the initial provider before abuse enforcement because abuse needs
   // provider/BYOK context, and quarantine-3 may later rewrite these values.
   const initialProviderResultForAbuseService = await getProvider({
@@ -718,20 +840,22 @@ export async function POST(request: NextRequest): Promise<NextResponseType<unkno
     await sleepForRulesEngineAction(rulesEngineDecision.delayMs);
   }
 
-  scheduleAutoRoutingMirror({
-    apiKind: requestBodyParsed.kind,
-    body: requestBodyParsed.body,
-    requestedModel,
-    providerHints: mirrorProviderHints,
-    bodyBytes: Buffer.byteLength(requestBodyText),
-    userId: user.id,
-    sessionId: taskId ?? sessionHeader,
-    machineId: machineIdHeader,
-    clientRequestId,
-    mode: modeHeader,
-    userAgent: extractHeaderAndLimitLength(request, 'user-agent'),
-    authContext: Promise.resolve({ organizationId }),
-  });
+  if (autoModel !== KILO_AUTO_EFFICIENT_MODEL.id) {
+    scheduleAutoRoutingMirror({
+      apiKind: requestBodyParsed.kind,
+      body: requestBodyParsed.body,
+      requestedModel,
+      providerHints: mirrorProviderHints,
+      bodyBytes: Buffer.byteLength(requestBodyText),
+      userId: user.id,
+      sessionId: taskId ?? sessionHeader,
+      machineId: machineIdHeader,
+      clientRequestId,
+      mode: modeHeader,
+      userAgent: extractHeaderAndLimitLength(request, 'user-agent'),
+      authContext: Promise.resolve({ organizationId }),
+    });
+  }
 
   const observesProvider = effectiveProviderContext.provider.id === 'custom';
   const attemptId = observesProvider ? crypto.randomUUID() : null;
diff --git a/apps/web/src/lib/ai-gateway/auto-model/index.ts b/apps/web/src/lib/ai-gateway/auto-model/index.ts
index 8c13bb7d96..467c5f7d28 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/index.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/index.ts
@@ -22,6 +22,9 @@ type AutoModel = {
   supports_images: boolean;
   supports_pdf: boolean;
   opencode_settings: OpenCodeSettings | undefined;
+  // Mirrors KiloExclusiveModel['status']. 'hidden' auto models are excluded
+  // from the /models listing but stay usable by anyone who knows the id.
+  status: 'public' | 'hidden';
 };
 
 export type ResolvedAutoModel = {
@@ -105,6 +108,7 @@ export const KILO_AUTO_FRONTIER_MODEL: AutoModel = {
     family: 'claude',
     prompt: 'anthropic',
   },
+  status: 'public',
 };
 
 export const KILO_AUTO_FREE_MODEL: AutoModel = {
@@ -122,6 +126,7 @@ export const KILO_AUTO_FREE_MODEL: AutoModel = {
   supports_images: false,
   supports_pdf: false,
   opencode_settings: undefined,
+  status: 'public',
 };
 
 export const KILO_AUTO_BALANCED_MODEL: AutoModel = {
@@ -137,6 +142,7 @@ export const KILO_AUTO_BALANCED_MODEL: AutoModel = {
   supports_images: true,
   supports_pdf: false,
   opencode_settings: undefined,
+  status: 'public',
 };
 
 export const KILO_AUTO_SMALL_MODEL: AutoModel = {
@@ -152,11 +158,24 @@ export const KILO_AUTO_SMALL_MODEL: AutoModel = {
   supports_images: true,
   supports_pdf: false,
   opencode_settings: undefined,
+  status: 'public',
+};
+
+// Same catalog properties as balanced (efficient is intended to eventually
+// replace balanced); hidden while the routing engine is validated on Kilo team traffic.
+export const KILO_AUTO_EFFICIENT_MODEL: AutoModel = {
+  ...KILO_AUTO_BALANCED_MODEL,
+  id: 'kilo-auto/efficient',
+  name: 'Auto Efficient',
+  description:
+    'Routes each request to the cheapest model that gets the job done, based on continuously benchmarked accuracy and cost.',
+  status: 'hidden',
 };
 
 export const AUTO_MODELS = [
   KILO_AUTO_FRONTIER_MODEL,
   KILO_AUTO_BALANCED_MODEL,
+  KILO_AUTO_EFFICIENT_MODEL,
   KILO_AUTO_FREE_MODEL,
   KILO_AUTO_SMALL_MODEL,
 ];
diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
new file mode 100644
index 0000000000..f241c5f222
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/auto-model/resolution.test.ts
@@ -0,0 +1,142 @@
+import { describe, expect, it, jest } from '@jest/globals';
+
+jest.mock('@/lib/ai-gateway/providers/gateway-models-cache', () => ({
+  getOpenRouterModels: jest.fn(async () => new Set<string>()),
+}));
+
+jest.mock('@/lib/kiloclaw/setup-promo', () => ({
+  userIsWithinFirstKiloClawInstanceWindow: jest.fn(async () => false),
+}));
+
+import { resolveAutoModel } from './resolution';
+import { BALANCED_QWEN_MODEL, KILO_AUTO_EFFICIENT_MODEL } from '@/lib/ai-gateway/auto-model';
+import type { AutoRoutingDecision } from '@kilocode/auto-routing-contracts';
+
+const baseParams = {
+  model: KILO_AUTO_EFFICIENT_MODEL.id,
+  modeHeader: null,
+  featureHeader: null,
+  sessionId: null,
+  clientIp: null,
+};
+
+const nullUserPromise = Promise.resolve(null);
+const zeroBalancePromise = Promise.resolve(0);
+
+const sampleDecision: AutoRoutingDecision = {
+  model: 'anthropic/claude-haiku-4',
+  tier: 'low',
+  source: 'benchmark',
+  tableVersion: 'v1',
+  sticky: false,
+};
+
+describe('resolveAutoModel — kilo-auto/efficient branch', () => {
+  it('resolves to decision.model when the thunk returns a decision', async () => {
+    const result = await resolveAutoModel(
+      {
+        ...baseParams,
+        apiKind: 'chat_completions',
+        efficientDecision: async () => sampleDecision,
+      },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: { model: 'anthropic/claude-haiku-4' } });
+  });
+
+  it('applies the decision reasoningEffort as a reasoning config', async () => {
+    const result = await resolveAutoModel(
+      {
+        ...baseParams,
+        apiKind: 'chat_completions',
+        efficientDecision: async () => ({ ...sampleDecision, reasoningEffort: 'minimal' }),
+      },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({
+      kind: 'ok',
+      resolved: {
+        model: 'anthropic/claude-haiku-4',
+        reasoning: { enabled: true, effort: 'minimal' },
+      },
+    });
+  });
+
+  it('omits reasoning when the decision reasoningEffort is null', async () => {
+    const result = await resolveAutoModel(
+      {
+        ...baseParams,
+        apiKind: 'chat_completions',
+        efficientDecision: async () => ({ ...sampleDecision, reasoningEffort: null }),
+      },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: { model: 'anthropic/claude-haiku-4' } });
+  });
+
+  it('falls back to BALANCED_QWEN_MODEL when no thunk is provided and apiKind=responses', async () => {
+    const result = await resolveAutoModel(
+      { ...baseParams, apiKind: 'responses' },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_QWEN_MODEL });
+  });
+
+  it('falls back to BALANCED_QWEN_MODEL when no thunk is provided and apiKind=messages', async () => {
+    const result = await resolveAutoModel(
+      { ...baseParams, apiKind: 'messages' },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_QWEN_MODEL });
+  });
+
+  it('falls back to BALANCED_QWEN_MODEL when no thunk is provided and apiKind=chat_completions', async () => {
+    const result = await resolveAutoModel(
+      { ...baseParams, apiKind: 'chat_completions' },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_QWEN_MODEL });
+  });
+
+  it('falls back to BALANCED_QWEN_MODEL when thunk returns null and apiKind=chat_completions', async () => {
+    const result = await resolveAutoModel(
+      {
+        ...baseParams,
+        apiKind: 'chat_completions',
+        efficientDecision: async () => null,
+      },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(result).toEqual({ kind: 'ok', resolved: BALANCED_QWEN_MODEL });
+  });
+
+  it('does not call the thunk more than once', async () => {
+    const thunk = jest.fn(async () => sampleDecision);
+
+    await resolveAutoModel(
+      {
+        ...baseParams,
+        apiKind: 'chat_completions',
+        efficientDecision: thunk,
+      },
+      nullUserPromise,
+      zeroBalancePromise
+    );
+
+    expect(thunk).toHaveBeenCalledTimes(1);
+  });
+});
diff --git a/apps/web/src/lib/ai-gateway/auto-model/resolution.ts b/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
index 0003dd5101..b45bf1617a 100644
--- a/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
+++ b/apps/web/src/lib/ai-gateway/auto-model/resolution.ts
@@ -9,10 +9,12 @@ import type {
 } from '@/lib/ai-gateway/providers/openrouter/types';
 import type OpenAI from 'openai';
 import type { User } from '@kilocode/db';
+import type { AutoRoutingDecision } from '@kilocode/auto-routing-contracts';
 import {
   KILO_AUTO_FREE_MODEL,
   KILO_AUTO_SMALL_MODEL,
   KILO_AUTO_BALANCED_MODEL,
+  KILO_AUTO_EFFICIENT_MODEL,
   modeSchema,
   BALANCED_CLAW_SETUP_MODEL,
   BALANCED_QWEN_MODEL,
@@ -39,6 +41,9 @@ type ResolveAutoModelParams = {
   sessionId: string | null;
   apiKind: GatewayRequest['kind'] | null;
   clientIp: string | null;
+  // Lazily fetches the auto-routing worker's decision; only set for
+  // kilo-auto/efficient requests (route.ts owns the request-body capture).
+  efficientDecision?: () => Promise<AutoRoutingDecision | null>;
 };
 
 function resolveMode(modeHeader: string | null, featureHeader: FeatureValue | null) {
@@ -115,6 +120,25 @@ export async function resolveAutoModel(
       },
     };
   }
+  if (model === KILO_AUTO_EFFICIENT_MODEL.id) {
+    const decision = params.efficientDecision ? await params.efficientDecision() : null;
+    if (decision) {
+      // Apply the candidate's pinned reasoning effort so the model runs under
+      // the same conditions the benchmark measured it at.
+      return {
+        kind: 'ok',
+        resolved: {
+          model: decision.model,
+          ...(decision.reasoningEffort
+            ? { reasoning: { enabled: true, effort: decision.reasoningEffort } }
+            : {}),
+        },
+      };
+    }
+    // Static fallback when no decision is available (no thunk, or the worker was
+    // slow/unavailable): same model as balanced so efficient never degrades below it.
+    return { kind: 'ok', resolved: BALANCED_QWEN_MODEL };
+  }
   const mode = resolveMode(modeHeader, featureHeader);
   if (model === KILO_AUTO_BALANCED_MODEL.id || model === KILO_AUTO_LEGACY_MODEL) {
     if (mode === 'claw' && featureHeader === 'kiloclaw') {
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.test.ts
index f98da3baf2..adc58001b3 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.test.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.test.ts
@@ -14,6 +14,8 @@ global.fetch = mockFetch;
 
 const classifierModelResponse = {
   model: 'google/gemini-2.5-flash-lite',
+  override: null,
+  benchmarkWinner: null,
   defaultModel: 'google/gemini-2.5-flash-lite',
 };
 
@@ -86,6 +88,28 @@ describe('auto routing admin client', () => {
     );
   });
 
+  it('clears the classifier model override by sending null', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve(classifierModelResponse),
+    });
+
+    await updateAutoRoutingClassifierModel(null);
+
+    expect(mockFetch).toHaveBeenCalledWith(
+      'https://auto-routing.example.com/admin/classifier-model',
+      {
+        method: 'PUT',
+        headers: {
+          authorization: 'Bearer test-internal-secret',
+          'content-type': 'application/json',
+        },
+        body: JSON.stringify({ model: null }),
+      }
+    );
+  });
+
   it('queries classifier analytics for the selected period', async () => {
     mockFetch.mockResolvedValue({
       status: 200,
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
index fe67e003d2..937c589cf9 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-admin-client.ts
@@ -3,57 +3,13 @@ import {
   AutoRoutingClassifierModelResponseSchema,
   type AutoRoutingAnalyticsPeriod,
 } from '@kilocode/auto-routing-contracts';
-import { AUTO_ROUTING_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.server';
-import * as z from 'zod';
+import { AUTO_ROUTING_WORKER_URL } from '@/lib/config.server';
+import { createWorkerAdminFetch } from './worker-admin-fetch';
 
-export type AutoRoutingAdminResult<T> = {
-  status: number;
-  body: T;
-};
-
-type ErrorBody = { error: string };
-const ErrorBodySchema = z.object({ error: z.string() });
-
-type AutoRoutingAdminRequestInit = Omit<RequestInit, 'headers'> & {
-  headers?: Record<string, string>;
-};
-
-async function fetchAutoRoutingAdmin<T>(
-  path: string,
-  init: AutoRoutingAdminRequestInit,
-  schema: z.ZodType<T>
-): Promise<AutoRoutingAdminResult<T | ErrorBody>> {
-  if (!AUTO_ROUTING_WORKER_URL || !INTERNAL_API_SECRET) {
-    return {
-      status: 500,
-      body: { error: 'Auto routing worker is not configured' },
-    };
-  }
-
-  const response = await fetch(`${AUTO_ROUTING_WORKER_URL}${path}`, {
-    ...init,
-    headers: {
-      authorization: `Bearer ${INTERNAL_API_SECRET}`,
-      ...init.headers,
-    },
-  });
-
-  const body: unknown = await response.json();
-  if (!response.ok) {
-    const parsedError = ErrorBodySchema.safeParse(body);
-    return {
-      status: response.status,
-      body: parsedError.success
-        ? parsedError.data
-        : { error: `Request failed: ${response.status}` },
-    };
-  }
-
-  return {
-    status: response.status,
-    body: schema.parse(body),
-  };
-}
+const fetchAutoRoutingAdmin = createWorkerAdminFetch({
+  workerUrl: AUTO_ROUTING_WORKER_URL,
+  unconfiguredError: 'Auto routing worker is not configured',
+});
 
 export function getAutoRoutingClassifierModel() {
   return fetchAutoRoutingAdmin(
@@ -65,7 +21,7 @@ export function getAutoRoutingClassifierModel() {
   );
 }
 
-export function updateAutoRoutingClassifierModel(model: string) {
+export function updateAutoRoutingClassifierModel(model: string | null) {
   return fetchAutoRoutingAdmin(
     '/admin/classifier-model',
     {
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
new file mode 100644
index 0000000000..d8f209d8df
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.test.ts
@@ -0,0 +1,202 @@
+import {
+  getBenchmarkConfig,
+  updateBenchmarkConfig,
+  listBenchmarkRuns,
+  startBenchmarkRun,
+  getBenchmarkRoutingTable,
+} from './auto-routing-benchmark-admin-client';
+
+jest.mock('@/lib/config.server', () => ({
+  AUTO_ROUTING_BENCHMARK_WORKER_URL: 'https://benchmark-worker.example.com',
+  INTERNAL_API_SECRET: 'test-internal-secret',
+}));
+
+const mockFetch = jest.fn();
+global.fetch = mockFetch;
+
+const configResponse = {
+  config: {
+    classifierModels: ['anthropic/claude-haiku-4'],
+    deciderModels: [{ id: 'anthropic/claude-sonnet-4', reasoningEffort: null }],
+    minAccuracy: 0.8,
+    switchCostFactor: 3,
+    maxConcurrency: 4,
+    benchmarkUserId: null,
+    classifierRepetitions: 1,
+    deciderRepetitions: 1,
+    classifierMaxP95LatencyMs: 1000,
+    updatedAt: null,
+    updatedBy: null,
+  },
+};
+
+const runsResponse = {
+  runs: [
+    {
+      id: 'run-1',
+      kind: 'classifier',
+      status: 'completed',
+      startedAt: '2026-06-01T00:00:00Z',
+      completedAt: '2026-06-01T01:00:00Z',
+      error: null,
+      summaries: [],
+    },
+  ],
+};
+
+describe('auto routing benchmark admin client', () => {
+  beforeEach(() => {
+    mockFetch.mockReset();
+  });
+
+  it('gets the benchmark config and sends bearer auth header', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve(configResponse),
+    });
+
+    await expect(getBenchmarkConfig()).resolves.toEqual({
+      status: 200,
+      body: configResponse,
+    });
+
+    expect(mockFetch).toHaveBeenCalledWith('https://benchmark-worker.example.com/admin/config', {
+      method: 'GET',
+      headers: {
+        authorization: 'Bearer test-internal-secret',
+      },
+    });
+  });
+
+  it('propagates error body when upstream responds with a non-OK status', async () => {
+    mockFetch.mockResolvedValue({
+      status: 404,
+      ok: false,
+      json: () => Promise.resolve({ error: 'not found' }),
+    });
+
+    await expect(getBenchmarkConfig()).resolves.toEqual({
+      status: 404,
+      body: { error: 'not found' },
+    });
+  });
+
+  it('updates the benchmark config and sends x-updated-by header', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve(configResponse),
+    });
+
+    await updateBenchmarkConfig(configResponse.config, 'admin@kilocode.ai');
+
+    expect(mockFetch).toHaveBeenCalledWith('https://benchmark-worker.example.com/admin/config', {
+      method: 'PUT',
+      headers: {
+        authorization: 'Bearer test-internal-secret',
+        'content-type': 'application/json',
+        'x-updated-by': 'admin@kilocode.ai',
+      },
+      body: JSON.stringify(configResponse.config),
+    });
+  });
+
+  it('lists benchmark runs', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve(runsResponse),
+    });
+
+    await expect(listBenchmarkRuns()).resolves.toEqual({
+      status: 200,
+      body: runsResponse,
+    });
+
+    expect(mockFetch).toHaveBeenCalledWith('https://benchmark-worker.example.com/admin/runs', {
+      method: 'GET',
+      headers: {
+        authorization: 'Bearer test-internal-secret',
+      },
+    });
+  });
+
+  it('propagates error body from listBenchmarkRuns on non-OK status', async () => {
+    mockFetch.mockResolvedValue({
+      status: 401,
+      ok: false,
+      json: () => Promise.resolve({ error: 'unauthorized' }),
+    });
+
+    await expect(listBenchmarkRuns()).resolves.toEqual({
+      status: 401,
+      body: { error: 'unauthorized' },
+    });
+  });
+
+  it('starts a benchmark run with the given kind and force flag', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve({ runId: 'run-2', enqueuedModels: 3, skippedModels: [] }),
+    });
+
+    await expect(startBenchmarkRun('classifier', false)).resolves.toEqual({
+      status: 200,
+      body: { runId: 'run-2', enqueuedModels: 3, skippedModels: [] },
+    });
+
+    expect(mockFetch).toHaveBeenCalledWith('https://benchmark-worker.example.com/admin/runs', {
+      method: 'POST',
+      headers: {
+        authorization: 'Bearer test-internal-secret',
+        'content-type': 'application/json',
+      },
+      body: JSON.stringify({ kind: 'classifier', force: false }),
+    });
+  });
+
+  it('starts a benchmark run with force=true to re-run existing models', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () =>
+        Promise.resolve({ runId: 'run-3', enqueuedModels: 3, skippedModels: ['model-a'] }),
+    });
+
+    await startBenchmarkRun('decider', true);
+
+    expect(mockFetch).toHaveBeenCalledWith('https://benchmark-worker.example.com/admin/runs', {
+      method: 'POST',
+      headers: {
+        authorization: 'Bearer test-internal-secret',
+        'content-type': 'application/json',
+      },
+      body: JSON.stringify({ kind: 'decider', force: true }),
+    });
+  });
+
+  it('gets the benchmark routing table', async () => {
+    mockFetch.mockResolvedValue({
+      status: 200,
+      ok: true,
+      json: () => Promise.resolve({ table: null, publishedAt: null }),
+    });
+
+    await expect(getBenchmarkRoutingTable()).resolves.toEqual({
+      status: 200,
+      body: { table: null, publishedAt: null },
+    });
+
+    expect(mockFetch).toHaveBeenCalledWith(
+      'https://benchmark-worker.example.com/admin/routing-table',
+      {
+        method: 'GET',
+        headers: {
+          authorization: 'Bearer test-internal-secret',
+        },
+      }
+    );
+  });
+});
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
new file mode 100644
index 0000000000..56a345053e
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/auto-routing-benchmark-admin-client.ts
@@ -0,0 +1,58 @@
+import {
+  BenchmarkRoutingTableResponseSchema,
+  BenchmarkConfigResponseSchema,
+  BenchmarkRunsResponseSchema,
+  StartBenchmarkRunResponseSchema,
+  type BenchmarkConfig,
+  type BenchmarkKind,
+} from '@kilocode/auto-routing-contracts';
+import { AUTO_ROUTING_BENCHMARK_WORKER_URL } from '@/lib/config.server';
+import { createWorkerAdminFetch } from './worker-admin-fetch';
+
+const fetchBenchmarkAdmin = createWorkerAdminFetch({
+  workerUrl: AUTO_ROUTING_BENCHMARK_WORKER_URL,
+  unconfiguredError: 'Auto routing benchmark worker is not configured',
+});
+
+export function getBenchmarkConfig() {
+  return fetchBenchmarkAdmin('/admin/config', { method: 'GET' }, BenchmarkConfigResponseSchema);
+}
+
+export function updateBenchmarkConfig(config: BenchmarkConfig, updatedByEmail: string) {
+  return fetchBenchmarkAdmin(
+    '/admin/config',
+    {
+      method: 'PUT',
+      headers: {
+        'content-type': 'application/json',
+        'x-updated-by': updatedByEmail,
+      },
+      body: JSON.stringify(config),
+    },
+    BenchmarkConfigResponseSchema
+  );
+}
+
+export function listBenchmarkRuns() {
+  return fetchBenchmarkAdmin('/admin/runs', { method: 'GET' }, BenchmarkRunsResponseSchema);
+}
+
+export function startBenchmarkRun(kind: BenchmarkKind, force: boolean) {
+  return fetchBenchmarkAdmin(
+    '/admin/runs',
+    {
+      method: 'POST',
+      headers: { 'content-type': 'application/json' },
+      body: JSON.stringify({ kind, force }),
+    },
+    StartBenchmarkRunResponseSchema
+  );
+}
+
+export function getBenchmarkRoutingTable() {
+  return fetchBenchmarkAdmin(
+    '/admin/routing-table',
+    { method: 'GET' },
+    BenchmarkRoutingTableResponseSchema
+  );
+}
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
new file mode 100644
index 0000000000..70d8e7e0c6
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.test.ts
@@ -0,0 +1,166 @@
+import { afterEach, beforeEach, describe, expect, it, jest } from '@jest/globals';
+
+const mockedWarnExceptInTest = jest.fn();
+
+jest.mock('@/lib/config.server', () => ({
+  AUTO_ROUTING_WORKER_URL: '',
+  INTERNAL_API_SECRET: '',
+}));
+
+jest.mock('@/lib/utils.server', () => ({
+  warnExceptInTest: (...args: unknown[]) => mockedWarnExceptInTest(...args),
+}));
+
+import { fetchEfficientAutoDecision } from './auto-routing-decision';
+import type { EfficientDecisionParams } from './auto-routing-decision';
+
+const originalFetch = globalThis.fetch;
+const mockedFetch = jest.fn() as jest.MockedFunction<typeof globalThis.fetch>;
+
+function makeParams(): EfficientDecisionParams {
+  return {
+    apiKind: 'chat_completions',
+    body: {
+      model: 'kilo-auto/efficient',
+      stream: true,
+      messages: [
+        { role: 'system', content: 'You are Kilo Code.' },
+        { role: 'user', content: 'Fix the parser bug.' },
+      ],
+    },
+    requestedModel: 'kilo-auto/efficient',
+    providerHints: { provider: null, providerOptions: null },
+    bodyBytes: 512,
+    userId: 'user-1',
+    sessionId: 'task-123',
+    machineId: 'machine-1',
+    clientRequestId: 'req-1',
+    mode: 'code',
+    userAgent: 'Kilo-Code/1.2.3',
+  };
+}
+
+const options = {
+  workerUrl: 'https://auto-routing.example.com',
+  authToken: 'classifier-token',
+};
+
+const validDecision = {
+  model: 'anthropic/claude-haiku-4',
+  tier: 'low' as const,
+  source: 'benchmark' as const,
+  tableVersion: 'v1',
+  sticky: false,
+};
+
+const validResponse = {
+  cost: 0.001,
+  decision: validDecision,
+  classifierResult: null,
+};
+
+describe('fetchEfficientAutoDecision', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+    globalThis.fetch = mockedFetch;
+  });
+
+  afterEach(() => {
+    globalThis.fetch = originalFetch;
+  });
+
+  it('returns the decision on a 200 response with valid body', async () => {
+    mockedFetch.mockResolvedValueOnce(new Response(JSON.stringify(validResponse), { status: 200 }));
+
+    const result = await fetchEfficientAutoDecision(makeParams(), options);
+
+    expect(mockedFetch).toHaveBeenCalledTimes(1);
+    const [url, init] = mockedFetch.mock.calls[0];
+    expect(url).toBe('https://auto-routing.example.com/decide');
+    expect(init).toMatchObject({ method: 'POST' });
+    const headers = init?.headers as Headers;
+    expect(headers.get('authorization')).toBe('Bearer classifier-token');
+    expect(headers.get('content-type')).toBe('application/json');
+    expect(result).toEqual({ decision: validDecision, costUsd: 0.001 });
+  });
+
+  it('returns null and calls onError on a non-OK response', async () => {
+    const onError = jest.fn();
+    mockedFetch.mockResolvedValueOnce(new Response('Internal Server Error', { status: 500 }));
+
+    const result = await fetchEfficientAutoDecision(makeParams(), { ...options, onError });
+
+    expect(result).toBeNull();
+    expect(onError).toHaveBeenCalledWith('Efficient auto decision request failed', {
+      error: 'status 500',
+    });
+  });
+
+  it('returns null and calls onError when fetch rejects (timeout/abort)', async () => {
+    const onError = jest.fn();
+    mockedFetch.mockRejectedValueOnce(new Error('The operation was aborted'));
+
+    const result = await fetchEfficientAutoDecision(makeParams(), { ...options, onError });
+
+    expect(result).toBeNull();
+    expect(onError).toHaveBeenCalledWith('Efficient auto decision request failed', {
+      error: 'The operation was aborted',
+    });
+  });
+
+  it('returns null and calls onError on a schema-invalid response body', async () => {
+    const onError = jest.fn();
+    mockedFetch.mockResolvedValueOnce(
+      new Response(JSON.stringify({ unexpected: 'shape' }), { status: 200 })
+    );
+
+    const result = await fetchEfficientAutoDecision(makeParams(), { ...options, onError });
+
+    expect(result).toBeNull();
+    expect(onError).toHaveBeenCalledWith('Efficient auto decision response invalid', {
+      error: 'invalid_response',
+    });
+  });
+
+  it('returns null when normalization fails (unclassifiable body)', async () => {
+    const result = await fetchEfficientAutoDecision(
+      { ...makeParams(), body: { stream: true } },
+      options
+    );
+
+    expect(mockedFetch).not.toHaveBeenCalled();
+    expect(result).toBeNull();
+  });
+
+  it('returns null when workerUrl is not configured', async () => {
+    const result = await fetchEfficientAutoDecision(makeParams(), {
+      ...options,
+      workerUrl: '',
+    });
+
+    expect(mockedFetch).not.toHaveBeenCalled();
+    expect(result).toBeNull();
+  });
+
+  it('returns null when authToken is not configured', async () => {
+    const result = await fetchEfficientAutoDecision(makeParams(), {
+      ...options,
+      authToken: '',
+    });
+
+    expect(mockedFetch).not.toHaveBeenCalled();
+    expect(result).toBeNull();
+  });
+
+  it('returns decision: null with costUsd when the worker returns a null decision', async () => {
+    mockedFetch.mockResolvedValueOnce(
+      new Response(JSON.stringify({ cost: 0.001, decision: null, classifierResult: null }), {
+        status: 200,
+      })
+    );
+
+    const result = await fetchEfficientAutoDecision(makeParams(), options);
+
+    expect(result).toEqual({ decision: null, costUsd: 0.001 });
+  });
+});
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-decision.ts b/apps/web/src/lib/ai-gateway/auto-routing-decision.ts
new file mode 100644
index 0000000000..b8bef47c09
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/auto-routing-decision.ts
@@ -0,0 +1,62 @@
+import {
+  AutoRoutingDecisionResponseSchema,
+  type AutoRoutingDecision,
+} from '@kilocode/auto-routing-contracts';
+import { AUTO_ROUTING_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.server';
+import { warnExceptInTest } from '@/lib/utils.server';
+import { buildDecidePayload, type DecideBaseParams } from './auto-routing-mirror';
+
+export const EFFICIENT_DECISION_TIMEOUT_MS = 2_000;
+
+// EfficientDecisionParams is an alias for the shared base params type.
+export type EfficientDecisionParams = DecideBaseParams;
+
+type FetchEfficientDecisionOptions = {
+  workerUrl?: string;
+  authToken?: string;
+  timeoutMs?: number;
+  onError?: (message: string, data: { error: string }) => void;
+};
+
+// Blocking counterpart of the fire-and-forget mirror: kilo-auto/efficient waits
+// for the worker's routing decision (cache hits ~20ms, classifier misses ~1.2s).
+// Returns null on timeout, error, or missing config so callers use the fallback.
+export async function fetchEfficientAutoDecision(
+  params: EfficientDecisionParams,
+  options: FetchEfficientDecisionOptions = {}
+): Promise<{ decision: AutoRoutingDecision | null; costUsd: number } | null> {
+  const workerUrl = options.workerUrl ?? AUTO_ROUTING_WORKER_URL;
+  const authToken = options.authToken ?? INTERNAL_API_SECRET;
+  const onError = options.onError ?? warnExceptInTest;
+  if (!workerUrl || !authToken) return null;
+
+  const payload = buildDecidePayload(params);
+  if (!payload) return null;
+
+  try {
+    const response = await fetch(`${workerUrl}/decide`, {
+      method: 'POST',
+      headers: new Headers({
+        authorization: `Bearer ${authToken}`,
+        'content-type': 'application/json',
+      }),
+      body: JSON.stringify(payload),
+      signal: AbortSignal.timeout(options.timeoutMs ?? EFFICIENT_DECISION_TIMEOUT_MS),
+    });
+    if (!response.ok) {
+      onError('Efficient auto decision request failed', { error: `status ${response.status}` });
+      return null;
+    }
+    const parsed = AutoRoutingDecisionResponseSchema.safeParse(await response.json());
+    if (!parsed.success) {
+      onError('Efficient auto decision response invalid', { error: 'invalid_response' });
+      return null;
+    }
+    return { decision: parsed.data.decision, costUsd: parsed.data.cost };
+  } catch (error) {
+    onError('Efficient auto decision request failed', {
+      error: error instanceof Error ? error.message : String(error),
+    });
+    return null;
+  }
+}
diff --git a/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts b/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts
index 6192bb9bef..210b78c8d9 100644
--- a/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts
+++ b/apps/web/src/lib/ai-gateway/auto-routing-mirror.ts
@@ -4,11 +4,10 @@ import { after } from 'next/server';
 import { AUTO_ROUTING_WORKER_URL, INTERNAL_API_SECRET } from '@/lib/config.server';
 import { warnExceptInTest } from '@/lib/utils.server';
 
-type ScheduleAutoRoutingMirrorParams = {
+// Shared base params for both the mirror (fire-and-forget) and the
+// efficient-decision (blocking) call sites.
+export type DecideBaseParams = {
   apiKind: ClassifierApiKind;
-  // The parsed gateway request body. Provider transforms may mutate it after
-  // scheduling, which is why the requested model and provider hints are
-  // captured separately before any mutation.
   body: unknown;
   requestedModel: string;
   providerHints: MirrorPayload['input']['providerHints'];
@@ -19,6 +18,33 @@ type ScheduleAutoRoutingMirrorParams = {
   clientRequestId: string | null;
   mode: string | null;
   userAgent: string | null;
+};
+
+// Normalize and assemble the /decide payload. Returns null when the body
+// cannot be classified (normalization failed).
+export function buildDecidePayload(params: DecideBaseParams): MirrorPayload | null {
+  const { apiKind, body, requestedModel, providerHints } = params;
+  const input = normalizeClassifierInput(apiKind, body, { requestedModel, providerHints });
+  // No classifier input means the body could not be normalized; signal that
+  // with null and leave reporting to the caller.
+  if (!input) return null;
+
+  return {
+    input,
+    userId: params.userId,
+    sessionId: params.sessionId,
+    machineId: params.machineId,
+    clientRequestId: params.clientRequestId,
+    mode: params.mode,
+    userAgent: params.userAgent,
+    bodyBytes: params.bodyBytes,
+  };
+}
+
+type ScheduleAutoRoutingMirrorParams = DecideBaseParams & {
+  // Applies to the inherited `body` (the parsed gateway request body), not to
+  // `authContext`: provider transforms may mutate it after scheduling, so the
+  // requested model and provider hints are captured separately beforehand.
   authContext?: Promise<{ organizationId?: string | null }>;
 };
 
@@ -41,11 +67,8 @@ async function sendAutoRoutingMirror(
   // Normalizing here (in background work, off the request path) keeps the
   // mirror payload at a few KB instead of the full request body, and lets
   // requests the worker could not classify anyway skip the mirror call.
-  const normalizedInput = normalizeClassifierInput(params.apiKind, params.body, {
-    requestedModel: params.requestedModel,
-    providerHints: params.providerHints,
-  });
-  if (!normalizedInput) {
+  const payload = buildDecidePayload(params);
+  if (!payload) {
     const onError = options.onError ?? warnExceptInTest;
     onError('Auto routing mirror skipped unclassifiable request body', {
       error: 'normalize_failed',
@@ -53,17 +76,6 @@ async function sendAutoRoutingMirror(
     return;
   }
 
-  const payload: MirrorPayload = {
-    input: normalizedInput,
-    userId: params.userId,
-    sessionId: params.sessionId,
-    machineId: params.machineId,
-    clientRequestId: params.clientRequestId,
-    mode: params.mode,
-    userAgent: params.userAgent,
-    bodyBytes: params.bodyBytes,
-  };
-
   const response = await fetch(`${workerUrl}/decide`, {
     method: 'POST',
     headers: new Headers({
diff --git a/apps/web/src/lib/ai-gateway/experiments/reserved-ids.ts b/apps/web/src/lib/ai-gateway/experiments/reserved-ids.ts
new file mode 100644
index 0000000000..b3fbf007be
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/experiments/reserved-ids.ts
@@ -0,0 +1,24 @@
+import { inArray } from 'drizzle-orm';
+import { model_experiment } from '@kilocode/db/schema';
+import { readDb } from '@/lib/drizzle';
+
+/**
+ * Returns the subset of `publicIds` that are reserved by a model experiment.
+ *
+ * Per `.specs/model-experiments.md`, a model-experiment `public_model_id` is a
+ * dedicated preview/experiment id that users must explicitly select; it MUST
+ * NOT enter `kilo-auto` candidate sets or any other automatic selection path.
+ * Ownership is independent of the experiment's current status, so this checks
+ * every status (`draft`, `active`, `paused`, `completed`) — not just the
+ * routing-relevant ones in the Redis membership hot-path (`isPublicIdExperimented`).
+ *
+ * Server-only (drizzle dependency); do not import from client-reachable modules.
+ */
+export async function findExperimentReservedModelIds(publicIds: string[]): Promise<string[]> {
+  if (!publicIds.length) return [];
+  const idColumn = model_experiment.public_model_id;
+  const query = readDb.selectDistinct({ publicModelId: idColumn }).from(model_experiment);
+  // No status filter here: a matching row reserves its id whatever its status.
+  const reserved = await query.where(inArray(idColumn, publicIds));
+  return reserved.map(({ publicModelId }) => publicModelId);
+}
diff --git a/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts b/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts
new file mode 100644
index 0000000000..e2e8eaa01c
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/model-api-kinds.test.ts
@@ -0,0 +1,50 @@
+import { describe, expect, it } from '@jest/globals';
+import { gatewayChatApisForModel, modelServesAllGatewayChatApis } from './model-api-kinds';
+import { seed_20_code_free_model } from '@/lib/ai-gateway/providers/seed';
+import type { KiloExclusiveModel } from '@/lib/ai-gateway/providers/kilo-exclusive-model';
+import type * as ModelsModule from '@/lib/ai-gateway/models';
+
+// Stub the catalog so the rejection test doesn't depend on any specific provider file.
+// 'test-exclusive/alibaba-only' resolves to a KiloExclusiveModel on the alibaba gateway,
+// which only supports chat_completions, exercising the rejection branch.
+jest.mock('@/lib/ai-gateway/models', () => {
+  const actual = jest.requireActual<typeof ModelsModule>('@/lib/ai-gateway/models');
+  const stubModel: KiloExclusiveModel = {
+    public_id: 'test-exclusive/alibaba-only',
+    display_name: 'Test Alibaba-only',
+    description: 'stub for unit tests',
+    context_length: 8192,
+    max_completion_tokens: 4096,
+    status: 'public',
+    flags: [],
+    gateway: 'alibaba',
+    internal_id: 'stub-internal',
+    pricing: null,
+    exclusive_to: [],
+    inference_provider_restriction: [],
+  };
+  return {
+    ...actual,
+    findKiloExclusiveModel: (id: string) =>
+      id === 'test-exclusive/alibaba-only' ? stubModel : actual.findKiloExclusiveModel(id),
+  };
+});
+
+describe('modelServesAllGatewayChatApis', () => {
+  it('accepts a plain OpenRouter model (OpenRouter speaks all gateway chat APIs)', () => {
+    expect(modelServesAllGatewayChatApis('openai/gpt-5-mini')).toBe(true);
+  });
+
+  it('rejects a Kilo-exclusive model served by a chat-completions-only provider', () => {
+    expect(modelServesAllGatewayChatApis('test-exclusive/alibaba-only')).toBe(false);
+    expect(gatewayChatApisForModel('test-exclusive/alibaba-only')).toEqual(['chat_completions']);
+  });
+
+  it('treats disabled Kilo-exclusive models like plain OpenRouter models, matching get-provider', () => {
+    expect(modelServesAllGatewayChatApis(seed_20_code_free_model.public_id)).toBe(true);
+  });
+
+  it('falls back to OpenRouter for unknown model ids', () => {
+    expect(modelServesAllGatewayChatApis('made-up/model')).toBe(true);
+  });
+});
diff --git a/apps/web/src/lib/ai-gateway/model-api-kinds.ts b/apps/web/src/lib/ai-gateway/model-api-kinds.ts
new file mode 100644
index 0000000000..77c00058bf
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/model-api-kinds.ts
@@ -0,0 +1,33 @@
+import { findKiloExclusiveModel } from '@/lib/ai-gateway/models';
+import PROVIDERS from '@/lib/ai-gateway/providers/provider-definitions';
+import type { GatewayChatApiKind } from '@/lib/ai-gateway/providers/types';
+
+const GATEWAY_CHAT_API_KINDS: readonly GatewayChatApiKind[] = [
+  'chat_completions',
+  'responses',
+  'messages',
+];
+
+/**
+ * The gateway chat API kinds the model's serving provider can speak, derived
+ * from the provider the gateway would route it to. Mirrors get-provider.ts's
+ * static fallback resolution — a Kilo-exclusive model is served by its
+ * declared gateway, everything else by OpenRouter.
+ */
+export function gatewayChatApisForModel(modelId: string): ReadonlyArray<GatewayChatApiKind> {
+  const gateway = findKiloExclusiveModel(modelId)?.gateway;
+  const declared = Object.values(PROVIDERS).find(candidate => candidate.id === gateway);
+  // No exclusive model, or no provider matching its gateway: use OpenRouter.
+  return (declared ?? PROVIDERS.OPENROUTER).supportedChatApis;
+}
+
+/**
+ * Guards admin saves of the auto-routing benchmark config: routing-table
+ * candidates carry no per-protocol metadata, so every decider model must be
+ * servable on ALL gateway chat API kinds — otherwise the gateway would hard-
+ * reject requests whose protocol the model's provider can't speak.
+ */
+export function modelServesAllGatewayChatApis(modelId: string): boolean {
+  const providerKinds = new Set<GatewayChatApiKind>(gatewayChatApisForModel(modelId));
+  return !GATEWAY_CHAT_API_KINDS.some(kind => !providerKinds.has(kind));
+}
diff --git a/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts b/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts
index d57b604274..197941a306 100644
--- a/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts
+++ b/apps/web/src/lib/ai-gateway/providers/openrouter/index.ts
@@ -32,7 +32,7 @@ import { applyCustomPricingToModel } from '@/lib/ai-gateway/custom-pricing';
 export { normalizeModelId } from '@/lib/ai-gateway/model-utils';
 
 function buildAutoModels(): OpenRouterModel[] {
-  return AUTO_MODELS.map(m => {
+  return AUTO_MODELS.filter(m => m.status === 'public').map(m => {
     const input_modalities = ['text'];
     if (m.supports_images) {
       input_modalities.push('image');
diff --git a/apps/web/src/lib/ai-gateway/worker-admin-fetch.ts b/apps/web/src/lib/ai-gateway/worker-admin-fetch.ts
new file mode 100644
index 0000000000..855f99b920
--- /dev/null
+++ b/apps/web/src/lib/ai-gateway/worker-admin-fetch.ts
@@ -0,0 +1,56 @@
+import { INTERNAL_API_SECRET } from '@/lib/config.server';
+import * as z from 'zod';
+
+export type WorkerAdminResult<T> = {
+  status: number;
+  body: T;
+};
+
+export type ErrorBody = { error: string };
+export const ErrorBodySchema = z.object({ error: z.string() });
+
+type WorkerAdminRequestInit = Omit<RequestInit, 'headers'> & {
+  headers?: Record<string, string>;
+};
+
+// Builds a fetch helper for a worker's admin API. It sends the internal bearer
+// token, validates success bodies against `schema`, and maps failures
+// (including non-JSON error bodies) to `{ status, body: { error } }`.
+export function createWorkerAdminFetch(options: {
+  workerUrl: string | undefined;
+  unconfiguredError: string;
+}) {
+  return async function fetchAdmin<T>(
+    path: string,
+    init: WorkerAdminRequestInit,
+    schema: z.ZodType<T>
+  ): Promise<WorkerAdminResult<T | ErrorBody>> {
+    if (!options.workerUrl || !INTERNAL_API_SECRET) {
+      return { status: 500, body: { error: options.unconfiguredError } };
+    }
+
+    const response = await fetch(`${options.workerUrl}${path}`, {
+      ...init,
+      headers: {
+        authorization: `Bearer ${INTERNAL_API_SECRET}`,
+        ...init.headers,
+      },
+    });
+
+    if (!response.ok) {
+      // Error bodies are not guaranteed to be JSON (e.g. an HTML 502 page);
+      // keep the upstream status instead of throwing while parsing.
+      let errorBody: unknown = null;
+      try {
+        errorBody = await response.json();
+      } catch {
+        // Not JSON: fall through to the generic message below.
+      }
+      const parsedError = ErrorBodySchema.safeParse(errorBody);
+      const fallback: ErrorBody = { error: `Request failed: ${response.status}` };
+      return { status: response.status, body: parsedError.success ? parsedError.data : fallback };
+    }
+
+    return { status: response.status, body: schema.parse(await response.json()) };
+  };
+}
diff --git a/apps/web/src/lib/config.server.ts b/apps/web/src/lib/config.server.ts
index 6240690665..a0812e0c36 100644
--- a/apps/web/src/lib/config.server.ts
+++ b/apps/web/src/lib/config.server.ts
@@ -369,6 +369,11 @@ export const SESSION_INGEST_WORKER_URL = getEnvVariable('SESSION_INGEST_WORKER_U
 // Auto routing worker
 export const AUTO_ROUTING_WORKER_URL = getEnvVariable('AUTO_ROUTING_WORKER_URL') || '';
 
+// Auto routing benchmark worker
+export const AUTO_ROUTING_BENCHMARK_WORKER_URL =
+  getEnvVariable('AUTO_ROUTING_BENCHMARK_WORKER_URL') ||
+  'https://auto-routing-benchmark.kiloapps.io';
+
 // Security Agent sync Worker command ingress
 export const SECURITY_SYNC_WORKER_URL = getEnvVariable('SECURITY_SYNC_WORKER_URL') || '';
 // Security Agent auto-analysis Worker command ingress
diff --git a/dev/local/services.test.ts b/dev/local/services.test.ts
index b7faf19c7f..327ff96a1d 100644
--- a/dev/local/services.test.ts
+++ b/dev/local/services.test.ts
@@ -4,15 +4,24 @@ import test from 'node:test';
 
 import { getAlwaysOnGroupIds, getService, resolveGroups } from './services';
 
-test('starts auto routing as a core dev service', () => {
+test('keeps auto routing workers in their own opt-in group', () => {
   const service = getService('auto-routing');
 
-  assert.equal(service.group, 'core');
+  assert.equal(service.group, 'auto-routing');
   assert.equal(service.type, 'worker');
   assert.equal(service.dir, 'services/auto-routing');
   assert.equal(service.port, 8810);
   assert.match(service.command.join(' '), /pnpm run dev/);
-  assert.ok(resolveGroups(getAlwaysOnGroupIds()).includes('auto-routing'));
+
+  const benchmark = getService('auto-routing-benchmark');
+  assert.equal(benchmark.group, 'auto-routing');
+  assert.equal(benchmark.type, 'worker');
+  assert.equal(benchmark.dir, 'services/auto-routing-benchmark');
+  assert.equal(benchmark.port, 8814);
+
+  const alwaysOn = resolveGroups(getAlwaysOnGroupIds());
+  assert.ok(!alwaysOn.includes('auto-routing'));
+  assert.ok(!alwaysOn.includes('auto-routing-benchmark'));
 });
 
 test('keeps auto routing package dev script compatible with local launcher flags', () => {
diff --git a/dev/local/services.ts b/dev/local/services.ts
index ac2a081187..5e377f8309 100644
--- a/dev/local/services.ts
+++ b/dev/local/services.ts
@@ -48,6 +48,7 @@ const groups: ServiceGroup[] = [
   },
   { id: 'deploy', label: 'Deploy', alwaysOn: false },
   { id: 'observability', label: 'Observability', alwaysOn: false },
+  { id: 'auto-routing', label: 'Auto Routing', alwaysOn: false, sectionBreakBefore: true },
   { id: 'mobile', label: 'Mobile', alwaysOn: false, sectionBreakBefore: true },
   { id: 'storybook', label: 'Storybook', alwaysOn: false, sectionBreakBefore: true },
 ];
@@ -74,17 +75,23 @@ const serviceMeta: Record<string, ServiceMeta> = {
   // core
   nextjs: {
     group: 'core',
-    dependsOn: ['postgres', 'redis', 'redis-http', 'stripe', 'auto-routing'],
+    dependsOn: ['postgres', 'redis', 'redis-http', 'stripe'],
   },
   postgres: { group: 'core', dependsOn: [] },
   redis: { group: 'core', dependsOn: [] },
   'redis-http': { group: 'core', dependsOn: ['redis'] },
   stripe: { group: 'core', dependsOn: [] },
+  // auto-routing (kilo-auto/efficient decision engine + benchmark runner)
   'auto-routing': {
-    group: 'core',
+    group: 'auto-routing',
     dependsOn: [],
     dir: 'services/auto-routing',
   },
+  'auto-routing-benchmark': {
+    group: 'auto-routing',
+    dependsOn: [],
+    dir: 'services/auto-routing-benchmark',
+  },
   // cloud-agent
   'cloud-agent-next': {
     group: 'cloud-agent',
@@ -367,6 +374,23 @@ export function getAllInfraProfiles(): string[] {
   return [...new Set(Object.values(INFRA_PROFILES))];
 }
 
+// Wrangler always pulls its container egress-interceptor sidecar
+// (cloudflare/proxy-everything) with --platform linux/amd64. On Apple Silicon
+// the emulated amd64 proxy crashes at startup ("setsockopt: protocol not
+// available" — its transparent-proxy socket options don't survive Rosetta),
+// which surfaces as "Failed to start container" for every local container.
+// Point wrangler at the same proxy version's linux/arm64 manifest instead:
+// pulling a single-platform manifest digest with --platform amd64 only warns.
+// Keep the digest in sync with DEFAULT_CONTAINER_EGRESS_INTERCEPTOR_IMAGE in
+// the pinned wrangler/miniflare version (tag 3cb1195).
+const CONTAINER_EGRESS_IMAGE_ARM64 =
+  'cloudflare/proxy-everything:3cb1195@sha256:78c7910f4575a511d928d7824b1cbcaec6b7c4bf4dbb3fafaeeae3104030e73c';
+
+function containerEgressImageEnvPrefix(): string[] {
+  const override = `MINIFLARE_CONTAINER_EGRESS_IMAGE=${CONTAINER_EGRESS_IMAGE_ARM64}`;
+  return process.arch === 'arm64' ? ['env', override] : [];
+}
+
 function buildServiceDefs(): ServiceDef[] {
   const repoRoot = path.resolve(import.meta.dirname, '../..');
   const defs: ServiceDef[] = [];
@@ -513,6 +537,7 @@ function buildServiceDefs(): ServiceDef[] {
     const inspectorPort = port + 10000;
 
     const command = [
+      ...containerEgressImageEnvPrefix(),
       'pnpm',
       'run',
       'dev',
diff --git a/docs/adr/0002-auto-routing-efficient.md b/docs/adr/0002-auto-routing-efficient.md
new file mode 100644
index 0000000000..8060347c43
--- /dev/null
+++ b/docs/adr/0002-auto-routing-efficient.md
@@ -0,0 +1,169 @@
+# ADR 0002: Benchmark-Driven Auto Routing (`kilo-auto/efficient`)
+
+## Status
+
+Accepted
+
+## Context
+
+`kilo-auto/*` virtual models route a request to a concrete model on the user's
+behalf. The existing `balanced` tier picks a single fixed default (Qwen). We want
+a tier that routes each request to the *cheapest model proven accurate enough for
+that request's difficulty*, where "proven" means measured by our own benchmarks
+rather than asserted by hand.
+
+This requires three capabilities the codebase did not have: a way to benchmark
+candidate models reproducibly, a way to turn benchmark results into a routing
+decision per request, and a way to bill the routing overhead honestly. The model
+must ship hidden so it can be validated on Kilo team traffic before it competes
+with `balanced` for real users.
+
+## Decision
+
+Introduce a hidden virtual model `kilo-auto/efficient` backed by a
+benchmark-driven decision engine. Ownership is split across three components with
+strict, one-directional dependencies:
+
+- **`services/auto-routing-benchmark`** (new worker) owns *measurement and
+  publication*. It runs the classifier and decider benchmarks, stores normalized
+  results in its own D1, and publishes two artifacts: a per-difficulty-tier
+  routing table and a classifier winner. It is the **sole writer** of both.
+- **`services/auto-routing`** owns *the per-request decision*. Its `/decide`
+  endpoint classifies the request, derives a difficulty tier, and reads (never
+  writes) the published artifacts to pick a model. Session stickiness lives in a
+  Durable Object here.
+- **`apps/web` gateway** owns *exposure and billing*. It resolves
+  `kilo-auto/efficient`, blocks on `/decide`, falls back to balanced Qwen, bills
+  the classifier cost, and hosts the admin panel (proxied to the benchmark worker
+  with the internal secret).
+
+Shared request-classification code (prompt, parsing, taxonomy, tier derivation,
+routing-table schema) lives in `packages/auto-routing-contracts` so the benchmark
+replays the exact code production runs.
+
+## Invariants (what not to change without revisiting this ADR)
+
+1. **The benchmark worker is the only writer of routing tables and the classifier
+   winner.** The decision engine and gateway read them through a cache chain
+   (isolate 60s → KV 1h → service binding to D1) and never write back.
+2. **No fabricated data.** There is no default routing table and no default
+   benchmark config. `/decide` returns a null decision until a benchmark
+   publishes a table; the gateway then serves the balanced fallback. Runs refuse
+   to start without a saved config; decider runs additionally require a
+   `benchmarkUserId`.
+3. **Graceful degradation at every layer.** Corrupt KV → treated as a miss;
+   origin failure → previous behavior (stale table stays live); classifier
+   failure / `/decide` timeout (2s) → null decision → balanced fallback; publish
+   with any empty tier → skipped, previous table stays live. An
+   `efficient` request must never degrade *below* balanced.
+4. **Results are reproducible.** Grading is mechanical only (`exact` /
+   `contains_all` / `regex` / `json_equal`), never LLM-judged. Each run snapshots
+   its config (`min_accuracy`, `switch_cost_factor`, `max_concurrency`,
+   `benchmark_user_id`, per-model `reasoning_effort`); all processing and
+   publishing reads the snapshot, not live config.
+5. **Carried results are identity-gated.** A prior model's summaries are reused on
+   a new run only when the engine identity (dataset + grading/CLI version),
+   repetition count, and the model's `reasoning_effort` all match. Any change
+   re-benchmarks the affected model rather than silently mixing incomparable
+   numbers.
+6. **One active run per kind.** A partial unique index plus a server-side check
+   admit at most one `running` classifier and one `running` decider run; a second
+   start returns 409, not 500. Stale runs are swept to `failed` on run listing.
+7. **The model stays hidden** (excluded from `/models`, usable by id) until team
+   validation graduates it. Graduation criteria live in the rollout section
+   below, not in code.
+8. **Token boundary.** The decider CLI authenticates as a real Kilo user via a 6h
+   token minted by `apps/web`'s internal endpoint (gated by
+   `INTERNAL_API_SECRET`). The token only ever lives in a child-process env var —
+   never logged, never written to disk.
+
+## Billing policy
+
+The classifier LLM runs on Kilo's OpenRouter credential during model resolution,
+so its cost is owed regardless of how the request ends. It is billed as a separate
+microdollar usage row (`requested_model: kilo-auto/efficient`, model
+`auto-routing/classifier`) to the authenticated requesting user, scheduled as soon
+as auth resolves so it survives every downstream rejection path (abuse block,
+provider/api-kind rejection, balance/org checks, upstream 4xx). It is billed even
+when the final inference is BYOK (the classifier was not BYOK). It is skipped
+entirely for anonymous requests (which never reach a paid classification) and is
+deliberately excluded from generic first-usage lifecycle events so the overhead
+row cannot be mis-attributed as a user's first model.
+
+## Sticky-session rule
+
+A conversation's Durable Object remembers the last served model. The incumbent is
+kept while it still meets the tier's accuracy threshold, unless the fresh pick is
+cheaper by more than the table's `switchCostFactor`. Rationale: a model switch
+discards the provider's prompt cache, and rebuilding it costs full-price input
+tokens (4–10× cache-read rates) on a context that dominates agent-session spend —
+switching only pays off when recurring per-turn savings clearly exceed that
+one-time penalty. Stickiness trusts only real classifier output; heuristic
+fallbacks never re-anchor the session's model.
+
+## Alternatives considered
+
+- **Reuse the model-experiment tooling.** Model experiments are explicit,
+  user-selected preview ids; per `.specs/model-experiments.md` they must never
+  enter automatic `kilo-auto` candidate sets (enforced at config-save time). They
+  give no per-difficulty accuracy/cost signal and no routing table, so they cannot
+  drive automatic routing.
+- **Offline benchmarks + hand-maintained routing tables.** Rejected: a
+  hand-maintained table is fabricated data that drifts from reality, has no
+  reproducible provenance, and cannot be re-derived after a model or prompt
+  change. Making the benchmark the source of truth is the whole point.
+- **A narrower first PR (e.g. classifier-only, or routing without benchmarks).**
+  Considered, but the pieces are not independently useful: a routing engine with
+  no published table has nothing to route from, and a benchmark with no consumer
+  publishes into the void. The smallest *shippable* unit is the full loop behind a
+  hidden model — which is why it ships hidden rather than as smaller live
+  increments.
+- **LLM-judged grading.** Rejected for reproducibility: re-running a benchmark
+  must yield comparable numbers. Mechanical checks are deterministic; golden
+  answers were hand-derived and mechanically re-verified.
+
+## Rollout / cutover
+
+1. Gateway side ships with the merge (Vercel): the hidden model, admin panel, and
+   token mint.
+2. The first post-merge worker deploy applies the D1 migration via the CI
+   predeploy hook (`wrangler d1 migrations apply --remote`); CI's
+   `CLOUDFLARE_API_TOKEN` needs D1 edit permission.
+3. An admin saves a benchmark config (decider runs require `benchmarkUserId` —
+   prefer a dedicated service account, as it is billed for CLI usage) and triggers
+   a classifier and a decider run.
+4. Graduation from hidden to broader use is a judgement call made on team traffic;
+   target signals are a measured cost reduction versus balanced at
+   non-inferior accuracy, and no regression in fallback rate. These live here, not
+   in code, so changing them is a deliberate decision.
+
+### Rollback
+
+`kilo-auto/efficient` is hidden and additive, so rollback is containment, not
+revert:
+
+- **Disable the model**: stop routing to it. Because it is hidden, no `/models`
+  consumer depends on it; the gateway already serves balanced on any null
+  decision, so forcing null decisions (or reverting the gateway deploy) degrades
+  cleanly to balanced.
+- **Clear published artifacts**: delete the routing-table and classifier-winner
+  KV keys in `AUTO_ROUTING_CONFIG`; `/decide` then returns null until a benchmark
+  republishes, i.e. balanced fallback everywhere.
+- **Stop benchmark activity**: pause/avoid triggering runs from the admin panel;
+  in-flight queue jobs drain or fail into the DLQ (see the service README).
+- **Worker rollback**: redeploy the previous `auto-routing` / `auto-routing-benchmark`
+  worker versions. The D1 schema is additive; if a predeploy migration fails the
+  deploy fails before serving, leaving the prior version live.
+
+## Consequences
+
+This adds a new worker, D1 schema, queue + DLQ, container runner, gateway routing,
+billing path, and admin UI in one merge. The cost is a large surface landing
+together; the benefit is that the surface is the smallest *coherent* one (each
+piece is inert without the others) and it lands hidden, so production exposure is
+gated on explicit team validation. The benchmark-as-source-of-truth design means
+routing decisions are always traceable to a reproducible run, and adding a
+candidate model re-benchmarks only that model rather than the whole set.
+
+Operational ownership and local-dev/DLQ debugging live in
+`services/auto-routing-benchmark/README.md`.
diff --git a/packages/auto-routing-contracts/package.json b/packages/auto-routing-contracts/package.json
index 43e1bd2cfd..6ea28e8178 100644
--- a/packages/auto-routing-contracts/package.json
+++ b/packages/auto-routing-contracts/package.json
@@ -6,7 +6,8 @@
   "main": "./src/index.ts",
   "types": "./src/index.ts",
   "exports": {
-    ".": "./src/index.ts"
+    ".": "./src/index.ts",
+    "./classifier": "./src/classifier/index.ts"
   },
   "scripts": {
     "typecheck": "tsgo --noEmit",
@@ -14,9 +15,11 @@
     "test": "vitest run"
   },
   "dependencies": {
+    "@openrouter/sdk": "^0.12.79",
     "zod": "catalog:"
   },
   "devDependencies": {
+    "@types/node": "catalog:",
     "@typescript/native-preview": "catalog:",
     "typescript": "catalog:",
     "vitest": "catalog:"
diff --git a/packages/auto-routing-contracts/src/benchmark.ts b/packages/auto-routing-contracts/src/benchmark.ts
new file mode 100644
index 0000000000..8409b7f743
--- /dev/null
+++ b/packages/auto-routing-contracts/src/benchmark.ts
@@ -0,0 +1,146 @@
+import * as z from 'zod';
+import { RoutingTableSchema } from './routing-table';
+import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
+
+export { ReasoningEffortSchema } from './tiers';
+export type { ReasoningEffort } from './tiers';
+
+export const BenchmarkKindSchema = z.enum(['classifier', 'decider']);
+export type BenchmarkKind = z.infer<typeof BenchmarkKindSchema>;
+
+export const BenchmarkDeciderModelSchema = z.object({
+  id: z.string().trim().min(1),
+  // Passed to the kilo CLI as --variant during the benchmark and carried into
+  // the routing table so serving uses the same effort the model was graded
+  // with. Null for models without (or not using) configurable reasoning.
+  reasoningEffort: ReasoningEffortSchema.nullable().default(null),
+});
+export type BenchmarkDeciderModel = z.infer<typeof BenchmarkDeciderModelSchema>;
+
+// Flags each list entry whose (trimmed) id already appeared earlier in the
+// array. Model ids are the D1 primary keys for config_classifier_models /
+// config_decider_models, so duplicates would otherwise reach the DB as an
+// opaque constraint violation (HTTP 500) instead of an actionable 400.
+function addDuplicateModelIssues(ids: string[], path: string, ctx: z.RefinementCtx): void {
+  const seen = new Set<string>();
+  for (const [index, id] of ids.entries()) {
+    if (seen.has(id)) {
+      ctx.addIssue({
+        code: 'custom',
+        path: [path, index],
+        message: `Duplicate model id: ${id}`,
+      });
+    }
+    seen.add(id);
+  }
+}
+
+export const BenchmarkConfigSchema = z
+  .object({
+    classifierModels: z.array(z.string().trim().min(1)).min(1),
+    deciderModels: z.array(BenchmarkDeciderModelSchema).min(1),
+    // Accuracy threshold for "gets the job done" (per tier).
+    minAccuracy: z.number().min(0).max(1),
+    // Parallel OpenRouter calls per queue message.
+    maxConcurrency: z.number().int().min(1).max(16),
+    // The Kilo user whose identity/billing the decider CLI runs execute under.
+    // Null until an admin configures it; decider runs fail fast while null.
+    benchmarkUserId: z.string().trim().min(1).nullable(),
+    // Session stickiness knob carried into published routing tables: a session
+    // stays on its incumbent model while it meets the tier's accuracy
+    // threshold, unless the fresh pick is cheaper by more than this factor.
+    // Model switches discard provider prompt caches (cache reads are far
+    // cheaper than fresh input tokens), so switching only pays off when the
+    // recurring savings clearly outweigh the cache-rebuild penalty.
+    switchCostFactor: z.number().min(1).max(100),
+    // How many times to repeat each case for classifier / decider benchmarks.
+    // Repeated runs reduce variance; the default of 1 preserves the current
+    // single-pass behaviour.
+    classifierRepetitions: z.number().int().min(1).max(5).default(1),
+    deciderRepetitions: z.number().int().min(1).max(5).default(1),
+    // Maximum acceptable p95 latency for the classifier winner; null means no
+    // constraint (cost-only selection).
+    classifierMaxP95LatencyMs: z.number().int().positive().nullable().default(1000),
+    updatedAt: z.string().nullable(),
+    updatedBy: z.string().nullable(),
+  })
+  .superRefine((config, ctx) => {
+    addDuplicateModelIssues(config.classifierModels, 'classifierModels', ctx);
+    addDuplicateModelIssues(
+      config.deciderModels.map(m => m.id),
+      'deciderModels',
+      ctx
+    );
+  });
+export type BenchmarkConfig = z.infer<typeof BenchmarkConfigSchema>;
+
+export const BenchmarkRunStatusSchema = z.enum(['running', 'completed', 'failed']);
+export type BenchmarkRunStatus = z.infer<typeof BenchmarkRunStatusSchema>;
+
+export const BenchmarkModelSummarySchema = z.object({
+  model: z.string(),
+  // '*' for classifier runs (no tiering), otherwise the difficulty tier.
+  tier: z.union([DifficultyTierSchema, z.literal('*')]),
+  accuracy: z.number(),
+  avgCostUsd: z.number().nullable(),
+  avgLatencyMs: z.number(),
+  p50LatencyMs: z.number().nullable(),
+  p95LatencyMs: z.number().nullable(),
+  cases: z.number().int(),
+  errors: z.number().int(),
+  timeouts: z.number().int().default(0),
+});
+export type BenchmarkModelSummary = z.infer<typeof BenchmarkModelSummarySchema>;
+
+export const BenchmarkRunSchema = z.object({
+  id: z.string(),
+  kind: BenchmarkKindSchema,
+  status: BenchmarkRunStatusSchema,
+  startedAt: z.string(),
+  completedAt: z.string().nullable(),
+  error: z.string().nullable(),
+  summaries: z.array(BenchmarkModelSummarySchema),
+});
+export type BenchmarkRun = z.infer<typeof BenchmarkRunSchema>;
+
+export const BenchmarkRunsResponseSchema = z.object({ runs: z.array(BenchmarkRunSchema) });
+// config is null until an admin saves one — the worker never fabricates a
+// default config, and runs cannot start without a saved one.
+export const BenchmarkConfigResponseSchema = z.object({
+  config: BenchmarkConfigSchema.nullable(),
+});
+export const StartBenchmarkRunRequestSchema = z.object({
+  kind: BenchmarkKindSchema,
+  // Re-run every configured model even when prior results exist.
+  force: z.boolean().default(false),
+});
+export const StartBenchmarkRunResponseSchema = z.object({
+  runId: z.string(),
+  enqueuedModels: z.number().int(),
+  skippedModels: z.array(z.string()).default([]),
+});
+
+export const BenchmarkRoutingTableResponseSchema = z.object({
+  table: RoutingTableSchema.nullable(),
+  publishedAt: z.string().nullable(),
+});
+export type BenchmarkRoutingTableResponse = z.infer<typeof BenchmarkRoutingTableResponseSchema>;
+
+// The cheapest classifier candidate meeting the accuracy threshold, derived
+// on read from the latest completed classifier run (served via
+// /admin/classifier-winner and cached in the auto-routing KV namespace).
+export const ClassifierWinnerSchema = z.object({
+  model: z.string().trim().min(1),
+  runId: z.string(),
+  accuracy: z.number(),
+  p95LatencyMs: z.number().nullable().default(null),
+  generatedAt: z.string(),
+});
+export type ClassifierWinner = z.infer<typeof ClassifierWinnerSchema>;
+
+export const CLASSIFIER_WINNER_KV_KEY = 'classifier_benchmark_winner';
+
+export const ClassifierWinnerResponseSchema = z.object({
+  winner: ClassifierWinnerSchema.nullable(),
+});
+export type ClassifierWinnerResponse = z.infer<typeof ClassifierWinnerResponseSchema>;
diff --git a/packages/auto-routing-contracts/src/classifier/index.ts b/packages/auto-routing-contracts/src/classifier/index.ts
new file mode 100644
index 0000000000..c3d38367e4
--- /dev/null
+++ b/packages/auto-routing-contracts/src/classifier/index.ts
@@ -0,0 +1,13 @@
+export { buildClassifierMessages, CLASSIFIER_MAX_TOKENS, DEFAULT_CLASSIFIER_MODEL } from './prompt';
+export { default as classifierTaxonomy } from './taxonomy.json';
+export { ClassifierOutputParseError, parseClassifierOutput, type ClassifierOutput } from './output';
+export { fallbackClassifierOutput } from './output-fallback';
+export {
+  classifyWithOpenRouter,
+  ClassifierRunError,
+  type ClassifierCallOptions,
+  type ClassifierModelCallMeta,
+  type ClassifierRunFailureMetadata,
+  type ClassifierRunFallbackMetadata,
+  type ClassifierRunResult,
+} from './model-classifier';
diff --git a/services/auto-routing/src/model-classifier.test.ts b/packages/auto-routing-contracts/src/classifier/model-classifier.test.ts
similarity index 97%
rename from services/auto-routing/src/model-classifier.test.ts
rename to packages/auto-routing-contracts/src/classifier/model-classifier.test.ts
index 622409612e..de54484a8d 100644
--- a/services/auto-routing/src/model-classifier.test.ts
+++ b/packages/auto-routing-contracts/src/classifier/model-classifier.test.ts
@@ -1,9 +1,9 @@
 import { describe, expect, it, vi } from 'vitest';
 import type { OpenRouter } from '@openrouter/sdk';
 import type { ChatResult } from '@openrouter/sdk/models';
-import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt';
+import { DEFAULT_CLASSIFIER_MODEL } from './prompt';
 import { ClassifierRunError, classifyWithOpenRouter } from './model-classifier';
-import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
+import type { NormalizedClassifierInput } from '../index';
 
 const normalizedInput = {
   apiKind: 'responses',
diff --git a/packages/auto-routing-contracts/src/classifier/model-classifier.ts b/packages/auto-routing-contracts/src/classifier/model-classifier.ts
new file mode 100644
index 0000000000..645276dd6a
--- /dev/null
+++ b/packages/auto-routing-contracts/src/classifier/model-classifier.ts
@@ -0,0 +1,200 @@
+import type { OpenRouter } from '@openrouter/sdk';
+import type { ChatResult } from '@openrouter/sdk/models';
+import { buildClassifierMessages, CLASSIFIER_MAX_TOKENS } from './prompt';
+import type { NormalizedClassifierInput } from '../index';
+import { ClassifierOutputParseError, parseClassifierOutput, type ClassifierOutput } from './output';
+import { fallbackClassifierOutput } from './output-fallback';
+
+export type ClassifierRunResult = {
+  cost: number | null;
+  classifierModel: string;
+  classification: ClassifierOutput;
+  fallback?: ClassifierRunFallbackMetadata;
+  modelCallMeta?: ClassifierModelCallMeta;
+  retried?: boolean;
+  // Why the first attempt was retried; present only when retried is true.
+  firstAttemptFailure?: {
+    reason: string;
+    failureStage: string | null;
+    finishReason: string | null;
+  };
+};
+
+export type ClassifierModelCallMeta = {
+  finishReason: string | null;
+  completionTokens: number | null;
+  reasoningTokens: number | null;
+  // Length only — the raw output is derived from untrusted, mirrored user
+  // prompts and must not reach persistent logs. Combined with finishReason
+  // and token counts this still distinguishes truncation from prompt echo.
+  textLength: number | null;
+};
+
+export type ClassifierRunFailureMetadata = {
+  cost: number | null;
+  classifierModel: string;
+  failureStage?: string;
+  schemaIssueSummary?: string[];
+  topLevelKeys?: string[];
+};
+
+export type ClassifierRunFallbackMetadata = {
+  reason: 'no_text' | 'invalid_output';
+  failureStage?: string;
+  schemaIssueSummary?: string[];
+  topLevelKeys?: string[];
+};
+
+export class ClassifierRunError extends Error {
+  readonly cost: number | null;
+  readonly classifierModel: string;
+  readonly failureStage?: string;
+  readonly schemaIssueSummary: string[];
+  readonly topLevelKeys: string[];
+
+  constructor(message: string, metadata: ClassifierRunFailureMetadata) {
+    super(message);
+    this.name = 'ClassifierRunError';
+    this.cost = metadata.cost;
+    this.classifierModel = metadata.classifierModel;
+    this.failureStage = metadata.failureStage;
+    this.schemaIssueSummary = metadata.schemaIssueSummary ?? [];
+    this.topLevelKeys = metadata.topLevelKeys ?? [];
+  }
+}
+
+export type ClassifierCallOptions = {
+  // Sticky routing key passed to OpenRouter so requests from the same
+  // session land on the same provider and reuse its prompt cache.
+  openrouterSessionId?: string;
+};
+
+export async function classifyWithOpenRouter(
+  client: OpenRouter,
+  input: NormalizedClassifierInput,
+  classifierModel: string,
+  options: ClassifierCallOptions = {}
+): Promise<ClassifierRunResult> {
+  // A fallback on the first call is retried exactly once: invalid output is
+  // usually a transient provider glitch (a response cut off after a handful
+  // of tokens with a "stop" finish reason) that a second call recovers from.
+  const initial = await runClassifierAttempt(client, input, classifierModel, options);
+  const initialFallback = initial.fallback;
+  if (!initialFallback) return initial;
+
+  let second: ClassifierRunResult;
+  try {
+    second = await runClassifierAttempt(client, input, classifierModel, options);
+  } catch (error) {
+    // A throwing retry (e.g. a transport error) must not hide the first
+    // attempt, which was already billed and produced diagnostics: rethrow
+    // with that attempt's cost and failure details attached.
+    const message = error instanceof Error ? error.message : 'classifier retry failed';
+    throw new ClassifierRunError(message, {
+      cost: initial.cost,
+      classifierModel,
+      failureStage: initialFallback.failureStage ?? initialFallback.reason,
+      schemaIssueSummary: initialFallback.schemaIssueSummary,
+      topLevelKeys: initialFallback.topLevelKeys,
+    });
+  }
+
+  // The retry's outcome wins; both costs are summed and the first failure is kept.
+  const firstAttemptFailure = {
+    reason: initialFallback.reason,
+    failureStage: initialFallback.failureStage ?? null,
+    finishReason: initial.modelCallMeta?.finishReason ?? null,
+  };
+  return {
+    ...second,
+    cost: sumCosts(initial.cost, second.cost),
+    retried: true,
+    firstAttemptFailure,
+  };
+}
+
+function sumCosts(first: number | null, second: number | null): number | null {
+  // Stays null only when neither attempt reported a cost; a lone null counts as 0.
+  return first === null && second === null ? null : (first ?? 0) + (second ?? 0);
+}
+
+async function runClassifierAttempt(
+  client: OpenRouter,
+  input: NormalizedClassifierInput,
+  classifierModel: string,
+  options: ClassifierCallOptions
+): Promise<ClassifierRunResult> {
+  // The session id is forwarded only when the caller supplied one.
+  const sessionRouting = options.openrouterSessionId
+    ? { sessionId: options.openrouterSessionId }
+    : {};
+  const response = await client.chat.send({
+    chatRequest: {
+      model: classifierModel,
+      messages: buildClassifierMessages(input),
+      responseFormat: { type: 'json_object' },
+      stream: false,
+      temperature: 0,
+      maxTokens: CLASSIFIER_MAX_TOKENS,
+      ...sessionRouting,
+    },
+  });
+
+  // Cost and call metadata are captured before parsing so that a fallback
+  // result still reports them.
+  const cost = response.usage?.cost ?? null;
+  const text = extractClassifierText(response);
+  const modelCallMeta = extractModelCallMeta(response, text);
+  if (!text) {
+    const noText: ClassifierRunFallbackMetadata = { reason: 'no_text' };
+    return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, noText);
+  }
+
+  let classification: ClassifierOutput;
+  try {
+    classification = parseClassifierOutput(text);
+  } catch (error) {
+    // Parser diagnostics are attached only for the parser's own error type.
+    const invalid: ClassifierRunFallbackMetadata = { reason: 'invalid_output' };
+    if (error instanceof ClassifierOutputParseError) {
+      invalid.failureStage = error.failureStage;
+      invalid.schemaIssueSummary = error.schemaIssueSummary;
+      invalid.topLevelKeys = error.topLevelKeys;
+    }
+    return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, invalid);
+  }
+  return { cost, classifierModel, classification, modelCallMeta };
+}
+
+function extractModelCallMeta(result: ChatResult, text: string | null): ClassifierModelCallMeta {
+  const { choices, usage } = result;
+  const finishReason = choices[0]?.finishReason ?? null;
+  const completionTokens = usage?.completionTokens ?? null;
+  const reasoningTokens = usage?.completionTokensDetails?.reasoningTokens ?? null;
+  // Length only: the text itself is never copied into the metadata.
+  return { finishReason, completionTokens, reasoningTokens, textLength: text?.length ?? null };
+}
+
+function fallbackClassifierResult(
+  input: NormalizedClassifierInput,
+  classifierModel: string,
+  cost: number | null,
+  modelCallMeta: ClassifierModelCallMeta,
+  fallback: ClassifierRunFallbackMetadata
+): ClassifierRunResult {
+  return {
+    cost, // cost of the attempt being replaced, passed through unchanged
+    classifierModel,
+    classification: fallbackClassifierOutput(input), // derived from the input alone
+    fallback, // records why the model's own output was not used
+    modelCallMeta,
+  };
+}
+
+function extractClassifierText(result: ChatResult) {
+  // Only non-blank string content counts; anything else (absent, non-string,
+  // whitespace-only) is reported as null. The string is returned untrimmed.
+  const content: unknown = result.choices[0]?.message.content;
+  const hasText = typeof content === 'string' && content.trim().length > 0;
+  return hasText ? content : null;
+}
diff --git a/services/auto-routing/src/classifier-output/fallback.test.ts b/packages/auto-routing-contracts/src/classifier/output-fallback.test.ts
similarity index 92%
rename from services/auto-routing/src/classifier-output/fallback.test.ts
rename to packages/auto-routing-contracts/src/classifier/output-fallback.test.ts
index c5ee6394a1..6bafe4acf3 100644
--- a/services/auto-routing/src/classifier-output/fallback.test.ts
+++ b/packages/auto-routing-contracts/src/classifier/output-fallback.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
-import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
-import { fallbackClassifierOutput } from './fallback';
+import type { NormalizedClassifierInput } from '../index';
+import { fallbackClassifierOutput } from './output-fallback';
 
 const input = {
   apiKind: 'chat_completions',
diff --git a/services/auto-routing/src/classifier-output/fallback.ts b/packages/auto-routing-contracts/src/classifier/output-fallback.ts
similarity index 92%
rename from services/auto-routing/src/classifier-output/fallback.ts
rename to packages/auto-routing-contracts/src/classifier/output-fallback.ts
index c047813e50..969374b893 100644
--- a/services/auto-routing/src/classifier-output/fallback.ts
+++ b/packages/auto-routing-contracts/src/classifier/output-fallback.ts
@@ -1,5 +1,5 @@
-import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
-import type { ClassifierOutput } from './index';
+import type { NormalizedClassifierInput } from '../index';
+import type { ClassifierOutput } from './output';
 
 type IntentRule = {
   taskType: ClassifierOutput['taskType'];
diff --git a/services/auto-routing/src/classifier-output/index.test.ts b/packages/auto-routing-contracts/src/classifier/output.test.ts
similarity index 99%
rename from services/auto-routing/src/classifier-output/index.test.ts
rename to packages/auto-routing-contracts/src/classifier/output.test.ts
index d57003b00d..e842a4b178 100644
--- a/services/auto-routing/src/classifier-output/index.test.ts
+++ b/packages/auto-routing-contracts/src/classifier/output.test.ts
@@ -4,7 +4,7 @@ import {
   parseClassifierOutput,
   type ClassifierOutputParseError,
   type ClassifierOutput,
-} from './index';
+} from './output';
 
 const validOutput = {
   taskType: 'debugging',
diff --git a/services/auto-routing/src/classifier-output/index.ts b/packages/auto-routing-contracts/src/classifier/output.ts
similarity index 98%
rename from services/auto-routing/src/classifier-output/index.ts
rename to packages/auto-routing-contracts/src/classifier/output.ts
index 1796e4b724..8acd5392fc 100644
--- a/services/auto-routing/src/classifier-output/index.ts
+++ b/packages/auto-routing-contracts/src/classifier/output.ts
@@ -1,5 +1,5 @@
-import { ClassifierOutputSchema, type ClassifierOutput } from '@kilocode/auto-routing-contracts';
-import classifierTaxonomy from '../classifier-taxonomy.json';
+import { ClassifierOutputSchema, type ClassifierOutput } from '../index';
+import classifierTaxonomy from './taxonomy.json';
 
 export const classifierOutputSchema = ClassifierOutputSchema;
 export type { ClassifierOutput };
diff --git a/services/auto-routing/src/classifier-prompt.test.ts b/packages/auto-routing-contracts/src/classifier/prompt.test.ts
similarity index 97%
rename from services/auto-routing/src/classifier-prompt.test.ts
rename to packages/auto-routing-contracts/src/classifier/prompt.test.ts
index 782c5a22c6..e3444fedc4 100644
--- a/services/auto-routing/src/classifier-prompt.test.ts
+++ b/packages/auto-routing-contracts/src/classifier/prompt.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from 'vitest';
-import { buildClassifierMessages, DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt';
-import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
+import { buildClassifierMessages, DEFAULT_CLASSIFIER_MODEL } from './prompt';
+import type { NormalizedClassifierInput } from '../index';
 
 const input = {
   apiKind: 'chat_completions',
diff --git a/services/auto-routing/src/classifier-prompt.ts b/packages/auto-routing-contracts/src/classifier/prompt.ts
similarity index 96%
rename from services/auto-routing/src/classifier-prompt.ts
rename to packages/auto-routing-contracts/src/classifier/prompt.ts
index 641df0fb24..efaf1793fd 100644
--- a/services/auto-routing/src/classifier-prompt.ts
+++ b/packages/auto-routing-contracts/src/classifier/prompt.ts
@@ -1,5 +1,5 @@
-import classifierTaxonomy from './classifier-taxonomy.json';
-import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
+import classifierTaxonomy from './taxonomy.json';
+import type { NormalizedClassifierInput } from '../index';
 
 export const DEFAULT_CLASSIFIER_MODEL = 'google/gemini-2.5-flash-lite';
 // The classification JSON needs ~60 tokens; the headroom avoids truncated
diff --git a/services/auto-routing/src/classifier-taxonomy.json b/packages/auto-routing-contracts/src/classifier/taxonomy.json
similarity index 100%
rename from services/auto-routing/src/classifier-taxonomy.json
rename to packages/auto-routing-contracts/src/classifier/taxonomy.json
diff --git a/services/auto-routing/src/classifier-taxonomy.test.ts b/packages/auto-routing-contracts/src/classifier/taxonomy.test.ts
similarity index 96%
rename from services/auto-routing/src/classifier-taxonomy.test.ts
rename to packages/auto-routing-contracts/src/classifier/taxonomy.test.ts
index dc510492cf..b3a3ab7dd0 100644
--- a/services/auto-routing/src/classifier-taxonomy.test.ts
+++ b/packages/auto-routing-contracts/src/classifier/taxonomy.test.ts
@@ -46,7 +46,7 @@ const TaxonomySchema = z.object({
 });
 
 async function readTaxonomy() {
-  const file = await readFile(join(__dirname, 'classifier-taxonomy.json'), 'utf8');
+  const file = await readFile(join(__dirname, 'taxonomy.json'), 'utf8');
   return TaxonomySchema.parse(JSON.parse(file));
 }
 
diff --git a/packages/auto-routing-contracts/src/contracts.test.ts b/packages/auto-routing-contracts/src/contracts.test.ts
index 56257f8f05..0c826251dc 100644
--- a/packages/auto-routing-contracts/src/contracts.test.ts
+++ b/packages/auto-routing-contracts/src/contracts.test.ts
@@ -6,6 +6,7 @@ import {
   MirrorPayloadSchema,
   UpdateClassifierModelRequestSchema,
 } from './index';
+import { BenchmarkConfigSchema } from './benchmark';
 
 describe('auto routing contracts', () => {
   it('validates the cross-service request and response contracts', () => {
@@ -94,6 +95,8 @@ describe('auto routing contracts', () => {
     expect(
       AutoRoutingClassifierModelResponseSchema.parse({
         model: 'google/gemini-2.5-flash-lite',
+        override: null,
+        benchmarkWinner: 'google/gemini-2.5-flash-lite',
         defaultModel: 'google/gemini-2.5-flash-lite',
       })
     ).toMatchObject({ model: 'google/gemini-2.5-flash-lite' });
@@ -126,3 +129,69 @@ describe('auto routing contracts', () => {
     ).toMatchObject({ period: '24h' });
   });
 });
+
+describe('BenchmarkConfigSchema defaults', () => {
+  it('applies defaults of 1/1/1000 for classifierRepetitions, deciderRepetitions, classifierMaxP95LatencyMs', () => {
+    const result = BenchmarkConfigSchema.parse({
+      classifierModels: ['model/a'],
+      deciderModels: [{ id: 'model/b' }],
+      minAccuracy: 0.8,
+      maxConcurrency: 4,
+      benchmarkUserId: null,
+      switchCostFactor: 2,
+      updatedAt: null,
+      updatedBy: null,
+      // classifierRepetitions, deciderRepetitions, classifierMaxP95LatencyMs intentionally omitted
+    });
+    expect(result.classifierRepetitions).toBe(1);
+    expect(result.deciderRepetitions).toBe(1);
+    expect(result.classifierMaxP95LatencyMs).toBe(1000);
+  });
+});
+
+describe('BenchmarkConfigSchema duplicate model ids', () => {
+  const base = {
+    minAccuracy: 0.8,
+    maxConcurrency: 4,
+    benchmarkUserId: null,
+    switchCostFactor: 2,
+    updatedAt: null,
+    updatedBy: null,
+  };
+
+  it('rejects duplicate classifier model ids with a field-specific issue', () => {
+    const result = BenchmarkConfigSchema.safeParse({
+      ...base,
+      classifierModels: ['model/a', 'model/a'],
+      deciderModels: [{ id: 'model/b' }],
+    });
+    expect(result.success).toBe(false);
+    if (!result.success) {
+      const issue = result.error.issues.find(i => i.path[0] === 'classifierModels');
+      expect(issue?.path).toEqual(['classifierModels', 1]);
+      expect(issue?.message).toContain('Duplicate model id');
+    }
+  });
+
+  it('rejects duplicate decider model ids (trim-normalized)', () => {
+    const result = BenchmarkConfigSchema.safeParse({
+      ...base,
+      classifierModels: ['model/a'],
+      deciderModels: [{ id: 'model/b' }, { id: '  model/b  ' }],
+    });
+    expect(result.success).toBe(false);
+    if (!result.success) {
+      const issue = result.error.issues.find(i => i.path[0] === 'deciderModels');
+      expect(issue?.path).toEqual(['deciderModels', 1]);
+    }
+  });
+
+  it('accepts distinct model ids', () => {
+    const result = BenchmarkConfigSchema.safeParse({
+      ...base,
+      classifierModels: ['model/a', 'model/b'],
+      deciderModels: [{ id: 'model/c' }, { id: 'model/d' }],
+    });
+    expect(result.success).toBe(true);
+  });
+});
diff --git a/packages/auto-routing-contracts/src/index.ts b/packages/auto-routing-contracts/src/index.ts
index ef537f600e..31915439ec 100644
--- a/packages/auto-routing-contracts/src/index.ts
+++ b/packages/auto-routing-contracts/src/index.ts
@@ -1,5 +1,6 @@
 import * as z from 'zod';
 import { NormalizedClassifierInputSchema } from './input';
+import { DifficultyTierSchema, ReasoningEffortSchema } from './tiers';
 
 export {
   NormalizedClassifierInputSchema,
@@ -96,9 +97,25 @@ export const ClassifierOutputSchema = z
   });
 export type ClassifierOutput = z.infer<typeof ClassifierOutputSchema>;
 
+export const AutoRoutingDecisionSchema = z.object({
+  model: z.string(),
+  tier: DifficultyTierSchema,
+  source: z.enum(['benchmark']),
+  tableVersion: z.string(),
+  // Mirrors the effort the chosen model was benchmarked with, when set.
+  reasoningEffort: ReasoningEffortSchema.nullable().optional(),
+  // True when the session's incumbent model was kept over a cheaper fresh
+  // pick. Defaulted so responses from a not-yet-redeployed worker still
+  // parse.
+  sticky: z.boolean().default(false),
+});
+export type AutoRoutingDecision = z.infer<typeof AutoRoutingDecisionSchema>;
+
 export const AutoRoutingDecisionResponseSchema = z.object({
   cost: z.number(),
-  decision: z.null(),
+  // Null when classification failed or no routing table is published; the
+  // gateway then falls back to its static balanced defaults.
+  decision: AutoRoutingDecisionSchema.nullable(),
   classifierResult: z
     .object({
       classification: ClassifierOutputSchema,
@@ -108,13 +125,17 @@ export const AutoRoutingDecisionResponseSchema = z.object({
 });
 export type AutoRoutingDecisionResponse = z.infer<typeof AutoRoutingDecisionResponseSchema>;
 
+// model: null clears the admin override (benchmark winner takes effect).
 export const UpdateClassifierModelRequestSchema = z.object({
-  model: z.string().trim().min(1),
+  model: z.string().trim().min(1).nullable(),
 });
 export type UpdateClassifierModelRequest = z.infer<typeof UpdateClassifierModelRequestSchema>;
 
 export const AutoRoutingClassifierModelResponseSchema = z.object({
+  // Effective model used by /decide: override ?? benchmark winner ?? default.
   model: z.string(),
+  override: z.string().nullable(),
+  benchmarkWinner: z.string().nullable(),
   defaultModel: z.string(),
 });
 export type AutoRoutingClassifierModelResponse = z.infer<
@@ -158,3 +179,7 @@ export type AutoRoutingClassifierAnalyticsResponse = z.infer<
 >;
 
 export { normalizeClassifierInput, redactProviderHints, type ClassifierApiKind } from './normalize';
+
+export * from './tiers';
+export * from './routing-table';
+export * from './benchmark';
diff --git a/packages/auto-routing-contracts/src/routing-table.test.ts b/packages/auto-routing-contracts/src/routing-table.test.ts
new file mode 100644
index 0000000000..edcd573b44
--- /dev/null
+++ b/packages/auto-routing-contracts/src/routing-table.test.ts
@@ -0,0 +1,43 @@
+import { describe, expect, it } from 'vitest';
+import { rankCandidates, RoutingTableSchema } from './routing-table';
+
+const candidate = (model: string, accuracy: number, avgCostUsd: number) => ({
+  model,
+  accuracy,
+  avgCostUsd,
+  meetsThreshold: false,
+});
+
+describe('rankCandidates', () => {
+  it('puts the cheapest above-threshold candidate first', () => {
+    const ranked = rankCandidates(
+      [candidate('expensive', 0.95, 10), candidate('cheap', 0.8, 1), candidate('weak', 0.5, 0.1)],
+      0.7
+    );
+    expect(ranked.map(c => c.model)).toEqual(['cheap', 'expensive', 'weak']);
+    expect(ranked[0].meetsThreshold).toBe(true);
+    expect(ranked[2].meetsThreshold).toBe(false);
+  });
+  it('falls back to highest accuracy when nothing meets the threshold', () => {
+    const ranked = rankCandidates([candidate('a', 0.5, 1), candidate('b', 0.6, 5)], 0.9);
+    expect(ranked[0].model).toBe('b');
+  });
+  it('breaks cost ties by accuracy', () => {
+    const ranked = rankCandidates([candidate('a', 0.8, 1), candidate('b', 0.9, 1)], 0.7);
+    expect(ranked[0].model).toBe('b');
+  });
+});
+
+describe('RoutingTableSchema', () => {
+  it('requires at least one candidate per tier', () => {
+    const result = RoutingTableSchema.safeParse({
+      version: 'v',
+      generatedAt: new Date(0).toISOString(),
+      minAccuracy: 0.7,
+      switchCostFactor: 2, // required field; without it the parse fails regardless of the tiers
+      source: 'benchmark',
+      tiers: { low: [], medium: [candidate('m', 1, 1)], high: [candidate('h', 1, 1)] },
+    });
+    expect(result.success).toBe(false);
+  });
+});
diff --git a/packages/auto-routing-contracts/src/routing-table.ts b/packages/auto-routing-contracts/src/routing-table.ts
new file mode 100644
index 0000000000..ff49e81578
--- /dev/null
+++ b/packages/auto-routing-contracts/src/routing-table.ts
@@ -0,0 +1,51 @@
+import * as z from 'zod';
+import { ReasoningEffortSchema } from './tiers';
+
+export const RankedCandidateSchema = z.object({
+  model: z.string().trim().min(1),
+  // Benchmark accuracy in [0, 1] for this tier.
+  accuracy: z.number().min(0).max(1),
+  // Average observed OpenRouter cost per benchmark case, in USD credits.
+  avgCostUsd: z.number().nonnegative(),
+  meetsThreshold: z.boolean(),
+  // Reasoning effort the model was benchmarked with; serving mirrors it.
+  // Optional so tables published before this field existed stay valid.
+  reasoningEffort: ReasoningEffortSchema.nullable().optional(),
+});
+export type RankedCandidate = z.infer<typeof RankedCandidateSchema>;
+
+export const RoutingTableSchema = z.object({
+  // Benchmark run id.
+  version: z.string().min(1),
+  generatedAt: z.string().min(1),
+  minAccuracy: z.number().min(0).max(1),
+  // Keep a session's incumbent model unless the fresh pick is cheaper by
+  // more than this factor (see BenchmarkConfigSchema.switchCostFactor).
+  switchCostFactor: z.number().min(1),
+  source: z.enum(['benchmark']),
+  tiers: z.object({
+    low: z.array(RankedCandidateSchema).min(1),
+    medium: z.array(RankedCandidateSchema).min(1),
+    high: z.array(RankedCandidateSchema).min(1),
+  }),
+});
+export type RoutingTable = z.infer<typeof RoutingTableSchema>;
+
+export const ROUTING_TABLE_KV_KEY = 'routing_table_v1';
+
+// Ranks "best bang for buck": candidates at or above the accuracy threshold
+// lead, cheapest first with accuracy as the tie-break; the rest follow by
+// accuracy (then cost) so a degenerate table still routes sensibly.
+export function rankCandidates(
+  candidates: ReadonlyArray<Omit<RankedCandidate, 'meetsThreshold'> & { meetsThreshold?: boolean }>,
+  minAccuracy: number
+): RankedCandidate[] {
+  return candidates
+    .map(c => ({ ...c, meetsThreshold: c.accuracy >= minAccuracy }))
+    .sort((a, b) => {
+      const costGap = a.avgCostUsd - b.avgCostUsd;
+      const accuracyGap = b.accuracy - a.accuracy;
+      if (a.meetsThreshold !== b.meetsThreshold) return a.meetsThreshold ? -1 : 1;
+      return a.meetsThreshold ? costGap || accuracyGap : accuracyGap || costGap;
+    });
+}
diff --git a/packages/auto-routing-contracts/src/tiers.test.ts b/packages/auto-routing-contracts/src/tiers.test.ts
new file mode 100644
index 0000000000..5d62f7259f
--- /dev/null
+++ b/packages/auto-routing-contracts/src/tiers.test.ts
@@ -0,0 +1,79 @@
+import { describe, expect, it } from 'vitest';
+import { deriveDifficultyTier } from './tiers';
+import type { ClassifierOutput } from './index';
+
+function classification(overrides: Partial<ClassifierOutput>): ClassifierOutput {
+  return {
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    contextComplexity: 'small',
+    reasoningComplexity: 'low',
+    riskLevel: 'low',
+    executionMode: 'answer_only',
+    requiresTools: false,
+    confidence: 0.9,
+    ...overrides,
+  };
+}
+
+describe('deriveDifficultyTier', () => {
+  it('classifies trivial answer-only requests as low', () => {
+    expect(deriveDifficultyTier(classification({}))).toBe('low');
+  });
+  it('classifies mid-size code changes as medium', () => {
+    expect(
+      deriveDifficultyTier(
+        classification({
+          contextComplexity: 'medium',
+          reasoningComplexity: 'medium',
+          executionMode: 'code_change',
+        })
+      )
+    ).toBe('medium');
+  });
+  it('classifies high-reasoning multi-step work as high', () => {
+    expect(
+      deriveDifficultyTier(
+        classification({
+          contextComplexity: 'large',
+          reasoningComplexity: 'high',
+          executionMode: 'multi_step_project',
+          riskLevel: 'high',
+        })
+      )
+    ).toBe('high');
+  });
+  it('high risk tips an otherwise-low request to medium', () => {
+    expect(
+      deriveDifficultyTier(
+        classification({ executionMode: 'multi_step_project', riskLevel: 'high' })
+      )
+    ).toBe('medium');
+  });
+  it('high risk tips an otherwise-medium request to high', () => {
+    expect(
+      deriveDifficultyTier(
+        classification({
+          reasoningComplexity: 'medium',
+          contextComplexity: 'large',
+          executionMode: 'code_change',
+          riskLevel: 'high',
+        })
+      )
+    ).toBe('high');
+  });
+  it('is monotonic: bumping reasoning complexity never lowers the tier', () => {
+    const tiers = ['low', 'medium', 'high'] as const;
+    for (const ctx of ['small', 'medium', 'large'] as const) {
+      let prev = 0;
+      for (const reasoning of ['low', 'medium', 'high'] as const) {
+        const tier = deriveDifficultyTier(
+          classification({ contextComplexity: ctx, reasoningComplexity: reasoning })
+        );
+        const idx = tiers.indexOf(tier);
+        expect(idx).toBeGreaterThanOrEqual(prev);
+        prev = idx;
+      }
+    }
+  });
+});
diff --git a/packages/auto-routing-contracts/src/tiers.ts b/packages/auto-routing-contracts/src/tiers.ts
new file mode 100644
index 0000000000..8358c5e3bf
--- /dev/null
+++ b/packages/auto-routing-contracts/src/tiers.ts
@@ -0,0 +1,43 @@
+import * as z from 'zod';
+
+export const DifficultyTierSchema = z.enum(['low', 'medium', 'high']);
+
+export const ReasoningEffortSchema = z.enum(['minimal', 'low', 'medium', 'high']);
+export type ReasoningEffort = z.infer<typeof ReasoningEffortSchema>;
+export type DifficultyTier = z.infer<typeof DifficultyTierSchema>;
+
+export const DIFFICULTY_TIERS: readonly DifficultyTier[] = ['low', 'medium', 'high'];
+
+const REASONING_POINTS = { low: 0, medium: 2, high: 4 } as const;
+const CONTEXT_POINTS = { small: 0, medium: 1, large: 2 } as const;
+const EXECUTION_POINTS = {
+  answer_only: 0,
+  code_change: 1,
+  command_execution: 1,
+  multi_step_project: 2,
+} as const;
+const RISK_POINTS = { low: 0, medium: 0, high: 1 } as const;
+
+// Structural subset of ClassifierOutput: importing the full type from
+// ./index would create a module cycle (index re-exports this file).
+export type DifficultyTierSignal = {
+  reasoningComplexity: 'low' | 'medium' | 'high';
+  contextComplexity: 'small' | 'medium' | 'large';
+  executionMode: 'answer_only' | 'code_change' | 'command_execution' | 'multi_step_project';
+  riskLevel: 'low' | 'medium' | 'high';
+};
+
+// Deterministic mapping from the classifier taxonomy to a difficulty tier.
+// Reasoning complexity dominates (weight 2x) because it is the strongest
+// signal for whether a cheap model can complete the task; context size,
+// execution mode and blast radius nudge borderline cases up.
+export function deriveDifficultyTier(classification: DifficultyTierSignal): DifficultyTier {
+  const score =
+    REASONING_POINTS[classification.reasoningComplexity] +
+    CONTEXT_POINTS[classification.contextComplexity] +
+    EXECUTION_POINTS[classification.executionMode] +
+    RISK_POINTS[classification.riskLevel];
+  if (score <= 2) return 'low'; // scores 0-2
+  if (score <= 5) return 'medium'; // scores 3-5
+  return 'high'; // scores 6-9 (the point tables sum to at most 4 + 2 + 2 + 1)
+}
diff --git a/packages/auto-routing-contracts/tsconfig.json b/packages/auto-routing-contracts/tsconfig.json
index 76473b226e..b293f0f4ef 100644
--- a/packages/auto-routing-contracts/tsconfig.json
+++ b/packages/auto-routing-contracts/tsconfig.json
@@ -4,11 +4,13 @@
     "module": "ESNext",
     "moduleResolution": "bundler",
     "lib": ["ESNext", "WebWorker"],
+    "types": ["node"],
     "strict": true,
     "skipLibCheck": true,
     "forceConsistentCasingInFileNames": true,
     "noEmit": true,
     "isolatedModules": true,
+    "resolveJsonModule": true,
     "noImplicitReturns": true,
     "noFallthroughCasesInSwitch": true
   },
diff --git a/packages/auto-routing-contracts/vitest.config.ts b/packages/auto-routing-contracts/vitest.config.ts
new file mode 100644
index 0000000000..7dd13254e7
--- /dev/null
+++ b/packages/auto-routing-contracts/vitest.config.ts
@@ -0,0 +1,9 @@
+import { defineConfig } from 'vitest/config';
+
+export default defineConfig({
+  test: {
+    globals: true,
+    environment: 'node',
+    include: ['src/**/*.test.ts'],
+  },
+});
diff --git a/packages/worker-utils/src/index.ts b/packages/worker-utils/src/index.ts
index 1c61b58ff1..066a693587 100644
--- a/packages/worker-utils/src/index.ts
+++ b/packages/worker-utils/src/index.ts
@@ -99,6 +99,9 @@ export type { RepoCoordinates } from './git-url.js';
 
 export { KILO_MODEL_PREFIX, unprefixKiloGatewayModelId } from './kilo-model-id.js';
 
+export { ttlCached } from './ttl-cache.js';
+export type { TtlCache } from './ttl-cache.js';
+
 export {
   CloudAgentQueueReportSchema,
   CloudAgentRunStatuses,
diff --git a/services/auto-routing/src/ttl-cache.ts b/packages/worker-utils/src/ttl-cache.ts
similarity index 100%
rename from services/auto-routing/src/ttl-cache.ts
rename to packages/worker-utils/src/ttl-cache.ts
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index a2c02b8bdf..c8727c619b 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -990,10 +990,16 @@ importers:
 
   packages/auto-routing-contracts:
     dependencies:
+      '@openrouter/sdk':
+        specifier: ^0.12.79
+        version: 0.12.79
       zod:
         specifier: 'catalog:'
         version: 4.4.3
     devDependencies:
+      '@types/node':
+        specifier: 'catalog:'
+        version: 24.12.4
       '@typescript/native-preview':
         specifier: 'catalog:'
         version: 7.0.0-dev.20260514.1
@@ -1002,7 +1008,7 @@ importers:
         version: 5.9.3
       vitest:
         specifier: 'catalog:'
-        version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@25.5.2)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
+        version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
 
   packages/cloud-agent-profile:
     dependencies:
@@ -1504,6 +1510,52 @@ importers:
         specifier: 'catalog:'
         version: 4.98.0(@cloudflare/workers-types@4.20260605.1)(bufferutil@4.1.0)(utf-8-validate@6.0.6)
 
+  services/auto-routing-benchmark:
+    dependencies:
+      '@cloudflare/containers':
+        specifier: 0.1.1
+        version: 0.1.1
+      '@kilocode/auto-routing-contracts':
+        specifier: workspace:*
+        version: link:../../packages/auto-routing-contracts
+      '@kilocode/worker-utils':
+        specifier: workspace:*
+        version: link:../../packages/worker-utils
+      '@openrouter/sdk':
+        specifier: ^0.12.79
+        version: 0.12.79
+      drizzle-orm:
+        specifier: 0.45.2
+        version: 0.45.2(@cloudflare/workers-types@4.20260605.1)(@opentelemetry/api@1.9.1)(@types/pg@8.18.0)(@upstash/redis@1.38.0)(bun-types@1.3.14)(kysely@0.29.2)(pg@8.20.0)
+      hono:
+        specifier: 4.12.18
+        version: 4.12.18
+      zod:
+        specifier: 'catalog:'
+        version: 4.4.3
+    devDependencies:
+      '@cloudflare/workers-types':
+        specifier: 'catalog:'
+        version: 4.20260605.1
+      '@types/node':
+        specifier: 'catalog:'
+        version: 24.12.4
+      '@typescript/native-preview':
+        specifier: 'catalog:'
+        version: 7.0.0-dev.20260514.1
+      drizzle-kit:
+        specifier: 'catalog:'
+        version: 0.31.10
+      typescript:
+        specifier: 'catalog:'
+        version: 5.9.3
+      vitest:
+        specifier: 'catalog:'
+        version: 4.1.6(@opentelemetry/api@1.9.1)(@types/node@24.12.4)(@vitest/coverage-v8@4.1.6)(@vitest/ui@4.1.6)(esbuild@0.27.4)(jiti@2.7.0)(terser@5.46.0)(tsx@4.21.0)(yaml@2.8.4)
+      wrangler:
+        specifier: 'catalog:'
+        version: 4.98.0(@cloudflare/workers-types@4.20260605.1)(bufferutil@4.1.0)(utf-8-validate@6.0.6)
+
   services/auto-triage-infra:
     dependencies:
       '@kilocode/worker-utils':
@@ -23806,7 +23858,7 @@ snapshots:
 
   '@types/pg@8.18.0':
     dependencies:
-      '@types/node': 25.5.2
+      '@types/node': 24.12.4
       pg-protocol: 1.13.0
       pg-types: 2.2.0
 
diff --git a/services/auto-routing-benchmark/.dev.vars.example b/services/auto-routing-benchmark/.dev.vars.example
new file mode 100644
index 0000000000..9f3063f8ad
--- /dev/null
+++ b/services/auto-routing-benchmark/.dev.vars.example
@@ -0,0 +1,14 @@
+# Base URL the worker uses for apps/web's /api/internal/* routes (decider
+# benchmark token mint). The worker process runs on the host, so localhost
+# reaches the local apps/web dev server directly.
+# @url nextjs
+KILO_WEB_API_BASE_URL=http://localhost:3000
+
+# Gateway base URL for the kilo CLI inside the benchmark container (injected
+# as KILO_API_URL). Containers cannot use localhost (that resolves to the
+# container itself). host.docker.internal works under OrbStack; on Docker
+# Desktop the wrangler container network may not get that mapping — use the
+# Docker Desktop host gateway IP http://192.168.65.254:3000 instead (same
+# convention as services/wasteland).
+# @url nextjs
+KILO_CLI_API_URL=http://host.docker.internal:3000
diff --git a/services/auto-routing-benchmark/README.md b/services/auto-routing-benchmark/README.md
new file mode 100644
index 0000000000..cd5a226bf6
--- /dev/null
+++ b/services/auto-routing-benchmark/README.md
@@ -0,0 +1,136 @@
+# auto-routing-benchmark
+
+Cloudflare Worker that benchmarks candidate models and publishes the artifacts
+that drive `kilo-auto/efficient` routing. It is the **sole writer** of the
+routing table and classifier winner; `services/auto-routing` and the `apps/web`
+gateway only read them. See `docs/adr/0002-auto-routing-efficient.md` for the
+design, invariants, and rollout/rollback.
+
+## What it does
+
+- **Classifier benchmark** — replays 72 normalized classifier inputs through
+  OpenRouter using the exact production classifier code
+  (`@kilocode/auto-routing-contracts/classifier`), grades per-field, and derives
+  the cheapest above-threshold model as the classifier winner.
+- **Decider benchmark** — runs 76 golden tasks per candidate through the real
+  `kilo` CLI inside a Cloudflare Container, grades mechanically, and publishes a
+  per-difficulty-tier routing table.
+- Normalized results live in D1 (`BENCH_DB`); published artifacts are cached in
+  the shared `AUTO_ROUTING_CONFIG` KV namespace (publish = delete the keys so the
+  next read repopulates from D1).
+
+## Admin endpoints
+
+All under `/admin`, gated by `Authorization: Bearer <INTERNAL_API_SECRET_PROD>`
+(the gateway's admin panel proxies these with the internal secret):
+
+| Endpoint | Purpose |
+|---|---|
+| `GET/PUT /admin/config` | Read / save benchmark config (model lists, thresholds, `benchmarkUserId`) |
+| `GET /admin/runs` | List runs (sweeps stale `running` runs to `failed` first) |
+| `POST /admin/runs` | Start a run (`{kind, force}`); returns 409 if one of that kind is already running |
+| `GET /admin/routing-table` | Latest published routing table |
+| `GET /admin/classifier-winner` | Current classifier winner |
+| `POST /admin/debug-cli` | Run one ad-hoc prompt through the kilo CLI container (diagnostic) |
+
+## Local development
+
+The worker is part of the dev runner. From the repo root:
+
+```bash
+pnpm dev:start auto-routing
+```
+
+This brings up the auto-routing worker (:8810), this worker (:8814), and the
+Next.js gateway (:3000). Logs land in `dev/logs/*.log`; the tmux session is
+`kilo-dev-<worktree>`.
+
+### Required env / secrets
+
+- **`.dev.vars`** (copy from `.dev.vars.example`): `KILO_WEB_API_BASE_URL`
+  (`http://localhost:3000`) and `KILO_CLI_API_URL`
+  (`http://host.docker.internal:3000` under OrbStack — containers can't reach
+  `localhost`).
+- **Secrets store** (seeded via `pnpm dev:env -y auto-routing-benchmark`, not
+  `.dev.vars`): `INTERNAL_API_SECRET_PROD` (same value as the gateway's
+  `INTERNAL_API_SECRET`) and `OPENROUTER_API_KEY`.
+
+### Hitting it locally
+
+```bash
+SECRET=$(grep '^INTERNAL_API_SECRET=' ../../.env.local | cut -d= -f2- | tr -d '"')
+curl -s http://localhost:8814/admin/config -H "Authorization: Bearer $SECRET"
+```
+
+Decider runs need a `benchmarkUserId` that exists locally with credits — the dev
+seed provides `auto-routing-cli-local`.
+
+> Local KV/D1 writes from a *second* `wrangler` process are not seen by the
+> running dev process (miniflare holds its own view). After writing state out of
+> band, run `pnpm dev:restart auto-routing-benchmark` to make it visible.
+
+## D1
+
+Single squashed baseline migration in `migrations/`. Regenerate after a schema
+change in `src/db-schema.ts`:
+
+```bash
+pnpm db:generate     # drizzle-kit generate
+pnpm typecheck && pnpm test
+```
+
+Migrations apply on deploy via the `predeploy` hook
+(`wrangler d1 migrations apply auto-routing-benchmark --remote`).
+
+Inspect local D1 by copying the sqlite out (direct reads often hit miniflare
+locks):
+
+```bash
+cp .wrangler/state/v3/d1/miniflare-D1DatabaseObject/*.sqlite* /tmp/
+sqlite3 /tmp/<file>.sqlite 'select id, kind, status from benchmark_runs;'
+```
+
+## Debugging container (decider) failures
+
+- Each (model, 10-case chunk) gets its own container instance
+  (`runId:model:chunk`); CLI runs are serialized per instance (the CLI's sqlite
+  state is not safe under concurrent first runs). A `/warmup` call absorbs the one-time
+  sqlite migration before the case loop.
+- `case_results` rows carry diagnostics: CLI exit code, output prefix, and an
+  event tail — start there for a failing case.
+- `POST /admin/debug-cli {model, prompt}` runs one prompt through the container
+  and returns truncated stdout + the parsed result, without a full run.
+- Container → host networking: under OrbStack use `host.docker.internal` (the Docker Desktop
+  gateway IP `192.168.65.254` times out there); on Docker Desktop use that gateway IP instead.
+- Wrangler pulls the egress proxy image as amd64; on Apple Silicon it crashes
+  unless the dev runner pins the arm64 manifest digest
+  (`MINIFLARE_CONTAINER_EGRESS_IMAGE`) — already handled by the dev runner.
+
+## Debugging the DLQ
+
+Failed queue messages land in `auto-routing-benchmark-dlq` after `max_retries`
+(2) on `auto-routing-benchmark-jobs`. A message is one (model, chunk) job, so a
+DLQ'd message means that chunk never produced results; its model's summaries for
+the affected tier(s) will be missing or incomplete and `finalizeRunIfComplete`
+will mark the run accordingly.
+
+To inspect / handle:
+
+- **Prod**: read the DLQ from the Cloudflare dashboard (Workers → Queues →
+  `auto-routing-benchmark-dlq`) or `wrangler queues` tooling; the message body is
+  the JSON job (`runId`, `model`, `chunk`, case ids).
+- **Replay**: re-run the affected model with the admin `force` toggle once the
+  underlying cause (OpenRouter outage, container image, bad case) is fixed —
+  carried summaries mean only the re-triggered model is re-benchmarked.
+- **Declare failed**: a run with a wedged/dead `running` row is swept to `failed`
+  on the next `GET /admin/runs`, freeing the one-active-run-per-kind slot.
+
+## Commands
+
+```bash
+pnpm dev          # wrangler dev (port 8814)
+pnpm typecheck    # tsgo --noEmit
+pnpm lint
+pnpm test         # vitest run
+pnpm db:generate  # regenerate D1 migration from src/db-schema.ts
+```
diff --git a/services/auto-routing-benchmark/container/Dockerfile b/services/auto-routing-benchmark/container/Dockerfile
new file mode 100644
index 0000000000..25550a3da1
--- /dev/null
+++ b/services/auto-routing-benchmark/container/Dockerfile
@@ -0,0 +1,16 @@
+# Decider-benchmark runner container.
+#
+# Runs the stable `kilo` CLI (@kilocode/cli, dist-tag `latest`) for one decider
+# case at a time. `wrangler deploy` builds and pushes this image automatically.
+#
+# NOTE: `@kilocode/cli@latest` is resolved at IMAGE BUILD time (i.e. at deploy
+# time), so each deploy pins whatever version was `latest` then. Re-deploy to
+# pick up a newer stable CLI.
+FROM node:22-slim
+RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates git python3 g++ make \
+  && rm -rf /var/lib/apt/lists/* \
+  && npm install -g @kilocode/cli@latest
+WORKDIR /app
+COPY server.mjs ./
+EXPOSE 3000
+CMD ["node", "server.mjs"]
diff --git a/services/auto-routing-benchmark/container/server.mjs b/services/auto-routing-benchmark/container/server.mjs
new file mode 100644
index 0000000000..719c54d68e
--- /dev/null
+++ b/services/auto-routing-benchmark/container/server.mjs
@@ -0,0 +1,215 @@
+// Dependency-free HTTP server that runs one decider-benchmark case through the
+// stable `kilo` CLI per request. Intentionally dumb: it spawns the CLI, caps
+// output, and returns raw stdout lines. All event parsing happens in the
+// worker (src/kilo-events.ts), not here.
+//
+// The Kilo user token is passed in the request body and injected only as a
+// child-process env var (KILO_AUTH_CONTENT). It is never written to disk and
+// never logged.
+
+import { createServer } from 'node:http';
+import { spawn } from 'node:child_process';
+import { mkdtemp, rm } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
+const PORT = 3000;
+const DEFAULT_TIMEOUT_MS = 180_000;
+const STDOUT_CAP_BYTES = 2 * 1024 * 1024; // 2MB
+const STDERR_CAP_BYTES = 4 * 1024; // 4KB tail
+
+function sendJson(res, status, body) {
+  const payload = JSON.stringify(body);
+  res.writeHead(status, { 'content-type': 'application/json' });
+  res.end(payload);
+}
+
+async function readBody(req) {
+  const chunks = [];
+  for await (const chunk of req) chunks.push(chunk);
+  return Buffer.concat(chunks).toString('utf8');
+}
+
+// The CLI's one-time sqlite migration (and its state dir generally) is not
+// safe under concurrent first runs; serialize every CLI execution in this
+// instance. Callers see requests queue, which is fine for benchmark traffic.
+let runChain = Promise.resolve();
+function runCaseSerialized(params) {
+  const next = runChain.then(() => runCase(params));
+  runChain = next.catch(() => {});
+  return next;
+}
+
+function runCase({ model, prompt, kiloToken, timeoutMs, variant }) {
+  return new Promise(resolve => {
+    void (async () => {
+      const dir = await mkdtemp(join(tmpdir(), 'kilo-bench-'));
+      const startedAt = Date.now();
+      let timedOut = false;
+
+      let stdout = '';
+      let stdoutTruncated = false;
+      let stderrTail = '';
+
+      const args = ['run', '--format', 'json', '--auto', '-m', `kilo/${model}`];
+      // Reasoning effort: forwarded as the CLI's provider-specific variant.
+      if (typeof variant === 'string' && variant.length > 0) args.push('--variant', variant);
+      args.push(prompt);
+      // detached: the `kilo` bin is a wrapper that spawns the real CLI binary
+      // as a grandchild. Killing only the wrapper orphans the grandchild: it
+      // keeps running (and spending) and holds the stdout/stderr pipes open,
+      // so 'close' never fires and the case hangs forever. A detached child
+      // leads its own process group, letting the timeout kill the whole tree.
+      const child = spawn('kilo', args, {
+        cwd: dir,
+        env: {
+          ...process.env,
+          KILO_AUTH_CONTENT: JSON.stringify({ kilo: { type: 'api', key: kiloToken } }),
+          NO_COLOR: '1',
+        },
+        stdio: ['ignore', 'pipe', 'pipe'],
+        detached: true,
+      });
+
+      const killProcessTree = () => {
+        // Negative pid = the child's whole process group (wrapper + real CLI).
+        try {
+          process.kill(-child.pid, 'SIGKILL');
+        } catch {
+          child.kill('SIGKILL');
+        }
+      };
+      const killTimer = setTimeout(() => {
+        timedOut = true;
+        killProcessTree();
+      }, timeoutMs);
+
+      child.stdout.on('data', chunk => {
+        if (stdoutTruncated) return;
+        const text = chunk.toString('utf8');
+        if (stdout.length + text.length > STDOUT_CAP_BYTES) {
+          stdout += text.slice(0, STDOUT_CAP_BYTES - stdout.length);
+          stdoutTruncated = true;
+        } else {
+          stdout += text;
+        }
+      });
+
+      child.stderr.on('data', chunk => {
+        stderrTail = (stderrTail + chunk.toString('utf8')).slice(-STDERR_CAP_BYTES);
+      });
+
+      // 'error' and 'close' can both fire for the same child (Node emits
+      // 'close' after 'error' on spawn failure); only the first wins.
+      let finished = false;
+      const finish = async exitCode => {
+        if (finished) return;
+        finished = true;
+        clearTimeout(killTimer);
+        await rm(dir, { recursive: true, force: true }).catch(() => {});
+        const stdoutLines = stdout.split('\n').filter(line => line.length > 0);
+        // Defense in case a future CLI version echoes auth material to stderr.
+        const redactedStderrTail = stderrTail.split(kiloToken).join('[redacted]');
+        resolve({
+          exitCode,
+          durationMs: Date.now() - startedAt,
+          stdoutLines,
+          stderrTail: redactedStderrTail,
+          timedOut,
+        });
+      };
+
+      child.on('error', err => {
+        stderrTail = (stderrTail + `\nspawn error: ${err.message}`).slice(-STDERR_CAP_BYTES);
+        void finish(-1);
+      });
+      child.on('close', code => {
+        void finish(code ?? -1);
+      });
+      // Backstop for 'close' never firing: a stray process that survives the
+      // group kill (e.g. a tool process that moved to its own group) can hold
+      // the stdio pipes open indefinitely. After the child itself has exited,
+      // give the streams a short grace to flush, then finish regardless.
+      child.on('exit', code => {
+        setTimeout(() => void finish(code ?? -1), 5_000).unref();
+      });
+    })();
+  });
+}
+
+const server = createServer((req, res) => {
+  void (async () => {
+    if (req.method === 'GET' && req.url === '/health') {
+      sendJson(res, 200, { ok: true });
+      return;
+    }
+
+    // One-time CLI warmup (sqlite migration on a fresh instance): a trivial
+    // serialized run so real cases never burn their timeout on it.
+    if (req.method === 'POST' && req.url === '/warmup') {
+      let parsed;
+      try {
+        parsed = JSON.parse(await readBody(req));
+      } catch {
+        sendJson(res, 400, { error: 'invalid JSON body' });
+        return;
+      }
+      const { model, kiloToken } = parsed ?? {};
+      if (typeof model !== 'string' || typeof kiloToken !== 'string') {
+        sendJson(res, 400, { error: 'model and kiloToken are required strings' });
+        return;
+      }
+      const result = await runCaseSerialized({
+        model,
+        prompt: 'Reply with exactly: ok',
+        kiloToken,
+        timeoutMs: DEFAULT_TIMEOUT_MS,
+      });
+      sendJson(res, 200, { exitCode: result.exitCode, durationMs: result.durationMs });
+      return;
+    }
+
+    if (req.method === 'POST' && req.url === '/run') {
+      let parsed;
+      try {
+        parsed = JSON.parse(await readBody(req));
+      } catch {
+        sendJson(res, 400, { error: 'invalid JSON body' });
+        return;
+      }
+
+      const { model, prompt, kiloToken, variant } = parsed ?? {};
+      const timeoutMs =
+        typeof parsed?.timeoutMs === 'number' && parsed.timeoutMs > 0
+          ? parsed.timeoutMs
+          : DEFAULT_TIMEOUT_MS;
+
+      if (
+        typeof model !== 'string' ||
+        typeof prompt !== 'string' ||
+        typeof kiloToken !== 'string'
+      ) {
+        sendJson(res, 400, { error: 'model, prompt and kiloToken are required strings' });
+        return;
+      }
+
+      try {
+        if (variant !== undefined && variant !== null && typeof variant !== 'string') {
+          sendJson(res, 400, { error: 'variant must be a string when provided' });
+          return;
+        }
+        const result = await runCaseSerialized({ model, prompt, kiloToken, timeoutMs, variant });
+        sendJson(res, 200, result);
+      } catch (err) {
+        sendJson(res, 500, { error: err instanceof Error ? err.message : 'run failed' });
+      }
+      return;
+    }
+
+    sendJson(res, 404, { error: 'not found' });
+  })();
+});
+
+server.listen(PORT, () => {
+  console.log(`decider-benchmark runner listening on :${PORT}`);
+});
diff --git a/services/auto-routing-benchmark/drizzle.config.ts b/services/auto-routing-benchmark/drizzle.config.ts
new file mode 100644
index 0000000000..3214ffe4b8
--- /dev/null
+++ b/services/auto-routing-benchmark/drizzle.config.ts
@@ -0,0 +1,6 @@
+import { defineConfig } from 'drizzle-kit';
+export default defineConfig({
+  out: './migrations', // directory drizzle-kit writes generated SQL migrations and snapshots to
+  schema: './src/db-schema.ts', // table definitions the migrations are generated from
+  dialect: 'sqlite', // emit SQLite-flavoured SQL
+});
diff --git a/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql b/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql
new file mode 100644
index 0000000000..3db1df3b2b
--- /dev/null
+++ b/services/auto-routing-benchmark/migrations/0000_absent_wallow.sql
@@ -0,0 +1,103 @@
+CREATE TABLE `benchmark_config` (
+	`id` integer PRIMARY KEY NOT NULL,
+	`min_accuracy` real NOT NULL,
+	`switch_cost_factor` real NOT NULL,
+	`max_concurrency` integer NOT NULL,
+	`benchmark_user_id` text,
+	`classifier_repetitions` integer DEFAULT 1 NOT NULL,
+	`decider_repetitions` integer DEFAULT 1 NOT NULL,
+	`classifier_max_p95_latency_ms` integer,
+	`updated_at` text NOT NULL,
+	`updated_by` text
+);
+--> statement-breakpoint
+CREATE TABLE `benchmark_runs` (
+	`id` text PRIMARY KEY NOT NULL,
+	`kind` text NOT NULL,
+	`status` text NOT NULL,
+	`started_at` text NOT NULL,
+	`completed_at` text,
+	`error` text,
+	`min_accuracy` real NOT NULL,
+	`switch_cost_factor` real NOT NULL,
+	`max_concurrency` integer NOT NULL,
+	`benchmark_user_id` text,
+	`repetitions` integer DEFAULT 1 NOT NULL,
+	`classifier_max_p95_latency_ms` integer,
+	`engine_identity` text DEFAULT '' NOT NULL
+);
+--> statement-breakpoint
+CREATE UNIQUE INDEX `UQ_benchmark_runs_one_running_per_kind` ON `benchmark_runs` (`kind`) WHERE "benchmark_runs"."status" = 'running';--> statement-breakpoint
+CREATE TABLE `case_results` (
+	`run_id` text NOT NULL,
+	`model` text NOT NULL,
+	`case_id` text NOT NULL,
+	`tier` text,
+	`score` real NOT NULL,
+	`latency_ms` integer NOT NULL,
+	`cost_usd` real,
+	`error` text,
+	`fallback_reason` text,
+	`retried` integer,
+	`exit_code` integer,
+	`output_prefix` text,
+	`event_count` integer,
+	`last_event_types` text,
+	`rep` integer DEFAULT 0 NOT NULL,
+	`timed_out` integer DEFAULT 0 NOT NULL,
+	PRIMARY KEY(`run_id`, `model`, `case_id`, `rep`)
+);
+--> statement-breakpoint
+CREATE TABLE `config_classifier_models` (
+	`model` text PRIMARY KEY NOT NULL
+);
+--> statement-breakpoint
+CREATE TABLE `config_decider_models` (
+	`model` text PRIMARY KEY NOT NULL,
+	`reasoning_effort` text
+);
+--> statement-breakpoint
+CREATE TABLE `model_summaries` (
+	`run_id` text NOT NULL,
+	`model` text NOT NULL,
+	`tier` text NOT NULL,
+	`accuracy` real NOT NULL,
+	`avg_cost_usd` real,
+	`avg_latency_ms` real NOT NULL,
+	`p50_latency_ms` real,
+	`cases` integer NOT NULL,
+	`errors` integer NOT NULL,
+	`p95_latency_ms` real,
+	`timeouts` integer DEFAULT 0 NOT NULL,
+	`carried` integer DEFAULT false NOT NULL,
+	PRIMARY KEY(`run_id`, `model`, `tier`)
+);
+--> statement-breakpoint
+CREATE TABLE `routing_table_candidates` (
+	`run_id` text NOT NULL,
+	`tier` text NOT NULL,
+	`rank` integer NOT NULL,
+	`model` text NOT NULL,
+	`accuracy` real NOT NULL,
+	`avg_cost_usd` real NOT NULL,
+	`meets_threshold` integer NOT NULL,
+	`reasoning_effort` text,
+	PRIMARY KEY(`run_id`, `tier`, `rank`)
+);
+--> statement-breakpoint
+CREATE TABLE `routing_tables` (
+	`run_id` text PRIMARY KEY NOT NULL,
+	`published_at` text NOT NULL,
+	`generated_at` text NOT NULL,
+	`min_accuracy` real NOT NULL,
+	`switch_cost_factor` real NOT NULL,
+	`source` text NOT NULL
+);
+--> statement-breakpoint
+CREATE TABLE `run_models` (
+	`run_id` text NOT NULL,
+	`model` text NOT NULL,
+	`enqueued` integer NOT NULL,
+	`reasoning_effort` text,
+	PRIMARY KEY(`run_id`, `model`)
+);
diff --git a/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
new file mode 100644
index 0000000000..35ce39e53e
--- /dev/null
+++ b/services/auto-routing-benchmark/migrations/meta/0000_snapshot.json
@@ -0,0 +1,665 @@
+{
+  "version": "6",
+  "dialect": "sqlite",
+  "id": "ba559fc8-fdd3-4c96-b116-53573fb79c74",
+  "prevId": "00000000-0000-0000-0000-000000000000",
+  "tables": {
+    "benchmark_config": {
+      "name": "benchmark_config",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "integer",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "min_accuracy": {
+          "name": "min_accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "switch_cost_factor": {
+          "name": "switch_cost_factor",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "max_concurrency": {
+          "name": "max_concurrency",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "benchmark_user_id": {
+          "name": "benchmark_user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "classifier_repetitions": {
+          "name": "classifier_repetitions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 1
+        },
+        "decider_repetitions": {
+          "name": "decider_repetitions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 1
+        },
+        "classifier_max_p95_latency_ms": {
+          "name": "classifier_max_p95_latency_ms",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "updated_at": {
+          "name": "updated_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "updated_by": {
+          "name": "updated_by",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "benchmark_runs": {
+      "name": "benchmark_runs",
+      "columns": {
+        "id": {
+          "name": "id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "kind": {
+          "name": "kind",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "status": {
+          "name": "status",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "started_at": {
+          "name": "started_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "completed_at": {
+          "name": "completed_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "error": {
+          "name": "error",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "min_accuracy": {
+          "name": "min_accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "switch_cost_factor": {
+          "name": "switch_cost_factor",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "max_concurrency": {
+          "name": "max_concurrency",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "benchmark_user_id": {
+          "name": "benchmark_user_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "repetitions": {
+          "name": "repetitions",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 1
+        },
+        "classifier_max_p95_latency_ms": {
+          "name": "classifier_max_p95_latency_ms",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "engine_identity": {
+          "name": "engine_identity",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": "''"
+        }
+      },
+      "indexes": {
+        "UQ_benchmark_runs_one_running_per_kind": {
+          "name": "UQ_benchmark_runs_one_running_per_kind",
+          "columns": [
+            "kind"
+          ],
+          "isUnique": true,
+          "where": "\"benchmark_runs\".\"status\" = 'running'"
+        }
+      },
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "case_results": {
+      "name": "case_results",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "case_id": {
+          "name": "case_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "tier": {
+          "name": "tier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "score": {
+          "name": "score",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "latency_ms": {
+          "name": "latency_ms",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "cost_usd": {
+          "name": "cost_usd",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "error": {
+          "name": "error",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "fallback_reason": {
+          "name": "fallback_reason",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "retried": {
+          "name": "retried",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "exit_code": {
+          "name": "exit_code",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "output_prefix": {
+          "name": "output_prefix",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "event_count": {
+          "name": "event_count",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "last_event_types": {
+          "name": "last_event_types",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "rep": {
+          "name": "rep",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 0
+        },
+        "timed_out": {
+          "name": "timed_out",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 0
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "case_results_run_id_model_case_id_rep_pk": {
+          "columns": [
+            "run_id",
+            "model",
+            "case_id",
+            "rep"
+          ],
+          "name": "case_results_run_id_model_case_id_rep_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "config_classifier_models": {
+      "name": "config_classifier_models",
+      "columns": {
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "config_decider_models": {
+      "name": "config_decider_models",
+      "columns": {
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "reasoning_effort": {
+          "name": "reasoning_effort",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "model_summaries": {
+      "name": "model_summaries",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "tier": {
+          "name": "tier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "accuracy": {
+          "name": "accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "avg_cost_usd": {
+          "name": "avg_cost_usd",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "avg_latency_ms": {
+          "name": "avg_latency_ms",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "p50_latency_ms": {
+          "name": "p50_latency_ms",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "cases": {
+          "name": "cases",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "errors": {
+          "name": "errors",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "p95_latency_ms": {
+          "name": "p95_latency_ms",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        },
+        "timeouts": {
+          "name": "timeouts",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": 0
+        },
+        "carried": {
+          "name": "carried",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false,
+          "default": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "model_summaries_run_id_model_tier_pk": {
+          "columns": [
+            "run_id",
+            "model",
+            "tier"
+          ],
+          "name": "model_summaries_run_id_model_tier_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "routing_table_candidates": {
+      "name": "routing_table_candidates",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "tier": {
+          "name": "tier",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "rank": {
+          "name": "rank",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "accuracy": {
+          "name": "accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "avg_cost_usd": {
+          "name": "avg_cost_usd",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "meets_threshold": {
+          "name": "meets_threshold",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "reasoning_effort": {
+          "name": "reasoning_effort",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "routing_table_candidates_run_id_tier_rank_pk": {
+          "columns": [
+            "run_id",
+            "tier",
+            "rank"
+          ],
+          "name": "routing_table_candidates_run_id_tier_rank_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "routing_tables": {
+      "name": "routing_tables",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": true,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "published_at": {
+          "name": "published_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "generated_at": {
+          "name": "generated_at",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "min_accuracy": {
+          "name": "min_accuracy",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "switch_cost_factor": {
+          "name": "switch_cost_factor",
+          "type": "real",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "source": {
+          "name": "source",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {},
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    },
+    "run_models": {
+      "name": "run_models",
+      "columns": {
+        "run_id": {
+          "name": "run_id",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "model": {
+          "name": "model",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "enqueued": {
+          "name": "enqueued",
+          "type": "integer",
+          "primaryKey": false,
+          "notNull": true,
+          "autoincrement": false
+        },
+        "reasoning_effort": {
+          "name": "reasoning_effort",
+          "type": "text",
+          "primaryKey": false,
+          "notNull": false,
+          "autoincrement": false
+        }
+      },
+      "indexes": {},
+      "foreignKeys": {},
+      "compositePrimaryKeys": {
+        "run_models_run_id_model_pk": {
+          "columns": [
+            "run_id",
+            "model"
+          ],
+          "name": "run_models_run_id_model_pk"
+        }
+      },
+      "uniqueConstraints": {},
+      "checkConstraints": {}
+    }
+  },
+  "views": {},
+  "enums": {},
+  "_meta": {
+    "schemas": {},
+    "tables": {},
+    "columns": {}
+  },
+  "internal": {
+    "indexes": {}
+  }
+}
\ No newline at end of file
diff --git a/services/auto-routing-benchmark/migrations/meta/_journal.json b/services/auto-routing-benchmark/migrations/meta/_journal.json
new file mode 100644
index 0000000000..7ee67d2c06
--- /dev/null
+++ b/services/auto-routing-benchmark/migrations/meta/_journal.json
@@ -0,0 +1,13 @@
+{
+  "version": "7",
+  "dialect": "sqlite",
+  "entries": [
+    {
+      "idx": 0,
+      "version": "6",
+      "when": 1781523205381,
+      "tag": "0000_absent_wallow",
+      "breakpoints": true
+    }
+  ]
+}
\ No newline at end of file
diff --git a/services/auto-routing-benchmark/package.json b/services/auto-routing-benchmark/package.json
new file mode 100644
index 0000000000..46745f55d3
--- /dev/null
+++ b/services/auto-routing-benchmark/package.json
@@ -0,0 +1,34 @@
+{
+  "name": "auto-routing-benchmark",
+  "version": "1.0.0",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "deploy": "wrangler deploy",
+    "predeploy": "wrangler d1 migrations apply auto-routing-benchmark --remote",
+    "dev": "wrangler dev",
+    "types": "wrangler types --include-runtime=false",
+    "typecheck": "tsgo --noEmit",
+    "lint": "pnpm -w exec oxlint --config .oxlintrc.json services/auto-routing-benchmark/src",
+    "db:generate": "drizzle-kit generate",
+    "test": "vitest run"
+  },
+  "dependencies": {
+    "@cloudflare/containers": "0.1.1",
+    "@kilocode/auto-routing-contracts": "workspace:*",
+    "@kilocode/worker-utils": "workspace:*",
+    "@openrouter/sdk": "^0.12.79",
+    "drizzle-orm": "catalog:",
+    "hono": "catalog:",
+    "zod": "catalog:"
+  },
+  "devDependencies": {
+    "@cloudflare/workers-types": "catalog:",
+    "@types/node": "catalog:",
+    "@typescript/native-preview": "catalog:",
+    "drizzle-kit": "catalog:",
+    "typescript": "catalog:",
+    "vitest": "catalog:",
+    "wrangler": "catalog:"
+  }
+}
diff --git a/services/auto-routing-benchmark/src/admin.test.ts b/services/auto-routing-benchmark/src/admin.test.ts
new file mode 100644
index 0000000000..68e830634d
--- /dev/null
+++ b/services/auto-routing-benchmark/src/admin.test.ts
@@ -0,0 +1,551 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+import type {
+  BenchmarkConfig,
+  BenchmarkModelSummary,
+  RoutingTable,
+} from '@kilocode/auto-routing-contracts';
+import { app } from './index';
+import { computeEngineIdentity } from './run';
+import type * as DbModule from './db';
+
+// Fixture builder: a healthy low-tier summary in which only the model id varies.
+function makeSummary(model: string): BenchmarkModelSummary {
+  const latency = { avgLatencyMs: 100, p50LatencyMs: 90, p95LatencyMs: 120 };
+  const volume = { cases: 10, errors: 0, timeouts: 0 };
+  const summary: BenchmarkModelSummary = {
+    model,
+    tier: 'low',
+    accuracy: 0.9,
+    avgCostUsd: 0.001,
+    ...latency,
+    ...volume,
+  };
+  return summary;
+}
+
+const TEST_CONFIG: BenchmarkConfig = {
+  classifierModels: ['google/gemini-2.5-flash-lite', 'google/gemini-2.5-flash'],
+  deciderModels: [
+    { id: 'google/gemini-2.5-flash-lite', reasoningEffort: null },
+    { id: 'anthropic/claude-sonnet-4.6', reasoningEffort: null },
+  ],
+  minAccuracy: 0.7,
+  switchCostFactor: 3,
+  maxConcurrency: 4,
+  benchmarkUserId: null,
+  classifierRepetitions: 1,
+  deciderRepetitions: 1,
+  classifierMaxP95LatencyMs: 1000,
+  updatedAt: null,
+  updatedBy: null,
+};
+
+// getConfigRows result that mapConfigRows resolves back to TEST_CONFIG (apart from updatedAt).
+const TEST_CONFIG_ROWS = {
+  config: {
+    id: 1 as const,
+    min_accuracy: TEST_CONFIG.minAccuracy,
+    switch_cost_factor: TEST_CONFIG.switchCostFactor,
+    max_concurrency: TEST_CONFIG.maxConcurrency,
+    benchmark_user_id: TEST_CONFIG.benchmarkUserId,
+    classifier_repetitions: TEST_CONFIG.classifierRepetitions,
+    decider_repetitions: TEST_CONFIG.deciderRepetitions,
+    classifier_max_p95_latency_ms: TEST_CONFIG.classifierMaxP95LatencyMs,
+    updated_at: '2026-06-01T00:00:00.000Z',
+    updated_by: null,
+  },
+  classifierModels: TEST_CONFIG.classifierModels,
+  deciderModels: TEST_CONFIG.deciderModels.map(m => ({
+    model: m.id,
+    reasoning_effort: m.reasoningEffort ?? null,
+  })),
+};
+
+// ---------------------------------------------------------------------------
+// Stubs: the db module is mocked at its function boundary (drizzle generates
+// the SQL, so statement-level stubbing would couple tests to its internals).
+// ---------------------------------------------------------------------------
+
+vi.mock('./db', async importOriginal => {
+  const actual = await importOriginal<typeof DbModule>();
+  return {
+    ...actual,
+    getConfigRows: vi.fn(),
+    replaceConfig: vi.fn(),
+    listRuns: vi.fn(),
+    getLatestRoutingTable: vi.fn(),
+    getClassifierWinner: vi.fn(),
+    getLatestSummariesByModel: vi.fn(),
+    insertRun: vi.fn(),
+    markStaleRunsFailed: vi.fn(),
+    getRunningRun: vi.fn(),
+    existsNewerCompletedRun: vi.fn(),
+  };
+});
+
+import {
+  getConfigRows,
+  getClassifierWinner,
+  getLatestRoutingTable,
+  getLatestSummariesByModel,
+  getRunningRun,
+  existsNewerCompletedRun,
+  insertRun,
+  listRuns,
+  markStaleRunsFailed,
+  replaceConfig,
+} from './db';
+
+const tokenGet = vi.fn<() => Promise<string>>();
+const queueSendBatch = vi.fn();
+
+const env = {
+  INTERNAL_API_SECRET_PROD: { get: tokenGet },
+  BENCH_DB: {} as D1Database,
+  BENCH_QUEUE: { sendBatch: queueSendBatch },
+  AUTO_ROUTING_CONFIG: { put: vi.fn(), get: vi.fn(), delete: vi.fn() },
+} as unknown as Env;
+
+const executionCtx = {
+  waitUntil: () => {},
+  passThroughOnException: () => {},
+} as unknown as ExecutionContext;
+
+// Dispatches `path` to the Hono app under test, handing it the stubbed
+// `env` and `executionCtx` defined above.
+function request(path: string, init: RequestInit = {}) {
+  return app.request(`https://bench.example.com${path}`, init, env, executionCtx);
+}
+
+// GET carrying the same `bench-token` value that `tokenGet` is stubbed to resolve.
+function authedGet(path: string) {
+  const headers = { authorization: 'Bearer bench-token' };
+  return request(path, { headers });
+}
+
+// Authenticated POST with `body` serialized as JSON.
+function authedPost(path: string, body: unknown) {
+  const headers = { authorization: 'Bearer bench-token', 'content-type': 'application/json' };
+  return request(path, { method: 'POST', headers, body: JSON.stringify(body) });
+}
+
+// Authenticated JSON PUT; `extraHeaders` are spread last so they can override the defaults.
+function authedPut(path: string, body: unknown, extraHeaders: Record<string, string> = {}) {
+  const headers = {
+    authorization: 'Bearer bench-token',
+    'content-type': 'application/json',
+    ...extraHeaders,
+  };
+  return request(path, { method: 'PUT', headers, body: JSON.stringify(body) });
+}
+
+// ---------------------------------------------------------------------------
+// Setup
+// ---------------------------------------------------------------------------
+
+beforeEach(() => {
+  // Drop the calls recorded by the previous test, then re-install every
+  // default stub so a per-test override cannot leak into the next test:
+  // the `bench-token` secret, no saved config, empty reads, an accepting queue.
+  vi.clearAllMocks();
+  tokenGet.mockResolvedValue('bench-token');
+  const noConfig = { config: null, classifierModels: [], deciderModels: [] };
+  vi.mocked(getConfigRows).mockResolvedValue(noConfig);
+  vi.mocked(replaceConfig).mockResolvedValue(undefined);
+  vi.mocked(listRuns).mockResolvedValue([]);
+  vi.mocked(getLatestRoutingTable).mockResolvedValue(null);
+  vi.mocked(getClassifierWinner).mockResolvedValue(null);
+  vi.mocked(getLatestSummariesByModel).mockResolvedValue(new Map());
+  vi.mocked(insertRun).mockResolvedValue(undefined);
+  vi.mocked(markStaleRunsFailed).mockResolvedValue(undefined);
+  vi.mocked(getRunningRun).mockResolvedValue(undefined);
+  vi.mocked(existsNewerCompletedRun).mockResolvedValue(false);
+  queueSendBatch.mockResolvedValue(undefined);
+});
+
+// ---------------------------------------------------------------------------
+// Auth guard
+// ---------------------------------------------------------------------------
+
+describe('auth middleware', () => {
+  it('rejects requests without a bearer token', async () => {
+    const res = await request('/admin/config');
+    expect(res.status).toBe(401);
+    await expect(res.json()).resolves.toEqual({ error: 'Unauthorized' });
+  });
+
+  it('rejects requests with the wrong bearer token', async () => {
+    const res = await request('/admin/config', {
+      headers: { authorization: 'Bearer wrong-token' },
+    });
+    expect(res.status).toBe(401);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// GET /admin/config
+// ---------------------------------------------------------------------------
+
+describe('GET /admin/config', () => {
+  it('returns a null config when the DB rows are absent', async () => {
+    // getConfigRows already returns null config by default
+    const res = await authedGet('/admin/config');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({ config: null });
+  });
+
+  it('returns the stored config when DB rows exist', async () => {
+    const classifierModels = ['some/model'];
+    const deciderModels = TEST_CONFIG.deciderModels.map(m => ({
+      model: m.id,
+      reasoning_effort: null,
+    }));
+    vi.mocked(getConfigRows).mockResolvedValueOnce({
+      config: {
+        id: 1,
+        min_accuracy: 0.9,
+        switch_cost_factor: 3,
+        max_concurrency: 4,
+        benchmark_user_id: null,
+        classifier_repetitions: 1,
+        decider_repetitions: 1,
+        classifier_max_p95_latency_ms: null,
+        updated_at: '2026-06-01T00:00:00.000Z',
+        updated_by: 'admin@example.com',
+      },
+      classifierModels,
+      deciderModels,
+    });
+
+    const res = await authedGet('/admin/config');
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as {
+      config: { minAccuracy: number; updatedBy: string | null };
+    };
+    expect(body.config.minAccuracy).toBe(0.9);
+    expect(body.config.updatedBy).toBe('admin@example.com');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// PUT /admin/config
+// ---------------------------------------------------------------------------
+
+describe('PUT /admin/config', () => {
+  it('rejects a non-JSON body', async () => {
+    const res = await request('/admin/config', {
+      method: 'PUT',
+      headers: {
+        authorization: 'Bearer bench-token',
+        'content-type': 'application/json',
+      },
+      body: 'not json {{{',
+    });
+    expect(res.status).toBe(500);
+  });
+
+  it('returns 400 for a schema-invalid config', async () => {
+    const res = await authedPut('/admin/config', { classifierModels: 'oops' });
+    expect(res.status).toBe(400);
+    await expect(res.json()).resolves.toMatchObject({
+      success: false,
+      error: 'Invalid benchmark config',
+    });
+    expect(replaceConfig).not.toHaveBeenCalled();
+  });
+
+  it('returns 400 for duplicate decider model ids instead of a D1 PK violation', async () => {
+    const res = await authedPut('/admin/config', {
+      ...TEST_CONFIG,
+      deciderModels: [
+        { id: 'google/gemini-2.5-flash-lite', reasoningEffort: null },
+        { id: 'google/gemini-2.5-flash-lite', reasoningEffort: null },
+      ],
+    });
+    expect(res.status).toBe(400);
+    await expect(res.json()).resolves.toMatchObject({
+      success: false,
+      error: 'Invalid benchmark config',
+    });
+    expect(replaceConfig).not.toHaveBeenCalled();
+  });
+
+  it('persists a valid config and returns it', async () => {
+    const validConfig = {
+      ...TEST_CONFIG,
+      minAccuracy: 0.85,
+      updatedAt: null,
+      updatedBy: null,
+    };
+
+    const res = await authedPut('/admin/config', validConfig, {
+      'x-updated-by': 'igor@kilocode.ai',
+    });
+
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as {
+      config: { minAccuracy: number; updatedBy: string | null; updatedAt: string | null };
+    };
+    expect(body.config.minAccuracy).toBe(0.85);
+    expect(body.config.updatedBy).toBe('igor@kilocode.ai');
+    expect(typeof body.config.updatedAt).toBe('string');
+
+    expect(replaceConfig).toHaveBeenCalledOnce();
+    const [, configArg] = vi.mocked(replaceConfig).mock.calls[0];
+    expect(configArg.min_accuracy).toBe(0.85);
+    expect(typeof configArg.updated_at).toBe('string');
+    expect(configArg.updated_by).toBe('igor@kilocode.ai');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// GET /admin/runs
+// ---------------------------------------------------------------------------
+
+describe('GET /admin/runs', () => {
+  it('returns an empty runs array when the table is empty', async () => {
+    const res = await authedGet('/admin/runs');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({ runs: [] });
+  });
+
+  it('sweeps stale runs before listing so a wedged run is recovered', async () => {
+    await authedGet('/admin/runs');
+    // sweepStaleRuns → markStaleRunsFailed runs on list, independent of starting.
+    expect(markStaleRunsFailed).toHaveBeenCalledTimes(1);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// POST /admin/runs
+// ---------------------------------------------------------------------------
+
+describe('POST /admin/runs', () => {
+  it('rejects a non-JSON body', async () => {
+    const res = await request('/admin/runs', {
+      method: 'POST',
+      headers: {
+        authorization: 'Bearer bench-token',
+        'content-type': 'application/json',
+      },
+      body: '<<<',
+    });
+    expect(res.status).toBe(500);
+  });
+
+  it('returns 400 for an invalid kind', async () => {
+    const res = await authedPost('/admin/runs', { kind: 'turbo' });
+    expect(res.status).toBe(400);
+    await expect(res.json()).resolves.toMatchObject({
+      success: false,
+      error: 'Invalid run request',
+    });
+    expect(queueSendBatch).not.toHaveBeenCalled();
+  });
+
+  it('returns 409 when a run of the same kind is already in progress', async () => {
+    vi.mocked(getConfigRows).mockResolvedValue(TEST_CONFIG_ROWS);
+    vi.mocked(getRunningRun).mockResolvedValue({
+      id: 'classifier-2026-06-15T00-00-00-000Z',
+      kind: 'classifier',
+      status: 'running',
+      started_at: '2026-06-15T00:00:00.000Z',
+      completed_at: null,
+      error: null,
+      min_accuracy: 0.7,
+      switch_cost_factor: 3,
+      max_concurrency: 4,
+      benchmark_user_id: null,
+      repetitions: 1,
+      classifier_max_p95_latency_ms: 1000,
+      engine_identity: 'v1:deadbeef',
+    });
+
+    const res = await authedPost('/admin/runs', { kind: 'classifier' });
+    expect(res.status).toBe(409);
+    await expect(res.json()).resolves.toMatchObject({
+      error: expect.stringContaining('already in progress'),
+    });
+    expect(insertRun).not.toHaveBeenCalled();
+    expect(queueSendBatch).not.toHaveBeenCalled();
+  });
+
+  it('returns 400 when no config has been saved', async () => {
+    // getConfigRows already returns null config by default
+    const res = await authedPost('/admin/runs', { kind: 'classifier' });
+    expect(res.status).toBe(400);
+    await expect(res.json()).resolves.toMatchObject({
+      error: 'benchmark config not set: save it in the admin panel before starting a run',
+    });
+    expect(insertRun).not.toHaveBeenCalled();
+    expect(queueSendBatch).not.toHaveBeenCalled();
+  });
+
+  it('starts a classifier run and returns runId + enqueuedModels', async () => {
+    // No prior summaries → every configured model is enqueued.
+    vi.mocked(getConfigRows).mockResolvedValue(TEST_CONFIG_ROWS);
+    const res = await authedPost('/admin/runs', { kind: 'classifier' });
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as { runId: string; enqueuedModels: number };
+    expect(body.runId).toMatch(/^classifier-/);
+    expect(body.enqueuedModels).toBe(TEST_CONFIG.classifierModels.length);
+    expect(insertRun).toHaveBeenCalledOnce();
+    // The run row snapshots the live config (mid-run edits must not skew results).
+    const [, runArg] = vi.mocked(insertRun).mock.calls[0];
+    expect(runArg.min_accuracy).toBe(TEST_CONFIG.minAccuracy);
+    expect(runArg.switch_cost_factor).toBe(TEST_CONFIG.switchCostFactor);
+    expect(queueSendBatch).toHaveBeenCalledOnce();
+  });
+
+  it('carries a decider model only when its benchmark identity still matches', async () => {
+    vi.mocked(getConfigRows).mockResolvedValue({
+      ...TEST_CONFIG_ROWS,
+      config: { ...TEST_CONFIG_ROWS.config, benchmark_user_id: 'user-123' },
+      deciderModels: [
+        { model: 'vendor/a', reasoning_effort: null },
+        { model: 'vendor/b', reasoning_effort: null },
+      ],
+    });
+    // vendor/a has a prior result measured under the current engine identity,
+    // matching repetitions and reasoning_effort → carried (skipped). vendor/b
+    // has none → enqueued.
+    vi.mocked(getLatestSummariesByModel).mockResolvedValue(
+      new Map([
+        [
+          'vendor/a',
+          {
+            engineIdentity: computeEngineIdentity('decider'),
+            repetitions: 1,
+            reasoningEffort: null,
+            summaries: [makeSummary('vendor/a')],
+          },
+        ],
+      ])
+    );
+
+    const res = await authedPost('/admin/runs', { kind: 'decider' });
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as { enqueuedModels: number; skippedModels: string[] };
+    expect(body.skippedModels).toEqual(['vendor/a']);
+    expect(body.enqueuedModels).toBe(1);
+  });
+
+  it('re-benchmarks a model whose prior reasoning_effort differs (no stale carry)', async () => {
+    vi.mocked(getConfigRows).mockResolvedValue({
+      ...TEST_CONFIG_ROWS,
+      config: { ...TEST_CONFIG_ROWS.config, benchmark_user_id: 'user-123' },
+      deciderModels: [{ model: 'vendor/a', reasoning_effort: null }],
+    });
+    // Prior result was measured at reasoning_effort 'high'; current config runs
+    // it at null, so the carry is invalidated and the model is re-enqueued.
+    vi.mocked(getLatestSummariesByModel).mockResolvedValue(
+      new Map([
+        [
+          'vendor/a',
+          {
+            engineIdentity: computeEngineIdentity('decider'),
+            repetitions: 1,
+            reasoningEffort: 'high',
+            summaries: [makeSummary('vendor/a')],
+          },
+        ],
+      ])
+    );
+
+    const res = await authedPost('/admin/runs', { kind: 'decider' });
+    expect(res.status).toBe(200);
+    const body = (await res.json()) as { enqueuedModels: number; skippedModels: string[] };
+    expect(body.skippedModels).toEqual([]);
+    expect(body.enqueuedModels).toBe(1);
+  });
+
+  it('slices a >100-message decider fan-out into sendBatch-sized batches', async () => {
+    // 7 decider models × 1 rep × ceil(76/5)=16 chunks = 112 messages, which
+    // exceeds Cloudflare Queues' 100-per-sendBatch cap and must be sliced.
+    const manyModels = Array.from({ length: 7 }, (_, i) => ({
+      id: `vendor/model-${i}`,
+      reasoningEffort: null,
+    }));
+    vi.mocked(getConfigRows).mockResolvedValue({
+      ...TEST_CONFIG_ROWS,
+      config: { ...TEST_CONFIG_ROWS.config, benchmark_user_id: 'user-123' },
+      deciderModels: manyModels.map(m => ({ model: m.id, reasoning_effort: null })),
+    });
+
+    const res = await authedPost('/admin/runs', { kind: 'decider' });
+    expect(res.status).toBe(200);
+
+    // 112 messages → two batches (100 + 12), neither over the limit.
+    expect(queueSendBatch).toHaveBeenCalledTimes(2);
+    const batchSizes = queueSendBatch.mock.calls.map(([batch]) => (batch as unknown[]).length);
+    expect(batchSizes).toEqual([100, 12]);
+    for (const size of batchSizes) expect(size).toBeLessThanOrEqual(100);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// GET /admin/routing-table
+// ---------------------------------------------------------------------------
+
+describe('GET /admin/routing-table', () => {
+  it('returns {table: null, publishedAt: null} when no rows exist', async () => {
+    const res = await authedGet('/admin/routing-table');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({ table: null, publishedAt: null });
+  });
+
+  it('returns the parsed table and publishedAt when a row exists', async () => {
+    const candidate = {
+      model: 'm',
+      accuracy: 1,
+      avgCostUsd: 0.1,
+      meetsThreshold: true,
+    };
+    const tableData = {
+      version: 'test-v1',
+      generatedAt: '2026-06-01T10:00:00.000Z',
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      source: 'benchmark',
+      tiers: { low: [candidate], medium: [candidate], high: [candidate] },
+    };
+    vi.mocked(getLatestRoutingTable).mockResolvedValueOnce({
+      table: tableData as RoutingTable,
+      publishedAt: '2026-06-01T10:00:00.000Z',
+    });
+
+    const res = await authedGet('/admin/routing-table');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({
+      table: tableData,
+      publishedAt: '2026-06-01T10:00:00.000Z',
+    });
+  });
+});
+
+// ---------------------------------------------------------------------------
+// GET /admin/classifier-winner
+// ---------------------------------------------------------------------------
+
+describe('GET /admin/classifier-winner', () => {
+  it('returns {winner: null} when no completed classifier run exists', async () => {
+    const res = await authedGet('/admin/classifier-winner');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({ winner: null });
+  });
+
+  it('returns the winner when a completed classifier run exists', async () => {
+    const winner = {
+      model: 'google/gemini-2.5-flash-lite',
+      runId: 'classifier-2026-06-01T00-00-00-000Z',
+      accuracy: 0.92,
+      p95LatencyMs: null,
+      generatedAt: '2026-06-01T10:00:00.000Z',
+    };
+    vi.mocked(getClassifierWinner).mockResolvedValueOnce(winner);
+
+    const res = await authedGet('/admin/classifier-winner');
+    expect(res.status).toBe(200);
+    await expect(res.json()).resolves.toEqual({ winner });
+  });
+});
diff --git a/services/auto-routing-benchmark/src/admin.ts b/services/auto-routing-benchmark/src/admin.ts
new file mode 100644
index 0000000000..0b95cd3a94
--- /dev/null
+++ b/services/auto-routing-benchmark/src/admin.ts
@@ -0,0 +1,95 @@
+import * as z from 'zod';
+import {
+  BenchmarkConfigSchema,
+  StartBenchmarkRunRequestSchema,
+  type BenchmarkRun,
+} from '@kilocode/auto-routing-contracts';
+import { zodJsonValidator } from '@kilocode/worker-utils';
+import type { Hono } from 'hono';
+import { getBenchmarkConfig, saveBenchmarkConfig } from './config';
+import { debugRunCli } from './cli-runner';
+import { fetchBenchmarkUserToken, RunAlreadyActiveError, startRun, sweepStaleRuns } from './run';
+import { getClassifierWinner, getLatestRoutingTable, listRuns } from './db';
+import type { HonoEnv } from './hono-env';
+
+const DebugCliRequestSchema = z.object({
+  model: z.string().trim().min(1), // trimmed first, then must be non-empty
+  prompt: z.string().min(1), // must be non-empty; not trimmed
+}); // validates the JSON body of POST /admin/debug-cli
+
+/**
+ * Registers the benchmark service's `/admin/*` HTTP routes on the given Hono app.
+ */
+export function registerAdminRoutes(app: Hono<HonoEnv>): void {
+  app.get('/admin/config', async c => c.json({ config: await getBenchmarkConfig(c.env.BENCH_DB) }));
+
+  app.put(
+    '/admin/config',
+    zodJsonValidator(BenchmarkConfigSchema, { errorMessage: 'Invalid benchmark config' }),
+    async c => {
+      const updatedBy = c.req.header('x-updated-by') ?? null;
+      const saved = await saveBenchmarkConfig(c.env.BENCH_DB, c.req.valid('json'), updatedBy);
+      return c.json({ config: saved });
+    }
+  );
+
+  app.get('/admin/runs', async c => {
+    // Sweep stale runs first so a dead/wedged run surfaces as 'failed' (and
+    // frees the one-active-run slot) without needing a new run to be started.
+    await sweepStaleRuns(c.env.BENCH_DB);
+    // Clamp to a whole number in [1, 100]. Math.min alone let negative or fractional
+    // values through; if listRuns binds this to LIMIT, a negative one is unbounded in SQLite.
+    const requested = Math.trunc(Number(c.req.query('limit') ?? 20)) || 20;
+    const limit = Math.min(Math.max(requested, 1), 100);
+    const runs: BenchmarkRun[] = await listRuns(c.env.BENCH_DB, limit);
+    return c.json({ runs });
+  });
+
+  app.post(
+    '/admin/runs',
+    zodJsonValidator(StartBenchmarkRunRequestSchema, { errorMessage: 'Invalid run request' }),
+    async c => {
+      const { kind, force } = c.req.valid('json');
+      const config = await getBenchmarkConfig(c.env.BENCH_DB);
+      if (!config) {
+        const reason = 'benchmark config not set: save it in the admin panel before starting a run';
+        return c.json({ error: reason }, 400);
+      }
+      try {
+        return c.json(await startRun(c.env, kind, { force }));
+      } catch (error) {
+        // One active run per kind: surface the conflict as 409 so automated
+        // callers don't treat it as a transient 5xx and retry.
+        if (error instanceof RunAlreadyActiveError) {
+          return c.json({ error: error.message }, 409);
+        }
+        throw error;
+      }
+    }
+  );
+
+  app.get('/admin/routing-table', async c => {
+    const latest = await getLatestRoutingTable(c.env.BENCH_DB);
+    return c.json({ table: latest?.table ?? null, publishedAt: latest?.publishedAt ?? null });
+  });
+
+  app.get('/admin/classifier-winner', async c =>
+    c.json({ winner: await getClassifierWinner(c.env.BENCH_DB) })
+  );
+
+  // Runs one ad-hoc prompt through the kilo CLI container and returns raw
+  // (truncated) stdout lines plus the parsed result. Diagnostic-only.
+  app.post(
+    '/admin/debug-cli',
+    zodJsonValidator(DebugCliRequestSchema, { errorMessage: 'Invalid debug request' }),
+    async c => {
+      const config = await getBenchmarkConfig(c.env.BENCH_DB);
+      if (!config?.benchmarkUserId) {
+        return c.json({ error: 'benchmarkUserId is not configured' }, 400);
+      }
+      const kiloToken = await fetchBenchmarkUserToken(c.env, config.benchmarkUserId);
+      const result = await debugRunCli(c.env, { ...c.req.valid('json'), kiloToken });
+      return c.json(result);
+    }
+  );
+}
diff --git a/services/auto-routing-benchmark/src/auth.ts b/services/auto-routing-benchmark/src/auth.ts
new file mode 100644
index 0000000000..62d86cfe71
--- /dev/null
+++ b/services/auto-routing-benchmark/src/auth.ts
@@ -0,0 +1,6 @@
+import { backendAuthMiddleware } from '@kilocode/worker-utils';
+import type { HonoEnv } from './hono-env';
+
+export const authMiddleware = backendAuthMiddleware<HonoEnv>(c =>
+  c.env.INTERNAL_API_SECRET_PROD.get() // secret lookup handed to backendAuthMiddleware
+);
diff --git a/services/auto-routing-benchmark/src/bench-runner-container.ts b/services/auto-routing-benchmark/src/bench-runner-container.ts
new file mode 100644
index 0000000000..a3c712c4c7
--- /dev/null
+++ b/services/auto-routing-benchmark/src/bench-runner-container.ts
@@ -0,0 +1,14 @@
+import { Container } from '@cloudflare/containers';
+
+// Cloudflare Container that runs the stable `kilo` CLI for decider benchmark
+// cases. The worker proxies POST /run to the container's HTTP server (see
+// container/server.mjs) via this DO. One instance is keyed per
+// (runId, model, chunk) so concurrent chunks/models don't share state.
+export class BenchRunnerContainer extends Container<Env> {
+  defaultPort = 3000; // presumably the port container/server.mjs listens on — confirm
+  sleepAfter = '2m'; // idle period before the container sleeps (see @cloudflare/containers docs)
+  // The CLI resolves every gateway endpoint from KILO_API_URL. Production
+  // points at the real gateway; local dev overrides it via .dev.vars so the
+  // benchmark runs against the local apps/web instance.
+  envVars = { KILO_API_URL: this.env.KILO_CLI_API_URL };
+}
diff --git a/services/auto-routing-benchmark/src/cli-runner.ts b/services/auto-routing-benchmark/src/cli-runner.ts
new file mode 100644
index 0000000000..9f22cb3695
--- /dev/null
+++ b/services/auto-routing-benchmark/src/cli-runner.ts
@@ -0,0 +1,146 @@
+import { parseKiloRunEvents } from './kilo-events';
+import type { DeciderCase } from './datasets/decider-cases';
+
+export type CliRunResult = {
+  text: string; // `text` from parseKiloRunEvents over the container's stdout lines
+  costUsd: number | null; // `costUsd` from parseKiloRunEvents; may be null
+  latencyMs: number; // container-reported durationMs, else worker-side wall-clock time
+  exitCode: number; // exit code reported by the container
+  stderrTail: string; // stderr tail reported by the container; '' when absent
+  eventCount: number; // from parseKiloRunEvents
+  lastEventTypes: string[]; // from parseKiloRunEvents
+  timedOut: boolean; // container's `timedOut` flag; false when absent
+};
+
+const DECIDER_CLI_TIMEOUT_MS = 180_000; // 3 minutes; sent as `timeoutMs` with every container /run request
+
+// Appended to every decider prompt: the agent harness tends to wrap answers
+// in prose ("The output is: ..."), which strict mechanical checks reject.
+// One uniform instruction across all candidate models keeps grading fair.
+const FINAL_ANSWER_SUFFIX =
+  '\n\nIMPORTANT: Your final message must contain ONLY the answer in the exact requested format - no explanations, no preamble, no extra words.';
+
+type ContainerRunResponse = {
+  exitCode: number;
+  durationMs: number; // runDeciderCaseViaCli falls back to wall-clock time when this is missing
+  stdoutLines: string[]; // fed to parseKiloRunEvents; read with `?? []` since the body is cast, not validated
+  stderrTail: string;
+  timedOut?: boolean; // treated as false when omitted
+};
+
+/**
+ * Executes a single decider benchmark case with the `kilo` CLI hosted in a
+ * Cloudflare Container and returns the parsed outcome.
+ *
+ * The caller computes `instanceName`, the DO instance name (e.g.
+ * `${runId}:${model}:${chunk}`), so chunks/models keep mapping to stable
+ * instances. The CLI offers no system-prompt flag, hence the system prompt is
+ * prepended to the user prompt.
+ */
+export async function runDeciderCaseViaCli(
+  env: Env,
+  params: {
+    instanceName: string;
+    model: string;
+    benchCase: DeciderCase;
+    kiloToken: string;
+    reasoningEffort?: string | null;
+  }
+): Promise<CliRunResult> {
+  const { systemPrompt, userPrompt } = params.benchCase;
+  const container = env.BENCH_RUNNER.get(env.BENCH_RUNNER.idFromName(params.instanceName));
+  const requestBody = {
+    model: params.model,
+    prompt: `${systemPrompt}\n\n${userPrompt}${FINAL_ANSWER_SUFFIX}`,
+    kiloToken: params.kiloToken,
+    timeoutMs: DECIDER_CLI_TIMEOUT_MS,
+    variant: params.reasoningEffort ?? null,
+  };
+
+  const startedAt = Date.now();
+  const request = new Request('http://container/run', {
+    method: 'POST',
+    headers: { 'content-type': 'application/json' },
+    body: JSON.stringify(requestBody),
+  });
+  const response = await container.fetch(request);
+  if (!response.ok) {
+    const rawError = await response.text().catch(() => '');
+    const detail = rawError.slice(0, 500);
+    throw new Error(`container /run failed: HTTP ${response.status} ${detail}`);
+  }
+
+  const run = (await response.json()) as ContainerRunResponse;
+  const parsed = parseKiloRunEvents(run.stdoutLines ?? []);
+
+  return {
+    text: parsed.text,
+    costUsd: parsed.costUsd,
+    latencyMs: run.durationMs ?? Date.now() - startedAt,
+    exitCode: run.exitCode,
+    stderrTail: run.stderrTail ?? '',
+    eventCount: parsed.eventCount,
+    lastEventTypes: parsed.lastEventTypes,
+    timedOut: run.timedOut ?? false,
+  };
+}
+
+// Ad-hoc CLI run for the /admin/debug-cli endpoint: returns raw (truncated)
+// stdout lines alongside the parsed result so empty-output cases in prod can
+// be diagnosed without redeploying. stdout is capped at 80 lines of 600 chars
+// each for the response; parsing still sees the untruncated lines. As in
+// runDeciderCaseViaCli, durationMs falls back to worker-side wall-clock time
+// when the container reply omits it. Throws if the container answers non-2xx.
+export async function debugRunCli(
+  env: Env,
+  params: { model: string; prompt: string; kiloToken: string }
+): Promise<{
+  exitCode: number;
+  durationMs: number;
+  stderrTail: string;
+  stdoutLines: string[];
+  parsed: ReturnType<typeof parseKiloRunEvents>;
+}> {
+  const { model, prompt, kiloToken } = params;
+  const stub = env.BENCH_RUNNER.get(env.BENCH_RUNNER.idFromName(`debug:${model}`));
+  const startedAt = Date.now();
+  const response = await stub.fetch(
+    new Request('http://container/run', {
+      method: 'POST',
+      headers: { 'content-type': 'application/json' },
+      body: JSON.stringify({ model, prompt, kiloToken, timeoutMs: DECIDER_CLI_TIMEOUT_MS }),
+    })
+  );
+  if (!response.ok) {
+    const detail = (await response.text().catch(() => '')).slice(0, 500);
+    throw new Error(`container /run failed: HTTP ${response.status} ${detail}`);
+  }
+  const body = (await response.json()) as ContainerRunResponse;
+  const stdoutLines = (body.stdoutLines ?? []).slice(0, 80).map(l => l.slice(0, 600));
+  return {
+    exitCode: body.exitCode,
+    durationMs: body.durationMs ?? Date.now() - startedAt,
+    stderrTail: body.stderrTail ?? '',
+    stdoutLines,
+    parsed: parseKiloRunEvents(body.stdoutLines ?? []),
+  };
+}
+
+// Asks the container to run its one-time CLI warmup (sqlite migration etc.)
+// before the case loop. Best-effort for callers; throws (with body detail) on non-2xx.
+export async function warmUpCliContainer(
+  env: Env,
+  params: { instanceName: string; model: string; kiloToken: string }
+): Promise<void> {
+  const stub = env.BENCH_RUNNER.get(env.BENCH_RUNNER.idFromName(params.instanceName));
+  const response = await stub.fetch(
+    new Request('http://container/warmup', {
+      method: 'POST',
+      headers: { 'content-type': 'application/json' },
+      body: JSON.stringify({ model: params.model, kiloToken: params.kiloToken }),
+    })
+  );
+  if (response.ok) return;
+  const detail = (await response.text().catch(() => '')).slice(0, 500);
+  throw new Error(`container /warmup failed: HTTP ${response.status} ${detail}`);
+}
diff --git a/services/auto-routing-benchmark/src/config.test.ts b/services/auto-routing-benchmark/src/config.test.ts
new file mode 100644
index 0000000000..02bf051239
--- /dev/null
+++ b/services/auto-routing-benchmark/src/config.test.ts
@@ -0,0 +1,58 @@
+import { describe, expect, it } from 'vitest';
+import { mapConfigRows } from './config';
+import type { ConfigDeciderModelRow } from './db';
+
+const configRow = {
+  id: 1 as const, // extra fixture field; mapConfigRows' parameter type does not declare it
+  min_accuracy: 0.85,
+  switch_cost_factor: 3,
+  max_concurrency: 8,
+  benchmark_user_id: 'user-123',
+  classifier_repetitions: 1,
+  decider_repetitions: 1,
+  classifier_max_p95_latency_ms: null, // asserted below to pass through as a null classifierMaxP95LatencyMs
+  updated_at: '2026-06-01T00:00:00.000Z',
+  updated_by: 'admin@example.com',
+};
+
+const deciderRows: ConfigDeciderModelRow[] = [
+  {
+    model: 'some/decider', // surfaces as deciderModels[0].id
+    reasoning_effort: 'high', // surfaces as deciderModels[0].reasoningEffort
+  },
+];
+
+describe('mapConfigRows', () => {
+  it('returns null when config row is null', () => {
+    expect(mapConfigRows(null, ['some/model'], deciderRows)).toBeNull();
+  });
+
+  it('returns null when classifierModels array is empty', () => {
+    expect(mapConfigRows(configRow, [], deciderRows)).toBeNull();
+  });
+
+  it('returns null when deciderModels array is empty', () => {
+    expect(mapConfigRows(configRow, ['some/model'], [])).toBeNull();
+  });
+
+  it('maps a full config row set to BenchmarkConfig', () => {
+    const classifierModels = ['some/model-a', 'some/model-b'];
+
+    const result = mapConfigRows(configRow, classifierModels, deciderRows);
+
+    // One structural assertion; it fails on null and pins deciderModels to one entry.
+    expect(result).toMatchObject({
+      classifierModels,
+      deciderModels: [{ id: 'some/decider', reasoningEffort: 'high' }],
+      minAccuracy: 0.85,
+      switchCostFactor: 3,
+      maxConcurrency: 8,
+      benchmarkUserId: 'user-123',
+      classifierRepetitions: 1,
+      deciderRepetitions: 1,
+      classifierMaxP95LatencyMs: null,
+      updatedAt: '2026-06-01T00:00:00.000Z',
+      updatedBy: 'admin@example.com',
+    });
+  });
+});
diff --git a/services/auto-routing-benchmark/src/config.ts b/services/auto-routing-benchmark/src/config.ts
new file mode 100644
index 0000000000..99f34e2cb6
--- /dev/null
+++ b/services/auto-routing-benchmark/src/config.ts
@@ -0,0 +1,81 @@
+import type { BenchmarkConfig } from '@kilocode/auto-routing-contracts';
+import { getConfigRows, replaceConfig, type ConfigDeciderModelRow } from './db';
+
+// Converts rows from the three normalized config tables into the
+// BenchmarkConfig contract. Yields null until an admin has saved a config —
+// the worker never invents one, and no run can start without it.
+export function mapConfigRows(
+  configRow: {
+    min_accuracy: number;
+    switch_cost_factor: number;
+    max_concurrency: number;
+    benchmark_user_id: string | null;
+    classifier_repetitions: number;
+    decider_repetitions: number;
+    classifier_max_p95_latency_ms: number | null;
+    updated_at: string;
+    updated_by: string | null;
+  } | null,
+  classifierModels: string[],
+  deciderModelRows: ConfigDeciderModelRow[]
+): BenchmarkConfig | null {
+  if (configRow === null) return null;
+  if (classifierModels.length === 0 || deciderModelRows.length === 0) return null;
+
+  type ReasoningEffort = BenchmarkConfig['deciderModels'][number]['reasoningEffort'];
+  const deciderModels = deciderModelRows.map(row => ({
+    id: row.model,
+    reasoningEffort: row.reasoning_effort as ReasoningEffort,
+  }));
+  return {
+    classifierModels,
+    deciderModels,
+    minAccuracy: configRow.min_accuracy,
+    switchCostFactor: configRow.switch_cost_factor,
+    maxConcurrency: configRow.max_concurrency,
+    benchmarkUserId: configRow.benchmark_user_id,
+    classifierRepetitions: configRow.classifier_repetitions,
+    deciderRepetitions: configRow.decider_repetitions,
+    classifierMaxP95LatencyMs: configRow.classifier_max_p95_latency_ms,
+    updatedAt: configRow.updated_at,
+    updatedBy: configRow.updated_by,
+  };
+}
+
+export async function getBenchmarkConfig(db: D1Database): Promise<BenchmarkConfig | null> {
+  const rows = await getConfigRows(db); // config row plus both model lists
+  return mapConfigRows(rows.config, rows.classifierModels, rows.deciderModels);
+}
+
+// Stamps `config` with the current time and `updatedBy`, persists it through
+// replaceConfig (config row, classifier models, decider model rows), and
+// returns the stamped copy.
+export async function saveBenchmarkConfig(
+  db: D1Database,
+  config: BenchmarkConfig,
+  updatedBy: string | null
+): Promise<BenchmarkConfig> {
+  const updatedAt = new Date().toISOString();
+  const saved: BenchmarkConfig = { ...config, updatedAt, updatedBy };
+
+  // snake_case column values handed to replaceConfig.
+  const configRow = {
+    min_accuracy: config.minAccuracy,
+    switch_cost_factor: config.switchCostFactor,
+    max_concurrency: config.maxConcurrency,
+    benchmark_user_id: config.benchmarkUserId,
+    classifier_repetitions: config.classifierRepetitions,
+    decider_repetitions: config.deciderRepetitions,
+    classifier_max_p95_latency_ms: config.classifierMaxP95LatencyMs,
+    updated_at: updatedAt,
+    updated_by: updatedBy,
+  };
+
+  // A decider model without a reasoningEffort is stored with a null effort.
+  const deciderModelRows = config.deciderModels.map(
+    (m): ConfigDeciderModelRow => ({ model: m.id, reasoning_effort: m.reasoningEffort ?? null })
+  );
+
+  await replaceConfig(db, configRow, config.classifierModels, deciderModelRows);
+  return saved;
+}
diff --git a/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts b/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts
new file mode 100644
index 0000000000..ab1221497f
--- /dev/null
+++ b/services/auto-routing-benchmark/src/datasets/classifier-cases.test.ts
@@ -0,0 +1,88 @@
+import { describe, expect, it } from 'vitest';
+import { NormalizedClassifierInputSchema } from '@kilocode/auto-routing-contracts';
+import { classifierTaxonomy } from '@kilocode/auto-routing-contracts/classifier';
+import { CLASSIFIER_CASES } from './classifier-cases';
+
+const TAXONOMY_PAIRS = classifierTaxonomy.taskTypes.flatMap(taskType =>
+  taskType.subtypes.map(subtype => ({ taskType: taskType.id, subtaskType: subtype.id }))
+); // every (taskType id, subtype id) pair in the taxonomy
+
+const SUBTYPES_BY_TASK_TYPE = new Map(
+  classifierTaxonomy.taskTypes.map(taskType => [
+    taskType.id,
+    new Set(taskType.subtypes.map(subtype => subtype.id)),
+  ])
+); // taskType id -> set of its subtype ids
+
+describe('CLASSIFIER_CASES', () => {
+  it('covers all 18 taxonomy pairs', () => {
+    expect(TAXONOMY_PAIRS.length).toBe(18); // checks the taxonomy's size only; per-pair case coverage is asserted below
+  });
+
+  it('has unique ids and valid inputs', () => {
+    const ids = new Set(CLASSIFIER_CASES.map(c => c.id));
+    expect(ids.size).toBe(CLASSIFIER_CASES.length);
+    for (const c of CLASSIFIER_CASES) {
+      const result = NormalizedClassifierInputSchema.safeParse(c.input);
+      expect(result.success, `case ${c.id}: ${JSON.stringify(result.error?.issues)}`).toBe(true); // 2nd argument is the failure message
+    }
+  });
+
+  it('has at least 4 cases per (taskType, subtaskType) pair', () => {
+    for (const pair of TAXONOMY_PAIRS) {
+      const count = CLASSIFIER_CASES.filter(
+        c => c.expected.taskType === pair.taskType && c.expected.subtaskType === pair.subtaskType
+      ).length;
+      expect(count, `${pair.taskType}/${pair.subtaskType}`).toBeGreaterThanOrEqual(4);
+    }
+  });
+
+  it('labels every case with a subtaskType that belongs to its taskType', () => {
+    for (const c of CLASSIFIER_CASES) {
+      const subtypes = SUBTYPES_BY_TASK_TYPE.get(c.expected.taskType);
+      expect(subtypes, `unknown taskType in case ${c.id}`).toBeDefined();
+      expect(
+        subtypes?.has(c.expected.subtaskType),
+        `case ${c.id}: ${c.expected.subtaskType} does not belong to ${c.expected.taskType}`
+      ).toBe(true);
+    }
+  });
+
+  it('covers every task type with exactly 12 cases', () => {
+    const counts = new Map<string, number>();
+    for (const c of CLASSIFIER_CASES) {
+      counts.set(c.expected.taskType, (counts.get(c.expected.taskType) ?? 0) + 1);
+    }
+    for (const taskType of classifierTaxonomy.taskTypes) {
+      expect(counts.get(taskType.id) ?? 0, taskType.id).toBe(12); // exact count: adding or removing a case for a task type fails here
+    }
+  });
+
+  it('covers every reasoning complexity at least 8 times', () => {
+    for (const level of ['low', 'medium', 'high'] as const) {
+      expect(
+        CLASSIFIER_CASES.filter(c => c.expected.reasoningComplexity === level).length,
+        level
+      ).toBeGreaterThanOrEqual(8);
+    }
+  });
+
+  it('covers every risk level at least 4 times', () => {
+    for (const level of ['low', 'medium', 'high'] as const) {
+      expect(
+        CLASSIFIER_CASES.filter(c => c.expected.riskLevel === level).length,
+        level
+      ).toBeGreaterThanOrEqual(4);
+    }
+  });
+
+  it('has at least one of each reasoning complexity within every task type', () => {
+    const byType = Map.groupBy(CLASSIFIER_CASES, c => c.expected.taskType); // Map.groupBy is ES2024 (Node 21+)
+    for (const [taskType, cases] of byType) {
+      const levels = new Set(cases.map(c => c.expected.reasoningComplexity));
+      for (const level of ['low', 'medium', 'high'] as const) {
+        expect(levels.has(level), `${taskType} missing ${level}`).toBe(true);
+      }
+    }
+  });
+});
diff --git a/services/auto-routing-benchmark/src/datasets/classifier-cases.ts b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts
new file mode 100644
index 0000000000..7866c762c1
--- /dev/null
+++ b/services/auto-routing-benchmark/src/datasets/classifier-cases.ts
@@ -0,0 +1,1351 @@
+import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
+import type { ClassifierExpectation } from '../grading';
+
+export type ClassifierCase = {
+  id: string; // stable slug, e.g. 'impl-gen-semver-helper' (<taskType>-<subtype>-<topic>)
+  input: NormalizedClassifierInput; // the cases visible in this file build it with chat()
+  expected: ClassifierExpectation; // expected labels; presumably what grading compares against — confirm in ../grading
+};
+
+const AGENT_TOOLS_SYSTEM =
+  'You are Kilo Code, an AI coding assistant operating in an agentic loop with access to read_file, write_file, apply_diff, run_command and search_files tools. Work step by step and verify your changes.';
+const AGENT_PLAIN_SYSTEM =
+  'You are Kilo Code, an AI coding assistant. You help the user write and modify code in their workspace. Follow the user instructions precisely.';
+const CHAT_ASSISTANT_SYSTEM =
+  'You are a helpful senior software engineer. Answer the user clearly and concisely. Do not assume access to the user files unless they are pasted in the conversation.';
+
+const HINTS = { provider: null, providerOptions: null } as const; // shared providerHints for every input built by chat()
+
+// Builds a chat_completions classifier input for a benchmark case. Only the
+// two prompt prefixes and `opts` vary; the API kind, requested model, stream
+// flag and provider hints are fixed.
+function chat(
+  systemPromptPrefix: string,
+  userPromptPrefix: string,
+  opts: { messageCount: number; hasTools: boolean; latestUserPromptPrefix?: string | null }
+): NormalizedClassifierInput {
+  const { messageCount, hasTools, latestUserPromptPrefix = null } = opts;
+  return {
+    apiKind: 'chat_completions',
+    requestedModel: 'kilo-auto/efficient',
+    systemPromptPrefix,
+    userPromptPrefix,
+    latestUserPromptPrefix,
+    messageCount,
+    hasTools,
+    stream: true,
+    providerHints: HINTS,
+  };
+}
+
+// Four cases per (taskType, subtaskType) pair, with difficulty (context and
+// reasoning), execution mode, and risk varied within each pair. riskLevel
+// follows the taxonomy axis: high = auth/secrets/billing/user-data
+// migrations/production routing/destructive ops; medium = changes runtime
+// code, service config, or request contracts; low = read-only, test-only,
+// docs-only, or isolated reversible code.
+export const CLASSIFIER_CASES: readonly ClassifierCase[] = [
+  // ---------------------------------------------------------------------------
+  // implementation / feature_development
+  // ---------------------------------------------------------------------------
+  {
+    id: 'impl-feat-members-endpoint',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Add a new GET /api/projects/:id/members endpoint to our Express router in src/routes/projects.ts. Reuse the existing requireAuth middleware and the ProjectService.getMembers method, and return 404 when the project does not exist.',
+      { messageCount: 7, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'feature_development',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-feat-debounced-search',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Implement a useDebouncedValue(value, delayMs) React hook in src/hooks and use it in the SearchBar component so the onSearch callback fires at most once every 300ms. Keep the existing controlled-input behavior.',
+      { messageCount: 9, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'feature_development',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-feat-realtime-collab',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Build real-time collaborative editing for our document editor. We have a React frontend, a Node WebSocket gateway, and a Postgres store. Decide and implement a conflict-resolution strategy (OT vs CRDT), wire presence, persistence, and reconnection, and make it consistent across all three layers.',
+      { messageCount: 18, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'feature_development',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-feat-rate-limiter',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Implement a distributed sliding-window rate limiter that works across our 4 API replicas backed by Redis. It must handle clock skew between nodes, degrade gracefully if Redis is unavailable, and expose per-tenant limits configured in src/config/limits.ts. Integrate it into the existing middleware chain.',
+      { messageCount: 16, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'feature_development',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // implementation / code_generation
+  // ---------------------------------------------------------------------------
+  {
+    id: 'impl-gen-semver-helper',
+    input: chat(
+      AGENT_PLAIN_SYSTEM,
+      'Write a TypeScript helper function isValidSemver(version: string): boolean that returns true for valid semantic version strings like 1.2.3 and false otherwise. No external dependencies.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'code_generation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'impl-gen-pagination-schema',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Add a Zod schema named PaginationParamsSchema to src/schemas/pagination.ts with optional page (positive int, default 1) and pageSize (positive int, max 100, default 20) fields, and export its inferred type.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'code_generation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-gen-api-client',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Generate a typed TypeScript client for our internal REST API from the OpenAPI spec at docs/openapi.yaml: one function per endpoint, a shared fetch wrapper that injects the Authorization header, and response types derived from the spec schemas. Write it to src/generated/api-client.ts; nothing imports it yet, we will wire it in later.',
+      { messageCount: 5, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'code_generation',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-gen-ci-workflow',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Create a GitHub Actions workflow at .github/workflows/ci.yml that runs pnpm install with caching, then runs typecheck, lint, and test as parallel jobs on every pull request, using Node 22 and pnpm 9.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'code_generation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // implementation / test_creation
+  // ---------------------------------------------------------------------------
+  {
+    id: 'impl-test-slugify-units',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Add Jest unit tests for the slugify function in src/utils/slugify.ts. Cover unicode input, repeated spaces, leading and trailing dashes, and the maxLength option. The function works correctly today, we just have no coverage.',
+      { messageCount: 2, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'test_creation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-test-checkout-route',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Add supertest integration tests for the POST /api/checkout route: the happy path, an invalid coupon code, and an out-of-stock item. Reuse the existing test app factory in test/helpers/app.ts and the product fixtures. The route itself works fine in production.',
+      { messageCount: 7, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'test_creation',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-test-e2e-onboarding',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Build a Playwright E2E suite covering signup, email verification, workspace creation, and inviting a teammate, across the web app and the API. Set up seeded test users, per-test database isolation, and wire the suite into CI. Nothing is broken — we have zero end-to-end coverage today and need it before the next launch.',
+      { messageCount: 15, hasTools: true }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'test_creation',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'impl-test-pasted-debounce',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Here is my debounce implementation pasted below. Write a Jest test file for it covering the delay behavior, cancellation, and the immediate=true mode. Just give me the test code, I will add it to the repo myself.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'implementation',
+      subtaskType: 'test_creation',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // debugging / bug_fixing
+  // ---------------------------------------------------------------------------
+  {
+    id: 'debug-fix-import-mismatch',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Running the app throws "TypeError: formatDate is not a function" from src/utils/date.ts line 12. The file exports formatDate as a named export but App.tsx imports it as a default. Fix the import.',
+      { messageCount: 4, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'bug_fixing',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-fix-pagination-slice',
+    input: chat(
+      AGENT_PLAIN_SYSTEM,
+      'This pagination function returns one too few items on the last page. Here is the code: `return items.slice(page * size, page * size + size - 1)`. What is wrong and how do I fix it?',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'bug_fixing',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'debug-fix-cors-upload',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Browser requests to our /api/upload endpoint fail with "blocked by CORS policy: No Access-Control-Allow-Origin header". GET requests to other endpoints work fine. The cors middleware is configured in src/server.ts. Find why only upload is affected and fix it.',
+      { messageCount: 10, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'bug_fixing',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-fix-double-charge',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our payment webhook handler intermittently double-charges customers under load. We use a Postgres advisory lock around the charge, but the duplicate rows have timestamps 2-3ms apart. The handler runs on 3 replicas behind a queue with at-least-once delivery. Investigate the root cause across the worker, queue consumer, and DB layers and fix it.',
+      { messageCount: 14, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'bug_fixing',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // debugging / test_repair
+  // ---------------------------------------------------------------------------
+  {
+    id: 'debug-repair-bcrypt-stub',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our test "UserService > createUser persists the hashed password" fails since we upgraded bcryptjs to v3: the hash comes back undefined because the test still stubs the old callback-style API. The production code is verified working in staging. Update the test stub and assertions so the suite passes.',
+      { messageCount: 8, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'test_repair',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-repair-aria-snapshots',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'After adding an aria-label to the IconButton component, 14 Jest snapshot tests fail and every diff is just the new attribute. The new markup is intentional and correct. Update the snapshots and fix the one inline assertion in IconButton.test.tsx that checks the rendered props.',
+      { messageCount: 5, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'test_repair',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-repair-flaky-backoff',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'The "retries with exponential backoff" test in src/queue/retry.test.ts is flaky in CI: it asserts real elapsed time around setTimeout and fails when the runners are slow. The production retry logic is correct. Make the test deterministic with vitest fake timers without weakening what it asserts.',
+      { messageCount: 9, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'test_repair',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-repair-stale-fixtures',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'CI is red: nine tests in services/billing-worker fail with ZodError because the request fixtures still use the old amountCents field that was intentionally renamed to amountMinorUnits last week. The schema change is correct and already deployed. Update the fixtures to match.',
+      { messageCount: 6, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'test_repair',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // debugging / root_cause_analysis
+  // ---------------------------------------------------------------------------
+  {
+    id: 'debug-rca-sidebar-overflow',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Why does this sidebar overflow horizontally on mobile only? I pasted the component and its CSS module below; min-width is set on the nav list. Explain the cause — I will fix it myself.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'root_cause_analysis',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'debug-rca-local-401',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Requests from our Next.js app to the o11y worker return 401 in local dev, but the same code works in staging. The bearer token is read in apps/web/src/lib/workerClient.ts and validated in the worker auth middleware. Trace where the values diverge and tell me the root cause. Do not change anything yet.',
+      { messageCount: 7, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'root_cause_analysis',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-rca-search-500s',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Roughly 0.5% of requests to /api/search return a 500 with nothing in the application logs. Candidates: the Express handler, the OpenSearch client timeout config, or the nginx proxy in front. Gather evidence from the code and configs and tell me where the failures originate and why. Diagnosis only — I will decide on the fix.',
+      { messageCount: 13, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'root_cause_analysis',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'debug-rca-memory-leak',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our Node service RSS grows by ~50MB/hour in production and OOMs after a day, but it is stable locally. Heap snapshots show growing retained closures referencing our EventEmitter-based cache. It spans the cache module, the websocket session manager, and a third-party metrics client. Trace the leak across these and report the root cause with the retaining-path evidence. Do not fix anything yet — I want to review the diagnosis with the team first.',
+      { messageCount: 22, hasTools: true }
+    ),
+    expected: {
+      taskType: 'debugging',
+      subtaskType: 'root_cause_analysis',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // refactoring / code_cleanup
+  // ---------------------------------------------------------------------------
+  {
+    id: 'refactor-cleanup-rename-total',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'In src/cart.ts rename the variable `x` to `lineItemTotal` everywhere it is used in the calculateTotal function. No behavior change.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'code_cleanup',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-cleanup-seconds-constant',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'The magic number 86400 appears three times in src/scheduler.ts. Extract it into a named constant SECONDS_PER_DAY at the top of the file and use it in all three places. Keep behavior identical.',
+      { messageCount: 2, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'code_cleanup',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-cleanup-shared-pagination',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'src/routes/users.ts and src/routes/orgs.ts each define a parsePagination helper that is character-for-character identical. Move it to src/lib/pagination.ts and import it in both routes. No behavior change.',
+      { messageCount: 4, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'code_cleanup',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-cleanup-dead-flag',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Remove the dead code paths guarded by ENABLE_OLD_DASHBOARD across src/dashboard/ — the flag has been false in every environment for over a year and the env var was deleted from our deploy configs. Delete the guarded branches, the flag helper, and the now-unused components, keeping everything else identical. Run the test suite when done.',
+      { messageCount: 10, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'code_cleanup',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // refactoring / architecture_improvement
+  // ---------------------------------------------------------------------------
+  {
+    id: 'refactor-arch-order-service',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'The OrderController in src/controllers/order.ts has grown to 400 lines and mixes HTTP handling with business logic. Extract the business logic into an OrderService class, keep the controller thin, and update the existing controller tests to match. Behavior must stay the same.',
+      { messageCount: 11, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'architecture_improvement',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-arch-modular-monolith',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our monolithic src/app.ts wires routing, auth, database access, and background jobs in one 1200-line file with tangled circular imports. Restructure it into clear modules with one-directional dependencies, without changing any external behavior or public routes. Decide the boundaries and migrate incrementally.',
+      { messageCount: 26, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'architecture_improvement',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-arch-shared-worker-auth',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'The o11y worker and the notifications worker each carry a copy of the same bearer-token auth middleware. Move it into packages/worker-utils as a shared helper and have both workers consume it. Keep the validation behavior identical and keep both workers test suites green.',
+      { messageCount: 9, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'architecture_improvement',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-arch-repository-layer',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Our tRPC routers import the Drizzle client directly all over the place. Introduce a repository layer: define repository interfaces, implement them for the user and project routers first, update the wiring, and keep every procedure output identical. Set it up so the remaining routers can migrate incrementally.',
+      { messageCount: 21, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'architecture_improvement',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // refactoring / migration
+  // ---------------------------------------------------------------------------
+  {
+    id: 'refactor-migrate-async-await',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Migrate the .then()/.catch() promise chains in src/api/client.ts to async/await. There are about six methods. Preserve the existing error-handling semantics and return types exactly.',
+      { messageCount: 6, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'migration',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-migrate-drizzle',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Migrate our data layer from the legacy hand-written SQL query helpers spread across 30 files to Drizzle ORM, preserving every query result shape and transaction boundary. Plan the sequence so the app keeps passing tests at each step, then carry it out.',
+      { messageCount: 30, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'migration',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-migrate-secrets-binding',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Migrate the gastown worker from plaintext vars in wrangler.jsonc to Cloudflare Secrets Store bindings for OPENROUTER_API_KEY and WEBHOOK_SIGNING_SECRET: add the secrets_store_secrets binding, update the env access in the code, and remove the plaintext values. These are live production credentials.',
+      { messageCount: 8, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'migration',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'high',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'refactor-migrate-oxlint',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Migrate the packages/encryption package from ESLint to oxlint to match the rest of the monorepo: add an .oxlintrc.json extending the root config, switch the lint script in its package.json, and remove the eslint devDependencies.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'refactoring',
+      subtaskType: 'migration',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'code_change',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / architecture_design
+  // ---------------------------------------------------------------------------
+  {
+    id: 'plan-arch-express-structure',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'For a small Express API with about 8 endpoints, what is a sensible folder structure for routes, controllers, and services? Just describe the layout, do not write code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'architecture_design',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-arch-export-responsibility',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'We are adding CSV export to the reporting feature. Should it live in the existing ReportsService, which already handles querying and aggregation, or in a new ExportService? Export adds formatting and async delivery concerns. Recommend where the responsibility belongs and why — no code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'architecture_design',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-arch-dashboard-state',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Design the state-management structure for our React dashboard: we have server data via tRPC and React Query, local UI state, and filters that must survive page navigation. Propose which layer owns what (query cache vs a store vs URL params) and where the boundaries between them sit. Design only, I will implement it.',
+      { messageCount: 2, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'architecture_design',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-arch-cli-plugins',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Design a plugin architecture for our internal CLI so other teams can ship commands without touching core: the plugin interface, discovery and loading, version compatibility between core and plugins, and which core APIs stay stable. There are about 40 commands today and three teams that want in. Architecture only — no implementation plan needed yet.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'architecture_design',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / technical_planning
+  // ---------------------------------------------------------------------------
+  {
+    id: 'plan-steps-optimistic-ui',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'We want to add optimistic UI updates to our existing React + tRPC todo app. Break the work into an ordered implementation plan (state, mutation handling, rollback on error, tests). Just the plan, I will implement it.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'technical_planning',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-steps-node-upgrade',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Give me an ordered checklist for upgrading our Express service from Node 20 to Node 22: what to verify beforehand, the upgrade steps, and how to validate after each step. Keep it to the sequence of steps — we already know the runtime differences barely affect our code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'technical_planning',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-steps-user-module-cutover',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'The target design is already approved: the user module moves from the PHP monolith to the new TypeScript service. Plan the cutover into shippable steps — sequencing, feature flags, data backfill order, verification gates, and rollback points for each step. Plan only, the architecture itself is settled.',
+      { messageCount: 3, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'technical_planning',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-steps-flaky-ci-triage',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Our CI is red on about 30% of runs due to flaky tests. Draft a triage plan: how to rank the worst offenders from CI history, a quarantine policy, the order to fix them in, and how to keep new flakes out. Just the plan — no test code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'technical_planning',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / system_design
+  // ---------------------------------------------------------------------------
+  {
+    id: 'plan-system-catalog-caching',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'We have a read-heavy product catalog API hitting Postgres directly. Walk me through the tradeoffs of adding Redis caching vs HTTP cache headers vs a materialized view, and recommend one for a team of three with moderate traffic. No implementation yet.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'system_design',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-system-multitenant',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Design a multi-tenant architecture for our B2B SaaS. We need tenant isolation, per-tenant data residency (EU vs US), noisy-neighbor protection, and a path to enterprise single-tenant deployments later. Compare schema-per-tenant, row-level, and database-per-tenant, and recommend an approach with its failure modes. Design only.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'system_design',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-system-event-driven-orders',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'We run a synchronous request/response monolith and want to move order processing to an event-driven design with a message broker. Design the target architecture: event schema/versioning, idempotency, ordering guarantees, dead-letter handling, and how we cut over without downtime. Tradeoffs and a recommended broker, no code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'system_design',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'plan-system-webhook-guarantees',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Design the delivery contract for our outbound webhooks: retry schedule, idempotency keys, payload signing, ordering guarantees, and what we promise customers when their endpoint is down for hours. I want the contract and failure modes nailed down, not code.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'planning_design',
+      subtaskType: 'system_design',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // investigation / repo_exploration
+  // ---------------------------------------------------------------------------
+  {
+    id: 'invest-repo-feature-flags',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Where in the codebase is the function getFeatureFlags defined and which files import it? Just tell me, do not change anything.',
+      { messageCount: 2, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'repo_exploration',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-repo-secrets-usage',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'List every worker service in this monorepo that uses secrets_store_secrets in its wrangler config, and flag any that still keep plaintext vars. Just report the list with file paths — change nothing.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'repo_exploration',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-repo-kiloclaw-todos',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Find all TODO and FIXME comments under services/kiloclaw and list them with file path and line number. Read-only, do not modify anything.',
+      { messageCount: 2, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'repo_exploration',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-repo-lodash-audit',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Which packages in the monorepo still depend on lodash, and which lodash functions does each one actually import? I am assessing whether we can drop the dependency entirely. Report findings only.',
+      { messageCount: 4, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'repo_exploration',
+      contextComplexity: 'large',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // investigation / codebase_understanding
+  // ---------------------------------------------------------------------------
+  {
+    id: 'invest-code-cart-reducer',
+    input: chat(
+      AGENT_PLAIN_SYSTEM,
+      'Explain what this reducer does, step by step. It handles ADD_ITEM, REMOVE_ITEM, and CLEAR_CART actions. I just want to understand the logic.',
+      { messageCount: 1, hasTools: false }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'codebase_understanding',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: false,
+    },
+  },
+  {
+    id: 'invest-code-login-flow',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Explain how a login request flows through our app from the /auth/login route to the session cookie being set. Cover the controller, the AuthService, and the session middleware. I want to understand it before changing anything.',
+      { messageCount: 6, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'codebase_understanding',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-code-checkout-path',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Before we plan any optimization work, walk me through where time goes in our checkout path: the API handler, the database queries it runs, the cache lookups, and the synchronous third-party payment call. Explain which parts block the response and which are deferred. Understanding only — nothing is broken and nothing should change.',
+      { messageCount: 12, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'codebase_understanding',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-code-data-pipeline',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'We inherited an undocumented data pipeline spanning a cron service, three Lambda functions, an SQS queue, and a Redshift loader. Map out how data flows end to end, what each component assumes about the others, and where the implicit coupling and failure points are. Understanding only, no changes.',
+      { messageCount: 24, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'codebase_understanding',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // investigation / external_research
+  // ---------------------------------------------------------------------------
+  {
+    id: 'invest-ext-stripe-webhooks',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Look up the current Stripe Node SDK and summarize how to verify a webhook signature and what the recommended way to handle idempotency keys is. I need to know the current recommended API before I write any code.',
+      { messageCount: 1, hasTools: true, latestUserPromptPrefix: null }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'external_research',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-ext-license-check',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Check the current license of the fast-xml-parser npm package — the package page and its repository — and tell me whether we can use it in a commercial closed-source product. Report what the license actually says today, do not rely on memory.',
+      { messageCount: 1, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'external_research',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-ext-wrangler-secrets',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Read the current Cloudflare Wrangler docs and summarize how wrangler secret put relates to the newer Secrets Store commands: which command writes where, and what the recommended setup for Workers is today. Current docs only — this changed recently.',
+      { messageCount: 2, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'external_research',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'invest-ext-llm-pricing',
+    input: chat(
+      CHAT_ASSISTANT_SYSTEM,
+      'Research current pricing and rate limits for the frontier model APIs we could route to — OpenRouter plus the major first-party providers — and compare the effective cost per million tokens for our traffic mix of 80% short completions and 20% long-context requests, with prompt caching factored in. Summarize with sources.',
+      { messageCount: 1, hasTools: true }
+    ),
+    expected: {
+      taskType: 'investigation',
+      subtaskType: 'external_research',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'answer_only',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution / tool_usage
+  // ---------------------------------------------------------------------------
+  {
+    id: 'agentic-tool-pricing-toggle',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Use the browser tool to open http://localhost:3000/pricing, verify the new annual-billing toggle switches the displayed prices, and take a screenshot of both states. Report what you see — do not change any code.',
+      { messageCount: 4, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'tool_usage',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-tool-flag-toggle',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Use your HTTP tool to call the staging admin API: enable the newOnboarding feature flag for the qa-team tenant via POST /admin/flags, then GET it back to confirm it took effect. The admin token is in .env.staging.',
+      { messageCount: 5, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'tool_usage',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'medium',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-tool-mobile-screenshots',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Open /dashboard, /settings, and /billing in the browser at 375px viewport width and take a screenshot of each. I need them to review the mobile layout — just capture and report, no code changes.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'tool_usage',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-tool-signup-walkthrough',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Drive the browser through the signup flow on localhost:3000: fill the form with test+kilo@example.com, submit, enter the dev-mode verification code 000000, and confirm you land on the onboarding screen. Report the outcome of each step with a screenshot at the end.',
+      { messageCount: 8, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'tool_usage',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution / terminal_operations
+  // ---------------------------------------------------------------------------
+  {
+    id: 'agentic-term-run-tests',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Run the test suite with `pnpm test` and tell me if it passes.',
+      {
+        messageCount: 2,
+        hasTools: true,
+      }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'terminal_operations',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-term-git-state',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Run git status and git log --oneline -5 and show me the output so I know what state this checkout is in.',
+      { messageCount: 3, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'terminal_operations',
+      contextComplexity: 'small',
+      reasoningComplexity: 'low',
+      riskLevel: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-term-dev-health',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Start the local dev environment with `pnpm dev`, wait for it to boot, then curl http://localhost:3000/health and report whether the service and its database connection are healthy.',
+      { messageCount: 8, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'terminal_operations',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-term-api-container-logs',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'The api container keeps restarting. Run docker compose ps, then docker compose logs api --tail 100, identify which command in the logs is failing on boot, and report it back. Just diagnose via the commands, do not edit files.',
+      { messageCount: 10, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'terminal_operations',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'low',
+      executionMode: 'command_execution',
+      requiresTools: true,
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution / multi_step_execution
+  // ---------------------------------------------------------------------------
+  {
+    id: 'agentic-multi-cut-release',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Cut a release: bump the version, run the full build and test suite, build and push the multi-arch Docker image to our registry, tag the git commit, and verify the staging deploy comes up healthy. Stop and report if any step fails.',
+      { messageCount: 28, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'multi_step_execution',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-multi-env-recovery',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'My local environment is broken after a branch switch: migrations are out of sync, node_modules looks stale, and the worker will not start. Diagnose and recover it end to end by running the right commands in order, re-running checks after each fix, until pnpm dev comes up clean. Report what you changed.',
+      {
+        messageCount: 32,
+        hasTools: true,
+        latestUserPromptPrefix:
+          'Also clear the local cache before reinstalling, I think it is corrupt.',
+      }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'multi_step_execution',
+      contextComplexity: 'large',
+      reasoningComplexity: 'high',
+      riskLevel: 'low',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-multi-staging-deploy',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Deploy the notifications worker to staging: run its tests first, then wrangler deploy --env staging, tail the logs for a couple of minutes, hit the staging /health endpoint, and roll back to the previous version if anything looks wrong. Report each step.',
+      { messageCount: 11, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'multi_step_execution',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'medium',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+  {
+    id: 'agentic-multi-prod-backfill',
+    input: chat(
+      AGENT_TOOLS_SYSTEM,
+      'Run the production backfill for the new display_name column: snapshot the database first, run scripts/backfill-display-name.ts against prod in batches of 1000, verify the updated row count matches the user count, and stop immediately and report if any batch errors. I will be watching — narrate each step.',
+      { messageCount: 14, hasTools: true }
+    ),
+    expected: {
+      taskType: 'agentic_execution',
+      subtaskType: 'multi_step_execution',
+      contextComplexity: 'medium',
+      reasoningComplexity: 'medium',
+      riskLevel: 'high',
+      executionMode: 'multi_step_project',
+      requiresTools: true,
+    },
+  },
+];
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
new file mode 100644
index 0000000000..1fb02e8de4
--- /dev/null
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.test.ts
@@ -0,0 +1,92 @@
+import { describe, expect, it } from 'vitest';
+import { classifierTaxonomy } from '@kilocode/auto-routing-contracts/classifier';
+import { DECIDER_CASES } from './decider-cases';
+
+const TAXONOMY_PAIRS = classifierTaxonomy.taskTypes.flatMap(({ id, subtypes }) =>
+  subtypes.map(sub => ({ taskType: id, subtaskType: sub.id }))
+);
+
+const SUBTYPES_BY_TASK_TYPE = new Map(
+  classifierTaxonomy.taskTypes.map(({ id, subtypes }) => [
+    id,
+    new Set(subtypes.map(sub => sub.id)),
+  ])
+);
+
+describe('DECIDER_CASES', () => {
+  it('covers all 18 taxonomy pairs', () => {
+    expect(TAXONOMY_PAIRS.length).toBe(18); // pins the taxonomy size; per-pair case coverage is asserted two tests down
+  });
+
+  it('has exactly 76 cases with unique ids', () => {
+    expect(DECIDER_CASES.length).toBe(76);
+    const ids = new Set(DECIDER_CASES.map(c => c.id)); // a Set collapses duplicates, so a repeated id shrinks its size
+    expect(ids.size).toBe(DECIDER_CASES.length);
+  });
+
+  it('has at least 4 cases per (taskType, subtaskType) pair', () => {
+    for (const pair of TAXONOMY_PAIRS) {
+      const count = DECIDER_CASES.filter(
+        c => c.taskType === pair.taskType && c.subtaskType === pair.subtaskType
+      ).length;
+      expect(count, `${pair.taskType}/${pair.subtaskType}`).toBeGreaterThanOrEqual(4);
+    }
+  });
+
+  it('labels every case with a subtaskType that belongs to its taskType', () => {
+    for (const c of DECIDER_CASES) {
+      const subtypes = SUBTYPES_BY_TASK_TYPE.get(c.taskType); // undefined when the taskType is not in the taxonomy
+      expect(subtypes, `unknown taskType in case ${c.id}`).toBeDefined();
+      expect(
+        subtypes?.has(c.subtaskType),
+        `case ${c.id}: ${c.subtaskType} does not belong to ${c.taskType}`
+      ).toBe(true);
+    }
+  });
+
+  it('has at least 20 cases per tier', () => {
+    for (const tier of ['low', 'medium', 'high'] as const) {
+      expect(DECIDER_CASES.filter(c => c.tier === tier).length, tier).toBeGreaterThanOrEqual(20);
+    }
+  });
+
+  it('covers at least 4 distinct task types per tier', () => {
+    for (const tier of ['low', 'medium', 'high'] as const) {
+      const taskTypes = new Set(DECIDER_CASES.filter(c => c.tier === tier).map(c => c.taskType));
+      expect(taskTypes.size, tier).toBeGreaterThanOrEqual(4);
+    }
+  });
+
+  it('has compilable regex patterns', () => {
+    for (const c of DECIDER_CASES) {
+      const check = c.check; // const alias keeps the narrowed kind inside the closure below
+      if (check.kind === 'regex') {
+        expect(() => new RegExp(check.pattern, check.flags), c.id).not.toThrow();
+      }
+    }
+  });
+
+  it('has json_equal values that round-trip through JSON', () => {
+    for (const c of DECIDER_CASES) {
+      const check = c.check;
+      if (check.kind === 'json_equal') {
+        expect(JSON.parse(JSON.stringify(check.value)), c.id).toStrictEqual(check.value); // strict: toEqual ignores undefined-valued keys
+      }
+    }
+  });
+
+  it('has nonempty exact and contains_all values', () => {
+    for (const c of DECIDER_CASES) {
+      const check = c.check;
+      if (check.kind === 'exact') {
+        expect(check.value.length, c.id).toBeGreaterThan(0);
+      }
+      if (check.kind === 'contains_all') {
+        expect(check.values.length, c.id).toBeGreaterThan(0); // the list itself must not be empty
+        for (const v of check.values) {
+          expect(v.length, c.id).toBeGreaterThan(0);
+        }
+      }
+    }
+  });
+});
diff --git a/services/auto-routing-benchmark/src/datasets/decider-cases.ts b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
new file mode 100644
index 0000000000..fcb82a223f
--- /dev/null
+++ b/services/auto-routing-benchmark/src/datasets/decider-cases.ts
@@ -0,0 +1,881 @@
+import type {
+  ClassifierSubtaskType,
+  ClassifierTaskType,
+  DifficultyTier,
+} from '@kilocode/auto-routing-contracts';
+import type { DeciderCheck } from '../grading';
+
+export type DeciderCase = {
+  id: string; // stable slug, e.g. 'impl-gen-squares-array' (<taskType>-<subtype>-<topic>)
+  tier: DifficultyTier; // exactly one difficulty tier per case
+  taskType: ClassifierTaskType;
+  subtaskType: ClassifierSubtaskType; // expected to be a subtype of taskType in the classifier taxonomy
+  systemPrompt: string; // typically one of the shared *_SYS constants below
+  userPrompt: string; // task text; also states the exact answer format to reply with
+  check: DeciderCheck; // grading rule; kinds used in this dataset include 'exact', 'regex', 'json_equal'
+};
+
+const CODE_SYS =
+  'You are a precise coding assistant. Answer with only what is asked, no explanations.'; // system prompt for the code-reasoning cases
+const SYS_SYS =
+  'You are a precise systems engineer. Answer with only what is asked, no explanations.'; // systems-engineering variant of the same prompt
+const AGENT_SYS =
+  'You are a precise coding agent with file and terminal tools available. Complete the task exactly as specified, then answer with only what is asked, no explanations.'; // tool-using variant; presumably for agentic_execution cases — confirm
+
+// Golden answers below were each worked through by hand (and re-verified
+// mechanically where a snippet could be executed). Every case has a single
+// unambiguous, mechanically-checkable answer. Checks tolerate formatting
+// noise (fences/case/whitespace) but never wrong values. For json_equal cases
+// the prompt pins the exact key set in the same order as the expected value
+// (the comparison is JSON.stringify-based and order-sensitive). Each case
+// carries exactly one difficulty tier: low = mechanical lookups / trivial
+// evaluation, medium = multi-step reasoning / off-by-one traps / spec
+// application, high = deep tracing / multi-constraint puzzles / subtle
+// semantics. agentic_execution cases are self-contained tasks performed with
+// file/terminal tools inside the benchmark container (node:22-slim, no repo,
+// no network) and every command involved is deterministic there.
+export const DECIDER_CASES: readonly DeciderCase[] = [
+  // ---------------------------------------------------------------------------
+  // implementation / feature_development
+  // ---------------------------------------------------------------------------
+  {
+    id: 'impl-feat-ternary-parity',
+    tier: 'low',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with the exact output line only.\n\nconst n = 7;\nconsole.log(n % 2 === 0 ? "even" : "odd");',
+    check: { kind: 'exact', value: 'odd' },
+  },
+  {
+    id: 'impl-feat-array-pipeline',
+    tier: 'low',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with the exact output line only.\n\nconst xs = [1, 2, 3, 4].filter(x => x % 2 === 0).map(x => x * 10);\nconsole.log(xs.join("-"));',
+    check: { kind: 'exact', value: '20-40' },
+  },
+  {
+    id: 'impl-feat-closure-counter',
+    tier: 'medium',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What is the final printed value? Answer with only the number.\n\nfunction make() {\n  let c = 0;\n  return () => ++c;\n}\nconst f = make();\nf();\nf();\nconsole.log(f());',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'impl-feat-recursion-fib',
+    tier: 'medium',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'This computes a Fibonacci-like sequence where f(0)=0, f(1)=1, f(n)=f(n-1)+f(n-2). What is f(7)? Answer with only the number.',
+    check: { kind: 'exact', value: '13' },
+  },
+  {
+    id: 'impl-feat-this-binding',
+    tier: 'high',
+    taskType: 'implementation',
+    subtaskType: 'feature_development',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with only the number.\n\nconst obj = {\n  v: 10,\n  get() {\n    return [1, 2].map(function () {\n      return this?.v ?? 0;\n    }).reduce((a, b) => a + b, 0);\n  },\n};\nconsole.log(obj.get());',
+    check: { kind: 'exact', value: '0' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // implementation / code_generation
+  // ---------------------------------------------------------------------------
+  {
+    id: 'impl-gen-package-manifest',
+    tier: 'low',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Generate a minimal package manifest. Reply with only a JSON object with exactly the keys "name" and "version" in that order, where name is "demo-app" and version is "1.2.3".',
+    check: { kind: 'json_equal', value: { name: 'demo-app', version: '1.2.3' } },
+  },
+  {
+    id: 'impl-gen-squares-array',
+    tier: 'low',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Generate a test fixture: a JSON array containing the squares of the integers 1 through 6, in increasing order. Reply with only the JSON array.',
+    check: { kind: 'json_equal', value: [1, 4, 9, 16, 25, 36] },
+  },
+  {
+    id: 'impl-gen-no-consecutive-ones',
+    tier: 'medium',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Generate a test fixture: a JSON array of all binary strings of length 3 that contain no two consecutive 1s, in lexicographic order, each string as a JSON string. Reply with only the JSON array.',
+    check: { kind: 'json_equal', value: ['000', '001', '010', '100', '101'] },
+  },
+  {
+    id: 'impl-gen-two-ones-strings',
+    tier: 'high',
+    taskType: 'implementation',
+    subtaskType: 'code_generation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Generate a test fixture. Reply with only a JSON object with exactly the keys "count" and "strings" in that order, where strings is the JSON array of all binary strings of length 4 containing exactly two 1s, in lexicographic order, each as a JSON string, and count is the length of that array.',
+    check: {
+      kind: 'json_equal',
+      value: { count: 6, strings: ['0011', '0101', '0110', '1001', '1010', '1100'] },
+    },
+  },
+
+  // ---------------------------------------------------------------------------
+  // implementation / test_creation
+  // ---------------------------------------------------------------------------
+  {
+    id: 'impl-test-sort-expectation',
+    tier: 'low',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are writing a unit test. What value makes this assertion pass? Answer with the exact string only.\n\nexpect([5, 3, 8, 1].sort((a, b) => a - b).join(",")).toBe(?)',
+    check: { kind: 'exact', value: '1,3,5,8' },
+  },
+  {
+    id: 'impl-test-upper-expectation',
+    tier: 'low',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are writing a unit test. What value makes this assertion pass? Answer with the exact string only.\n\nexpect("hello".toUpperCase()).toBe(?)',
+    check: { kind: 'exact', value: 'HELLO' },
+  },
+  {
+    id: 'impl-test-mock-call-count',
+    tier: 'medium',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are writing a unit test with a mock:\n\nconst fn = vi.fn(x => x * 2);\nconst wrapped = x => fn(x) + fn(x);\nwrapped(3);\nwrapped(4);\nexpect(fn).toHaveBeenCalledTimes(?)\n\nWhat number makes the assertion pass? Answer with only the number.',
+    check: { kind: 'exact', value: '4' },
+  },
+  {
+    id: 'impl-test-trailing-zeros',
+    tier: 'high',
+    taskType: 'implementation',
+    subtaskType: 'test_creation',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are adding a test for a function trailingZeros(n) that returns the number of trailing zero digits of n! (n factorial). What expected value should the test assert for trailingZeros(25)? Answer with only the number.',
+    check: { kind: 'exact', value: '6' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // debugging / bug_fixing
+  // ---------------------------------------------------------------------------
+  {
+    id: 'debug-fix-parseint-suffix',
+    tier: 'low',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this JavaScript print? Answer with only the number.\n\nconsole.log(parseInt("42px", 10));',
+    check: { kind: 'exact', value: '42' },
+  },
+  {
+    id: 'debug-fix-binary-search',
+    tier: 'medium',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'This binary search has a bug. Reply with JSON {"line": <1-based line number of the buggy line>, "fix": "<the corrected line with leading whitespace removed, keeping single spaces around operators>"}.\n\n1: function bsearch(a, t) {\n2:   let lo = 0, hi = a.length;\n3:   while (lo < hi) {\n4:     const mid = (lo + hi) >> 1;\n5:     if (a[mid] === t) return mid;\n6:     if (a[mid] < t) lo = mid;\n7:     else hi = mid;\n8:   }\n9:   return -1;\n10: }',
+    check: { kind: 'json_equal', value: { line: 6, fix: 'if (a[mid] < t) lo = mid + 1;' } },
+  },
+  {
+    // 'pages' rather than 'pagination' so the id never collides with the
+    // classifier dataset's debug-fix-pagination-slice in shared telemetry.
+    id: 'debug-fix-pages-slice',
+    tier: 'medium',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'This pagination helper is buggy: pages([1, 2, 3, 4, 5, 6, 7], 3) should return [[1,2,3],[4,5,6],[7]] but loses elements. Reply with JSON {"line": <1-based line number of the buggy line>, "fix": "<the corrected line with leading whitespace removed, keeping single spaces around operators>"}.\n\n1: function pages(xs, size) {\n2:   const out = [];\n3:   for (let i = 0; i < xs.length; i += size) {\n4:     out.push(xs.slice(i, size));\n5:   }\n6:   return out;\n7: }',
+    check: { kind: 'json_equal', value: { line: 4, fix: 'out.push(xs.slice(i, i + size));' } },
+  },
+  {
+    id: 'debug-fix-regex-lastindex',
+    tier: 'high',
+    taskType: 'debugging',
+    subtaskType: 'bug_fixing',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A validator misbehaves on its second call because of a stateful regex bug. What does this print? Answer with only the two words printed, separated by a single space.\n\nconst re = /a/g;\nconsole.log(re.test("abc"), re.test("abc"));',
+    check: { kind: 'exact', value: 'true false' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // debugging / test_repair
+  // ---------------------------------------------------------------------------
+  {
+    id: 'debug-repair-compound-assign',
+    tier: 'low',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A unit test asserts that this program prints 25, and the test fails. The code is correct; the expectation is stale. What value should the updated test expect? Answer with only the number.\n\nlet x = 10;\nx += 5;\nx *= 2;\nconsole.log(x);',
+    check: { kind: 'exact', value: '30' },
+  },
+  {
+    id: 'debug-repair-date-format',
+    tier: 'medium',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A snapshot test fails after a date-formatter fix. The formatter now emits dates as zero-padded YYYY-MM-DD. What exact string should the updated snapshot expect for June 1, 2026? Answer with only the date string.',
+    check: { kind: 'exact', value: '2026-06-01' },
+  },
+  {
+    id: 'debug-repair-entries-shape',
+    tier: 'medium',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A test broke because a refactor changed a function to return Object.entries(obj) instead of obj. For obj = {a: 1, b: 2} (keys in that insertion order), what is the new return value? Reply with only that value as JSON (an array of [key, value] pairs in insertion order).',
+    check: {
+      kind: 'json_equal',
+      value: [
+        ['a', 1],
+        ['b', 2],
+      ],
+    },
+  },
+  {
+    id: 'debug-repair-float-sum',
+    tier: 'high',
+    taskType: 'debugging',
+    subtaskType: 'test_repair',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A failing test asserts expect(0.1 + 0.2).toBe(0.3). The repair pins the actual IEEE-754 value. What does console.log(0.1 + 0.2) print in JavaScript? Answer with the exact printed number only.',
+    check: { kind: 'exact', value: '0.30000000000000004' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // debugging / root_cause_analysis
+  // ---------------------------------------------------------------------------
+  {
+    id: 'debug-rca-async-order',
+    tier: 'medium',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this program print, in order? Answer with the four uppercase letters joined by commas, e.g. "A,B,C,D".\n\nconsole.log("A");\nPromise.resolve().then(() => console.log("B"));\nsetTimeout(() => console.log("C"), 0);\nconsole.log("D");',
+    check: { kind: 'regex', pattern: '^\\s*A\\s*,\\s*D\\s*,\\s*B\\s*,\\s*C\\s*$', flags: 'im' },
+  },
+  {
+    id: 'debug-rca-shared-ref',
+    tier: 'medium',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with only the number.\n\nconst a = [1, 2, 3];\nconst b = a;\nb.push(4);\nconsole.log(a.length);',
+    check: { kind: 'exact', value: '4' },
+  },
+  {
+    id: 'debug-rca-closure-loop-var',
+    tier: 'high',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (var i = 0; i < 3; i++) {\n  fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());',
+    check: { kind: 'regex', pattern: '^\\s*3\\s*,\\s*3\\s*,\\s*3\\s*$', flags: 'm' },
+  },
+  {
+    id: 'debug-rca-float-equality',
+    tier: 'high',
+    taskType: 'debugging',
+    subtaskType: 'root_cause_analysis',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'In IEEE-754 double precision (JavaScript Number), does the expression (0.1 + 0.2 === 0.3) evaluate to true or false? Answer with only the lowercase word true or false.',
+    check: { kind: 'exact', value: 'false' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // refactoring / code_cleanup
+  // ---------------------------------------------------------------------------
+  {
+    id: 'refactor-cleanup-loop-to-reduce',
+    tier: 'low',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A loop sums an array. What value does it produce? Answer with only the number.\n\nlet total = 0;\nfor (const n of [4, 4, 4]) total += n;\nconsole.log(total);',
+    check: { kind: 'exact', value: '12' },
+  },
+  {
+    id: 'refactor-cleanup-extract-helper',
+    tier: 'low',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Two branches both compute s.trim().toLowerCase(), so you extract a helper norm(s) that does exactly that. What does norm("  HeLLo ") return? Answer with the exact string only.',
+    check: { kind: 'exact', value: 'hello' },
+  },
+  {
+    id: 'refactor-cleanup-map-equivalent',
+    tier: 'medium',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'After refactoring, both versions must produce the same output. What number does this print? Answer with only the number.\n\nconst nums = [10, 20, 30];\nconst doubled = nums.map(n => n * 2);\nconsole.log(doubled[1]);',
+    check: { kind: 'exact', value: '40' },
+  },
+  {
+    id: 'refactor-cleanup-short-circuit',
+    tier: 'high',
+    taskType: 'refactoring',
+    subtaskType: 'code_cleanup',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'What does this print? Answer with only the number.\n\nlet calls = 0;\nfunction side() {\n  calls++;\n  return 0;\n}\nconst result = side() || side() || 7;\nconsole.log(calls);',
+    check: { kind: 'exact', value: '2' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // refactoring / architecture_improvement
+  // ---------------------------------------------------------------------------
+  {
+    id: 'refactor-arch-import-updates',
+    tier: 'low',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Files x.ts, y.ts, and z.ts each contain exactly one import of helper.ts. helper.ts moves to a new directory, changing its import path. How many import statements must be updated? Answer with only the number.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'refactor-arch-layer-depth',
+    tier: 'medium',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      "Modules and their imports: app imports auth and billing; auth imports core; billing imports core; core imports nothing. In a layered architecture where a module's layer is 1 + the maximum layer of its imports, and core is layer 1, what layer is app? Answer with only the number.",
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'refactor-arch-interface-edges',
+    tier: 'medium',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A module graph has edges A->B, A->C, B->D, C->D. To improve the architecture you introduce an interface module I: the edges B->D and C->D are removed and replaced by B->I, C->I, and I->D. How many edges does the new graph have? Answer with only the number.',
+    check: { kind: 'exact', value: '5' },
+  },
+  {
+    id: 'refactor-arch-cycle-cut',
+    tier: 'high',
+    taskType: 'refactoring',
+    subtaskType: 'architecture_improvement',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A module graph has directed import edges A->B, B->C, C->A, B->D, D->B, D->E. You must make the graph acyclic by deleting the minimum number of import edges. Reply with JSON {"deleted": <minimum number of edges to delete>, "remaining": <number of edges left after deleting them>}.',
+    check: { kind: 'json_equal', value: { deleted: 2, remaining: 4 } },
+  },
+
+  // ---------------------------------------------------------------------------
+  // refactoring / migration
+  // ---------------------------------------------------------------------------
+  {
+    id: 'refactor-migrate-substr-slice',
+    tier: 'low',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are migrating code off the deprecated String.prototype.substr. The old call is "javascript".substr(4, 3). What string does the equivalent migrated call "javascript".slice(4, 7) return? Answer with the exact string only.',
+    check: { kind: 'exact', value: 'scr' },
+  },
+  {
+    id: 'refactor-migrate-promise-chain',
+    tier: 'medium',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'After migrating a callback API to promises, the code reads:\n\nPromise.resolve(2).then(x => x + 1).then(x => x * 10).then(x => console.log(x));\n\nWhat number does it print? Answer with only the number.',
+    check: { kind: 'exact', value: '30' },
+  },
+  {
+    id: 'refactor-migrate-strict-equality',
+    tier: 'medium',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are migrating a codebase from == to ===. How many of these four comparisons change their result after replacing == with ===?\n\n"1" == 1\nnull == undefined\n2 == 2\nNaN == NaN\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '2' },
+  },
+  {
+    id: 'refactor-migrate-var-to-let',
+    tier: 'high',
+    taskType: 'refactoring',
+    subtaskType: 'migration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A loop variable was migrated from var to let. What does the migrated code print? Answer with the three numbers joined by commas, e.g. "1,2,3".\n\nconst fns = [];\nfor (let i = 0; i < 3; i++) {\n  fns.push(() => i);\n}\nconsole.log(fns[0]() + "," + fns[1]() + "," + fns[2]());',
+    check: { kind: 'regex', pattern: '^\\s*0\\s*,\\s*1\\s*,\\s*2\\s*$', flags: 'm' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / architecture_design
+  // ---------------------------------------------------------------------------
+  {
+    id: 'plan-arch-three-tier',
+    tier: 'low',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'In a classic three-tier architecture with presentation, business, and data tiers, which tier should contain the SQL queries? Answer with only one word: presentation, business, or data.',
+    check: { kind: 'exact', value: 'data' },
+  },
+  {
+    id: 'plan-arch-call-chain',
+    tier: 'medium',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A service design has these synchronous call edges: gateway calls auth and orders; orders calls inventory and billing; billing calls ledger. Counting edges, how long is the longest call chain starting at gateway? Answer with only the number.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'plan-arch-dependency-rules',
+    tier: 'medium',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A layered design enforces these rules: ui may import only app; app may import domain and infra; infra may import domain; domain imports nothing. How many of these five proposed imports violate the rules?\n\nui -> app\nui -> domain\napp -> domain\ninfra -> app\ndomain -> infra\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'plan-arch-latency-budget',
+    tier: 'high',
+    taskType: 'planning_design',
+    subtaskType: 'architecture_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A design must keep worst-case request latency within a 300 ms budget. The synchronous chain is gateway (10 ms) -> auth (40 ms) -> service (120 ms) -> db (90 ms), and in the worst case the db call is retried once (the db is called twice; all other components run once). Reply with JSON {"totalMs": <worst-case total latency in ms>, "withinBudget": <true|false>}.',
+    check: { kind: 'json_equal', value: { totalMs: 350, withinBudget: false } },
+  },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / technical_planning
+  // ---------------------------------------------------------------------------
+  {
+    id: 'plan-steps-rollout-order',
+    tier: 'low',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A rollout plan has four steps in strict sequence: write code, code review, deploy to staging, deploy to production. Which step is third? Answer with only the exact step name.',
+    check: { kind: 'exact', value: 'deploy to staging' },
+  },
+  {
+    id: 'plan-steps-batch-count',
+    tier: 'medium',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A data migration plan processes 1000 records in batches of up to 80 records, one batch per run. How many runs does the plan need to process all records? Answer with only the number.',
+    check: { kind: 'exact', value: '13' },
+  },
+  {
+    id: 'plan-steps-deploy-waves',
+    tier: 'medium',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Services A, B, C, D deploy in waves: a service can only deploy after all its dependencies are deployed, and any number of services can share a wave. Dependencies: B needs A; C needs A; D needs B and C. Reply with JSON {"waves": <minimum number of waves>, "dWave": <1-based wave in which D deploys>}.',
+    check: { kind: 'json_equal', value: { waves: 3, dWave: 3 } },
+  },
+  {
+    id: 'plan-steps-critical-path',
+    tier: 'high',
+    taskType: 'planning_design',
+    subtaskType: 'technical_planning',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A plan has tasks with durations in days and dependencies: A (3 days) has no dependencies; B (2 days) starts after A; C (4 days) starts after A; D (1 day) starts after both B and C; E (2 days) starts after D. With unlimited parallelism, what is the minimum number of days to finish all tasks? Answer with only the number.',
+    check: { kind: 'exact', value: '10' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // planning_design / system_design
+  // ---------------------------------------------------------------------------
+  {
+    id: 'plan-system-write-quorum',
+    tier: 'low',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A system replicates each write to 3 nodes and requires a majority quorum of acknowledgements before confirming the write. How many node acknowledgements are required? Answer with only the number.',
+    check: { kind: 'exact', value: '2' },
+  },
+  {
+    id: 'plan-system-rate-limit-window',
+    tier: 'medium',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A fixed-window rate limiter allows 100 requests per 60-second window. A client sends 80 requests in the first 30 seconds of a window, then 40 more requests in the next 20 seconds (same window). How many of the 40 later requests are rejected? Answer with only the number.',
+    check: { kind: 'exact', value: '20' },
+  },
+  {
+    id: 'plan-system-replica-availability',
+    tier: 'medium',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A service is available when at least one of its two independent replicas is up. Each replica is up 90% of the time, independently. What is the service availability as a percentage? Answer with only the number.',
+    check: { kind: 'exact', value: '99' },
+  },
+  {
+    id: 'plan-system-cache-staleness',
+    tier: 'high',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A write-through cache with TTL 60s. At t=0s key K is written (value 1, cached). At t=30s the database row for K is updated to value 2 by a process that bypasses the cache (does not invalidate it). At t=45s a reader requests K. At t=70s another reader requests K. The cache returns its entry if present and unexpired, otherwise reads the DB and caches. What value does the t=45s reader get, and what value does the t=70s reader get? Reply with JSON {"first": <number>, "second": <number>}.',
+    check: { kind: 'json_equal', value: { first: 1, second: 2 } },
+  },
+  {
+    id: 'plan-system-queue-trace',
+    tier: 'high',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Three workers process a queue with at-least-once delivery. Worker A reads job 7 at t=0ms and crashes at t=50ms, before performing the insert and before ack. Visibility timeout is 30ms. Worker B receives job 7 at t=35ms, processes it in 40ms and acks. Worker C receives job 7 at t=80ms (redelivery triggered by the crash recovery scan at t=70ms) and processes it in 10ms, acking at t=90ms. The job inserts a row keyed by an idempotency key with ON CONFLICT DO NOTHING. How many rows exist at t=100ms, and which worker\'s insert won? Reply with JSON {"rows": <number>, "winner": "<A|B|C>"}.',
+    check: { kind: 'json_equal', value: { rows: 1, winner: 'B' } },
+  },
+  {
+    id: 'plan-system-deadlock-order',
+    tier: 'high',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Two threads acquire locks. Thread 1: lock A, then lock B. Thread 2: lock B, then lock A. Both hold the first lock and then block forever waiting for the second. To eliminate the deadlock by enforcing a global lock acquisition order (alphabetical: A before B), which single thread number must have its two lock acquisitions reordered? Answer with only the thread number.',
+    check: { kind: 'exact', value: '2' },
+  },
+  {
+    id: 'plan-system-txn-isolation',
+    tier: 'high',
+    taskType: 'planning_design',
+    subtaskType: 'system_design',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'A counter row holds value 5. Under READ COMMITTED isolation, two concurrent transactions T1 and T2 each run: SELECT v FROM c; then UPDATE c SET v = (the value they read) + 1. Both read before either writes, T1 commits first, then T2 commits (last-write-wins, no row lock taken on the SELECT). What is the final value of v? Answer with only the number.',
+    check: { kind: 'exact', value: '6' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // investigation / repo_exploration
+  // ---------------------------------------------------------------------------
+  {
+    id: 'invest-repo-test-file-count',
+    tier: 'low',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A repository listing shows these files:\n\nsrc/app.ts\nsrc/app.test.ts\nsrc/util.ts\nsrc/util.test.ts\nsrc/index.ts\nREADME.md\n\nHow many files end in .test.ts? Answer with only the number.',
+    check: { kind: 'exact', value: '2' },
+  },
+  {
+    id: 'invest-repo-glob-match',
+    tier: 'medium',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Using a glob where ** matches zero or more directories, how many of these files match the pattern src/**/*.ts?\n\nsrc/a.ts\nsrc/lib/b.ts\nsrc/lib/deep/c.ts\ntest/d.ts\nsrc/e.tsx\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'invest-repo-grep-case',
+    tier: 'medium',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A file contains exactly these 5 lines:\n\nError: failed\nerror handled\nno problems\nERROR_CODE=7\nerrors: none\n\nHow many lines does a case-sensitive search for the string "error" match? Answer with only the number.',
+    check: { kind: 'exact', value: '2' },
+  },
+  {
+    id: 'invest-repo-gitignore',
+    tier: 'high',
+    taskType: 'investigation',
+    subtaskType: 'repo_exploration',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'A .gitignore contains exactly these rules in order:\n\n*.log\n!important.log\nlogs/\n\nUsing standard git semantics (a pattern without a slash matches at any depth, and a file cannot be re-included if a parent directory of it is excluded), how many of these files are ignored?\n\ndebug.log\nimportant.log\nlogs/important.log\nlogs/app.txt\nsrc/trace.log\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '4' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // investigation / codebase_understanding
+  // ---------------------------------------------------------------------------
+  {
+    id: 'invest-code-char-count',
+    tier: 'low',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'How many times does the letter "a" appear in the word "banana"? Answer with only the number.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'invest-code-object-keys',
+    tier: 'low',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'How many own enumerable keys does this object have? Answer with only the number.\n\nconst o = { a: 1, b: 2, c: 3 };',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'invest-code-regex-groups',
+    tier: 'medium',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Given the regex /(\\d{4})-(\\d{2})-(\\d{2})/ applied to "2026-06-11", what is capture group 2? Answer with only the value.',
+    check: { kind: 'exact', value: '06' },
+  },
+  {
+    id: 'invest-code-collatz-depth',
+    tier: 'high',
+    taskType: 'investigation',
+    subtaskType: 'codebase_understanding',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'You are reading unfamiliar code. What does f(6) return?\n\nfunction f(n) {\n  if (n <= 1) return n;\n  return n % 2 === 0 ? f(n / 2) + 1 : f(3 * n + 1);\n}\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '7' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // investigation / external_research
+  // ---------------------------------------------------------------------------
+  {
+    id: 'invest-ext-http-created',
+    tier: 'low',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt:
+      'You are a precise web API expert. Answer with only what is asked, no explanations.',
+    userPrompt:
+      'Which standard HTTP status code indicates that a new resource was successfully created? Answer with only the 3-digit number.',
+    check: { kind: 'exact', value: '201' },
+  },
+  {
+    id: 'invest-ext-utf8-euro',
+    tier: 'medium',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt: SYS_SYS,
+    userPrompt:
+      'Per the UTF-8 encoding specification, how many bytes does the encoding of the euro sign (U+20AC) use? Answer with only the number.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'invest-ext-semver-caret',
+    tier: 'medium',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Per the npm semver range specification, consider the range ^1.4.2. Does it include version 1.5.0, and does it include version 2.0.0? Reply with JSON {"v150": <true|false>, "v200": <true|false>}.',
+    check: { kind: 'json_equal', value: { v150: true, v200: false } },
+  },
+  {
+    id: 'invest-ext-json-spec',
+    tier: 'high',
+    taskType: 'investigation',
+    subtaskType: 'external_research',
+    systemPrompt: CODE_SYS,
+    userPrompt:
+      'Per the JSON specification (RFC 8259), how many of these four documents are valid JSON?\n\n{"a": 01}\n{"a": 1,}\n{"a": .5}\n{"a": 1e2}\n\nAnswer with only the number.',
+    check: { kind: 'exact', value: '1' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution / tool_usage
+  // ---------------------------------------------------------------------------
+  {
+    id: 'agentic-tool-json-read',
+    tier: 'low',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Using your file tools, create a file /tmp/bench-kv.json containing exactly this JSON: {"alpha": 4, "beta": 9}. Then read the file back and answer with only the value of the key "beta".',
+    check: { kind: 'exact', value: '9' },
+  },
+  {
+    id: 'agentic-tool-notes-count',
+    tier: 'low',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create a directory /tmp/bench-notes containing exactly three files named one.txt, two.txt, and three.txt (any content). Then list the directory and answer with only the number of files it contains.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'agentic-tool-log-grep',
+    tier: 'medium',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create a file /tmp/bench-app.log containing exactly these 6 lines:\n\nINFO start\nERROR disk full\nINFO retry\nERROR timeout\nWARN slow\nERROR disk full\n\nThen search the file and answer with only the number of lines that contain the word ERROR.',
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'agentic-tool-csv-filter-sum',
+    tier: 'high',
+    taskType: 'agentic_execution',
+    subtaskType: 'tool_usage',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create a file /tmp/bench-data.csv containing exactly these 6 lines:\n\nid,qty\na,12\nb,7\ne,31\no,50\nk,9\n\nThen compute the sum of the qty column over only the rows whose id is a vowel (a, e, i, o, or u), and answer with only the number.',
+    check: { kind: 'exact', value: '93' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution / terminal_operations
+  // ---------------------------------------------------------------------------
+  {
+    id: 'agentic-term-node-major',
+    tier: 'low',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Run this command in the terminal and answer with only the number it prints:\n\nnode -e "console.log(process.versions.node.split(\'.\')[0])"',
+    check: { kind: 'exact', value: '22' },
+  },
+  {
+    id: 'agentic-term-wc-lines',
+    tier: 'low',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Using the terminal, write a file /tmp/bench-words.txt containing exactly these 5 lines:\n\nred\ngreen\nblue\ncyan\nplum\n\nThen run: wc -l < /tmp/bench-words.txt and answer with only the number it prints.',
+    check: { kind: 'exact', value: '5' },
+  },
+  {
+    id: 'agentic-term-sort-pipeline',
+    tier: 'medium',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      "Run this pipeline in the terminal and answer with only the line it prints:\n\nprintf 'pear\\napple\\nbanana\\n' | sort | head -n 1",
+    check: { kind: 'exact', value: 'apple' },
+  },
+  {
+    id: 'agentic-term-sha256-prefix',
+    tier: 'high',
+    taskType: 'agentic_execution',
+    subtaskType: 'terminal_operations',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      "Run this command in the terminal and answer with only the 8 characters it prints:\n\nnode -e \"console.log(require('crypto').createHash('sha256').update('kilo-benchmark').digest('hex').slice(0, 8))\"",
+    check: { kind: 'exact', value: 'fd99e6a4' },
+  },
+
+  // ---------------------------------------------------------------------------
+  // agentic_execution / multi_step_execution
+  // ---------------------------------------------------------------------------
+  {
+    id: 'agentic-multi-seq-sum',
+    tier: 'medium',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create a file /tmp/bench-seq.txt containing the integers 1 through 10, one per line. Then use a terminal command to sum the lines and answer with only the sum.',
+    check: { kind: 'exact', value: '55' },
+  },
+  {
+    id: 'agentic-multi-node-script',
+    tier: 'medium',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Write a file /tmp/bench-fib.js containing a Node.js script that computes f(12) for the sequence f(1) = 1, f(2) = 1, f(n) = f(n-1) + f(n-2), and prints the result. Run it with node and answer with only the number it prints.',
+    check: { kind: 'exact', value: '144' },
+  },
+  {
+    id: 'agentic-multi-find-count',
+    tier: 'medium',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      "Create directories /tmp/bench-proj/src and /tmp/bench-proj/test. Create empty files /tmp/bench-proj/src/a.ts, /tmp/bench-proj/src/b.ts, and /tmp/bench-proj/test/a.test.ts. Then run:\n\nfind /tmp/bench-proj -name '*.ts' | wc -l\n\nand answer with only the number it prints.",
+    check: { kind: 'exact', value: '3' },
+  },
+  {
+    id: 'agentic-multi-json-transform',
+    tier: 'high',
+    taskType: 'agentic_execution',
+    subtaskType: 'multi_step_execution',
+    systemPrompt: AGENT_SYS,
+    userPrompt:
+      'Create a file /tmp/bench-in.json containing exactly this JSON array: [3, 1, 4, 1, 5, 9, 2, 6, 5, 3]. Then write and run a Node.js script that reads the file, computes the sum of the distinct values in the array, and prints it. Answer with only the number.',
+    check: { kind: 'exact', value: '30' },
+  },
+];
diff --git a/services/auto-routing-benchmark/src/db-schema.ts b/services/auto-routing-benchmark/src/db-schema.ts
new file mode 100644
index 0000000000..2a4c88035c
--- /dev/null
+++ b/services/auto-routing-benchmark/src/db-schema.ts
@@ -0,0 +1,149 @@
+import { sql } from 'drizzle-orm';
+import { integer, primaryKey, real, sqliteTable, text, uniqueIndex } from 'drizzle-orm/sqlite-core';
+import type { BenchmarkKind, BenchmarkRunStatus } from '@kilocode/auto-routing-contracts';
+
+// Migrations are generated via `pnpm db:generate` (drizzle-kit) and applied
+// via wrangler d1 migrations apply.
+
+export const benchmarkConfig = sqliteTable('benchmark_config', { // Benchmark settings; benchmark_runs carries same-named snapshot columns for several of these.
+  id: integer('id').primaryKey(), // NOTE(review): presumably a single settings row — confirm against the writer.
+  min_accuracy: real('min_accuracy').notNull(),
+  switch_cost_factor: real('switch_cost_factor').notNull(),
+  max_concurrency: integer('max_concurrency').notNull(),
+  benchmark_user_id: text('benchmark_user_id'), // nullable
+  classifier_repetitions: integer('classifier_repetitions').notNull().default(1),
+  decider_repetitions: integer('decider_repetitions').notNull().default(1),
+  classifier_max_p95_latency_ms: integer('classifier_max_p95_latency_ms'), // nullable
+  updated_at: text('updated_at').notNull(), // Stored as text; NOTE(review): presumably an ISO-8601 timestamp — confirm.
+  updated_by: text('updated_by'), // nullable
+});
+
+export const configClassifierModels = sqliteTable('config_classifier_models', { // One row per model id; NOTE(review): presumably the classifier models to benchmark — confirm.
+  model: text('model').primaryKey(),
+});
+
+export const configDeciderModels = sqliteTable('config_decider_models', { // One row per model id, each with an optional reasoning_effort.
+  model: text('model').primaryKey(),
+  reasoning_effort: text('reasoning_effort'), // nullable
+});
+
+export const benchmarkRuns = sqliteTable( // One row per benchmark run, including the config snapshot it ran under.
+  'benchmark_runs',
+  {
+    id: text('id').primaryKey(),
+    kind: text('kind').$type<BenchmarkKind>().notNull(), // $type narrows the TS type only; the column is plain text.
+    status: text('status').$type<BenchmarkRunStatus>().notNull(), // The unique index below keys off the literal 'running'.
+    started_at: text('started_at').notNull(),
+    completed_at: text('completed_at'), // nullable
+    error: text('error'), // nullable
+    // Config snapshot taken at startRun time so mid-run edits can't skew results.
+    min_accuracy: real('min_accuracy').notNull(),
+    switch_cost_factor: real('switch_cost_factor').notNull(),
+    max_concurrency: integer('max_concurrency').notNull(),
+    benchmark_user_id: text('benchmark_user_id'),
+    repetitions: integer('repetitions').notNull().default(1),
+    classifier_max_p95_latency_ms: integer('classifier_max_p95_latency_ms'),
+    // Benchmark-identity snapshot: dataset content hash + engine version. A prior
+    // model's summaries may only be carried into a new run when this matches (and
+    // repetitions + the model's reasoning_effort match), so changes to the
+    // dataset, grading, or CLI/image pinning re-benchmark instead of pairing
+    // current serving config with measurements taken under different conditions.
+    engine_identity: text('engine_identity').notNull().default(''),
+  },
+  table => [
+    // At most one running run per kind — the atomic backstop for the
+    // server-side "one active run per kind" admission rule (concurrent POSTs /
+    // multiple tabs that slip past the pre-check still can't both claim).
+    uniqueIndex('UQ_benchmark_runs_one_running_per_kind')
+      .on(table.kind)
+      .where(sql`${table.status} = 'running'`), // Partial index: only rows whose status is 'running' are constrained.
+  ]
+);
+
+export const runModels = sqliteTable( // Models attached to a run; one row per (run_id, model).
+  'run_models',
+  {
+    run_id: text('run_id').notNull(), // No .references() declared; NOTE(review): presumably benchmark_runs.id — confirm.
+    model: text('model').notNull(),
+    // enqueued=false means the model was skipped (had prior results).
+    enqueued: integer('enqueued', { mode: 'boolean' }).notNull(),
+    reasoning_effort: text('reasoning_effort'), // nullable
+  },
+  table => [primaryKey({ columns: [table.run_id, table.model] })]
+);
+
+export const modelSummaries = sqliteTable( // Aggregate metrics, one row per (run_id, model, tier).
+  'model_summaries',
+  {
+    run_id: text('run_id').notNull(), // No .references() declared; NOTE(review): presumably benchmark_runs.id — confirm.
+    model: text('model').notNull(),
+    tier: text('tier').notNull(),
+    accuracy: real('accuracy').notNull(),
+    avg_cost_usd: real('avg_cost_usd'), // nullable
+    avg_latency_ms: real('avg_latency_ms').notNull(),
+    p50_latency_ms: real('p50_latency_ms'), // nullable
+    cases: integer('cases').notNull(),
+    errors: integer('errors').notNull(),
+    p95_latency_ms: real('p95_latency_ms'), // nullable
+    timeouts: integer('timeouts').notNull().default(0),
+    // carried=true rows are prior-run summaries copied in at startRun for skipped models.
+    carried: integer('carried', { mode: 'boolean' }).notNull().default(false),
+  },
+  table => [primaryKey({ columns: [table.run_id, table.model, table.tier] })]
+);
+
+export const caseResults = sqliteTable( // Per-case results, one row per (run_id, model, case_id, rep).
+  'case_results',
+  {
+    run_id: text('run_id').notNull(), // No .references() declared; NOTE(review): presumably benchmark_runs.id — confirm.
+    model: text('model').notNull(),
+    case_id: text('case_id').notNull(),
+    tier: text('tier'), // nullable here, unlike model_summaries.tier
+    score: real('score').notNull(),
+    latency_ms: integer('latency_ms').notNull(),
+    cost_usd: real('cost_usd'), // nullable
+    error: text('error'), // nullable
+    // Classifier diagnostics.
+    fallback_reason: text('fallback_reason'),
+    retried: integer('retried', { mode: 'boolean' }), // nullable boolean
+    // Decider diagnostics.
+    exit_code: integer('exit_code'),
+    output_prefix: text('output_prefix'),
+    event_count: integer('event_count'),
+    last_event_types: text('last_event_types'), // Stored as text; NOTE(review): serialization format not shown here — confirm.
+    // Repetition index (0-based); together with run_id/model/case_id forms the PK.
+    rep: integer('rep').notNull().default(0),
+    // 1 when the case was killed by the wall-clock timeout, 0 otherwise.
+    timed_out: integer('timed_out').notNull().default(0), // Plain integer (no boolean mode), unlike `retried` above.
+  },
+  // The composite PK's leftmost column already serves run_id-prefix lookups
+  // (count/fetch by run); no separate run_id index is needed.
+  table => [primaryKey({ columns: [table.run_id, table.model, table.case_id, table.rep] })]
+);
+
+export const routingTables = sqliteTable('routing_tables', { // At most one row per run_id (it is the primary key).
+  run_id: text('run_id').primaryKey(), // No .references() declared; NOTE(review): presumably benchmark_runs.id — confirm.
+  published_at: text('published_at').notNull(),
+  generated_at: text('generated_at').notNull(),
+  min_accuracy: real('min_accuracy').notNull(),
+  switch_cost_factor: real('switch_cost_factor').notNull(),
+  source: text('source').notNull(), // NOTE(review): allowed values are not visible in this file — confirm against the contracts package.
+});
+
+export const routingTableCandidates = sqliteTable( // Ranked candidates, one row per (run_id, tier, rank).
+  'routing_table_candidates',
+  {
+    run_id: text('run_id').notNull(), // No .references() declared; NOTE(review): presumably routing_tables.run_id — confirm.
+    tier: text('tier').notNull(),
+    rank: integer('rank').notNull(), // NOTE(review): whether rank is 0- or 1-based is not shown here — confirm.
+    model: text('model').notNull(),
+    accuracy: real('accuracy').notNull(),
+    // Non-null unlike model_summaries: RankedCandidate.avgCostUsd is a plain
+    // nonnegative number (buildRoutingTable excludes summaries without a
+    // cost signal, so every published candidate has one).
+    avg_cost_usd: real('avg_cost_usd').notNull(),
+    meets_threshold: integer('meets_threshold', { mode: 'boolean' }).notNull(),
+    reasoning_effort: text('reasoning_effort'), // nullable
+  },
+  table => [primaryKey({ columns: [table.run_id, table.tier, table.rank] })]
+);
diff --git a/services/auto-routing-benchmark/src/db.test.ts b/services/auto-routing-benchmark/src/db.test.ts
new file mode 100644
index 0000000000..103482e00d
--- /dev/null
+++ b/services/auto-routing-benchmark/src/db.test.ts
@@ -0,0 +1,199 @@
+import { describe, it, expect } from 'vitest';
+import { RoutingTableSchema } from '@kilocode/auto-routing-contracts';
+import type { RankedCandidate, RoutingTable } from '@kilocode/auto-routing-contracts';
+import { mapRunRow, mapSummaryRow, routingTableToRows, rowsToRoutingTable } from './db';
+import type { BenchmarkModelSummary } from '@kilocode/auto-routing-contracts';
+
+// ---------------------------------------------------------------------------
+// mapSummaryRow
+// ---------------------------------------------------------------------------
+
+describe('mapSummaryRow', () => {
+  it('maps snake_case columns to camelCase BenchmarkModelSummary', () => {
+    const row = {
+      run_id: 'run-1',
+      model: 'openai/gpt-4o',
+      tier: 'high',
+      accuracy: 0.92,
+      avg_cost_usd: 0.0015,
+      avg_latency_ms: 320.5,
+      p50_latency_ms: 300.0,
+      p95_latency_ms: 455.0, // deliberately differs from p50 so a swapped p50/p95 mapping fails
+      cases: 50,
+      errors: 2,
+      timeouts: 0,
+      carried: false,
+    };
+    const result = mapSummaryRow(row);
+    expect(result).toEqual<BenchmarkModelSummary>({ // run_id and carried must not appear in the result
+      model: 'openai/gpt-4o',
+      tier: 'high',
+      accuracy: 0.92,
+      avgCostUsd: 0.0015,
+      avgLatencyMs: 320.5,
+      p50LatencyMs: 300.0,
+      p95LatencyMs: 455.0,
+      cases: 50,
+      errors: 2,
+      timeouts: 0,
+    });
+  });
+
+  it('handles null avg_cost_usd, p50_latency_ms and p95_latency_ms', () => {
+    const row = {
+      run_id: 'run-2',
+      model: 'anthropic/claude-3-haiku',
+      tier: '*',
+      accuracy: 0.85,
+      avg_cost_usd: null,
+      avg_latency_ms: 150.0,
+      p50_latency_ms: null,
+      p95_latency_ms: null,
+      cases: 30,
+      errors: 0,
+      timeouts: 0,
+      carried: false,
+    };
+    const result = mapSummaryRow(row);
+    expect(result.avgCostUsd).toBeNull(); // nullable columns pass through as null
+    expect(result.p50LatencyMs).toBeNull();
+    expect(result.p95LatencyMs).toBeNull();
+    expect(result.tier).toBe('*');
+    expect(result.errors).toBe(0);
+    expect(result.timeouts).toBe(0);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// mapRunRow
+// ---------------------------------------------------------------------------
+
+describe('mapRunRow', () => {
+  it('maps a RunRow and attaches its summaries', () => {
+    const runRow = {
+      id: 'run-abc',
+      kind: 'classifier' as const,
+      status: 'completed' as const,
+      started_at: '2026-06-10T04:10:00.000Z',
+      completed_at: '2026-06-10T04:25:00.000Z',
+      error: null,
+      min_accuracy: 0.7, // from here on: run settings that mapRunRow does not copy
+      switch_cost_factor: 3,
+      max_concurrency: 4,
+      benchmark_user_id: null,
+      repetitions: 1,
+      classifier_max_p95_latency_ms: null,
+      engine_identity: 'v1:deadbeef',
+    };
+    const summaries: BenchmarkModelSummary[] = [
+      {
+        model: 'openai/gpt-4o-mini',
+        tier: '*',
+        accuracy: 0.78,
+        avgCostUsd: 0.0002,
+        avgLatencyMs: 120,
+        p50LatencyMs: 110,
+        p95LatencyMs: null,
+        cases: 100,
+        errors: 5,
+        timeouts: 0,
+      },
+    ];
+    const result = mapRunRow(runRow, summaries);
+    expect(result.id).toBe('run-abc');
+    expect(result.kind).toBe('classifier');
+    expect(result.status).toBe('completed');
+    expect(result.startedAt).toBe('2026-06-10T04:10:00.000Z'); // started_at -> startedAt
+    expect(result.completedAt).toBe('2026-06-10T04:25:00.000Z'); // completed_at -> completedAt
+    expect(result.error).toBeNull();
+    expect(result.summaries).toHaveLength(1);
+    expect(result.summaries[0].model).toBe('openai/gpt-4o-mini');
+  });
+
+  it('attaches an empty summaries array when none are provided', () => {
+    const runRow = {
+      id: 'run-xyz',
+      kind: 'decider' as const,
+      status: 'running' as const,
+      started_at: '2026-06-11T05:10:00.000Z',
+      completed_at: null, // a run that is still in progress has no completion time
+      error: null,
+      min_accuracy: 0.7,
+      switch_cost_factor: 3,
+      max_concurrency: 4,
+      benchmark_user_id: null,
+      repetitions: 1,
+      classifier_max_p95_latency_ms: null,
+      engine_identity: 'v1:deadbeef',
+    };
+    const result = mapRunRow(runRow, []);
+    expect(result.summaries).toEqual([]);
+    expect(result.completedAt).toBeNull();
+  });
+});
+
+// ---------------------------------------------------------------------------
+// routingTableToRows / rowsToRoutingTable round-trip
+// ---------------------------------------------------------------------------
+
+// Fixture factory: every candidate shares the same metrics and differs only by model id.
+function candidate(model: string): RankedCandidate {
+  return {
+    model, accuracy: 0.9, avgCostUsd: 0.001,
+    meetsThreshold: true, reasoningEffort: null,
+  };
+}
+
+const sampleTable: RoutingTable = { // shared fixture for the explode/reassemble tests below
+  version: 'run-test-1', // becomes run_id on every stored row
+  generatedAt: '2026-06-01T10:00:00.000Z',
+  minAccuracy: 0.7,
+  switchCostFactor: 3,
+  source: 'benchmark',
+  tiers: {
+    low: [candidate('model-a'), candidate('model-b')], // two entries, so rank ordering is observable
+    medium: [candidate('model-c')],
+    high: [candidate('model-a')], // the same model may appear in more than one tier
+  },
+};
+
+describe('routingTableToRows', () => {
+  it('produces a tableRow with the correct scalar fields', () => {
+    const { tableRow } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
+    expect(tableRow.run_id).toBe('run-test-1'); // taken from sampleTable.version
+    expect(tableRow.published_at).toBe('2026-06-01T11:00:00.000Z'); // the publishedAt argument
+    expect(tableRow.generated_at).toBe('2026-06-01T10:00:00.000Z'); // sampleTable.generatedAt
+    expect(tableRow.min_accuracy).toBe(0.7);
+    expect(tableRow.switch_cost_factor).toBe(3);
+    expect(tableRow.source).toBe('benchmark');
+  });
+
+  it('assigns rank 0,1 for the two low-tier candidates', () => {
+    const { candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
+    const lowRows = candidateRows.filter(r => r.tier === 'low').sort((a, b) => a.rank - b.rank);
+    expect(lowRows).toHaveLength(2);
+    expect(lowRows[0].model).toBe('model-a'); // rank follows the order inside tiers.low
+    expect(lowRows[0].rank).toBe(0);
+    expect(lowRows[1].model).toBe('model-b');
+    expect(lowRows[1].rank).toBe(1);
+  });
+});
+
+describe('rowsToRoutingTable', () => {
+  it('round-trips: rowsToRoutingTable(routingTableToRows(table)) === table', () => {
+    const { tableRow, candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
+    const reassembled = rowsToRoutingTable(tableRow, candidateRows);
+    expect(reassembled).toEqual(sampleTable); // deep equality, not reference identity
+    // The reassembled table must satisfy the contract schema (getLatestRoutingTable parses it).
+    expect(RoutingTableSchema.parse(reassembled)).toEqual(sampleTable);
+  });
+
+  it('preserves candidate order within each tier', () => {
+    const { tableRow, candidateRows } = routingTableToRows(sampleTable, '2026-06-01T11:00:00.000Z');
+    // Reverse candidateRows so ranks arrive out of order; reassembly must sort by rank.
+    const shuffled = [...candidateRows].reverse();
+    const reassembled = rowsToRoutingTable(tableRow, shuffled);
+    expect(reassembled.tiers.low[0].model).toBe('model-a');
+    expect(reassembled.tiers.low[1].model).toBe('model-b');
+  });
+});
diff --git a/services/auto-routing-benchmark/src/db.ts b/services/auto-routing-benchmark/src/db.ts
new file mode 100644
index 0000000000..da3e805f97
--- /dev/null
+++ b/services/auto-routing-benchmark/src/db.ts
@@ -0,0 +1,641 @@
+import type {
+  BenchmarkKind,
+  BenchmarkModelSummary,
+  BenchmarkRun,
+  ClassifierWinner,
+  RankedCandidate,
+  RoutingTable,
+} from '@kilocode/auto-routing-contracts';
+import type { BatchItem } from 'drizzle-orm/batch';
+import { RoutingTableSchema } from '@kilocode/auto-routing-contracts';
+import { and, count, desc, eq, gt, inArray, lt, ne } from 'drizzle-orm';
+import { drizzle } from 'drizzle-orm/d1';
+import {
+  benchmarkConfig,
+  benchmarkRuns,
+  caseResults,
+  configClassifierModels,
+  configDeciderModels,
+  modelSummaries,
+  routingTableCandidates,
+  routingTables,
+  runModels,
+} from './db-schema';
+import { pickClassifierWinner } from './winner';
+
+export type CaseResultRow = typeof caseResults.$inferSelect; // row shape selected from `caseResults`
+export type RunRow = typeof benchmarkRuns.$inferSelect; // row shape selected from `benchmarkRuns`
+export type RunModelRow = typeof runModels.$inferSelect; // row shape selected from `runModels`
+export type ConfigDeciderModelRow = typeof configDeciderModels.$inferSelect; // row shape of `configDeciderModels`
+type ModelSummaryRow = typeof modelSummaries.$inferSelect; // module-private: input type of mapSummaryRow
+
+// ---------------------------------------------------------------------------
+// Row mapping helpers
+// ---------------------------------------------------------------------------
+
+// Maps a snake_case `model_summaries` row onto the camelCase contract summary (run_id/carried are dropped).
+export function mapSummaryRow(row: ModelSummaryRow): BenchmarkModelSummary {
+  const { model, accuracy, cases, errors, timeouts } = row;
+  return {
+    model,
+    tier: row.tier as BenchmarkModelSummary['tier'],
+    accuracy,
+    avgCostUsd: row.avg_cost_usd,
+    avgLatencyMs: row.avg_latency_ms,
+    p50LatencyMs: row.p50_latency_ms,
+    p95LatencyMs: row.p95_latency_ms,
+    cases, errors, timeouts,
+  };
+}
+
+// Builds the API-facing BenchmarkRun from a `benchmark_runs` row and its summaries.
+// Only id, kind, status, timestamps and error are copied; the remaining columns on
+// the row are not part of the returned object.
+export function mapRunRow(row: RunRow, summaries: BenchmarkModelSummary[]): BenchmarkRun {
+  const {
+    id, kind, status, error,
+    started_at: startedAt,
+    completed_at: completedAt,
+  } = row;
+  return { id, kind, status, startedAt, completedAt, error, summaries };
+}
+
+// ---------------------------------------------------------------------------
+// Config
+// ---------------------------------------------------------------------------
+
+// Reads the singleton config row (id = 1) and both configured model lists in parallel.
+// `config` is null when the singleton row does not exist.
+export async function getConfigRows(db: D1Database): Promise<{
+  config: typeof benchmarkConfig.$inferSelect | null;
+  classifierModels: string[];
+  deciderModels: ConfigDeciderModelRow[];
+}> {
+  const orm = drizzle(db);
+  const singletonQuery = orm.select().from(benchmarkConfig).where(eq(benchmarkConfig.id, 1)).limit(1);
+  const [singleton, classifierRows, deciderModels] = await Promise.all([
+    singletonQuery,
+    orm.select().from(configClassifierModels),
+    orm.select().from(configDeciderModels),
+  ]);
+  const config = singleton[0] ?? null;
+  return { config, classifierModels: classifierRows.map(({ model }) => model), deciderModels };
+}
+
+// Replaces the whole benchmark configuration with one batch call: upserts the
+// singleton row (id = 1), clears both model lists, then re-inserts the given models.
+// An empty list adds no insert statement, which leaves its table empty.
+export async function replaceConfig(
+  db: D1Database,
+  config: {
+    min_accuracy: number;
+    switch_cost_factor: number;
+    max_concurrency: number;
+    benchmark_user_id: string | null;
+    classifier_repetitions: number;
+    decider_repetitions: number;
+    classifier_max_p95_latency_ms: number | null;
+    updated_at: string;
+    updated_by: string | null;
+  },
+  classifierModels: string[],
+  deciderModels: ConfigDeciderModelRow[]
+): Promise<void> {
+  const orm = drizzle(db);
+  const upsertSingleton = orm
+    .insert(benchmarkConfig)
+    .values({ id: 1, ...config })
+    .onConflictDoUpdate({ target: benchmarkConfig.id, set: config });
+  const stmts: [BatchItem<'sqlite'>, ...BatchItem<'sqlite'>[]] = [
+    upsertSingleton,
+    orm.delete(configClassifierModels),
+    orm.delete(configDeciderModels),
+  ];
+  if (classifierModels.length !== 0) {
+    const classifierRows = classifierModels.map(model => ({ model }));
+    stmts.push(orm.insert(configClassifierModels).values(classifierRows));
+  }
+  if (deciderModels.length !== 0) {
+    stmts.push(orm.insert(configDeciderModels).values(deciderModels));
+  }
+  await orm.batch(stmts);
+}
+
+// ---------------------------------------------------------------------------
+// Runs
+// ---------------------------------------------------------------------------
+
+// Creates a `benchmark_runs` row in status 'running' together with its model list and
+// the summaries carried over from earlier runs (stored with carried = true). With nothing
+// to attach the run row is inserted on its own; otherwise everything goes through one batch.
+// Multi-row inserts are split so that no statement exceeds D1's 100 bound parameters.
+export async function insertRun(
+  db: D1Database,
+  run: {
+    id: string;
+    kind: BenchmarkKind;
+    startedAt: string;
+    min_accuracy: number;
+    switch_cost_factor: number;
+    max_concurrency: number;
+    benchmark_user_id: string | null;
+    repetitions: number;
+    classifier_max_p95_latency_ms: number | null;
+    engine_identity: string;
+  },
+  models: RunModelRow[],
+  carriedSummaries: BenchmarkModelSummary[]
+): Promise<void> {
+  const orm = drizzle(db);
+  const insertRunStmt = orm.insert(benchmarkRuns).values({
+    id: run.id,
+    kind: run.kind,
+    status: 'running',
+    started_at: run.startedAt,
+    min_accuracy: run.min_accuracy,
+    switch_cost_factor: run.switch_cost_factor,
+    max_concurrency: run.max_concurrency,
+    benchmark_user_id: run.benchmark_user_id,
+    repetitions: run.repetitions,
+    classifier_max_p95_latency_ms: run.classifier_max_p95_latency_ms,
+    engine_identity: run.engine_identity,
+  });
+  if (models.length === 0 && carriedSummaries.length === 0) {
+    await insertRunStmt;
+    return;
+  }
+
+  const stmts: [BatchItem<'sqlite'>, ...BatchItem<'sqlite'>[]] = [insertRunStmt];
+  const modelColumns = Math.max(1, Object.keys(models[0] ?? {}).length); // one bound value per column
+  const modelsPerInsert = Math.max(1, Math.floor(100 / modelColumns));
+  for (let i = 0; i < models.length; i += modelsPerInsert) {
+    stmts.push(orm.insert(runModels).values(models.slice(i, i + modelsPerInsert)));
+  }
+  const carriedRows = carriedSummaries.map(s => ({
+    run_id: run.id,
+    model: s.model,
+    tier: s.tier,
+    accuracy: s.accuracy,
+    avg_cost_usd: s.avgCostUsd,
+    avg_latency_ms: s.avgLatencyMs,
+    p50_latency_ms: s.p50LatencyMs,
+    p95_latency_ms: s.p95LatencyMs,
+    cases: s.cases,
+    errors: s.errors,
+    timeouts: s.timeouts,
+    carried: true,
+  }));
+  const summariesPerInsert = Math.floor(100 / 12); // 12 bound values per summary row
+  for (let i = 0; i < carriedRows.length; i += summariesPerInsert) {
+    stmts.push(orm.insert(modelSummaries).values(carriedRows.slice(i, i + summariesPerInsert)));
+  }
+  await orm.batch(stmts);
+}
+
+// Loads a run row and its `run_models` rows in parallel; resolves to null when no
+// run with the given id exists (any model rows fetched are discarded in that case).
+export async function getRunWithModels(
+  db: D1Database,
+  runId: string
+): Promise<{ run: RunRow; models: RunModelRow[] } | null> {
+  const orm = drizzle(db);
+  const runQuery = orm.select().from(benchmarkRuns).where(eq(benchmarkRuns.id, runId)).get();
+  const modelsQuery = orm.select().from(runModels).where(eq(runModels.run_id, runId));
+  const [run, models] = await Promise.all([runQuery, modelsQuery]);
+  return run ? { run, models } : null;
+}
+
+// ---------------------------------------------------------------------------
+// Case results
+// ---------------------------------------------------------------------------
+
+// Inserts one case result. When a row with the same primary key
+// (run_id, model, case_id, rep) already exists, the columns listed below are
+// overwritten with the values from `row` instead. `rep` is also part of the
+// conflict target, so assigning it again does not change the stored row.
+export async function upsertCaseResult(db: D1Database, row: CaseResultRow): Promise<void> {
+  const overwritten = {
+    tier: row.tier,
+    score: row.score,
+    latency_ms: row.latency_ms,
+    cost_usd: row.cost_usd,
+    error: row.error,
+    fallback_reason: row.fallback_reason,
+    retried: row.retried,
+    exit_code: row.exit_code,
+    output_prefix: row.output_prefix,
+    event_count: row.event_count,
+    last_event_types: row.last_event_types,
+    rep: row.rep,
+    timed_out: row.timed_out,
+  };
+  const target = [caseResults.run_id, caseResults.model, caseResults.case_id, caseResults.rep];
+  await drizzle(db).insert(caseResults).values(row).onConflictDoUpdate({ target, set: overwritten });
+}
+
+// Counts the stored case-result rows of a run (0 when the query yields no row).
+export async function countCaseResults(db: D1Database, runId: string): Promise<number> {
+  const belongsToRun = eq(caseResults.run_id, runId);
+  const query = drizzle(db).select({ n: count() }).from(caseResults).where(belongsToRun);
+  const counted = await query.get();
+  const total = counted?.n;
+  return total ?? 0;
+}
+
+export async function getCaseResults(db: D1Database, runId: string): Promise<CaseResultRow[]> {
+  return drizzle(db).select().from(caseResults).where(eq(caseResults.run_id, runId)); // every row of the run; no ORDER BY is applied
+}
+
+// ---------------------------------------------------------------------------
+// Model summaries
+// ---------------------------------------------------------------------------
+
+// Replaces the freshly measured (carried = false) summaries of a run; carried rows,
+// copied in for skipped models, are left in place. The delete and the inserts are
+// sent as one batch. Inserts are split into statements of at most 8 rows: each row
+// supplies 12 bound values and Cloudflare D1 rejects statements with more than 100
+// bound parameters.
+export async function replaceModelSummaries(
+  db: D1Database,
+  runId: string,
+  summaries: BenchmarkModelSummary[]
+): Promise<void> {
+  const orm = drizzle(db);
+  const deleteStmt = orm
+    .delete(modelSummaries)
+    .where(and(eq(modelSummaries.run_id, runId), eq(modelSummaries.carried, false)));
+  const rows = summaries.map(s => ({
+    run_id: runId,
+    model: s.model,
+    tier: s.tier,
+    accuracy: s.accuracy,
+    avg_cost_usd: s.avgCostUsd,
+    avg_latency_ms: s.avgLatencyMs,
+    p50_latency_ms: s.p50LatencyMs,
+    p95_latency_ms: s.p95LatencyMs,
+    cases: s.cases,
+    errors: s.errors,
+    timeouts: s.timeouts,
+    carried: false,
+  }));
+  const stmts: [BatchItem<'sqlite'>, ...BatchItem<'sqlite'>[]] = [deleteStmt];
+  const rowsPerInsert = Math.floor(100 / 12);
+  for (let i = 0; i < rows.length; i += rowsPerInsert) {
+    stmts.push(orm.insert(modelSummaries).values(rows.slice(i, i + rowsPerInsert)));
+  }
+  await orm.batch(stmts);
+}
+
+// Every summary row stored for a run (carried and measured alike), mapped to the
+// camelCase contract shape.
+export async function getSummaries(
+  db: D1Database,
+  runId: string
+): Promise<BenchmarkModelSummary[]> {
+  const forRun = eq(modelSummaries.run_id, runId);
+  const stored = await drizzle(db).select().from(modelSummaries).where(forRun);
+  return stored.map(row => mapSummaryRow(row));
+}
+
+// Lists the `limit` most recently started runs, newest first, each with all of its
+// stored summaries attached.
+// Summaries for the whole page are loaded with a single IN (...) query and grouped
+// in memory; a run that has no summary rows gets an empty array.
+export async function listRuns(db: D1Database, limit: number): Promise<BenchmarkRun[]> {
+  const orm = drizzle(db);
+
+  // Page of runs, ordered by started_at descending.
+  const page = await orm
+    .select()
+    .from(benchmarkRuns)
+    .orderBy(desc(benchmarkRuns.started_at))
+    .limit(limit);
+  if (page.length === 0) {
+    // No runs: skip the summaries query entirely.
+    return [];
+  }
+
+  const runIds = page.map(run => run.id);
+  const summaryRows = await orm
+    .select()
+    .from(modelSummaries)
+    .where(inArray(modelSummaries.run_id, runIds));
+
+  // Bucket the mapped summaries under the id of the run they belong to.
+  const summariesByRun = new Map<string, BenchmarkModelSummary[]>();
+  for (const summaryRow of summaryRows) {
+    const bucket = summariesByRun.get(summaryRow.run_id) ?? [];
+    bucket.push(mapSummaryRow(summaryRow));
+    summariesByRun.set(summaryRow.run_id, bucket);
+  }
+
+  return page.map(run => mapRunRow(run, summariesByRun.get(run.id) ?? []));
+}
+
+// Flips a run from 'running' to 'completed' and stamps completed_at; rows in any other status are untouched.
+export async function markRunCompleted(db: D1Database, runId: string): Promise<void> {
+  const stillRunning = and(eq(benchmarkRuns.id, runId), eq(benchmarkRuns.status, 'running'));
+  const finished = { status: 'completed' as const, completed_at: new Date().toISOString() };
+  await drizzle(db).update(benchmarkRuns).set(finished).where(stillRunning);
+}
+
+// Fails every still-running run started before `olderThanIso` with error 'timed out'; completed_at is not modified.
+export async function markStaleRunsFailed(db: D1Database, olderThanIso: string): Promise<void> {
+  const stale = and(eq(benchmarkRuns.status, 'running'), lt(benchmarkRuns.started_at, olderThanIso));
+  const failure = { status: 'failed' as const, error: 'timed out' };
+  await drizzle(db).update(benchmarkRuns).set(failure).where(stale);
+}
+
+// Admission pre-check for the one-active-run-per-kind rule: resolves to a run of
+// `kind` whose status is still 'running', or undefined when there is none.
+// Stale runs are swept to 'failed' before this is consulted.
+export async function getRunningRun(
+  db: D1Database,
+  kind: BenchmarkKind
+): Promise<RunRow | undefined> {
+  const sameKind = eq(benchmarkRuns.kind, kind);
+  const isRunning = eq(benchmarkRuns.status, 'running');
+  const query = drizzle(db).select().from(benchmarkRuns).where(and(sameKind, isRunning));
+  return query.get();
+}
+
+// Reports whether a different run of the same kind that started after `startedAt`
+// has already completed. Callers use it to skip publishing, so a slow older run
+// cannot overwrite the routing table / classifier winner published by a newer run.
+// `runId` (the asking run) is excluded from the search.
+export async function existsNewerCompletedRun(
+  db: D1Database,
+  kind: BenchmarkKind,
+  startedAt: string,
+  runId: string
+): Promise<boolean> {
+  const newerCompleted = and(
+    eq(benchmarkRuns.kind, kind),
+    eq(benchmarkRuns.status, 'completed'),
+    gt(benchmarkRuns.started_at, startedAt),
+    ne(benchmarkRuns.id, runId)
+  );
+  const hit = await drizzle(db)
+    .select({ id: benchmarkRuns.id })
+    .from(benchmarkRuns)
+    .where(newerCompleted)
+    .get();
+  return hit !== undefined;
+}
+
+// Flips a 'running' run to 'failed', storing at most 500 characters of `error` and stamping completed_at.
+export async function markRunFailed(db: D1Database, runId: string, error: string): Promise<void> {
+  const stillRunning = and(eq(benchmarkRuns.id, runId), eq(benchmarkRuns.status, 'running'));
+  const failure = { status: 'failed' as const, error: error.slice(0, 500), completed_at: new Date().toISOString() };
+  await drizzle(db).update(benchmarkRuns).set(failure).where(stillRunning);
+}
+
+// ---------------------------------------------------------------------------
+// Latest summaries per model (for skip logic and classifier winner)
+// ---------------------------------------------------------------------------
+
+// What the most recent completed run measured for a model, plus the
+// benchmark identity it was measured under. startRun carries these summaries
+// into a new run only when the identity (engine + repetitions + the model's
+// reasoning_effort) still matches; otherwise the model is re-benchmarked.
+export type PriorModelResult = {
+  engineIdentity: string; // benchmark_runs.engine_identity of that run
+  repetitions: number; // benchmark_runs.repetitions of that run
+  reasoningEffort: string | null; // run_models.reasoning_effort; null when unset or when no run_models row joined
+  summaries: BenchmarkModelSummary[]; // every summary row the model has in that run
+};
+
+// Latest prior result per model for a benchmark kind. For each model, only the
+// summaries (all tiers) of the most recently started COMPLETED run that contains
+// it are kept; tiers are never mixed across runs because those numbers are not comparable.
+export async function getLatestSummariesByModel(
+  db: D1Database,
+  kind: BenchmarkKind
+): Promise<Map<string, PriorModelResult>> {
+  const completedOfKind = and(eq(benchmarkRuns.kind, kind), eq(benchmarkRuns.status, 'completed'));
+  const sameRunAndModel = and(
+    eq(runModels.run_id, modelSummaries.run_id),
+    eq(runModels.model, modelSummaries.model)
+  );
+  const rows = await drizzle(db)
+    .select({
+      run_id: modelSummaries.run_id,
+      model: modelSummaries.model,
+      tier: modelSummaries.tier,
+      accuracy: modelSummaries.accuracy,
+      avg_cost_usd: modelSummaries.avg_cost_usd,
+      avg_latency_ms: modelSummaries.avg_latency_ms,
+      p50_latency_ms: modelSummaries.p50_latency_ms,
+      p95_latency_ms: modelSummaries.p95_latency_ms,
+      cases: modelSummaries.cases,
+      errors: modelSummaries.errors,
+      timeouts: modelSummaries.timeouts,
+      carried: modelSummaries.carried,
+      engine_identity: benchmarkRuns.engine_identity,
+      repetitions: benchmarkRuns.repetitions,
+      reasoning_effort: runModels.reasoning_effort,
+    })
+    .from(modelSummaries)
+    .innerJoin(benchmarkRuns, eq(benchmarkRuns.id, modelSummaries.run_id))
+    .leftJoin(runModels, sameRunAndModel)
+    .where(completedOfKind)
+    .orderBy(desc(benchmarkRuns.started_at));
+  // Rows arrive newest run first, so the first row seen for a model pins its run.
+  const pinnedRun = new Map<string, string>();
+  const byModel = new Map<string, PriorModelResult>();
+  for (const row of rows) {
+    if (!pinnedRun.has(row.model)) pinnedRun.set(row.model, row.run_id);
+    if (pinnedRun.get(row.model) !== row.run_id) continue; // row of an older run
+    const prior = byModel.get(row.model);
+    if (prior !== undefined) {
+      prior.summaries.push(mapSummaryRow(row));
+      continue;
+    }
+    byModel.set(row.model, {
+      engineIdentity: row.engine_identity,
+      repetitions: row.repetitions,
+      reasoningEffort: row.reasoning_effort,
+      summaries: [mapSummaryRow(row)],
+    });
+  }
+  return byModel;
+}
+
+// ---------------------------------------------------------------------------
+// Routing table — pure helpers for explode/reassemble
+// ---------------------------------------------------------------------------
+
+type RoutingTableRow = typeof routingTables.$inferSelect; // row shape of `routing_tables`
+type RoutingTableCandidateRow = typeof routingTableCandidates.$inferSelect; // row shape of `routing_table_candidates`
+
+// Explodes a RoutingTable into its `routing_tables` row and one candidate row per tier
+// entry; rank is the candidate's 0-based index in its tier, a missing reasoningEffort becomes null.
+export function routingTableToRows(
+  table: RoutingTable,
+  publishedAt: string
+): { tableRow: RoutingTableRow; candidateRows: RoutingTableCandidateRow[] } {
+  const candidateRows: RoutingTableCandidateRow[] = [];
+  for (const [tier, candidates] of Object.entries(table.tiers)) {
+    for (const [rank, c] of candidates.entries()) {
+      candidateRows.push({
+        run_id: table.version,
+        tier,
+        rank,
+        model: c.model,
+        accuracy: c.accuracy,
+        avg_cost_usd: c.avgCostUsd,
+        meets_threshold: c.meetsThreshold,
+        reasoning_effort: c.reasoningEffort ?? null,
+      });
+    }
+  }
+  const tableRow: RoutingTableRow = {
+    run_id: table.version,
+    published_at: publishedAt,
+    generated_at: table.generatedAt,
+    min_accuracy: table.minAccuracy,
+    switch_cost_factor: table.switchCostFactor,
+    source: table.source,
+  };
+  return { tableRow, candidateRows };
+}
+
+// Reassembles a RoutingTable from stored rows, ordering candidates by rank within each
+// tier regardless of input order. Only the exact tiers 'low', 'medium' and 'high' are
+// accepted; rows with any other tier (including names like 'constructor') are skipped.
+export function rowsToRoutingTable(
+  tableRow: RoutingTableRow,
+  candidateRows: RoutingTableCandidateRow[]
+): RoutingTable {
+  const low: RankedCandidate[] = [];
+  const medium: RankedCandidate[] = [];
+  const high: RankedCandidate[] = [];
+  const byRank = [...candidateRows].sort((a, b) => a.rank - b.rank); // copy: the input stays untouched
+  for (const row of byRank) {
+    const bucket =
+      row.tier === 'low' ? low : row.tier === 'medium' ? medium : row.tier === 'high' ? high : null;
+    if (bucket === null) continue;
+    bucket.push({
+      model: row.model,
+      accuracy: row.accuracy,
+      avgCostUsd: row.avg_cost_usd,
+      meetsThreshold: row.meets_threshold,
+      reasoningEffort: row.reasoning_effort as RankedCandidate['reasoningEffort'],
+    });
+  }
+  return {
+    version: tableRow.run_id,
+    generatedAt: tableRow.generated_at,
+    minAccuracy: tableRow.min_accuracy,
+    switchCostFactor: tableRow.switch_cost_factor,
+    source: tableRow.source as RoutingTable['source'],
+    tiers: { low, medium, high },
+  };
+}
+
+// Persists a routing table with a single batch call: deletes the candidates stored
+// for this run id, upserts the `routing_tables` row, then inserts the new candidates.
+// Candidate inserts are split into statements of at most 12 rows, because each row
+// binds 8 values and Cloudflare D1 rejects statements with more than 100 bound parameters.
+export async function saveRoutingTable(
+  db: D1Database,
+  table: RoutingTable,
+  publishedAt: string
+): Promise<void> {
+  const orm = drizzle(db);
+  const { tableRow, candidateRows } = routingTableToRows(table, publishedAt);
+  // On conflict every column except the primary key is overwritten.
+  const { run_id: runId, ...updatableColumns } = tableRow;
+
+  const stmts: [BatchItem<'sqlite'>, ...BatchItem<'sqlite'>[]] = [
+    orm.delete(routingTableCandidates).where(eq(routingTableCandidates.run_id, runId)),
+    orm
+      .insert(routingTables)
+      .values(tableRow)
+      .onConflictDoUpdate({ target: routingTables.run_id, set: updatableColumns }),
+  ];
+
+  const D1_MAX_BOUND_PARAMS = 100;
+  const rowsPerInsert = Math.floor(D1_MAX_BOUND_PARAMS / 8);
+  for (let i = 0; i < candidateRows.length; i += rowsPerInsert) {
+    const chunk = candidateRows.slice(i, i + rowsPerInsert);
+    stmts.push(orm.insert(routingTableCandidates).values(chunk));
+  }
+
+  await orm.batch(stmts);
+}
+
+// Loads the most recently published routing table (greatest published_at) with its
+// candidates and validates the reassembled object against RoutingTableSchema.
+// Resolves to null when nothing is published yet, or when validation fails (in which
+// case a `routing_table_invalid` warning is logged).
+export async function getLatestRoutingTable(
+  db: D1Database
+): Promise<{ table: RoutingTable; publishedAt: string } | null> {
+  const orm = drizzle(db);
+
+  const latest = await orm
+    .select()
+    .from(routingTables)
+    .orderBy(desc(routingTables.published_at))
+    .limit(1)
+    .get();
+  if (!latest) return null;
+
+  const { run_id: runId, published_at: publishedAt } = latest;
+  // Candidates of that table, requested in (tier, rank) order.
+  const candidateRows = await orm
+    .select()
+    .from(routingTableCandidates)
+    .where(eq(routingTableCandidates.run_id, runId))
+    .orderBy(routingTableCandidates.tier, routingTableCandidates.rank);
+
+  // Only a table that passes the contract schema is handed to callers.
+  const validation = RoutingTableSchema.safeParse(rowsToRoutingTable(latest, candidateRows));
+  if (validation.success) {
+    return { table: validation.data, publishedAt };
+  }
+  const { message } = validation.error;
+  console.warn(JSON.stringify({ event: 'routing_table_invalid', run_id: runId, error: message }));
+  return null;
+}
+
+// ---------------------------------------------------------------------------
+// Classifier winner
+// ---------------------------------------------------------------------------
+
+// Derives the current classifier winner from the newest completed classifier run.
+// "Newest" means latest started_at, the same ordering existsNewerCompletedRun and
+// getLatestSummariesByModel use, so a slow run that started earlier but finished
+// later cannot displace the winner of a run that started after it.
+// Resolves to null when no classifier run has completed or pickClassifierWinner selects no model.
+export async function getClassifierWinner(db: D1Database): Promise<ClassifierWinner | null> {
+  const orm = drizzle(db);
+  const runRow = await orm
+    .select()
+    .from(benchmarkRuns)
+    .where(and(eq(benchmarkRuns.kind, 'classifier'), eq(benchmarkRuns.status, 'completed')))
+    .orderBy(desc(benchmarkRuns.started_at))
+    .limit(1)
+    .get();
+  if (!runRow) return null;
+  // Classifier summaries are stored under the '*' tier.
+  const summaryRows = await orm
+    .select()
+    .from(modelSummaries)
+    .where(and(eq(modelSummaries.run_id, runRow.id), eq(modelSummaries.tier, '*')));
+  const winner = pickClassifierWinner(
+    summaryRows.map(mapSummaryRow),
+    runRow.min_accuracy,
+    runRow.classifier_max_p95_latency_ms
+  );
+  if (!winner) return null;
+  return {
+    model: winner.model,
+    runId: runRow.id,
+    accuracy: winner.accuracy,
+    p95LatencyMs: winner.p95LatencyMs,
+    // markRunCompleted stamps completed_at; the current time is only a fallback for a null value.
+    generatedAt: runRow.completed_at ?? new Date().toISOString(),
+  };
+}
diff --git a/services/auto-routing-benchmark/src/grading.test.ts b/services/auto-routing-benchmark/src/grading.test.ts
new file mode 100644
index 0000000000..3ed664ba15
--- /dev/null
+++ b/services/auto-routing-benchmark/src/grading.test.ts
@@ -0,0 +1,132 @@
+import { describe, expect, it } from 'vitest';
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts';
+import {
+  CLASSIFIER_FIELD_WEIGHTS,
+  gradeClassifierOutput,
+  normalizeAnswer,
+  runDeciderCheck,
+  type ClassifierExpectation,
+} from './grading';
+
+const expected: ClassifierExpectation = { // golden labels used by the gradeClassifierOutput tests
+  taskType: 'implementation',
+  subtaskType: 'code_generation',
+  contextComplexity: 'small',
+  reasoningComplexity: 'low',
+  riskLevel: 'low',
+  executionMode: 'answer_only',
+  requiresTools: false,
+};
+
+// Builds a classifier output for the tests: a copy of the golden `expected` labels
+// plus the ungraded `confidence`, with the caller's overrides applied last.
+function actualFrom(overrides: Partial<ClassifierOutput>): ClassifierOutput {
+  const perfectMatch: ClassifierOutput = {
+    ...expected,
+    confidence: 0.9,
+  };
+  const actual: ClassifierOutput = {
+    ...perfectMatch,
+    ...overrides,
+  };
+  return actual;
+}
+
+describe('gradeClassifierOutput', () => {
+  it('scores a full match as 1', () => {
+    expect(gradeClassifierOutput(expected, actualFrom({}))).toBe(1);
+  });
+
+  it('scores a taskType mismatch alone as 0.75', () => { // 1 - 0.25 (taskType weight)
+    expect(gradeClassifierOutput(expected, actualFrom({ taskType: 'debugging' }))).toBe(0.75);
+  });
+
+  it('scores a requiresTools mismatch alone as 0.9', () => { // 1 - 0.1 (requiresTools weight)
+    expect(gradeClassifierOutput(expected, actualFrom({ requiresTools: true }))).toBe(0.9);
+  });
+
+  it('scores a combined subtaskType and riskLevel mismatch as 0.85', () => { // 1 - 0.1 - 0.05
+    expect(
+      gradeClassifierOutput(
+        expected,
+        actualFrom({ subtaskType: 'feature_development', riskLevel: 'high' })
+      )
+    ).toBe(0.85);
+  });
+});
+
+describe('CLASSIFIER_FIELD_WEIGHTS', () => {
+  it('sums to 1', () => { // guards the invariant that a full match scores exactly 1
+    expect(Object.values(CLASSIFIER_FIELD_WEIGHTS).reduce((a, b) => a + b, 0)).toBeCloseTo(1);
+  });
+});
+
+describe('normalizeAnswer', () => {
+  it('strips fences, lowercases and trims', () => { // input: a ```js fence opened and closed on their own lines
+    expect(normalizeAnswer('```js\n  Hello World  \n```')).toBe('hello world');
+  });
+});
+
+describe('runDeciderCheck: exact', () => {
+  it('passes with surrounding code fences and different case', () => {
+    expect(runDeciderCheck({ kind: 'exact', value: '20-40' }, '```\n20-40\n```')).toBe(true); // fenced output
+    expect(runDeciderCheck({ kind: 'exact', value: 'Hello' }, 'HELLO')).toBe(true); // case differs on both sides
+  });
+
+  it('fails on a wrong answer', () => {
+    expect(runDeciderCheck({ kind: 'exact', value: '20-40' }, '20-30')).toBe(false);
+  });
+});
+
+describe('runDeciderCheck: contains_all', () => {
+  it('passes regardless of order and case', () => { // values appear reversed and in different case
+    expect(
+      runDeciderCheck({ kind: 'contains_all', values: ['Alpha', 'Beta'] }, 'beta then ALPHA')
+    ).toBe(true);
+  });
+
+  it('fails when one value is missing', () => { // 'beta' never occurs in the output
+    expect(
+      runDeciderCheck({ kind: 'contains_all', values: ['alpha', 'beta'] }, 'only alpha here')
+    ).toBe(false);
+  });
+});
+
+describe('runDeciderCheck: regex', () => {
+  it('passes a basic match with flags', () => { // the 'i' flag lets 'ANSWER' satisfy 'answer'
+    expect(
+      runDeciderCheck({ kind: 'regex', pattern: '^answer: \\d+$', flags: 'im' }, 'ANSWER: 42')
+    ).toBe(true);
+  });
+
+  it('fails when the pattern does not match', () => { // `flags` is optional and omitted here
+    expect(runDeciderCheck({ kind: 'regex', pattern: '^\\d+$' }, 'not a number')).toBe(false);
+  });
+});
+
+describe('runDeciderCheck: json_equal', () => {
+  it('passes with a json fence plus prose before and after', () => {
+    const output = 'Here you go:\n```json\n{"a":1}\n```\nLet me know!'; // JSON wrapped in a fence and chatter
+    expect(runDeciderCheck({ kind: 'json_equal', value: { a: 1 } }, output)).toBe(true);
+  });
+
+  it('passes with bare JSON', () => { // whitespace inside the JSON text does not matter
+    expect(runDeciderCheck({ kind: 'json_equal', value: { line: 6 } }, '{"line": 6}')).toBe(true);
+  });
+
+  it('fails on unparseable output', () => { // no JSON value present: the check fails rather than throws
+    expect(runDeciderCheck({ kind: 'json_equal', value: { a: 1 } }, 'sorry, no idea')).toBe(false);
+  });
+
+  it('fails when values differ', () => {
+    expect(runDeciderCheck({ kind: 'json_equal', value: { a: 1 } }, '{"a": 2}')).toBe(false);
+  });
+
+  // Documents current behavior: comparison is JSON.stringify-based, so key
+  // ORDER is significant. Dataset authoring must mirror the prompted key order.
+  it('is sensitive to object key order (documented behavior)', () => {
+    expect(runDeciderCheck({ kind: 'json_equal', value: { a: 1, b: 2 } }, '{"b": 2, "a": 1}')).toBe(
+      false
+    );
+  });
+});
diff --git a/services/auto-routing-benchmark/src/grading.ts b/services/auto-routing-benchmark/src/grading.ts
new file mode 100644
index 0000000000..0661e3ac4b
--- /dev/null
+++ b/services/auto-routing-benchmark/src/grading.ts
@@ -0,0 +1,123 @@
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts';
+
+// Golden label for one benchmark case: every graded classifier field, i.e.
+// all of them except confidence. subtaskType is weighted below taskType since
+// a wrong subtype under the right type is a near miss; riskLevel gets a small
+// weight matching its small influence on tier derivation.
+export type ClassifierExpectation = {
+  taskType: ClassifierOutput['taskType'];
+  subtaskType: ClassifierOutput['subtaskType'];
+  contextComplexity: ClassifierOutput['contextComplexity'];
+  reasoningComplexity: ClassifierOutput['reasoningComplexity'];
+  riskLevel: ClassifierOutput['riskLevel'];
+  executionMode: ClassifierOutput['executionMode'];
+  requiresTools: boolean;
+};
+
+export const CLASSIFIER_FIELD_WEIGHTS: Record<keyof ClassifierExpectation, number> = {
+  taskType: 0.25, // largest single weight
+  subtaskType: 0.1,
+  reasoningComplexity: 0.2,
+  contextComplexity: 0.15,
+  executionMode: 0.15,
+  riskLevel: 0.05, // smallest weight
+  requiresTools: 0.1,
+}; // the seven weights add up to 1, so a fully matching output grades 1
+
+// Sums the weights of the fields where `actual` equals `expected` (4 decimals).
+export function gradeClassifierOutput(
+  expected: ClassifierExpectation,
+  actual: ClassifierOutput
+): number {
+  const fields = Object.keys(CLASSIFIER_FIELD_WEIGHTS) as (keyof ClassifierExpectation)[];
+  const matched = fields.filter(field => actual[field] === expected[field]);
+  const total = matched.reduce((sum, field) => sum + CLASSIFIER_FIELD_WEIGHTS[field], 0);
+  return Number(total.toFixed(4));
+}
+
+export type DeciderCheck =
+  | { kind: 'exact'; value: string } // whole output or its last non-empty line
+  | { kind: 'contains_all'; values: readonly string[] } // every value must appear
+  | { kind: 'regex'; pattern: string; flags?: string } // tested against the raw output
+  | { kind: 'json_equal'; value: unknown }; // compared via JSON.stringify
+
+// Mechanical pass/fail grading keeps the decider benchmark deterministic: no
+// LLM judges. Normalization drops whitespace, case and markdown fences. A fence
+// language tag is removed only when it ends its line, so text glued to a fence
+// (e.g. "```yes```") is kept rather than eaten as if it were a language tag.
+export function normalizeAnswer(text: string): string {
+  return text
+    .replace(/```(?:[a-z]*\r?\n)?/gi, '')
+    .trim()
+    .toLowerCase();
+}
+
+// Balance-scan from the first `{`/`[` to its matching close so trailing prose
+// after the JSON payload doesn't break parsing. String-aware so braces inside
+// string literals are ignored. Throws when no balanced span is found or parsing fails.
+function extractJson(text: string): unknown {
+  const stripped = text.replace(/```(?:json)?\n?/gi, '').replace(/```/g, '');
+  const start = stripped.search(/[[{]/); // only this first opener is ever tried
+  if (start === -1) throw new Error('no JSON found');
+
+  const open = stripped[start];
+  const close = open === '{' ? '}' : ']';
+  let depth = 0;
+  let inString = false;
+  let escaped = false;
+
+  for (let i = start; i < stripped.length; i++) {
+    const ch = stripped[i];
+    if (inString) {
+      if (escaped) {
+        escaped = false; // the character after a backslash never ends the string
+      } else if (ch === '\\') {
+        escaped = true;
+      } else if (ch === '"') {
+        inString = false;
+      }
+      continue;
+    }
+    if (ch === '"') {
+      inString = true;
+    } else if (ch === open) {
+      depth++; // brackets of the other kind are not counted; JSON.parse validates them
+    } else if (ch === close) {
+      depth--;
+      if (depth === 0) {
+        return JSON.parse(stripped.slice(start, i + 1));
+      }
+    }
+  }
+  throw new Error('unbalanced JSON');
+}
+
+// Pass/fail for one decider output; `exact`/`contains_all` compare normalized text.
+export function runDeciderCheck(check: DeciderCheck, output: string): boolean {
+  switch (check.kind) {
+    case 'exact': {
+      // Agent harnesses sometimes prepend prose despite instructions; accept
+      // the answer when the whole output OR its last non-empty line matches.
+      // Wrong answers fail either way.
+      const normalized = normalizeAnswer(output);
+      const expected = normalizeAnswer(check.value);
+      if (normalized === expected) return true;
+      const lines = normalized.split('\n').filter(l => l.trim().length > 0);
+      return (lines.at(-1) ?? '').trim() === expected;
+    }
+    case 'contains_all': {
+      // Normalize the output once instead of once per expected value.
+      const normalized = normalizeAnswer(output);
+      return check.values.every(v => normalized.includes(normalizeAnswer(v)));
+    }
+    case 'regex': // tested against the raw output; an invalid pattern throws
+      return new RegExp(check.pattern, check.flags).test(output);
+    case 'json_equal': {
+      try {
+        return JSON.stringify(extractJson(output)) === JSON.stringify(check.value);
+      } catch {
+        return false;
+      }
+    }
+  }
+}
diff --git a/services/auto-routing-benchmark/src/hono-env.ts b/services/auto-routing-benchmark/src/hono-env.ts
new file mode 100644
index 0000000000..deb5b5bea3
--- /dev/null
+++ b/services/auto-routing-benchmark/src/hono-env.ts
@@ -0,0 +1 @@
+export type HonoEnv = { Bindings: Env }; // Hono env generic: Bindings is the Worker `Env`
diff --git a/services/auto-routing-benchmark/src/index.ts b/services/auto-routing-benchmark/src/index.ts
new file mode 100644
index 0000000000..dd431b5ce8
--- /dev/null
+++ b/services/auto-routing-benchmark/src/index.ts
@@ -0,0 +1,33 @@
+import { Hono } from 'hono';
+import { createErrorHandler, createNotFoundHandler } from '@kilocode/worker-utils';
+import { registerAdminRoutes } from './admin';
+import { authMiddleware } from './auth';
+import type { HonoEnv } from './hono-env';
+import { processJob, type BenchmarkJobMessage } from './run';
+
+// Re-exported so the Durable Object class binding (BENCH_RUNNER) can find it.
+export { BenchRunnerContainer } from './bench-runner-container';
+
+export const app = new Hono<HonoEnv>();
+app.use('*', authMiddleware); // NOTE(review): '*' also matches /health — confirm probes authenticate
+app.get('/health', c => c.json({ status: 'ok', service: 'auto-routing-benchmark' }));
+
+registerAdminRoutes(app);
+
+app.notFound(createNotFoundHandler());
+app.onError(createErrorHandler());
+
+export default {
+  fetch: app.fetch,
+  async queue(batch: MessageBatch<BenchmarkJobMessage>, env: Env): Promise<void> {
+    for (const message of batch.messages) {
+      // Deliberately no try/catch: a throw from processJob (transient token, D1
+      // or container failures) must skip the ack so the queue retries the whole
+      // (run, model, chunk) unit, dead-lettering after max_retries; swallowing it
+      // would silently drop chunks. Case-level failures are recorded as failed
+      // rows inside processJob and do not throw.
+      await processJob(env, message.body);
+      message.ack(); // NOTE(review): later messages stay un-acked if an earlier one throws
+    }
+  },
+};
diff --git a/services/auto-routing-benchmark/src/kilo-events.test.ts b/services/auto-routing-benchmark/src/kilo-events.test.ts
new file mode 100644
index 0000000000..1f0c1078dc
--- /dev/null
+++ b/services/auto-routing-benchmark/src/kilo-events.test.ts
@@ -0,0 +1,63 @@
+import { describe, expect, it } from 'vitest';
+import { parseKiloRunEvents } from './kilo-events';
+
+describe('parseKiloRunEvents', () => {
+  it('assembles completed text parts and sums step-finish costs (part.* shape)', () => {
+    const lines = [
+      JSON.stringify({ type: 'text', part: { text: 'partial', time: { start: 1 } } }), // no end → skipped
+      JSON.stringify({ type: 'text', part: { text: 'The answer is', time: { end: 10 } } }),
+      JSON.stringify({ type: 'step-finish', part: { cost: 0.0012, tokens: { input: 5 } } }),
+      JSON.stringify({ type: 'text', part: { text: '```\n20-40\n```', time: { end: 20 } } }),
+      JSON.stringify({ type: 'step_finish', part: { cost: 0.0008 } }), // underscore spelling
+    ];
+
+    const { text, costUsd } = parseKiloRunEvents(lines);
+    expect(text).toBe('The answer is\n```\n20-40\n```');
+    expect(costUsd).toBeCloseTo(0.002, 10);
+  });
+
+  it('skips unparseable lines without throwing', () => {
+    const lines = [
+      'not json',
+      '',
+      JSON.stringify({ type: 'text', part: { text: 'hello', time: { end: 1 } } }),
+      '{ broken',
+    ];
+    const { text, costUsd } = parseKiloRunEvents(lines);
+    expect(text).toBe('hello');
+    expect(costUsd).toBeNull();
+  });
+
+  it('returns null cost when no step-finish event is seen', () => {
+    const lines = [JSON.stringify({ type: 'text', part: { text: 'x', time: { end: 1 } } })];
+    expect(parseKiloRunEvents(lines).costUsd).toBeNull();
+  });
+
+  it('accepts the flattened top-level event shape (evt.text / evt.cost)', () => {
+    const lines = [
+      JSON.stringify({ type: 'text', text: 'flat answer', time: { end: 5 } }),
+      JSON.stringify({ type: 'step-finish', cost: 0.5 }),
+    ];
+    const { text, costUsd } = parseKiloRunEvents(lines);
+    expect(text).toBe('flat answer');
+    expect(costUsd).toBe(0.5);
+  });
+
+  it('prefers part.* over top-level fields when both present', () => {
+    const lines = [
+      JSON.stringify({ type: 'text', text: 'top', part: { text: 'nested', time: { end: 1 } } }),
+      JSON.stringify({ type: 'step-finish', cost: 9, part: { cost: 0.01 } }),
+    ];
+    const { text, costUsd } = parseKiloRunEvents(lines);
+    expect(text).toBe('nested');
+    expect(costUsd).toBe(0.01);
+  });
+
+  it('returns empty text and null cost for no relevant events', () => {
+    const lines = [
+      JSON.stringify({ type: 'tool', part: { name: 'read' } }),
+      JSON.stringify({ type: 'start' }),
+    ];
+    expect(parseKiloRunEvents(lines)).toMatchObject({ text: '', costUsd: null });
+  });
+});
diff --git a/services/auto-routing-benchmark/src/kilo-events.ts b/services/auto-routing-benchmark/src/kilo-events.ts
new file mode 100644
index 0000000000..53efff642b
--- /dev/null
+++ b/services/auto-routing-benchmark/src/kilo-events.ts
@@ -0,0 +1,86 @@
+// Pure parser for the `kilo run --format json` event stream.
+//
+// The CLI emits one JSON event per line on stdout. We care about two things:
+//   1. The final assistant answer — assembled from completed `text` events
+//      (those whose part has `time.end` set), concatenated in order.
+//   2. Total cost — summed across `step_finish`/`step-finish` events' cost (USD).
+//
+// Event shapes vary across CLI versions; we accept both the documented
+// `evt.part.*` shape and a flattened `evt.*` shape, preferring `part.*`.
+// Everything is optional-chained so malformed lines can't throw.
+
+export type ParsedKiloRun = {
+  text: string;
+  costUsd: number | null;
+  // Diagnostics for empty-output investigations: how many parsed events had a
+  // string `type`, and the last three such types (never payloads, which may be sensitive).
+  eventCount: number;
+  lastEventTypes: string[];
+};
+
+// Fields an event may carry either nested under `part` or flattened at the
+// top level. Everything is optional and `unknown` because each line is
+// unvalidated JSON from the CLI.
+type LooseEventFields = {
+  text?: unknown;
+  cost?: unknown;
+  time?: { end?: unknown };
+};
+
+// One parsed line: the flattened fields plus an optional `type` tag and `part`.
+type LooseEvent = LooseEventFields & { type?: unknown; part?: LooseEventFields };
+
+function isCompletedTextEvent(evt: LooseEvent): boolean {
+  const endedAt = evt.part?.time?.end ?? evt.time?.end; // nested end time wins when set
+  return !(endedAt === undefined || endedAt === null);
+}
+
+// Nested `part.text` wins over the flattened `text`; non-strings yield null.
+function readText(evt: LooseEvent): string | null {
+  const nested = evt.part?.text;
+  const picked = typeof nested === 'string' ? nested : evt.text;
+  return typeof picked === 'string' ? picked : null;
+}
+
+// Nested `part.cost` wins over the flattened `cost`; only finite numbers count.
+function readCost(evt: LooseEvent): number | null {
+  const finite = (c: unknown): c is number => typeof c === 'number' && Number.isFinite(c);
+  const nested = evt.part?.cost;
+  return finite(nested) ? nested : finite(evt.cost) ? evt.cost : null;
+}
+
+// Folds the line-delimited CLI event stream into the final answer text, the
+// summed step cost and a short diagnostics trail; non-object lines are skipped.
+export function parseKiloRunEvents(lines: string[]): ParsedKiloRun {
+  const answerChunks: string[] = [];
+  const seenTypes: string[] = [];
+  let totalCost: number | null = null;
+
+  for (const rawLine of lines) {
+    let event: LooseEvent;
+    try {
+      event = JSON.parse(rawLine) as LooseEvent;
+    } catch {
+      continue;
+    }
+    if (event === null || typeof event !== 'object') continue;
+    const kind = event.type;
+    if (typeof kind === 'string') seenTypes.push(kind);
+
+    if (kind === 'text' && isCompletedTextEvent(event)) {
+      const chunk = readText(event);
+      if (chunk !== null) answerChunks.push(chunk);
+    } else if (kind === 'step_finish' || kind === 'step-finish') {
+      // Top-level events say `step_finish`, parts say `step-finish`; take both.
+      const stepCost = readCost(event);
+      if (stepCost !== null) totalCost = (totalCost ?? 0) + stepCost;
+    }
+  }
+
+  return {
+    text: answerChunks.join('\n'),
+    costUsd: totalCost,
+    eventCount: seenTypes.length,
+    lastEventTypes: seenTypes.slice(-3),
+  };
+}
diff --git a/services/auto-routing-benchmark/src/openrouter.ts b/services/auto-routing-benchmark/src/openrouter.ts
new file mode 100644
index 0000000000..8d48367720
--- /dev/null
+++ b/services/auto-routing-benchmark/src/openrouter.ts
@@ -0,0 +1,26 @@
+import { OpenRouter } from '@openrouter/sdk';
+import { ttlCached } from '@kilocode/worker-utils';
+
+type OpenRouterEnv = Pick<Env, 'OPENROUTER_API_KEY'>;
+
+export const OPENROUTER_HTTP_REFERER = 'https://kilocode.ai';
+export const OPENROUTER_APP_TITLE = 'Kilo Code';
+
+// Only the API key string is cached at module scope (plain value, not a
+// transport-owning SDK object), so repeated client construction skips the
+// secrets-store read. The client itself is constructed per call; that is
+// just object setup around global fetch. The TTL (300_000 ms) keeps key
+// rotations effective within five minutes.
+const API_KEY_CACHE_TTL_MS = 300_000;
+
+const apiKeyCache = ttlCached(API_KEY_CACHE_TTL_MS, (env: OpenRouterEnv) =>
+  env.OPENROUTER_API_KEY.get()
+);
+
+// Builds a fresh client per call; only the key lookup goes through the TTL cache.
+export async function createOpenRouterClient(env: OpenRouterEnv): Promise<OpenRouter> {
+  const apiKey = await apiKeyCache.get(env);
+  const httpReferer = OPENROUTER_HTTP_REFERER;
+  const appTitle = OPENROUTER_APP_TITLE;
+  return new OpenRouter({ apiKey, httpReferer, appTitle });
+}
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.test.ts b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
new file mode 100644
index 0000000000..8c124ee496
--- /dev/null
+++ b/services/auto-routing-benchmark/src/routing-table-builder.test.ts
@@ -0,0 +1,258 @@
+import { describe, expect, it } from 'vitest';
+import type {
+  BenchmarkDeciderModel,
+  BenchmarkModelSummary,
+} from '@kilocode/auto-routing-contracts';
+import { buildRoutingTable } from './routing-table-builder';
+
+const DECIDER_MODELS: BenchmarkDeciderModel[] = [
+  { id: 'model/cheap', reasoningEffort: null },
+  { id: 'model/expensive', reasoningEffort: 'medium' }, // the only entry with an effort set
+  { id: 'model/mid', reasoningEffort: null },
+];
+
+function summary(
+  model: string,
+  tier: BenchmarkModelSummary['tier'],
+  accuracy: number,
+  avgCostUsd: number | null = 0.001 // pass null to model a summary with no cost signal
+): BenchmarkModelSummary {
+  return {
+    model,
+    tier,
+    accuracy,
+    avgCostUsd,
+    avgLatencyMs: 500, // the remaining fields are fixed filler values
+    p50LatencyMs: 450,
+    p95LatencyMs: null,
+    cases: 10,
+    errors: 0,
+    timeouts: 0,
+  };
+}
+
+const ALL_TIERS_SUMMARIES: BenchmarkModelSummary[] = [
+  summary('model/cheap', 'low', 0.9, 0.001),
+  summary('model/expensive', 'low', 0.95, 0.01),
+  summary('model/mid', 'low', 0.8, 0.005),
+  summary('model/cheap', 'medium', 0.75, 0.001),
+  summary('model/expensive', 'medium', 0.85, 0.01),
+  summary('model/mid', 'medium', 0.72, 0.005),
+  summary('model/cheap', 'high', 0.6, 0.001), // the only entry below 0.7 accuracy
+  summary('model/expensive', 'high', 0.9, 0.01),
+  summary('model/mid', 'high', 0.75, 0.005),
+];
+
+describe('buildRoutingTable', () => {
+  it('cheapest above-threshold model comes first per tier', () => {
+    const table = buildRoutingTable({
+      runId: 'test-run-1',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      deciderModels: DECIDER_MODELS,
+      summaries: ALL_TIERS_SUMMARIES,
+    });
+
+    // low tier: cheap (0.001), mid (0.005) and expensive (0.01) all meet the 0.7 threshold,
+    // so the expected order is cheapest first
+    expect(table.tiers.low[0].model).toBe('model/cheap');
+    expect(table.tiers.low[1].model).toBe('model/mid');
+    expect(table.tiers.low[2].model).toBe('model/expensive');
+
+    // medium tier: all meet threshold, cheapest first
+    expect(table.tiers.medium[0].model).toBe('model/cheap');
+    expect(table.tiers.medium[1].model).toBe('model/mid');
+    expect(table.tiers.medium[2].model).toBe('model/expensive');
+
+    // high tier: expensive (0.9) and mid (0.75) meet threshold; cheap (0.6) does not.
+    // Expected: threshold-meeting models first (by cost), then cheap (below threshold)
+    expect(table.tiers.high[0].model).toBe('model/mid'); // meets threshold, cheaper
+    expect(table.tiers.high[1].model).toBe('model/expensive'); // meets threshold, more expensive
+    expect(table.tiers.high[2].model).toBe('model/cheap'); // below threshold
+  });
+
+  it('excludes a model whose tier summary has no cost signal', () => {
+    const table = buildRoutingTable({
+      runId: 'test-run-nocost',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      deciderModels: DECIDER_MODELS,
+      summaries: ALL_TIERS_SUMMARIES.map(s =>
+        s.model === 'model/cheap' && s.tier === 'low' ? { ...s, avgCostUsd: null } : s
+      ),
+    });
+
+    // model/cheap would have won 'low' as cheapest; without a cost signal it
+    // must not be ranked (unknown cost is not zero cost).
+    expect(table.tiers.low.map(c => c.model)).toEqual(['model/mid', 'model/expensive']);
+  });
+
+  it('marks meetsThreshold correctly', () => {
+    const table = buildRoutingTable({
+      runId: 'test-run-2',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      deciderModels: DECIDER_MODELS,
+      summaries: ALL_TIERS_SUMMARIES,
+    });
+
+    for (const candidate of table.tiers.low) {
+      expect(candidate.meetsThreshold).toBe(candidate.accuracy >= 0.7);
+    }
+  });
+
+  it('excludes a model absent from a tier summaries', () => {
+    // model/cheap is given 'low' and 'medium' summaries only
+    const summaries: BenchmarkModelSummary[] = [
+      summary('model/cheap', 'low', 0.9),
+      summary('model/cheap', 'medium', 0.8),
+      // no 'high' entry for model/cheap
+      summary('model/expensive', 'low', 0.9),
+      summary('model/expensive', 'medium', 0.8),
+      summary('model/expensive', 'high', 0.9),
+      summary('model/mid', 'low', 0.8),
+      summary('model/mid', 'medium', 0.75),
+      summary('model/mid', 'high', 0.75),
+    ];
+
+    const table = buildRoutingTable({
+      runId: 'test-run-3',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      deciderModels: DECIDER_MODELS,
+      summaries,
+    });
+
+    const highModels = table.tiers.high.map(c => c.model);
+    expect(highModels).not.toContain('model/cheap');
+    expect(highModels).toContain('model/expensive');
+    expect(highModels).toContain('model/mid');
+  });
+
+  it('carries reasoningEffort from the run snapshot', () => {
+    const table = buildRoutingTable({
+      runId: 'test-run-4',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      deciderModels: DECIDER_MODELS,
+      summaries: ALL_TIERS_SUMMARIES,
+    });
+
+    const expensiveInLow = table.tiers.low.find(c => c.model === 'model/expensive');
+    expect(expensiveInLow?.reasoningEffort).toBe('medium');
+
+    const midInLow = table.tiers.low.find(c => c.model === 'model/mid');
+    expect(midInLow?.reasoningEffort).toBeNull();
+  });
+
+  it('defaults reasoningEffort to null when model missing from the snapshot', () => {
+    const summaries: BenchmarkModelSummary[] = [
+      summary('model/unknown', 'low', 0.9),
+      summary('model/cheap', 'low', 0.8),
+      summary('model/cheap', 'medium', 0.8),
+      summary('model/cheap', 'high', 0.8),
+      summary('model/unknown', 'medium', 0.9),
+      summary('model/unknown', 'high', 0.9),
+    ];
+
+    const table = buildRoutingTable({
+      runId: 'test-run-5',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      deciderModels: DECIDER_MODELS,
+      summaries,
+    });
+
+    const unknown = table.tiers.low.find(c => c.model === 'model/unknown');
+    expect(unknown?.reasoningEffort).toBeNull();
+  });
+
+  it('throws when a tier has no candidates', () => {
+    // Only low and medium summaries are supplied; the high tier has none at all
+    const summaries: BenchmarkModelSummary[] = [
+      summary('model/cheap', 'low', 0.9),
+      summary('model/expensive', 'low', 0.9),
+      summary('model/mid', 'low', 0.9),
+      summary('model/cheap', 'medium', 0.9),
+      summary('model/expensive', 'medium', 0.9),
+      summary('model/mid', 'medium', 0.9),
+    ];
+
+    expect(() =>
+      buildRoutingTable({
+        runId: 'test-run-6',
+        generatedAt: '2026-01-01T00:00:00.000Z',
+        minAccuracy: 0.7,
+        switchCostFactor: 3,
+        deciderModels: DECIDER_MODELS,
+        summaries,
+      })
+    ).toThrow();
+  });
+
+  it('throws when a tier has only zero-case entries', () => {
+    const summaries: BenchmarkModelSummary[] = [
+      ...ALL_TIERS_SUMMARIES.filter(s => s.tier !== 'high'),
+      // every high-tier entry has 0 cases, so the tier should end up empty
+      { ...summary('model/cheap', 'high', 0.9), cases: 0 },
+      { ...summary('model/expensive', 'high', 0.9), cases: 0 },
+      { ...summary('model/mid', 'high', 0.9), cases: 0 },
+    ];
+
+    expect(() =>
+      buildRoutingTable({
+        runId: 'test-run-7',
+        generatedAt: '2026-01-01T00:00:00.000Z',
+        minAccuracy: 0.7,
+        switchCostFactor: 3,
+        deciderModels: DECIDER_MODELS,
+        summaries,
+      })
+    ).toThrow();
+  });
+
+  it('ignores classifier-style * tier summaries', () => {
+    const summaries: BenchmarkModelSummary[] = [
+      ...ALL_TIERS_SUMMARIES,
+      // classifier summaries carry the '*' tier and should be ignored here
+      summary('model/cheap', '*', 0.95),
+      summary('model/expensive', '*', 0.95),
+    ];
+
+    // Must not throw, and the '*' entries must not add candidates to any tier
+    const table = buildRoutingTable({
+      runId: 'test-run-8',
+      generatedAt: '2026-01-01T00:00:00.000Z',
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      deciderModels: DECIDER_MODELS,
+      summaries,
+    });
+
+    expect(table.tiers.low.length).toBe(3);
+    expect(table.tiers.medium.length).toBe(3);
+  });
+
+  it('sets version and generatedAt from params', () => {
+    const table = buildRoutingTable({
+      runId: 'decider-2026-01-01',
+      generatedAt: '2026-01-01T12:00:00.000Z',
+      minAccuracy: 0.7,
+      switchCostFactor: 3,
+      deciderModels: DECIDER_MODELS,
+      summaries: ALL_TIERS_SUMMARIES,
+    });
+
+    expect(table.version).toBe('decider-2026-01-01');
+    expect(table.generatedAt).toBe('2026-01-01T12:00:00.000Z');
+    expect(table.source).toBe('benchmark');
+    expect(table.minAccuracy).toBe(0.7);
+    expect(table.switchCostFactor).toBe(3);
+  });
+});
diff --git a/services/auto-routing-benchmark/src/routing-table-builder.ts b/services/auto-routing-benchmark/src/routing-table-builder.ts
new file mode 100644
index 0000000000..222f19436f
--- /dev/null
+++ b/services/auto-routing-benchmark/src/routing-table-builder.ts
@@ -0,0 +1,58 @@
+import {
+  rankCandidates,
+  RoutingTableSchema,
+  type BenchmarkDeciderModel,
+  type BenchmarkModelSummary,
+  type DifficultyTier,
+  type RoutingTable,
+} from '@kilocode/auto-routing-contracts';
+
+// Turns per-(model, tier) decider summaries into a routing table. A summary
+// is dropped from its tier when it has zero graded cases or no cost signal
+// (avgCostUsd null means every case failed to report cost; ranking such a
+// model as cheapest would hand it the tier). Throws when any tier ends up
+// empty so the caller keeps the previous published table.
+// deciderModels/minAccuracy/switchCostFactor come from the run's snapshot,
+// not live config.
+export function buildRoutingTable(params: {
+  runId: string;
+  generatedAt: string;
+  minAccuracy: number;
+  switchCostFactor: number;
+  deciderModels: BenchmarkDeciderModel[];
+  summaries: BenchmarkModelSummary[];
+}): RoutingTable {
+  const { runId, generatedAt, minAccuracy, switchCostFactor, deciderModels, summaries } = params;
+  const effortByModel = new Map(deciderModels.map(m => [m.id, m.reasoningEffort] as const));
+
+  // Candidates for one tier, ranked by the shared contract helper.
+  const rankTier = (tier: DifficultyTier) => {
+    const usable = summaries.filter(s => s.tier === tier && s.cases > 0 && s.avgCostUsd !== null);
+    const candidates = usable.map(({ model, accuracy, avgCostUsd }) => ({
+      model,
+      accuracy,
+      avgCostUsd: avgCostUsd ?? 0,
+      reasoningEffort: effortByModel.get(model) ?? null,
+    }));
+    return rankCandidates(candidates, minAccuracy);
+  };
+
+  const tiers = {
+    low: rankTier('low'),
+    medium: rankTier('medium'),
+    high: rankTier('high'),
+  };
+  const table: RoutingTable = {
+    version: runId,
+    generatedAt,
+    minAccuracy,
+    switchCostFactor,
+    source: 'benchmark',
+    tiers,
+  };
+
+  // RoutingTableSchema requires at least one candidate per tier (.min(1)), so
+  // an empty tier surfaces as a ZodError; the caller logs it and skips the
+  // publish, leaving the previous live table intact.
+  return RoutingTableSchema.parse(table);
+}
diff --git a/services/auto-routing-benchmark/src/run.test.ts b/services/auto-routing-benchmark/src/run.test.ts
new file mode 100644
index 0000000000..303fbafff2
--- /dev/null
+++ b/services/auto-routing-benchmark/src/run.test.ts
@@ -0,0 +1,471 @@
+import { describe, expect, it } from 'vitest';
+import type { CaseResultRow } from './db';
+import {
+  BenchmarkJobMessageSchema,
+  buildDeciderMessages,
+  chunkArray,
+  computeEngineIdentity,
+  runCasesWithConcurrency,
+  summarize,
+} from './run';
+import { pickClassifierWinner } from './winner';
+
+function makeRow(overrides: Partial<CaseResultRow> = {}): CaseResultRow {
+  const defaults: CaseResultRow = {
+    run_id: 'run-1',
+    model: 'model/a',
+    case_id: 'case-1',
+    tier: null,
+    score: 1,
+    latency_ms: 100,
+    cost_usd: 0.001,
+    error: null,
+    fallback_reason: null,
+    retried: null,
+    exit_code: null,
+    output_prefix: null,
+    event_count: null,
+    last_event_types: null,
+    rep: 0,
+    timed_out: 0,
+  };
+  return { ...defaults, ...overrides }; // overrides win over the baseline row
+}
+
+describe('summarize — classifier kind', () => {
+  it('groups all classifier rows under * tier', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({
+        model: 'model/a',
+        case_id: 'c1',
+        tier: null,
+        score: 1,
+        latency_ms: 100,
+        cost_usd: 0.001,
+      }),
+      makeRow({
+        model: 'model/a',
+        case_id: 'c2',
+        tier: null,
+        score: 0.5,
+        latency_ms: 200,
+        cost_usd: 0.002,
+      }),
+    ];
+
+    const summaries = summarize(rows, 'classifier');
+    expect(summaries).toHaveLength(1);
+    const [s] = summaries;
+    expect(s.model).toBe('model/a');
+    expect(s.tier).toBe('*');
+    expect(s.cases).toBe(2);
+  });
+
+  it('computes accuracy correctly', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ score: 1.0 }),
+      makeRow({ case_id: 'c2', score: 0.5 }),
+      makeRow({ case_id: 'c3', score: 0.0 }),
+    ];
+
+    const [s] = summarize(rows, 'classifier');
+    // mean of the scores: (1.0 + 0.5 + 0.0) / 3 = 0.5
+    expect(s.accuracy).toBe(0.5);
+  });
+
+  it('computes avgCostUsd excluding null cost rows', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ case_id: 'c1', cost_usd: 0.002 }),
+      makeRow({ case_id: 'c2', cost_usd: null }),
+      makeRow({ case_id: 'c3', cost_usd: 0.004 }),
+    ];
+
+    const [s] = summarize(rows, 'classifier');
+    // the null row is left out of the mean: (0.002 + 0.004) / 2 = 0.003
+    expect(s.avgCostUsd).toBe(0.003);
+  });
+
+  it('returns null avgCostUsd when all cost_usd are null', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ case_id: 'c1', cost_usd: null }),
+      makeRow({ case_id: 'c2', cost_usd: null }),
+    ];
+
+    const [s] = summarize(rows, 'classifier');
+    expect(s.avgCostUsd).toBeNull();
+  });
+
+  it('computes p50LatencyMs', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ case_id: 'c1', latency_ms: 100 }),
+      makeRow({ case_id: 'c2', latency_ms: 300 }),
+      makeRow({ case_id: 'c3', latency_ms: 200 }),
+    ];
+
+    const [s] = summarize(rows, 'classifier');
+    // sorted: [100, 200, 300]; the middle element (index floor(3/2) = 1) is 200
+    expect(s.p50LatencyMs).toBe(200);
+  });
+
+  it('counts errors correctly', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ case_id: 'c1', score: 0, error: 'timeout' }),
+      makeRow({ case_id: 'c2', score: 1, error: null }),
+      makeRow({ case_id: 'c3', score: 0, error: 'rate_limit' }),
+    ];
+
+    const [s] = summarize(rows, 'classifier');
+    expect(s.errors).toBe(2);
+    // the two error rows score 0, so accuracy is 1/3 rounded to 4 decimals
+    expect(s.accuracy).toBe(Number((1 / 3).toFixed(4)));
+  });
+});
+
+describe('summarize — decider kind', () => {
+  it('groups by tier', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ model: 'model/a', case_id: 'low-1', tier: 'low', score: 1 }),
+      makeRow({ model: 'model/a', case_id: 'low-2', tier: 'low', score: 0 }),
+      makeRow({ model: 'model/a', case_id: 'med-1', tier: 'medium', score: 1 }),
+      makeRow({ model: 'model/b', case_id: 'low-3', tier: 'low', score: 1 }),
+    ];
+
+    const summaries = summarize(rows, 'decider');
+    expect(summaries).toHaveLength(3); // (a, low), (a, medium), (b, low)
+
+    const aLow = summaries.find(s => s.model === 'model/a' && s.tier === 'low');
+    expect(aLow?.cases).toBe(2);
+    expect(aLow?.accuracy).toBe(0.5);
+
+    const aMed = summaries.find(s => s.model === 'model/a' && s.tier === 'medium');
+    expect(aMed?.cases).toBe(1);
+    expect(aMed?.accuracy).toBe(1);
+
+    const bLow = summaries.find(s => s.model === 'model/b' && s.tier === 'low');
+    expect(bLow?.cases).toBe(1);
+  });
+
+  it('uses * fallback when tier is null', () => {
+    const rows: CaseResultRow[] = [makeRow({ tier: null, score: 1 })];
+    const [s] = summarize(rows, 'decider');
+    expect(s.tier).toBe('*');
+  });
+
+  it('computes avgLatencyMs as rounded mean', () => {
+    const rows: CaseResultRow[] = [
+      makeRow({ case_id: 'c1', tier: 'low', latency_ms: 100 }),
+      makeRow({ case_id: 'c2', tier: 'low', latency_ms: 301 }),
+    ];
+
+    const [s] = summarize(rows, 'decider');
+    expect(s.avgLatencyMs).toBe(Math.round((100 + 301) / 2)); // 200.5 rounds to 201
+  });
+
+  it('handles single-element groups for p50', () => {
+    const rows: CaseResultRow[] = [makeRow({ tier: 'high', latency_ms: 500 })];
+    const [s] = summarize(rows, 'decider');
+    expect(s.p50LatencyMs).toBe(500);
+  });
+});
+
+describe('runCasesWithConcurrency', () => {
+  it('processes all items exactly once', async () => {
+    const processed: number[] = [];
+    await runCasesWithConcurrency([1, 2, 3, 4, 5], 2, async item => {
+      processed.push(item);
+    });
+    expect(processed.sort((a, b) => a - b)).toEqual([1, 2, 3, 4, 5]);
+  });
+
+  it('processes empty array without error', async () => {
+    await expect(runCasesWithConcurrency([], 4, async () => {})).resolves.toBeUndefined();
+  });
+
+  it('respects the concurrency cap', async () => {
+    let inFlight = 0;
+    let maxInFlight = 0;
+    const concurrency = 3;
+
+    await runCasesWithConcurrency(
+      Array.from({ length: 10 }, (_, i) => i),
+      concurrency,
+      async () => {
+        inFlight++;
+        maxInFlight = Math.max(maxInFlight, inFlight);
+        // Yield a timer tick so other callbacks can start while this one is in flight
+        await new Promise(resolve => setTimeout(resolve, 0));
+        inFlight--;
+      }
+    );
+
+    expect(maxInFlight).toBeLessThanOrEqual(concurrency);
+    expect(maxInFlight).toBeGreaterThan(0); // guards against the callback never running
+  });
+
+  it('works when concurrency exceeds item count', async () => {
+    const processed: number[] = [];
+    await runCasesWithConcurrency([1, 2], 10, async item => {
+      processed.push(item);
+    });
+    expect(processed.sort((a, b) => a - b)).toEqual([1, 2]);
+  });
+
+  it('propagates errors from the callback', async () => {
+    await expect(
+      runCasesWithConcurrency([1], 1, async () => {
+        throw new Error('test error');
+      })
+    ).rejects.toThrow('test error');
+  });
+});
+
+describe('computeEngineIdentity', () => {
+  it('is deterministic for a given kind', () => {
+    expect(computeEngineIdentity('classifier')).toBe(computeEngineIdentity('classifier'));
+    expect(computeEngineIdentity('decider')).toBe(computeEngineIdentity('decider'));
+  });
+
+  it('differs between classifier and decider datasets', () => {
+    expect(computeEngineIdentity('classifier')).not.toBe(computeEngineIdentity('decider'));
+  });
+
+  it('is versioned (carries the engine version prefix)', () => {
+    expect(computeEngineIdentity('decider')).toMatch(/^v\d+:[0-9a-f]{8}$/); // v<N>:<8 hex>
+  });
+});
+
+describe('chunkArray', () => {
+  it('splits into 5-per-chunk with a partial final chunk', () => {
+    // 13 items at 5 per chunk: two full chunks followed by a remainder of 3,
+    // so the per-chunk sizes pin down both the chunk count and each length.
+    const thirteen = Array.from({ length: 13 }, (_, n) => n);
+    const sizes = chunkArray(thirteen, 5).map(part => part.length);
+    expect(sizes).toHaveLength(3);
+    expect(sizes).toEqual([5, 5, 3]);
+  });
+
+  it('round-trips caseIds: flatten equals the original order', () => {
+    const caseIds = 'abcdefghijk'.split('');
+    const parts = chunkArray(caseIds, 10);
+    expect(parts).toHaveLength(2);
+    expect(parts.flat()).toEqual(caseIds);
+  });
+
+  it('returns a single full chunk when items fit exactly', () => {
+    const exactFit = [1, 2, 3, 4, 5];
+    const parts = chunkArray(exactFit, exactFit.length);
+    expect(parts).toEqual([exactFit]);
+  });
+
+  it('returns no chunks for an empty array', () => {
+    expect(chunkArray([], 10)).toHaveLength(0);
+  });
+});
+
+describe('pickClassifierWinner', () => {
+  const summary = (model: string, accuracy: number, avgCostUsd: number | null) => ({
+    model,
+    tier: '*' as const,
+    accuracy,
+    avgCostUsd,
+    avgLatencyMs: 100,
+    p50LatencyMs: 90,
+    p95LatencyMs: 90,
+    cases: 36,
+    errors: 0,
+    timeouts: 0,
+  });
+
+  it('picks the cheapest model meeting the threshold', () => {
+    const winner = pickClassifierWinner(
+      [summary('pricy', 0.95, 0.01), summary('cheap', 0.9, 0.001), summary('weak', 0.5, 0.0001)],
+      0.7
+    );
+    expect(winner?.model).toBe('cheap'); // 'weak' is cheaper but below the 0.7 threshold
+  });
+
+  it('falls back to highest accuracy when nothing meets the threshold', () => {
+    const winner = pickClassifierWinner([summary('a', 0.5, 0.001), summary('b', 0.6, 0.01)], 0.9);
+    expect(winner?.model).toBe('b');
+  });
+
+  it('treats null cost as most expensive', () => {
+    const winner = pickClassifierWinner(
+      [summary('nocost', 0.95, null), summary('cheap', 0.9, 0.001)],
+      0.7
+    );
+    expect(winner?.model).toBe('cheap');
+  });
+
+  it('ignores decider-tier summaries and returns null when nothing is graded', () => {
+    expect(
+      pickClassifierWinner([{ ...summary('m', 1, 0.001), tier: 'low' as const }], 0.7)
+    ).toBeNull();
+    expect(pickClassifierWinner([], 0.7)).toBeNull();
+  });
+
+  // Same shape as `summary`, but with a caller-supplied p95LatencyMs (defaults to 90).
+  const summaryWithLatency = (
+    model: string,
+    accuracy: number,
+    avgCostUsd: number | null,
+    p95: number | null = 90
+  ) => ({
+    model,
+    tier: '*' as const,
+    accuracy,
+    avgCostUsd,
+    avgLatencyMs: 100,
+    p50LatencyMs: 80,
+    p95LatencyMs: p95,
+    timeouts: 0,
+    cases: 36,
+    errors: 0,
+  });
+
+  it('latency gate: picks cheapest within budget when both meet accuracy and latency', () => {
+    const winner = pickClassifierWinner(
+      [
+        summaryWithLatency('fast-cheap', 0.9, 0.001, 800),
+        summaryWithLatency('fast-pricy', 0.95, 0.01, 500),
+        summaryWithLatency('slow', 0.9, 0.0005, 1500),
+      ],
+      0.7,
+      1000
+    );
+    expect(winner?.model).toBe('fast-cheap'); // 'slow' is cheapest overall but over the p95 budget
+  });
+
+  it('latency gate fallback: picks lowest p95 among accuracy-meeting when none in budget', () => {
+    const winner = pickClassifierWinner(
+      [
+        summaryWithLatency('almost', 0.9, 0.001, 1200),
+        summaryWithLatency('closest', 0.85, 0.002, 1100),
+        summaryWithLatency('way-off', 0.9, 0.0005, 2000),
+      ],
+      0.8, // all three candidates meet this accuracy
+      1000 // every p95 above exceeds this budget
+    );
+    expect(winner?.model).toBe('closest');
+  });
+
+  it('null budget disables latency gate', () => {
+    const winner = pickClassifierWinner(
+      [
+        summaryWithLatency('cheap-slow', 0.9, 0.001, 5000),
+        summaryWithLatency('pricy-fast', 0.95, 0.01, 100),
+      ],
+      0.7,
+      null
+    );
+    expect(winner?.model).toBe('cheap-slow');
+  });
+
+  it('null p95 on summary fails non-null latency constraint', () => {
+    const winner = pickClassifierWinner(
+      [
+        summaryWithLatency('no-p95', 0.9, 0.001, null),
+        summaryWithLatency('has-p95', 0.85, 0.01, 800),
+      ],
+      0.7,
+      1000
+    );
+    // no-p95 is cheaper, but a null p95 cannot satisfy a non-null latency budget;
+    // has-p95 meets both the accuracy threshold and the budget, so it wins.
+    expect(winner?.model).toBe('has-p95');
+  });
+});
+
+describe('summarize — p95 and timeouts', () => {
+  it('computes p95LatencyMs using nearest-rank formula', () => {
+    // Nearest-rank p95 over 20 rows: zero-based index ceil(0.95 * 20) - 1 = 18.
+    const rows = Array.from({ length: 20 }, (_, i) =>
+      makeRow({ case_id: `c${i}`, latency_ms: (i + 1) * 100 })
+    );
+    const [s] = summarize(rows, 'classifier');
+    // Latencies are 100, 200, ..., 2000 (already ascending), so index 18 holds 1900.
+    expect(s.p95LatencyMs).toBe(1900);
+  });
+
+  it('counts timeouts', () => {
+    const rows = [
+      makeRow({ case_id: 'c1', timed_out: 1 }),
+      makeRow({ case_id: 'c2', timed_out: 0 }),
+      makeRow({ case_id: 'c3', timed_out: 1 }),
+    ];
+    const [s] = summarize(rows, 'classifier');
+    expect(s.timeouts).toBe(2); // c1 and c3 carry timed_out: 1
+  });
+
+  it('aggregates multi-rep rows correctly (same case_id different rep)', () => {
+    const rows = [
+      makeRow({ case_id: 'c1', rep: 0, score: 1, latency_ms: 100 }),
+      makeRow({ case_id: 'c1', rep: 1, score: 0, latency_ms: 200 }),
+      makeRow({ case_id: 'c2', rep: 0, score: 1, latency_ms: 150 }),
+      makeRow({ case_id: 'c2', rep: 1, score: 1, latency_ms: 250 }),
+    ];
+    const [s] = summarize(rows, 'classifier');
+    expect(s.cases).toBe(4); // each (case_id, rep) row is counted separately
+    expect(s.accuracy).toBe(0.75); // scores 1, 0, 1, 1
+  });
+});
+
+describe('decider message fan-out', () => {
+  it('DECIDER_CHUNK_SIZE is 5 (chunk count for 76 cases)', () => {
+    // NOTE(review): 76 and 5 are literals, not DECIDER_CASES.length / DECIDER_CHUNK_SIZE.
+    const chunks = chunkArray(
+      Array.from({ length: 76 }, (_, i) => String(i)),
+      5
+    );
+    expect(chunks).toHaveLength(16); // ceil(76 / 5)
+  });
+
+  it('message schema accepts and defaults rep', () => {
+    const msg = BenchmarkJobMessageSchema.parse({ runId: 'r1', kind: 'classifier', model: 'm1' });
+    expect(msg.rep).toBeUndefined(); // rep is optional: when omitted it stays undefined
+    const withRep = BenchmarkJobMessageSchema.parse({
+      runId: 'r1',
+      kind: 'decider',
+      model: 'm1',
+      rep: 2,
+      caseIds: ['a'],
+      chunk: 0,
+    });
+    expect(withRep.rep).toBe(2);
+  });
+
+  it('buildDeciderMessages: produces models × reps × ceil(76/5) messages with correct rep', () => {
+    // 76 synthetic cases at chunk size 5 → 16 chunks (15 full, one of a single case)
+    const cases76 = Array.from({ length: 76 }, (_, i) => ({ id: `case-${i}` }));
+    const chunks = chunkArray(cases76, 5);
+    expect(chunks).toHaveLength(16);
+
+    const models = ['model/a', 'model/b'];
+    const repetitions = 3;
+    const messages = buildDeciderMessages('run-test', 'decider', models, repetitions, chunks);
+
+    // Total: 2 models × 3 reps × 16 chunks = 96 messages
+    expect(messages).toHaveLength(models.length * repetitions * chunks.length);
+
+    // Each rep index (0..2) should appear exactly models.length × chunks.length times
+    for (let rep = 0; rep < repetitions; rep++) {
+      const forRep = messages.filter(m => m.body.rep === rep);
+      expect(forRep).toHaveLength(models.length * chunks.length);
+    }
+
+    // Every message carries a numeric rep within [0, repetitions)
+    for (const { body } of messages) {
+      expect(typeof body.rep).toBe('number');
+      expect(body.rep).toBeGreaterThanOrEqual(0);
+      expect(body.rep).toBeLessThan(repetitions);
+    }
+
+    // caseIds on each message are exactly the ids of the chunk named by body.chunk
+    for (let chunkIdx = 0; chunkIdx < chunks.length; chunkIdx++) {
+      const forChunk = messages.filter(m => m.body.chunk === chunkIdx);
+      for (const { body } of forChunk) {
+        expect(body.caseIds).toEqual(chunks[chunkIdx].map(c => c.id));
+      }
+    }
+  });
+});
diff --git a/services/auto-routing-benchmark/src/run.ts b/services/auto-routing-benchmark/src/run.ts
new file mode 100644
index 0000000000..fa685a3168
--- /dev/null
+++ b/services/auto-routing-benchmark/src/run.ts
@@ -0,0 +1,739 @@
+import { classifyWithOpenRouter } from '@kilocode/auto-routing-contracts/classifier';
+import {
+  CLASSIFIER_WINNER_KV_KEY,
+  ROUTING_TABLE_KV_KEY,
+  type BenchmarkDeciderModel,
+  type BenchmarkKind,
+  type BenchmarkModelSummary,
+} from '@kilocode/auto-routing-contracts';
+import { formatError } from '@kilocode/worker-utils';
+import * as z from 'zod';
+import { getBenchmarkConfig } from './config';
+import { CLASSIFIER_CASES } from './datasets/classifier-cases';
+import { DECIDER_CASES } from './datasets/decider-cases';
+import type { RunModelRow } from './db';
+import {
+  countCaseResults,
+  existsNewerCompletedRun,
+  getCaseResults,
+  getLatestSummariesByModel,
+  getRunningRun,
+  getRunWithModels,
+  getSummaries,
+  insertRun,
+  markRunCompleted,
+  markRunFailed,
+  markStaleRunsFailed,
+  replaceModelSummaries,
+  saveRoutingTable,
+  upsertCaseResult,
+  type CaseResultRow,
+  type PriorModelResult,
+} from './db';
+import { gradeClassifierOutput, runDeciderCheck } from './grading';
+import { createOpenRouterClient } from './openrouter';
+import { buildRoutingTable } from './routing-table-builder';
+import { runDeciderCaseViaCli, warmUpCliContainer } from './cli-runner';
+import { pickClassifierWinner } from './winner';
+
+export type BenchmarkJobMessage = {
+  runId: string;
+  kind: BenchmarkKind;
+  model: string;
+  // Decider only: the case ids this message is responsible for, plus the chunk
+  // index used to key the container instance. Absent for classifier messages.
+  caseIds?: string[];
+  chunk?: number;
+  // Repetition index (0-based). Absent for classifier messages.
+  rep?: number;
+};
+
+export const BenchmarkJobMessageSchema = z.object({
+  runId: z.string().min(1),
+  kind: z.enum(['classifier', 'decider']),
+  model: z.string().min(1),
+  caseIds: z.array(z.string().min(1)).optional(), // decider only
+  chunk: z.number().int().min(0).optional(), // decider only: 0-based chunk index
+  rep: z.number().int().min(0).optional(), // 0-based repetition; no default is applied
+});
+
+// Decider cases run through the real `kilo` CLI in a container (up to ~3 min
+// each). Chunking caps how many cases a single queue invocation processes so
+// each stays well under CF's wall-clock limit.
+const DECIDER_CHUNK_SIZE = 5;
+
+// Cloudflare Queues caps a single sendBatch at 100 messages. A decider fan-out
+// is models × reps × ceil(76 / 5) = models × reps × 16 messages, which exceeds
+// 100 once models × reps reaches 7, so the dispatch must be sliced.
+const QUEUE_SEND_BATCH_LIMIT = 100;
+
+// Splits `items` into ordered chunks of at most `size`; rejects sizes that would never terminate.
+export function chunkArray<T>(items: readonly T[], size: number): T[][] {
+  if (!Number.isInteger(size) || size < 1) throw new RangeError(`invalid chunk size: ${size}`);
+  return Array.from({ length: Math.ceil(items.length / size) }, (_, chunk) =>
+    items.slice(chunk * size, (chunk + 1) * size)
+  );
+}
+
+// Dispatches a run's messages to the queue in slices no larger than one
+// sendBatch allows. If any slice fails, the run is left partially enqueued and
+// can never collect its expected number of results, so it is marked failed
+// (visible in the admin panel) before the error is rethrown to the POST handler.
+async function enqueueRunMessages(
+  env: Env,
+  runId: string,
+  messages: { body: BenchmarkJobMessage }[]
+): Promise<void> {
+  const total = messages.length;
+  for (let sent = 0; sent < total; sent += QUEUE_SEND_BATCH_LIMIT) {
+    const slice = messages.slice(sent, sent + QUEUE_SEND_BATCH_LIMIT);
+    try {
+      await env.BENCH_QUEUE.sendBatch(slice);
+    } catch (error) {
+      const reason = `enqueue failed after ${sent} of ${total} messages: ${formatError(error).error}`;
+      // Best effort: failing to record the failure must not mask the original error.
+      await markRunFailed(env.BENCH_DB, runId, reason).catch(() => {});
+      throw error;
+    }
+  }
+}
+
+const STALE_RUN_MAX_AGE_MS = 6 * 3600_000;
+
+// Marks every run that has been 'running' longer than the stale threshold as
+// failed (queue retries exhausted / dead-lettered). Invoked before a run starts
+// and when runs are listed, since the UI disables Start while a run shows 'running'.
+export async function sweepStaleRuns(db: D1Database): Promise<void> {
+  const cutoff = new Date(Date.now() - STALE_RUN_MAX_AGE_MS);
+  await markStaleRunsFailed(db, cutoff.toISOString());
+}
+
+// Bump when grading logic, the CLI invocation/variant handling, the container
+// image's pinned CLI, or any other execution input NOT captured by the dataset
+// hash changes in a way that invalidates prior measurements. Forces every
+// carried summary to be re-benchmarked on the next run.
+const BENCHMARK_ENGINE_VERSION = 1;
+
+function fnv1aHex(input: string): string {
+  // 32-bit FNV-1a folded over UTF-16 code units, rendered as 8 lowercase hex digits.
+  let acc = 0x811c9dc5;
+  for (let pos = 0; pos < input.length; pos += 1) {
+    acc = Math.imul(acc ^ input.charCodeAt(pos), 0x01000193);
+  }
+  return (acc >>> 0).toString(16).padStart(8, '0');
+}
+
+// Identifies the benchmark inputs that a run measured under, beyond the
+// per-model reasoning_effort and run-level repetitions tracked separately:
+// the dataset contents (ids + grading checks/expectations) and an engine
+// version for code-level execution changes. Two runs sharing this identity
+// (plus repetitions + reasoning_effort) produced comparable measurements, so a
+// model's prior summaries can be carried instead of re-run.
+export function computeEngineIdentity(kind: BenchmarkKind): string {
+  const signature =
+    kind === 'classifier'
+      ? JSON.stringify(CLASSIFIER_CASES.map(({ id, expected }) => ({ id, expected })))
+      : JSON.stringify(DECIDER_CASES.map(({ id, tier, check }) => ({ id, tier, check })));
+  return `v${BENCHMARK_ENGINE_VERSION}:${fnv1aHex(signature)}`;
+}
+
+/** Pure helper that expands a decider run into its sendBatch bodies: one message
+ * per (model, repetition, chunk), ordered by model, then repetition, then chunk.
+ */
+export function buildDeciderMessages(
+  runId: string,
+  kind: BenchmarkKind,
+  modelIds: string[],
+  repetitions: number,
+  chunks: readonly (readonly { id: string }[])[]
+): { body: BenchmarkJobMessage }[] {
+  // 0-based repetition indexes, materialized once and reused for every model.
+  const repIndexes = Array.from({ length: repetitions }, (_, rep) => rep);
+  const messages: { body: BenchmarkJobMessage }[] = [];
+  for (const model of modelIds) {
+    for (const rep of repIndexes) {
+      chunks.forEach((chunkCases, chunk) => {
+        const caseIds = chunkCases.map(benchCase => benchCase.id);
+        messages.push({
+          body: { runId, kind, model, chunk, rep, caseIds } satisfies BenchmarkJobMessage,
+        });
+      });
+    }
+  }
+  return messages;
+}
+
+// Signals that a run of this kind is already active; the admin route maps it to HTTP 409, not a 5xx.
+export class RunAlreadyActiveError extends Error {
+  readonly kind: BenchmarkKind;
+  readonly activeRunId: string;
+  constructor(kind: BenchmarkKind, activeRunId: string) {
+    super(`a ${kind} benchmark run is already in progress (${activeRunId})`);
+    this.kind = kind;
+    this.activeRunId = activeRunId;
+    this.name = 'RunAlreadyActiveError';
+  }
+}
+
+export async function startRun(
+  env: Env,
+  kind: BenchmarkKind,
+  options: { force?: boolean } = {} // force: carry no prior results, re-run every model
+): Promise<{ runId: string; enqueuedModels: number; skippedModels: string[] }> {
+  // Stale-run sweeper: fail dead 'running' runs first so a wedged run can't
+  // block new ones and the admin panel shows the truth.
+  await sweepStaleRuns(env.BENCH_DB);
+
+  const config = await getBenchmarkConfig(env.BENCH_DB);
+  if (!config) {
+    throw new Error('benchmark config not set: save it in the admin panel before starting a run');
+  }
+
+  // One active run per kind. The unique partial index is the atomic backstop;
+  // this pre-check turns the common case (a run already going) into a clean
+  // RunAlreadyActiveError instead of an insert-constraint failure.
+  const activeRun = await getRunningRun(env.BENCH_DB, kind);
+  if (activeRun) {
+    throw new RunAlreadyActiveError(kind, activeRun.id);
+  }
+  const repetitions =
+    kind === 'classifier' ? config.classifierRepetitions : config.deciderRepetitions;
+  const models =
+    kind === 'classifier' ? config.classifierModels : config.deciderModels.map(m => m.id);
+
+  const engineIdentity = computeEngineIdentity(kind);
+  const reasoningEffortFor = (modelId: string): string | null =>
+    kind === 'classifier'
+      ? null
+      : (config.deciderModels.find(m => m.id === modelId)?.reasoningEffort ?? null);
+
+  // Models with prior results are skipped (their latest summaries are carried
+  // into this run's aggregate) unless the admin forces a full re-run. A prior
+  // result is only carried when it was measured under the SAME benchmark
+  // identity — engine identity (dataset + grading/CLI version), repetitions,
+  // and the model's reasoning_effort — so a config/dataset change re-benchmarks
+  // the model instead of pairing current serving config with stale numbers.
+  const priorByModel = options.force
+    ? new Map<string, PriorModelResult>()
+    : await getLatestSummariesByModel(env.BENCH_DB, kind);
+  const isCarryable = (modelId: string): boolean => {
+    const prior = priorByModel.get(modelId);
+    return (
+      prior !== undefined &&
+      prior.engineIdentity === engineIdentity &&
+      prior.repetitions === repetitions &&
+      (prior.reasoningEffort ?? null) === reasoningEffortFor(modelId)
+    );
+  };
+  const enqueuedModelIds = models.filter(m => !isCarryable(m));
+  const skippedModels = models.filter(m => isCarryable(m));
+  const carriedSummaries = skippedModels.flatMap(m => priorByModel.get(m)?.summaries ?? []);
+
+  // Decider runs execute through the kilo CLI under a real Kilo user's
+  // identity/billing. Fail fast (before inserting the run) when that user
+  // isn't configured so the admin POST surfaces the misconfiguration.
+  if (kind === 'decider' && enqueuedModelIds.length > 0 && !config.benchmarkUserId) {
+    throw new Error(
+      'benchmark user not configured: set benchmarkUserId before running the decider benchmark'
+    );
+  }
+
+  const startedAt = new Date().toISOString();
+  const runId = `${kind}-${startedAt.replace(/[:.]/g, '-')}`; // e.g. decider-2024-01-01T00-00-00-000Z
+
+  // Snapshot rows for ALL models of this kind; `enqueued` is false for carried (skipped) models.
+  const runModelRows: RunModelRow[] = models.map(modelId => ({
+    run_id: runId,
+    model: modelId,
+    enqueued: enqueuedModelIds.includes(modelId),
+    reasoning_effort: reasoningEffortFor(modelId),
+  }));
+
+  try {
+    await insertRun(
+      env.BENCH_DB,
+      {
+        id: runId,
+        kind,
+        startedAt,
+        min_accuracy: config.minAccuracy,
+        switch_cost_factor: config.switchCostFactor,
+        max_concurrency: config.maxConcurrency,
+        benchmark_user_id: config.benchmarkUserId,
+        repetitions,
+        classifier_max_p95_latency_ms:
+          kind === 'classifier' ? config.classifierMaxP95LatencyMs : null,
+        engine_identity: engineIdentity,
+      },
+      runModelRows,
+      carriedSummaries
+    );
+  } catch (error) {
+    // The pre-check already passed, so an insert failure is almost certainly a
+    // race losing the one-running-per-kind unique index. Re-read the winner and
+    // surface a clean conflict rather than a 500.
+    const winner = await getRunningRun(env.BENCH_DB, kind).catch(() => undefined);
+    if (winner && winner.id !== runId) {
+      throw new RunAlreadyActiveError(kind, winner.id);
+    }
+    throw error;
+  }
+
+  console.log(
+    JSON.stringify({
+      event: 'benchmark_run_started',
+      runId,
+      kind,
+      enqueuedModels: enqueuedModelIds,
+      skippedModels,
+    })
+  );
+
+  if (enqueuedModelIds.length === 0) {
+    // Everything already has results: complete immediately and republish the
+    // aggregate so config-only changes (model removed, threshold tweaked)
+    // take effect without re-running any model. The state mirrors the rows
+    // insertRun just wrote, so no re-read is needed.
+    await finalizeRunIfComplete(env, runId, kind, {
+      maxConcurrency: config.maxConcurrency,
+      minAccuracy: config.minAccuracy,
+      switchCostFactor: config.switchCostFactor,
+      benchmarkUserId: config.benchmarkUserId,
+      models: runModelRows,
+      repetitions,
+      classifierMaxP95LatencyMs: kind === 'classifier' ? config.classifierMaxP95LatencyMs : null,
+      startedAt,
+    });
+    return { runId, enqueuedModels: 0, skippedModels };
+  }
+
+  if (kind === 'classifier') {
+    await enqueueRunMessages(
+      env,
+      runId,
+      enqueuedModelIds.map(model => ({
+        body: { runId, kind, model } satisfies BenchmarkJobMessage,
+      }))
+    );
+    return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels };
+  }
+
+  // Decider: one message per (model, rep, chunk) so each queue invocation stays
+  // bounded. finalizeRunIfComplete expects enqueuedModels × DECIDER_CASES × repetitions rows.
+  const chunks = chunkArray(DECIDER_CASES, DECIDER_CHUNK_SIZE);
+  const messages = buildDeciderMessages(runId, kind, enqueuedModelIds, repetitions, chunks);
+  await enqueueRunMessages(env, runId, messages);
+  return { runId, enqueuedModels: enqueuedModelIds.length, skippedModels };
+}
+
+export async function processJob(env: Env, rawMessage: unknown): Promise<void> {
+  // Malformed messages are logged and dropped, not thrown (a throw would be retried
+  // forever), so `raw` falls back to String() when JSON.stringify returns undefined.
+  const parsed = BenchmarkJobMessageSchema.safeParse(rawMessage);
+  if (!parsed.success) {
+    console.warn(
+      JSON.stringify({
+        event: 'benchmark_job_invalid_message',
+        error: parsed.error.message,
+        raw: (JSON.stringify(rawMessage) ?? String(rawMessage)).slice(0, 200),
+      })
+    );
+    return;
+  }
+
+  const message = parsed.data;
+  const state = await getRunState(env, message.runId);
+
+  if (message.kind === 'classifier') {
+    // Create the OpenRouter client inside processJob — no module-scope transport clients.
+    const client = await createOpenRouterClient(env);
+    // Expand cases × repetitions into a flat work list.
+    const expandedItems: { benchCase: (typeof CLASSIFIER_CASES)[number]; rep: number }[] = [];
+    for (let rep = 0; rep < state.repetitions; rep++) {
+      for (const benchCase of CLASSIFIER_CASES) {
+        expandedItems.push({ benchCase, rep });
+      }
+    }
+    await runCasesWithConcurrency(
+      expandedItems,
+      state.maxConcurrency,
+      async ({ benchCase, rep }) => {
+        const startedAt = performance.now();
+        try {
+          const result = await classifyWithOpenRouter(client, benchCase.input, message.model);
+          const score = result.fallback
+            ? 0
+            : gradeClassifierOutput(benchCase.expected, result.classification);
+          await upsertCaseResult(env.BENCH_DB, {
+            run_id: message.runId,
+            model: message.model,
+            case_id: benchCase.id,
+            tier: null,
+            score,
+            latency_ms: Math.round(performance.now() - startedAt),
+            cost_usd: result.cost,
+            error: null,
+            fallback_reason: result.fallback?.reason ?? null,
+            retried: result.retried ?? false,
+            exit_code: null,
+            output_prefix: null,
+            event_count: null,
+            last_event_types: null,
+            rep,
+            timed_out: 0,
+          });
+        } catch (error) {
+          await upsertCaseResult(
+            env.BENCH_DB,
+            failedRow(message, benchCase.id, null, startedAt, error, rep)
+          );
+        }
+      }
+    );
+  } else {
+    await processDeciderJob(env, message, state);
+  }
+
+  await finalizeRunIfComplete(env, message.runId, message.kind, state);
+}
+
+type RunState = {
+  maxConcurrency: number;
+  minAccuracy: number;
+  switchCostFactor: number;
+  benchmarkUserId: string | null; // decider jobs throw when this is null
+  models: RunModelRow[]; // every model of the run, both enqueued and carried (skipped)
+  repetitions: number;
+  classifierMaxP95LatencyMs: number | null; // always null for decider runs
+  startedAt: string; // ISO-8601 timestamp
+};
+
+async function getRunState(env: Env, runId: string): Promise<RunState> {
+  // Values come from the startRun-time snapshot, so mid-run admin edits cannot skew them.
+  const loaded = await getRunWithModels(env.BENCH_DB, runId);
+  if (!loaded) throw new Error(`unknown run ${runId}`);
+  const snapshot = loaded.run;
+  return {
+    maxConcurrency: snapshot.max_concurrency,
+    minAccuracy: snapshot.min_accuracy,
+    switchCostFactor: snapshot.switch_cost_factor,
+    benchmarkUserId: snapshot.benchmark_user_id,
+    models: loaded.models,
+    repetitions: snapshot.repetitions,
+    classifierMaxP95LatencyMs: snapshot.classifier_max_p95_latency_ms,
+    startedAt: snapshot.started_at,
+  };
+}
+
+async function processDeciderJob(
+  env: Env,
+  message: BenchmarkJobMessage,
+  state: RunState
+): Promise<void> {
+  // Decider messages always carry their chunk's case ids; anything else is
+  // malformed and dropped (same policy as unparseable messages).
+  if (!message.caseIds?.length) {
+    console.warn(JSON.stringify({ event: 'benchmark_job_missing_case_ids', runId: message.runId }));
+    return;
+  }
+  const caseIds = new Set(message.caseIds);
+  const cases = DECIDER_CASES.filter(c => caseIds.has(c.id)); // ids not in the dataset are ignored
+
+  if (!state.benchmarkUserId) {
+    // startRun fails fast before enqueueing, so this only happens if the run
+    // snapshot was tampered with; throwing lets the queue retry/dead-letter.
+    throw new Error(`run ${message.runId} has no benchmarkUserId`);
+  }
+
+  // Fetch a short-lived user token ONCE per queue message. Non-OK throws so the
+  // queue retries the message. The token is never logged.
+  const kiloToken = await fetchBenchmarkUserToken(env, state.benchmarkUserId);
+  const rep = message.rep ?? 0; // a missing rep is treated as repetition 0
+  const instanceName = `${message.runId}:${message.model}:${rep}:${message.chunk ?? 0}`;
+
+  // Reasoning effort comes from the run snapshot (run_models row), not live config.
+  const modelRow = state.models.find(m => m.model === message.model);
+  const reasoningEffort = modelRow?.reasoning_effort ?? null;
+
+  // Fresh container instances run the CLI's one-time sqlite migration; the
+  // container owns that via its /warmup endpoint so the first real case
+  // doesn't burn its timeout on it. Failures are non-fatal: the first case
+  // simply absorbs whatever warmup work remains.
+  await warmUpCliContainer(env, { instanceName, model: message.model, kiloToken }).catch(() => {});
+
+  // Concurrency 1: the CLI's sqlite state in the container is not safe under
+  // concurrent sessions (partial-migration crashes); the container serializes
+  // too, so higher concurrency here would only hold HTTP requests open.
+  await runCasesWithConcurrency(cases, 1, async benchCase => {
+    const startedAt = performance.now();
+    try {
+      let result = await runDeciderCaseViaCli(env, {
+        instanceName,
+        model: message.model,
+        benchCase,
+        kiloToken,
+        reasoningEffort,
+      });
+      // The CLI occasionally ends a session with no assistant text at all
+      // (transient empty completion: a lone step_finish with cost 0). Mirror
+      // the production classifier's policy and retry once.
+      let retried = false;
+      if (result.exitCode === 0 && result.text.length === 0) {
+        retried = true;
+        const retry = await runDeciderCaseViaCli(env, {
+          instanceName,
+          model: message.model,
+          benchCase,
+          kiloToken,
+          reasoningEffort,
+        });
+        retry.costUsd = // sum both attempts; stays null only when neither reported a cost
+          retry.costUsd === null && result.costUsd === null
+            ? null
+            : (retry.costUsd ?? 0) + (result.costUsd ?? 0);
+        result = retry;
+      }
+      const succeeded =
+        result.exitCode === 0 &&
+        result.text.length > 0 &&
+        runDeciderCheck(benchCase.check, result.text);
+      await upsertCaseResult(env.BENCH_DB, {
+        run_id: message.runId,
+        model: message.model,
+        case_id: benchCase.id,
+        tier: benchCase.tier,
+        score: succeeded ? 1 : 0,
+        latency_ms: result.latencyMs,
+        cost_usd: result.costUsd,
+        error: result.exitCode !== 0 ? result.stderrTail.slice(0, 500) : null,
+        fallback_reason: null,
+        retried,
+        exit_code: result.exitCode,
+        output_prefix: result.text.slice(0, 200),
+        event_count: result.eventCount,
+        last_event_types: result.lastEventTypes.join(' '),
+        rep,
+        timed_out: result.timedOut ? 1 : 0,
+      });
+    } catch (error) {
+      // A thrown case still writes a row (score 0) so the run's result count can be reached.
+      await upsertCaseResult(
+        env.BENCH_DB,
+        failedRow(message, benchCase.id, benchCase.tier, startedAt, error, rep)
+      );
+    }
+  });
+}
+
+const TokenResponseSchema = z.object({ token: z.string().min(1), expiresAt: z.string() });
+
+// Mints a short-lived user API token for the decider CLI via apps/web's internal
+// endpoint. The token is never logged.
+export async function fetchBenchmarkUserToken(env: Env, userId: string): Promise<string> {
+  const secret = await env.INTERNAL_API_SECRET_PROD.get();
+  const url = `${env.KILO_WEB_API_BASE_URL}/api/internal/auto-routing-benchmark/token`;
+  const response = await fetch(url, {
+    method: 'POST',
+    headers: {
+      'content-type': 'application/json',
+      authorization: `Bearer ${secret}`,
+    },
+    body: JSON.stringify({ userId }),
+  });
+  // Non-OK responses throw, carrying at most 200 chars of the response body.
+  if (!response.ok) {
+    const body = await response.text().catch(() => '');
+    throw new Error(`token mint failed: HTTP ${response.status} ${body.slice(0, 200)}`);
+  }
+  const payload: unknown = await response.json();
+  const parsedToken = TokenResponseSchema.safeParse(payload);
+  if (!parsedToken.success) {
+    throw new Error('token mint returned unexpected response shape');
+  }
+  return parsedToken.data.token;
+}
+
+function failedRow(
+  message: BenchmarkJobMessage,
+  caseId: string,
+  tier: string | null,
+  startedAt: number, // performance.now() reading taken when the case began
+  error: unknown,
+  rep: number = 0
+): CaseResultRow {
+  return {
+    run_id: message.runId,
+    model: message.model,
+    case_id: caseId,
+    tier,
+    score: 0, // a failed case always scores 0
+    latency_ms: Math.round(performance.now() - startedAt), // elapsed time until the failure
+    cost_usd: null,
+    error: JSON.stringify(formatError(error)).slice(0, 500), // serialized, capped at 500 chars
+    fallback_reason: null,
+    retried: null,
+    exit_code: null,
+    output_prefix: null,
+    event_count: null,
+    last_event_types: null,
+    rep,
+    timed_out: 0, // thrown errors are not recorded as timeouts
+  };
+}
+
+// Runs `fn` over all items with at most `concurrency` in flight; a cap < 1 or NaN is clamped to 1.
+export async function runCasesWithConcurrency<T>(
+  cases: readonly T[],
+  concurrency: number,
+  fn: (item: T) => Promise<void>
+): Promise<void> {
+  const pending = [...cases].values(); // shared iterator: each item goes to exactly one worker
+  const width = Math.min(Math.max(1, Math.floor(concurrency) || 1), cases.length);
+  const workers = Array.from({ length: width }, async () => {
+    for (const item of pending) await fn(item); // `undefined` items are processed, not skipped
+  });
+  await Promise.all(workers);
+}
+
+/**
+ * Finalizes a benchmark run once every expected case result has been stored.
+ *
+ * Steps, in order:
+ *   1. Compare the stored result count with enqueued models × cases ×
+ *      repetitions and return early while results are still missing.
+ *   2. Aggregate the case results, replace the run's model summaries and mark
+ *      the run completed.
+ *   3. Unless a newer completed run of the same kind exists, publish: for
+ *      classifier runs log the winner and delete the cached winner key from KV;
+ *      for decider runs build and save the routing table, then delete the
+ *      cached table key from KV.
+ *   4. Log a `benchmark_run_completed` event carrying all summaries.
+ *
+ * Errors thrown while building, saving or cache-clearing the routing table are
+ * logged and swallowed; errors from the other awaited calls propagate.
+ *
+ * @param env Worker bindings; uses `BENCH_DB` and `AUTO_ROUTING_CONFIG`.
+ * @param runId Identifier of the run being finalized.
+ * @param kind Benchmark kind; selects the case list and the publish path.
+ * @param state Run snapshot (models, repetitions, thresholds, start time).
+ */
+async function finalizeRunIfComplete(
+  env: Env,
+  runId: string,
+  kind: BenchmarkKind,
+  // Run snapshot already loaded by the caller (startRun / processJob).
+  state: RunState
+): Promise<void> {
+  // Expected row count: one result per enqueued model, case and repetition.
+  const enqueuedModels = state.models.filter(m => m.enqueued);
+  const caseCount = kind === 'classifier' ? CLASSIFIER_CASES.length : DECIDER_CASES.length;
+  const expected = enqueuedModels.length * caseCount * state.repetitions;
+  const actual = await countCaseResults(env.BENCH_DB, runId);
+
+  // Results still outstanding — nothing to finalize yet.
+  if (actual < expected) return;
+
+  // Two consumers may both see completion and both aggregate — harmless:
+  // identical deterministic inputs → identical summaries; replaceModelSummaries
+  // is a batched delete+insert; markRunCompleted guards on status='running'.
+  const rows = await getCaseResults(env.BENCH_DB, runId);
+  // Fresh results (enqueued models). Carried summaries (skipped models) stay in
+  // model_summaries with carried=true and are included via getSummaries below.
+  const freshSummaries = summarize(rows, kind);
+  await replaceModelSummaries(env.BENCH_DB, runId, freshSummaries);
+  await markRunCompleted(env.BENCH_DB, runId);
+
+  // Read back all summaries (fresh + carried) for publishing.
+  const allSummaries = await getSummaries(env.BENCH_DB, runId);
+
+  // Don't let a slow older run overwrite a newer run's already-published table
+  // or classifier winner. Publication is selected by publish time, so an older
+  // run finishing last would otherwise win. The run is still marked completed
+  // above (it did finish); only its publication is suppressed.
+  const supersededByNewer = await existsNewerCompletedRun(
+    env.BENCH_DB,
+    kind,
+    state.startedAt,
+    runId
+  );
+  if (supersededByNewer) {
+    console.warn(JSON.stringify({ event: 'benchmark_publish_skipped_superseded', runId, kind }));
+  }
+
+  if (kind === 'classifier' && !supersededByNewer) {
+    // The winner is only logged here; this function does not persist it.
+    const winner = pickClassifierWinner(
+      allSummaries,
+      state.minAccuracy,
+      state.classifierMaxP95LatencyMs
+    );
+    if (winner) {
+      console.log(
+        JSON.stringify({ event: 'classifier_winner_published', runId, model: winner.model })
+      );
+    } else {
+      console.warn(JSON.stringify({ event: 'classifier_winner_skipped', runId }));
+    }
+    // Clear KV so the auto-routing worker repopulates from D1 on next request.
+    await env.AUTO_ROUTING_CONFIG.delete(CLASSIFIER_WINNER_KV_KEY);
+  }
+
+  if (kind === 'decider' && !supersededByNewer) {
+    // One timestamp is shared by the table payload and the save call.
+    const generatedAt = new Date().toISOString();
+    try {
+      // Built from the run's own model snapshot, not live config, so a mid-run
+      // admin edit can't skew the published table.
+      const deciderModels: BenchmarkDeciderModel[] = state.models.map(m => ({
+        id: m.model,
+        reasoningEffort: m.reasoning_effort as BenchmarkDeciderModel['reasoningEffort'],
+      }));
+      const table = buildRoutingTable({
+        runId,
+        generatedAt,
+        minAccuracy: state.minAccuracy,
+        switchCostFactor: state.switchCostFactor,
+        deciderModels,
+        summaries: allSummaries,
+      });
+      await saveRoutingTable(env.BENCH_DB, table, generatedAt);
+      // Clear KV so the auto-routing worker repopulates from D1 on next request.
+      await env.AUTO_ROUTING_CONFIG.delete(ROUTING_TABLE_KV_KEY);
+      console.log(
+        JSON.stringify({ event: 'routing_table_published', runId, version: table.version })
+      );
+    } catch (error) {
+      // Publishing is best-effort: the failure is logged and not rethrown.
+      console.warn(
+        JSON.stringify({
+          event: 'routing_table_publish_skipped',
+          runId,
+          ...formatError(error),
+        })
+      );
+    }
+  }
+
+  // Emitted for every finalized run, including ones whose publication was skipped.
+  console.log(
+    JSON.stringify({
+      event: 'benchmark_run_completed',
+      runId,
+      kind,
+      summaries: allSummaries,
+    })
+  );
+}
+
+/**
+ * Collapses raw case results into one summary per (model, tier) group.
+ *
+ * Classifier runs are untiered, so every row is filed under tier '*'; decider
+ * rows keep their own tier and fall back to '*' when the tier is null. Groups
+ * are returned in the order their first row appears in `rows`.
+ *
+ * Per group: accuracy is the mean score (4 decimals), avgCostUsd the mean over
+ * rows that carry a cost (8 decimals, null when none do), avgLatencyMs the
+ * rounded mean latency, p50/p95 are read from the ascending latencies, and
+ * errors / timeouts count rows with a non-null `error` / truthy `timed_out`.
+ */
+export function summarize(rows: CaseResultRow[], kind: BenchmarkKind): BenchmarkModelSummary[] {
+  const buckets = new Map<string, CaseResultRow[]>();
+  for (const row of rows) {
+    const tierLabel = kind === 'classifier' ? '*' : (row.tier ?? '*');
+    // NUL separates model and tier inside the map key.
+    const bucketKey = `${row.model}\0${tierLabel}`;
+    const bucket = buckets.get(bucketKey);
+    if (bucket === undefined) {
+      buckets.set(bucketKey, [row]);
+    } else {
+      bucket.push(row);
+    }
+  }
+
+  const summaries: BenchmarkModelSummary[] = [];
+  for (const [bucketKey, bucket] of buckets) {
+    const [model, tier] = bucketKey.split('\0');
+    const size = bucket.length;
+    const ascending = bucket.map(r => r.latency_ms).toSorted((a, b) => a - b);
+
+    // Single pass for the running totals and counters.
+    let scoreSum = 0;
+    let latencySum = 0;
+    let costSum = 0;
+    let pricedRows = 0;
+    let errorRows = 0;
+    let timedOutRows = 0;
+    for (const r of bucket) {
+      scoreSum += r.score;
+      latencySum += r.latency_ms;
+      if (r.cost_usd !== null) {
+        costSum += r.cost_usd ?? 0;
+        pricedRows += 1;
+      }
+      if (r.error !== null) errorRows += 1;
+      if (r.timed_out) timedOutRows += 1;
+    }
+
+    // p95 position: ceil(0.95 * n) - 1, clamped to the last element.
+    const p95Index = Math.min(size - 1, Math.ceil(0.95 * size) - 1);
+    summaries.push({
+      model,
+      tier: tier as BenchmarkModelSummary['tier'],
+      accuracy: Number((scoreSum / size).toFixed(4)),
+      avgCostUsd: pricedRows > 0 ? Number((costSum / pricedRows).toFixed(8)) : null,
+      avgLatencyMs: Math.round(latencySum / size),
+      p50LatencyMs: ascending[Math.floor(size / 2)] ?? null,
+      p95LatencyMs: size > 0 ? (ascending[p95Index] ?? null) : null,
+      cases: size,
+      errors: errorRows,
+      timeouts: timedOutRows,
+    });
+  }
+  return summaries;
+}
diff --git a/services/auto-routing-benchmark/src/winner.ts b/services/auto-routing-benchmark/src/winner.ts
new file mode 100644
index 0000000000..318809c4a3
--- /dev/null
+++ b/services/auto-routing-benchmark/src/winner.ts
@@ -0,0 +1,38 @@
+import type { BenchmarkModelSummary } from '@kilocode/auto-routing-contracts';
+
+/**
+ * Chooses the classifier model to publish from untiered (tier '*') summaries
+ * that have at least one graded case.
+ *
+ * Two gates are applied: accuracy (`>= minAccuracy`) and, when
+ * `maxP95LatencyMs` is non-null, a p95 latency budget. The result is the first
+ * non-empty pool below:
+ *   1. Passes both gates → cheapest, ties broken by higher accuracy.
+ *   2. Passes accuracy only → lowest p95, ties broken by lower cost. This keeps
+ *      a winner visible to the admin even when every model is over budget.
+ *   3. Nothing passes accuracy → most accurate, ties broken by lower cost.
+ *
+ * A missing cost or p95 is ranked as +Infinity, i.e. behind any measured value.
+ *
+ * @returns The selected summary, or null when no graded '*' summary exists.
+ */
+export function pickClassifierWinner(
+  summaries: BenchmarkModelSummary[],
+  minAccuracy: number,
+  maxP95LatencyMs: number | null = null
+): BenchmarkModelSummary | null {
+  const candidates = summaries.filter(s => s.tier === '*' && s.cases > 0);
+  if (candidates.length === 0) {
+    return null;
+  }
+
+  const costOf = (s: BenchmarkModelSummary) => s.avgCostUsd ?? Number.POSITIVE_INFINITY;
+  const latencyOf = (s: BenchmarkModelSummary) => s.p95LatencyMs ?? Number.POSITIVE_INFINITY;
+  type Ordering = (a: BenchmarkModelSummary, b: BenchmarkModelSummary) => number;
+  const byCostThenAccuracy: Ordering = (a, b) => costOf(a) - costOf(b) || b.accuracy - a.accuracy;
+  const byLatencyThenCost: Ordering = (a, b) => latencyOf(a) - latencyOf(b) || costOf(a) - costOf(b);
+  const byAccuracyThenCost: Ordering = (a, b) => b.accuracy - a.accuracy || costOf(a) - costOf(b);
+
+  const accurate = candidates.filter(s => s.accuracy >= minAccuracy);
+  if (accurate.length === 0) {
+    // Accuracy gate failed across the board: fall back to the most accurate.
+    return candidates.toSorted(byAccuracyThenCost)[0];
+  }
+
+  const withinBudget =
+    maxP95LatencyMs === null
+      ? accurate
+      : accurate.filter(s => s.p95LatencyMs !== null && s.p95LatencyMs <= maxP95LatencyMs);
+  if (withinBudget.length === 0) {
+    // Latency gate not met: pick lowest p95 (null p95 sorts last), tie-break cheapest.
+    return accurate.toSorted(byLatencyThenCost)[0];
+  }
+  return withinBudget.toSorted(byCostThenAccuracy)[0];
+}
diff --git a/services/auto-routing-benchmark/test/stubs/cloudflare-containers.ts b/services/auto-routing-benchmark/test/stubs/cloudflare-containers.ts
new file mode 100644
index 0000000000..bc5bed4fdf
--- /dev/null
+++ b/services/auto-routing-benchmark/test/stubs/cloudflare-containers.ts
@@ -0,0 +1,14 @@
+// Node-safe stub for `@cloudflare/containers`, aliased in vitest.config.ts.
+//
+// The real package imports `cloudflare:workers`, which only exists in the
+// workerd runtime. Unit tests run in the node pool and merely need the worker
+// entry (src/index.ts) to import without pulling in that chain — they never
+// instantiate the container DO. This stub provides the minimal `Container`
+// base class so `class BenchRunnerContainer extends Container<Env>` resolves.
+
+/**
+ * Minimal stand-in for the `Container` base class of `@cloudflare/containers`.
+ * It declares the optional `defaultPort` and `sleepAfter` fields and accepts —
+ * but ignores — the two constructor arguments.
+ */
+export class Container<Env = unknown> {
+  defaultPort?: number;
+  sleepAfter?: string;
+  // eslint-disable-next-line @typescript-eslint/no-unused-vars
+  constructor(_ctx: unknown, _env: Env) {}
+}
diff --git a/services/auto-routing-benchmark/tsconfig.json b/services/auto-routing-benchmark/tsconfig.json
new file mode 100644
index 0000000000..4f765c05f6
--- /dev/null
+++ b/services/auto-routing-benchmark/tsconfig.json
@@ -0,0 +1,16 @@
+{
+  "compilerOptions": {
+    "target": "esnext",
+    "lib": ["esnext"],
+    "module": "esnext",
+    "moduleResolution": "bundler",
+    "types": ["@types/node", "@cloudflare/workers-types", "./worker-configuration.d.ts"],
+    "esModuleInterop": true,
+    "resolveJsonModule": true,
+    "forceConsistentCasingInFileNames": true,
+    "strict": true,
+    "skipLibCheck": true,
+    "noEmit": true
+  },
+  "include": ["worker-configuration.d.ts", "src/**/*.ts", "src/**/*.d.ts", "vitest.config.ts"]
+}
diff --git a/services/auto-routing-benchmark/vitest.config.ts b/services/auto-routing-benchmark/vitest.config.ts
new file mode 100644
index 0000000000..6a49fa250d
--- /dev/null
+++ b/services/auto-routing-benchmark/vitest.config.ts
@@ -0,0 +1,19 @@
+import { resolve } from 'node:path';
+import { defineConfig } from 'vitest/config';
+
+// The real `@cloudflare/containers` package imports `cloudflare:workers`, which
+// exists only in workerd. Unit tests run in the node pool and never instantiate
+// the container DO — they only need the worker entry to import cleanly — so the
+// package is redirected to a node-safe stub.
+const containersStubPath = resolve(__dirname, 'test/stubs/cloudflare-containers.ts');
+
+export default defineConfig({
+  resolve: {
+    alias: { '@cloudflare/containers': containersStubPath },
+  },
+  test: {
+    include: ['src/**/*.test.ts'],
+    environment: 'node',
+    globals: true,
+  },
+});
diff --git a/services/auto-routing-benchmark/worker-configuration.d.ts b/services/auto-routing-benchmark/worker-configuration.d.ts
new file mode 100644
index 0000000000..b91e340c4e
--- /dev/null
+++ b/services/auto-routing-benchmark/worker-configuration.d.ts
@@ -0,0 +1,26 @@
+/* eslint-disable */
+// Generated by Wrangler by running `wrangler types --include-runtime=false` (hash: bb795d62b0d99d5132cd935146748ae9)
+interface __BaseEnv_Env {
+	AUTO_ROUTING_CONFIG: KVNamespace;
+	BENCH_DB: D1Database;
+	BENCH_QUEUE: Queue;
+	INTERNAL_API_SECRET_PROD: SecretsStoreSecret;
+	OPENROUTER_API_KEY: SecretsStoreSecret;
+	KILO_WEB_API_BASE_URL: string;
+	KILO_CLI_API_URL: string;
+	BENCH_RUNNER: DurableObjectNamespace<import("./src/index").BenchRunnerContainer>;
+}
+declare namespace Cloudflare {
+	interface GlobalProps {
+		mainModule: typeof import("./src/index");
+		durableNamespaces: "BenchRunnerContainer";
+	}
+	interface Env extends __BaseEnv_Env {}
+}
+interface Env extends __BaseEnv_Env {}
+type StringifyValues<EnvType extends Record<string, unknown>> = {
+	[Binding in keyof EnvType]: EnvType[Binding] extends string ? EnvType[Binding] : string;
+};
+declare namespace NodeJS {
+	interface ProcessEnv extends StringifyValues<Pick<Cloudflare.Env, "KILO_WEB_API_BASE_URL" | "KILO_CLI_API_URL">> {}
+}
diff --git a/services/auto-routing-benchmark/wrangler.jsonc b/services/auto-routing-benchmark/wrangler.jsonc
new file mode 100644
index 0000000000..fbf655d599
--- /dev/null
+++ b/services/auto-routing-benchmark/wrangler.jsonc
@@ -0,0 +1,74 @@
+{
+  "$schema": "node_modules/wrangler/config-schema.json",
+  "account_id": "e115e769bcdd4c3d66af59d3332cb394",
+  "name": "auto-routing-benchmark",
+  "main": "src/index.ts",
+  "compatibility_date": "2026-05-15",
+  "compatibility_flags": ["nodejs_compat"],
+  "workers_dev": false,
+  "preview_urls": false,
+  "logpush": true,
+  "routes": [{ "pattern": "auto-routing-benchmark.kiloapps.io", "custom_domain": true }],
+  "dev": { "port": 8814, "local_protocol": "http", "ip": "0.0.0.0" },
+  "observability": { "enabled": true },
+  "vars": {
+    // Base URL for reaching apps/web's /api/internal/* routes. Other workers
+    // that call apps/web internal endpoints use app.kilo.ai.
+    "KILO_WEB_API_BASE_URL": "https://app.kilo.ai",
+    // Gateway base URL injected into the benchmark container as the kilo
+    // CLI's KILO_API_URL. Local dev overrides both vars via .dev.vars (see
+    // .dev.vars.example) so the decider benchmark runs fully locally.
+    "KILO_CLI_API_URL": "https://api.kilo.ai",
+  },
+  "containers": [
+    {
+      "name": "auto-routing-benchmark-runner",
+      "class_name": "BenchRunnerContainer",
+      "image": "./container/Dockerfile",
+      "instance_type": "standard-2",
+      "max_instances": 40,
+    },
+  ],
+  "durable_objects": {
+    "bindings": [{ "name": "BENCH_RUNNER", "class_name": "BenchRunnerContainer" }],
+  },
+  "migrations": [{ "tag": "v1", "new_sqlite_classes": ["BenchRunnerContainer"] }],
+  "d1_databases": [
+    {
+      "binding": "BENCH_DB",
+      "database_name": "auto-routing-benchmark",
+      "database_id": "92f2c88a-5ee6-4fd0-b118-75bd141b5cac",
+      "migrations_dir": "migrations",
+    },
+  ],
+  "queues": {
+    "producers": [{ "binding": "BENCH_QUEUE", "queue": "auto-routing-benchmark-jobs" }],
+    "consumers": [
+      {
+        "queue": "auto-routing-benchmark-jobs",
+        "max_batch_size": 1,
+        "max_retries": 2,
+        "max_concurrency": 4,
+        "dead_letter_queue": "auto-routing-benchmark-dlq",
+      },
+    ],
+  },
+  "kv_namespaces": [
+    // Shared with the auto-routing worker, which uses it as a read-through
+    // cache over this worker's D1. On publish we only DELETE the cached keys
+    // (routing table + classifier winner) so the next read repopulates.
+    { "binding": "AUTO_ROUTING_CONFIG", "id": "4316b8db31e347e19cfadad1b6386ad5" },
+  ],
+  "secrets_store_secrets": [
+    {
+      "binding": "INTERNAL_API_SECRET_PROD",
+      "store_id": "342a86d9e3a94da698e82d0c6e2a36f0",
+      "secret_name": "INTERNAL_API_SECRET_PROD",
+    },
+    {
+      "binding": "OPENROUTER_API_KEY",
+      "store_id": "342a86d9e3a94da698e82d0c6e2a36f0",
+      "secret_name": "OPENROUTER_API_KEY",
+    },
+  ],
+}
diff --git a/services/auto-routing/src/admin-classifier-model.ts b/services/auto-routing/src/admin-classifier-model.ts
index 7fc6660e31..472298d1a0 100644
--- a/services/auto-routing/src/admin-classifier-model.ts
+++ b/services/auto-routing/src/admin-classifier-model.ts
@@ -3,20 +3,23 @@ import {
   type AutoRoutingClassifierModelResponse,
 } from '@kilocode/auto-routing-contracts';
 import type { Handler } from 'hono';
-import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt';
-import { getClassifierModel, setClassifierModel } from './classifier-config';
+import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier';
+import { getClassifierModelInfo, setClassifierModel } from './classifier-config';
+import type { ClassifierModelInfo } from './classifier-config';
 import type { HonoEnv } from './hono-env';
 
-function classifierModelResponse(model: string): AutoRoutingClassifierModelResponse {
+function classifierModelResponse(info: ClassifierModelInfo): AutoRoutingClassifierModelResponse {
   return {
-    model,
+    model: info.model,
+    override: info.override,
+    benchmarkWinner: info.benchmarkWinner,
     defaultModel: DEFAULT_CLASSIFIER_MODEL,
   };
 }
 
 export const getClassifierModelHandler: Handler<HonoEnv> = async c => {
-  const model = await getClassifierModel(c.env);
-  return c.json(classifierModelResponse(model));
+  const info = await getClassifierModelInfo(c.env);
+  return c.json(classifierModelResponse(info));
 };
 
 export const putClassifierModelHandler: Handler<HonoEnv> = async c => {
@@ -32,10 +35,10 @@ export const putClassifierModelHandler: Handler<HonoEnv> = async c => {
     return c.json({ error: 'Invalid classifier model' }, 400);
   }
 
-  const model = await setClassifierModel(c.env, parsed.data.model);
-  if (!model) {
+  const info = await setClassifierModel(c.env, parsed.data.model);
+  if (!info) {
     return c.json({ error: 'Invalid classifier model' }, 400);
   }
 
-  return c.json(classifierModelResponse(model));
+  return c.json(classifierModelResponse(info));
 };
diff --git a/services/auto-routing/src/benchmark-origin.ts b/services/auto-routing/src/benchmark-origin.ts
new file mode 100644
index 0000000000..ecbadc2fd6
--- /dev/null
+++ b/services/auto-routing/src/benchmark-origin.ts
@@ -0,0 +1,44 @@
+import {
+  BenchmarkRoutingTableResponseSchema,
+  ClassifierWinnerResponseSchema,
+  type ClassifierWinner,
+  type RoutingTable,
+} from '@kilocode/auto-routing-contracts';
+
+type BenchmarkEnv = Pick<Env, 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'>;
+
+/**
+ * Performs an authenticated GET against the benchmark worker through the
+ * `BENCHMARK_SERVICE` binding and returns the parsed JSON body.
+ *
+ * @param env Bindings providing the service fetcher and the bearer secret.
+ * @param path Request path appended to the `https://auto-routing-benchmark` origin.
+ * @returns The response body as parsed by `Response.json()`, left unvalidated.
+ * @throws Error when the response status is not ok; the message carries the
+ *   status and up to 200 characters of the response text.
+ */
+async function fetchBenchmark(env: BenchmarkEnv, path: string): Promise<unknown> {
+  const token = await env.INTERNAL_API_SECRET_PROD.get();
+  const response = await env.BENCHMARK_SERVICE.fetch(`https://auto-routing-benchmark${path}`, {
+    headers: { authorization: `Bearer ${token}` },
+  });
+  if (response.ok) {
+    return response.json();
+  }
+  // An unreadable error body degrades to an empty detail instead of masking the status.
+  const errorText = await response.text().catch(() => '');
+  const detail = errorText.slice(0, 200);
+  throw new Error(`benchmark origin ${path} responded ${response.status} ${detail}`);
+}
+
+/**
+ * Loads the routing table from the benchmark worker's `/admin/routing-table`
+ * endpoint and validates it against `BenchmarkRoutingTableResponseSchema`.
+ *
+ * @returns The `table` field of the validated response (may be null).
+ * @throws Error when the request fails or the body does not match the schema;
+ *   the schema error message quotes the first validation issue.
+ */
+export async function fetchRoutingTableFromOrigin(env: BenchmarkEnv): Promise<RoutingTable | null> {
+  const payload = await fetchBenchmark(env, '/admin/routing-table');
+  const result = BenchmarkRoutingTableResponseSchema.safeParse(payload);
+  if (result.success) {
+    return result.data.table;
+  }
+  const reason = result.error.issues[0]?.message ?? 'unknown';
+  throw new Error(`benchmark routing-table response invalid: ${reason}`);
+}
+
+/**
+ * Loads the classifier winner from the benchmark worker's
+ * `/admin/classifier-winner` endpoint and validates it against
+ * `ClassifierWinnerResponseSchema`.
+ *
+ * @returns The `winner` field of the validated response (may be null).
+ * @throws Error when the request fails or the body does not match the schema;
+ *   the schema error message quotes the first validation issue.
+ */
+export async function fetchClassifierWinnerFromOrigin(
+  env: BenchmarkEnv
+): Promise<ClassifierWinner | null> {
+  const payload = await fetchBenchmark(env, '/admin/classifier-winner');
+  const result = ClassifierWinnerResponseSchema.safeParse(payload);
+  if (result.success) {
+    return result.data.winner;
+  }
+  const reason = result.error.issues[0]?.message ?? 'unknown';
+  throw new Error(`benchmark classifier-winner response invalid: ${reason}`);
+}
diff --git a/services/auto-routing/src/classifier-analytics.test.ts b/services/auto-routing/src/classifier-analytics.test.ts
index e3ebc38e0c..11a8d5f12e 100644
--- a/services/auto-routing/src/classifier-analytics.test.ts
+++ b/services/auto-routing/src/classifier-analytics.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it, vi } from 'vitest';
 import { writeClassifierMetricsDataPoint } from './classifier-analytics';
-import type { ClassifierOutput } from './classifier-output';
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier';
 
 const classification = {
   taskType: 'debugging',
diff --git a/services/auto-routing/src/classifier-analytics.ts b/services/auto-routing/src/classifier-analytics.ts
index b0ceb9a4c4..08c5c0deb3 100644
--- a/services/auto-routing/src/classifier-analytics.ts
+++ b/services/auto-routing/src/classifier-analytics.ts
@@ -1,4 +1,4 @@
-import type { ClassifierOutput } from './classifier-output';
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier';
 
 export type ClassifierAnalyticsStatus =
   | 'classified'
diff --git a/services/auto-routing/src/classifier-config.test.ts b/services/auto-routing/src/classifier-config.test.ts
index fbd3a0e8c4..1ec3d438b4 100644
--- a/services/auto-routing/src/classifier-config.test.ts
+++ b/services/auto-routing/src/classifier-config.test.ts
@@ -1,17 +1,77 @@
 import { beforeEach, describe, expect, it, vi } from 'vitest';
-import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt';
+import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier';
 import {
   CLASSIFIER_MODEL_CONFIG_KEY,
   clearClassifierConfigCache,
   getClassifierModel,
+  getClassifierModelInfo,
 } from './classifier-config';
+import { CLASSIFIER_WINNER_KV_KEY } from '@kilocode/auto-routing-contracts';
 
-function createKv(value: string | null) {
-  const get = vi.fn(async () => value);
-  return {
-    kv: { get } as unknown as KVNamespace,
-    get,
-  };
+type ClassifierEnvStub = Pick<
+  Env,
+  'AUTO_ROUTING_CONFIG' | 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'
+>;
+
+const EXAMPLE_WINNER = {
+  model: 'google/gemini-2.5-flash-lite',
+  runId: 'run-abc',
+  accuracy: 0.95,
+  generatedAt: '2026-06-11T00:00:00.000Z',
+};
+
+type EnvSetup = {
+  env: ClassifierEnvStub;
+  configGet: ReturnType<typeof vi.fn>;
+  configPut: ReturnType<typeof vi.fn>;
+  benchmarkFetch: ReturnType<typeof vi.fn>;
+};
+
+function makeEnv(opts: {
+  overrideModel?: string | null;
+  winnerKvValue?: string | null;
+  originWinner?: typeof EXAMPLE_WINNER | null;
+  originStatus?: number;
+  originThrow?: boolean;
+  onPut?: (key: string, value: string, options: unknown) => void;
+}): EnvSetup {
+  const configGet = vi.fn(async (key: string) => {
+    if (key === CLASSIFIER_MODEL_CONFIG_KEY) {
+      return opts.overrideModel === undefined ? null : opts.overrideModel;
+    }
+    if (key === CLASSIFIER_WINNER_KV_KEY) {
+      return opts.winnerKvValue === undefined ? null : opts.winnerKvValue;
+    }
+    return null;
+  });
+  const configPut = vi.fn(async (key: string, value: string, options: unknown) => {
+    opts.onPut?.(key, value, options);
+  });
+  const benchmarkFetch = vi.fn(async () => {
+    if (opts.originThrow) throw new Error('benchmark unavailable');
+    return {
+      ok: opts.originStatus === undefined ? true : opts.originStatus < 400,
+      status: opts.originStatus ?? 200,
+      json: async () => ({
+        winner: opts.originWinner !== undefined ? opts.originWinner : null,
+      }),
+    };
+  });
+
+  const env: ClassifierEnvStub = {
+    AUTO_ROUTING_CONFIG: {
+      get: configGet,
+      put: configPut,
+    },
+    BENCHMARK_SERVICE: {
+      fetch: benchmarkFetch,
+    },
+    INTERNAL_API_SECRET_PROD: {
+      get: vi.fn(async () => 'test-secret'),
+    },
+  } as unknown as ClassifierEnvStub;
+
+  return { env, configGet, configPut, benchmarkFetch };
 }
 
 describe('classifier config', () => {
@@ -20,42 +80,103 @@ describe('classifier config', () => {
   });
 
   it('falls back to the default classifier model when KV has no value', async () => {
-    const { get, kv } = createKv(null);
+    const { env, configGet } = makeEnv({});
 
-    await expect(getClassifierModel({ AUTO_ROUTING_CONFIG: kv })).resolves.toBe(
-      DEFAULT_CLASSIFIER_MODEL
-    );
-    expect(get).toHaveBeenCalledWith(CLASSIFIER_MODEL_CONFIG_KEY);
+    await expect(getClassifierModel(env)).resolves.toBe(DEFAULT_CLASSIFIER_MODEL);
+    expect(configGet).toHaveBeenCalledWith(CLASSIFIER_MODEL_CONFIG_KEY);
   });
 
-  it('uses the trimmed classifier model from KV', async () => {
-    await expect(
-      getClassifierModel({
-        AUTO_ROUTING_CONFIG: createKv('  google/gemini-2.5-flash-lite  ').kv,
-      })
-    ).resolves.toBe('google/gemini-2.5-flash-lite');
+  it('uses the trimmed classifier model from KV override', async () => {
+    const { env } = makeEnv({ overrideModel: '  google/gemini-2.5-flash-lite  ' });
+    await expect(getClassifierModel(env)).resolves.toBe('google/gemini-2.5-flash-lite');
   });
 
   it('falls back to the default classifier model when KV has a blank value', async () => {
-    await expect(
-      getClassifierModel({
-        AUTO_ROUTING_CONFIG: createKv('   ').kv,
-      })
-    ).resolves.toBe(DEFAULT_CLASSIFIER_MODEL);
+    const { env } = makeEnv({ overrideModel: '   ' });
+    await expect(getClassifierModel(env)).resolves.toBe(DEFAULT_CLASSIFIER_MODEL);
   });
 
   it('fails closed to the default classifier model when the KV read rejects', async () => {
     const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
-    const kv = {
-      get: vi.fn(async () => {
-        throw new Error('KV unavailable');
-      }),
-    } as unknown as KVNamespace;
+    const configGet = vi.fn(async () => {
+      throw new Error('KV unavailable');
+    });
+    const env: ClassifierEnvStub = {
+      AUTO_ROUTING_CONFIG: {
+        get: configGet,
+        put: vi.fn(async () => {}),
+      } as unknown as KVNamespace,
+      BENCHMARK_SERVICE: { fetch: vi.fn() } as unknown as Fetcher,
+      INTERNAL_API_SECRET_PROD: {
+        get: vi.fn(async () => 'secret'),
+      } as unknown as SecretsStoreSecret,
+    };
 
-    await expect(getClassifierModel({ AUTO_ROUTING_CONFIG: kv })).resolves.toBe(
-      DEFAULT_CLASSIFIER_MODEL
-    );
+    await expect(getClassifierModel(env)).resolves.toBe(DEFAULT_CLASSIFIER_MODEL);
     expect(warn).toHaveBeenCalled();
     warn.mockRestore();
   });
+
+  it('serves the benchmark winner from KV without calling origin', async () => {
+    const { env, benchmarkFetch } = makeEnv({ winnerKvValue: JSON.stringify(EXAMPLE_WINNER) });
+    const info = await getClassifierModelInfo(env);
+    expect(info.benchmarkWinner).toBe(EXAMPLE_WINNER.model);
+    expect(info.model).toBe(EXAMPLE_WINNER.model);
+    expect(benchmarkFetch).not.toHaveBeenCalled();
+  });
+
+  it('fetches from origin on winner KV miss, writes to KV with expirationTtl, and returns winner', async () => {
+    const puts: Array<{ key: string; value: string; options: unknown }> = [];
+    const { env } = makeEnv({
+      winnerKvValue: null,
+      originWinner: EXAMPLE_WINNER,
+      onPut: (key, value, options) => puts.push({ key, value, options }),
+    });
+
+    const info = await getClassifierModelInfo(env);
+    expect(info.benchmarkWinner).toBe(EXAMPLE_WINNER.model);
+    expect(
+      puts.some(
+        p =>
+          p.key === CLASSIFIER_WINNER_KV_KEY &&
+          (p.options as { expirationTtl: number }).expirationTtl === 3600
+      )
+    ).toBe(true);
+  });
+
+  it('falls back to default model when origin returns null winner', async () => {
+    const { env } = makeEnv({ winnerKvValue: null, originWinner: null });
+    const info = await getClassifierModelInfo(env);
+    expect(info.benchmarkWinner).toBeNull();
+    expect(info.model).toBe(DEFAULT_CLASSIFIER_MODEL);
+  });
+
+  it('falls back to default model when origin fails for the winner', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const { env } = makeEnv({ winnerKvValue: null, originThrow: true });
+    await expect(getClassifierModel(env)).resolves.toBe(DEFAULT_CLASSIFIER_MODEL);
+    warn.mockRestore();
+  });
+
+  it('keeps a healthy admin override when the winner origin fails', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const { env } = makeEnv({ overrideModel: 'override/model', originThrow: true });
+    expect(await getClassifierModelInfo(env)).toEqual({
+      model: 'override/model',
+      override: 'override/model',
+      benchmarkWinner: null,
+    });
+    warn.mockRestore();
+  });
+
+  it('override takes precedence over benchmark winner from origin', async () => {
+    const { env } = makeEnv({
+      overrideModel: 'openai/gpt-4o',
+      winnerKvValue: null,
+      originWinner: EXAMPLE_WINNER,
+    });
+    const info = await getClassifierModelInfo(env);
+    expect(info.override).toBe('openai/gpt-4o');
+    expect(info.model).toBe('openai/gpt-4o');
+  });
 });
diff --git a/services/auto-routing/src/classifier-config.ts b/services/auto-routing/src/classifier-config.ts
index 6b0687a539..5e4de3c938 100644
--- a/services/auto-routing/src/classifier-config.ts
+++ b/services/auto-routing/src/classifier-config.ts
@@ -1,6 +1,12 @@
-import { formatError } from '@kilocode/worker-utils';
-import { DEFAULT_CLASSIFIER_MODEL } from './classifier-prompt';
-import { ttlCached } from './ttl-cache';
+import { formatError, ttlCached } from '@kilocode/worker-utils';
+import {
+  CLASSIFIER_WINNER_KV_KEY,
+  ClassifierWinnerSchema,
+  type ClassifierWinner,
+} from '@kilocode/auto-routing-contracts';
+import { DEFAULT_CLASSIFIER_MODEL } from '@kilocode/auto-routing-contracts/classifier';
+import { kvReadThrough } from './kv-read-through';
+import { fetchClassifierWinnerFromOrigin } from './benchmark-origin';
 
 export const CLASSIFIER_MODEL_CONFIG_KEY = 'classifier_model';
 export const DECISION_LOG_SAMPLE_RATE_CONFIG_KEY = 'decision_log_sample_rate';
@@ -16,12 +22,56 @@ const DEFAULT_DECISION_LOG_SAMPLE_RATE = 0.01;
 // read from every classification.
 const CONFIG_CACHE_TTL_MS = 60_000;
 
-type ClassifierConfigEnv = Pick<Env, 'AUTO_ROUTING_CONFIG'>;
+type ClassifierConfigEnv = Pick<
+  Env,
+  'AUTO_ROUTING_CONFIG' | 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'
+>;
+
+export type ClassifierModelInfo = {
+  // Effective model used by /decide: override ?? benchmark winner ?? default.
+  model: string;
+  override: string | null;
+  benchmarkWinner: string | null;
+};
+
+function parseClassifierWinner(raw: string): ClassifierWinner | null {
+  try {
+    const parsed = ClassifierWinnerSchema.safeParse(JSON.parse(raw));
+    return parsed.success ? parsed.data : null;
+  } catch {
+    return null;
+  }
+}
 
 const classifierModelCache = ttlCached(CONFIG_CACHE_TTL_MS, async (env: ClassifierConfigEnv) => {
-  const configuredModel = await env.AUTO_ROUTING_CONFIG.get(CLASSIFIER_MODEL_CONFIG_KEY);
-  const trimmedModel = configuredModel?.trim();
-  return trimmedModel && trimmedModel.length > 0 ? trimmedModel : DEFAULT_CLASSIFIER_MODEL;
+  const [configuredModel, winner] = await Promise.all([
+    env.AUTO_ROUTING_CONFIG.get(CLASSIFIER_MODEL_CONFIG_KEY),
+    kvReadThrough<ClassifierWinner>({
+      kv: env.AUTO_ROUTING_CONFIG,
+      key: CLASSIFIER_WINNER_KV_KEY,
+      ttlSeconds: 3600,
+      fetchOrigin: () => fetchClassifierWinnerFromOrigin(env),
+      parse: parseClassifierWinner,
+    }).catch((error: unknown) => {
+      // A benchmark-origin failure must not reject the whole load: that would
+      // discard a healthy admin override and fail closed to the default.
+      console.warn(
+        JSON.stringify({
+          event: 'auto_routing_config_read_failed',
+          key: CLASSIFIER_WINNER_KV_KEY,
+          ...formatError(error),
+        })
+      );
+      return null;
+    }),
+  ]);
+  const override = configuredModel?.trim() || null;
+  const benchmarkWinner = winner?.model ?? null;
+  return {
+    model: override ?? benchmarkWinner ?? DEFAULT_CLASSIFIER_MODEL,
+    override,
+    benchmarkWinner,
+  } satisfies ClassifierModelInfo;
 });
 
 const decisionLogSampleRateCache = ttlCached(
@@ -57,10 +107,20 @@ function failClosed<T>(key: string, fallback: T): (error: unknown) => T {
   };
 }
 
-export function getClassifierModel(env: ClassifierConfigEnv): Promise<string> {
+const DEFAULT_CLASSIFIER_MODEL_INFO: ClassifierModelInfo = {
+  model: DEFAULT_CLASSIFIER_MODEL,
+  override: null,
+  benchmarkWinner: null,
+};
+
+export function getClassifierModelInfo(env: ClassifierConfigEnv): Promise<ClassifierModelInfo> {
   return classifierModelCache
     .get(env)
-    .catch(failClosed(CLASSIFIER_MODEL_CONFIG_KEY, DEFAULT_CLASSIFIER_MODEL));
+    .catch(failClosed(CLASSIFIER_MODEL_CONFIG_KEY, DEFAULT_CLASSIFIER_MODEL_INFO));
+}
+
+export async function getClassifierModel(env: ClassifierConfigEnv): Promise<string> {
+  return (await getClassifierModelInfo(env)).model;
 }
 
 export function getDecisionLogSampleRate(env: ClassifierConfigEnv): Promise<number> {
@@ -69,16 +129,22 @@ export function getDecisionLogSampleRate(env: ClassifierConfigEnv): Promise<numb
     .catch(failClosed(DECISION_LOG_SAMPLE_RATE_CONFIG_KEY, DEFAULT_DECISION_LOG_SAMPLE_RATE));
 }
 
+// model: null clears the admin override so the benchmark winner (or the
+// built-in default) takes effect.
 export async function setClassifierModel(
   env: ClassifierConfigEnv,
-  model: string
-): Promise<string | null> {
+  model: string | null
+): Promise<ClassifierModelInfo | null> {
+  if (model === null) {
+    await env.AUTO_ROUTING_CONFIG.delete(CLASSIFIER_MODEL_CONFIG_KEY);
+    classifierModelCache.clear();
+    return getClassifierModelInfo(env);
+  }
   const trimmedModel = model.trim();
   if (trimmedModel.length === 0) {
     return null;
   }
-
   await env.AUTO_ROUTING_CONFIG.put(CLASSIFIER_MODEL_CONFIG_KEY, trimmedModel);
   classifierModelCache.clear();
-  return trimmedModel;
+  return getClassifierModelInfo(env);
 }
diff --git a/services/auto-routing/src/decide.ts b/services/auto-routing/src/decide.ts
index 3cc94edc56..fd476a5668 100644
--- a/services/auto-routing/src/decide.ts
+++ b/services/auto-routing/src/decide.ts
@@ -1,5 +1,6 @@
 import { MirrorPayloadSchema } from '@kilocode/auto-routing-contracts';
 import type {
+  AutoRoutingDecision,
   AutoRoutingDecisionResponse,
   MirrorPayload,
   NormalizedClassifierInput,
@@ -9,7 +10,7 @@ import type { Handler } from 'hono';
 import { writeClassifierMetricsDataPoint } from './classifier-analytics';
 import type { ClassifierAnalyticsStatus } from './classifier-analytics';
 import { getClassifierModel, getDecisionLogSampleRate } from './classifier-config';
-import type { ClassifierOutput } from './classifier-output';
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier';
 import {
   computeContentHashes,
   deriveConversationKey,
@@ -17,9 +18,16 @@ import {
   hashIdentifierForTelemetry,
 } from './conversation-identity';
 import type { ContentHashes } from './conversation-identity';
-import { getCachedClassification, putCachedClassification } from './decision-cache';
+import {
+  getCachedClassification,
+  getStickyDecision,
+  putCachedClassification,
+  putStickyDecision,
+} from './decision-cache';
+import { computeDecision } from './decision-engine';
 import { ClassifierRunError, classifyNormalizedInput } from './model-classifier';
 import type { ClassifierRunResult } from './model-classifier';
+import { getRoutingTable } from './routing-table';
 import type { HonoEnv } from './hono-env';
 
 // Isolate-scoped request counter, used to correlate latency with isolate
@@ -29,11 +37,12 @@ let isolateRequestSeq = 0;
 function decisionResponse(
   cost: number,
   classification: ClassifierOutput,
-  normalized: NormalizedClassifierInput
+  normalized: NormalizedClassifierInput,
+  decision: AutoRoutingDecision | null
 ): AutoRoutingDecisionResponse {
   return {
     cost,
-    decision: null,
+    decision,
     classifierResult: { classification, normalized },
   };
 }
@@ -194,7 +203,8 @@ function recordDecision(
   env: Env,
   ctx: DecisionContext,
   durationMs: number,
-  outcome: DecisionOutcome
+  outcome: DecisionOutcome,
+  decision: AutoRoutingDecision | null = null
 ): void {
   const summary = summarizeOutcome(outcome);
 
@@ -243,6 +253,10 @@ function recordDecision(
       hasMachineId: ctx.payload.machineId !== null,
       mode: ctx.payload.mode,
       uaPrefix: ctx.payload.userAgent?.slice(0, 40) ?? null,
+      decidedModel: decision?.model ?? null,
+      decidedTier: decision?.tier ?? null,
+      decisionSource: decision?.source ?? null,
+      sticky: decision?.sticky ?? null,
       ...summary.details,
     })
   );
@@ -265,11 +279,12 @@ export const decideHandler: Handler<HonoEnv> = async c => {
 
   const payload = parsed.data;
   const startedAt = performance.now();
-  const [hashes, userIdHash, classifierModel, successSampleRate] = await Promise.all([
+  const [hashes, userIdHash, classifierModel, successSampleRate, routingTable] = await Promise.all([
     computeContentHashes(payload.input),
     hashIdentifierForTelemetry(payload.userId),
     getClassifierModel(c.env),
     getDecisionLogSampleRate(c.env),
+    getRoutingTable(c.env),
   ]);
   const ctx: DecisionContext = {
     payload,
@@ -281,19 +296,24 @@ export const decideHandler: Handler<HonoEnv> = async c => {
     successSampleRate,
   };
 
-  const cached = await getCachedClassification(
-    c.env,
-    ctx.conversationKey,
-    hashes.exact,
-    classifierModel
-  );
+  // Both live in the conversation's Durable Object; fetch them together.
+  const [cached, stickyModel] = await Promise.all([
+    getCachedClassification(c.env, ctx.conversationKey, hashes.exact, classifierModel),
+    getStickyDecision(c.env, ctx.conversationKey),
+  ]);
   if (cached) {
-    recordDecision(c.env, ctx, performance.now() - startedAt, {
-      kind: 'cache_hit',
-      classifierModel,
-      classification: cached,
-    });
-    return c.json(decisionResponse(0, cached, payload.input));
+    const decision = computeDecision(cached, routingTable, stickyModel);
+    if (decision) {
+      c.executionCtx.waitUntil(putStickyDecision(c.env, ctx.conversationKey, decision.model));
+    }
+    recordDecision(
+      c.env,
+      ctx,
+      performance.now() - startedAt,
+      { kind: 'cache_hit', classifierModel, classification: cached },
+      decision
+    );
+    return c.json(decisionResponse(0, cached, payload.input, decision));
   }
 
   try {
@@ -311,10 +331,22 @@ export const decideHandler: Handler<HonoEnv> = async c => {
         )
       );
     }
-    recordDecision(c.env, ctx, performance.now() - startedAt, { kind: 'model', classifier });
-    // When routing decisions are implemented, include the prior decision for
-    // this session as an input alongside classifier output.
-    return c.json(decisionResponse(classifier.cost ?? 0, classifier.classification, payload.input));
+    const decision = computeDecision(classifier.classification, routingTable, stickyModel);
+    // Like the classification cache, sticky state only trusts real classifier
+    // output: a heuristic fallback must not re-anchor the session's model.
+    if (decision && !classifier.fallback) {
+      c.executionCtx.waitUntil(putStickyDecision(c.env, ctx.conversationKey, decision.model));
+    }
+    recordDecision(
+      c.env,
+      ctx,
+      performance.now() - startedAt,
+      { kind: 'model', classifier },
+      decision
+    );
+    return c.json(
+      decisionResponse(classifier.cost ?? 0, classifier.classification, payload.input, decision)
+    );
   } catch (error) {
     recordDecision(c.env, ctx, performance.now() - startedAt, { kind: 'error', error });
     // A failed run can still have billed the first attempt (e.g. a valid-but-
diff --git a/services/auto-routing/src/decision-cache.test.ts b/services/auto-routing/src/decision-cache.test.ts
index 1e3245835d..8f97f5c3ca 100644
--- a/services/auto-routing/src/decision-cache.test.ts
+++ b/services/auto-routing/src/decision-cache.test.ts
@@ -1,6 +1,6 @@
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
-import type { ClassifierOutput } from './classifier-output';
-import { AutoRoutingDecisionCacheDO } from './decision-cache';
+import type { ClassifierOutput } from '@kilocode/auto-routing-contracts/classifier';
+import { AutoRoutingDecisionCacheDO, getStickyDecision, putStickyDecision } from './decision-cache';
 
 const classification = {
   taskType: 'implementation',
@@ -104,3 +104,48 @@ describe('AutoRoutingDecisionCacheDO', () => {
     await expect(storage.getAlarm()).resolves.toBeNull();
   });
 });
+
+describe('sticky decision storage', () => {
+  beforeEach(() => {
+    vi.useFakeTimers();
+    vi.setSystemTime(new Date('2026-06-11T12:00:00Z'));
+  });
+
+  afterEach(() => {
+    vi.useRealTimers();
+  });
+
+  function createStickyEnv() {
+    const { cacheDO, storage } = createCacheDO();
+    const env = {
+      AUTO_ROUTING_DECISION_CACHE: {
+        idFromName: (name: string) => name,
+        get: () => cacheDO,
+      },
+    } as unknown as Pick<Env, 'AUTO_ROUTING_DECISION_CACHE'>;
+    return { env, cacheDO, storage };
+  }
+
+  it('round-trips the sticky model for a conversation', async () => {
+    const { env } = createStickyEnv();
+    await expect(getStickyDecision(env, 'conversation-1')).resolves.toBeNull();
+
+    await putStickyDecision(env, 'conversation-1', 'mid/chat');
+    await expect(getStickyDecision(env, 'conversation-1')).resolves.toBe('mid/chat');
+  });
+
+  it('expires sticky entries after the TTL', async () => {
+    const { env } = createStickyEnv();
+    await putStickyDecision(env, 'conversation-1', 'mid/chat');
+
+    vi.advanceTimersByTime(31 * 60 * 1000);
+    await expect(getStickyDecision(env, 'conversation-1')).resolves.toBeNull();
+  });
+
+  it('returns null for invalid stored shapes', async () => {
+    const { env, cacheDO } = createStickyEnv();
+    await cacheDO.putEntry('sticky', { nope: true } as unknown as ClassifierOutput);
+
+    await expect(getStickyDecision(env, 'conversation-1')).resolves.toBeNull();
+  });
+});
diff --git a/services/auto-routing/src/decision-cache.ts b/services/auto-routing/src/decision-cache.ts
index a4bd929bf8..ae98778688 100644
--- a/services/auto-routing/src/decision-cache.ts
+++ b/services/auto-routing/src/decision-cache.ts
@@ -1,5 +1,6 @@
 import { ClassifierOutputSchema, type ClassifierOutput } from '@kilocode/auto-routing-contracts';
 import { DurableObject } from 'cloudflare:workers';
+import * as z from 'zod';
 
 // Mirrored agent sessions classify the same prompt prefixes on every API
 // call, so identical classifier inputs repeat heavily within a short
@@ -13,13 +14,19 @@ const ENTRY_TTL_MS = 30 * 60 * 1000;
 // Cloudflare caps storage.delete() at 128 keys per call.
 const DELETE_BATCH_SIZE = 128;
 
+// The DO treats stored values as opaque — callers validate on read, since
+// entries may have been written by an older worker version. A concrete union
+// rather than unknown because the workers RPC stub maps non-serializable
+// method types to never.
+type StoredValue = ClassifierOutput | StickyDecision;
+
 type StoredEntry = {
-  value: ClassifierOutput;
+  value: StoredValue;
   storedAt: number;
 };
 
 export class AutoRoutingDecisionCacheDO extends DurableObject<Env> {
-  async getEntry(key: string): Promise<ClassifierOutput | null> {
+  async getEntry(key: string): Promise<StoredValue | null> {
     const entry = await this.ctx.storage.get<StoredEntry>(key);
     if (!entry) return null;
     if (Date.now() - entry.storedAt > ENTRY_TTL_MS) {
@@ -29,7 +36,7 @@ export class AutoRoutingDecisionCacheDO extends DurableObject<Env> {
     return entry.value;
   }
 
-  async putEntry(key: string, value: ClassifierOutput): Promise<void> {
+  async putEntry(key: string, value: StoredValue): Promise<void> {
     await this.ctx.storage.put(key, { value, storedAt: Date.now() } satisfies StoredEntry);
     // A fixed-period sweep (rather than an idle alarm pushed out on every
     // write) so storage stays bounded even when distinct conversations
@@ -73,6 +80,15 @@ function entryKey(contentHash: string, classifierModel: string): string {
   return `${classifierModel}:${contentHash}`;
 }
 
+// Single per-conversation slot remembering the last model the decision
+// engine served, so the session can stay on it (keeping the provider's
+// prompt cache warm) instead of ping-ponging when its tier oscillates.
+// Cannot collide with classification keys, which always contain a ':'.
+const STICKY_DECISION_KEY = 'sticky';
+
+const StickyDecisionSchema = z.object({ model: z.string().min(1) });
+type StickyDecision = z.infer<typeof StickyDecisionSchema>;
+
 export async function getCachedClassification(
   env: DecisionCacheEnv,
   conversationKey: string,
@@ -93,6 +109,41 @@ export async function getCachedClassification(
   }
 }
 
+// Reads the model remembered in the conversation's sticky slot. Best
+// effort: resolves to null when the slot is empty, when the stored value
+// does not match the sticky-decision schema, or when the cache call throws.
+export async function getStickyDecision(
+  env: DecisionCacheEnv,
+  conversationKey: string
+): Promise<string | null> {
+  try {
+    const stored = await cacheStub(env, conversationKey).getEntry(STICKY_DECISION_KEY);
+    if (!stored) return null;
+    // An older worker version may have written this slot, so the value is
+    // validated before its model is served.
+    const validation = StickyDecisionSchema.safeParse(stored);
+    if (!validation.success) return null;
+    return validation.data.model;
+  } catch {
+    return null;
+  }
+}
+
+// Stores the served model in the conversation's sticky slot. Best effort:
+// a failed write is swallowed so it cannot fail the decision.
+export async function putStickyDecision(
+  env: DecisionCacheEnv,
+  conversationKey: string,
+  model: string
+): Promise<void> {
+  const entry: StickyDecision = { model };
+  try {
+    await cacheStub(env, conversationKey).putEntry(STICKY_DECISION_KEY, entry);
+  } catch {
+    // Deliberately ignored: sticky writes must not fail the decision.
+  }
+}
+
 export async function putCachedClassification(
   env: DecisionCacheEnv,
   conversationKey: string,
diff --git a/services/auto-routing/src/decision-engine.test.ts b/services/auto-routing/src/decision-engine.test.ts
new file mode 100644
index 0000000000..b10fcc2e47
--- /dev/null
+++ b/services/auto-routing/src/decision-engine.test.ts
@@ -0,0 +1,164 @@
+import { describe, expect, it } from 'vitest';
+import type { ClassifierOutput, RoutingTable } from '@kilocode/auto-routing-contracts';
+import { computeDecision } from './decision-engine';
+
+const classification: ClassifierOutput = {
+  taskType: 'implementation',
+  subtaskType: 'code_generation',
+  contextComplexity: 'small',
+  reasoningComplexity: 'low',
+  riskLevel: 'low',
+  executionMode: 'answer_only',
+  requiresTools: false,
+  confidence: 0.9,
+};
+
+const table: RoutingTable = {
+  version: 'run-1',
+  generatedAt: '2026-06-11T00:00:00.000Z',
+  minAccuracy: 0.7,
+  switchCostFactor: 3,
+  source: 'benchmark',
+  tiers: {
+    low: [
+      {
+        model: 'cheap/chat',
+        accuracy: 0.85,
+        avgCostUsd: 0.002,
+        meetsThreshold: true,
+      },
+      {
+        model: 'mid/chat',
+        accuracy: 0.8,
+        avgCostUsd: 0.005,
+        meetsThreshold: true,
+        reasoningEffort: 'medium',
+      },
+      {
+        model: 'pricey/chat',
+        accuracy: 0.9,
+        avgCostUsd: 0.02,
+        meetsThreshold: true,
+      },
+      {
+        model: 'weak/chat',
+        accuracy: 0.5,
+        avgCostUsd: 0.003,
+        meetsThreshold: false,
+      },
+    ],
+    medium: [
+      {
+        model: 'mid/chat',
+        accuracy: 0.8,
+        avgCostUsd: 0.01,
+        meetsThreshold: true,
+      },
+    ],
+    high: [
+      {
+        model: 'big/chat',
+        accuracy: 0.9,
+        avgCostUsd: 0.1,
+        meetsThreshold: true,
+      },
+    ],
+  },
+};
+
+describe('computeDecision', () => {
+  it('picks the first candidate of the tier', () => {
+    const decision = computeDecision(classification, table, null);
+    expect(decision).toEqual({
+      model: 'cheap/chat',
+      tier: 'low',
+      source: 'benchmark',
+      tableVersion: 'run-1',
+      reasoningEffort: null,
+      sticky: false,
+    });
+  });
+  it('uses the tier derived from the classification', () => {
+    const hard: ClassifierOutput = {
+      ...classification,
+      reasoningComplexity: 'high',
+      contextComplexity: 'large',
+      executionMode: 'multi_step_project',
+    };
+    expect(computeDecision(hard, table, null)?.model).toBe('big/chat');
+  });
+  it('returns a decision for every tier of a valid table', () => {
+    const byTier: Array<[ClassifierOutput, string]> = [
+      [classification, 'cheap/chat'],
+      [
+        { ...classification, reasoningComplexity: 'medium', contextComplexity: 'medium' },
+        'mid/chat',
+      ],
+      [
+        {
+          ...classification,
+          reasoningComplexity: 'high',
+          contextComplexity: 'large',
+          executionMode: 'multi_step_project',
+        },
+        'big/chat',
+      ],
+    ];
+    for (const [input, expected] of byTier) {
+      expect(computeDecision(input, table, null)?.model).toBe(expected);
+    }
+  });
+  it('returns null when there is no routing table', () => {
+    expect(computeDecision(classification, null, null)).toBeNull();
+  });
+
+  describe('session stickiness', () => {
+    it('keeps the incumbent on tier de-escalation when it is within the switch-cost factor', () => {
+      // Fresh pick cheap/chat (0.002) is not more than 3x cheaper than the
+      // incumbent mid/chat (0.005): 0.002 * 3 = 0.006 >= 0.005, so the session stays put.
+      const decision = computeDecision(classification, table, 'mid/chat');
+      expect(decision).toEqual({
+        model: 'mid/chat',
+        tier: 'low',
+        source: 'benchmark',
+        tableVersion: 'run-1',
+        // The incumbent's benchmarked effort, not the fresh pick's.
+        reasoningEffort: 'medium',
+        sticky: true,
+      });
+    });
+    it('keeps the incumbent at the exact switch-cost boundary', () => {
+      // Strict comparison: switch only when fresh * factor < incumbent.
+      // Integer costs avoid float noise on the equality case (1 * 3 === 3).
+      const boundaryTable: RoutingTable = {
+        ...table,
+        tiers: {
+          ...table.tiers,
+          low: [
+            { ...table.tiers.low[0]!, model: 'fresh/chat', avgCostUsd: 1 },
+            { ...table.tiers.low[1]!, model: 'incumbent/chat', avgCostUsd: 3 },
+          ],
+        },
+      };
+      const decision = computeDecision(classification, boundaryTable, 'incumbent/chat');
+      expect(decision).toMatchObject({ model: 'incumbent/chat', sticky: true });
+    });
+    it('switches when the fresh pick is cheaper by more than the factor', () => {
+      // pricey/chat at 0.02 vs fresh 0.002 * 3 = 0.006: switch pays off.
+      const decision = computeDecision(classification, table, 'pricey/chat');
+      expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
+    });
+    it('switches when the incumbent no longer meets the tier threshold', () => {
+      const decision = computeDecision(classification, table, 'weak/chat');
+      expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
+    });
+    it('serves the fresh pick when the incumbent is not in the tier', () => {
+      const decision = computeDecision(classification, table, 'gone/model');
+      expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
+    });
+    it('is not sticky when the incumbent is the fresh pick', () => {
+      const decision = computeDecision(classification, table, 'cheap/chat');
+      expect(decision).toMatchObject({ model: 'cheap/chat', sticky: false });
+    });
+  });
+});
diff --git a/services/auto-routing/src/decision-engine.ts b/services/auto-routing/src/decision-engine.ts
new file mode 100644
index 0000000000..0d641e069d
--- /dev/null
+++ b/services/auto-routing/src/decision-engine.ts
@@ -0,0 +1,54 @@
+import {
+  deriveDifficultyTier,
+  type AutoRoutingDecision,
+  type ClassifierOutput,
+  type RoutingTable,
+} from '@kilocode/auto-routing-contracts';
+
+// Picks the model to serve for a classified request, or null when there is
+// no routing table. The fresh pick is the first candidate listed for the
+// tier derived from the classification; incumbentModel (the model the
+// session was last served, if any) is kept instead while it still qualifies
+// for that tier and the fresh pick is not cheaper by more than
+// switchCostFactor, in which case the decision is flagged sticky.
+export function computeDecision(
+  classification: ClassifierOutput,
+  table: RoutingTable | null,
+  incumbentModel: string | null
+): AutoRoutingDecision | null {
+  if (!table) return null;
+
+  const tier = deriveDifficultyTier(classification);
+  const tierCandidates = table.tiers[tier];
+  // The routing-table schema requires at least one candidate per tier
+  // (.min(1)), so a parsed table always has a fresh pick.
+  const freshPick = tierCandidates[0];
+  const incumbent =
+    incumbentModel === null
+      ? undefined
+      : tierCandidates.find(candidate => candidate.model === incumbentModel);
+
+  // Switching models throws away the provider's prompt cache, and rebuilding
+  // it is billed at full-price input tokens (4-10x cache-read rates) on a
+  // context that dominates agent-session spend. The incumbent is therefore
+  // kept while it still qualifies for the tier, and dropped only when the
+  // fresh pick is cheaper by more than switchCostFactor — the point where
+  // recurring per-turn savings clearly outweigh that one-time penalty.
+  const stickyPick =
+    incumbent &&
+    incumbent.meetsThreshold &&
+    incumbent.model !== freshPick.model &&
+    !(freshPick.avgCostUsd * table.switchCostFactor < incumbent.avgCostUsd)
+      ? incumbent
+      : undefined;
+  const served = stickyPick ?? freshPick;
+
+  return {
+    model: served.model,
+    tier,
+    source: table.source,
+    tableVersion: table.version,
+    reasoningEffort: served.reasoningEffort ?? null,
+    sticky: stickyPick !== undefined,
+  };
+}
diff --git a/services/auto-routing/src/index.test.ts b/services/auto-routing/src/index.test.ts
index 89b9ba675c..4519c7c310 100644
--- a/services/auto-routing/src/index.test.ts
+++ b/services/auto-routing/src/index.test.ts
@@ -1,5 +1,6 @@
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
 import { clearClassifierConfigCache } from './classifier-config';
+import { clearRoutingTableCache } from './routing-table';
 import { app } from './index';
 import { ClassifierRunError } from './model-classifier';
 import type * as ModelClassifierModule from './model-classifier';
@@ -13,11 +14,13 @@ vi.mock('./model-classifier', async importOriginal => {
 
 const writeDataPoint = vi.fn();
 const configGet = vi.fn();
+const configDelete = vi.fn();
 const configPut = vi.fn();
 const analyticsTokenGet = vi.fn();
 const cacheGetEntry = vi.fn();
 const cachePutEntry = vi.fn();
 const cacheIdFromName = vi.fn(() => 'cache-do-id');
+const benchmarkFetch = vi.fn();
 const originalFetch = globalThis.fetch;
 const mockedFetch = vi.fn<typeof globalThis.fetch>();
 
@@ -27,8 +30,12 @@ const env = {
   },
   AUTO_ROUTING_CONFIG: {
     get: configGet,
+    delete: configDelete,
     put: configPut,
   },
+  BENCHMARK_SERVICE: {
+    fetch: benchmarkFetch,
+  },
   AUTO_ROUTING_CLASSIFIER_METRICS_V2: {
     writeDataPoint,
   },
@@ -74,6 +81,53 @@ const normalizedInput = {
   },
 };
 
+const benchmarkRoutingTable = {
+  version: 'bench-run-1',
+  generatedAt: '2026-06-12T00:00:00.000Z',
+  minAccuracy: 0.7,
+  switchCostFactor: 3,
+  source: 'benchmark',
+  tiers: {
+    low: [
+      {
+        model: 'google/gemini-2.5-flash-lite',
+        accuracy: 0.9,
+        avgCostUsd: 0.001,
+        meetsThreshold: true,
+        reasoningEffort: null,
+      },
+    ],
+    medium: [
+      {
+        model: 'google/gemini-2.5-flash',
+        accuracy: 0.85,
+        avgCostUsd: 0.002,
+        meetsThreshold: true,
+        reasoningEffort: null,
+      },
+      // The high-tier model also qualifies for medium, within the 3x
+      // switch-cost factor of the fresh pick (0.002 * 3 >= 0.005): a session
+      // de-escalating from high stays on it.
+      {
+        model: 'anthropic/claude-sonnet-4.6',
+        accuracy: 0.8,
+        avgCostUsd: 0.005,
+        meetsThreshold: true,
+        reasoningEffort: null,
+      },
+    ],
+    high: [
+      {
+        model: 'anthropic/claude-sonnet-4.6',
+        accuracy: 0.8,
+        avgCostUsd: 0.01,
+        meetsThreshold: true,
+        reasoningEffort: null,
+      },
+    ],
+  },
+};
+
 function mirrorPayload(overrides: Record<string, unknown> = {}) {
   return {
     input: normalizedInput,
@@ -117,11 +171,32 @@ function decideRequest(payload: unknown) {
 describe('auto routing worker', () => {
   beforeEach(() => {
     clearClassifierConfigCache();
+    clearRoutingTableCache();
     classifyNormalizedInput.mockReset();
     classifyNormalizedInput.mockResolvedValue(mockClassifierResult);
     writeDataPoint.mockReset();
     configGet.mockReset();
+    // Real KV returns null for missing keys; an undefined here would send the
+    // routing-table loader down the JSON.parse-throw path instead.
+    configGet.mockResolvedValue(null);
+    configDelete.mockReset();
+    configDelete.mockResolvedValue(undefined);
     configPut.mockReset();
+    configPut.mockResolvedValue(undefined);
+    benchmarkFetch.mockReset();
+    benchmarkFetch.mockImplementation(async (url: string) => {
+      if (String(url).includes('/admin/classifier-winner')) {
+        return { ok: true, status: 200, json: async () => ({ winner: null }) };
+      }
+      return {
+        ok: true,
+        status: 200,
+        json: async () => ({
+          table: benchmarkRoutingTable,
+          publishedAt: benchmarkRoutingTable.generatedAt,
+        }),
+      };
+    });
     analyticsTokenGet.mockReset();
     analyticsTokenGet.mockResolvedValue('analytics-token');
     cacheGetEntry.mockReset();
@@ -158,7 +233,14 @@ describe('auto routing worker', () => {
     expect(response.status).toBe(200);
     await expect(response.json()).resolves.toEqual({
       cost: 0.00000123,
-      decision: null,
+      decision: {
+        model: expect.any(String),
+        tier: expect.stringMatching(/^(low|medium|high)$/),
+        source: 'benchmark',
+        tableVersion: 'bench-run-1',
+        reasoningEffort: null,
+        sticky: false,
+      },
       classifierResult: {
         classification: mockClassification,
         normalized: normalizedInput,
@@ -201,6 +283,7 @@ describe('auto routing worker', () => {
       mode: 'code',
       uaPrefix: 'Kilo-Code/4.106.0',
       bodyBytes: 2048,
+      sticky: false,
     });
     // The raw user id (which embeds the client IP for anonymous users) must
     // never reach persisted logs.
@@ -215,12 +298,22 @@ describe('auto routing worker', () => {
     expect(response.status).toBe(200);
     await expect(response.json()).resolves.toMatchObject({
       cost: 0,
-      decision: null,
+      decision: {
+        model: expect.any(String),
+        tier: expect.stringMatching(/^(low|medium|high)$/),
+        source: 'benchmark',
+        tableVersion: 'bench-run-1',
+        reasoningEffort: null,
+        sticky: false,
+      },
       classifierResult: { classification: mockClassification },
     });
     expect(cacheIdFromName).toHaveBeenCalledWith('user:user-1:task:task-123');
     expect(classifyNormalizedInput).not.toHaveBeenCalled();
-    expect(cachePutEntry).not.toHaveBeenCalled();
+    // The classification is not re-cached; only the served model is
+    // remembered for session stickiness.
+    expect(cachePutEntry).toHaveBeenCalledTimes(1);
+    expect(cachePutEntry).toHaveBeenCalledWith('sticky', { model: expect.any(String) });
     expect(writeDataPoint).toHaveBeenCalledWith(
       expect.objectContaining({
         doubles: [0, 0, mockClassification.confidence, 1],
@@ -238,6 +331,44 @@ describe('auto routing worker', () => {
     );
   });
 
+  it('keeps the session on the incumbent model when the tier de-escalates', async () => {
+    // Back the mocked DO stub with real storage so the sticky model written
+    // by the first request is visible to the second.
+    const store = new Map<string, unknown>();
+    cacheGetEntry.mockImplementation(async (key: string) => store.get(key) ?? null);
+    cachePutEntry.mockImplementation(async (key: string, value: unknown) => {
+      store.set(key, value);
+    });
+
+    classifyNormalizedInput.mockResolvedValueOnce({
+      ...mockClassifierResult,
+      classification: {
+        ...mockClassification,
+        reasoningComplexity: 'high',
+        contextComplexity: 'large',
+        executionMode: 'multi_step_project',
+      },
+    });
+    const first = await decideRequest(mirrorPayload());
+    expect(first.status).toBe(200);
+    await expect(first.json()).resolves.toMatchObject({
+      decision: { model: 'anthropic/claude-sonnet-4.6', tier: 'high', sticky: false },
+    });
+
+    // The second turn (different prompt, same session) classifies as medium.
+    // The fresh medium pick is cheaper, but not by more than the switch-cost
+    // factor, so the session keeps its incumbent.
+    const second = await decideRequest(
+      mirrorPayload({
+        input: { ...normalizedInput, userPromptPrefix: 'Now a much easier follow-up.' },
+      })
+    );
+    expect(second.status).toBe(200);
+    await expect(second.json()).resolves.toMatchObject({
+      decision: { model: 'anthropic/claude-sonnet-4.6', tier: 'medium', sticky: true },
+    });
+  });
+
   it('falls back to a machine-scoped conversation key without a session id', async () => {
     const response = await decideRequest(mirrorPayload({ sessionId: null }));
 
@@ -326,6 +457,29 @@ describe('auto routing worker', () => {
       ],
       doubles: [expect.any(Number), 0.00000123, 0, 0],
     });
+    // A heuristic fallback classification is served but must not re-anchor
+    // the session's sticky model (same rule as the classification cache).
+    expect(cachePutEntry).not.toHaveBeenCalledWith('sticky', expect.anything());
+  });
+
+  it('makes no decision when no routing table is published', async () => {
+    benchmarkFetch.mockImplementation(async (url: string) => {
+      if (String(url).includes('/admin/classifier-winner')) {
+        return { ok: true, status: 200, json: async () => ({ winner: null }) };
+      }
+      return { ok: true, status: 200, json: async () => ({ table: null, publishedAt: null }) };
+    });
+
+    const response = await decideRequest(mirrorPayload());
+
+    expect(response.status).toBe(200);
+    await expect(response.json()).resolves.toMatchObject({
+      cost: 0.00000123,
+      decision: null,
+      classifierResult: { classification: mockClassification },
+    });
+    // A null decision must not overwrite the session's sticky model.
+    expect(cachePutEntry).not.toHaveBeenCalledWith('sticky', expect.anything());
   });
 
   it('returns a null classifier result when the classifier request fails', async () => {
@@ -448,8 +602,10 @@ describe('auto routing worker', () => {
     expect(classifyNormalizedInput).not.toHaveBeenCalled();
   });
 
-  it('returns the configured classifier model', async () => {
-    configGet.mockResolvedValueOnce('google/gemini-2.5-flash-lite');
+  it('returns the override as the effective classifier model', async () => {
+    configGet.mockImplementation(key =>
+      Promise.resolve(key === 'classifier_model' ? 'google/gemini-2.5-flash-lite' : null)
+    );
 
     const response = await request('/admin/classifier-model', {
       headers: { authorization: 'Bearer classifier-token' },
@@ -458,12 +614,48 @@ describe('auto routing worker', () => {
     expect(response.status).toBe(200);
     await expect(response.json()).resolves.toEqual({
       model: 'google/gemini-2.5-flash-lite',
+      override: 'google/gemini-2.5-flash-lite',
+      benchmarkWinner: null,
       defaultModel: 'google/gemini-2.5-flash-lite',
     });
     expect(configGet).toHaveBeenCalledWith('classifier_model');
   });
 
-  it('updates the configured classifier model', async () => {
+  it('falls back to the benchmark winner when no override is set', async () => {
+    configGet.mockImplementation(key =>
+      Promise.resolve(
+        key === 'classifier_benchmark_winner'
+          ? JSON.stringify({
+              model: 'qwen/qwen3.7-plus',
+              runId: 'classifier-run-1',
+              accuracy: 0.93,
+              generatedAt: '2026-06-12T00:00:00.000Z',
+            })
+          : null
+      )
+    );
+
+    const response = await request('/admin/classifier-model', {
+      headers: { authorization: 'Bearer classifier-token' },
+    });
+
+    expect(response.status).toBe(200);
+    await expect(response.json()).resolves.toEqual({
+      model: 'qwen/qwen3.7-plus',
+      override: null,
+      benchmarkWinner: 'qwen/qwen3.7-plus',
+      defaultModel: 'google/gemini-2.5-flash-lite',
+    });
+  });
+
+  it('updates the classifier model override', async () => {
+    const stored = new Map<string, string>();
+    configGet.mockImplementation(key => Promise.resolve(stored.get(key) ?? null));
+    configPut.mockImplementation((key, value) => {
+      stored.set(key, value);
+      return Promise.resolve();
+    });
+
     const response = await request('/admin/classifier-model', {
       method: 'PUT',
       headers: {
@@ -476,11 +668,33 @@ describe('auto routing worker', () => {
     expect(response.status).toBe(200);
     await expect(response.json()).resolves.toEqual({
       model: 'google/gemini-2.5-flash-lite:free',
+      override: 'google/gemini-2.5-flash-lite:free',
+      benchmarkWinner: null,
       defaultModel: 'google/gemini-2.5-flash-lite',
     });
     expect(configPut).toHaveBeenCalledWith('classifier_model', 'google/gemini-2.5-flash-lite:free');
   });
 
+  it('clears the override when model is null', async () => {
+    const response = await request('/admin/classifier-model', {
+      method: 'PUT',
+      headers: {
+        authorization: 'Bearer classifier-token',
+        'content-type': 'application/json',
+      },
+      body: JSON.stringify({ model: null }),
+    });
+
+    expect(response.status).toBe(200);
+    await expect(response.json()).resolves.toEqual({
+      model: 'google/gemini-2.5-flash-lite',
+      override: null,
+      benchmarkWinner: null,
+      defaultModel: 'google/gemini-2.5-flash-lite',
+    });
+    expect(configDelete).toHaveBeenCalledWith('classifier_model');
+  });
+
   it('rejects blank classifier model updates', async () => {
     const response = await request('/admin/classifier-model', {
       method: 'PUT',
diff --git a/services/auto-routing/src/kv-read-through.test.ts b/services/auto-routing/src/kv-read-through.test.ts
new file mode 100644
index 0000000000..2b154d4959
--- /dev/null
+++ b/services/auto-routing/src/kv-read-through.test.ts
@@ -0,0 +1,115 @@
+import { describe, expect, it, vi } from 'vitest';
+import { kvReadThrough } from './kv-read-through';
+
+function makeKv(value: string | null): { kv: KVNamespace; put: ReturnType<typeof vi.fn> } {
+  const put = vi.fn(async () => {}); // spy handed back so tests can assert on cache writes
+  const kv = {
+    get: vi.fn(async () => value), // every key resolves to the same canned value (null = miss)
+    put,
+  } as unknown as KVNamespace; // only get/put are stubbed; no other KV method exists here
+  return { kv, put };
+}
+
+describe('kvReadThrough', () => {
+  it('returns cached value on KV hit without calling origin', async () => {
+    const value = { model: 'test/model', accuracy: 0.9 };
+    const { kv, put } = makeKv(JSON.stringify(value));
+    const fetchOrigin = vi.fn(async () => value);
+
+    const result = await kvReadThrough({
+      kv,
+      key: 'test-key',
+      ttlSeconds: 300,
+      fetchOrigin,
+      parse: raw => JSON.parse(raw) as typeof value,
+    });
+
+    expect(result).toEqual(value);
+    expect(fetchOrigin).not.toHaveBeenCalled(); // a hit returns before origin is consulted
+    expect(put).not.toHaveBeenCalled(); // and nothing is written back
+  });
+
+  it('treats a corrupt KV value as a miss, fetches from origin, and writes back with expirationTtl', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const { kv, put } = makeKv('not valid json {{{');
+    const origin = { model: 'origin/model', accuracy: 0.8 };
+    const fetchOrigin = vi.fn(async () => origin);
+
+    const result = await kvReadThrough({
+      kv,
+      key: 'corrupt-key',
+      ttlSeconds: 3600,
+      fetchOrigin,
+      parse: raw => {
+        try {
+          return JSON.parse(raw) as typeof origin;
+        } catch {
+          return null; // null is the parser's signal for a corrupt value
+        }
+      },
+    });
+
+    expect(result).toEqual(origin);
+    expect(fetchOrigin).toHaveBeenCalledOnce();
+    expect(put).toHaveBeenCalledWith('corrupt-key', JSON.stringify(origin), {
+      expirationTtl: 3600,
+    });
+    expect(warn).toHaveBeenCalled(); // corrupt hits are reported through console.warn
+    warn.mockRestore();
+  });
+
+  it('fetches from origin on KV miss and writes back with expirationTtl', async () => {
+    const { kv, put } = makeKv(null);
+    const origin = { model: 'from/origin', accuracy: 0.95 };
+    const fetchOrigin = vi.fn(async () => origin);
+
+    const result = await kvReadThrough({
+      kv,
+      key: 'missing-key',
+      ttlSeconds: 3600,
+      fetchOrigin,
+      parse: raw => JSON.parse(raw) as typeof origin, // not invoked: KV returned null
+    });
+
+    expect(result).toEqual(origin);
+    expect(fetchOrigin).toHaveBeenCalledOnce();
+    expect(put).toHaveBeenCalledWith('missing-key', JSON.stringify(origin), {
+      expirationTtl: 3600,
+    });
+  });
+
+  it('returns null and does NOT write to KV when origin returns null', async () => {
+    const { kv, put } = makeKv(null);
+    const fetchOrigin = vi.fn(async () => null);
+
+    const result = await kvReadThrough({
+      kv,
+      key: 'empty-key',
+      ttlSeconds: 3600,
+      fetchOrigin,
+      parse: raw => JSON.parse(raw) as Record<string, unknown>,
+    });
+
+    expect(result).toBeNull();
+    expect(put).not.toHaveBeenCalled(); // a null origin result is never cached
+  });
+
+  it('propagates origin errors without writing to KV', async () => {
+    const { kv, put } = makeKv(null);
+    const fetchOrigin = vi.fn(async () => {
+      throw new Error('origin unavailable');
+    });
+
+    await expect(
+      kvReadThrough({
+        kv,
+        key: 'throw-key',
+        ttlSeconds: 3600,
+        fetchOrigin,
+        parse: raw => JSON.parse(raw) as Record<string, unknown>,
+      })
+    ).rejects.toThrow('origin unavailable');
+
+    expect(put).not.toHaveBeenCalled(); // the rejection happens before any cache write
+  });
+});
diff --git a/services/auto-routing/src/kv-read-through.ts b/services/auto-routing/src/kv-read-through.ts
new file mode 100644
index 0000000000..96300c91e2
--- /dev/null
+++ b/services/auto-routing/src/kv-read-through.ts
@@ -0,0 +1,41 @@
+// Generic read-through cache over a KV namespace; resolves to the value or null.
+// Hit: `parse` validates the raw string; a null result means corrupt and is handled as a miss.
+// Miss: `fetchOrigin` runs; a non-null result is stored with `expirationTtl: ttlSeconds`.
+// Origin null → no KV write; origin (or `parse`) throwing → propagates to the caller.
+export async function kvReadThrough<T>(options: {
+  kv: KVNamespace;
+  key: string;
+  ttlSeconds: number; // passed to kv.put as expirationTtl
+  fetchOrigin: () => Promise<T | null>;
+  parse: (raw: string) => T | null; // must return null (not throw) to flag a corrupt value
+  serialize?: (value: T) => string; // defaults to JSON.stringify
+}): Promise<T | null> {
+  const { kv, key, ttlSeconds, fetchOrigin, parse, serialize = JSON.stringify } = options;
+
+  const raw = await kv.get(key);
+  if (raw !== null) {
+    const parsed = parse(raw); // not wrapped in try/catch: a throwing parser rejects this call
+    if (parsed !== null) {
+      return parsed;
+    }
+    console.warn(JSON.stringify({ event: 'kv_read_through_corrupt', key }));
+  }
+
+  // Miss (or corrupt value treated as miss): fetch from origin.
+  const value = await fetchOrigin();
+  if (value === null) {
+    return null; // nothing to cache
+  }
+
+  // Awaited: an unawaited promise without waitUntil may be cancelled when the
+  // request ends, silently dropping the cache write. A put failure must not
+  // discard the value we already fetched, so it only warns.
+  await kv
+    .put(key, serialize(value), { expirationTtl: ttlSeconds })
+    .catch((error: unknown) =>
+      console.warn(
+        JSON.stringify({ event: 'kv_read_through_put_failed', key, error: String(error) })
+      )
+    );
+  return value; // returned even when the put above failed
+}
diff --git a/services/auto-routing/src/model-classifier.ts b/services/auto-routing/src/model-classifier.ts
index 94d7f672cf..e9a9898f13 100644
--- a/services/auto-routing/src/model-classifier.ts
+++ b/services/auto-routing/src/model-classifier.ts
@@ -1,81 +1,22 @@
-import type { OpenRouter } from '@openrouter/sdk';
-import type { ChatResult } from '@openrouter/sdk/models';
-import { buildClassifierMessages, CLASSIFIER_MAX_TOKENS } from './classifier-prompt';
+import { classifyWithOpenRouter } from '@kilocode/auto-routing-contracts/classifier';
+import type {
+  ClassifierCallOptions,
+  ClassifierRunResult,
+} from '@kilocode/auto-routing-contracts/classifier';
 import type { NormalizedClassifierInput } from '@kilocode/auto-routing-contracts';
-import {
-  ClassifierOutputParseError,
-  parseClassifierOutput,
-  type ClassifierOutput,
-} from './classifier-output';
-import { fallbackClassifierOutput } from './classifier-output/fallback';
 import { createOpenRouterClient } from './openrouter';
 
-export type ClassifierRunResult = {
-  cost: number | null;
-  classifierModel: string;
-  classification: ClassifierOutput;
-  fallback?: ClassifierRunFallbackMetadata;
-  modelCallMeta?: ClassifierModelCallMeta;
-  retried?: boolean;
-  // Why the first attempt was retried; present only when retried is true.
-  firstAttemptFailure?: {
-    reason: string;
-    failureStage: string | null;
-    finishReason: string | null;
-  };
-};
-
-export type ClassifierModelCallMeta = {
-  finishReason: string | null;
-  completionTokens: number | null;
-  reasoningTokens: number | null;
-  // Length only — the raw output is derived from untrusted, mirrored user
-  // prompts and must not reach persistent logs. Combined with finishReason
-  // and token counts this still distinguishes truncation from prompt echo.
-  textLength: number | null;
-};
-
-export type ClassifierRunFailureMetadata = {
-  cost: number | null;
-  classifierModel: string;
-  failureStage?: string;
-  schemaIssueSummary?: string[];
-  topLevelKeys?: string[];
-};
-
-export type ClassifierRunFallbackMetadata = {
-  reason: 'no_text' | 'invalid_output';
-  failureStage?: string;
-  schemaIssueSummary?: string[];
-  topLevelKeys?: string[];
-};
-
-export class ClassifierRunError extends Error {
-  readonly cost: number | null;
-  readonly classifierModel: string;
-  readonly failureStage?: string;
-  readonly schemaIssueSummary: string[];
-  readonly topLevelKeys: string[];
-
-  constructor(message: string, metadata: ClassifierRunFailureMetadata) {
-    super(message);
-    this.name = 'ClassifierRunError';
-    this.cost = metadata.cost;
-    this.classifierModel = metadata.classifierModel;
-    this.failureStage = metadata.failureStage;
-    this.schemaIssueSummary = metadata.schemaIssueSummary ?? [];
-    this.topLevelKeys = metadata.topLevelKeys ?? [];
-  }
-}
+export {
+  ClassifierRunError,
+  classifyWithOpenRouter,
+} from '@kilocode/auto-routing-contracts/classifier';
+export type {
+  ClassifierCallOptions,
+  ClassifierRunResult,
+} from '@kilocode/auto-routing-contracts/classifier';
 
 type ClassifierEnv = Pick<Env, 'OPENROUTER_API_KEY'>;
 
-export type ClassifierCallOptions = {
-  // Sticky routing key passed to OpenRouter so requests from the same
-  // session land on the same provider and reuse its prompt cache.
-  openrouterSessionId?: string;
-};
-
 export async function classifyNormalizedInput(
   env: ClassifierEnv,
   input: NormalizedClassifierInput,
@@ -85,133 +26,3 @@ export async function classifyNormalizedInput(
   const client = await createOpenRouterClient(env);
   return classifyWithOpenRouter(client, input, classifierModel, options);
 }
-
-export async function classifyWithOpenRouter(
-  client: OpenRouter,
-  input: NormalizedClassifierInput,
-  classifierModel: string,
-  options: ClassifierCallOptions = {}
-): Promise<ClassifierRunResult> {
-  // Invalid output is usually a transient provider glitch (responses cut
-  // off after a handful of tokens with a "stop" finish reason), so one
-  // retry recovers most of those classifications.
-  const firstAttempt = await runClassifierAttempt(client, input, classifierModel, options);
-  if (!firstAttempt.fallback) {
-    return firstAttempt;
-  }
-
-  let retryAttempt: ClassifierRunResult;
-  try {
-    retryAttempt = await runClassifierAttempt(client, input, classifierModel, options);
-  } catch (error) {
-    // The retry threw (e.g. a transport error) after the first attempt had
-    // already billed and produced diagnostics. Surface those rather than
-    // letting the raw error escape and underreport spend.
-    throw new ClassifierRunError(
-      error instanceof Error ? error.message : 'classifier retry failed',
-      {
-        cost: firstAttempt.cost,
-        classifierModel,
-        failureStage: firstAttempt.fallback.failureStage ?? firstAttempt.fallback.reason,
-        schemaIssueSummary: firstAttempt.fallback.schemaIssueSummary,
-        topLevelKeys: firstAttempt.fallback.topLevelKeys,
-      }
-    );
-  }
-  return {
-    ...retryAttempt,
-    cost: sumCosts(firstAttempt.cost, retryAttempt.cost),
-    retried: true,
-    firstAttemptFailure: {
-      reason: firstAttempt.fallback.reason,
-      failureStage: firstAttempt.fallback.failureStage ?? null,
-      finishReason: firstAttempt.modelCallMeta?.finishReason ?? null,
-    },
-  };
-}
-
-function sumCosts(first: number | null, second: number | null): number | null {
-  if (first === null && second === null) return null;
-  return (first ?? 0) + (second ?? 0);
-}
-
-async function runClassifierAttempt(
-  client: OpenRouter,
-  input: NormalizedClassifierInput,
-  classifierModel: string,
-  options: ClassifierCallOptions
-): Promise<ClassifierRunResult> {
-  const result = await client.chat.send({
-    chatRequest: {
-      model: classifierModel,
-      messages: buildClassifierMessages(input),
-      responseFormat: { type: 'json_object' },
-      stream: false,
-      temperature: 0,
-      maxTokens: CLASSIFIER_MAX_TOKENS,
-      ...(options.openrouterSessionId ? { sessionId: options.openrouterSessionId } : {}),
-    },
-  });
-
-  const cost = result.usage?.cost ?? null;
-  const text = extractClassifierText(result);
-  const modelCallMeta = extractModelCallMeta(result, text);
-  if (!text) {
-    return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, {
-      reason: 'no_text',
-    });
-  }
-
-  try {
-    return {
-      cost,
-      classifierModel,
-      classification: parseClassifierOutput(text),
-      modelCallMeta,
-    };
-  } catch (error) {
-    return fallbackClassifierResult(input, classifierModel, cost, modelCallMeta, {
-      reason: 'invalid_output',
-      ...(error instanceof ClassifierOutputParseError
-        ? {
-            failureStage: error.failureStage,
-            schemaIssueSummary: error.schemaIssueSummary,
-            topLevelKeys: error.topLevelKeys,
-          }
-        : {}),
-    });
-  }
-}
-
-function extractModelCallMeta(result: ChatResult, text: string | null): ClassifierModelCallMeta {
-  return {
-    finishReason: result.choices[0]?.finishReason ?? null,
-    completionTokens: result.usage?.completionTokens ?? null,
-    reasoningTokens: result.usage?.completionTokensDetails?.reasoningTokens ?? null,
-    textLength: text?.length ?? null,
-  };
-}
-
-function fallbackClassifierResult(
-  input: NormalizedClassifierInput,
-  classifierModel: string,
-  cost: number | null,
-  modelCallMeta: ClassifierModelCallMeta,
-  fallback: ClassifierRunFallbackMetadata
-): ClassifierRunResult {
-  return {
-    cost,
-    classifierModel,
-    classification: fallbackClassifierOutput(input),
-    fallback,
-    modelCallMeta,
-  };
-}
-
-function extractClassifierText(result: ChatResult) {
-  const content: unknown = result.choices[0]?.message.content;
-  if (typeof content === 'string' && content.trim().length > 0) {
-    return content;
-  }
-  return null;
-}
diff --git a/services/auto-routing/src/openrouter.ts b/services/auto-routing/src/openrouter.ts
index 4d8608d6f5..8d48367720 100644
--- a/services/auto-routing/src/openrouter.ts
+++ b/services/auto-routing/src/openrouter.ts
@@ -1,5 +1,5 @@
 import { OpenRouter } from '@openrouter/sdk';
-import { ttlCached } from './ttl-cache';
+import { ttlCached } from '@kilocode/worker-utils';
 
 type OpenRouterEnv = Pick<Env, 'OPENROUTER_API_KEY'>;
 
diff --git a/services/auto-routing/src/routing-table.test.ts b/services/auto-routing/src/routing-table.test.ts
new file mode 100644
index 0000000000..be60e909ab
--- /dev/null
+++ b/services/auto-routing/src/routing-table.test.ts
@@ -0,0 +1,156 @@
+import { afterEach, describe, expect, it, vi } from 'vitest';
+import type { RoutingTable } from '@kilocode/auto-routing-contracts';
+import { clearRoutingTableCache, getRoutingTable } from './routing-table';
+
+const SAMPLE_TABLE: RoutingTable = {
+  version: 'bench-run-1', // compared against the parsed result in the caching test
+  generatedAt: '2026-06-12T00:00:00.000Z',
+  minAccuracy: 0.7,
+  switchCostFactor: 3,
+  source: 'benchmark',
+  tiers: {
+    low: [
+      {
+        model: 'google/gemini-2.5-flash-lite',
+        accuracy: 0.9,
+        avgCostUsd: 0.001,
+        meetsThreshold: true,
+        reasoningEffort: null,
+      },
+    ],
+    medium: [
+      {
+        model: 'google/gemini-2.5-flash',
+        accuracy: 0.85,
+        avgCostUsd: 0.002,
+        meetsThreshold: true,
+        reasoningEffort: null,
+      },
+    ],
+    high: [
+      {
+        model: 'anthropic/claude-sonnet-4.6',
+        accuracy: 0.8,
+        avgCostUsd: 0.01,
+        meetsThreshold: true,
+        reasoningEffort: null,
+      },
+    ],
+  }, // fixture keeps exactly one candidate per tier
+};
+
+type KvStub = Pick<Env, 'AUTO_ROUTING_CONFIG' | 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'>;
+
+function makeEnv(
+  kvValue: string | null,
+  opts: {
+    onGet?: () => void; // invoked on every KV get
+    onPut?: (key: string, value: string, options: unknown) => void; // invoked on every KV put
+    originTable?: unknown; // table served by the origin stub; omitted → { table: null }
+    originStatus?: number; // HTTP status of the origin stub; defaults to 200
+    originThrow?: boolean; // make the origin fetch reject
+  } = {}
+): KvStub {
+  return {
+    AUTO_ROUTING_CONFIG: {
+      get: async () => {
+        opts.onGet?.();
+        return kvValue;
+      },
+      put: async (key: string, value: string, options: unknown) => {
+        opts.onPut?.(key, value, options);
+      },
+    },
+    BENCHMARK_SERVICE: {
+      fetch: async () => {
+        if (opts.originThrow) throw new Error('benchmark unavailable');
+        return {
+          ok: opts.originStatus === undefined ? true : opts.originStatus < 400, // >= 400 → not ok
+          status: opts.originStatus ?? 200,
+          json: async () =>
+            opts.originTable !== undefined
+              ? { table: opts.originTable, publishedAt: '2026-06-11T00:00:00.000Z' }
+              : { table: null, publishedAt: null },
+        };
+      },
+    },
+    INTERNAL_API_SECRET_PROD: {
+      get: async () => 'test-secret',
+    },
+  } as unknown as KvStub; // partial stubs: only the members defined above exist
+}
+
+afterEach(() => clearRoutingTableCache());
+
+describe('getRoutingTable', () => {
+  it('returns null when the key is missing and origin has no table', async () => {
+    expect(await getRoutingTable(makeEnv(null))).toBeNull();
+  });
+
+  it('returns null when the stored JSON is invalid and origin has no table', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    expect(await getRoutingTable(makeEnv('{"nope":true}'))).toBeNull(); // valid JSON, wrong shape
+    clearRoutingTableCache(); // reset the cache before probing the second env
+    expect(await getRoutingTable(makeEnv('not json at all'))).toBeNull(); // JSON.parse throws
+    warn.mockRestore();
+  });
+
+  it('parses and caches a valid stored table without calling origin', async () => {
+    let reads = 0;
+    const fetchSpy = vi.fn(async () => ({
+      ok: true,
+      status: 200,
+      json: async () => ({ table: null, publishedAt: null }),
+    }));
+    const env: KvStub = {
+      AUTO_ROUTING_CONFIG: {
+        get: async () => {
+          reads++;
+          return JSON.stringify(SAMPLE_TABLE);
+        },
+        put: async () => {},
+      },
+      BENCHMARK_SERVICE: { fetch: fetchSpy },
+      INTERNAL_API_SECRET_PROD: { get: async () => 'secret' },
+    } as unknown as KvStub;
+
+    const first = await getRoutingTable(env);
+    await getRoutingTable(env);
+    expect(first?.version).toBe(SAMPLE_TABLE.version);
+    expect(reads).toBe(1); // the second call was served without another KV read
+    expect(fetchSpy).not.toHaveBeenCalled();
+  });
+
+  it('fetches from origin on KV miss, writes to KV with expirationTtl, and returns the table', async () => {
+    const puts: Array<{ key: string; value: string; options: unknown }> = [];
+    const env = makeEnv(null, {
+      originTable: SAMPLE_TABLE,
+      onPut: (key, value, options) => puts.push({ key, value, options }),
+    });
+
+    const result = await getRoutingTable(env);
+    expect(result).toEqual(SAMPLE_TABLE);
+    expect(puts).toHaveLength(1);
+    expect(puts[0].key).toBe('routing_table_v1');
+    expect(puts[0].options).toEqual({ expirationTtl: 3600 }); // ttlSeconds in routing-table.ts
+  });
+
+  it('returns null when origin responds non-OK', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const env = makeEnv(null, { originStatus: 500 });
+    expect(await getRoutingTable(env)).toBeNull();
+    warn.mockRestore();
+  });
+
+  it('returns null when origin throws', async () => {
+    const warn = vi.spyOn(console, 'warn').mockImplementation(() => {});
+    const env = makeEnv(null, { originThrow: true });
+    expect(await getRoutingTable(env)).toBeNull();
+    warn.mockRestore();
+  });
+
+  it('returns null when origin returns a null table', async () => {
+    const env = makeEnv(null, { originTable: undefined }); // stub then answers { table: null }
+    expect(await getRoutingTable(env)).toBeNull();
+  });
+});
diff --git a/services/auto-routing/src/routing-table.ts b/services/auto-routing/src/routing-table.ts
new file mode 100644
index 0000000000..5c9ad85b0d
--- /dev/null
+++ b/services/auto-routing/src/routing-table.ts
@@ -0,0 +1,61 @@
+import { formatError, ttlCached } from '@kilocode/worker-utils';
+import {
+  ROUTING_TABLE_KV_KEY,
+  RoutingTableSchema,
+  type RoutingTable,
+} from '@kilocode/auto-routing-contracts';
+import { kvReadThrough } from './kv-read-through';
+import { fetchRoutingTableFromOrigin } from './benchmark-origin';
+
+const ROUTING_TABLE_CACHE_TTL_MS = 60_000;
+
+type RoutingTableEnv = Pick<
+  Env,
+  'AUTO_ROUTING_CONFIG' | 'BENCHMARK_SERVICE' | 'INTERNAL_API_SECRET_PROD'
+>;
+
+const routingTableCache = ttlCached(ROUTING_TABLE_CACHE_TTL_MS, async (env: RoutingTableEnv) => {
+  const table = await kvReadThrough({
+    kv: env.AUTO_ROUTING_CONFIG,
+    key: ROUTING_TABLE_KV_KEY,
+    ttlSeconds: 3600, // forwarded by kvReadThrough as the KV expirationTtl
+    fetchOrigin: () => fetchRoutingTableFromOrigin(env), // runs on a KV miss or corrupt entry
+    parse: raw => {
+      try {
+        const parsed = RoutingTableSchema.safeParse(JSON.parse(raw));
+        if (!parsed.success) {
+          console.warn(
+            JSON.stringify({
+              event: 'auto_routing_table_invalid',
+              issues: parsed.error.issues.slice(0, 5).map(i => `${i.path.join('.')}: ${i.code}`),
+            })
+          );
+          return null; // schema mismatch: kvReadThrough treats null as a corrupt entry
+        }
+        return parsed.data;
+      } catch (error) {
+        console.warn(
+          JSON.stringify({ event: 'auto_routing_table_invalid', ...formatError(error) })
+        );
+        return null; // JSON.parse threw: same corrupt-entry handling
+      }
+    },
+  });
+  return table; // a RoutingTable, or null when neither KV nor origin produced one
+});
+
+export function clearRoutingTableCache(): void {
+  routingTableCache.clear(); // called from routing-table.test.ts (afterEach) to isolate cases
+}
+
+// Resolves to null when no benchmark-published table exists or the read fails
+// (failures are logged, never thrown): /decide then makes no decision and the
+// gateway falls back to its static balanced defaults.
+export function getRoutingTable(env: RoutingTableEnv): Promise<RoutingTable | null> {
+  return routingTableCache.get(env).catch((error: unknown) => {
+    console.warn(
+      JSON.stringify({ event: 'auto_routing_table_read_failed', ...formatError(error) })
+    );
+    return null; // any rejection from the cached loader is mapped to "no table"
+  });
+}
diff --git a/services/auto-routing/worker-configuration.d.ts b/services/auto-routing/worker-configuration.d.ts
index 6b69a65d5b..e91b95e923 100644
--- a/services/auto-routing/worker-configuration.d.ts
+++ b/services/auto-routing/worker-configuration.d.ts
@@ -1,5 +1,5 @@
 /* eslint-disable */
-// Generated by Wrangler by running `wrangler types --include-runtime=false` (hash: 0d84c4429525cf1b432d2ffe636e1ca8)
+// Generated by Wrangler by running `wrangler types --include-runtime=false` (hash: 7e1033a1604c8e567cadb72d6145fa58)
 interface __BaseEnv_Env {
 	AUTO_ROUTING_CONFIG: KVNamespace;
 	AUTO_ROUTING_CLASSIFIER_METRICS_V2: AnalyticsEngineDataset;
@@ -8,6 +8,7 @@ interface __BaseEnv_Env {
 	O11Y_CF_AE_API_TOKEN: SecretsStoreSecret;
 	O11Y_CF_ACCOUNT_ID: "e115e769bcdd4c3d66af59d3332cb394";
 	AUTO_ROUTING_DECISION_CACHE: DurableObjectNamespace<import("./src/index").AutoRoutingDecisionCacheDO>;
+	BENCHMARK_SERVICE: Fetcher /* auto-routing-benchmark */;
 }
 declare namespace Cloudflare {
 	interface GlobalProps {
diff --git a/services/auto-routing/wrangler.jsonc b/services/auto-routing/wrangler.jsonc
index ddcf6d9baa..297c4557e8 100644
--- a/services/auto-routing/wrangler.jsonc
+++ b/services/auto-routing/wrangler.jsonc
@@ -45,6 +45,8 @@
       "dataset": "auto_routing_classifier_metrics_v2",
     },
   ],
+  "services": [{ "binding": "BENCHMARK_SERVICE", "service": "auto-routing-benchmark" }],
+
   "kv_namespaces": [
     {
       "binding": "AUTO_ROUTING_CONFIG",