diff --git a/.github/workflows/deploy-workers.yml b/.github/workflows/deploy-workers.yml index 1a247f091d..1e18146a03 100644 --- a/.github/workflows/deploy-workers.yml +++ b/.github/workflows/deploy-workers.yml @@ -49,6 +49,10 @@ jobs: with: apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} workingDirectory: ${{ inputs.worker }} + # Workers that define a `predeploy` script (e.g. D1 migrations) run it + # right before deploy; all other workers are unaffected. + preCommands: | + if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi command: deploy detect-changes: @@ -150,4 +154,8 @@ jobs: with: apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }} workingDirectory: ${{ matrix.worker }} + # Workers that define a `predeploy` script (e.g. D1 migrations) run it + # right before deploy; all other workers are unaffected. + preCommands: | + if [ "$(jq -r '.scripts.predeploy // empty' package.json)" != "" ]; then pnpm run predeploy; fi command: deploy diff --git a/apps/web/.env.development.local.example b/apps/web/.env.development.local.example index c17511bfe9..ae816bb2c2 100644 --- a/apps/web/.env.development.local.example +++ b/apps/web/.env.development.local.example @@ -19,6 +19,9 @@ AUTO_TRIAGE_URL=http://localhost:8791 # @url auto-routing AUTO_ROUTING_WORKER_URL=http://localhost:8810 +# @url auto-routing-benchmark +AUTO_ROUTING_BENCHMARK_WORKER_URL=http://localhost:8814 + # @url cloudflare-security-sync SECURITY_SYNC_WORKER_URL=http://localhost:8812 diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts new file mode 100644 index 0000000000..e572c4e2a5 --- /dev/null +++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.test.ts @@ -0,0 +1,158 @@ +import { NextRequest } from 'next/server'; +import type { User } from '@kilocode/db'; +import { + getBenchmarkConfig, + updateBenchmarkConfig, +} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client'; +import { getUserFromAuth } from '@/lib/user/server'; +import { findExperimentReservedModelIds } from '@/lib/ai-gateway/experiments/reserved-ids'; +import type { KiloExclusiveModel } from '@/lib/ai-gateway/providers/kilo-exclusive-model'; +import type * as ModelsModule from '@/lib/ai-gateway/models'; + +jest.mock('@/lib/user/server', () => ({ + getUserFromAuth: jest.fn(), +})); + +jest.mock('@/lib/ai-gateway/auto-routing-benchmark-admin-client', () => ({ + getBenchmarkConfig: jest.fn(), + updateBenchmarkConfig: jest.fn(), +})); + +jest.mock('@/lib/ai-gateway/experiments/reserved-ids', () => ({ + findExperimentReservedModelIds: jest.fn(), +})); + +// Stub the catalog so tests don't depend on any specific provider file. +// 'test-exclusive/alibaba-only' maps to the alibaba gateway (chat_completions only). +jest.mock('@/lib/ai-gateway/models', () => { + const actual = jest.requireActual('@/lib/ai-gateway/models'); + const stubModel: KiloExclusiveModel = { + public_id: 'test-exclusive/alibaba-only', + display_name: 'Test Alibaba-only', + description: 'stub for unit tests', + context_length: 8192, + max_completion_tokens: 4096, + status: 'public', + flags: [], + gateway: 'alibaba', + internal_id: 'stub-internal', + pricing: null, + exclusive_to: [], + inference_provider_restriction: [], + }; + return { + ...actual, + findKiloExclusiveModel: (id: string) => + id === 'test-exclusive/alibaba-only' ? stubModel : actual.findKiloExclusiveModel(id), + }; +}); + +import { PUT } from './route'; + +const mockGetUserFromAuth = jest.mocked(getUserFromAuth); +const mockGetBenchmarkConfig = jest.mocked(getBenchmarkConfig); +const mockUpdateBenchmarkConfig = jest.mocked(updateBenchmarkConfig); +const mockFindExperimentReservedModelIds = jest.mocked(findExperimentReservedModelIds); + +// Test-fixture boundary: only the fields the route actually reads. +function adminUserFixture(): User { + return { id: 'admin_123', google_user_email: 'admin@kilocode.ai' } as Partial as User; +} + +function putRequest(body: unknown) { + return new NextRequest('http://localhost:3000/admin/api/auto-routing/benchmark-config', { + method: 'PUT', + body: JSON.stringify(body), + headers: { 'content-type': 'application/json' }, + }); +} + +const validConfig = { + classifierModels: ['google/gemini-2.5-flash-lite'], + deciderModels: [{ id: 'openai/gpt-5-mini', reasoningEffort: null }], + minAccuracy: 0.7, + switchCostFactor: 3, + maxConcurrency: 4, + benchmarkUserId: null, + classifierRepetitions: 1, + deciderRepetitions: 1, + classifierMaxP95LatencyMs: 1000, + updatedAt: null, + updatedBy: null, +}; + +describe('PUT /admin/api/auto-routing/benchmark-config', () => { + beforeEach(() => { + jest.clearAllMocks(); + mockGetUserFromAuth.mockResolvedValue({ + user: adminUserFixture(), + authFailedResponse: null, + }); + mockUpdateBenchmarkConfig.mockResolvedValue({ + status: 200, + body: { config: validConfig }, + }); + mockGetBenchmarkConfig.mockResolvedValue({ status: 200, body: { config: null } }); + mockFindExperimentReservedModelIds.mockResolvedValue([]); + }); + + it('forwards a config whose decider models all serve every gateway chat API', async () => { + const response = await PUT(putRequest(validConfig)); + expect(response.status).toBe(200); + expect(mockUpdateBenchmarkConfig).toHaveBeenCalledWith(validConfig, 'admin@kilocode.ai'); + }); + + it('rejects with 400 listing decider models not servable on all gateway chat APIs', async () => { + const response = await PUT( + putRequest({ + ...validConfig, + deciderModels: [ + { id: 'openai/gpt-5-mini', reasoningEffort: null }, + { id: 'test-exclusive/alibaba-only', reasoningEffort: null }, + ], + }) + ); + + expect(response.status).toBe(400); + const body = (await response.json()) as { error: string }; + expect(body.error).toContain('test-exclusive/alibaba-only'); + expect(body.error).toContain('chat_completions'); + expect(body.error).not.toContain('openai/gpt-5-mini ('); + expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled(); + }); + + it('rejects decider models reserved by a model experiment (any status)', async () => { + // Ownership is status-independent per .specs/model-experiments.md: a public + // id with a draft/active/paused/completed experiment is reserved for + // explicit user selection and must not enter kilo-auto candidate sets. + mockFindExperimentReservedModelIds.mockResolvedValue(['preview/experimental-model']); + + const response = await PUT( + putRequest({ + ...validConfig, + deciderModels: [ + { id: 'openai/gpt-5-mini', reasoningEffort: null }, + { id: 'preview/experimental-model', reasoningEffort: null }, + ], + }) + ); + + expect(response.status).toBe(400); + const body = (await response.json()) as { error: string }; + expect(body.error).toContain('preview/experimental-model'); + expect(body.error).toContain('model-experiment'); + expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled(); + // The check runs against the decider model ids. + expect(mockFindExperimentReservedModelIds).toHaveBeenCalledWith([ + 'openai/gpt-5-mini', + 'preview/experimental-model', + ]); + }); + + it('rejects a schema-invalid config with 400', async () => { + const response = await PUT(putRequest({ classifierModels: 'oops' })); + expect(response.status).toBe(400); + await expect(response.json()).resolves.toEqual({ error: 'Invalid benchmark config' }); + expect(mockUpdateBenchmarkConfig).not.toHaveBeenCalled(); + }); +}); diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts new file mode 100644 index 0000000000..d85f617353 --- /dev/null +++ b/apps/web/src/app/admin/api/auto-routing/benchmark-config/route.ts @@ -0,0 +1,74 @@ +import { BenchmarkConfigSchema } from '@kilocode/auto-routing-contracts'; +import type { NextRequest } from 'next/server'; +import { NextResponse } from 'next/server'; +import { + getBenchmarkConfig, + updateBenchmarkConfig, +} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client'; +import { + gatewayChatApisForModel, + modelServesAllGatewayChatApis, +} from '@/lib/ai-gateway/model-api-kinds'; +import { findExperimentReservedModelIds } from '@/lib/ai-gateway/experiments/reserved-ids'; +import { getUserFromAuth } from '@/lib/user/server'; + +export async function GET() { + const { authFailedResponse } = await getUserFromAuth({ adminOnly: true }); + if (authFailedResponse) return authFailedResponse; + + const result = await getBenchmarkConfig(); + return NextResponse.json(result.body, { status: result.status }); +} + +export async function PUT(request: NextRequest) { + const { authFailedResponse, user } = await getUserFromAuth({ adminOnly: true }); + if (authFailedResponse) return authFailedResponse; + + let rawBody: unknown; + try { + rawBody = await request.json(); + } catch { + return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 }); + } + + const parsed = BenchmarkConfigSchema.safeParse(rawBody); + if (!parsed.success) { + return NextResponse.json({ error: 'Invalid benchmark config' }, { status: 400 }); + } + + // Model-experiment public ids are dedicated preview ids that users must + // explicitly select; per .specs/model-experiments.md they must never enter + // kilo-auto candidate sets, so they can't be saved as decider candidates + // (the routing table feeds kilo-auto/efficient automatic selection). Checked + // across all experiment statuses — ownership, not just routing membership. + const deciderModelIds = parsed.data.deciderModels.map(m => m.id); + const reservedExperimentIds = await findExperimentReservedModelIds(deciderModelIds); + if (reservedExperimentIds.length > 0) { + return NextResponse.json( + { + error: `Decider models must not be model-experiment public ids (reserved for explicit user selection): ${reservedExperimentIds.join(', ')}`, + }, + { status: 400 } + ); + } + + // Routing-table candidates carry no per-protocol metadata, so every decider + // model must be servable on ALL gateway chat API kinds by the provider the + // gateway would route it to. + const unsupported = parsed.data.deciderModels + .map(m => m.id) + .filter(id => !modelServesAllGatewayChatApis(id)) + .map(id => `${id} (supports: ${gatewayChatApisForModel(id).join(', ') || 'none'})`); + if (unsupported.length > 0) { + return NextResponse.json( + { + error: `Decider models must support all gateway chat APIs (chat_completions, responses, messages): ${unsupported.join('; ')}`, + }, + { status: 400 } + ); + } + + const email = user?.google_user_email ?? ''; + const result = await updateBenchmarkConfig(parsed.data, email); + return NextResponse.json(result.body, { status: result.status }); +} diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts new file mode 100644 index 0000000000..26fdc8eef1 --- /dev/null +++ b/apps/web/src/app/admin/api/auto-routing/benchmark-routing-table/route.ts @@ -0,0 +1,11 @@ +import { NextResponse } from 'next/server'; +import { getBenchmarkRoutingTable } from '@/lib/ai-gateway/auto-routing-benchmark-admin-client'; +import { getUserFromAuth } from '@/lib/user/server'; + +export async function GET() { + const { authFailedResponse } = await getUserFromAuth({ adminOnly: true }); + if (authFailedResponse) return authFailedResponse; + + const result = await getBenchmarkRoutingTable(); + return NextResponse.json(result.body, { status: result.status }); +} diff --git a/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts b/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts new file mode 100644 index 0000000000..efbfebdde3 --- /dev/null +++ b/apps/web/src/app/admin/api/auto-routing/benchmark-runs/route.ts @@ -0,0 +1,36 @@ +import { StartBenchmarkRunRequestSchema } from '@kilocode/auto-routing-contracts'; +import type { NextRequest } from 'next/server'; +import { NextResponse } from 'next/server'; +import { + listBenchmarkRuns, + startBenchmarkRun, +} from '@/lib/ai-gateway/auto-routing-benchmark-admin-client'; +import { getUserFromAuth } from '@/lib/user/server'; + +export async function GET() { + const { authFailedResponse } = await getUserFromAuth({ adminOnly: true }); + if (authFailedResponse) return authFailedResponse; + + const result = await listBenchmarkRuns(); + return NextResponse.json(result.body, { status: result.status }); +} + +export async function POST(request: NextRequest) { + const { authFailedResponse } = await getUserFromAuth({ adminOnly: true }); + if (authFailedResponse) return authFailedResponse; + + let rawBody: unknown; + try { + rawBody = await request.json(); + } catch { + return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 }); + } + + const parsed = StartBenchmarkRunRequestSchema.safeParse(rawBody); + if (!parsed.success) { + return NextResponse.json({ error: 'Invalid start benchmark run request' }, { status: 400 }); + } + + const result = await startBenchmarkRun(parsed.data.kind, parsed.data.force); + return NextResponse.json(result.body, { status: result.status }); +} diff --git a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx index d893f27382..f6e262d43d 100644 --- a/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx +++ b/apps/web/src/app/admin/auto-routing/AutoRoutingAdminContent.tsx @@ -11,7 +11,6 @@ import React, { useEffect, useMemo, useState, type ReactNode } from 'react'; import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; import { toast } from 'sonner'; import { BarChart3, Clock3, DollarSign, HelpCircle, RefreshCw, Route, Save } from 'lucide-react'; -import * as z from 'zod'; import { ModelCombobox, type ModelOption } from '@/components/shared/ModelCombobox'; import { Badge } from '@/components/ui/badge'; import { Button } from '@/components/ui/button'; @@ -31,6 +30,8 @@ import { type OpenRouterModelsResponse, } from '@/lib/organizations/organization-types'; import { cn } from '@/lib/utils'; +import { BenchmarksSection } from './BenchmarksSection'; +import { parseAdminResponse } from './admin-fetch'; const periods: Array<{ value: AutoRoutingAnalyticsPeriod; label: string }> = [ { value: '1h', label: '1h' }, @@ -39,24 +40,6 @@ const periods: Array<{ value: AutoRoutingAnalyticsPeriod; label: string }> = [ { value: '30d', label: '30d' }, ]; -const AdminApiErrorSchema = z.object({ error: z.string().optional() }); - -async function parseAdminResponse( - response: Response, - schema: z.ZodType -): Promise { - const body: unknown = await response.json(); - if (!response.ok) { - const parsedError = AdminApiErrorSchema.safeParse(body); - throw new Error( - parsedError.success && parsedError.data.error - ? parsedError.data.error - : `Request failed: ${response.status}` - ); - } - return schema.parse(body); -} - async function fetchClassifierModel() { const response = await fetch('/admin/api/auto-routing/classifier-model'); return parseAdminResponse( @@ -65,7 +48,7 @@ async function fetchClassifierModel() { ); } -async function saveClassifierModel(model: string) { +async function saveClassifierModel(model: string | null) { const response = await fetch('/admin/api/auto-routing/classifier-model', { method: 'PUT', headers: { 'content-type': 'application/json' }, @@ -397,10 +380,12 @@ export function AutoRoutingAdminContent() { }); useEffect(() => { - if (classifierModelQuery.data?.model) { - setSelectedModel(classifierModelQuery.data.model); + const override = classifierModelQuery.data?.override; + const model = classifierModelQuery.data?.model; + if (model !== undefined) { + setSelectedModel(override ?? model); } - }, [classifierModelQuery.data?.model]); + }, [classifierModelQuery.data?.override, classifierModelQuery.data?.model]); const modelOptions = useMemo(() => { return ( @@ -414,10 +399,14 @@ export function AutoRoutingAdminContent() { const saveMutation = useMutation({ mutationFn: saveClassifierModel, - onSuccess: data => { + onSuccess: (data, model) => { queryClient.setQueryData(['auto-routing', 'classifier-model'], data); - setSelectedModel(data.model); - toast.success('Classifier model updated'); + setSelectedModel(data.override ?? data.model); + if (model === null) { + toast.success('Override cleared — benchmark winner in effect'); + } else { + toast.success('Classifier model override saved'); + } }, onError: error => { toast.error(error instanceof Error ? error.message : 'Failed to update classifier model'); @@ -432,10 +421,12 @@ export function AutoRoutingAdminContent() { classifierModelQuery.error instanceof Error ? classifierModelQuery.error.message : undefined; const openRouterModelsError = openRouterModelsQuery.error instanceof Error ? openRouterModelsQuery.error.message : undefined; - const currentModel = classifierModelQuery.data?.model ?? ''; - const hasClassifierModelLoaded = classifierModelQuery.isSuccess && currentModel.length > 0; + const currentOverride = classifierModelQuery.data?.override ?? null; + const hasClassifierModelLoaded = classifierModelQuery.isSuccess; const hasModelChange = - hasClassifierModelLoaded && selectedModel.trim().length > 0 && selectedModel !== currentModel; + hasClassifierModelLoaded && + selectedModel.trim().length > 0 && + selectedModel !== (currentOverride ?? ''); const summary = analyticsQuery.data?.summary; const totalRequests = summary?.totalRequests ?? 0; const { classifiedRate, cacheHitRate, fallbackRate } = summaryRates(summary); @@ -472,32 +463,67 @@ export function AutoRoutingAdminContent() { - Classifier Model + Classifier model override - - - + +
+
Effective model
+
+ {classifierModelQuery.data?.model ?? } +
+
Override
+
+ {classifierModelQuery.isLoading ? ( + + ) : ( + (classifierModelQuery.data?.override ?? 'none') + )} +
+
Benchmark winner
+
+ {classifierModelQuery.isLoading ? ( + + ) : ( + (classifierModelQuery.data?.benchmarkWinner ?? 'not yet published') + )} +
+
+
+ + + {currentOverride !== null ? ( + + ) : null} +
@@ -600,6 +626,8 @@ export function AutoRoutingAdminContent() { /> )} + + ); } diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts new file mode 100644 index 0000000000..11a8a6a0e3 --- /dev/null +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.test.ts @@ -0,0 +1,102 @@ +import { describe, expect, it } from '@jest/globals'; +import { + configToFormState, + formatAccuracy, + formatUsd, + formStateToConfig, +} from './BenchmarksSection'; + +describe('formatAccuracy', () => { + it('formats 0.8542 as 85.4%', () => { + expect(formatAccuracy(0.8542)).toBe('85.4%'); + }); + + it('formats 1.0 as 100.0%', () => { + expect(formatAccuracy(1.0)).toBe('100.0%'); + }); + + it('formats 0 as 0.0%', () => { + expect(formatAccuracy(0)).toBe('0.0%'); + }); + + it('formats 0.5 as 50.0%', () => { + expect(formatAccuracy(0.5)).toBe('50.0%'); + }); + + it('rounds to one decimal place', () => { + expect(formatAccuracy(0.9999)).toBe('100.0%'); + expect(formatAccuracy(0.9994)).toBe('99.9%'); + }); +}); + +describe('formatUsd', () => { + it('returns em dash for null', () => { + expect(formatUsd(null)).toBe('—'); + }); + + it('formats a small cost with 6 decimal places', () => { + expect(formatUsd(0.000123)).toBe('$0.000123'); + }); + + it('trims trailing zeros', () => { + expect(formatUsd(0.1)).toBe('$0.1'); + }); + + it('formats zero as $0.0', () => { + expect(formatUsd(0)).toBe('$0.0'); + }); + + it('formats a typical cost', () => { + expect(formatUsd(0.001234)).toBe('$0.001234'); + }); + + it('formats a cost that fits exactly at 6dp', () => { + expect(formatUsd(0.000001)).toBe('$0.000001'); + }); +}); + +describe('configToFormState', () => { + it('yields defaults including classifierMaxP95LatencyMs "1000" when config is null', () => { + const state = configToFormState(null); + expect(state.classifierRepetitions).toBe(1); + expect(state.deciderRepetitions).toBe(1); + expect(state.classifierMaxP95LatencyMs).toBe('1000'); + expect(state.classifierModels).toBe(''); + expect(state.deciderModels).toEqual([]); + }); +}); + +describe('formStateToConfig round-trip', () => { + const baseConfig = { + classifierModels: ['model-a', 'model-b'], + deciderModels: [{ id: 'model-c', reasoningEffort: null }], + minAccuracy: 0.8, + switchCostFactor: 3, + maxConcurrency: 4, + benchmarkUserId: 'user-123', + classifierRepetitions: 3, + deciderRepetitions: 2, + classifierMaxP95LatencyMs: 500, + updatedAt: null, + updatedBy: null, + }; + + it('preserves classifierRepetitions, deciderRepetitions, and classifierMaxP95LatencyMs', () => { + const state = configToFormState(baseConfig); + expect(state.classifierRepetitions).toBe(3); + expect(state.deciderRepetitions).toBe(2); + expect(state.classifierMaxP95LatencyMs).toBe('500'); + + const result = formStateToConfig(state, baseConfig); + expect(result.classifierRepetitions).toBe(3); + expect(result.deciderRepetitions).toBe(2); + expect(result.classifierMaxP95LatencyMs).toBe(500); + }); + + it('converts empty-string classifierMaxP95LatencyMs form value to null in config', () => { + const state = configToFormState(baseConfig); + const stateWithEmpty = { ...state, classifierMaxP95LatencyMs: '' }; + const result = formStateToConfig(stateWithEmpty, baseConfig); + expect(result.classifierMaxP95LatencyMs).toBeNull(); + }); +}); diff --git a/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx new file mode 100644 index 0000000000..9bdfac18ba --- /dev/null +++ b/apps/web/src/app/admin/auto-routing/BenchmarksSection.tsx @@ -0,0 +1,984 @@ +'use client'; + +import { + BenchmarkConfigResponseSchema, + BenchmarkRoutingTableResponseSchema, + BenchmarkRunsResponseSchema, + StartBenchmarkRunResponseSchema, + type BenchmarkConfig, + type BenchmarkKind, + type BenchmarkRoutingTableResponse, + type BenchmarkRun, + type BenchmarkModelSummary, + type ReasoningEffort, +} from '@kilocode/auto-routing-contracts'; +import React, { useCallback, useEffect, useRef, useState } from 'react'; +import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; +import { toast } from 'sonner'; +import { ChevronDown, ChevronRight, Play, Plus, Save, Trash2 } from 'lucide-react'; +import { Badge } from '@/components/ui/badge'; +import { Button } from '@/components/ui/button'; +import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'; +import { Checkbox } from '@/components/ui/checkbox'; +import { Input } from '@/components/ui/input'; +import { Label } from '@/components/ui/label'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { Skeleton } from '@/components/ui/skeleton'; +import { + Table, + TableBody, + TableCell, + TableHead, + TableHeader, + TableRow, +} from '@/components/ui/table'; +import { Textarea } from '@/components/ui/textarea'; +import { parseAdminResponse } from './admin-fetch'; + +// --------------------------------------------------------------------------- +// Pure helpers (exported for unit tests) +// --------------------------------------------------------------------------- + +export function formatAccuracy(n: number): string { + return `${(n * 100).toFixed(1)}%`; +} + +export function formatUsd(n: number | null): string { + if (n === null) return '—'; + // 6 dp, remove trailing zeros, but keep at least $0.000001 precision + const fixed = n.toFixed(6); + // Trim trailing zeros after decimal, but leave at least one digit after dot + const trimmed = fixed.replace(/(\.\d*?)0+$/, '$1').replace(/\.$/, '.0'); + return `$${trimmed}`; +} + +// --------------------------------------------------------------------------- +// Fetch helpers +// --------------------------------------------------------------------------- + +async function fetchBenchmarkConfig() { + const response = await fetch('/admin/api/auto-routing/benchmark-config'); + return parseAdminResponse(response, BenchmarkConfigResponseSchema); +} + +async function saveBenchmarkConfig(config: BenchmarkConfig) { + const response = await fetch('/admin/api/auto-routing/benchmark-config', { + method: 'PUT', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(config), + }); + return parseAdminResponse(response, BenchmarkConfigResponseSchema); +} + +async function fetchBenchmarkRuns() { + const response = await fetch('/admin/api/auto-routing/benchmark-runs'); + return parseAdminResponse(response, BenchmarkRunsResponseSchema); +} + +async function startBenchmarkRun({ kind, force }: { kind: BenchmarkKind; force: boolean }) { + const response = await fetch('/admin/api/auto-routing/benchmark-runs', { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ kind, force }), + }); + return parseAdminResponse(response, StartBenchmarkRunResponseSchema); +} + +async function fetchBenchmarkRoutingTable() { + const response = await fetch('/admin/api/auto-routing/benchmark-routing-table'); + return parseAdminResponse( + response, + BenchmarkRoutingTableResponseSchema + ); +} + +// --------------------------------------------------------------------------- +// Local form state type for decider model rows +// --------------------------------------------------------------------------- + +type DeciderModelRow = { + id: string; + reasoningEffort: ReasoningEffort | null; +}; + +export function configToFormState(config: BenchmarkConfig | null): { + classifierModels: string; + deciderModels: DeciderModelRow[]; + minAccuracy: number; + switchCostFactor: number; + maxConcurrency: number; + benchmarkUserId: string; + classifierRepetitions: number; + deciderRepetitions: number; + classifierMaxP95LatencyMs: string; +} { + if (config === null) { + // No config saved yet: the worker fabricates nothing, so the form starts + // empty and the admin must enter and save a config before running. + return { + classifierModels: '', + deciderModels: [], + minAccuracy: 0.7, + switchCostFactor: 3, + maxConcurrency: 4, + benchmarkUserId: '', + classifierRepetitions: 1, + deciderRepetitions: 1, + classifierMaxP95LatencyMs: '1000', + }; + } + return { + classifierModels: config.classifierModels.join('\n'), + deciderModels: config.deciderModels.map(m => ({ + id: m.id, + reasoningEffort: m.reasoningEffort ?? null, + })), + minAccuracy: config.minAccuracy, + switchCostFactor: config.switchCostFactor, + maxConcurrency: config.maxConcurrency, + benchmarkUserId: config.benchmarkUserId ?? '', + classifierRepetitions: config.classifierRepetitions, + deciderRepetitions: config.deciderRepetitions, + classifierMaxP95LatencyMs: + config.classifierMaxP95LatencyMs !== null ? String(config.classifierMaxP95LatencyMs) : '', + }; +} + +export function formStateToConfig( + state: ReturnType, + base: BenchmarkConfig | null +): BenchmarkConfig { + const classifierModels = state.classifierModels + .split('\n') + .map(s => s.trim()) + .filter(s => s.length > 0); + const deciderModels = state.deciderModels + .filter(row => row.id.trim().length > 0) + .map(row => ({ + id: row.id.trim(), + reasoningEffort: row.reasoningEffort ?? null, + })); + const benchmarkUserId = state.benchmarkUserId.trim(); + const rawLatency = state.classifierMaxP95LatencyMs.trim(); + const classifierMaxP95LatencyMs = rawLatency.length > 0 ? parseInt(rawLatency, 10) || null : null; + return { + classifierModels, + deciderModels, + minAccuracy: state.minAccuracy, + switchCostFactor: state.switchCostFactor, + maxConcurrency: state.maxConcurrency, + benchmarkUserId: benchmarkUserId.length > 0 ? benchmarkUserId : null, + classifierRepetitions: state.classifierRepetitions, + deciderRepetitions: state.deciderRepetitions, + classifierMaxP95LatencyMs, + updatedAt: base?.updatedAt ?? null, + updatedBy: base?.updatedBy ?? null, + }; +} + +// --------------------------------------------------------------------------- +// Config editor sub-component +// --------------------------------------------------------------------------- + +function BenchmarkConfigEditor({ + config, + onSaved, +}: { + config: BenchmarkConfig | null; + onSaved: (next: { config: BenchmarkConfig | null }) => void; +}) { + const [form, setForm] = useState(() => configToFormState(config)); + // Tracks unsaved local edits. A background config refetch (the runs list + // polls; the query also refetches on focus) must not silently overwrite + // in-progress edits, so the sync effect only resets the form while pristine. + const [dirty, setDirty] = useState(false); + + // Any user edit goes through this so it marks the form dirty. + const updateForm = useCallback( + ( + updater: (prev: ReturnType) => ReturnType + ) => { + setForm(updater); + setDirty(true); + }, + [] + ); + + // Sync from server config only on initial load / after a save — never while + // the admin has unsaved edits (that would discard their work). + useEffect(() => { + if (!dirty) setForm(configToFormState(config)); + }, [config, dirty]); + + // Discard local edits and reload the latest server config (explicit conflict + // recovery when a remote update arrived while editing). + const handleReload = useCallback(() => { + setForm(configToFormState(config)); + setDirty(false); + }, [config]); + + const saveMutation = useMutation({ + mutationFn: saveBenchmarkConfig, + onSuccess: data => { + // The save is now the source of truth: clear dirty and re-sync so the + // next background refetch is free to update the form again. + setForm(configToFormState(data.config)); + setDirty(false); + onSaved(data); + toast.success('Benchmark config saved'); + }, + onError: (error: unknown) => { + toast.error(error instanceof Error ? error.message : 'Failed to save benchmark config'); + }, + }); + + const handleAddDeciderRow = useCallback(() => { + updateForm(prev => ({ + ...prev, + deciderModels: [...prev.deciderModels, { id: '', reasoningEffort: null }], + })); + }, [updateForm]); + + const handleRemoveDeciderRow = useCallback( + (index: number) => { + updateForm(prev => ({ + ...prev, + deciderModels: prev.deciderModels.filter((_, i) => i !== index), + })); + }, + [updateForm] + ); + + const handleDeciderRowChange = useCallback( + (index: number, patch: Partial) => { + updateForm(prev => ({ + ...prev, + deciderModels: prev.deciderModels.map((row, i) => + i === index ? { ...row, ...patch } : row + ), + })); + }, + [updateForm] + ); + + const handleSave = useCallback(() => { + saveMutation.mutate(formStateToConfig(form, config)); + }, [form, config, saveMutation]); + + return ( + + + Benchmark Config + + + {/* Classifier models */} +
+ +