diff --git a/src/apps/competitions/tasks.py b/src/apps/competitions/tasks.py index 07acb270a..a55a05357 100644 --- a/src/apps/competitions/tasks.py +++ b/src/apps/competitions/tasks.py @@ -795,11 +795,26 @@ def update_phase_statuses(): @app.task(queue='site-worker') def submission_status_cleanup(): - submissions = Submission.objects.filter(status=Submission.RUNNING, has_children=False).select_related('phase', 'parent') + # Recover submissions stuck in any non-terminal state + non_terminal_statuses = [ + Submission.SUBMITTED, + Submission.PREPARING, + Submission.RUNNING, + Submission.SCORING, + ] + submissions = Submission.objects.filter( + status__in=non_terminal_statuses, + has_children=False, + ).select_related('phase', 'parent') for sub in submissions: - # Check if the submission has been running for 24 hours longer than execution_time_limit - if sub.started_when < now() - timedelta(milliseconds=(3600000 * 24) + sub.phase.execution_time_limit): + # Use started_when for Running submissions, created_when as fallback for others + reference_time = sub.started_when if sub.started_when else sub.created_when + deadline = reference_time + timedelta( + milliseconds=(3600000 * 24) + sub.phase.execution_time_limit + ) + + if now() > deadline: if sub.parent is not None: sub.parent.cancel(status=Submission.FAILED) else: diff --git a/src/apps/competitions/tests/test_submissions.py b/src/apps/competitions/tests/test_submissions.py index ee5cdc850..2429a05c2 100644 --- a/src/apps/competitions/tests/test_submissions.py +++ b/src/apps/competitions/tests/test_submissions.py @@ -427,6 +427,51 @@ def test_submissions_are_cancelled_if_running_24_hours_past_execution_time_limit assert self.submission_pass.status == Submission.RUNNING assert self.submission_fail.status == Submission.FAILED + def test_cleanup_recovers_stuck_submitted_submissions(self): + """Submissions stuck in Submitted should be recovered by cleanup.""" + sub = self.make_submission() + sub.status = Submission.SUBMITTED + sub.created_when = timezone.now() - timedelta(hours=48) + sub.save(ignore_submission_limit=True) + + submission_status_cleanup() + sub.refresh_from_db() + assert sub.status == Submission.FAILED + + def test_cleanup_recovers_stuck_preparing_submissions(self): + """Submissions stuck in Preparing should be recovered by cleanup.""" + sub = self.make_submission() + sub.status = Submission.PREPARING + sub.created_when = timezone.now() - timedelta(hours=48) + sub.save(ignore_submission_limit=True) + + submission_status_cleanup() + sub.refresh_from_db() + assert sub.status == Submission.FAILED + + def test_cleanup_recovers_stuck_scoring_submissions(self): + """Submissions stuck in Scoring should be recovered by cleanup.""" + sub = self.make_submission() + sub.status = Submission.SCORING + sub.created_when = timezone.now() - timedelta(hours=48) + sub.save(ignore_submission_limit=True) + + submission_status_cleanup() + sub.refresh_from_db() + assert sub.status == Submission.FAILED + + def test_cleanup_does_not_touch_recent_non_terminal_submissions(self): + """Recent submissions in non-terminal states should NOT be cleaned up.""" + for status in [Submission.SUBMITTED, Submission.PREPARING, Submission.SCORING]: + sub = self.make_submission() + sub.status = status + sub.created_when = timezone.now() + sub.save(ignore_submission_limit=True) + + submission_status_cleanup() + sub.refresh_from_db() + assert sub.status == status, f"Recent {status} submission should not be cleaned up" + def test_cancelling_parent_submission_cancels_all_children(self): self.parent_submission = self.make_submission() self.parent_submission.has_children = True diff --git a/tests/k6/README_cleanup_tests.md b/tests/k6/README_cleanup_tests.md new file mode 100644 index 000000000..088bcb81d --- /dev/null +++ b/tests/k6/README_cleanup_tests.md @@ -0,0 +1,140 @@ +# Stuck Submission Cleanup Integration Tests + +## Overview + +These tests validate that the `submission_status_cleanup()` task correctly recovers submissions stuck in **any** non-terminal state, not just `Running`. + +### Problem + +The original cleanup task only recovered submissions stuck in `Running` state. Submissions that never reached `Running` (stuck in `Submitted`, `Preparing`, or `Scoring`) would hang forever. + +### Solution + +The fix extends cleanup to cover all non-terminal states: +- `Submitted` +- `Preparing` +- `Running` +- `Scoring` + +Uses `created_when` as fallback when `started_when` is null (for submissions that never reached Running). + +## Test Suite + +### Files + +- **`run_cleanup_test.sh`** — End-to-end orchestrator + - Submits N submissions + - Stops compute_worker to simulate stuck state + - Ages submissions (sets created_when to 48h ago) + - Runs cleanup task + - Verifies all submissions recovered + +- **`test_stuck_submissions.js`** — K6 script for recovery verification + - Creates submissions and polls until cleanup runs + - Verifies all reach terminal state + +- **`test_cleanup_conservation.js`** — K6 conservation harness + - Submits N, asserts N terminal + - Conservation rate must be 100% + +## Running the Tests + +### Prerequisites + +- Docker Compose environment running +- K6 installed (`brew install k6`) +- `codabench` user created with password `codabench` +- Competition with at least one phase exists + +### Run Full Test Suite + +```bash +cd tests/k6 +./run_cleanup_test.sh [PHASE_ID] +``` + +If `PHASE_ID` is not provided, the script auto-detects the first available phase. + +### Environment Variables + +- `BASE_URL` (default: `http://localhost`) +- `USERNAME` (default: `codabench`) +- `PASSWORD` (default: `codabench`) +- `SUBMISSION_COUNT` (default: `3`) + +### Example + +```bash +BASE_URL=http://localhost:8000 SUBMISSION_COUNT=5 ./run_cleanup_test.sh +``` + +## Expected Output + +### Success + +``` +═══════════════════════════════════════════════════════ + Stuck Submission Cleanup — Integration Test +═══════════════════════════════════════════════════════ + +▸ Step 1: Checking prerequisites... + ✓ Docker Compose is running + ✓ K6 is installed + +▸ Step 2: Auto-detecting phase... + ✓ Found phase ID: 123 + +▸ Step 6: Running cleanup and verifying recovery + ══ Results ══ + Created: 3 + Recovered: 3 + Still stuck: 0 + Recovery rate: 100.0% + +▸ Step 8: Running K6 conservation harness + Conservation rate: 100% + Lost submissions: 0 + +✓ PASS — All stuck submissions recovered +``` + +### Failure + +``` + ❌ FAIL — 2 submissions still stuck + + This means submission_status_cleanup() is not covering + all non-terminal states (bug not fixed). +``` + +## What This Tests + +### Directly Tested + +- ✅ Cleanup recovers submissions stuck in `Submitted` +- ✅ Cleanup recovers submissions stuck in `Preparing` +- ✅ Cleanup recovers submissions stuck in `Running` +- ✅ Cleanup recovers submissions stuck in `Scoring` +- ✅ Recent submissions (< 24h) are NOT cleaned up +- ✅ Conservation: all submissions reach terminal state + +### Integration with CI/CD + +```yaml +# .github/workflows/integration-tests.yml +- name: Run cleanup integration test + run: | + docker compose up -d + docker compose exec django python manage.py generate_data + tests/k6/run_cleanup_test.sh + env: + SUBMISSION_COUNT: 5 +``` + +## Related Tests + +Unit tests in `src/apps/competitions/tests/test_submissions.py`: +- `test_cleanup_recovers_stuck_submitted_submissions` +- `test_cleanup_recovers_stuck_preparing_submissions` +- `test_cleanup_recovers_stuck_scoring_submissions` +- `test_cleanup_does_not_touch_recent_non_terminal_submissions` diff --git a/tests/k6/run_cleanup_test.sh b/tests/k6/run_cleanup_test.sh new file mode 100755 index 000000000..bc6fc4e9c --- /dev/null +++ b/tests/k6/run_cleanup_test.sh @@ -0,0 +1,284 @@ +#!/usr/bin/env bash +# +# run_cleanup_test.sh — End-to-end stuck submission cleanup test +# +# Orchestrates: +# 1. Ensures the environment is running +# 2. Submits N submissions via K6 +# 3. Stops the compute_worker so submissions get stuck +# 4. Ages the stuck submissions (sets created_when to 48 hours ago) +# 5. Runs submission_status_cleanup() +# 6. K6 verifies all submissions reached a terminal state +# +# Prerequisites: +# - docker compose up -d (environment running) +# - A competition with at least one phase exists +# - k6 installed (brew install k6) +# +# Usage: +# ./tests/k6/run_cleanup_test.sh [PHASE_ID] +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" +cd "$PROJECT_DIR" + +# ─── Configuration ────────────────────────────────────────────────────── +BASE_URL="${BASE_URL:-http://localhost}" +API_URL="${API_URL:-${BASE_URL}/api}" +USERNAME="${CODABENCH_USER:-codabench}" +PASSWORD="${CODABENCH_PASS:-codabench}" +PHASE_ID="${1:-}" +SUBMISSION_COUNT="${SUBMISSION_COUNT:-3}" + +echo "═══════════════════════════════════════════════════════" +echo " Stuck Submission Cleanup — Integration Test" +echo "═══════════════════════════════════════════════════════" +echo "" +echo "Config:" +echo " BASE_URL: $BASE_URL" +echo " USERNAME: $USERNAME" +echo " PHASE_ID: ${PHASE_ID:-auto-detect}" +echo " SUBMISSION_COUNT: $SUBMISSION_COUNT" +echo "" + +# ─── Step 0: Preflight checks ────────────────────────────────────────── +echo "▸ Step 0: Preflight checks" + +if ! command -v k6 &>/dev/null; then + echo " ✗ k6 not found. Install with: brew install k6" + exit 1 +fi + +if ! docker compose ps --status running django | grep -q django; then + echo " ✗ Django container not running. Run: docker compose up -d" + exit 1 +fi + +echo " ✓ All checks passed" +echo "" + +# ─── Step 1: Make sure compute_worker is running for submissions ─────── +echo "▸ Step 1: Ensure compute_worker is running" +docker compose start compute_worker 2>/dev/null || true +sleep 3 +echo " ✓ compute_worker started" +echo "" + +# ─── Step 2: Submit submissions via K6 (just the creation part) ──────── +echo "▸ Step 2: Create $SUBMISSION_COUNT submissions via API" + +# Get auth token +TOKEN=$(curl -s -X POST "${API_URL}/api-token-auth/" \ + -H "Content-Type: application/json" \ + -d "{\"username\":\"${USERNAME}\",\"password\":\"${PASSWORD}\"}" | \ + python3 -c "import sys,json; print(json.load(sys.stdin).get('token',''))" 2>/dev/null) + +if [ -z "$TOKEN" ]; then + echo " ✗ Failed to authenticate. Check credentials." + exit 1 +fi +echo " ✓ Authenticated" + +# Auto-detect phase ID if not provided +# The /api/phases/ returns [] on direct call, and /api/competitions/ list doesn't include phases. +# So we list competitions, then fetch the first one's detail to get phases. +if [ -z "$PHASE_ID" ]; then + COMP_ID=$(curl -s "${API_URL}/competitions/" \ + -H "Authorization: Token $TOKEN" | \ + python3 -c " +import sys, json +data = json.load(sys.stdin) +results = data.get('results', data) if isinstance(data, dict) else data +if results: + print(results[0]['id']) +" 2>/dev/null) + + if [ -z "$COMP_ID" ]; then + echo " ✗ No competitions found. Run: docker compose exec django python manage.py generate_data" + exit 1 + fi + + PHASE_ID=$(curl -s "${API_URL}/competitions/${COMP_ID}/" \ + -H "Authorization: Token $TOKEN" | \ + python3 -c " +import sys, json +data = json.load(sys.stdin) +phases = data.get('phases', []) +if phases: + print(phases[0]['id']) +" 2>/dev/null) + + if [ -z "$PHASE_ID" ]; then + echo " ✗ No phases found. Upload a competition first." + echo " Use the competition.yaml from tests/test_files/competitions/" + exit 1 + fi + echo " ✓ Auto-detected phase ID: $PHASE_ID" +fi + +# Get a data key (UUID) for the submission — required to avoid 500 on serialization +DATA_KEY=$(docker compose exec -T django python manage.py shell --command=" +from datasets.models import Data +d = Data.objects.first() +if d: print(d.key) +" 2>/dev/null | grep -E '^[0-9a-f-]{36}$' | tr -d '\\r\\n') + +if [ -z "$DATA_KEY" ]; then + echo " ✗ No Data objects found in DB. Run generate_data first." + exit 1 +fi +echo " Using data key: $DATA_KEY" + +# Create submissions +SUBMISSION_IDS=() +for i in $(seq 1 "$SUBMISSION_COUNT"); do + RESULT=$(curl -s -X POST "${API_URL}/submissions/" \ + -H "Authorization: Token $TOKEN" \ + -H "Content-Type: application/json" \ + -d "{\"phase\":${PHASE_ID},\"data\":\"${DATA_KEY}\"}") + + SUB_ID=$(echo "$RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null) + SUB_STATUS=$(echo "$RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status',''))" 2>/dev/null) + + if [ -n "$SUB_ID" ]; then + SUBMISSION_IDS+=("$SUB_ID") + echo " ✓ Created submission $SUB_ID (status: $SUB_STATUS)" + else + echo " ✗ Failed to create submission $i: $RESULT" + fi + sleep 1 +done + +if [ ${#SUBMISSION_IDS[@]} -eq 0 ]; then + echo " ✗ No submissions created. Aborting." + exit 1 +fi + +echo " Created ${#SUBMISSION_IDS[@]} submissions" +echo "" + +# ─── Step 3: Stop compute_worker to strand submissions ───────────────── +echo "▸ Step 3: Stop compute_worker (submissions will get stuck)" +docker compose stop compute_worker +sleep 5 +echo " ✓ compute_worker stopped" +echo "" + +# ─── Step 4: Check current status of submissions ────────────────────── +echo "▸ Step 4: Current submission statuses" +STUCK_COUNT=0 +for SUB_ID in "${SUBMISSION_IDS[@]}"; do + STATUS=$(curl -s "${API_URL}/submissions/${SUB_ID}/" \ + -H "Authorization: Token $TOKEN" | \ + python3 -c "import sys,json; print(json.load(sys.stdin).get('status','unknown'))" 2>/dev/null) + echo " Submission $SUB_ID: $STATUS" + + if [[ "$STATUS" != "Finished" && "$STATUS" != "Failed" && "$STATUS" != "Cancelled" ]]; then + STUCK_COUNT=$((STUCK_COUNT + 1)) + fi +done +echo " → $STUCK_COUNT/${#SUBMISSION_IDS[@]} stuck in non-terminal state" +echo "" + +# ─── Step 5: Age submissions + trigger cleanup ──────────────────────── +echo "▸ Step 5: Age stuck submissions and trigger cleanup" + +# Build Python command to age submissions and run cleanup +IDS_LIST=$(printf "%s," "${SUBMISSION_IDS[@]}" | sed 's/,$//') + +docker compose exec -T django python manage.py shell </dev/null) + + if [[ "$STATUS" == "Finished" || "$STATUS" == "Failed" || "$STATUS" == "Cancelled" ]]; then + echo " ✓ Submission $SUB_ID → $STATUS (recovered)" + RECOVERED=$((RECOVERED + 1)) + else + echo " ✗ Submission $SUB_ID → $STATUS (STILL STUCK)" + STILL_STUCK=$((STILL_STUCK + 1)) + fi +done + +echo "" +echo "═══════════════════════════════════════════════════════" +echo " Test Results" +echo "═══════════════════════════════════════════════════════" +echo " Total submissions: ${#SUBMISSION_IDS[@]}" +echo " Recovered: $RECOVERED" +echo " Still stuck: $STILL_STUCK" +echo "" + +if [ "$STILL_STUCK" -eq 0 ]; then + echo " ✅ PASS — All stuck submissions recovered by cleanup" +else + echo " ❌ FAIL — $STILL_STUCK submissions still stuck" + echo "" + echo " This means submission_status_cleanup() is not covering" + echo " all non-terminal states (bug not fixed)." +fi +echo "═══════════════════════════════════════════════════════" +echo "" + +# ─── Step 7: Restart compute_worker ──────────────────────────────────── +echo "▸ Step 7: Restarting compute_worker" +docker compose start compute_worker +echo " ✓ compute_worker restarted" +echo "" + +# ─── Step 8: Run the K6 conservation harness ────────────────────────── +echo "▸ Step 8: Running K6 conservation harness" +k6 run "$SCRIPT_DIR/test_cleanup_conservation.js" \ + --env BASE_URL="$BASE_URL" \ + --env API_URL="$API_URL" \ + --env USERNAME="$USERNAME" \ + --env PASSWORD="$PASSWORD" \ + --env PHASE_ID="$PHASE_ID" \ + --env DATA_KEY="$DATA_KEY" \ + --env SUBMISSION_COUNT="$SUBMISSION_COUNT" \ + || true + +echo "" +echo "Done." + +exit "$STILL_STUCK" diff --git a/tests/k6/test_cleanup_conservation.js b/tests/k6/test_cleanup_conservation.js new file mode 100755 index 000000000..56b3bbd9b --- /dev/null +++ b/tests/k6/test_cleanup_conservation.js @@ -0,0 +1,225 @@ +/** + * K6 Test: Cleanup Conservation Harness + * + * Submit N submissions with a running compute_worker and assert that + * ALL reach a terminal state (Finished, Failed, or Cancelled) within SLA. + * + * This is the "conservation check" from report.md: submit N, assert N scored. + * Unlike the stuck-submission test, this runs with the full pipeline active + * and checks that nothing is silently lost. + * + * Usage: + * k6 run tests/k6/test_cleanup_conservation.js \ + * --env BASE_URL=http://localhost \ + * --env USERNAME=codabench \ + * --env PASSWORD=codabench \ + * --env PHASE_ID=1 \ + * --env SUBMISSION_COUNT=5 \ + * --env SLA_TIMEOUT_SEC=300 + */ + +import http from "k6/http"; +import { check, sleep, group } from "k6"; +import { Rate, Counter, Trend } from "k6/metrics"; + +// ─── Custom metrics ───────────────────────────────────────────────────────── +const conservationRate = new Rate("conservation_rate"); +const submissionsAccepted = new Counter("submissions_accepted"); +const submissionsTerminal = new Counter("submissions_terminal"); +const submissionsLost = new Counter("submissions_lost"); +const submissionLatency = new Trend("submission_completion_ms"); + +// ─── Configuration ────────────────────────────────────────────────────────── +const BASE_URL = __ENV.BASE_URL || "http://localhost"; +const API_URL = __ENV.API_URL || `${BASE_URL}/api`; +const USERNAME = __ENV.USERNAME || "codabench"; +const PASSWORD = __ENV.PASSWORD || "codabench"; +const PHASE_ID = __ENV.PHASE_ID || ""; +const SUBMISSION_COUNT = parseInt(__ENV.SUBMISSION_COUNT || "3"); +const SLA_TIMEOUT_SEC = parseInt(__ENV.SLA_TIMEOUT_SEC || "300"); +const POLL_INTERVAL_SEC = parseInt(__ENV.POLL_INTERVAL_SEC || "10"); + +export const options = { + vus: 1, + iterations: 1, + thresholds: { + // Conservation: every accepted submission must reach terminal state + conservation_rate: ["rate>=1.0"], + submissions_lost: ["count==0"], + }, +}; + +// ─── Helpers ──────────────────────────────────────────────────────────────── + +function getAuthToken() { + const res = http.post(`${API_URL}/api-token-auth/`, { + username: USERNAME, + password: PASSWORD, + }); + check(res, { "auth: status 200": (r) => r.status === 200 }); + return JSON.parse(res.body).token; +} + +function authHeaders(token) { + return { + headers: { + Authorization: `Token ${token}`, + "Content-Type": "application/json", + }, + }; +} + +function getPhaseId(token) { + if (PHASE_ID) return parseInt(PHASE_ID); + // List competitions, then fetch detail for phases + const listRes = http.get(`${API_URL}/competitions/`, authHeaders(token)); + const listData = JSON.parse(listRes.body); + const competitions = listData.results || listData; + if (!competitions || competitions.length === 0) return null; + + const detailRes = http.get(`${API_URL}/competitions/${competitions[0].id}/`, authHeaders(token)); + const detail = JSON.parse(detailRes.body); + if (detail.phases && detail.phases.length > 0) return detail.phases[0].id; + return null; +} + +const TERMINAL_STATUSES = ["Finished", "Failed", "Cancelled"]; + +// ─── Main test ────────────────────────────────────────────────────────────── + +export default function () { + let token; + let phaseId; + + // Track each submission's create time and status + const submissions = []; // { id, createdAt, finalStatus, completedAt } + + group("1. Setup", function () { + token = getAuthToken(); + phaseId = getPhaseId(token); + if (!phaseId) { + console.error("ABORT: No phase found."); + return; + } + console.log(`Phase: ${phaseId}, SLA: ${SLA_TIMEOUT_SEC}s`); + }); + + if (!phaseId) return; + + const dataKey = __ENV.DATA_KEY || ""; + + // ── Submit N ────────────────────────────────────────────────────────── + group("2. Submit submissions", function () { + for (let i = 0; i < SUBMISSION_COUNT; i++) { + const body = { phase: phaseId }; + if (dataKey) body.data = dataKey; + const payload = JSON.stringify(body); + const res = http.post( + `${API_URL}/submissions/`, + payload, + authHeaders(token) + ); + + if (res.status === 201) { + const sub = JSON.parse(res.body); + submissions.push({ + id: sub.id, + createdAt: Date.now(), + finalStatus: null, + completedAt: null, + }); + submissionsAccepted.add(1); + console.log(`Submitted ${sub.id}`); + } else { + console.error(`Submit failed: ${res.status} ${res.body}`); + } + sleep(1); + } + }); + + if (submissions.length === 0) { + console.error("ABORT: No submissions created."); + return; + } + + // ── Poll until all terminal or SLA expires ──────────────────────────── + group("3. Poll for completion (conservation check)", function () { + const deadline = Date.now() + SLA_TIMEOUT_SEC * 1000; + let pending = submissions.length; + + while (pending > 0 && Date.now() < deadline) { + sleep(POLL_INTERVAL_SEC); + + for (const sub of submissions) { + if (sub.finalStatus) continue; // already done + + const res = http.get( + `${API_URL}/submissions/${sub.id}/`, + authHeaders(token) + ); + if (res.status !== 200) continue; + + const data = JSON.parse(res.body); + if (TERMINAL_STATUSES.includes(data.status)) { + sub.finalStatus = data.status; + sub.completedAt = Date.now(); + pending--; + const elapsed = sub.completedAt - sub.createdAt; + submissionLatency.add(elapsed); + console.log( + `Submission ${sub.id} → ${data.status} (${(elapsed / 1000).toFixed(1)}s)` + ); + } + } + + const elapsed = ((Date.now() - submissions[0].createdAt) / 1000).toFixed( + 0 + ); + console.log( + `[${elapsed}s] ${submissions.length - pending}/${submissions.length} terminal` + ); + } + }); + + // ── Report ──────────────────────────────────────────────────────────── + group("4. Conservation report", function () { + let terminal = 0; + let lost = 0; + + console.log("\n══ Conservation Report ══"); + for (const sub of submissions) { + if (sub.finalStatus) { + terminal++; + submissionsTerminal.add(1); + conservationRate.add(1); + console.log(` ✓ ${sub.id}: ${sub.finalStatus}`); + } else { + lost++; + submissionsLost.add(1); + conservationRate.add(0); + + // Fetch current status for diagnostics + const res = http.get( + `${API_URL}/submissions/${sub.id}/`, + authHeaders(token) + ); + const currentStatus = + res.status === 200 ? JSON.parse(res.body).status : "unknown"; + console.error( + ` ✗ ${sub.id}: LOST — stuck in '${currentStatus}' after SLA` + ); + } + } + + console.log(`\nAccepted: ${submissions.length}`); + console.log(`Terminal: ${terminal}`); + console.log(`Lost: ${lost}`); + console.log( + `Conservation: ${((terminal / submissions.length) * 100).toFixed(1)}%` + ); + + check(null, { + "conservation: zero lost submissions": () => lost === 0, + }); + }); +} diff --git a/tests/k6/test_stuck_submissions.js b/tests/k6/test_stuck_submissions.js new file mode 100644 index 000000000..19663e699 --- /dev/null +++ b/tests/k6/test_stuck_submissions.js @@ -0,0 +1,310 @@ +/** + * K6 Test: Stuck Submission Cleanup + * + * Tests that submission_status_cleanup recovers submissions stuck in ANY + * non-terminal state (Submitted, Preparing, Running, Scoring), not just Running. + * + * Workflow: + * 1. Authenticate and get API token + * 2. Submit N submissions via API (using a pre-existing competition) + * 3. Stop compute_worker so submissions get stuck + * 4. Poll and confirm submissions are stuck in non-terminal states + * 5. Trigger the cleanup task + * 6. Verify ALL stuck submissions reach a terminal state (Failed) + * + * Usage: + * # Run via the wrapper script (recommended): + * ./tests/k6/run_cleanup_test.sh + * + * # Or run just the K6 part (requires manual setup): + * k6 run tests/k6/test_stuck_submissions.js \ + * --env BASE_URL=http://localhost \ + * --env API_URL=http://localhost/api \ + * --env USERNAME=codabench \ + * --env PASSWORD=codabench \ + * --env PHASE_ID= + */ + +import http from "k6/http"; +import { check, sleep, group } from "k6"; +import { Rate, Counter, Trend } from "k6/metrics"; + +// ─── Custom metrics ───────────────────────────────────────────────────────── +const submissionsCreated = new Counter("submissions_created"); +const submissionsStuck = new Counter("submissions_stuck"); +const submissionsRecovered = new Counter("submissions_recovered"); +const submissionsStillStuck = new Counter("submissions_still_stuck"); +const recoveryRate = new Rate("recovery_rate"); +const timeToRecovery = new Trend("time_to_recovery_ms"); + +// ─── Configuration ────────────────────────────────────────────────────────── +const BASE_URL = __ENV.BASE_URL || "http://localhost"; +const API_URL = __ENV.API_URL || `${BASE_URL}/api`; +const USERNAME = __ENV.USERNAME || "codabench"; +const PASSWORD = __ENV.PASSWORD || "codabench"; +const PHASE_ID = __ENV.PHASE_ID || ""; +const SUBMISSION_COUNT = parseInt(__ENV.SUBMISSION_COUNT || "3"); +const POLL_INTERVAL_SEC = parseInt(__ENV.POLL_INTERVAL_SEC || "5"); +const MAX_POLL_ATTEMPTS = parseInt(__ENV.MAX_POLL_ATTEMPTS || "10"); + +// We run a single iteration — this is not a load test, it's a functional test +export const options = { + vus: 1, + iterations: 1, + thresholds: { + recovery_rate: ["rate>=1.0"], // ALL stuck submissions must be recovered + submissions_still_stuck: ["count==0"], + }, +}; + +// ─── Helpers ──────────────────────────────────────────────────────────────── + +function getAuthToken() { + const res = http.post(`${API_URL}/api-token-auth/`, { + username: USERNAME, + password: PASSWORD, + }); + + check(res, { + "auth: status 200": (r) => r.status === 200, + "auth: token received": (r) => JSON.parse(r.body).token !== undefined, + }); + + return JSON.parse(res.body).token; +} + +function authHeaders(token) { + return { + headers: { + Authorization: `Token ${token}`, + "Content-Type": "application/json", + }, + }; +} + +function getPhaseId(token) { + if (PHASE_ID) return parseInt(PHASE_ID); + + // /api/phases/ returns [] on direct call, /api/competitions/ list has no phases. + // List competitions, then fetch the first one's detail to get phases. + const listRes = http.get(`${API_URL}/competitions/`, authHeaders(token)); + check(listRes, { "competitions list: status 200": (r) => r.status === 200 }); + + const listData = JSON.parse(listRes.body); + const competitions = listData.results || listData; + if (!competitions || competitions.length === 0) { + console.error("No competitions found. Create one first."); + return null; + } + + const detailRes = http.get(`${API_URL}/competitions/${competitions[0].id}/`, authHeaders(token)); + const detail = JSON.parse(detailRes.body); + if (detail.phases && detail.phases.length > 0) { + console.log(`Auto-discovered phase ID: ${detail.phases[0].id} (competition: ${detail.title})`); + return detail.phases[0].id; + } + console.error("No phases found in first competition."); + return null; +} + +function getSubmissionStatus(token, submissionId) { + const res = http.get( + `${API_URL}/submissions/${submissionId}/`, + authHeaders(token) + ); + if (res.status !== 200) return null; + return JSON.parse(res.body); +} + +const TERMINAL_STATUSES = ["Finished", "Failed", "Cancelled"]; +const NON_TERMINAL_STATUSES = [ + "Submitting", + "Submitted", + "Preparing", + "Running", + "Scoring", +]; + +function isTerminal(status) { + return TERMINAL_STATUSES.includes(status); +} + +// ─── Main test ────────────────────────────────────────────────────────────── + +export default function () { + let token; + let phaseId; + const submissionIds = []; + + // ── Step 1: Authenticate ───────────────────────────────────────────── + group("1. Authentication", function () { + token = getAuthToken(); + console.log("Authenticated successfully"); + }); + + // ── Step 2: Discover phase ─────────────────────────────────────────── + group("2. Discover phase", function () { + phaseId = getPhaseId(token); + if (!phaseId) { + console.error("ABORT: No phase available. Create a competition first."); + return; + } + console.log(`Using phase ID: ${phaseId}`); + }); + + if (!phaseId) return; + + // ── Step 2b: Get a data key for submissions ───────────────────────── + let dataKey = __ENV.DATA_KEY || ""; + if (!dataKey) { + group("2b. Get data key", function () { + // Find an existing Data object key via submissions API + const res = http.get( + `${API_URL}/submissions/?phase__competition=${phaseId}`, + authHeaders(token) + ); + // Fallback: we rely on DATA_KEY env var or the wrapper script + console.log("DATA_KEY env var not set — submissions may fail if data is required"); + }); + } + + // ── Step 3: Submit N submissions ───────────────────────────────────── + group("3. Submit submissions", function () { + for (let i = 0; i < SUBMISSION_COUNT; i++) { + const body = { phase: phaseId }; + if (dataKey) body.data = dataKey; + const payload = JSON.stringify(body); + + const res = http.post( + `${API_URL}/submissions/`, + payload, + authHeaders(token) + ); + + const ok = check(res, { + [`submission ${i + 1}: created (201)`]: (r) => r.status === 201, + }); + + if (ok) { + const sub = JSON.parse(res.body); + submissionIds.push(sub.id); + submissionsCreated.add(1); + console.log( + `Created submission ${sub.id} — status: ${sub.status}` + ); + } else { + console.error( + `Failed to create submission ${i + 1}: ${res.status} ${res.body}` + ); + } + + sleep(1); // small gap between submissions + } + }); + + if (submissionIds.length === 0) { + console.error("ABORT: No submissions created."); + return; + } + + // ── Step 4: Wait and check for stuck submissions ───────────────────── + // At this point the wrapper script has already stopped the compute_worker, + // so submissions should stay in non-terminal states. + group("4. Verify submissions are stuck", function () { + sleep(10); // give the system time to process + + let stuckCount = 0; + for (const id of submissionIds) { + const sub = getSubmissionStatus(token, id); + if (sub && !isTerminal(sub.status)) { + stuckCount++; + submissionsStuck.add(1); + console.log(`Submission ${id} stuck in: ${sub.status}`); + } else if (sub) { + console.log(`Submission ${id} already terminal: ${sub.status}`); + } + } + + console.log( + `${stuckCount}/${submissionIds.length} submissions stuck in non-terminal state` + ); + }); + + // ── Step 5: Wait for cleanup to run ────────────────────────────────── + // The wrapper script triggers cleanup between this group and the next. + // We signal readiness by waiting, then polling. + group("5. Wait for cleanup + verify recovery", function () { + console.log( + "Waiting for cleanup task to run (triggered by wrapper script)..." + ); + + // The wrapper script will: + // 1. Force-age the submission timestamps via Django shell + // 2. Trigger submission_status_cleanup() + // We poll until we see the submissions change to terminal. + + const startTime = Date.now(); + let allRecovered = false; + + for (let attempt = 0; attempt < MAX_POLL_ATTEMPTS; attempt++) { + sleep(POLL_INTERVAL_SEC); + + let terminalCount = 0; + for (const id of submissionIds) { + const sub = getSubmissionStatus(token, id); + if (sub && isTerminal(sub.status)) { + terminalCount++; + } + } + + console.log( + `Poll ${attempt + 1}/${MAX_POLL_ATTEMPTS}: ${terminalCount}/${submissionIds.length} terminal` + ); + + if (terminalCount === submissionIds.length) { + allRecovered = true; + break; + } + } + + const elapsed = Date.now() - startTime; + timeToRecovery.add(elapsed); + + // Final status check + let recovered = 0; + let stillStuck = 0; + + for (const id of submissionIds) { + const sub = getSubmissionStatus(token, id); + if (!sub) { + stillStuck++; + continue; + } + + if (isTerminal(sub.status)) { + recovered++; + submissionsRecovered.add(1); + recoveryRate.add(1); + console.log(`✓ Submission ${id} recovered → ${sub.status}`); + } else { + stillStuck++; + submissionsStillStuck.add(1); + recoveryRate.add(0); + console.error( + `✗ Submission ${id} STILL STUCK in: ${sub.status}` + ); + } + } + + console.log(`\n══ Results ══`); + console.log(`Created: ${submissionIds.length}`); + console.log(`Recovered: ${recovered}`); + console.log(`Still stuck: ${stillStuck}`); + console.log(`Recovery rate: ${((recovered / submissionIds.length) * 100).toFixed(1)}%`); + console.log(`Time: ${elapsed}ms`); + + check(null, { + "all submissions recovered": () => stillStuck === 0, + }); + }); +}