livekit · chenghao-mou · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 28, 2026
diff --git a/.changeset/busy-aliens-wink.md b/.changeset/busy-aliens-wink.md
@@ -0,0 +1,6 @@
+---
+"@livekit/agents": minor
+"livekit-agents-examples": patch
+---
+
+feat(eot): add audio eot model support
diff --git a/.changeset/multimodal-eou.md b/.changeset/multimodal-eou.md
@@ -0,0 +1,17 @@
+---
+"@livekit/agents": minor
+"@livekit/agents-plugin-silero": minor
+"@livekit/agents-plugins-livekit": minor
+---
+
+feat(core): multimodal end-of-turn detection with cloud → local fallback (AGT-2520)
+
+- New `inference.AudioTurnDetector`: WebSocket cloud EOT transport (`model: 'turn-detector'`) with automatic fallback to the local native model (`model: 'turn-detector-mini'`) via `@livekit/local-inference`. Auto-selects `'turn-detector'` when `LIVEKIT_REMOTE_EOT_URL` is set, `'turn-detector-mini'` otherwise.
+- The local EOT model runs in the shared inference process (the same `InferenceProcExecutor` the text turn detector uses), loaded once per worker host (~138 MB) instead of in every job worker. The runner is registered by default when the native binding is available, so the inference process spawns on worker startup; on platforms where the binding can't load, local EOT degrades to a positive-default prediction and the worker still starts. (This is a JS-specific divergence from Python, which keeps EOT in-process and relies on forkserver COW sharing.)
+- No prewarm helpers: EOT auto-warms in the inference process; the in-process silero VAD lazy-loads on first stream. (The `inference.prewarm*` helpers added during development were removed before release.)
+- New `inference.VAD` (local-only streaming VAD via `@livekit/local-inference`).
+- `AgentSession` now auto-provisions a bundled silero VAD when `vad` is omitted (`isDefault=true`). Pass `vad: null` to opt out.
+- `@livekit/agents-plugin-silero` is deprecated; pass `vad: null` to opt out of the bundled default, or use `inference.VAD({ model: 'silero', ... })` to customise.
+- The `@livekit/agents-plugins-livekit` turn detector is deprecated in favor of `inference.AudioTurnDetector`.
+- New `EOTInferenceMetrics` and `EOTModelUsage`; new telemetry span attributes (`lk.eou.source`, `lk.eou.from_cache`, `lk.eou.detection_delay`); new `eot_prediction` event forwarded over remote sessions.
+- Requires `@livekit/protocol` >= 1.46.2 (exposes the `AgentInference` message namespace used by the cloud transport).
diff --git a/agents/package.json b/agents/package.json
@@ -52,8 +52,9 @@
   "dependencies": {
     "@bufbuild/protobuf": "^1.10.0",
     "@ffmpeg-installer/ffmpeg": "^1.1.0",
+    "@livekit/local-inference": "^0.2.5",
     "@livekit/mutex": "^1.1.1",
-    "@livekit/protocol": "^1.45.7",
+    "@livekit/protocol": "^1.46.2",
     "@livekit/typed-emitter": "^3.0.0",
     "@livekit/throws-transformer": "0.1.8",
     "@opentelemetry/api": "^1.9.0",

diff --git a/agents/src/inference/_warmup.ts b/agents/src/inference/_warmup.ts
@@ -0,0 +1,45 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Loader for the bundled `@livekit/local-inference` native binding.
+ *
+ * Memory model (measured ~138 MB for the EOT model, ~2 MB for VAD): Node has
+ * no forkserver/COW, so anything loaded in a job worker is private to that
+ * worker. To avoid paying ~138 MB per worker, the EOT model is NOT loaded in
+ * job workers — it runs in the shared `InferenceProcExecutor` (see
+ * `inference/eot/runner.ts`), loaded once per host. The VAD stays in-process
+ * (it's small and runs continuously) and is reached via this loader.
+ *
+ * There are intentionally no public `prewarm*` helpers: EOT auto-warms via
+ * the inference runner's `initialize()` at proc startup, and the VAD lazy-
+ * loads on first stream.
+ */
+import { createRequire } from 'node:module';
+import { log } from '../log.js';
+
+const cjsRequire = createRequire(import.meta.url); // `require` function resolved relative to this module's URL
+
+let nativeMod: typeof import('@livekit/local-inference') | undefined; // cached module; stays `undefined` until (or unless) the load succeeds
+let triedLoad = false; // flipped to true on the first load attempt, whether it succeeds or fails
+
+function getNative(): typeof import('@livekit/local-inference') | undefined { // loads the binding at most once and caches the outcome
+  if (triedLoad) return nativeMod; // already attempted: reuse the cached result (the module, or `undefined` after a failure)
+  triedLoad = true; // set before loading so a failed attempt is never retried
+  try {
+    nativeMod = cjsRequire('@livekit/local-inference') as typeof import('@livekit/local-inference');
+    return nativeMod;
+  } catch (err) {
+    log().warn( // load failure is non-fatal: warn (only the error message is logged) and report the module as unavailable
+      { err: err instanceof Error ? err.message : String(err) },
+      '@livekit/local-inference native binding not loadable; local VAD/EOT paths disabled',
+    );
+    return undefined;
+  }
+}
+
+/** @internal Returns the `@livekit/local-inference` module, attempting the load on the first call and reusing the cached result afterwards; `undefined` if it could not be loaded. */
+export function _getLocalInferenceModule(): typeof import('@livekit/local-inference') | undefined {
+  return getNative();
+}