ProjectTech4DevAI · Prajna1999 · May 27, 2026 · May 28, 2026 · May 31, 2026 · Jun 1, 2026
diff --git a/backend/app/alembic/versions/064_add_anthropic_google_vertex_to_provider_enum.py b/backend/app/alembic/versions/064_add_anthropic_google_vertex_to_provider_enum.py
@@ -0,0 +1,115 @@
+"""add anthropic + google-vertex to provider_enum and seed test model_config rows
+
+Revision ID: 064
+Revises: 063
+Create Date: 2026-05-28 00:00:00.000000
+
+"""
+
+from alembic import op
+
+
+revision = "064"  # this migration's ID; matches "Revision ID" in the docstring
+down_revision = "063"  # the migration this one revises (its parent)
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ALTER TYPE ... ADD VALUE cannot run inside a transaction block; use
+    # autocommit per existing pattern (see migration 056). The added values
+    # are visible to subsequent statements once the autocommit_block exits.
+    with op.get_context().autocommit_block():
+        op.execute(
+            "ALTER TYPE global.provider_enum ADD VALUE IF NOT EXISTS 'anthropic'"
+        )
+        op.execute(
+            "ALTER TYPE global.provider_enum ADD VALUE IF NOT EXISTS 'google-vertex'"
+        )
+
+    # Pass-through seed rows for testing. Pricing values are placeholders;
+    # revise once real cost data is available.
+    op.execute(
+        """
+        INSERT INTO global.model_config
+            (provider, model_name, completion_type, config, input_modalities, output_modalities, pricing, is_active, inserted_at, updated_at)
+        VALUES
+            -- Anthropic text models
+            ('anthropic', 'claude-opus-4-7', 'text',
+                '{"temperature": {"type": "float", "default": 1.0, "min": 0.0, "max": 1.0, "description": "Sampling temperature."}}',
+                '{TEXT,IMAGE,PDF}', '{TEXT}',
+                '{"response": {"input_token_cost": 15.0, "output_token_cost": 75.0}, "batch": {"input_token_cost": 7.5, "output_token_cost": 37.5}}',
+                true, NOW(), NOW()),
+            ('anthropic', 'claude-sonnet-4-6', 'text',
+                '{"temperature": {"type": "float", "default": 1.0, "min": 0.0, "max": 1.0, "description": "Sampling temperature."}}',
+                '{TEXT,IMAGE,PDF}', '{TEXT}',
+                '{"response": {"input_token_cost": 3.0, "output_token_cost": 15.0}, "batch": {"input_token_cost": 1.5, "output_token_cost": 7.5}}',
+                true, NOW(), NOW()),
+            ('anthropic', 'claude-haiku-4-5-20251001', 'text',
+                '{"temperature": {"type": "float", "default": 1.0, "min": 0.0, "max": 1.0, "description": "Sampling temperature."}}',
+                '{TEXT,IMAGE,PDF}', '{TEXT}',
+                '{"response": {"input_token_cost": 1.0, "output_token_cost": 5.0}, "batch": {"input_token_cost": 0.5, "output_token_cost": 2.5}}',
+                true, NOW(), NOW()),
+            -- Google Vertex STT models (Gemini 3.x family — GA per
+            -- https://docs.cloud.google.com/gemini-enterprise-agent-platform/models/google-models)
+            ('google-vertex', 'gemini-3.1-pro-preview', 'stt',
+                '{"thinking_level": {"type": "enum", "default": "high", "options": ["low", "medium", "high"], "description": "Max reasoning depth before output. high = best quality, low = faster/cheaper."}}',
+                '{AUDIO}', '{TEXT}',
+                '{"response": {"input_token_cost": 2.0, "output_token_cost": 12.0}, "audio": {"input_token_cost": 3.5, "output_token_cost": 12.0}}',
+                true, NOW(), NOW()),
+            ('google-vertex', 'gemini-3-pro', 'stt',
+                '{"thinking_level": {"type": "enum", "default": "high", "options": ["low", "medium", "high"], "description": "Max reasoning depth before output."}}',
+                '{AUDIO}', '{TEXT}',
+                '{"response": {"input_token_cost": 1.5, "output_token_cost": 10.0}, "audio": {"input_token_cost": 3.0, "output_token_cost": 10.0}}',
+                true, NOW(), NOW()),
+            ('google-vertex', 'gemini-3.5-flash', 'stt',
+                '{"thinking_level": {"type": "enum", "default": "high", "options": ["minimal", "low", "medium", "high"], "description": "Max reasoning depth before output."}}',
+                '{AUDIO}', '{TEXT}',
+                '{"response": {"input_token_cost": 0.6, "output_token_cost": 3.5}, "audio": {"input_token_cost": 1.2, "output_token_cost": 3.5}}',
+                true, NOW(), NOW()),
+            ('google-vertex', 'gemini-3-flash-preview', 'stt',
+                '{"thinking_level": {"type": "enum", "default": "high", "options": ["minimal", "low", "medium", "high"], "description": "Max reasoning depth before output."}}',
+                '{AUDIO}', '{TEXT}',
+                '{"response": {"input_token_cost": 0.5, "output_token_cost": 3.0}, "audio": {"input_token_cost": 1.0, "output_token_cost": 3.0}}',
+                true, NOW(), NOW()),
+            ('google-vertex', 'gemini-3.1-flash-lite', 'stt',
+                '{"temperature": {"type": "float", "default": 0.0, "min": 0.0, "max": 2.0, "description": "Controls randomness. Lower = more deterministic."}}',
+                '{AUDIO}', '{TEXT}',
+                '{"response": {"input_token_cost": 0.1, "output_token_cost": 0.4}, "audio": {"input_token_cost": 0.3, "output_token_cost": 0.4}}',
+                true, NOW(), NOW()),
+            ('google-vertex', 'gemini-2.5-flash', 'stt',
+                '{"temperature": {"type": "float", "default": 0.0, "min": 0.0, "max": 2.0, "description": "Controls randomness. Lower = more deterministic."}}',
+                '{AUDIO}', '{TEXT}',
+                '{"response": {"input_token_cost": 0.3, "output_token_cost": 2.5}, "audio": {"input_token_cost": 1.0, "output_token_cost": 2.5}}',
+                true, NOW(), NOW()),
+            ('google-vertex', 'gemini-2.5-pro', 'stt',
+                '{"temperature": {"type": "float", "default": 0.0, "min": 0.0, "max": 2.0, "description": "Controls randomness. Lower = more deterministic."}}',
+                '{AUDIO}', '{TEXT}',
+                '{"response": {"input_token_cost": 1.25, "output_token_cost": 10.0}, "audio": {"input_token_cost": 3.5, "output_token_cost": 10.0}}',
+                true, NOW(), NOW()),
+            -- Google Vertex TTS models
+            ('google-vertex', 'gemini-2.5-flash-preview-tts', 'tts',
+                '{"voice": {"type": "enum", "default": "Kore", "options": ["Aoede", "Charon", "Fenrir", "Kore", "Puck"], "description": "TTS voice."}}',
+                '{TEXT}', '{AUDIO}',
+                '{"response": {"input_token_cost": 0.5, "output_token_cost": 10.0}, "audio": {"input_token_cost": 0.5, "output_token_cost": 10.0}}',
+                true, NOW(), NOW()),
+            ('google-vertex', 'gemini-2.5-pro-preview-tts', 'tts',
+                '{"voice": {"type": "enum", "default": "Kore", "options": ["Aoede", "Charon", "Fenrir", "Kore", "Puck"], "description": "TTS voice."}}',
+                '{TEXT}', '{AUDIO}',
+                '{"response": {"input_token_cost": 1.0, "output_token_cost": 20.0}, "audio": {"input_token_cost": 1.0, "output_token_cost": 20.0}}',
+                true, NOW(), NOW())
+        ON CONFLICT (provider, model_name) DO NOTHING
+        """
+    )
+
+
+def downgrade():
+    op.execute(
+        """
+        DELETE FROM global.model_config
+        WHERE provider IN ('anthropic', 'google-vertex')
+        """
+    )
+    # Enum value removal requires rebuilding the type and re-pointing every
+    # referencing column. Skipped — see migrations 035 / 056 for the same
+    # convention.
diff --git a/backend/app/core/audio_utils.py b/backend/app/core/audio_utils.py
@@ -1,18 +1,56 @@
-"""
-Audio processing utilities for format conversion.
+"""Audio processing utilities: format conversion + STT input carrier."""
 
-This module provides utilities for converting audio between different formats,
-particularly for TTS output post-processing.
-"""
 import io
 import logging
+import os
+import tempfile
 import wave
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import Iterator
 
 from pydub import AudioSegment
 
-
 logger = logging.getLogger(__name__)
 
+_MIME_TO_EXT = {  # audio MIME type -> file suffix; looked up by AudioRef.to_path
+    "audio/wav": ".wav",
+    "audio/mpeg": ".mp3",  # "audio/mpeg" and "audio/mp3" share the ".mp3" suffix
+    "audio/mp3": ".mp3",
+    "audio/ogg": ".ogg",
+    "audio/flac": ".flac",
+    "audio/webm": ".webm",
+    "audio/mp4": ".mp4",
+    "audio/m4a": ".m4a",
+    "audio/aac": ".aac",
+    "audio/aiff": ".aiff",
+}  # types missing here get the generic ".audio" suffix in AudioRef.to_path
+
+
+@dataclass(frozen=True)
+class AudioRef:
+    """In-memory STT input. Providers consume ``bytes_`` directly or call
+    ``to_path()`` when an SDK needs a filesystem path. Temp files are owned
+    by the provider's ``with`` scope — no framework-level cleanup needed.
+    """
+
+    bytes_: bytes
+    mime_type: str = "audio/wav"
+
+    @contextmanager
+    def to_path(self) -> Iterator[str]:
+        ext = _MIME_TO_EXT.get(self.mime_type, ".audio")
+        tmp = tempfile.NamedTemporaryFile(suffix=ext, delete=False, prefix="audio_")
+        try:
+            tmp.write(self.bytes_)
+            tmp.close()
+            yield tmp.name
+        finally:
+            try:
+                os.unlink(tmp.name)
+            except OSError:
+                pass
+
 
 def convert_pcm_to_mp3(
     pcm_bytes: bytes, sample_rate: int = 24000

diff --git a/backend/app/core/cloud/__init__.py b/backend/app/core/cloud/__init__.py
@@ -4,4 +4,5 @@
     CloudStorage,
     CloudStorageError,
     get_cloud_storage,
+    upload_audio_to_gcs,
 )