disability-assistant/modal_app.py at main · githubbermoon/disability-assistant · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
import modal
import io
import os
from typing import Dict
from PIL import Image

# Modal application object for this file. The classes defined further down are
# registered on it through the `@app.cls(...)` decorator.
# A secret built by `modal.Secret.from_dotenv()` is attached at the app level
# (presumably sourced from a local `.env` file — confirm against the Modal
# documentation for the installed version).
app = modal.App("accessibility-companion", secrets=[modal.Secret.from_dotenv()])

# Container image passed as `image=image` to every `@app.cls` class in this file.
# A CUDA/cuDNN base image is used so that system-level GPU libraries (cuDNN) are
# present; Python 3.12 is requested on top of it via `add_python`.
# NOTE(review): only "paddlepaddle-gpu" is version-pinned; every other package
# resolves to whatever pip selects when the image is built.
# NOTE(review): "paddleocr", "paddlepaddle-gpu" and "google-generativeai" are not
# imported by any class visible in this file — confirm they are still required.
image = (
    modal.Image.from_registry("nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04", add_python="3.12")
    .pip_install(
        "huggingface_hub",
        "transformers",
        "torch",
        "numpy",
        "pillow",
        "scipy",
        "accelerate",
        "sentencepiece",
        "protobuf",
        "torchvision",
        "bitsandbytes",
        "paddleocr",
        "paddlepaddle-gpu==2.6.1",
        "faster-whisper",
        "google-generativeai",
        "optimum[onnxruntime-gpu]",
        "pyannote.audio"
    )
)

@app.cls(image=image, gpu="any")
class ModelInference:
    """Modal class serving object detection (OWL-ViT) and text simplification (BART).

    Both models are loaded once per container in :meth:`enter` and are kept on
    the CPU; ``self.device`` is only used for the start-up log line.
    """

    @modal.enter()
    def enter(self):
        """Load the OWL-ViT detector and the BART summarizer at container start."""
        from transformers import (
            OwlViTProcessor,
            OwlViTForObjectDetection,
            BartTokenizer,
            BartForConditionalGeneration,
        )
        import torch

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading other models on {self.device}...")

        # Object Detection Model
        self.od_processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
        # Keep detection on CPU to free GPU memory
        self.od_model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to("cpu")

        # Text Simplification Model
        self.sim_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
        # Keep summarizer on CPU to free GPU memory
        self.sim_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to("cpu")

        print("Models loaded.")

    @modal.method()
    def process_image_ocr(self, image_bytes: bytes) -> str:
        """Deprecated stub kept for callers; always returns a fixed notice.

        Args:
            image_bytes: Ignored.

        Returns:
            A constant message stating that OCR is handled locally.
        """
        # Deprecated: OCR is now handled locally in app.py via Gemini API
        return "OCR should be handled locally."

    @modal.method()
    def detect_objects(self, image_bytes: bytes) -> dict:
        """Detect a fixed set of common objects in an encoded image.

        Args:
            image_bytes: Encoded image data readable by ``PIL.Image.open``.

        Returns:
            ``{"objects": [...]}`` where each entry has ``"label"`` (one of the
            text queries), ``"score"`` (rounded to 3 decimals) and ``"box"``
            (four coordinates rounded to 2 decimals, rescaled to the input
            image size).
        """
        from PIL import Image
        import torch

        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        # OwlViT requires text queries. We'll use a default list of common objects.
        text_queries = ["person", "car", "dog", "cat", "door", "chair", "table", "phone", "laptop", "book"]
        inputs = self.od_processor(text=text_queries, images=image, return_tensors="pt")

        with torch.no_grad():
            outputs = self.od_model(**inputs)

        # PIL reports (width, height); the post-processor wants (height, width)
        # per image, shaped [batch_size, 2].
        target_sizes = torch.tensor([image.size[::-1]])
        # Use post_process_grounded_object_detection as per warning
        results = self.od_processor.post_process_grounded_object_detection(
            outputs,
            threshold=0.1,
            target_sizes=target_sizes,
            text_labels=[text_queries] # Note: requires list of lists for batch
        )[0]

        # `labels` holds indices into `text_queries`.
        detected_objects = [
            {
                "label": text_queries[label.item()],
                "score": round(score.item(), 3),
                "box": [round(coord, 2) for coord in box.tolist()],
            }
            for score, label, box in zip(results["scores"], results["labels"], results["boxes"])
        ]

        return {"objects": detected_objects}

    @modal.method()
    def simplify_text(self, text: str) -> str:
        """Summarize ``text`` with the BART model loaded in :meth:`enter`.

        Args:
            text: Text to shorten. Input longer than 1024 tokens is truncated.

        Returns:
            The generated summary (at most 100 generated tokens), or ``""``
            when ``text`` is empty or whitespace-only.
        """
        if not text or not text.strip():
            # Nothing to summarize; avoid generating text from an empty prompt.
            return ""

        # `truncation=True` is required for `max_length` to take effect;
        # without it an over-long input is passed to the model untruncated.
        inputs = self.sim_tokenizer(
            [text], max_length=1024, truncation=True, return_tensors="pt"
        )
        summary_ids = self.sim_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            num_beams=2,
            min_length=0,
            max_length=100,
        )
        return self.sim_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

@app.cls(image=image, gpu="T4")
class HearingModel:
    """Modal class that transcribes speech with faster-whisper (distil-large-v2)."""

    @modal.enter()
    def enter(self):
        """Load the Whisper model once per container, on GPU when available."""
        from faster_whisper import WhisperModel
        import torch

        device = "cuda" if torch.cuda.is_available() else "cpu"
        # float16 on GPU, int8 on CPU.
        compute_type = "float16" if device == "cuda" else "int8"

        print(f"Loading Distil-Whisper on {device} ({compute_type})...")
        # Load distil-large-v2 as requested
        self.model = WhisperModel("distil-large-v2", device=device, compute_type=compute_type)
        print("✅ Hearing Model loaded.")

    @modal.method()
    def transcribe_audio(self, audio_bytes: bytes) -> str:
        """Transcribe an encoded audio payload to text.

        Args:
            audio_bytes: Encoded audio, handed to faster-whisper as a
                file-like object.

        Returns:
            The transcription with segments joined by single spaces, or ``""``
            when the payload is empty or transcription fails (best-effort: the
            error is logged, not raised).
        """
        import io
        import traceback

        print(f"🎤 [HearingModel] Received {len(audio_bytes)} bytes of audio")

        if not audio_bytes:
            # An empty payload cannot be decoded; skip the decoder entirely.
            print("⚠️ [HearingModel] Empty audio payload; nothing to transcribe")
            return ""

        # faster-whisper accepts a file-like object or path.
        # Voice-activity filtering is requested explicitly via vad_filter=True.
        try:
            segments, _info = self.model.transcribe(
                io.BytesIO(audio_bytes),
                beam_size=1,
                vad_filter=True,
                vad_parameters=dict(min_silence_duration_ms=500)
            )

            # `segments` is consumed here, inside the `try`, so that errors
            # raised while iterating it are caught as well.
            # Each segment is stripped before joining so that segment texts
            # carrying their own leading/trailing whitespace do not produce
            # doubled spaces in the result.
            pieces = (segment.text.strip() for segment in segments)
            text = " ".join(piece for piece in pieces if piece)
            print(f"✅ [HearingModel] Transcription result: {text}")
            return text
        except Exception as e:
            # Deliberate best-effort boundary: log and return an empty result.
            print(f"❌ [HearingModel] Transcription error: {e}")
            traceback.print_exc()
            return ""

@app.cls(image=image, gpu="T4")
class EmotionModel:
    """Modal class that classifies the emotion of a WAV clip with an ONNX model."""

    def _load_onnx_model(self, source, export=False):
        """Load the ONNX audio classifier from ``source``, CUDA first, then CPU.

        Args:
            source: Hub model id or local directory passed to ``from_pretrained``.
            export: When true, ``export=True`` is forwarded so the checkpoint
                is converted to ONNX while loading.

        Returns:
            The loaded ``ORTModelForAudioClassification``.
        """
        from optimum.onnxruntime import ORTModelForAudioClassification

        # Only forward `export` when requested, matching the plain cached load.
        extra = {"export": True} if export else {}
        try:
            return ORTModelForAudioClassification.from_pretrained(
                source, provider="CUDAExecutionProvider", **extra
            )
        except ValueError:
            print("CUDA not available, using CPU for ONNX")
            return ORTModelForAudioClassification.from_pretrained(
                source, provider="CPUExecutionProvider", **extra
            )

    def _wav_to_mono_16k(self, audio_bytes):
        """Decode WAV bytes into a mono float array sampled at 16 kHz.

        Raises:
            ValueError: If the WAV file holds no samples (also whatever
                ``scipy.io.wavfile.read`` raises for invalid data).
        """
        import io
        import numpy as np
        import scipy.io.wavfile as wav

        rate, data = wav.read(io.BytesIO(audio_bytes))
        if data.size == 0:
            raise ValueError("WAV payload contains no samples")

        # Scale integer PCM to [-1, 1); 8-bit WAV data is unsigned and centred
        # on 128, other integer widths are signed.
        if data.dtype == np.uint8:
            data = (data.astype(np.float32) - 128.0) / 128.0
        elif np.issubdtype(data.dtype, np.integer):
            data = data.astype(np.float32) / float(np.iinfo(data.dtype).max + 1)
        else:
            data = data.astype(np.float32)

        # Multi-channel files decode as (samples, channels); average the
        # channels so the feature extractor receives a single 1-D signal.
        if data.ndim > 1:
            data = data.mean(axis=1)

        if rate != 16000:
            from scipy import signal
            number_of_samples = round(len(data) * float(16000) / rate)
            data = signal.resample(data, number_of_samples)

        return data

    @modal.enter()
    def enter(self):
        """Load (or export and cache) the ONNX emotion model and its feature extractor."""
        from transformers import AutoFeatureExtractor
        import os

        model_id = "BilalHasan/distilhubert-finetuned-ravdess"
        print("Loading Emotion Model (ONNX)...")

        # Use /tmp for writable export path
        save_dir = "/tmp/emotion_onnx"

        if os.path.exists(save_dir):
            print("Loading cached ONNX model...")
            self.model = self._load_onnx_model(save_dir)
            self.feature_extractor = AutoFeatureExtractor.from_pretrained(save_dir)
        else:
            print(f"Exporting {model_id} to ONNX...")
            self.model = self._load_onnx_model(model_id, export=True)
            self.model.save_pretrained(save_dir)
            self.feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
            self.feature_extractor.save_pretrained(save_dir)

        print("✅ Emotion Model loaded.")

    @modal.method()
    def predict_emotion(self, audio_bytes: bytes) -> str:
        """Predict the dominant emotion in a WAV clip.

        Args:
            audio_bytes: Contents of a WAV file. Any sample rate, integer or
                float samples, mono or multi-channel.

        Returns:
            ``"<label> (<score>)"`` with the score formatted to two decimals,
            or ``"Error"`` if decoding or inference fails (the error is logged,
            not raised).
        """
        import torch
        import traceback

        print(f"😊 [EmotionModel] Received {len(audio_bytes)} bytes of audio")

        try:
            # The feature extractor is called with sampling_rate=16000, so the
            # clip is decoded, downmixed and resampled to that rate first.
            data = self._wav_to_mono_16k(audio_bytes)

            inputs = self.feature_extractor(data, sampling_rate=16000, return_tensors="pt")

            # Run inference
            outputs = self.model(**inputs)

            # Pick the highest-probability class and map it to its label.
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            pred_id = torch.argmax(probs, dim=-1).item()
            label = self.model.config.id2label[pred_id]
            score = probs[0][pred_id].item()

            result = f"{label} ({score:.2f})"
            print(f"✅ [EmotionModel] Emotion result: {result}")
            return result
        except Exception as e:
            # Deliberate best-effort boundary: log and return a sentinel.
            print(f"❌ [EmotionModel] Emotion Error: {e}")
            traceback.print_exc()
            return "Error"

@app.cls(image=image, gpu="T4", secrets=[modal.Secret.from_name("huggingface-secret")])
class DiarizationModel:
    """Modal class that reports the most recent speaker in a WAV chunk (pyannote)."""

    @modal.enter()
    def enter(self):
        """Load the pyannote diarization pipeline, moving it to GPU when available.

        Raises:
            ValueError: If ``HF_TOKEN`` is missing from the environment.
            RuntimeError: If the pipeline loader returns ``None``.
        """
        from pyannote.audio import Pipeline
        import os

        hf_token = os.environ.get("HF_TOKEN")
        if not hf_token:
            raise ValueError("HF_TOKEN not found in environment. Please set it in Modal secrets.")

        print("Loading Speaker Diarization Pipeline...")
        # Load the pyannote speaker-diarization-3.1 pipeline
        # NOTE(review): pyannote.audio is unpinned in the image; the auth
        # keyword (`token=` vs `use_auth_token=`) depends on the installed
        # release — confirm against the version actually built.
        self.pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            token=hf_token
        )
        if self.pipeline is None:
            # Fail with a clear message instead of an AttributeError below.
            raise RuntimeError(
                "Failed to load pyannote/speaker-diarization-3.1; "
                "check HF_TOKEN and access to the model."
            )

        # Move to GPU if available
        import torch
        if torch.cuda.is_available():
            self.pipeline.to(torch.device("cuda"))
            print("✅ Diarization Pipeline loaded on GPU.")
        else:
            print("✅ Diarization Pipeline loaded on CPU.")

    @modal.method()
    def diarize_chunk(self, audio_bytes: bytes) -> str:
        """Return the label of the speaker whose segment ends last in the chunk.

        Args:
            audio_bytes: Contents of a WAV file. Integer or float samples,
                mono or multi-channel.

        Returns:
            The speaker label of the segment with the latest end time,
            ``"No speaker detected"`` when the pipeline yields no segments, or
            ``"Error"`` if decoding or diarization fails (the error is logged,
            not raised).
        """
        import io
        import traceback
        import scipy.io.wavfile as wav
        import numpy as np
        import torch

        print(f"👥 [DiarizationModel] Received {len(audio_bytes)} bytes of audio")

        try:
            # Decode WAV bytes to numpy array
            rate, data = wav.read(io.BytesIO(audio_bytes))

            # Scale integer PCM to [-1, 1); 8-bit WAV data is unsigned and
            # centred on 128, other integer widths are signed. `astype` always
            # returns a fresh writable array, so no separate copy is needed
            # before handing the buffer to torch.
            if data.dtype == np.uint8:
                data = (data.astype(np.float32) - 128.0) / 128.0
            elif np.issubdtype(data.dtype, np.integer):
                data = data.astype(np.float32) / float(np.iinfo(data.dtype).max + 1)
            else:
                data = data.astype(np.float32)
            waveform = torch.from_numpy(data)

            # The pipeline input must be shaped (channels, samples).
            if waveform.dim() == 1:
                waveform = waveform.unsqueeze(0)
            else:
                # Multi-channel files decode as (samples, channels): transpose.
                waveform = waveform.T.contiguous()

            # Create the required dictionary format for pyannote
            audio_dict = {
                "waveform": waveform,
                "sample_rate": rate
            }

            print("👥 [DiarizationModel] Running diarization on in-memory audio...")

            # Run diarization with in-memory audio
            result_obj = self.pipeline(audio_dict)

            print(f"👥 [DiarizationModel] Result type: {type(result_obj).__name__}")

            # The result shape varies: accept an object exposing
            # `speaker_diarization`, one exposing `annotation`, or an
            # annotation-like object itself (anything with `itertracks`).
            annotation = None
            if hasattr(result_obj, "speaker_diarization"):
                print("👥 [DiarizationModel] Using speaker_diarization from DiarizeOutput")
                annotation = result_obj.speaker_diarization
            elif hasattr(result_obj, "annotation"):
                print("👥 [DiarizationModel] Using annotation field from result")
                annotation = result_obj.annotation
            elif hasattr(result_obj, "itertracks"):
                print("👥 [DiarizationModel] Using Annotation object directly")
                annotation = result_obj

            # Track the segment that ends last; `>=` keeps the later-yielded
            # segment on ties.
            latest_end = None
            speaker = None
            segment_count = 0
            if annotation is not None and hasattr(annotation, "itertracks"):
                for segment, _track, label in annotation.itertracks(yield_label=True):
                    segment_count += 1
                    if latest_end is None or segment.end >= latest_end:
                        latest_end = segment.end
                        speaker = label

            if segment_count:
                print(f"✅ [DiarizationModel] Speaker result: {speaker} ({segment_count} segments)")
                return speaker

            print("⚠️ [DiarizationModel] No speaker detected")
            return "No speaker detected"

        except Exception as e:
            # Deliberate best-effort boundary: log and return a sentinel.
            print(f"❌ [DiarizationModel] Diarization Error: {e}")
            traceback.print_exc()
            return "Error"