-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodal_app.py
More file actions
346 lines (292 loc) · 13.2 KB
/
Copy pathmodal_app.py
File metadata and controls
346 lines (292 loc) · 13.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
import modal
import io
import os
from typing import Dict
from PIL import Image
# Modal application object; its secrets come from modal.Secret.from_dotenv().
app = modal.App(
    "accessibility-companion",
    secrets=[modal.Secret.from_dotenv()],
)

# Python packages installed into the container image with pip.
_PIP_PACKAGES = (
    "huggingface_hub",
    "transformers",
    "torch",
    "numpy",
    "pillow",
    "scipy",
    "accelerate",
    "sentencepiece",
    "protobuf",
    "torchvision",
    "bitsandbytes",
    "paddleocr",
    "paddlepaddle-gpu==2.6.1",
    "faster-whisper",
    "google-generativeai",
    "optimum[onnxruntime-gpu]",
    "pyannote.audio",
)

# Container image with the dependencies above.
# A CUDA-enabled base image is used to ensure system-level libraries (cuDNN)
# are present.
image = modal.Image.from_registry(
    "nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04",
    add_python="3.12",
).pip_install(*_PIP_PACKAGES)
@app.cls(image=image, gpu="any")
class ModelInference:
    """Modal class exposing object detection (OWL-ViT) and summarization (BART).

    Both models are loaded once per container in ``enter`` and are explicitly
    placed on the CPU.
    """

    @modal.enter()
    def enter(self):
        """Load the detection and summarization models when the container starts."""
        from transformers import (
            OwlViTProcessor,
            OwlViTForObjectDetection,
            BartTokenizer,
            BartForConditionalGeneration,
        )
        import torch

        # NOTE(review): ``self.device`` is only used in the log line below;
        # both models are moved to the CPU regardless of GPU availability.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading other models on {self.device}...")

        # Object Detection Model
        self.od_processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
        # Keep detection on CPU to free GPU memory
        self.od_model = OwlViTForObjectDetection.from_pretrained(
            "google/owlvit-base-patch32"
        ).to("cpu")

        # Text Simplification Model
        self.sim_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
        # Keep summarizer on CPU to free GPU memory
        self.sim_model = BartForConditionalGeneration.from_pretrained(
            "facebook/bart-large-cnn"
        ).to("cpu")
        print("Models loaded.")

    @modal.method()
    def process_image_ocr(self, image_bytes: bytes) -> str:
        """Deprecated stub; always returns a fixed notice string.

        OCR is now handled locally in app.py via the Gemini API, so
        ``image_bytes`` is ignored.
        """
        return "OCR should be handled locally."

    @modal.method()
    def detect_objects(
        self,
        image_bytes: bytes,
        text_queries: list[str] | None = None,
        threshold: float = 0.1,
    ) -> dict:
        """Detect objects in an encoded image using text-conditioned OWL-ViT.

        Args:
            image_bytes: Encoded image data readable by ``PIL.Image.open``.
            text_queries: Object names to look for. When omitted or empty, a
                default list of common objects is used.
            threshold: Minimum score passed to the post-processing step.

        Returns:
            ``{"objects": [...]}`` where each entry has ``"label"`` (the
            matching query string), ``"score"`` (rounded to 3 decimals) and
            ``"box"`` (four coordinates rounded to 2 decimals, rescaled to the
            input image size).
        """
        from PIL import Image
        import torch

        # OwlViT requires text queries; fall back to common objects by default.
        if text_queries:
            queries = list(text_queries)
        else:
            queries = [
                "person", "car", "dog", "cat", "door",
                "chair", "table", "phone", "laptop", "book",
            ]

        # Named ``picture`` so it does not shadow the module-level Modal ``image``.
        picture = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        inputs = self.od_processor(text=queries, images=picture, return_tensors="pt")
        with torch.no_grad():
            outputs = self.od_model(**inputs)

        # Target image sizes (height, width) to rescale box predictions [batch_size, 2];
        # PIL reports (width, height), hence the reversal.
        target_sizes = torch.tensor([picture.size[::-1]])
        results = self.od_processor.post_process_grounded_object_detection(
            outputs,
            threshold=threshold,
            target_sizes=target_sizes,
            text_labels=[queries],  # list of lists: one query list per batch item
        )[0]

        detected_objects = [
            {
                "label": queries[label.item()],
                "score": round(score.item(), 3),
                "box": [round(coord, 2) for coord in box.tolist()],
            }
            for score, label, box in zip(
                results["scores"], results["labels"], results["boxes"]
            )
        ]
        return {"objects": detected_objects}

    @modal.method()
    def simplify_text(self, text: str) -> str:
        """Summarize ``text`` with BART and return the decoded summary.

        Input is truncated to 1024 tokens so that long texts cannot exceed the
        length requested from the tokenizer. Blank input returns an empty
        string without running the model.
        """
        if not text or not text.strip():
            return ""

        # ``max_length`` only takes effect when truncation is enabled.
        inputs = self.sim_tokenizer(
            [text],
            max_length=1024,
            truncation=True,
            return_tensors="pt",
        )
        summary_ids = self.sim_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            num_beams=2,
            min_length=0,
            max_length=100,
        )
        return self.sim_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
@app.cls(image=image, gpu="T4")
class HearingModel:
    """Modal class that transcribes speech with a faster-whisper model."""

    @modal.enter()
    def enter(self):
        """Load the Whisper model once when the container starts."""
        from faster_whisper import WhisperModel
        import torch

        device = "cuda" if torch.cuda.is_available() else "cpu"
        # float16 on GPU, int8 on CPU.
        compute_type = "float16" if device == "cuda" else "int8"
        print(f"Loading Distil-Whisper on {device} ({compute_type})...")
        # Load distil-large-v2 as requested
        self.model = WhisperModel("distil-large-v2", device=device, compute_type=compute_type)
        print("✅ Hearing Model loaded.")

    @modal.method()
    def transcribe_audio(self, audio_bytes: bytes) -> str:
        """Transcribe encoded audio and return the stripped text.

        Args:
            audio_bytes: Encoded audio, handed to the model as a file-like object.

        Returns:
            The transcription with segments joined by single spaces, or an
            empty string when the payload is empty or transcription fails
            (the error and traceback are printed, not raised).
        """
        import traceback

        print(f"🎤 [HearingModel] Received {len(audio_bytes)} bytes of audio")
        if not audio_bytes:
            # Nothing to decode; same result as the failure path, minus the noise.
            return ""

        try:
            # faster-whisper accepts a file-like object or path.
            # Voice-activity filtering is enabled explicitly here.
            segments, _info = self.model.transcribe(
                io.BytesIO(audio_bytes),
                beam_size=1,
                vad_filter=True,
                vad_parameters=dict(min_silence_duration_ms=500),
            )
            # Consume the segments inside the ``try`` so that any error raised
            # while iterating them is handled below as well.
            text = " ".join(segment.text for segment in segments).strip()
            print(f"✅ [HearingModel] Transcription result: {text}")
            return text
        except Exception as e:
            # Deliberate best-effort: callers receive "" rather than an exception.
            print(f"❌ [HearingModel] Transcription error: {e}")
            traceback.print_exc()
            return ""
def _emotion_load_onnx(source, **kwargs):
    """Load an ONNX audio classifier from ``source``, trying CUDA before CPU."""
    from optimum.onnxruntime import ORTModelForAudioClassification

    try:
        return ORTModelForAudioClassification.from_pretrained(
            source,
            provider="CUDAExecutionProvider",
            **kwargs,
        )
    except ValueError:
        print("CUDA not available, using CPU for ONNX")
        return ORTModelForAudioClassification.from_pretrained(
            source,
            provider="CPUExecutionProvider",
            **kwargs,
        )


def _emotion_wav_to_mono_float32(audio_bytes):
    """Decode WAV bytes into ``(sample_rate, mono float32 samples)``."""
    import numpy as np
    import scipy.io.wavfile as wav

    rate, data = wav.read(io.BytesIO(audio_bytes))

    if data.dtype == np.uint8:
        # 8-bit PCM is unsigned and centred on 128.
        data = (data.astype(np.float32) - 128.0) / 128.0
    elif np.issubdtype(data.dtype, np.integer):
        # Signed PCM: scale by the type's full range (32768 for int16).
        data = data.astype(np.float32) / float(np.iinfo(data.dtype).max + 1)
    else:
        data = data.astype(np.float32)

    # Multi-channel files decode as (samples, channels); average down to mono
    # so the feature extractor receives a single 1-D waveform.
    if data.ndim > 1:
        data = data.mean(axis=1)
    return rate, data


@app.cls(image=image, gpu="T4")
class EmotionModel:
    """Modal class that classifies the emotion of a WAV clip with an ONNX model."""

    @modal.enter()
    def enter(self):
        """Export (or reload) the ONNX emotion model and its feature extractor."""
        from transformers import AutoFeatureExtractor

        model_id = "BilalHasan/distilhubert-finetuned-ravdess"
        print("Loading Emotion Model (ONNX)...")

        # Use /tmp for writable export path
        save_dir = "/tmp/emotion_onnx"

        if not os.path.exists(save_dir):
            # First run in this container: export to ONNX and cache on disk.
            print(f"Exporting {model_id} to ONNX...")
            self.model = _emotion_load_onnx(model_id, export=True)
            self.model.save_pretrained(save_dir)
            self.feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
            self.feature_extractor.save_pretrained(save_dir)
        else:
            print("Loading cached ONNX model...")
            self.model = _emotion_load_onnx(save_dir)
            self.feature_extractor = AutoFeatureExtractor.from_pretrained(save_dir)
        print("✅ Emotion Model loaded.")

    @modal.method()
    def predict_emotion(self, audio_bytes: bytes) -> str:
        """Predict the emotion label for a WAV clip.

        Args:
            audio_bytes: Contents of a WAV file (any sample rate; mono or
                multi-channel).

        Returns:
            ``"<label> (<score>)"`` with the score formatted to two decimals,
            or ``"Error"`` when decoding or inference fails (the error and
            traceback are printed, not raised).
        """
        import torch
        import traceback

        print(f"😊 [EmotionModel] Received {len(audio_bytes)} bytes of audio")
        try:
            # Note: this assumes the bytes are a valid WAV file.
            rate, samples = _emotion_wav_to_mono_float32(audio_bytes)

            # DistilHuBERT expects 16kHz; resample when the file differs.
            if rate != 16000:
                from scipy import signal

                target_len = round(len(samples) * float(16000) / rate)
                samples = signal.resample(samples, target_len)

            inputs = self.feature_extractor(samples, sampling_rate=16000, return_tensors="pt")
            outputs = self.model(**inputs)

            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            pred_id = torch.argmax(probs, dim=-1).item()
            label = self.model.config.id2label[pred_id]
            score = probs[0][pred_id].item()

            result = f"{label} ({score:.2f})"
            print(f"✅ [EmotionModel] Emotion result: {result}")
            return result
        except Exception as e:
            # Deliberate best-effort: callers receive "Error" rather than an exception.
            print(f"❌ [EmotionModel] Emotion Error: {e}")
            traceback.print_exc()
            return "Error"
def _diarization_wav_to_waveform(audio_bytes):
    """Decode WAV bytes into ``(waveform, sample_rate)``.

    ``waveform`` is a float32 torch tensor shaped (channels, samples).
    """
    import numpy as np
    import scipy.io.wavfile as wav
    import torch

    rate, data = wav.read(io.BytesIO(audio_bytes))

    # ``astype`` returns a fresh, writable array, which also avoids the torch
    # warning about non-writable buffers.
    if data.dtype == np.uint8:
        # 8-bit PCM is unsigned and centred on 128.
        data = (data.astype(np.float32) - 128.0) / 128.0
    elif np.issubdtype(data.dtype, np.integer):
        # Signed PCM: scale by the type's full range (32768 for int16).
        data = data.astype(np.float32) / float(np.iinfo(data.dtype).max + 1)
    else:
        data = data.astype(np.float32)

    if data.ndim == 1:
        # Mono: add the channel dimension.
        data = data[np.newaxis, :]
    else:
        # Multi-channel files decode as (samples, channels); transpose so the
        # channel dimension comes first.
        data = np.ascontiguousarray(data.T)

    return torch.from_numpy(data).float(), rate


@app.cls(image=image, gpu="T4", secrets=[modal.Secret.from_name("huggingface-secret")])
class DiarizationModel:
    """Modal class that reports the most recent speaker in a WAV chunk."""

    @modal.enter()
    def enter(self):
        """Load the pyannote diarization pipeline when the container starts.

        Raises:
            ValueError: If ``HF_TOKEN`` is not set in the environment.
            RuntimeError: If the pipeline could not be loaded.
        """
        from pyannote.audio import Pipeline
        import torch

        hf_token = os.environ.get("HF_TOKEN")
        if not hf_token:
            raise ValueError("HF_TOKEN not found in environment. Please set it in Modal secrets.")

        print("Loading Speaker Diarization Pipeline...")
        # Load the pyannote speaker-diarization-3.1 pipeline
        self.pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            token=hf_token,
        )
        # NOTE(review): some pyannote releases appear to return None instead of
        # raising when the model cannot be fetched — fail early with a clear
        # message rather than on first use.
        if self.pipeline is None:
            raise RuntimeError("Failed to load pyannote/speaker-diarization-3.1 pipeline.")

        # Move to GPU if available
        if torch.cuda.is_available():
            self.pipeline.to(torch.device("cuda"))
            print("✅ Diarization Pipeline loaded on GPU.")
        else:
            print("✅ Diarization Pipeline loaded on CPU.")

    @modal.method()
    def diarize_chunk(self, audio_bytes: bytes) -> str:
        """Return the label of the speaker whose segment ends last in the chunk.

        Args:
            audio_bytes: Contents of a WAV file (mono or multi-channel).

        Returns:
            The speaker label, ``"No speaker detected"`` when the result holds
            no segments, or ``"Error"`` when decoding or diarization fails
            (the error and traceback are printed, not raised).
        """
        import traceback

        print(f"👥 [DiarizationModel] Received {len(audio_bytes)} bytes of audio")
        try:
            waveform, rate = _diarization_wav_to_waveform(audio_bytes)

            # Dictionary format for passing in-memory audio to the pipeline.
            audio_dict = {
                "waveform": waveform,
                "sample_rate": rate,
            }

            print(f"👥 [DiarizationModel] Running diarization on in-memory audio...")
            result_obj = self.pipeline(audio_dict)
            print(f"👥 [DiarizationModel] Result type: {type(result_obj).__name__}")

            # The result may wrap the annotation or be the annotation itself;
            # probe the known shapes in order.
            annotation = None
            if hasattr(result_obj, "speaker_diarization"):
                print("👥 [DiarizationModel] Using speaker_diarization from DiarizeOutput")
                annotation = result_obj.speaker_diarization
            elif hasattr(result_obj, "annotation"):
                print("👥 [DiarizationModel] Using annotation field from result")
                annotation = result_obj.annotation
            elif hasattr(result_obj, "itertracks"):
                print("👥 [DiarizationModel] Using Annotation object directly")
                annotation = result_obj

            # Single pass: keep the label of the segment with the latest end.
            # ``>=`` makes a later track win ties, matching a stable sort by
            # end time followed by taking the last element.
            segment_count = 0
            latest_end = None
            speaker = None
            if annotation is not None and hasattr(annotation, "itertracks"):
                for segment, _track, label in annotation.itertracks(yield_label=True):
                    segment_count += 1
                    if latest_end is None or segment.end >= latest_end:
                        latest_end = segment.end
                        speaker = label

            if segment_count:
                print(f"✅ [DiarizationModel] Speaker result: {speaker} ({segment_count} segments)")
                return speaker

            print("⚠️ [DiarizationModel] No speaker detected")
            return "No speaker detected"
        except Exception as e:
            # Deliberate best-effort: callers receive "Error" rather than an exception.
            print(f"❌ [DiarizationModel] Diarization Error: {e}")
            traceback.print_exc()
            return "Error"