import os
import tempfile
import json
from datetime import datetime
from typing import Optional, Tuple, Dict, Any

import whisper
from gtts import gTTS
from pydub.utils import mediainfo

# === Load Whisper model once ===
try:
    model = whisper.load_model("base")  # Choose: "tiny", "base", "small", "medium", "large"
except Exception as e:
    raise RuntimeError(f"❌ Failed to load Whisper model: {e}")

# === Optional: Enable this to log all transcriptions ===
AUDIO_LOG_FILE = os.path.join(tempfile.gettempdir(), "audio_transcription_log.jsonl")
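# Each call to log_audio_interaction() appends one JSON object per line to this
# file. With the fields written by process_audio_to_speech() below, a record
# looks roughly like this (values are illustrative only):
# {"timestamp": "2024-01-01T12:00:00", "original_text": "...", "response_text": "...",
#  "detected_language": "en", "duration_sec": 3.2, "tts_path": "/tmp/tts_20240101_120000.mp3"}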
def transcribe_audio(file_path: str, language: Optional[str] = None) -> Tuple[str, Optional[str], float]:
    """
    Transcribes audio. Returns (text, detected_language, duration_sec).
    """
    try:
        info = mediainfo(file_path)
        duration = float(info.get("duration", 0))
        result = model.transcribe(file_path, language=language)
        text = result.get("text", "").strip()
        detected_lang = result.get("language", language)
        return text, detected_lang, duration
    except Exception as e:
        return f"Error during transcription: {e}", None, 0.0


def text_to_speech(text: str, lang: str = "en") -> str:
    """
    Converts text to MP3 using gTTS and returns the file path.
    """
    try:
        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        filename = f"tts_{timestamp}.mp3"
        output_path = os.path.join(tempfile.gettempdir(), filename)
        tts = gTTS(text=text, lang=lang)
        tts.save(output_path)
        return output_path
    except Exception as e:
        raise RuntimeError(f"Text-to-Speech conversion failed: {e}")


def log_audio_interaction(log: Dict[str, Any]):
    """
    Logs an audio interaction to a JSONL file in the temp dir.
    """
    try:
        with open(AUDIO_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(json.dumps(log) + "\n")
    except Exception:
        pass


def process_audio_to_speech(file_path: str, lang: Optional[str] = None) -> Dict[str, Any]:
    """
    Full pipeline: transcribe -> respond -> convert to TTS.
    Returns a dict with the text, detected language, tts_audio_path, and duration.
    """
    text, detected_lang, duration = transcribe_audio(file_path, language=lang)
    if text.startswith("Error"):
        return {
            "error": text,
            "language": detected_lang,
            "duration_sec": duration
        }

    # Customize AI response
    response_text = f"You said: {text}"
    tts_path = text_to_speech(response_text, lang=detected_lang or "en")

    log_audio_interaction({
        "timestamp": datetime.utcnow().isoformat(),
        "original_text": text,
        "response_text": response_text,
        "detected_language": detected_lang,
        "duration_sec": duration,
        "tts_path": tts_path
    })

    return {
        "transcription": text,
        "response": response_text,
        "language": detected_lang,
        "duration_sec": duration,
        "tts_audio_path": tts_path
    }


def cleanup_audio_files(*file_paths: str):
    """
    Deletes temp files if they exist.
    """
    for path in file_paths:
        try:
            if os.path.exists(path):
                os.remove(path)
        except Exception:
            pass
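

# --- Illustrative usage sketch ---
# A minimal sketch of how the pipeline could be driven from the command line,
# assuming an audio file path is passed as the first argument. The default path
# below is a hypothetical placeholder; any format ffmpeg/Whisper can decode
# (wav, mp3, m4a, ...) should work.
if __name__ == "__main__":
    import sys

    audio_path = sys.argv[1] if len(sys.argv) > 1 else "path/to/audio.wav"  # hypothetical placeholder

    result = process_audio_to_speech(audio_path)
    print(json.dumps(result, ensure_ascii=False, indent=2))

    # Remove the generated TTS file once it is no longer needed.
    if "tts_audio_path" in result:
        cleanup_audio_files(result["tts_audio_path"])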