import os
import tempfile
import json
from datetime import datetime
from typing import Optional, Tuple, Dict, Any

import whisper
from gtts import gTTS
from pydub.utils import mediainfo

# === Load Whisper model once ===
try:
    model = whisper.load_model("base")  # Choose: "tiny", "base", "small", "medium", "large"
except Exception as e:
    raise RuntimeError(f"❌ Failed to load Whisper model: {e}")

# === Optional: Enable this to log all transcriptions ===
AUDIO_LOG_FILE = os.path.join(tempfile.gettempdir(), "audio_transcription_log.jsonl")
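# Each call to log_audio_interaction() appends one JSON object per line to this
# file. With the fields written by process_audio_to_speech() below, a record
# looks roughly like this (values are illustrative only):
# {"timestamp": "2024-01-01T12:00:00", "original_text": "...", "response_text": "...",
#  "detected_language": "en", "duration_sec": 3.2, "tts_path": "/tmp/tts_20240101_120000.mp3"}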
def transcribe_audio(file_path: str, language: Optional[str] = None) -> Tuple[str, Optional[str], float]:
    """
    Transcribes audio. Returns (text, detected_language, duration_sec).
    """
    try:
        info = mediainfo(file_path)
        duration = float(info.get("duration", 0))
        result = model.transcribe(file_path, language=language)
        text = result.get("text", "").strip()
        detected_lang = result.get("language", language)
        return text, detected_lang, duration
    except Exception as e:
        return f"Error during transcription: {e}", None, 0.0


def text_to_speech(text: str, lang: str = "en") -> str:
    """
    Converts text to MP3 using gTTS and returns the file path.
    """
    try:
        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        filename = f"tts_{timestamp}.mp3"
        output_path = os.path.join(tempfile.gettempdir(), filename)
        tts = gTTS(text=text, lang=lang)
        tts.save(output_path)
        return output_path
    except Exception as e:
        raise RuntimeError(f"Text-to-Speech conversion failed: {e}")


def log_audio_interaction(log: Dict[str, Any]):
    """
    Logs an audio interaction to a JSONL file in the temp dir.
    """
    try:
        with open(AUDIO_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(json.dumps(log) + "\n")
    except Exception:
        pass


def process_audio_to_speech(file_path: str, lang: Optional[str] = None) -> Dict[str, Any]:
    """
    Full pipeline: transcribe -> respond -> convert to TTS.
    Returns a dict with the text, detected language, tts_audio_path, and duration.
    """
    text, detected_lang, duration = transcribe_audio(file_path, language=lang)
    if text.startswith("Error"):
        return {
            "error": text,
            "language": detected_lang,
            "duration_sec": duration
        }

    # Customize AI response
    response_text = f"You said: {text}"
    tts_path = text_to_speech(response_text, lang=detected_lang or "en")

    log_audio_interaction({
        "timestamp": datetime.utcnow().isoformat(),
        "original_text": text,
        "response_text": response_text,
        "detected_language": detected_lang,
        "duration_sec": duration,
        "tts_path": tts_path
    })

    return {
        "transcription": text,
        "response": response_text,
        "language": detected_lang,
        "duration_sec": duration,
        "tts_audio_path": tts_path
    }


def cleanup_audio_files(*file_paths: str):
    """
    Deletes temp files if they exist.
    """
    for path in file_paths:
        try:
            if os.path.exists(path):
                os.remove(path)
        except Exception:
            pass
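

# --- Illustrative usage sketch ---
# A minimal sketch of how the pipeline could be driven from the command line,
# assuming an audio file path is passed as the first argument. The default path
# below is a hypothetical placeholder; any format ffmpeg/Whisper can decode
# (wav, mp3, m4a, ...) should work.
if __name__ == "__main__":
    import sys

    audio_path = sys.argv[1] if len(sys.argv) > 1 else "path/to/audio.wav"  # hypothetical placeholder

    result = process_audio_to_speech(audio_path)
    print(json.dumps(result, ensure_ascii=False, indent=2))

    # Remove the generated TTS file once it is no longer needed.
    if "tts_audio_path" in result:
        cleanup_audio_files(result["tts_audio_path"])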