Update app.py
app.py CHANGED
@@ -7,18 +7,13 @@ import os
 import datetime
 
 # --- Configuration ---
-# Make sure you have added your Hugging Face token to your Space's secrets (named HF_TOKEN)
 HF_TOKEN = os.environ.get("HF_TOKEN")
-# MODEL_NAME = "openai/whisper-large-v3"
 MODEL_NAME = "openai/whisper-medium"
 
-# Determine whether a GPU is available and set the device accordingly
 device = 0 if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
 # --- Model initialization ---
-
-# Transcription pipeline (Whisper)
 pipe = pipeline(
     "automatic-speech-recognition",
     model=MODEL_NAME,
@@ -26,8 +21,6 @@ pipe = pipeline(
     device=device,
 )
 
-# Diarization pipeline (Pyannote)
-# Make sure the token is available before loading the model
 if HF_TOKEN:
     pyannote_pipeline = Pipeline.from_pretrained(
         "pyannote/speaker-diarization-3.1",
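The hunk above truncates the `from_pretrained(...)` call. For context, a minimal sketch of how this gated checkpoint is typically loaded, plus an optional GPU move that is not part of this commit (assuming pyannote.audio 3.x, whose `Pipeline.from_pretrained` accepts `use_auth_token` and whose pipelines expose `.to(torch.device(...))`):

# Sketch only, not part of the commit. Assumes pyannote.audio 3.x.
import os
import torch
from pyannote.audio import Pipeline

HF_TOKEN = os.environ.get("HF_TOKEN")
pyannote_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HF_TOKEN,  # gated model: the license must be accepted on the Hub
)
if torch.cuda.is_available():
    # Optional: diarization is much faster on GPU; to() takes a torch.device.
    pyannote_pipeline.to(torch.device("cuda"))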
@@ -38,14 +31,12 @@ else:
     pyannote_pipeline = None
     print("Avertissement : Le token Hugging Face n'est pas défini. La diarisation sera désactivée.")
 
-
 # --- Audio processing functions ---
 
 def convert_to_wav(audio_path):
     """Converts any audio file to mono WAV format."""
     try:
         audio = AudioSegment.from_file(audio_path)
-        # Convert to mono for compatibility with the models
         audio = audio.set_channels(1)
         wav_path = os.path.splitext(audio_path)[0] + ".wav"
         audio.export(wav_path, format="wav")
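`convert_to_wav` only forces mono. Whisper checkpoints expect 16 kHz input; the transformers pipeline resamples on the fly, so this is not a bug, but fixing the rate at conversion time costs one extra call. A sketch of a hypothetical `convert_to_wav_16k` variant (not in this commit):

# Sketch only, not part of the commit: mono + 16 kHz in one pass with pydub.
import os
from pydub import AudioSegment

def convert_to_wav_16k(audio_path):
    audio = AudioSegment.from_file(audio_path)
    audio = audio.set_channels(1).set_frame_rate(16000)  # pydub resamples here
    wav_path = os.path.splitext(audio_path)[0] + ".wav"
    audio.export(wav_path, format="wav")
    return wav_path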
@@ -54,80 +45,78 @@ def convert_to_wav(audio_path):
         print(f"Erreur lors de la conversion en WAV : {e}")
         return None
 
+def detect_language_on_upload(filepath):
+    if filepath is None:
+        return "auto"
+    wav_filepath = convert_to_wav(filepath)
+    if not wav_filepath:
+        return "auto"
+    outputs = pipe(wav_filepath, chunk_length_s=30, batch_size=24, return_timestamps=False)
+    detected_lang = outputs.get("language", "auto")
+    return detected_lang
+
+def save_txt(content, filename):
+    if not content or content.strip() == "":
+        return None
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(content)
+    return filename
+
 def transcribe_audio(filepath, diarize, language_choice):
-    """Main function: transcribes and optionally diarizes the audio."""
     if filepath is None:
-        return "Aucun fichier audio fourni.", "Veuillez télécharger un fichier audio.", ""
+        return "Aucun fichier audio fourni.", "Veuillez télécharger un fichier audio.", "", None, None
 
-    # Convert the file to WAV, the format the models require
     wav_filepath = convert_to_wav(filepath)
     if not wav_filepath:
-        return "Erreur : Le fichier audio n'a pas pu être converti.", "Conversion échouée.", ""
+        return "Erreur : Le fichier audio n'a pas pu être converti.", "Conversion échouée.", "", None, None
 
-    # Prepare the Whisper parameters
     whisper_params = {
         "chunk_length_s": 30,
         "batch_size": 24,
         "return_timestamps": True
     }
-
-    # Add the language if one was specified
     if language_choice != "auto":
         whisper_params["generate_kwargs"] = {"language": language_choice}
 
-    # Transcription with Whisper
     outputs = pipe(wav_filepath, **whisper_params)
     transcription = outputs["text"].strip()
-
-
-    detected_language = "Non disponible"
-    if "chunks" in outputs and len(outputs["chunks"]) > 0:
-        # Try to read the language from the first chunk
-        first_chunk = outputs["chunks"][0]
-        if "language" in first_chunk:
-            detected_language = first_chunk["language"]
-
-    # Language information message
+
+    detected_language = outputs.get("language", "Non disponible")
     language_info = f"Langue détectée: {detected_language}"
     if language_choice != "auto":
         language_info += f" (Langue forcée: {language_choice})"
 
-
+    diarized_transcription = ""
     if diarize and pyannote_pipeline:
         try:
             diarization = pyannote_pipeline(wav_filepath)
-            diarized_transcription = ""
-            # Iterate over the speech segments identified by the diarization
             for turn, _, speaker in diarization.itertracks(yield_label=True):
                 segment_start = turn.start
                 segment_end = turn.end
-
-                # Attach the transcribed text to the current speech segment
                 segment_text = ""
                 for chunk in outputs["chunks"]:
                     chunk_start = chunk['timestamp'][0]
                     chunk_end = chunk['timestamp'][1]
                     if chunk_start is not None and chunk_end is not None:
-                        # Check whether the text chunk overlaps the speech segment
                         if max(segment_start, chunk_start) < min(segment_end, chunk_end):
                             segment_text += chunk['text']
-
-                # Format the output
                 start_time = str(datetime.timedelta(seconds=int(segment_start)))
                 diarized_transcription += f"[{start_time}] {speaker}:{segment_text.strip()}\n"
-
-            return transcription, diarized_transcription, language_info
         except Exception as e:
-
+            diarized_transcription = f"Erreur pendant la diarisation : {e}"
     elif diarize:
-
+        diarized_transcription = "Diarisation activée mais le modèle n'a pas pu être chargé (token manquant ?)."
     else:
-
+        diarized_transcription = "Diarisation non activée."
+
+    transcription_file = save_txt(transcription, "transcription.txt")
+    diarization_file = save_txt(diarized_transcription, "transcription_diarized.txt")
+    return transcription, diarized_transcription, language_info, transcription_file, diarization_file
 
 # --- Gradio interface ---
 
 with gr.Blocks() as demo:
-    gr.
+    gr.HTML("<div style='text-align:center;'><h1>Application de Transcription et Diarisation Audio</h1></div>")
     # gr.Markdown("## Objectif")
     gr.Markdown("Transcrivez et diarisez automatiquement vos fichiers audio (WhatsApp, réunions, interviews, etc.) grâce à Whisper et pyannote, directement dans ce Space.")
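The speaker attribution above rests on a single line: two intervals overlap exactly when the later start precedes the earlier end, i.e. `max(a, c) < min(b, d)`. A tiny self-contained check of that test (values are illustrative only, not from the app):

def overlaps(seg_start, seg_end, chunk_start, chunk_end):
    # Two intervals overlap iff the later start comes before the earlier end.
    return max(seg_start, chunk_start) < min(seg_end, chunk_end)

# A speaker turn at 3.0-7.5 s and a Whisper chunk at 7.0-9.2 s share 0.5 s.
assert overlaps(3.0, 7.5, 7.0, 9.2)
# A chunk starting exactly where the turn ends does not count (strict <).
assert not overlaps(3.0, 7.5, 7.5, 9.0)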
@@ -155,39 +144,30 @@ with gr.Blocks() as demo:
     reset_btn = gr.Button("Reset", variant="secondary")
     with gr.Column():
         language_info_output = gr.Textbox(label="Information sur la langue", lines=1)
+        transcription_file = gr.File(label="Télécharger la transcription (.txt)")
         transcription_output = gr.Textbox(label="Transcription Complète", lines=10)
+        diarization_file = gr.File(label="Télécharger la transcription diarizée (.txt)")
         diarization_output = gr.Textbox(label="Transcription avec Diarisation (par locuteur)", lines=15)
-
-
+
+    audio_input.change(
+        fn=detect_language_on_upload,
+        inputs=audio_input,
+        outputs=language_dropdown
+    )
 
     submit_btn.click(
         fn=transcribe_audio,
         inputs=[audio_input, diarize_checkbox, language_dropdown],
-        outputs=[transcription_output, diarization_output, language_info_output]
+        outputs=[transcription_output, diarization_output, language_info_output, transcription_file, diarization_file]
     )
 
     def reset_fields():
-        return "", "", "", None, "auto", True
+        return "", "", "", None, None, "auto", True
 
     reset_btn.click(
         fn=reset_fields,
         inputs=[],
-        outputs=[transcription_output, diarization_output, language_info_output,
-    )
-
-    def save_transcription_to_txt(transcription):
-        if not transcription:
-            return None
-        filename = "transcription.txt"
-        with open(filename, "w", encoding="utf-8") as f:
-            f.write(transcription)
-        return filename
-
-    download_btn.click(
-        fn=save_transcription_to_txt,
-        inputs=transcription_output,
-        outputs=download_file
+        outputs=[transcription_output, diarization_output, language_info_output, transcription_file, diarization_file, language_dropdown, diarize_checkbox]
     )
 
-# --- Launch the application ---
 demo.launch(share=True)
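One caveat for reviewers: the transformers ASR pipeline's output dict carries `text` (and `chunks` when timestamps are requested) but, as far as its documented output goes, no top-level `language` key, so `outputs.get("language", ...)` in both new functions would usually hit the fallback. If the detected language is wanted, a sketch of one alternative, assuming a transformers release whose ASR pipeline supports the `return_language` flag (which reports the language per chunk and requires timestamps), and reusing the file's `pipe` and `wav_filepath`:

# Sketch only, not part of the commit.
outputs = pipe(
    wav_filepath,
    chunk_length_s=30,
    return_timestamps=True,
    return_language=True,  # assumption: available in recent transformers releases
)
chunks = outputs.get("chunks", [])
detected_language = chunks[0].get("language", "Non disponible") if chunks else "Non disponible"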