Spaces · Build error

Commit d68b1ee · 1 parent: a0314cc
Add ASR Text

app.py CHANGED
@@ -1,23 +1,96 @@
-import nemo
 from nemo.collections.asr.models.msdd_models import NeuralDiarizer
+from nemo.collections.asr.models import EncDecRNNTBPEModel
+from nemo.collections.asr.models import EncDecSpeakerLabelModel
 import gradio as gr
 import pandas as pd
 import torch
+import json
+from omegaconf import OmegaConf
+import uuid
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)
+speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large").to(device)
+model.eval()
 
 def run_diarization(path1):
+    print(path1)
     annotation = model(path1, num_workers=0, batch_size=16)
     rttm = annotation.to_rttm()
-    df = pd.DataFrame(columns=['start_time', 'end_time', 'speaker'])
-
+    df = pd.DataFrame(columns=['start_time', 'end_time', 'speaker', 'text'])
+    lines = rttm.splitlines()
+    if len(lines) == 0:
+        df.loc[0] = 0, 0, 'No speaker found', ''  # four values to match the four columns
+        return df
+    start_time, duration, prev_speaker = float(lines[0].split()[3]), float(lines[0].split()[4]), lines[0].split()[7]
+    end_time = float(start_time) + float(duration)
+    df.loc[0] = start_time, end_time, prev_speaker, ''
+
+    for line in lines[1:]:
         split = line.split()
-        start_time, duration,
+        start_time, duration, cur_speaker = float(split[3]), float(split[4]), split[7]
         end_time = float(start_time) + float(duration)
-
-
+        if cur_speaker == prev_speaker:
+            df.loc[df.index[-1], 'end_time'] = end_time
+        else:
+            df.loc[len(df)] = start_time, end_time, cur_speaker, ''
+        prev_speaker = cur_speaker
+
+    hyp = get_transcripts(df, path1)
+
+    assert len(hyp) == len(df)
+
+    for i in range(len(df)):
+        df.loc[i, 'text'] = hyp[i]
+
+    return df
+
+def create_manifest(df, audio_path):
+
+    filename = '/tmp/' + str(uuid.uuid4()) + '.json'
+    with open(filename, 'w') as f:
+        for i in range(len(df)):
+            start_time = df.iloc[i]['start_time']
+            end_time = df.iloc[i]['end_time']
+            speaker = df.iloc[i]['speaker']
+            dic = {"audio_filepath": audio_path, "duration": end_time - start_time, "label": speaker, "offset": start_time}
+            json.dump(dic, f)
+            f.write('\n')
+
+    return filename
+
+def get_transcripts(df, audio_path):
+
+    filename = create_manifest(df, audio_path)
+    model = EncDecRNNTBPEModel.from_pretrained(model_name="nvidia/stt_en_fastconformer_transducer_large").to(device)
+    model.eval()
+    config = OmegaConf.create({"manifest_filepath": filename, 'batch_size': 4})
+    dataloader = model._setup_transcribe_dataloader(config)
+
+    hypotheses = []
+    all_hypotheses = []
+
+    for test_batch in dataloader:
+        encoded, encoded_len = model.forward(
+            input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
+        )
+        best_hyp, all_hyp = model.decoding.rnnt_decoder_predictions_tensor(
+            encoded,
+            encoded_len,
+            return_hypotheses=False,
+            partial_hypotheses=None,)
+
+        hypotheses += best_hyp
+        if all_hyp is not None:
+            all_hypotheses += all_hyp
+        else:
+            all_hypotheses += best_hyp
+
+        del encoded
+        del test_batch
+
+    return hypotheses
 
 article = (
     "<p style='text-align: center'>"
@@ -38,7 +111,7 @@ microphone_interface = gr.Interface(
     title="Offline Speaker Diarization with NeMo",
     description="This demonstration will perform offline speaker diarization on an audio file using nemo",
     article=article,
-    layout="
+    layout="vertical",
     theme="huggingface",
     allow_flagging=False,
     live=False,
@@ -52,7 +125,7 @@ upload_interface = gr.Interface(
     title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
     article=article,
-    layout="
+    layout="vertical",
     theme="huggingface",
     allow_flagging=False,
     live=False,
@@ -61,4 +134,4 @@ upload_interface = gr.Interface(
 
 demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])
 
-demo.launch(enable_queue=True)
+demo.launch(enable_queue=True)
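
The segment parsing added in run_diarization leans on the fixed column layout of RTTM: field 4 is the turn onset, field 5 the turn duration, and field 8 the speaker label (0-indexed: split[3], split[4], split[7]). A minimal sketch of that mapping, using a made-up RTTM line rather than real diarizer output:

# Illustrative RTTM line; the file id, timings, and speaker tag are invented.
line = "SPEAKER sample_audio 1 0.42 1.92 <NA> <NA> speaker_0 <NA> <NA>"

split = line.split()
start_time = float(split[3])  # turn onset, seconds
duration = float(split[4])    # turn duration, seconds
speaker = split[7]            # speaker label assigned by the diarizer

end_time = start_time + duration
print(start_time, end_time, speaker)  # 0.42 2.34 speaker_0

Consecutive RTTM lines that carry the same speaker label are merged into one row by extending end_time, which is why the DataFrame ends up with one row per speaker turn rather than one per RTTM segment.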
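create_manifest follows NeMo's JSON-lines manifest convention: one object per segment, where offset and duration point the transcription dataloader at a slice of the original recording instead of a pre-cut clip. A sketch of the entry one diarized turn would produce; the path and values below are illustrative, not output from the app:

import json

# Hypothetical manifest entry for a single diarized turn.
entry = {
    "audio_filepath": "/tmp/sample_audio.wav",  # the full recording
    "duration": 1.92,                           # end_time - start_time
    "label": "speaker_0",                       # speaker tag from the RTTM
    "offset": 0.42,                             # segment start within the file
}

with open("/tmp/manifest.json", "w") as f:
    json.dump(entry, f)
    f.write("\n")  # one JSON object per line, as NeMo expects

Because each entry reuses the same audio_filepath, no intermediate clips are written to disk; the offsets alone carve the file into per-speaker segments for the RNNT model.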