Spaces · Build error

Commit d68b1ee · 1 parent: a0314cc
Add ASR Text

app.py CHANGED
@@ -1,23 +1,96 @@
-import nemo
 from nemo.collections.asr.models.msdd_models import NeuralDiarizer
+from nemo.collections.asr.models import EncDecRNNTBPEModel
+from nemo.collections.asr.models import EncDecSpeakerLabelModel
 import gradio as gr
 import pandas as pd
 import torch
+import json
+from omegaconf import OmegaConf
+import uuid
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)
+speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large").to(device)
+model.eval()
 
 def run_diarization(path1):
+    print(path1)
     annotation = model(path1, num_workers=0, batch_size=16)
     rttm = annotation.to_rttm()
-    df = pd.DataFrame(columns=['start_time', 'end_time', 'speaker'])
-
+    df = pd.DataFrame(columns=['start_time', 'end_time', 'speaker', 'text'])
+    lines = rttm.splitlines()
+    if len(lines) == 0:
+        df.loc[0] = 0, 0, 'No speaker found', ''  # four values to match the four columns
+        return df
+    start_time, duration, prev_speaker = float(lines[0].split()[3]), float(lines[0].split()[4]), lines[0].split()[7]
+    end_time = float(start_time) + float(duration)
+    df.loc[0] = start_time, end_time, prev_speaker, ''
+
+    for line in lines[1:]:
         split = line.split()
-        start_time, duration,
+        start_time, duration, cur_speaker = float(split[3]), float(split[4]), split[7]
         end_time = float(start_time) + float(duration)
-
-
+        if cur_speaker == prev_speaker:
+            df.loc[df.index[-1], 'end_time'] = end_time
+        else:
+            df.loc[len(df)] = start_time, end_time, cur_speaker, ''
+        prev_speaker = cur_speaker
+
+    hyp = get_transcripts(df, path1)
+
+    assert len(hyp) == len(df)
+
+    for i in range(len(df)):
+        df.loc[i, 'text'] = hyp[i]
+
+    return df
+
+def create_manifest(df, audio_path):
+
+    filename = '/tmp/' + str(uuid.uuid4()) + '.json'
+    with open(filename, 'w') as f:
+        for i in range(len(df)):
+            start_time = df.iloc[i]['start_time']
+            end_time = df.iloc[i]['end_time']
+            speaker = df.iloc[i]['speaker']
+            dic = {"audio_filepath": audio_path, "duration": end_time - start_time, "label": speaker, "offset": start_time}
+            json.dump(dic, f)
+            f.write('\n')
+
+    return filename
+
+def get_transcripts(df, audio_path):
+
+    filename = create_manifest(df, audio_path)
+    model = EncDecRNNTBPEModel.from_pretrained(model_name="nvidia/stt_en_fastconformer_transducer_large").to(device)
+    model.eval()
+    config = OmegaConf.create({"manifest_filepath": filename, 'batch_size': 4})
+    dataloader = model._setup_transcribe_dataloader(config)
+
+    hypotheses = []
+    all_hypotheses = []
+
+    for test_batch in dataloader:
+        encoded, encoded_len = model.forward(
+            input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
+        )
+        best_hyp, all_hyp = model.decoding.rnnt_decoder_predictions_tensor(
+            encoded,
+            encoded_len,
+            return_hypotheses=False,
+            partial_hypotheses=None,)
+
+        hypotheses += best_hyp
+        if all_hyp is not None:
+            all_hypotheses += all_hyp
+        else:
+            all_hypotheses += best_hyp
+
+        del encoded
+        del test_batch
+
+    return hypotheses
 
 article = (
     "<p style='text-align: center'>"
@@ -38,7 +111,7 @@ microphone_interface = gr.Interface(
     title="Offline Speaker Diarization with NeMo",
     description="This demonstration will perform offline speaker diarization on an audio file using nemo",
     article=article,
-    layout="
+    layout="vertical",
     theme="huggingface",
     allow_flagging=False,
     live=False,
@@ -52,7 +125,7 @@ upload_interface = gr.Interface(
     title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
     article=article,
-    layout="
+    layout="vertical",
     theme="huggingface",
     allow_flagging=False,
     live=False,
@@ -61,4 +134,4 @@ upload_interface = gr.Interface(
 
 demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])
 
-demo.launch(enable_queue=True)
+demo.launch(enable_queue=True)
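
The segment parsing added in run_diarization leans on the fixed column layout of RTTM: field 4 is the turn onset, field 5 the turn duration, and field 8 the speaker label (0-indexed: split[3], split[4], split[7]). A minimal sketch of that mapping, using a made-up RTTM line rather than real diarizer output:

# Illustrative RTTM line; the file id, timings, and speaker tag are invented.
line = "SPEAKER sample_audio 1 0.42 1.92 <NA> <NA> speaker_0 <NA> <NA>"

split = line.split()
start_time = float(split[3])  # turn onset, seconds
duration = float(split[4])    # turn duration, seconds
speaker = split[7]            # speaker label assigned by the diarizer

end_time = start_time + duration
print(start_time, end_time, speaker)  # 0.42 2.34 speaker_0

Consecutive RTTM lines that carry the same speaker label are merged into one row by extending end_time, which is why the DataFrame ends up with one row per speaker turn rather than one per RTTM segment.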
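create_manifest follows NeMo's JSON-lines manifest convention: one object per segment, where offset and duration point the transcription dataloader at a slice of the original recording instead of a pre-cut clip. A sketch of the entry one diarized turn would produce; the path and values below are illustrative, not output from the app:

import json

# Hypothetical manifest entry for a single diarized turn.
entry = {
    "audio_filepath": "/tmp/sample_audio.wav",  # the full recording
    "duration": 1.92,                           # end_time - start_time
    "label": "speaker_0",                       # speaker tag from the RTTM
    "offset": 0.42,                             # segment start within the file
}

with open("/tmp/manifest.json", "w") as f:
    json.dump(entry, f)
    f.write("\n")  # one JSON object per line, as NeMo expects

Because each entry reuses the same audio_filepath, no intermediate clips are written to disk; the offsets alone carve the file into per-speaker segments for the RNNT model.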