Update app.py
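This commit rewrites translate_speech to accept a file path instead of a raw microphone tuple: the audio is now loaded (and resampled to 16 kHz) in one step with librosa.load, the manual librosa.resample call and the temporary-WAV round-trip are dropped, the resampled array is passed straight to asr_processor, and the Gradio input component is switched to type="file" to match the new signature.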
app.py CHANGED
@@ -12,18 +12,12 @@ asr_processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large
 translator = pipeline("text2text-generation", model="dammyogt/damilola-finetuned-NLP-opus-mt-en-ha")
 tts = pipeline("text-to-speech", model="Baghdad99/hausa_voice_tts")
 
-def translate_speech(audio_data_tuple):
-    #
-
-
-    # Resample the audio data to 16000 Hz
-    audio_data_resampled = librosa.resample(audio_data, sample_rate, 16000)
-
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
-        sf.write(temp_audio_file.name, audio_data_resampled, 16000)
+def translate_speech(audio_file_path):
+    # Load the audio file as a floating point time series
+    audio_data, sample_rate = librosa.load(audio_file_path, sr=16000)
 
     # Prepare the input dictionary
-    input_dict = asr_processor(
+    input_dict = asr_processor(audio_data, sampling_rate=16000, return_tensors="pt", padding=True)  # Pass the resampled audio_data here
 
     # Use the ASR model to get the logits
     logits = asr_model(input_dict.input_values.to("cpu")).logits
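The dropped block resampled by hand and round-tripped the result through a temporary WAV file; the replacement leans on librosa.load, which decodes and resamples in one step when sr is given. (As a side note, the old positional librosa.resample(audio_data, sample_rate, 16000) call only works on older librosa; since 0.10 the rates must be passed as orig_sr=/target_sr= keywords.) A quick illustration, with a stand-in path:

import librosa

# librosa.load decodes the file and resamples it to the requested rate,
# so no separate librosa.resample call or temp file is needed.
audio_data, sample_rate = librosa.load("speech.wav", sr=16000)  # "speech.wav" is a stand-in
print(sample_rate)  # 16000, regardless of the file's native rate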
@@ -66,11 +60,10 @@ def translate_speech(audio_data_tuple):
 
     return 16000, synthesised_speech
 
-
 # Define the Gradio interface
 iface = gr.Interface(
     fn=translate_speech,
-    inputs=gr.inputs.Audio(
+    inputs=gr.inputs.Audio(type="file"),  # Change this line
     outputs=gr.outputs.Audio(type="numpy"),
     title="English to Hausa Translation",
     description="Realtime demo for English to Hausa translation using speech recognition and text-to-speech synthesis."
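The hunks only show the edges of translate_speech (they jump from line 23 to line 60 of the new file), so the middle of the function is not visible here. For orientation, here is a minimal sketch of how the updated function plausibly fits together, assuming asr_processor, asr_model, translator, and tts are the globals created at the top of app.py; the CTC decoding, translation, and TTS steps are assumptions based on the usual transformers APIs, not code taken from the file.

import librosa
import torch

def translate_speech(audio_file_path):
    # Load the audio file as a floating point time series, resampled to 16 kHz
    audio_data, sample_rate = librosa.load(audio_file_path, sr=16000)

    # Prepare the input dictionary for the Wav2Vec2 processor
    input_dict = asr_processor(audio_data, sampling_rate=16000,
                               return_tensors="pt", padding=True)

    # Use the ASR model to get the logits
    logits = asr_model(input_dict.input_values.to("cpu")).logits

    # -- Everything below is an assumed reconstruction of the unshown middle --
    # Greedy CTC decoding: take the most likely token per frame, collapse to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.batch_decode(predicted_ids)[0]

    # Translate the English transcription to Hausa
    translated_text = translator(transcription)[0]["generated_text"]

    # Synthesise Hausa speech; a transformers text-to-speech pipeline
    # returns a dict with "audio" and "sampling_rate" keys
    synthesised_speech = tts(translated_text)["audio"]

    # The file returns 16000 as the output rate (see the diff), which
    # assumes the TTS model emits 16 kHz audio
    return 16000, synthesised_speech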