eng-to-hau

Sleeping

App Files Files Community

eng-to-hau / app.py

Baghdad99

Update app.py

4804944 almost 2 years ago

raw

history blame contribute delete

3.11 kB

	import torch
	import librosa
	import gradio as gr
	from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoProcessor, SeamlessM4Tv2Model, pipeline, AutoTokenizer
	import numpy as np
	import soundfile as sf
	import tempfile


	# Checkpoint identifiers, named once so each model and its processor
	# are always loaded from the same repository.
	_ASR_CHECKPOINT = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
	_TRANSLATOR_CHECKPOINT = "facebook/seamless-m4t-v2-large"

	# Speech recognition: Wav2Vec2 CTC model and its processor
	asr_model = Wav2Vec2ForCTC.from_pretrained(_ASR_CHECKPOINT)
	asr_processor = Wav2Vec2Processor.from_pretrained(_ASR_CHECKPOINT)

	# Translation: SeamlessM4T v2 model and its processor
	translator_model = SeamlessM4Tv2Model.from_pretrained(_TRANSLATOR_CHECKPOINT)
	translator_processor = AutoProcessor.from_pretrained(_TRANSLATOR_CHECKPOINT)

	# Speech synthesis: text-to-speech pipeline used for the translated text
	tts = pipeline("text-to-speech", model="Baghdad99/hausa_voice_tts")

	def translate_speech(audio_file_path):
	    """Translate spoken English audio into synthesised Hausa speech.

	    The recording is loaded and resampled to 16 kHz, transcribed with the
	    Wav2Vec2 CTC model, translated to Hausa text with SeamlessM4T v2, and
	    finally synthesised with the text-to-speech pipeline.

	    Args:
	        audio_file_path: Path of the audio file to translate. May be
	            ``None`` or empty when no audio was submitted.

	    Returns:
	        A ``(sample_rate, samples)`` tuple, where ``samples`` is a 1-D
	        ``int16`` NumPy array, or ``None`` when there is no input audio,
	        nothing was transcribed, or the synthesiser output has no
	        ``'audio'`` entry.
	    """
	    # Guard first: librosa.load would raise on a missing path.
	    if not audio_file_path:
	        print("No audio file was provided")
	        return None

	    transcription = _transcribe_english(audio_file_path)
	    print(f"Transcription: {transcription}")
	    if not transcription.strip():
	        # Nothing was recognised, so there is nothing to translate or speak.
	        print("The transcription is empty")
	        return None

	    translated_text_str = _translate_to_hausa(transcription)
	    print(f"Translated text string: {translated_text_str}")

	    synthesised_speech = tts(translated_text_str)
	    if 'audio' not in synthesised_speech:
	        print("The synthesised speech does not contain 'audio'")
	        return None

	    # Prefer the rate reported by the synthesiser; 16000 is kept only as the
	    # previous hard-coded fallback when the pipeline does not report one.
	    sample_rate = synthesised_speech.get('sampling_rate', 16000)
	    return sample_rate, _to_int16_pcm(synthesised_speech['audio'])


	def _transcribe_english(audio_file_path):
	    """Return the transcript of the audio file at *audio_file_path*."""
	    # librosa resamples on load, so the samples match the 16 kHz rate that
	    # is declared to the ASR processor below.
	    audio_data, _ = librosa.load(audio_file_path, sr=16000)
	    input_dict = asr_processor(
	        audio_data, sampling_rate=16000, return_tensors="pt", padding=True
	    )

	    # Inference only: skip building the autograd graph.
	    with torch.no_grad():
	        logits = asr_model(input_dict.input_values.to("cpu")).logits

	    # Greedy CTC decoding of the first (only) item in the batch.
	    pred_ids = torch.argmax(logits, dim=-1)[0]
	    return asr_processor.decode(pred_ids)


	def _translate_to_hausa(text):
	    """Translate English *text* into Hausa text with SeamlessM4T v2."""
	    text_inputs = translator_processor(
	        text=text, src_lang="eng", return_tensors="pt"
	    )

	    # generate() produces speech waveforms by default; generate_speech=False
	    # makes it return text token ids, which is what decode() needs.
	    # NOTE(review): confirm "hau" is a text target language supported by
	    # this checkpoint.
	    output_tokens = translator_model.generate(
	        **text_inputs, tgt_lang="hau", generate_speech=False
	    )

	    # output_tokens[0] holds the generated sequences; take the first one.
	    # skip_special_tokens drops markers such as <pad> and </s>.
	    token_ids = output_tokens[0].tolist()[0]
	    return translator_processor.decode(
	        token_ids, skip_special_tokens=True
	    ).strip()


	def _to_int16_pcm(waveform):
	    """Convert a float waveform into a flat ``int16`` sample array."""
	    samples = np.asarray(waveform).flatten()
	    # Clip before scaling so out-of-range samples saturate instead of
	    # wrapping around when cast to int16.
	    return (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

	# Define the Gradio interface.
	# gr.Audio is used for both ends: the gr.inputs / gr.outputs namespaces
	# were deprecated in Gradio 3 and removed in Gradio 4.
	iface = gr.Interface(
	    fn=translate_speech,
	    # The handler receives the recording as a path on disk.
	    inputs=gr.Audio(type="filepath"),
	    # The handler returns a (sample_rate, samples) tuple.
	    outputs=gr.Audio(type="numpy"),
	    title="English to Hausa Translation",
	    description="Realtime demo for English to Hausa translation using speech recognition and text-to-speech synthesis.",
	)

	iface.launch()