Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| import soundfile as sf | |
| import spaces | |
| import os | |
| import numpy as np | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
| from speechbrain.pretrained import EncoderClassifier | |
| from datasets import load_dataset | |
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def load_models_and_data():
    """Load the TTS stack, the speaker encoder and one reference sample.

    Returns:
        A 5-tuple ``(model, processor, vocoder, speaker_model, example)``:
        the fine-tuned SpeechT5 model and the HiFi-GAN vocoder (both moved
        to ``device``), the SpeechT5 processor, the SpeechBrain speaker
        encoder, and row 304 of the dataset's ``train`` split.
    """
    # The processor comes from the base checkpoint, while the weights come
    # from the fine-tuned one.
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "emirhanbilgic/speecht5_finetuned_emirhan_tr"
    )
    tts_model = tts_model.to(device)

    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    hifigan = hifigan.to(device)

    # The speaker encoder keeps its downloaded files under /tmp/<repo id>.
    encoder_repo = "speechbrain/spkrec-xvect-voxceleb"
    encoder_cache = os.path.join("/tmp", encoder_repo)
    encoder = EncoderClassifier.from_hparams(
        source=encoder_repo,
        run_opts={"device": device},
        savedir=encoder_cache,
    )

    # One fixed row serves as the default voice.
    # NOTE(review): the sample is used as stored, with no resampling here --
    # confirm its sampling rate is what the speaker encoder expects.
    train_split = load_dataset("erenfazlioglu/turkishvoicedataset", split="train")
    reference_row = train_split[304]

    return tts_model, tts_processor, hifigan, encoder, reference_row
# Load every heavyweight resource once, at import time, so that request
# handlers can reuse them as module-level globals.
(
    model,
    processor,
    vocoder,
    speaker_model,
    default_example,
) = load_models_and_data()
def create_speaker_embedding(waveform):
    """Encode a waveform into a normalized speaker embedding.

    Args:
        waveform: Audio samples in any form accepted by ``torch.tensor``.

    Returns:
        The speaker encoder's output for a batch of one, L2-normalized
        along dim 2 and with all singleton dimensions squeezed away.
    """
    # The encoder is fed a batch, so add a leading batch axis of size one.
    batch = torch.tensor(waveform).unsqueeze(0).to(device)
    with torch.no_grad():
        raw_embedding = speaker_model.encode_batch(batch)
        unit_embedding = torch.nn.functional.normalize(raw_embedding, dim=2)
    return unit_embedding.squeeze()
def prepare_default_embedding(example):
    """Build the speaker embedding for one dataset row.

    Args:
        example: Mapping with an ``"audio"`` entry whose ``"array"`` field
            holds the audio samples.

    Returns:
        Whatever ``create_speaker_embedding`` returns for those samples.
    """
    samples = example["audio"]["array"]
    return create_speaker_embedding(samples)
# Computed once at import time; every synthesis request reuses this voice.
default_embedding = prepare_default_embedding(example=default_example)
def text_to_speech(text, audio_file=None):
    """Synthesize ``text`` and return the path of the resulting WAV file.

    Args:
        text: Text to convert to speech.
        audio_file: Accepted for interface compatibility but currently
            unused; the module-level default speaker embedding is always
            applied.

    Returns:
        Path of a newly created WAV file (written at 16000 Hz) holding the
        generated speech.
    """
    import tempfile

    encoded = processor(text=text, return_tensors="pt").to(device)
    speech = model.generate_speech(
        encoded["input_ids"],
        default_embedding.unsqueeze(0),  # add the batch axis
        vocoder=vocoder,
    )

    # Give every call its own output file. A single fixed "output.wav" is
    # shared by all requests, so overlapping calls could overwrite each
    # other's audio before it is served. The file is intentionally left on
    # disk (delete=False) because the caller reads it after we return.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        output_path = handle.name
    sf.write(output_path, speech.cpu().numpy(), samplerate=16000)
    return output_path
# Demo UI: one text box in, one audio player out.
# The title and description describe only what the interface offers. The
# previous wording advertised an optional speaker-audio upload, but no such
# input exists and text_to_speech always uses the default speaker embedding.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(label="Enter Turkish text to convert to speech"),
    ],
    outputs=gr.Audio(label="Generated Speech"),
    title="Turkish SpeechT5 Text-to-Speech Demo",
    description=(
        "Enter Turkish text and listen to the generated speech using "
        "the fine-tuned SpeechT5 model."
    ),
)
# share=True asks Gradio to also create a public share link.
iface.launch(share=True)