Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| from pathlib import Path | |
| from transformers import AutoProcessor, BarkModel | |
| import scipy | |
| from pytube import YouTube | |
| from pydub import AudioSegment | |
| from TTS.api import TTS | |
| #import ffmpeg | |
# Inference is pinned to the CPU. The half-precision GPU variant is kept here
# for reference only:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
#   model.enable_cpu_offload()
device = "cpu"

# Bark text processor and model, loaded once at import time and shared by
# every request.
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small").to(device)

# Choices offered by the UI dropdowns: speaker numbers "1".."10" and the
# language codes of the available voice presets.
num_list = [str(number) for number in range(1, 11)]
lang_list = ["en", "de"]
def run_bark(text, n, lang):
    """Synthesize ``text`` with Bark using one of its built-in voice presets.

    Args:
        text: The prompt to be spoken.
        n: Speaker number as selected in the UI ("1".."10"). It is converted
            to an int and decremented by one to build the preset name.
        lang: Language code used in the preset name (e.g. "en", "de").

    Returns:
        The path of the WAV file that was written ("bark_out.wav").
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.io.wavfile`` is loaded on every SciPy release.
    from scipy.io import wavfile

    voice_preset = f"v2/{lang}_speaker_{int(n) - 1}"
    inputs = processor(
        text=text,
        voice_preset=voice_preset,
        return_tensors="pt",
    )

    print("generating")
    speech_values = model.generate(
        **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
    )

    # Use the rate the model was configured with rather than a hard-coded one.
    sampling_rate = model.generation_config.sample_rate

    print("writing")
    out_path = "bark_out.wav"
    wavfile.write(
        out_path,
        rate=sampling_rate,
        data=speech_values.cpu().numpy().squeeze(),
    )
    return out_path
# Lazily created YourTTS model; see _load_your_tts().
_YOUR_TTS_MODEL = None


def _load_your_tts():
    """Return the YourTTS model, constructing it only on the first call."""
    global _YOUR_TTS_MODEL
    if _YOUR_TTS_MODEL is None:
        _YOUR_TTS_MODEL = TTS(
            model_name="tts_models/multilingual/multi-dataset/your_tts",
            progress_bar=False,
        ).to(device)
    return _YOUR_TTS_MODEL


def custom_bark(inp):
    """Speak ``inp`` with YourTTS, using "Mid.mp3" as the reference voice.

    "Mid.mp3" is the file exported by ``trim_clip``; it must exist before this
    function is called.

    Args:
        inp: The text to synthesize (spoken as English).

    Returns:
        The path of the WAV file that was written ("output.wav").

    Raises:
        gr.Error: If the reference clip "Mid.mp3" has not been created yet.
    """
    speaker_wav = Path("Mid.mp3")
    if not speaker_wav.is_file():
        # Fail with a message the user can act on instead of an opaque
        # file-loading error from inside the TTS library.
        raise gr.Error(
            "No reference voice clip found. Load and trim an audio clip first."
        )

    # Reuse the already-loaded model instead of rebuilding it on every click.
    tts = _load_your_tts()
    tts.tts_to_file(
        inp, speaker_wav=speaker_wav, language="en", file_path="output.wav"
    )
    return "output.wav"
def load_video_yt(vid):
    """Download a YouTube video and an audio-only stream of it.

    The video is saved as "tmp.mp4" and the audio as "tmp_aud.mp4".

    Args:
        vid: URL of the YouTube video.

    Returns:
        A 3-tuple ``(video_path, audio_path, "tmp_aud.mp4")`` where the first
        two items are the paths returned by the downloads and the third is
        the audio file name.

    Raises:
        gr.Error: If the video offers no progressive MP4 stream or no
            audio-only stream.
    """
    yt = YouTube(vid)

    # Highest-resolution progressive (audio + video in one file) MP4 stream.
    video_stream = (
        yt.streams.filter(progressive=True, file_extension="mp4")
        .order_by("resolution")
        .desc()
        .first()
    )
    # ``first()`` returns None on an empty query instead of raising IndexError.
    audio_stream = yt.streams.filter(only_audio=True).first()

    if video_stream is None or audio_stream is None:
        raise gr.Error("No downloadable MP4 video or audio stream was found for this URL.")

    video_path = video_stream.download(filename="tmp.mp4")
    audio_path = audio_stream.download(filename="tmp_aud.mp4")
    print(yt.length)
    return video_path, audio_path, "tmp_aud.mp4"
def _timestamp_to_ms(stamp):
    """Convert a "SS", "MM:SS" or "HH:MM:SS" timestamp to milliseconds."""
    fields = str(stamp).strip().split(":")
    if not 1 <= len(fields) <= 3:
        raise ValueError(f"Invalid timestamp {stamp!r}; expected M:SS.")
    try:
        numbers = [int(field) for field in fields]
    except ValueError:
        raise ValueError(f"Invalid timestamp {stamp!r}; expected M:SS.") from None

    total_seconds = 0
    for number in numbers:
        total_seconds = total_seconds * 60 + number
    return total_seconds * 1000


def trim_clip(clip, start_t, end_t):
    """Cut a time range out of the downloaded audio and save it as "Mid.mp3".

    The audio is always read from "tmp_aud.mp4"; the ``clip`` argument is
    accepted for the UI wiring but is not used.

    Args:
        clip: Ignored (see above).
        start_t: Start of the range, e.g. "0:23".
        end_t: End of the range, e.g. "1:12".

    Returns:
        The path of the exported clip ("Mid.mp3").

    Raises:
        gr.Error: If a timestamp cannot be parsed or the end is not after
            the start.
    """
    # Validate the range before doing the comparatively slow audio decode.
    try:
        start_ms = _timestamp_to_ms(start_t)
        end_ms = _timestamp_to_ms(end_t)
    except ValueError as err:
        raise gr.Error(str(err)) from err
    if end_ms <= start_ms:
        raise gr.Error("End time must be later than start time.")

    audio = AudioSegment.from_file("tmp_aud.mp4", format="mp4")

    # pydub slices by milliseconds.
    segment = audio[start_ms:end_ms]
    segment.export("Mid.mp3", format="mp3")
    print("New Audio file is created and saved")
    return "Mid.mp3"
# NOTE(review): the original indentation was lost, so the nesting of the
# layout containers below is reconstructed from the order of the components.
# Confirm it against the intended layout.
with gr.Blocks() as app:
    with gr.Column():
        # Text prompt shared by both synthesis tabs.
        in_text = gr.Textbox()

        # Tab 1: Bark with a built-in speaker preset.
        with gr.Tab("Default"):
            with gr.Row():
                speaker_num = gr.Dropdown(
                    label="Speaker Voice", choices=num_list, value="1"
                )
                speaker_lang = gr.Dropdown(
                    label="Speaker Language", choices=lang_list, value="en"
                )
            go_btn = gr.Button()

        # Tab 2: voice cloning from a user-supplied or YouTube clip.
        with gr.Tab("Upload"):
            with gr.Row():
                with gr.Column():
                    in_aud_mic = gr.Audio(source='microphone')
                    in_aud_file = gr.Audio(source='upload', interactive=True)
                    aud_file = gr.File()
                with gr.Column():
                    in_aud_yt = gr.Textbox(label="YouTube URL")
                    load_yt_btn = gr.Button("Load URL")
                with gr.Column():
                    with gr.Row():
                        start_time = gr.Textbox(
                            label="Start", value="0:00", placeholder="0:23"
                        )
                        end_time = gr.Textbox(
                            label="End", value="0:01", placeholder="1:12"
                        )
                    trim_clip_btn = gr.Button("Trim Clip")
                    trim_aud = gr.Audio(source='upload', interactive=False)
            alt_go_btn = gr.Button()
            yt_vid = gr.Video(type='filepath')

    with gr.Column():
        out_audio = gr.Audio()

    # Event wiring: each button runs one of the handlers defined above.
    go_btn.click(
        fn=run_bark,
        inputs=[in_text, speaker_num, speaker_lang],
        outputs=out_audio,
    )
    load_yt_btn.click(
        fn=load_video_yt,
        inputs=in_aud_yt,
        outputs=[yt_vid, in_aud_file, aud_file],
    )
    trim_clip_btn.click(
        fn=trim_clip,
        inputs=[aud_file, start_time, end_time],
        outputs=trim_aud,
    )
    alt_go_btn.click(fn=custom_bark, inputs=in_text, outputs=out_audio)

app.launch()