Spaces:

Omnibus
/

Bark-simple

Running

App Files Files Community

Bark-simple / app.py

Omnibus

Update app.py

eb278b4 verified almost 2 years ago

raw

history blame contribute delete

4.67 kB

	import gradio as gr
	import torch
	from pathlib import Path
	from transformers import AutoProcessor, BarkModel
	import scipy
	from pytube import YouTube
	from pydub import AudioSegment
	from TTS.api import TTS
	#import ffmpeg


	# Inference is pinned to the CPU. An earlier GPU variant (kept for reference)
	# picked "cuda" when available, loaded the model with torch_dtype=torch.float16
	# and called model.enable_cpu_offload().
	device = "cpu"

	# Bark checkpoint plus the processor that turns text and a voice preset into
	# model inputs.
	processor = AutoProcessor.from_pretrained("suno/bark-small")
	model = BarkModel.from_pretrained("suno/bark-small").to(device)

	# Dropdown choices: speaker numbers "1" through "10" and language codes.
	num_list = [str(number) for number in range(1, 11)]
	lang_list = ["en", "de"]
	def run_bark(text, n, lang):
	    """Speak ``text`` with a Bark preset voice and return the WAV file path.

	    ``n`` is the 1-based speaker number and ``lang`` the language code; they
	    select the voice preset ``v2/{lang}_speaker_{n - 1}``. The generated audio
	    is written to ``bark_out.wav`` and that path is returned.
	    """
	    out_path = "bark_out.wav"
	    voice_preset = f"v2/{lang}_speaker_{int(n)-1}"
	    encoded = processor(text=text, voice_preset=voice_preset, return_tensors="pt")

	    print("generating")
	    sampling_options = dict(coarse_temperature=0.8, temperature=0.5, do_sample=True)
	    waveform = model.generate(**encoded, **sampling_options)

	    print("writing")
	    scipy.io.wavfile.write(
	        out_path,
	        rate=model.generation_config.sample_rate,
	        data=waveform.cpu().numpy().squeeze(),
	    )
	    return out_path

	# Lazily created YourTTS instance, reused across calls so the model is not
	# rebuilt on every button click.
	_YOUR_TTS_MODEL = None


	def custom_bark(inp):
	    """Speak ``inp`` with YourTTS, using ``Mid.mp3`` as the speaker reference.

	    Args:
	        inp: Text to synthesize (English).

	    Returns:
	        The string ``"output.wav"``, the file the speech was written to.

	    Raises:
	        FileNotFoundError: If ``Mid.mp3`` does not exist yet (it is produced
	            by ``trim_clip``).
	    """
	    global _YOUR_TTS_MODEL

	    speaker_wav = Path("Mid.mp3")
	    # Fail early with a readable message instead of an error from deep inside
	    # the TTS library after the model has already been loaded.
	    if not speaker_wav.is_file():
	        raise FileNotFoundError(
	            "Speaker reference 'Mid.mp3' not found; trim a clip first."
	        )

	    if _YOUR_TTS_MODEL is None:
	        _YOUR_TTS_MODEL = TTS(
	            model_name="tts_models/multilingual/multi-dataset/your_tts",
	            progress_bar=False,
	        ).to(device)

	    _YOUR_TTS_MODEL.tts_to_file(
	        inp, speaker_wav=speaker_wav, language="en", file_path="output.wav"
	    )
	    return "output.wav"

	def load_video_yt(vid):
	    """Download the video and audio of the YouTube URL ``vid``.

	    The highest-resolution progressive MP4 stream is saved as ``tmp.mp4`` and
	    the first audio-only stream as ``tmp_aud.mp4``. The video length is
	    printed. Returns ``(video_path, audio_path, "tmp_aud.mp4")``.
	    """
	    yt = YouTube(vid)

	    progressive_mp4 = yt.streams.filter(progressive=True, file_extension='mp4')
	    best_video = progressive_mp4.order_by('resolution').desc().first()
	    video_path = best_video.download(filename="tmp.mp4")

	    audio_streams = yt.streams.filter(only_audio=True)
	    audio_path = audio_streams[0].download(filename="tmp_aud.mp4")

	    print(yt.length)
	    return video_path, audio_path, "tmp_aud.mp4"

	def _clip_time_to_ms(timestamp):
	    """Convert ``"ss"``, ``"m:ss"`` or ``"h:mm:ss"`` text to milliseconds."""
	    fields = str(timestamp).strip().split(":")
	    if len(fields) > 3:
	        raise ValueError(f"Unrecognised time {timestamp!r}; expected m:ss")
	    total_seconds = 0
	    for field in fields:
	        # Each step to the right is a 60x smaller unit (h -> m -> s).
	        total_seconds = total_seconds * 60 + int(field)
	    return total_seconds * 1000


	def trim_clip(clip, start_t, end_t):
	    """Cut a section out of ``tmp_aud.mp4`` and save it as ``Mid.mp3``.

	    Args:
	        clip: Unused; kept so the click handler's signature stays the same.
	            The audio is always read from ``tmp_aud.mp4``, the file that
	            ``load_video_yt`` downloads.
	        start_t: Start of the section, e.g. ``"0:23"``.
	        end_t: End of the section, e.g. ``"1:12"``.

	    Returns:
	        The string ``"Mid.mp3"``, the file the section was exported to.

	    Raises:
	        ValueError: If a time cannot be parsed or the end is not after the
	            start.
	    """
	    start = _clip_time_to_ms(start_t)
	    end = _clip_time_to_ms(end_t)
	    if end <= start:
	        raise ValueError(
	            f"End time {end_t!r} must be later than start time {start_t!r}"
	        )

	    song = AudioSegment.from_file("tmp_aud.mp4", format="mp4")

	    # pydub slices audio by milliseconds.
	    section = song[start:end]
	    section.export("Mid.mp3", format="mp3")
	    print("New Audio file is created and saved")

	    return "Mid.mp3"

	# Gradio UI: one text box feeding two synthesis paths (a preset Bark voice via
	# run_bark, or YourTTS with the trimmed clip as speaker reference via
	# custom_bark) and a single audio output.
	# NOTE(review): the nested `with` indentation is missing in this copy of the
	# file, so the exact layout nesting cannot be read from here; restore it
	# before running.
	with gr.Blocks() as app:
	with gr.Column():
	# Text to speak; passed to both run_bark and custom_bark below.
	in_text = gr.Textbox()
	# "Default" tab: pick a preset speaker number and language for run_bark.
	with gr.Tab("Default"):
	with gr.Row():
	speaker_num = gr.Dropdown(label="Speaker Voice", choices=num_list,value="1")
	speaker_lang = gr.Dropdown(label="Speaker Language", choices=lang_list,value="en")
	go_btn = gr.Button()
	# "Upload" tab: load audio from YouTube and trim it for custom_bark.
	with gr.Tab("Upload"):
	with gr.Row():
	with gr.Column():
	# in_aud_mic is not wired to any event in the code visible here.
	in_aud_mic = gr.Audio(source='microphone')
	in_aud_file = gr.Audio(source='upload', interactive = True)
	aud_file = gr.File()
	with gr.Column():
	in_aud_yt = gr.Textbox(label="YouTube URL")
	load_yt_btn = gr.Button("Load URL")
	with gr.Column():
	with gr.Row():
	# Trim boundaries as minutes:seconds text, parsed by trim_clip.
	start_time = gr.Textbox(label = "Start", value = "0:00", placeholder = "0:23")
	end_time = gr.Textbox(label = "End", value = "0:01", placeholder = "1:12")

	trim_clip_btn = gr.Button("Trim Clip")
	# Read-only player that receives trim_clip's output.
	trim_aud = gr.Audio(source='upload', interactive = False)
	alt_go_btn = gr.Button()
	# NOTE(review): confirm that gr.Video accepts a `type` argument in the
	# installed Gradio version.
	yt_vid = gr.Video(type = 'filepath')
	#speaker_num = gr.Number(value=0)

	with gr.Column():
	# Shared output for both go_btn (run_bark) and alt_go_btn (custom_bark).
	out_audio = gr.Audio()

	# Event wiring: click(handler, inputs, outputs).
	# Preset voice: text + speaker number + language -> generated audio.
	go_btn.click(run_bark,[in_text, speaker_num, speaker_lang],out_audio)
	# YouTube URL -> the three values load_video_yt returns, in this order.
	load_yt_btn.click(load_video_yt, in_aud_yt, [yt_vid,in_aud_file,aud_file])
	# File + start/end times -> trimmed audio shown in trim_aud.
	trim_clip_btn.click(trim_clip,[aud_file, start_time, end_time],trim_aud)
	# Text -> custom_bark's output audio.
	alt_go_btn.click(custom_bark, in_text, out_audio)

	# Launch the app.
	app.launch()