import sys

sys.path.append("..")
import os

now_dir = os.getcwd()

from dotenv import load_dotenv
from lib.infer.modules.vc.modules import VC
from assets.configs.config import Config

load_dotenv()
config = Config()
vc = VC(config)

import shutil
import numpy as np
import torch
import soundfile as sf
from gtts import gTTS
import edge_tts
import asyncio
import scipy.io.wavfile as wavfile
import nltk

nltk.download("punkt", quiet=True)

from nltk.tokenize import sent_tokenize
from bark import SAMPLE_RATE
import json
import ssl
from typing import Any, Dict, List, Optional
import aiohttp
import certifi
VOICE_LIST = (
    "https://speech.platform.bing.com/consumer/speech/synthesize/"
    + "readaloud/voices/list?trustedclienttoken="
    + "6A5AA1D4EAFF4E9FB37E23D68491D6F4"
)
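
# Voice-list endpoint used by Microsoft Edge's "Read Aloud" feature; the
# trustedclienttoken above is Edge's publicly known client token, mirrored
# here from the edge_tts package.
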
def get_bark_voice():
    mensaje = """
v2/en_speaker_0 English Male
v2/en_speaker_1 English Male
v2/en_speaker_2 English Male
v2/en_speaker_3 English Male
v2/en_speaker_4 English Male
v2/en_speaker_5 English Male
v2/en_speaker_6 English Male
v2/en_speaker_7 English Male
v2/en_speaker_8 English Male
v2/en_speaker_9 English Female
v2/zh_speaker_0 Chinese (Simplified) Male
v2/zh_speaker_1 Chinese (Simplified) Male
v2/zh_speaker_2 Chinese (Simplified) Male
v2/zh_speaker_3 Chinese (Simplified) Male
v2/zh_speaker_4 Chinese (Simplified) Female
v2/zh_speaker_5 Chinese (Simplified) Male
v2/zh_speaker_6 Chinese (Simplified) Female
v2/zh_speaker_7 Chinese (Simplified) Female
v2/zh_speaker_8 Chinese (Simplified) Male
v2/zh_speaker_9 Chinese (Simplified) Female
v2/fr_speaker_0 French Male
v2/fr_speaker_1 French Female
v2/fr_speaker_2 French Female
v2/fr_speaker_3 French Male
v2/fr_speaker_4 French Male
v2/fr_speaker_5 French Female
v2/fr_speaker_6 French Male
v2/fr_speaker_7 French Male
v2/fr_speaker_8 French Male
v2/fr_speaker_9 French Male
v2/de_speaker_0 German Male
v2/de_speaker_1 German Male
v2/de_speaker_2 German Male
v2/de_speaker_3 German Female
v2/de_speaker_4 German Male
v2/de_speaker_5 German Male
v2/de_speaker_6 German Male
v2/de_speaker_7 German Male
v2/de_speaker_8 German Female
v2/de_speaker_9 German Male
v2/hi_speaker_0 Hindi Female
v2/hi_speaker_1 Hindi Female
v2/hi_speaker_2 Hindi Male
v2/hi_speaker_3 Hindi Female
v2/hi_speaker_4 Hindi Female
v2/hi_speaker_5 Hindi Male
v2/hi_speaker_6 Hindi Male
v2/hi_speaker_7 Hindi Male
v2/hi_speaker_8 Hindi Male
v2/hi_speaker_9 Hindi Female
v2/it_speaker_0 Italian Male
v2/it_speaker_1 Italian Male
v2/it_speaker_2 Italian Female
v2/it_speaker_3 Italian Male
v2/it_speaker_4 Italian Male
v2/it_speaker_5 Italian Male
v2/it_speaker_6 Italian Male
v2/it_speaker_7 Italian Female
v2/it_speaker_8 Italian Male
v2/it_speaker_9 Italian Female
v2/ja_speaker_0 Japanese Female
v2/ja_speaker_1 Japanese Female
v2/ja_speaker_2 Japanese Male
v2/ja_speaker_3 Japanese Female
v2/ja_speaker_4 Japanese Female
v2/ja_speaker_5 Japanese Female
v2/ja_speaker_6 Japanese Male
v2/ja_speaker_7 Japanese Female
v2/ja_speaker_8 Japanese Female
v2/ja_speaker_9 Japanese Female
v2/ko_speaker_0 Korean Female
v2/ko_speaker_1 Korean Male
v2/ko_speaker_2 Korean Male
v2/ko_speaker_3 Korean Male
v2/ko_speaker_4 Korean Male
v2/ko_speaker_5 Korean Male
v2/ko_speaker_6 Korean Male
v2/ko_speaker_7 Korean Male
v2/ko_speaker_8 Korean Male
v2/ko_speaker_9 Korean Male
v2/pl_speaker_0 Polish Male
v2/pl_speaker_1 Polish Male
v2/pl_speaker_2 Polish Male
v2/pl_speaker_3 Polish Male
v2/pl_speaker_4 Polish Female
v2/pl_speaker_5 Polish Male
v2/pl_speaker_6 Polish Female
v2/pl_speaker_7 Polish Male
v2/pl_speaker_8 Polish Male
v2/pl_speaker_9 Polish Female
v2/pt_speaker_0 Portuguese Male
v2/pt_speaker_1 Portuguese Male
v2/pt_speaker_2 Portuguese Male
v2/pt_speaker_3 Portuguese Male
v2/pt_speaker_4 Portuguese Male
v2/pt_speaker_5 Portuguese Male
v2/pt_speaker_6 Portuguese Male
v2/pt_speaker_7 Portuguese Male
v2/pt_speaker_8 Portuguese Male
v2/pt_speaker_9 Portuguese Male
v2/ru_speaker_0 Russian Male
v2/ru_speaker_1 Russian Male
v2/ru_speaker_2 Russian Male
v2/ru_speaker_3 Russian Male
v2/ru_speaker_4 Russian Male
v2/ru_speaker_5 Russian Female
v2/ru_speaker_6 Russian Female
v2/ru_speaker_7 Russian Male
v2/ru_speaker_8 Russian Male
v2/ru_speaker_9 Russian Female
v2/es_speaker_0 Spanish Male
v2/es_speaker_1 Spanish Male
v2/es_speaker_2 Spanish Male
v2/es_speaker_3 Spanish Male
v2/es_speaker_4 Spanish Male
v2/es_speaker_5 Spanish Male
v2/es_speaker_6 Spanish Male
v2/es_speaker_7 Spanish Male
v2/es_speaker_8 Spanish Female
v2/es_speaker_9 Spanish Female
v2/tr_speaker_0 Turkish Male
v2/tr_speaker_1 Turkish Male
v2/tr_speaker_2 Turkish Male
v2/tr_speaker_3 Turkish Male
v2/tr_speaker_4 Turkish Female
v2/tr_speaker_5 Turkish Female
v2/tr_speaker_6 Turkish Male
v2/tr_speaker_7 Turkish Male
v2/tr_speaker_8 Turkish Male
v2/tr_speaker_9 Turkish Male
"""
    # Split the message into lines and keep "<voice>-<gender>" pairs.
    lineas = mensaje.split("\n")
    datos_deseados = []
    for linea in lineas:
        # The first token is the voice preset and the last is the gender;
        # the language name in between may itself contain spaces.
        partes = linea.split()
        if len(partes) >= 3:
            clave, genero = partes[0], partes[-1]
            datos_deseados.append(f"{clave}-{genero}")
    return datos_deseados
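
# get_bark_voice() therefore yields entries shaped like "v2/en_speaker_0-Male":
# the part before the final "-" is the Bark voice preset and the part after it
# is the speaker's gender.
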
# ||---------------------------------------------------------------------||
# ||                Obtained from the edge_tts dependency                 ||
# ||---------------------------------------------------------------------||
async def list_voices(*, proxy: Optional[str] = None) -> Any:
    """
    List all available voices and their attributes.

    This pulls data from the URL used by Microsoft Edge to return a list
    of all available voices.

    Returns:
        list: A list of voice-attribute dictionaries.
    """
    ssl_ctx = ssl.create_default_context(cafile=certifi.where())
    async with aiohttp.ClientSession(trust_env=True) as session:
        async with session.get(
            VOICE_LIST,
            headers={
                "Authority": "speech.platform.bing.com",
                "Sec-CH-UA": '" Not;A Brand";v="99", "Microsoft Edge";v="91", "Chromium";v="91"',
                "Sec-CH-UA-Mobile": "?0",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
                "Accept": "*/*",
                "Sec-Fetch-Site": "none",
                "Sec-Fetch-Mode": "cors",
                "Sec-Fetch-Dest": "empty",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9",
            },
            proxy=proxy,
            ssl=ssl_ctx,
        ) as url:
            data = json.loads(await url.text())
            return data
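
# Minimal usage sketch (assumes network access to the Bing endpoint):
#   voices = asyncio.run(list_voices())
#   print(voices[0]["ShortName"], voices[0]["Gender"])
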
async def create(custom_voices: Optional[List[Dict[str, Any]]] = None) -> List[Dict[str, Any]]:
    """
    Create a list of all available voices, reduced to the two attributes
    this module uses: short name and gender.
    """
    voices = await list_voices() if custom_voices is None else custom_voices
    simplified_voices = [
        {"ShortName": voice["ShortName"], "Gender": voice["Gender"]}
        for voice in voices
    ]
    return simplified_voices
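
# Each returned entry looks like, e.g.:
#   {"ShortName": "en-US-AriaNeural", "Gender": "Female"}
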
async def loop_main():
    voices = await create()
    voices_json = json.dumps(voices)
    return voices_json


def get_edge_voice():
    voices_json = asyncio.run(loop_main())
    voices = json.loads(voices_json)
    tts_voice = []
    for voice in voices:
        short_name = voice["ShortName"]
        gender = voice["Gender"]
        tts_voice.append(f"{short_name}-{gender}")
    return tts_voice
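
# Each entry is "<ShortName>-<Gender>", e.g. "en-US-AriaNeural-Female";
# use_tts() below strips the trailing "-<Gender>" with
# "-".join(tts_voice.split("-")[:-1]) to recover the plain ShortName.
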
set_bark_voice = get_bark_voice()
set_edge_voice = get_edge_voice()


def update_tts_methods_voice(select_value):
    # ["Edge-tts", "RVG-tts", "Bark-tts"]
    if select_value == "Edge-tts":
        return {"choices": set_edge_voice, "value": "", "__type__": "update"}
    elif select_value == "Bark-tts":
        return {"choices": set_bark_voice, "value": "", "__type__": "update"}
def custom_voice(
    _values,  # filter indices
    audio_files,  # all audio files
    model_voice_path="",
    transpose=0,
    f0method="pm",
    index_rate_=0.66,
    crepe_hop_length_=64.0,
    f0_autotune=False,
    file_index="",
    file_index2="",
):
    vc.get_vc(model_voice_path)
    for _value_item in _values:
        filename = (
            os.path.join("assets", "audios", "audio-outputs", audio_files[_value_item])
            if _value_item != "converted_tts"
            else audio_files[0]
        )
        try:
            print(audio_files[_value_item], model_voice_path)
        except Exception:
            pass
        info_, (sample_, audio_output_) = vc.vc_single_dont_save(
            sid=0,
            input_audio_path1=filename,
            f0_up_key=transpose,  # e.g. 12 / -12 to shift between male and female ranges
            f0_file=None,
            f0_method=f0method,
            file_index=file_index,
            file_index2=file_index2,
            index_rate=index_rate_,
            filter_radius=3,
            resample_sr=0,
            rms_mix_rate=0.25,
            protect=0.33,
            crepe_hop_length=crepe_hop_length_,
            f0_autotune=f0_autotune,
            f0_min=50,
            note_min=50,
            f0_max=1100,
            note_max=1100,
        )
        # Overwrite the source file with the voice-converted result.
        sf.write(
            file=filename,
            samplerate=sample_,
            data=audio_output_,
        )
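
# Hedged usage sketch (the model and file names are illustrative, not part
# of this repo's guaranteed layout):
#   custom_voice(
#       [0],
#       ["tts_out_1.wav"],
#       model_voice_path="logs/weights/speaker.pth",
#       transpose=0,
#       f0method="pm",
#   )
# Note the conversion is in place: each listed file is replaced on disk.
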
def cast_to_device(tensor, device):
    # Best-effort .to(device): returns the input unchanged when the move
    # fails (e.g. out of memory) or the object is not a tensor.
    try:
        return tensor.to(device)
    except Exception as e:
        print(e)
        return tensor
def __bark__(text, voice_preset):
    os.makedirs(os.path.join(now_dir, "tts"), exist_ok=True)
    from transformers import AutoProcessor, BarkModel

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    dtype = torch.float32 if "cpu" in device else torch.float16
    bark_processor = AutoProcessor.from_pretrained(
        "suno/bark",
        cache_dir=os.path.join(now_dir, "tts", "suno/bark"),
        torch_dtype=dtype,
    )
    bark_model = BarkModel.from_pretrained(
        "suno/bark",
        cache_dir=os.path.join(now_dir, "tts", "suno/bark"),
        torch_dtype=dtype,
    ).to(device)
    # bark_model.enable_cpu_offload()
    inputs = bark_processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
    tensor_dict = {
        k: cast_to_device(v, device) if hasattr(v, "to") else v
        for k, v in inputs.items()
    }
    speech_values = bark_model.generate(**tensor_dict, do_sample=True)
    sampling_rate = bark_model.generation_config.sample_rate
    speech = speech_values.cpu().numpy().squeeze()
    return speech, sampling_rate
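
# __bark__ re-instantiates the processor and model on every call, which is
# simple but slow; callers synthesizing many sentences may want to hoist the
# from_pretrained() loads out of the loop. Hedged usage sketch:
#   audio, sr = __bark__("Hello there.", "v2/en_speaker_0")
#   sf.write("hello.wav", audio, sr)
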
def use_tts(
    tts_text,
    tts_voice,
    model_path,
    index_path,
    transpose,
    f0_method,
    index_rate,
    crepe_hop_length,
    f0_autotune,
    tts_method,
):
    if tts_voice is None:
        return
    output_folder = "assets/audios/audio-outputs"
    os.makedirs(output_folder, exist_ok=True)
    output_count = 1  # counter for unique output file names
    while True:
        converted_tts_filename = os.path.join(output_folder, f"tts_out_{output_count}.wav")
        bark_out_filename = os.path.join(output_folder, f"bark_out_{output_count}.wav")
        if not os.path.exists(converted_tts_filename) and not os.path.exists(bark_out_filename):
            break
        output_count += 1
    if os.getenv("DEMO") == "SET_LIMIT":
        if len(tts_text) > 60:
            tts_text = tts_text[:60]
            print("DEMO: input limited to 60 characters")
    language = tts_voice[:2]
    if tts_method == "Edge-tts":
        try:
            asyncio.run(
                edge_tts.Communicate(
                    tts_text, "-".join(tts_voice.split("-")[:-1])
                ).save(converted_tts_filename)
            )
        except Exception:
            try:
                # Fall back to gTTS when edge_tts returns no audio.
                tts = gTTS(tts_text, lang=language)
                tts.save(converted_tts_filename)
                print(
                    f"No audio was received. Please change the TTS voice for {tts_voice}. Using gTTS."
                )
            except Exception:
                # Last resort: write a short placeholder so downstream code
                # still has a file to convert.
                tts = gTTS("a", lang=language)
                tts.save(converted_tts_filename)
                print("Error: audio will be replaced with a placeholder.")
        try:
            vc.get_vc(model_path)
            info_, (sample_, audio_output_) = vc.vc_single_dont_save(
                sid=0,
                input_audio_path1=converted_tts_filename,
                f0_up_key=transpose,
                f0_file=None,
                f0_method=f0_method,
                file_index="",
                file_index2=index_path,
                index_rate=index_rate,
                filter_radius=3,
                resample_sr=0,
                rms_mix_rate=0.25,
                protect=0.33,
                crepe_hop_length=crepe_hop_length,
                f0_autotune=f0_autotune,
                f0_min=50,
                note_min=50,
                f0_max=1100,
                note_max=1100,
            )
            # Unique file name for the audio processed by vc.vc_single_dont_save
            vc_output_filename = os.path.join(output_folder, f"converted_tts_{output_count}.wav")
            wavfile.write(
                vc_output_filename,
                rate=sample_,
                data=audio_output_,
            )
            return vc_output_filename, converted_tts_filename
        except Exception as e:
            print(f"{e}")
            return None, None
    elif tts_method == "Bark-tts":
        try:
            script = tts_text.replace("\n", " ").strip()
            sentences = sent_tokenize(script)
            print(sentences)
            # Quarter-second of silence between sentences.
            silence = np.zeros(int(0.25 * SAMPLE_RATE))
            pieces = []
            for sentence in sentences:
                audio_array, _ = __bark__(sentence, tts_voice.split("-")[0])
                pieces += [audio_array, silence.copy()]
            sf.write(
                file=bark_out_filename, samplerate=SAMPLE_RATE, data=np.concatenate(pieces)
            )
            vc.get_vc(model_path)
            info_, (sample_, audio_output_) = vc.vc_single_dont_save(
                sid=0,
                input_audio_path1=bark_out_filename,
                f0_up_key=transpose,
                f0_file=None,
                f0_method=f0_method,
                file_index="",
                file_index2=index_path,
                index_rate=index_rate,
                filter_radius=3,
                resample_sr=0,
                rms_mix_rate=0.25,
                protect=0.33,
                crepe_hop_length=crepe_hop_length,
                f0_autotune=f0_autotune,
                f0_min=50,
                note_min=50,
                f0_max=1100,
                note_max=1100,
            )
            # Unique file name for the audio processed by vc.vc_single_dont_save
            vc_output_filename = os.path.join(output_folder, f"converted_bark_{output_count}.wav")
            wavfile.write(
                vc_output_filename,
                rate=sample_,
                data=audio_output_,
            )
            return vc_output_filename, bark_out_filename
        except Exception as e:
            print(f"{e}")
            return None, None
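
# Hedged end-to-end sketch (the model and index paths are illustrative):
#   converted, raw = use_tts(
#       tts_text="Hello world.",
#       tts_voice="en-US-AriaNeural-Female",  # or "v2/en_speaker_0-Male" for Bark
#       model_path="logs/weights/speaker.pth",
#       index_path="logs/speaker.index",
#       transpose=0,
#       f0_method="pm",
#       index_rate=0.66,
#       crepe_hop_length=64,
#       f0_autotune=False,
#       tts_method="Edge-tts",
#   )
# On success this returns (converted_path, raw_tts_path); on failure, (None, None).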