Spaces:

TIMBOVILL
/

Applio-V3-HF

Running

App Files Files Community

Applio-V3-HF / rvc /train /extract /extract_f0_print.py

PlayerBPlaytime

Upload 125 files

16de183 verified almost 2 years ago

raw

history blame contribute delete

7.4 kB

	import os
	import sys
	import numpy as np
	import pyworld
	import torchcrepe
	import torch
	import parselmouth
	import tqdm
	from multiprocessing import Process, cpu_count

	current_directory = os.getcwd()
	sys.path.append(current_directory)


	from rvc.lib.utils import load_audio


	# Positional command-line arguments: <exp_dir> <f0_method> [hop_length].
	# exp_dir and f0_method are required; a missing one raises IndexError here.
	exp_dir = sys.argv[1]
	f0_method = sys.argv[2]
	# One worker process per CPU reported by multiprocessing.cpu_count().
	num_processes = cpu_count()

	# hop_length is optional: fall back to 128 when the third argument is
	# absent (IndexError) or is not an integer literal (ValueError).
	try:
	    hop_length = int(sys.argv[3])
	except (IndexError, ValueError):
	    hop_length = 128

	# NOTE(review): these three constants are not referenced anywhere in the
	# visible part of this file - presumably kept for other modules; confirm
	# before removing.
	DoFormant = False
	Quefrency = 1.0
	Timbre = 1.0


	class FeatureInput:
	    """Compute fundamental-frequency (f0) features for audio files.

	    Each supported extraction method turns a waveform into an f0 contour;
	    ``coarse_f0`` then quantizes that contour into integer bins on a mel
	    scale, and ``process_paths`` runs the whole pipeline over a list of
	    files, saving both arrays with ``np.save``.
	    """

	    def __init__(self, sample_rate=16000, hop_size=160):
	        """Store analysis settings and derive the mel-scale bin limits.

	        Args:
	            sample_rate: Sampling rate passed to the audio loader and to
	                every f0 extractor.
	            hop_size: Number of samples per analysis frame.
	        """
	        self.fs = sample_rate
	        self.hop = hop_size

	        # Maps method name -> bound extractor; "crepe" is handled separately
	        # in compute_f0 because it needs extra arguments.
	        self.f0_method_dict = self.get_f0_method_dict()

	        self.f0_bin = 256
	        self.f0_max = 1100.0
	        self.f0_min = 50.0
	        # Mel values of the lowest/highest f0, used to scale coarse bins.
	        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
	        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

	    def mncrepe(self, method, x, p_len, hop_length):
	        """Estimate f0 with torchcrepe and resample it to ``p_len`` frames.

	        Args:
	            method: Only ``"crepe"`` is handled; anything else yields None.
	            x: Waveform as a NumPy array (assumed 1-D mono - TODO confirm
	                against ``load_audio``).
	            p_len: Target number of frames; when falsy it is derived as
	                ``x.shape[0] // hop_length``.
	            hop_length: Hop size in samples handed to torchcrepe.

	        Returns:
	            The resampled f0 array with unvoiced frames set to 0, or None
	            when ``method`` is not ``"crepe"``.
	        """
	        if method != "crepe":
	            return None

	        # Prefer the first CUDA device, then Apple MPS, then the CPU.
	        if torch.cuda.is_available():
	            torch_device = torch.device("cuda:0")
	        elif torch.backends.mps.is_available():
	            torch_device = torch.device("mps")
	        else:
	            torch_device = torch.device("cpu")

	        audio = torch.from_numpy(x.astype(np.float32)).to(torch_device, copy=True)
	        # Normalize by the 99.9th percentile of the magnitude. Skip the
	        # division for all-silent input, where it would produce NaNs.
	        peak = torch.quantile(torch.abs(audio), 0.999)
	        if peak > 0:
	            audio /= peak
	        # torchcrepe receives the waveform with a leading batch dimension.
	        audio = torch.unsqueeze(audio, dim=0).detach()

	        pitch = torchcrepe.predict(
	            audio,
	            self.fs,
	            hop_length,
	            self.f0_min,
	            self.f0_max,
	            "full",
	            batch_size=hop_length * 2,
	            device=torch_device,
	            pad=True,
	        )
	        p_len = p_len or x.shape[0] // hop_length
	        source = np.array(pitch.squeeze(0).cpu().float().numpy())
	        # Mark (near-)zero estimates as NaN so they do not act as real pitch
	        # values during interpolation.
	        source[source < 0.001] = np.nan
	        # Linearly resample the contour from len(source) to p_len frames.
	        target = np.interp(
	            np.arange(0, len(source) * p_len, len(source)) / p_len,
	            np.arange(0, len(source)),
	            source,
	        )
	        return np.nan_to_num(target)

	    def get_pm(self, x, p_len):
	        """Estimate f0 with Praat's autocorrelation method via parselmouth.

	        Args:
	            x: Waveform samples.
	            p_len: Desired number of frames; the result is zero-padded on
	                both sides when Praat returns fewer frames.

	        Returns:
	            f0 array, padded to at least ``p_len`` entries.
	        """
	        f0 = (
	            parselmouth.Sound(x, self.fs)
	            .to_pitch_ac(
	                # Frame step in seconds, derived from the instance settings
	                # (0.01 s for the default 160-sample hop at 16 kHz).
	                time_step=self.hop / self.fs,
	                voicing_threshold=0.6,
	                pitch_floor=self.f0_min,
	                pitch_ceiling=self.f0_max,
	            )
	            .selected_array["frequency"]
	        )

	        # Split the missing frames between both ends; the extra frame of an
	        # odd deficit goes to the front.
	        deficit = p_len - len(f0)
	        pad_front = max(0, (deficit + 1) // 2)
	        pad_back = max(0, deficit - (deficit + 1) // 2)
	        return np.pad(f0, [[pad_front, pad_back]], mode="constant")

	    def _pyworld_f0(self, estimator, x):
	        """Run a pyworld f0 estimator and refine its output with stonemask."""
	        signal = x.astype(np.double)
	        f0_spectral = estimator(
	            signal,
	            fs=self.fs,
	            f0_ceil=self.f0_max,
	            f0_floor=self.f0_min,
	            frame_period=1000 * self.hop / self.fs,  # milliseconds per frame
	        )
	        return pyworld.stonemask(signal, *f0_spectral, self.fs)

	    def get_harvest(self, x):
	        """Estimate f0 with ``pyworld.harvest`` refined by stonemask."""
	        return self._pyworld_f0(pyworld.harvest, x)

	    def get_dio(self, x):
	        """Estimate f0 with ``pyworld.dio`` refined by stonemask."""
	        return self._pyworld_f0(pyworld.dio, x)

	    def get_rmvpe(self, x):
	        """Estimate f0 with the RMVPE model.

	        The model is created on first use from ``rmvpe.pt`` (on the CPU,
	        ``is_half=False``) and cached on the instance for later calls.
	        """
	        if not hasattr(self, "model_rmvpe"):
	            from rvc.lib.rmvpe import RMVPE

	            self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device="cpu")
	        return self.model_rmvpe.infer_from_audio(x, thred=0.03)

	    def get_f0_method_dict(self):
	        """Return the mapping from method name to extractor method."""
	        return {
	            "pm": self.get_pm,
	            "harvest": self.get_harvest,
	            "dio": self.get_dio,
	            "rmvpe": self.get_rmvpe,
	        }

	    def compute_f0(self, path, f0_method, hop_length):
	        """Load ``path`` and extract its f0 contour with ``f0_method``.

	        Args:
	            path: Audio file handed to ``load_audio``.
	            f0_method: One of the keys of ``f0_method_dict`` or ``"crepe"``.
	            hop_length: Hop size in samples; only used by ``"crepe"``.

	        Returns:
	            The f0 array produced by the selected extractor.

	        Raises:
	            ValueError: If ``f0_method`` is not supported. The check runs
	                before the audio is loaded so bad input fails fast.
	        """
	        extractor = self.f0_method_dict.get(f0_method)
	        if extractor is None and f0_method != "crepe":
	            supported = sorted([*self.f0_method_dict, "crepe"])
	            raise ValueError(
	                f"Unsupported f0 method {f0_method!r}; expected one of {supported}"
	            )

	        x = load_audio(path, self.fs)
	        p_len = x.shape[0] // self.hop

	        if extractor is None:
	            return self.mncrepe(f0_method, x, p_len, hop_length)
	        # "pm" is the only table entry that also takes the frame count.
	        if f0_method == "pm":
	            return extractor(x, p_len)
	        return extractor(x)

	    def coarse_f0(self, f0):
	        """Quantize an f0 contour into integer bins on the mel scale.

	        Args:
	            f0: NumPy array of f0 values; entries that map to a mel value
	                of 0 or less are treated as unvoiced.

	        Returns:
	            Integer array with values in ``[1, f0_bin - 1]``; unvoiced or
	            too-low frames map to 1.
	        """
	        f0_mel = 1127 * np.log(1 + f0 / 700)
	        voiced = f0_mel > 0
	        # Rescale voiced frames so [f0_mel_min, f0_mel_max] -> [1, f0_bin - 1].
	        f0_mel[voiced] = (f0_mel[voiced] - self.f0_mel_min) * (
	            self.f0_bin - 2
	        ) / (self.f0_mel_max - self.f0_mel_min) + 1

	        # Clamp everything into the valid bin range.
	        f0_mel[f0_mel <= 1] = 1
	        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
	        f0_coarse = np.rint(f0_mel).astype(int)
	        # Internal sanity check on the clamping above (e.g. trips on NaN input).
	        assert f0_coarse.max() <= self.f0_bin - 1 and f0_coarse.min() >= 1, (
	            f0_coarse.max(),
	            f0_coarse.min(),
	        )
	        return f0_coarse

	    def process_paths(self, paths, f0_method, hop_length, thread_n):
	        """Extract and save f0 features for every entry in ``paths``.

	        Args:
	            paths: Sequence of ``(input_path, coarse_out, f0_out)`` triples.
	                Outputs are written with ``np.save``, which appends ``.npy``.
	            f0_method: Extraction method forwarded to ``compute_f0``.
	            hop_length: Hop size forwarded to ``compute_f0``.
	            thread_n: Worker index; used as the progress-bar row and label.

	        Files whose two outputs already exist are skipped. A failure on one
	        file is printed and does not stop the remaining files.
	        """
	        if not paths:
	            print("There are no paths to process.")
	            return

	        with tqdm.tqdm(total=len(paths), leave=True, position=thread_n) as pbar:
	            pbar.set_description(f"Thread {thread_n} | Hop-Length {hop_length}")

	            for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
	                try:
	                    already_done = os.path.exists(
	                        opt_path1 + ".npy"
	                    ) and os.path.exists(opt_path2 + ".npy")
	                    if not already_done:
	                        feature_pit = self.compute_f0(
	                            inp_path, f0_method, hop_length
	                        )
	                        np.save(opt_path2, feature_pit, allow_pickle=False)  # nsf
	                        coarse_pit = self.coarse_f0(feature_pit)
	                        np.save(opt_path1, coarse_pit, allow_pickle=False)  # ori
	                except Exception as error:
	                    print(f"f0fail-{idx}-{inp_path}-{error}")
	                finally:
	                    # Count every file, including failures, so the bar
	                    # always reaches its total.
	                    pbar.update(1)


	if __name__ == "__main__":
	    # Script entry point: collect the input files under exp_dir, then fan
	    # the f0 extraction out over worker processes.
	    feature_input = FeatureInput()
	    input_root = f"{exp_dir}/1_16k_wavs"
	    output_root1 = f"{exp_dir}/2a_f0"  # coarse (quantized) f0
	    output_root2 = f"{exp_dir}/2b-f0nsf"  # raw f0

	    os.makedirs(output_root1, exist_ok=True)
	    os.makedirs(output_root2, exist_ok=True)

	    # One [input, coarse_out, f0_out] triple per file, in sorted name order.
	    # NOTE(review): the "spec" filter tests the full input path, so an
	    # exp_dir containing "spec" excludes every file - confirm whether only
	    # the file name was meant.
	    paths = [
	        [
	            f"{input_root}/{name}",
	            f"{output_root1}/{name}",
	            f"{output_root2}/{name}",
	        ]
	        for name in sorted(os.listdir(input_root))
	        if "spec" not in f"{input_root}/{name}"
	    ]

	    print("Using f0 method: " + f0_method)
	    if not paths:
	        print("There are no paths to process.")

	    # Never start more workers than there are files; extra workers would
	    # only start up to report that they have nothing to do.
	    worker_count = min(num_processes, len(paths))
	    processes = [
	        Process(
	            target=feature_input.process_paths,
	            # Worker i takes every worker_count-th file starting at index i.
	            args=(paths[i::worker_count], f0_method, hop_length, i),
	        )
	        for i in range(worker_count)
	    ]
	    for process in processes:
	        process.start()
	    for process in processes:
	        process.join()