Spaces:
Runtime error
Runtime error
Commit
·
c82bb46
1
Parent(s):
07e9a8a
Upload 18 files
Browse files- CppDataProcess/F0Preprocess.cpp +153 -0
- CppDataProcess/F0Preprocess.hpp +36 -0
- CppDataProcess/Slicer.hpp +82 -0
- CppDataProcess/Wav.cpp +151 -0
- CppDataProcess/Wav.hpp +99 -0
- CppDataProcess/readme.md +8 -0
- cluster/__init__.py +29 -0
- cluster/__pycache__/__init__.cpython-38.pyc +0 -0
- cluster/__pycache__/kmeans.cpython-38.pyc +0 -0
- cluster/km_train.py +80 -0
- cluster/kmeans.py +204 -0
- cluster/train_cluster.py +85 -0
- configs/config.json +94 -0
- configs/diffusion.yaml +48 -0
- configs_template/config_template.json +77 -0
- configs_template/config_tiny_template.json +77 -0
- configs_template/diffusion_template.yaml +51 -0
- dataset_raw/wav_structure.txt +20 -0
CppDataProcess/F0Preprocess.cpp
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "F0Preprocess.hpp"
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
void F0PreProcess::compute_f0(const double* audio, int64_t len)
|
| 5 |
+
{
|
| 6 |
+
DioOption Doption;
|
| 7 |
+
InitializeDioOption(&Doption);
|
| 8 |
+
Doption.f0_ceil = 800;
|
| 9 |
+
Doption.frame_period = 1000.0 * hop / fs;
|
| 10 |
+
f0Len = GetSamplesForDIO(fs, (int)len, Doption.frame_period);
|
| 11 |
+
const auto tp = new double[f0Len];
|
| 12 |
+
const auto tmpf0 = new double[f0Len];
|
| 13 |
+
rf0 = new double[f0Len];
|
| 14 |
+
Dio(audio, (int)len, fs, &Doption, tp, tmpf0);
|
| 15 |
+
StoneMask(audio, (int)len, fs, tp, tmpf0, (int)f0Len, rf0);
|
| 16 |
+
delete[] tmpf0;
|
| 17 |
+
delete[] tp;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
// Build the sequence {start/div, (start+step)/div, ...} for all values of
// `start + k*step` strictly below `end`.
std::vector<double> arange(double start, double end, double step = 1.0, double div = 1.0)
{
	std::vector<double> result;
	for (double v = start; v < end; v += step)
		result.push_back(v / div);
	return result;
}
|
| 30 |
+
|
| 31 |
+
// Resample the F0 track in-place from f0Len frames to `len` frames using
// WORLD's linear interpolation (interp1); NaNs (unvoiced markers set by the
// callers) become 0 after resampling.
void F0PreProcess::InterPf0(int64_t len)
{
	// Target positions 0, f0Len/len, 2*f0Len/len, ... — `len` of them.
	const auto xi = arange(0.0, (double)f0Len * (double)len, (double)f0Len, (double)len);
	// One extra slot of slack; presumably defensive against interp1 touching
	// one element past the end — TODO confirm against WORLD's interp1 contract.
	const auto tmp = new double[xi.size() + 1];
	interp1(arange(0, (double)f0Len).data(), rf0, static_cast<int>(f0Len), xi.data(), (int)xi.size(), tmp);
	// Interpolating through NAN-tagged (unvoiced) frames yields NaN; zero them.
	for (size_t i = 0; i < xi.size(); i++)
		if (isnan(tmp[i]))
			tmp[i] = 0.0;
	// The interpolated buffer replaces rf0; the previous track is released.
	delete[] rf0;
	rf0 = nullptr;
	rf0 = tmp;
	f0Len = (int64_t)xi.size();
}
|
| 44 |
+
|
| 45 |
+
long long* F0PreProcess::f0Log()
|
| 46 |
+
{
|
| 47 |
+
const auto tmp = new long long[f0Len];
|
| 48 |
+
const auto f0_mel = new double[f0Len];
|
| 49 |
+
for (long long i = 0; i < f0Len; i++)
|
| 50 |
+
{
|
| 51 |
+
f0_mel[i] = 1127 * log(1.0 + rf0[i] / 700.0);
|
| 52 |
+
if (f0_mel[i] > 0.0)
|
| 53 |
+
f0_mel[i] = (f0_mel[i] - f0_mel_min) * (f0_bin - 2.0) / (f0_mel_max - f0_mel_min) + 1.0;
|
| 54 |
+
if (f0_mel[i] < 1.0)
|
| 55 |
+
f0_mel[i] = 1;
|
| 56 |
+
if (f0_mel[i] > f0_bin - 1)
|
| 57 |
+
f0_mel[i] = f0_bin - 1;
|
| 58 |
+
tmp[i] = (long long)round(f0_mel[i]);
|
| 59 |
+
}
|
| 60 |
+
delete[] f0_mel;
|
| 61 |
+
delete[] rf0;
|
| 62 |
+
rf0 = nullptr;
|
| 63 |
+
return tmp;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
// Full pipeline for the coarse-F0 model input: extract F0 from `audio`,
// transpose it by `tran` semitones, resample to `hubLen` frames, then
// quantize with f0Log.  Returns one coarse bin per frame.
std::vector<long long> F0PreProcess::GetF0AndOtherInput(const double* audio, int64_t audioLen, int64_t hubLen, int64_t tran)
{
	compute_f0(audio, audioLen);
	for (int64_t i = 0; i < f0Len; ++i)
	{
		// Pitch-shift by `tran` semitones: multiply by 2^(tran/12).
		rf0[i] = rf0[i] * pow(2.0, static_cast<double>(tran) / 12.0);
		// Near-zero F0 means unvoiced; tag with NAN so InterPf0 zeroes it after
		// interpolation instead of smearing it into neighbouring voiced frames.
		if (rf0[i] < 0.001)
			rf0[i] = NAN;
	}
	InterPf0(hubLen);
	const auto O0f = f0Log();  // quantized coarse F0; f0Log also frees rf0
	std::vector<long long> Of0(O0f, O0f + f0Len);
	delete[] O0f;
	return Of0;
}
|
| 81 |
+
|
| 82 |
+
// Map each spectrogram frame to the 1-based index of the hubert unit it is
// aligned to, spreading hubertLen units evenly over specLen frames.
// The returned vector has specLen + 1 entries.
std::vector<long long> getAligments(size_t specLen, size_t hubertLen)
{
	std::vector<long long> mel2ph(specLen + 1, 0);

	const double framesPerUnit = static_cast<double>(specLen) / static_cast<double>(hubertLen);
	size_t cursor = 0;
	for (size_t unit = 0; unit < hubertLen; ++unit)
	{
		// Last frame (inclusive) covered by this unit.
		const auto last = static_cast<size_t>(std::round(static_cast<double>(unit) * framesPerUnit + framesPerUnit));
		for (size_t frame = cursor; frame <= last; ++frame)
			mel2ph[frame] = static_cast<long long>(unit) + 1;
		cursor = last + 1;
	}

	return mel2ph;
}
|
| 98 |
+
|
| 99 |
+
std::vector<float> F0PreProcess::GetF0AndOtherInputF0(const double* audio, int64_t audioLen, int64_t tran)
|
| 100 |
+
{
|
| 101 |
+
compute_f0(audio, audioLen);
|
| 102 |
+
for (int64_t i = 0; i < f0Len; ++i)
|
| 103 |
+
{
|
| 104 |
+
rf0[i] = log2(rf0[i] * pow(2.0, static_cast<double>(tran) / 12.0));
|
| 105 |
+
if (rf0[i] < 0.001)
|
| 106 |
+
rf0[i] = NAN;
|
| 107 |
+
}
|
| 108 |
+
const int64_t specLen = audioLen / hop;
|
| 109 |
+
InterPf0(specLen);
|
| 110 |
+
|
| 111 |
+
std::vector<float> Of0(specLen, 0.0);
|
| 112 |
+
|
| 113 |
+
double last_value = 0.0;
|
| 114 |
+
for (int64_t i = 0; i < specLen; ++i)
|
| 115 |
+
{
|
| 116 |
+
if (rf0[i] <= 0.0)
|
| 117 |
+
{
|
| 118 |
+
int64_t j = i + 1;
|
| 119 |
+
for (; j < specLen; ++j)
|
| 120 |
+
{
|
| 121 |
+
if (rf0[j] > 0.0)
|
| 122 |
+
break;
|
| 123 |
+
}
|
| 124 |
+
if (j < specLen - 1)
|
| 125 |
+
{
|
| 126 |
+
if (last_value > 0.0)
|
| 127 |
+
{
|
| 128 |
+
const auto step = (rf0[j] - rf0[i - 1]) / double(j - i);
|
| 129 |
+
for (int64_t k = i; k < j; ++k)
|
| 130 |
+
Of0[k] = float(rf0[i - 1] + step * double(k - i + 1));
|
| 131 |
+
}
|
| 132 |
+
else
|
| 133 |
+
for (int64_t k = i; k < j; ++k)
|
| 134 |
+
Of0[k] = float(rf0[j]);
|
| 135 |
+
i = j;
|
| 136 |
+
}
|
| 137 |
+
else
|
| 138 |
+
{
|
| 139 |
+
for (int64_t k = i; k < specLen; ++k)
|
| 140 |
+
Of0[k] = float(last_value);
|
| 141 |
+
i = specLen;
|
| 142 |
+
}
|
| 143 |
+
}
|
| 144 |
+
else
|
| 145 |
+
{
|
| 146 |
+
Of0[i] = float(rf0[i - 1]);
|
| 147 |
+
last_value = rf0[i];
|
| 148 |
+
}
|
| 149 |
+
}
|
| 150 |
+
delete[] rf0;
|
| 151 |
+
rf0 = nullptr;
|
| 152 |
+
return Of0;
|
| 153 |
+
}
|
CppDataProcess/F0Preprocess.hpp
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
#include "world/dio.h"
#include "world/stonemask.h"
#include "world/matlabfunctions.h"
#include <cmath>
#include <cstdint>
#include <string>
#include <vector>

// Cpp F0 Preprocess
//
// Wraps the WORLD vocoder (DIO + StoneMask) to extract an F0 contour and
// convert it into the coarse / continuous log2-F0 features used by the model.

class F0PreProcess
{
public:
	int fs;     // sample rate of the incoming audio
	short hop;  // hop size in samples; one F0 frame per hop
	const int f0_bin = 256;
	const double f0_max = 1100.0;
	const double f0_min = 50.0;
	const double f0_mel_min = 1127.0 * log(1.0 + f0_min / 700.0);
	const double f0_mel_max = 1127.0 * log(1.0 + f0_max / 700.0);
	F0PreProcess(int sr = 16000, short h = 160) :fs(sr), hop(h) {}
	~F0PreProcess()
	{
		delete[] rf0;
		rf0 = nullptr;
	}
	// BUGFIX: rf0 is an owning raw pointer; the implicitly-generated copy
	// operations would double-free it.  Copying is therefore disabled.
	F0PreProcess(const F0PreProcess&) = delete;
	F0PreProcess& operator=(const F0PreProcess&) = delete;
	// Run DIO + StoneMask over `audio`; fills rf0 / f0Len.
	void compute_f0(const double* audio, int64_t len);
	// Resample rf0 to `len` frames by linear interpolation (NaN -> 0).
	void InterPf0(int64_t len);
	// Quantize rf0 into f0_bin coarse levels; returns a caller-owned array.
	long long* f0Log();
	int64_t getLen()const { return f0Len; }
	std::vector<long long> GetF0AndOtherInput(const double* audio, int64_t audioLen, int64_t hubLen, int64_t tran);
	std::vector<float> GetF0AndOtherInputF0(const double* audio, int64_t audioLen, int64_t tran);
private:
	double* rf0 = nullptr;  // owning; current F0 track, f0Len entries
	int64_t f0Len = 0;
};

// Map spectrogram frames to 1-based hubert-unit indices (see F0Preprocess.cpp).
std::vector<long long> getAligments(size_t specLen, size_t hubertLen);
|
CppDataProcess/Slicer.hpp
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <string>
|
| 2 |
+
#include <vector>
|
| 3 |
+
#include "Wav.hpp"
|
| 4 |
+
|
| 5 |
+
// Result of slicing a wav: byte offsets of each segment boundary within the
// data chunk, and one flag per segment telling whether it is above the
// volume threshold (i.e. worth keeping).
struct SliceResult
{
	std::vector<unsigned long long> SliceOffset;
	std::vector<bool> SliceTag;
	// BUGFIX: the constructor was misnamed `cutResult`, which does not compile.
	SliceResult(std::vector<unsigned long long>&& O, std::vector<bool>&& T) :SliceOffset(O), SliceTag(T) {}
};
|
| 11 |
+
|
| 12 |
+
// Mean absolute amplitude of the samples in [start, end] (inclusive),
// computed with the incremental-mean formula to avoid a large running sum.
// `inline` because this lives in a header included from multiple TUs.
inline double getAvg(const short* start, const short* end)
{
	const auto size = end - start + 1;
	// BUGFIX: the first sample was averaged signed while all the others were
	// rectified; also use std::fabs explicitly so a C-library `abs(int)` can
	// never truncate the double.
	auto avg = std::fabs((double)(*start));
	for (auto i = 1; i < size; i++)
	{
		avg = avg + (std::fabs((double)start[i]) - avg) / (double)(i + 1ull);
	}
	return avg;
}
|
| 22 |
+
|
| 23 |
+
// Slice a wav into segments at quiet points.
//   threshold   : mean absolute amplitude below which a frame counts as quiet
//   minLen      : minimum segment length in seconds (checked via bytesPerSec)
//   frame_len   : analysis window length
//   frame_shift : hop between analysis windows
// NOTE(review): ptr advances in raw bytes but windows are read as short*, so
// frame_len/frame_shift mix byte and sample units — confirm intended units.
inline SliceResult SliceWav(Wav& input, double threshold, unsigned long minLen, unsigned short frame_len, unsigned short frame_shift)
{
	const auto header = input.getHeader();
	// Too short to slice: return the whole file as one kept segment.
	if (header.Subchunk2Size < minLen * header.bytesPerSec)
		return { {0,header.Subchunk2Size},{true} };
	auto ptr = input.getData();
	std::vector<unsigned long long> output;
	std::vector<bool> tag;
	// Number of analysis frames, leaving one window of margin at each end.
	auto n = (header.Subchunk2Size / frame_shift) - 2 * (frame_len / frame_shift);
	unsigned long nn = 0;  // bytes consumed since the last cut
	bool cutTag = true;    // whether the previous frame was loud
	output.emplace_back(0);
	while (n--)
	{
		//if (nn > minLen * header.bytesPerSec)
		if (cutTag)
		{
			// Previous frame was loud: look for a loud -> quiet transition.
			const auto vol = abs(getAvg((short*)ptr, (short*)ptr + frame_len));
			if (vol < threshold)
			{
				cutTag = false;
				// Cut only if the running segment is already long enough.
				if (nn > minLen * header.bytesPerSec)
				{
					nn = 0;
					// Place the cut in the middle of the quiet window.
					output.emplace_back((ptr - input.getData()) + (frame_len / 2));
				}
			}
			else
			{
				cutTag = true;
			}
		}
		else
		{
			// Previous frame was quiet: look for a quiet -> loud transition.
			const auto vol = abs(getAvg((short*)ptr, (short*)ptr + frame_len));
			if (vol < threshold)
			{
				cutTag = false;
			}
			else
			{
				cutTag = true;
				if (nn > minLen * header.bytesPerSec)
				{
					nn = 0;
					output.emplace_back((ptr - input.getData()) + (frame_len / 2));
				}
			}
		}
		nn += frame_shift;
		ptr += frame_shift;
	}
	// Terminate with the end of the data chunk so offsets bracket every segment.
	output.push_back(header.Subchunk2Size);
	// Tag each segment by whether its mean absolute level clears the threshold;
	// segments tagged false are effectively silence and can be dropped.
	for (size_t i = 1; i < output.size(); i++)
	{
		tag.push_back(abs(getAvg((short*)(input.getData() + output[i - 1]), (short*)(input.getData() + output[i]))) > threshold);
	}
	return { std::move(output),std::move(tag) };
}
|
| 82 |
+
|
CppDataProcess/Wav.cpp
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "Wav.hpp"
|
| 2 |
+
|
| 3 |
+
// Load a wav from a wide-character path by scanning the first HEAD_LENGTH
// bytes for the RIFF / fmt / data chunk tags and then reading the data chunk.
// NOTE(review): _wfreopen_s reopens the *stderr* stream slot onto the file —
// an MSVC-specific trick that clobbers stderr for the process; confirm this
// is intentional.  fread return values are unchecked.
Wav::Wav(const wchar_t* Path) :header(WAV_HEADER()) {
	char buf[1024];
	FILE* stream;
	_wfreopen_s(&stream, Path, L"rb", stderr);
	if (stream == nullptr) {
		throw (std::exception("File not exists"));
	}
	// Read the header region; chunk tags are located by linear scan rather
	// than fixed offsets, so files with extra chunks before "data" still load.
	fread(buf, 1, HEAD_LENGTH, stream);
	int pos = 0;
	while (pos < HEAD_LENGTH) {
		if ((buf[pos] == 'R') && (buf[pos + 1] == 'I') && (buf[pos + 2] == 'F') && (buf[pos + 3] == 'F')) {
			pos += 4;
			break;
		}
		++pos;
	}
	if (pos >= HEAD_LENGTH)
		throw (std::exception("Don't order fried rice (annoyed)"));
	header.ChunkSize = *(int*)&buf[pos];
	// Skip the ChunkSize field (4) plus the "WAVE" form tag (4).
	pos += 8;
	while (pos < HEAD_LENGTH) {
		if ((buf[pos] == 'f') && (buf[pos + 1] == 'm') && (buf[pos + 2] == 't')) {
			pos += 4;
			break;
		}
		++pos;
	}
	if (pos >= HEAD_LENGTH)
		throw (std::exception("Don't order fried rice (annoyed)"));
	// fmt chunk fields, read in declaration order via unaligned casts.
	header.Subchunk1Size = *(int*)&buf[pos];
	pos += 4;
	header.AudioFormat = *(short*)&buf[pos];
	pos += 2;
	header.NumOfChan = *(short*)&buf[pos];
	pos += 2;
	header.SamplesPerSec = *(int*)&buf[pos];
	pos += 4;
	header.bytesPerSec = *(int*)&buf[pos];
	pos += 4;
	header.blockAlign = *(short*)&buf[pos];
	pos += 2;
	header.bitsPerSample = *(short*)&buf[pos];
	pos += 2;
	while (pos < HEAD_LENGTH) {
		if ((buf[pos] == 'd') && (buf[pos + 1] == 'a') && (buf[pos + 2] == 't') && (buf[pos + 3] == 'a')) {
			pos += 4;
			break;
		}
		++pos;
	}
	if (pos >= HEAD_LENGTH)
		throw (std::exception("Don't order fried rice (annoyed)"));
	header.Subchunk2Size = *(int*)&buf[pos];
	pos += 4;
	// StartPos is the file offset of the first PCM byte (after the size field).
	StartPos = pos;
	Data = new char[header.Subchunk2Size + 1];
	fseek(stream, StartPos, SEEK_SET);
	fread(Data, 1, header.Subchunk2Size, stream);
	if (stream != nullptr) {
		fclose(stream);
	}
	// 16-bit sample view over the raw byte buffer.
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2;
}
|
| 67 |
+
|
| 68 |
+
// Deep copy: duplicates the header and the raw PCM byte buffer.
Wav::Wav(const Wav& input) :header(WAV_HEADER()) {
	Data = new char[(input.header.Subchunk2Size + 1)];
	if (Data == nullptr) { throw std::exception("OOM"); }
	// WAV_HEADER is a plain aggregate; a single assignment copies every field,
	// including the four chunk-tag arrays.
	header = input.header;
	StartPos = input.StartPos;
	memcpy(Data, input.Data, input.header.Subchunk2Size);
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2;
}
|
| 89 |
+
|
| 90 |
+
// Move construction: steal the data buffer, copy the header metadata.
Wav::Wav(Wav&& input) noexcept
{
	Data = input.Data;
	input.Data = nullptr;  // source no longer owns the buffer
	// Single aggregate assignment replaces the field-by-field copy.
	header = input.header;
	StartPos = input.StartPos;
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2;
}
|
| 111 |
+
|
| 112 |
+
// Move assignment: release our buffer, then adopt the source's.
Wav& Wav::operator=(Wav&& input) noexcept
{
	// BUGFIX: guard against self-move.  The old code called destory() first
	// and then adopted the just-freed buffer when `input` aliased *this.
	if (this != &input)
	{
		destory();
		Data = input.Data;
		input.Data = nullptr;
		header = input.header;  // aggregate copy of all header fields
		StartPos = input.StartPos;
		SData = reinterpret_cast<int16_t*>(Data);
		dataSize = header.Subchunk2Size / 2;
	}
	return *this;
}
|
| 135 |
+
|
| 136 |
+
// Append `input`'s PCM data to this wav.  Only PCM (AudioFormat == 1) with a
// matching sample rate and channel count is concatenated; otherwise *this is
// returned unchanged.
Wav& Wav::cat(const Wav& input)
{
	if (header.AudioFormat != 1) return *this;
	// BUGFIX: the old check compared SamplesPerSec against bitsPerSample, so
	// the sample-rate guard never did what it claimed.
	if (header.SamplesPerSec != input.header.SamplesPerSec || header.NumOfChan != input.header.NumOfChan) return *this;
	char* buffer = new char[(int64_t)header.Subchunk2Size + (int64_t)input.header.Subchunk2Size + 1];
	if (buffer == nullptr)return *this;
	memcpy(buffer, Data, header.Subchunk2Size);
	memcpy(buffer + header.Subchunk2Size, input.Data, input.header.Subchunk2Size);
	header.ChunkSize += input.header.Subchunk2Size;
	header.Subchunk2Size += input.header.Subchunk2Size;
	delete[] Data;
	Data = buffer;
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2;
	return *this;
}
|
CppDataProcess/Wav.hpp
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
#include <cstddef>
#include <cstdint>
#include <cstring>

// Minimal RIFF/WAV container: owns the raw data chunk as a byte buffer and
// exposes it as 16-bit samples.  16-bit PCM is assumed throughout.
class Wav {
public:

	struct WAV_HEADER {
		char RIFF[4] = { 'R','I','F','F' };        // RIFF chunk id
		unsigned long ChunkSize;                   // file size - 8
		char WAVE[4] = { 'W','A','V','E' };        // WAVE form type
		char fmt[4] = { 'f','m','t',' ' };         // fmt sub-chunk id
		unsigned long Subchunk1Size;               // fmt chunk size
		unsigned short AudioFormat;                // 1 = PCM
		unsigned short NumOfChan;                  // channel count
		unsigned long SamplesPerSec;               // sample rate
		unsigned long bytesPerSec;                 // bytes per second
		unsigned short blockAlign;                 // bytes per sample frame
		unsigned short bitsPerSample;              // bit depth
		char Subchunk2ID[4] = { 'd','a','t','a' }; // data sub-chunk id
		unsigned long Subchunk2Size;               // data chunk size in bytes
		WAV_HEADER(unsigned long cs = 36, unsigned long sc1s = 16, unsigned short af = 1, unsigned short nc = 1, unsigned long sr = 22050, unsigned long bps = 44100, unsigned short ba = 2, unsigned short bips = 16, unsigned long sc2s = 0) :ChunkSize(cs), Subchunk1Size(sc1s), AudioFormat(af), NumOfChan(nc), SamplesPerSec(sr), bytesPerSec(bps), blockAlign(ba), bitsPerSample(bips), Subchunk2Size(sc2s) {}
	};
	using iterator = int16_t*;
	// Empty wav with a caller-specified header and no data.
	Wav(unsigned long cs = 36, unsigned long sc1s = 16, unsigned short af = 1, unsigned short nc = 1, unsigned long sr = 22050, unsigned long bps = 44100, unsigned short ba = 2, unsigned short bips = 16, unsigned long sc2s = 0) :header({
		cs,
		sc1s,
		af,
		nc,
		sr,
		bps,
		ba,
		bips,
		sc2s
		}), Data(nullptr), StartPos(44) {
		dataSize = 0;
		SData = nullptr;
	}
	// Mono 16-bit PCM wav built from `length` bytes of raw sample data.
	Wav(unsigned long sr, unsigned long length, const void* data) :header({
		36,
		16,
		1,
		1,
		sr,
		sr * 2,
		2,
		16,
		length
		}), Data(new char[length + 1]), StartPos(44)
	{
		header.ChunkSize = 36 + length;
		memcpy(Data, data, length);
		SData = reinterpret_cast<int16_t*>(Data);
		dataSize = length / 2;
	}
	Wav(const wchar_t* Path);   // load from file (defined in Wav.cpp)
	Wav(const Wav& input);      // deep copy (defined in Wav.cpp)
	Wav(Wav&& input) noexcept;  // steals the buffer (defined in Wav.cpp)
	Wav& operator=(const Wav& input) = delete;
	Wav& operator=(Wav&& input) noexcept;
	~Wav() { destory(); }
	// Append another wav's data chunk (PCM, matching rate/channels).
	Wav& cat(const Wav& input);
	bool isEmpty() const { return this->header.Subchunk2Size == 0; }
	const char* getData() const { return Data; }
	char* getData() { return Data; }
	WAV_HEADER getHeader() const { return header; }
	WAV_HEADER& Header() { return header; }
	void destory() const { delete[] Data; }
	// Replace the data chunk with `length` bytes of `indata` at rate `sr`.
	void changeData(const void* indata,long length,int sr)
	{
		delete[] Data;
		Data = new char[length];
		memcpy(Data, indata, length);
		header.ChunkSize = 36 + length;
		header.Subchunk2Size = length;
		header.SamplesPerSec = sr;
		header.bytesPerSec = 2 * sr;
		// BUGFIX: SData/dataSize previously still referred to the freed
		// buffer, so operator[]/begin()/getDataLen() dangled after changeData.
		SData = reinterpret_cast<int16_t*>(Data);
		dataSize = static_cast<size_t>(length) / 2;
	}
	// Sample access; out-of-range indices clamp to the last sample.
	int16_t& operator[](const size_t index) const
	{
		if (index < dataSize)
			return *(SData + index);
		return *(SData + dataSize - 1);
	}
	iterator begin() const
	{
		return reinterpret_cast<int16_t*>(Data);
	}
	iterator end() const
	{
		return reinterpret_cast<int16_t*>(Data + header.Subchunk2Size);
	}
	// Number of 16-bit samples in the data chunk.
	int64_t getDataLen()const
	{
		return static_cast<int64_t>(dataSize);
	}
private:
	WAV_HEADER header;
	char* Data;        // owning raw byte buffer (Subchunk2Size bytes)
	int16_t* SData;    // 16-bit view over Data
	size_t dataSize;   // sample count (Subchunk2Size / 2)
	int StartPos;      // file offset of the first data byte when loaded
};
|
CppDataProcess/readme.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## F0Preprocess
|
| 2 |
+
请前往 https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder 下载PyWorld的源代码并编译出静态库并链接到你的项目之中,然后调用此头文件
|
| 3 |
+
|
| 4 |
+
## Slicer
|
| 5 |
+
一个简单的切片机
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
~~上面的东西是直接从MoeSS的代码里面抽出来的,可以作为预置预处理的替代品()~~
|
cluster/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from sklearn.cluster import KMeans
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def get_cluster_model(ckpt_path):
    """Load per-speaker KMeans models from a checkpoint saved by km_train.

    The checkpoint maps speaker name -> dict of the sklearn KMeans attributes
    needed for prediction; each model is rebuilt by injecting those attributes
    directly into a fresh KMeans instance, bypassing fit().
    """
    checkpoint = torch.load(ckpt_path)
    kmeans_dict = {}
    for spk, ckpt in checkpoint.items():
        # NOTE(review): n_features_in_ is passed as the n_clusters argument
        # here; harmless for predict() since the real state is injected below,
        # but confirm it is intentional.
        km = KMeans(ckpt["n_features_in_"])
        km.__dict__["n_features_in_"] = ckpt["n_features_in_"]
        km.__dict__["_n_threads"] = ckpt["_n_threads"]
        km.__dict__["cluster_centers_"] = ckpt["cluster_centers_"]
        kmeans_dict[spk] = km
    return kmeans_dict
|
| 15 |
+
|
| 16 |
+
def get_cluster_result(model, x, speaker):
    """Predict the cluster index of each frame.

    model: dict mapping speaker name -> fitted KMeans
    x: np.array [t, 256]
    return cluster class result
    """
    km = model[speaker]
    return km.predict(x)
|
| 22 |
+
|
| 23 |
+
def get_cluster_center_result(model, x, speaker):
    """Map each frame of x (np.array [t, 256]) to its nearest cluster center."""
    km = model[speaker]
    labels = km.predict(x)
    return km.cluster_centers_[labels]
|
| 27 |
+
|
| 28 |
+
def get_center(model, x, speaker):
    """Return the cluster center row(s) at index/indices ``x`` for ``speaker``."""
    centers = model[speaker].cluster_centers_
    return centers[x]
|
cluster/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (1.09 kB). View file
|
|
|
cluster/__pycache__/kmeans.cpython-38.pyc
ADDED
|
Binary file (6.95 kB). View file
|
|
|
cluster/km_train.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time,pdb
|
| 2 |
+
import tqdm
|
| 3 |
+
from time import time as ttime
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import logging
|
| 7 |
+
import argparse
|
| 8 |
+
from cluster.kmeans import KMeansGPU
|
| 9 |
+
import torch
|
| 10 |
+
import numpy as np
|
| 11 |
+
from sklearn.cluster import KMeans,MiniBatchKMeans
|
| 12 |
+
|
| 13 |
+
logging.basicConfig(level=logging.INFO)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
from time import time as ttime
|
| 16 |
+
import pynvml,torch
|
| 17 |
+
|
| 18 |
+
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):  # gpu_minibatch真拉,虽然库支持但是也不考虑
    """Fit a KMeans model over all *.soft.pt feature files in ``in_dir``.

    in_dir: Path to one speaker's feature directory.
    n_clusters: number of clusters to fit.
    use_minibatch: use sklearn MiniBatchKMeans (CPU path only).
    use_gpu: fit with the torch-based KMeansGPU implementation instead.
    Returns the dict of attributes get_cluster_model needs to rebuild the
    model for predict(): n_features_in_, _n_threads, cluster_centers_.
    """
    logger.info(f"Loading features from {in_dir}")
    features = []
    nums = 0
    for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
        features.append(torch.load(path, map_location="cpu").squeeze(0).numpy().T)
        nums += 1  # BUGFIX: the file counter was never incremented
    features = np.concatenate(features, axis=0)
    print(nums, features.nbytes / 1024**2, "MB , shape:", features.shape, features.dtype)
    features = features.astype(np.float32)
    logger.info(f"Clustering features of shape: {features.shape}")
    t = time.time()
    if use_gpu == False:
        if use_minibatch:
            kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
        else:
            kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
    else:
        kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0, max_iter=500, tol=1e-2)
        features = torch.from_numpy(features)
        labels = kmeans.fit_predict(features)

    print(time.time() - t, "s")

    x = {
        # BUGFIX: the GPU path stored the sample count (shape[0]) instead of
        # the feature dimension (shape[1]); sklearn's predict() validates its
        # input against n_features_in_, so the wrong value breaks inference in
        # get_cluster_model/get_cluster_result.
        "n_features_in_": kmeans.n_features_in_ if use_gpu == False else features.shape[1],
        "_n_threads": kmeans._n_threads if use_gpu == False else 4,
        "cluster_centers_": kmeans.cluster_centers_ if use_gpu == False else kmeans.centroids.cpu().numpy(),
    }
    print("end")

    return x
|
| 52 |
+
|
| 53 |
+
if __name__ == "__main__":
    # CLI entry point: fit one KMeans model per speaker sub-directory of the
    # dataset and bundle all of them into a single checkpoint file.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=Path, default="./dataset/44k",
                        help='path of training data directory')
    parser.add_argument('--output', type=Path, default="logs/44k",
                        help='path of model output directory')

    args = parser.parse_args()

    checkpoint_dir = args.output
    dataset = args.dataset
    n_clusters = 1000

    ckpt = {}
    for spk in os.listdir(dataset):
        # Each sub-directory is treated as a speaker with its own features.
        if os.path.isdir(dataset/spk):
            print(f"train kmeans for {spk}...")
            in_dir = dataset/spk
            x = train_cluster(in_dir, n_clusters,use_minibatch=False,verbose=False,use_gpu=True)
            ckpt[spk] = x

    # Saved as {speaker: {n_features_in_, _n_threads, cluster_centers_}};
    # consumed by cluster.get_cluster_model.
    checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
    checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
    torch.save(
        ckpt,
        checkpoint_path,
    )
|
| 80 |
+
|
cluster/kmeans.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from time import time
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pynvml
|
| 5 |
+
import torch
|
| 6 |
+
from torch.nn.functional import normalize
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# device=torch.device("cuda:0")
|
| 10 |
+
def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
|
| 11 |
+
""" Picks k points in the data based on the kmeans++ method.
|
| 12 |
+
|
| 13 |
+
Parameters
|
| 14 |
+
----------
|
| 15 |
+
data : torch.Tensor
|
| 16 |
+
Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
|
| 17 |
+
data, rank 2 multidimensional data, in which case one
|
| 18 |
+
row is one observation.
|
| 19 |
+
k : int
|
| 20 |
+
Number of samples to generate.
|
| 21 |
+
sample_size : int
|
| 22 |
+
sample data to avoid memory overflow during calculation
|
| 23 |
+
|
| 24 |
+
Returns
|
| 25 |
+
-------
|
| 26 |
+
init : ndarray
|
| 27 |
+
A 'k' by 'N' containing the initial centroids.
|
| 28 |
+
|
| 29 |
+
References
|
| 30 |
+
----------
|
| 31 |
+
.. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
|
| 32 |
+
careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
|
| 33 |
+
on Discrete Algorithms, 2007.
|
| 34 |
+
.. [2] scipy/cluster/vq.py: _kpp
|
| 35 |
+
"""
|
| 36 |
+
batch_size=data.shape[0]
|
| 37 |
+
if batch_size>sample_size:
|
| 38 |
+
data = data[torch.randint(0, batch_size,[sample_size], device=data.device)]
|
| 39 |
+
dims = data.shape[1] if len(data.shape) > 1 else 1
|
| 40 |
+
init = torch.zeros((k, dims)).to(data.device)
|
| 41 |
+
r = torch.distributions.uniform.Uniform(0, 1)
|
| 42 |
+
for i in range(k):
|
| 43 |
+
if i == 0:
|
| 44 |
+
init[i, :] = data[torch.randint(data.shape[0], [1])]
|
| 45 |
+
else:
|
| 46 |
+
D2 = torch.cdist(init[:i, :][None, :], data[None, :], p=2)[0].amin(dim=0)
|
| 47 |
+
probs = D2 / torch.sum(D2)
|
| 48 |
+
cumprobs = torch.cumsum(probs, dim=0)
|
| 49 |
+
init[i, :] = data[torch.searchsorted(cumprobs, r.sample([1]).to(data.device))]
|
| 50 |
+
return init
|
| 51 |
+
class KMeansGPU:
|
| 52 |
+
'''
|
| 53 |
+
Kmeans clustering algorithm implemented with PyTorch
|
| 54 |
+
|
| 55 |
+
Parameters:
|
| 56 |
+
n_clusters: int,
|
| 57 |
+
Number of clusters
|
| 58 |
+
|
| 59 |
+
max_iter: int, default: 100
|
| 60 |
+
Maximum number of iterations
|
| 61 |
+
|
| 62 |
+
tol: float, default: 0.0001
|
| 63 |
+
Tolerance
|
| 64 |
+
|
| 65 |
+
verbose: int, default: 0
|
| 66 |
+
Verbosity
|
| 67 |
+
|
| 68 |
+
mode: {'euclidean', 'cosine'}, default: 'euclidean'
|
| 69 |
+
Type of distance measure
|
| 70 |
+
|
| 71 |
+
init_method: {'random', 'point', '++'}
|
| 72 |
+
Type of initialization
|
| 73 |
+
|
| 74 |
+
minibatch: {None, int}, default: None
|
| 75 |
+
Batch size of MinibatchKmeans algorithm
|
| 76 |
+
if None perform full KMeans algorithm
|
| 77 |
+
|
| 78 |
+
Attributes:
|
| 79 |
+
centroids: torch.Tensor, shape: [n_clusters, n_features]
|
| 80 |
+
cluster centroids
|
| 81 |
+
'''
|
| 82 |
+
def __init__(self, n_clusters, max_iter=200, tol=1e-4, verbose=0, mode="euclidean",device=torch.device("cuda:0")):
|
| 83 |
+
self.n_clusters = n_clusters
|
| 84 |
+
self.max_iter = max_iter
|
| 85 |
+
self.tol = tol
|
| 86 |
+
self.verbose = verbose
|
| 87 |
+
self.mode = mode
|
| 88 |
+
self.device=device
|
| 89 |
+
pynvml.nvmlInit()
|
| 90 |
+
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(device.index)
|
| 91 |
+
info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
|
| 92 |
+
self.minibatch=int(33e6/self.n_clusters*info.free/ 1024 / 1024 / 1024)
|
| 93 |
+
print("free_mem/GB:",info.free/ 1024 / 1024 / 1024,"minibatch:",self.minibatch)
|
| 94 |
+
|
| 95 |
+
@staticmethod
|
| 96 |
+
def cos_sim(a, b):
|
| 97 |
+
"""
|
| 98 |
+
Compute cosine similarity of 2 sets of vectors
|
| 99 |
+
|
| 100 |
+
Parameters:
|
| 101 |
+
a: torch.Tensor, shape: [m, n_features]
|
| 102 |
+
|
| 103 |
+
b: torch.Tensor, shape: [n, n_features]
|
| 104 |
+
"""
|
| 105 |
+
return normalize(a, dim=-1) @ normalize(b, dim=-1).transpose(-2, -1)
|
| 106 |
+
|
| 107 |
+
@staticmethod
|
| 108 |
+
def euc_sim(a, b):
|
| 109 |
+
"""
|
| 110 |
+
Compute euclidean similarity of 2 sets of vectors
|
| 111 |
+
Parameters:
|
| 112 |
+
a: torch.Tensor, shape: [m, n_features]
|
| 113 |
+
b: torch.Tensor, shape: [n, n_features]
|
| 114 |
+
"""
|
| 115 |
+
return 2 * a @ b.transpose(-2, -1) -(a**2).sum(dim=1)[..., :, None] - (b**2).sum(dim=1)[..., None, :]
|
| 116 |
+
|
| 117 |
+
def max_sim(self, a, b):
|
| 118 |
+
"""
|
| 119 |
+
Compute maximum similarity (or minimum distance) of each vector
|
| 120 |
+
in a with all of the vectors in b
|
| 121 |
+
Parameters:
|
| 122 |
+
a: torch.Tensor, shape: [m, n_features]
|
| 123 |
+
b: torch.Tensor, shape: [n, n_features]
|
| 124 |
+
"""
|
| 125 |
+
if self.mode == 'cosine':
|
| 126 |
+
sim_func = self.cos_sim
|
| 127 |
+
elif self.mode == 'euclidean':
|
| 128 |
+
sim_func = self.euc_sim
|
| 129 |
+
sim = sim_func(a, b)
|
| 130 |
+
max_sim_v, max_sim_i = sim.max(dim=-1)
|
| 131 |
+
return max_sim_v, max_sim_i
|
| 132 |
+
|
| 133 |
+
def fit_predict(self, X):
|
| 134 |
+
"""
|
| 135 |
+
Combination of fit() and predict() methods.
|
| 136 |
+
This is faster than calling fit() and predict() seperately.
|
| 137 |
+
Parameters:
|
| 138 |
+
X: torch.Tensor, shape: [n_samples, n_features]
|
| 139 |
+
centroids: {torch.Tensor, None}, default: None
|
| 140 |
+
if given, centroids will be initialized with given tensor
|
| 141 |
+
if None, centroids will be randomly chosen from X
|
| 142 |
+
Return:
|
| 143 |
+
labels: torch.Tensor, shape: [n_samples]
|
| 144 |
+
|
| 145 |
+
mini_=33kk/k*remain
|
| 146 |
+
mini=min(mini_,fea_shape)
|
| 147 |
+
offset=log2(k/1000)*1.5
|
| 148 |
+
kpp_all=min(mini_*10/offset,fea_shape)
|
| 149 |
+
kpp_sample=min(mini_/12/offset,fea_shape)
|
| 150 |
+
"""
|
| 151 |
+
assert isinstance(X, torch.Tensor), "input must be torch.Tensor"
|
| 152 |
+
assert X.dtype in [torch.half, torch.float, torch.double], "input must be floating point"
|
| 153 |
+
assert X.ndim == 2, "input must be a 2d tensor with shape: [n_samples, n_features] "
|
| 154 |
+
# print("verbose:%s"%self.verbose)
|
| 155 |
+
|
| 156 |
+
offset = np.power(1.5,np.log(self.n_clusters / 1000))/np.log(2)
|
| 157 |
+
with torch.no_grad():
|
| 158 |
+
batch_size= X.shape[0]
|
| 159 |
+
# print(self.minibatch, int(self.minibatch * 10 / offset), batch_size)
|
| 160 |
+
start_time = time()
|
| 161 |
+
if (self.minibatch*10//offset< batch_size):
|
| 162 |
+
x = X[torch.randint(0, batch_size,[int(self.minibatch*10/offset)])].to(self.device)
|
| 163 |
+
else:
|
| 164 |
+
x = X.to(self.device)
|
| 165 |
+
# print(x.device)
|
| 166 |
+
self.centroids = _kpp(x, self.n_clusters, min(int(self.minibatch/12/offset),batch_size))
|
| 167 |
+
del x
|
| 168 |
+
torch.cuda.empty_cache()
|
| 169 |
+
# self.centroids = self.centroids.to(self.device)
|
| 170 |
+
num_points_in_clusters = torch.ones(self.n_clusters, device=self.device, dtype=X.dtype)#全1
|
| 171 |
+
closest = None#[3098036]#int64
|
| 172 |
+
if(self.minibatch>=batch_size//2 and self.minibatch<batch_size):
|
| 173 |
+
X = X[torch.randint(0, batch_size,[self.minibatch])].to(self.device)
|
| 174 |
+
elif(self.minibatch>=batch_size):
|
| 175 |
+
X=X.to(self.device)
|
| 176 |
+
for i in range(self.max_iter):
|
| 177 |
+
iter_time = time()
|
| 178 |
+
if self.minibatch<batch_size//2:#可用minibatch数太小,每次都得从内存倒腾到显存
|
| 179 |
+
x = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
|
| 180 |
+
else:#否则直接全部缓存
|
| 181 |
+
x = X
|
| 182 |
+
|
| 183 |
+
closest = self.max_sim(a=x, b=self.centroids)[1].to(torch.int16)#[3098036]#int64#0~999
|
| 184 |
+
matched_clusters, counts = closest.unique(return_counts=True)#int64#1k
|
| 185 |
+
expanded_closest = closest[None].expand(self.n_clusters, -1)#[1000, 3098036]#int16#0~999
|
| 186 |
+
mask = (expanded_closest==torch.arange(self.n_clusters, device=self.device)[:, None]).to(X.dtype)#==后者是int64*1000
|
| 187 |
+
c_grad = mask @ x / mask.sum(-1)[..., :, None]
|
| 188 |
+
c_grad[c_grad!=c_grad] = 0 # remove NaNs
|
| 189 |
+
error = (c_grad - self.centroids).pow(2).sum()
|
| 190 |
+
if self.minibatch is not None:
|
| 191 |
+
lr = 1/num_points_in_clusters[:,None] * 0.9 + 0.1
|
| 192 |
+
else:
|
| 193 |
+
lr = 1
|
| 194 |
+
matched_clusters=matched_clusters.long()
|
| 195 |
+
num_points_in_clusters[matched_clusters] += counts#IndexError: tensors used as indices must be long, byte or bool tensors
|
| 196 |
+
self.centroids = self.centroids * (1-lr) + c_grad * lr
|
| 197 |
+
if self.verbose >= 2:
|
| 198 |
+
print('iter:', i, 'error:', error.item(), 'time spent:', round(time()-iter_time, 4))
|
| 199 |
+
if error <= self.tol:
|
| 200 |
+
break
|
| 201 |
+
|
| 202 |
+
if self.verbose >= 1:
|
| 203 |
+
print(f'used {i+1} iterations ({round(time()-start_time, 4)}s) to cluster {batch_size} items into {self.n_clusters} clusters')
|
| 204 |
+
return closest
|
cluster/train_cluster.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
import tqdm
|
| 10 |
+
from kmeans import KMeansGPU
|
| 11 |
+
from sklearn.cluster import KMeans, MiniBatchKMeans
|
| 12 |
+
|
| 13 |
+
logging.basicConfig(level=logging.INFO)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=False):#gpu_minibatch真拉,虽然库支持但是也不考虑
|
| 17 |
+
if str(in_dir).endswith(".ipynb_checkpoints"):
|
| 18 |
+
logger.info(f"Ignore {in_dir}")
|
| 19 |
+
|
| 20 |
+
logger.info(f"Loading features from {in_dir}")
|
| 21 |
+
features = []
|
| 22 |
+
nums = 0
|
| 23 |
+
for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
|
| 24 |
+
# for name in os.listdir(in_dir):
|
| 25 |
+
# path="%s/%s"%(in_dir,name)
|
| 26 |
+
features.append(torch.load(path,map_location="cpu").squeeze(0).numpy().T)
|
| 27 |
+
# print(features[-1].shape)
|
| 28 |
+
features = np.concatenate(features, axis=0)
|
| 29 |
+
print(nums, features.nbytes/ 1024**2, "MB , shape:",features.shape, features.dtype)
|
| 30 |
+
features = features.astype(np.float32)
|
| 31 |
+
logger.info(f"Clustering features of shape: {features.shape}")
|
| 32 |
+
t = time.time()
|
| 33 |
+
if(use_gpu is False):
|
| 34 |
+
if use_minibatch:
|
| 35 |
+
kmeans = MiniBatchKMeans(n_clusters=n_clusters,verbose=verbose, batch_size=4096, max_iter=80).fit(features)
|
| 36 |
+
else:
|
| 37 |
+
kmeans = KMeans(n_clusters=n_clusters,verbose=verbose).fit(features)
|
| 38 |
+
else:
|
| 39 |
+
kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0,max_iter=500,tol=1e-2)#
|
| 40 |
+
features=torch.from_numpy(features)#.to(device)
|
| 41 |
+
kmeans.fit_predict(features)#
|
| 42 |
+
|
| 43 |
+
print(time.time()-t, "s")
|
| 44 |
+
|
| 45 |
+
x = {
|
| 46 |
+
"n_features_in_": kmeans.n_features_in_ if use_gpu is False else features.shape[1],
|
| 47 |
+
"_n_threads": kmeans._n_threads if use_gpu is False else 4,
|
| 48 |
+
"cluster_centers_": kmeans.cluster_centers_ if use_gpu is False else kmeans.centroids.cpu().numpy(),
|
| 49 |
+
}
|
| 50 |
+
print("end")
|
| 51 |
+
|
| 52 |
+
return x
|
| 53 |
+
|
| 54 |
+
if __name__ == "__main__":
|
| 55 |
+
parser = argparse.ArgumentParser()
|
| 56 |
+
parser.add_argument('--dataset', type=Path, default="./dataset/44k",
|
| 57 |
+
help='path of training data directory')
|
| 58 |
+
parser.add_argument('--output', type=Path, default="logs/44k",
|
| 59 |
+
help='path of model output directory')
|
| 60 |
+
parser.add_argument('--gpu',action='store_true', default=False ,
|
| 61 |
+
help='to use GPU')
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
args = parser.parse_args()
|
| 65 |
+
|
| 66 |
+
checkpoint_dir = args.output
|
| 67 |
+
dataset = args.dataset
|
| 68 |
+
use_gpu = args.gpu
|
| 69 |
+
n_clusters = 10000
|
| 70 |
+
|
| 71 |
+
ckpt = {}
|
| 72 |
+
for spk in os.listdir(dataset):
|
| 73 |
+
if os.path.isdir(dataset/spk):
|
| 74 |
+
print(f"train kmeans for {spk}...")
|
| 75 |
+
in_dir = dataset/spk
|
| 76 |
+
x = train_cluster(in_dir, n_clusters,use_minibatch=False,verbose=False,use_gpu=use_gpu)
|
| 77 |
+
ckpt[spk] = x
|
| 78 |
+
|
| 79 |
+
checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
|
| 80 |
+
checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
|
| 81 |
+
torch.save(
|
| 82 |
+
ckpt,
|
| 83 |
+
checkpoint_path,
|
| 84 |
+
)
|
| 85 |
+
|
configs/config.json
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 20,
|
| 4 |
+
"eval_interval": 20,
|
| 5 |
+
"seed": 1234,
|
| 6 |
+
"epochs": 10000,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 6,
|
| 14 |
+
"fp16_run": false,
|
| 15 |
+
"lr_decay": 0.999875,
|
| 16 |
+
"segment_size": 10240,
|
| 17 |
+
"init_lr_ratio": 1,
|
| 18 |
+
"warmup_epochs": 0,
|
| 19 |
+
"c_mel": 45,
|
| 20 |
+
"c_kl": 1.0,
|
| 21 |
+
"use_sr": true,
|
| 22 |
+
"max_speclen": 512,
|
| 23 |
+
"port": "8001",
|
| 24 |
+
"keep_ckpts": 3
|
| 25 |
+
},
|
| 26 |
+
"data": {
|
| 27 |
+
"training_files": "filelists/train.txt",
|
| 28 |
+
"validation_files": "filelists/val.txt",
|
| 29 |
+
"max_wav_value": 32768.0,
|
| 30 |
+
"sampling_rate": 44100,
|
| 31 |
+
"filter_length": 2048,
|
| 32 |
+
"hop_length": 512,
|
| 33 |
+
"win_length": 2048,
|
| 34 |
+
"n_mel_channels": 80,
|
| 35 |
+
"mel_fmin": 0.0,
|
| 36 |
+
"mel_fmax": 22050
|
| 37 |
+
},
|
| 38 |
+
"model": {
|
| 39 |
+
"inter_channels": 192,
|
| 40 |
+
"hidden_channels": 192,
|
| 41 |
+
"filter_channels": 768,
|
| 42 |
+
"n_heads": 2,
|
| 43 |
+
"n_layers": 6,
|
| 44 |
+
"kernel_size": 3,
|
| 45 |
+
"p_dropout": 0.1,
|
| 46 |
+
"resblock": "1",
|
| 47 |
+
"resblock_kernel_sizes": [
|
| 48 |
+
3,
|
| 49 |
+
7,
|
| 50 |
+
11
|
| 51 |
+
],
|
| 52 |
+
"resblock_dilation_sizes": [
|
| 53 |
+
[
|
| 54 |
+
1,
|
| 55 |
+
3,
|
| 56 |
+
5
|
| 57 |
+
],
|
| 58 |
+
[
|
| 59 |
+
1,
|
| 60 |
+
3,
|
| 61 |
+
5
|
| 62 |
+
],
|
| 63 |
+
[
|
| 64 |
+
1,
|
| 65 |
+
3,
|
| 66 |
+
5
|
| 67 |
+
]
|
| 68 |
+
],
|
| 69 |
+
"upsample_rates": [
|
| 70 |
+
8,
|
| 71 |
+
8,
|
| 72 |
+
2,
|
| 73 |
+
2,
|
| 74 |
+
2
|
| 75 |
+
],
|
| 76 |
+
"upsample_initial_channel": 512,
|
| 77 |
+
"upsample_kernel_sizes": [
|
| 78 |
+
16,
|
| 79 |
+
16,
|
| 80 |
+
4,
|
| 81 |
+
4,
|
| 82 |
+
4
|
| 83 |
+
],
|
| 84 |
+
"n_layers_q": 3,
|
| 85 |
+
"use_spectral_norm": false,
|
| 86 |
+
"gin_channels": 256,
|
| 87 |
+
"ssl_dim": 256,
|
| 88 |
+
"n_speakers": 200,
|
| 89 |
+
"speech_encoder": "vec256l9"
|
| 90 |
+
},
|
| 91 |
+
"spk": {
|
| 92 |
+
"Shengshuyan": 0
|
| 93 |
+
}
|
| 94 |
+
}
|
configs/diffusion.yaml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data:
|
| 2 |
+
sampling_rate: 44100
|
| 3 |
+
block_size: 512 # Equal to hop_length
|
| 4 |
+
duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip
|
| 5 |
+
encoder: 'vec768l12' # 'hubertsoft', 'vec256l9', 'vec768l12'
|
| 6 |
+
cnhubertsoft_gate: 10
|
| 7 |
+
encoder_sample_rate: 16000
|
| 8 |
+
encoder_hop_size: 320
|
| 9 |
+
encoder_out_channels: 768 # 256 if using 'hubertsoft'
|
| 10 |
+
training_files: "filelists/train.txt"
|
| 11 |
+
validation_files: "filelists/val.txt"
|
| 12 |
+
extensions: # List of extension included in the data collection
|
| 13 |
+
- wav
|
| 14 |
+
model:
|
| 15 |
+
type: 'Diffusion'
|
| 16 |
+
n_layers: 20
|
| 17 |
+
n_chans: 512
|
| 18 |
+
n_hidden: 256
|
| 19 |
+
use_pitch_aug: true
|
| 20 |
+
n_spk: 1 # max number of different speakers
|
| 21 |
+
device: cuda
|
| 22 |
+
vocoder:
|
| 23 |
+
type: 'nsf-hifigan'
|
| 24 |
+
ckpt: 'pretrain/nsf_hifigan/model'
|
| 25 |
+
infer:
|
| 26 |
+
speedup: 10
|
| 27 |
+
method: 'dpm-solver' # 'pndm' or 'dpm-solver'
|
| 28 |
+
env:
|
| 29 |
+
expdir: logs/44k/diffusion
|
| 30 |
+
gpu_id: 0
|
| 31 |
+
train:
|
| 32 |
+
num_workers: 2 # If your cpu and gpu are both very strong, set to 0 may be faster!
|
| 33 |
+
amp_dtype: fp32 # fp32, fp16 or bf16 (fp16 or bf16 may be faster if it is supported by your gpu)
|
| 34 |
+
batch_size: 48
|
| 35 |
+
cache_all_data: true # Save Internal-Memory or Graphics-Memory if it is false, but may be slow
|
| 36 |
+
cache_device: 'cpu' # Set to 'cuda' to cache the data into the Graphics-Memory, fastest speed for strong gpu
|
| 37 |
+
cache_fp16: true
|
| 38 |
+
epochs: 100000
|
| 39 |
+
interval_log: 10
|
| 40 |
+
interval_val: 2000
|
| 41 |
+
interval_force_save: 10000
|
| 42 |
+
lr: 0.0002
|
| 43 |
+
decay_step: 100000
|
| 44 |
+
gamma: 0.5
|
| 45 |
+
weight_decay: 0
|
| 46 |
+
save_opt: false
|
| 47 |
+
spk:
|
| 48 |
+
'nyaru': 0
|
configs_template/config_template.json
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 800,
|
| 5 |
+
"seed": 1234,
|
| 6 |
+
"epochs": 10000,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 6,
|
| 14 |
+
"fp16_run": false,
|
| 15 |
+
"half_type": "fp16",
|
| 16 |
+
"lr_decay": 0.999875,
|
| 17 |
+
"segment_size": 10240,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"use_sr": true,
|
| 23 |
+
"max_speclen": 512,
|
| 24 |
+
"port": "8001",
|
| 25 |
+
"keep_ckpts": 3,
|
| 26 |
+
"all_in_mem": false,
|
| 27 |
+
"vol_aug":false
|
| 28 |
+
},
|
| 29 |
+
"data": {
|
| 30 |
+
"training_files": "filelists/train.txt",
|
| 31 |
+
"validation_files": "filelists/val.txt",
|
| 32 |
+
"max_wav_value": 32768.0,
|
| 33 |
+
"sampling_rate": 44100,
|
| 34 |
+
"filter_length": 2048,
|
| 35 |
+
"hop_length": 512,
|
| 36 |
+
"win_length": 2048,
|
| 37 |
+
"n_mel_channels": 80,
|
| 38 |
+
"mel_fmin": 0.0,
|
| 39 |
+
"mel_fmax": 22050,
|
| 40 |
+
"unit_interpolate_mode":"nearest"
|
| 41 |
+
},
|
| 42 |
+
"model": {
|
| 43 |
+
"inter_channels": 192,
|
| 44 |
+
"hidden_channels": 192,
|
| 45 |
+
"filter_channels": 768,
|
| 46 |
+
"n_heads": 2,
|
| 47 |
+
"n_layers": 6,
|
| 48 |
+
"kernel_size": 3,
|
| 49 |
+
"p_dropout": 0.1,
|
| 50 |
+
"resblock": "1",
|
| 51 |
+
"resblock_kernel_sizes": [3,7,11],
|
| 52 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
| 53 |
+
"upsample_rates": [ 8, 8, 2, 2, 2],
|
| 54 |
+
"upsample_initial_channel": 512,
|
| 55 |
+
"upsample_kernel_sizes": [16,16, 4, 4, 4],
|
| 56 |
+
"n_layers_q": 3,
|
| 57 |
+
"n_flow_layer": 4,
|
| 58 |
+
"use_spectral_norm": false,
|
| 59 |
+
"gin_channels": 768,
|
| 60 |
+
"ssl_dim": 768,
|
| 61 |
+
"n_speakers": 200,
|
| 62 |
+
"vocoder_name":"nsf-hifigan",
|
| 63 |
+
"speech_encoder":"vec768l12",
|
| 64 |
+
"speaker_embedding":false,
|
| 65 |
+
"vol_embedding":false,
|
| 66 |
+
"use_depthwise_conv":false,
|
| 67 |
+
"flow_share_parameter": false,
|
| 68 |
+
"use_automatic_f0_prediction": true
|
| 69 |
+
},
|
| 70 |
+
"spk": {
|
| 71 |
+
"nyaru": 0,
|
| 72 |
+
"huiyu": 1,
|
| 73 |
+
"nen": 2,
|
| 74 |
+
"paimon": 3,
|
| 75 |
+
"yunhao": 4
|
| 76 |
+
}
|
| 77 |
+
}
|
configs_template/config_tiny_template.json
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 800,
|
| 5 |
+
"seed": 1234,
|
| 6 |
+
"epochs": 10000,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 6,
|
| 14 |
+
"fp16_run": false,
|
| 15 |
+
"half_type": "fp16",
|
| 16 |
+
"lr_decay": 0.999875,
|
| 17 |
+
"segment_size": 10240,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"use_sr": true,
|
| 23 |
+
"max_speclen": 512,
|
| 24 |
+
"port": "8001",
|
| 25 |
+
"keep_ckpts": 3,
|
| 26 |
+
"all_in_mem": false,
|
| 27 |
+
"vol_aug":false
|
| 28 |
+
},
|
| 29 |
+
"data": {
|
| 30 |
+
"training_files": "filelists/train.txt",
|
| 31 |
+
"validation_files": "filelists/val.txt",
|
| 32 |
+
"max_wav_value": 32768.0,
|
| 33 |
+
"sampling_rate": 44100,
|
| 34 |
+
"filter_length": 2048,
|
| 35 |
+
"hop_length": 512,
|
| 36 |
+
"win_length": 2048,
|
| 37 |
+
"n_mel_channels": 80,
|
| 38 |
+
"mel_fmin": 0.0,
|
| 39 |
+
"mel_fmax": 22050,
|
| 40 |
+
"unit_interpolate_mode":"nearest"
|
| 41 |
+
},
|
| 42 |
+
"model": {
|
| 43 |
+
"inter_channels": 192,
|
| 44 |
+
"hidden_channels": 192,
|
| 45 |
+
"filter_channels": 512,
|
| 46 |
+
"n_heads": 2,
|
| 47 |
+
"n_layers": 6,
|
| 48 |
+
"kernel_size": 3,
|
| 49 |
+
"p_dropout": 0.1,
|
| 50 |
+
"resblock": "1",
|
| 51 |
+
"resblock_kernel_sizes": [3,7,11],
|
| 52 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
| 53 |
+
"upsample_rates": [ 8, 8, 2, 2, 2],
|
| 54 |
+
"upsample_initial_channel": 400,
|
| 55 |
+
"upsample_kernel_sizes": [16,16, 4, 4, 4],
|
| 56 |
+
"n_layers_q": 3,
|
| 57 |
+
"n_flow_layer": 4,
|
| 58 |
+
"use_spectral_norm": false,
|
| 59 |
+
"gin_channels": 768,
|
| 60 |
+
"ssl_dim": 768,
|
| 61 |
+
"n_speakers": 200,
|
| 62 |
+
"vocoder_name":"nsf-hifigan",
|
| 63 |
+
"speech_encoder":"vec768l12",
|
| 64 |
+
"speaker_embedding":false,
|
| 65 |
+
"vol_embedding":false,
|
| 66 |
+
"use_depthwise_conv":true,
|
| 67 |
+
"flow_share_parameter": true,
|
| 68 |
+
"use_automatic_f0_prediction": true
|
| 69 |
+
},
|
| 70 |
+
"spk": {
|
| 71 |
+
"nyaru": 0,
|
| 72 |
+
"huiyu": 1,
|
| 73 |
+
"nen": 2,
|
| 74 |
+
"paimon": 3,
|
| 75 |
+
"yunhao": 4
|
| 76 |
+
}
|
| 77 |
+
}
|
configs_template/diffusion_template.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data:
|
| 2 |
+
sampling_rate: 44100
|
| 3 |
+
block_size: 512 # Equal to hop_length
|
| 4 |
+
duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip
|
| 5 |
+
encoder: 'vec768l12' # 'hubertsoft', 'vec256l9', 'vec768l12'
|
| 6 |
+
cnhubertsoft_gate: 10
|
| 7 |
+
encoder_sample_rate: 16000
|
| 8 |
+
encoder_hop_size: 320
|
| 9 |
+
encoder_out_channels: 768 # 256 if using 'hubertsoft'
|
| 10 |
+
training_files: "filelists/train.txt"
|
| 11 |
+
validation_files: "filelists/val.txt"
|
| 12 |
+
extensions: # List of extension included in the data collection
|
| 13 |
+
- wav
|
| 14 |
+
unit_interpolate_mode: "nearest"
|
| 15 |
+
model:
|
| 16 |
+
type: 'Diffusion'
|
| 17 |
+
n_layers: 20
|
| 18 |
+
n_chans: 512
|
| 19 |
+
n_hidden: 256
|
| 20 |
+
use_pitch_aug: true
|
| 21 |
+
timesteps : 1000
|
| 22 |
+
k_step_max: 0 # must <= timesteps, If it is 0, train all
|
| 23 |
+
n_spk: 1 # max number of different speakers
|
| 24 |
+
device: cuda
|
| 25 |
+
vocoder:
|
| 26 |
+
type: 'nsf-hifigan'
|
| 27 |
+
ckpt: 'pretrain/nsf_hifigan/model'
|
| 28 |
+
infer:
|
| 29 |
+
speedup: 10
|
| 30 |
+
method: 'dpm-solver++' # 'pndm' or 'dpm-solver' or 'ddim' or 'unipc' or 'dpm-solver++'
|
| 31 |
+
env:
|
| 32 |
+
expdir: logs/44k/diffusion
|
| 33 |
+
gpu_id: 0
|
| 34 |
+
train:
|
| 35 |
+
num_workers: 4 # If your cpu and gpu are both very strong, set to 0 may be faster!
|
| 36 |
+
amp_dtype: fp32 # fp32, fp16 or bf16 (fp16 or bf16 may be faster if it is supported by your gpu)
|
| 37 |
+
batch_size: 48
|
| 38 |
+
cache_all_data: true # Save Internal-Memory or Graphics-Memory if it is false, but may be slow
|
| 39 |
+
cache_device: 'cpu' # Set to 'cuda' to cache the data into the Graphics-Memory, fastest speed for strong gpu
|
| 40 |
+
cache_fp16: true
|
| 41 |
+
epochs: 100000
|
| 42 |
+
interval_log: 10
|
| 43 |
+
interval_val: 2000
|
| 44 |
+
interval_force_save: 5000
|
| 45 |
+
lr: 0.0001
|
| 46 |
+
decay_step: 100000
|
| 47 |
+
gamma: 0.5
|
| 48 |
+
weight_decay: 0
|
| 49 |
+
save_opt: false
|
| 50 |
+
spk:
|
| 51 |
+
'nyaru': 0
|
dataset_raw/wav_structure.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
数据集准备
|
| 2 |
+
|
| 3 |
+
raw
|
| 4 |
+
├───speaker0
|
| 5 |
+
│ ├───xxx1-xxx1.wav
|
| 6 |
+
│ ├───...
|
| 7 |
+
│ └───Lxx-0xx8.wav
|
| 8 |
+
└───speaker1
|
| 9 |
+
├───xx2-0xxx2.wav
|
| 10 |
+
├───...
|
| 11 |
+
└───xxx7-xxx007.wav
|
| 12 |
+
|
| 13 |
+
此外还需要编辑config.json
|
| 14 |
+
|
| 15 |
+
"n_speakers": 10
|
| 16 |
+
|
| 17 |
+
"spk":{
|
| 18 |
+
"speaker0": 0,
|
| 19 |
+
"speaker1": 1,
|
| 20 |
+
}
|