Spaces:
Runtime error
Runtime error
Commit
·
c82bb46
1
Parent(s):
07e9a8a
Upload 18 files
Browse files- CppDataProcess/F0Preprocess.cpp +153 -0
- CppDataProcess/F0Preprocess.hpp +36 -0
- CppDataProcess/Slicer.hpp +82 -0
- CppDataProcess/Wav.cpp +151 -0
- CppDataProcess/Wav.hpp +99 -0
- CppDataProcess/readme.md +8 -0
- cluster/__init__.py +29 -0
- cluster/__pycache__/__init__.cpython-38.pyc +0 -0
- cluster/__pycache__/kmeans.cpython-38.pyc +0 -0
- cluster/km_train.py +80 -0
- cluster/kmeans.py +204 -0
- cluster/train_cluster.py +85 -0
- configs/config.json +94 -0
- configs/diffusion.yaml +48 -0
- configs_template/config_template.json +77 -0
- configs_template/config_tiny_template.json +77 -0
- configs_template/diffusion_template.yaml +51 -0
- dataset_raw/wav_structure.txt +20 -0
CppDataProcess/F0Preprocess.cpp
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "F0Preprocess.hpp"
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
void F0PreProcess::compute_f0(const double* audio, int64_t len)
|
| 5 |
+
{
|
| 6 |
+
DioOption Doption;
|
| 7 |
+
InitializeDioOption(&Doption);
|
| 8 |
+
Doption.f0_ceil = 800;
|
| 9 |
+
Doption.frame_period = 1000.0 * hop / fs;
|
| 10 |
+
f0Len = GetSamplesForDIO(fs, (int)len, Doption.frame_period);
|
| 11 |
+
const auto tp = new double[f0Len];
|
| 12 |
+
const auto tmpf0 = new double[f0Len];
|
| 13 |
+
rf0 = new double[f0Len];
|
| 14 |
+
Dio(audio, (int)len, fs, &Doption, tp, tmpf0);
|
| 15 |
+
StoneMask(audio, (int)len, fs, tp, tmpf0, (int)f0Len, rf0);
|
| 16 |
+
delete[] tmpf0;
|
| 17 |
+
delete[] tp;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
// Build the sequence {start/div, (start+step)/div, ...} for all values of
// `start + k*step` strictly below `end`.
std::vector<double> arange(double start, double end, double step = 1.0, double div = 1.0)
{
	std::vector<double> result;
	for (double v = start; v < end; v += step)
		result.push_back(v / div);
	return result;
}
|
| 30 |
+
|
| 31 |
+
// Resample the F0 track in-place from f0Len frames to `len` frames using
// WORLD's linear interpolation (interp1); NaNs (unvoiced markers set by the
// callers) become 0 after resampling.
void F0PreProcess::InterPf0(int64_t len)
{
	// Target positions 0, f0Len/len, 2*f0Len/len, ... — `len` of them.
	const auto xi = arange(0.0, (double)f0Len * (double)len, (double)f0Len, (double)len);
	// One extra slot of slack; presumably defensive against interp1 touching
	// one element past the end — TODO confirm against WORLD's interp1 contract.
	const auto tmp = new double[xi.size() + 1];
	interp1(arange(0, (double)f0Len).data(), rf0, static_cast<int>(f0Len), xi.data(), (int)xi.size(), tmp);
	// Interpolating through NAN-tagged (unvoiced) frames yields NaN; zero them.
	for (size_t i = 0; i < xi.size(); i++)
		if (isnan(tmp[i]))
			tmp[i] = 0.0;
	// The interpolated buffer replaces rf0; the previous track is released.
	delete[] rf0;
	rf0 = nullptr;
	rf0 = tmp;
	f0Len = (int64_t)xi.size();
}
|
| 44 |
+
|
| 45 |
+
long long* F0PreProcess::f0Log()
|
| 46 |
+
{
|
| 47 |
+
const auto tmp = new long long[f0Len];
|
| 48 |
+
const auto f0_mel = new double[f0Len];
|
| 49 |
+
for (long long i = 0; i < f0Len; i++)
|
| 50 |
+
{
|
| 51 |
+
f0_mel[i] = 1127 * log(1.0 + rf0[i] / 700.0);
|
| 52 |
+
if (f0_mel[i] > 0.0)
|
| 53 |
+
f0_mel[i] = (f0_mel[i] - f0_mel_min) * (f0_bin - 2.0) / (f0_mel_max - f0_mel_min) + 1.0;
|
| 54 |
+
if (f0_mel[i] < 1.0)
|
| 55 |
+
f0_mel[i] = 1;
|
| 56 |
+
if (f0_mel[i] > f0_bin - 1)
|
| 57 |
+
f0_mel[i] = f0_bin - 1;
|
| 58 |
+
tmp[i] = (long long)round(f0_mel[i]);
|
| 59 |
+
}
|
| 60 |
+
delete[] f0_mel;
|
| 61 |
+
delete[] rf0;
|
| 62 |
+
rf0 = nullptr;
|
| 63 |
+
return tmp;
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
// Full pipeline for the coarse-F0 model input: extract F0 from `audio`,
// transpose it by `tran` semitones, resample to `hubLen` frames, then
// quantize with f0Log.  Returns one coarse bin per frame.
std::vector<long long> F0PreProcess::GetF0AndOtherInput(const double* audio, int64_t audioLen, int64_t hubLen, int64_t tran)
{
	compute_f0(audio, audioLen);
	for (int64_t i = 0; i < f0Len; ++i)
	{
		// Pitch-shift by `tran` semitones: multiply by 2^(tran/12).
		rf0[i] = rf0[i] * pow(2.0, static_cast<double>(tran) / 12.0);
		// Near-zero F0 means unvoiced; tag with NAN so InterPf0 zeroes it after
		// interpolation instead of smearing it into neighbouring voiced frames.
		if (rf0[i] < 0.001)
			rf0[i] = NAN;
	}
	InterPf0(hubLen);
	const auto O0f = f0Log();  // quantized coarse F0; f0Log also frees rf0
	std::vector<long long> Of0(O0f, O0f + f0Len);
	delete[] O0f;
	return Of0;
}
|
| 81 |
+
|
| 82 |
+
// Map each spectrogram frame to the 1-based index of the hubert unit it is
// aligned to, spreading hubertLen units evenly over specLen frames.
// The returned vector has specLen + 1 entries.
std::vector<long long> getAligments(size_t specLen, size_t hubertLen)
{
	std::vector<long long> mel2ph(specLen + 1, 0);

	const double framesPerUnit = static_cast<double>(specLen) / static_cast<double>(hubertLen);
	size_t cursor = 0;
	for (size_t unit = 0; unit < hubertLen; ++unit)
	{
		// Last frame (inclusive) covered by this unit.
		const auto last = static_cast<size_t>(std::round(static_cast<double>(unit) * framesPerUnit + framesPerUnit));
		for (size_t frame = cursor; frame <= last; ++frame)
			mel2ph[frame] = static_cast<long long>(unit) + 1;
		cursor = last + 1;
	}

	return mel2ph;
}
|
| 98 |
+
|
| 99 |
+
std::vector<float> F0PreProcess::GetF0AndOtherInputF0(const double* audio, int64_t audioLen, int64_t tran)
|
| 100 |
+
{
|
| 101 |
+
compute_f0(audio, audioLen);
|
| 102 |
+
for (int64_t i = 0; i < f0Len; ++i)
|
| 103 |
+
{
|
| 104 |
+
rf0[i] = log2(rf0[i] * pow(2.0, static_cast<double>(tran) / 12.0));
|
| 105 |
+
if (rf0[i] < 0.001)
|
| 106 |
+
rf0[i] = NAN;
|
| 107 |
+
}
|
| 108 |
+
const int64_t specLen = audioLen / hop;
|
| 109 |
+
InterPf0(specLen);
|
| 110 |
+
|
| 111 |
+
std::vector<float> Of0(specLen, 0.0);
|
| 112 |
+
|
| 113 |
+
double last_value = 0.0;
|
| 114 |
+
for (int64_t i = 0; i < specLen; ++i)
|
| 115 |
+
{
|
| 116 |
+
if (rf0[i] <= 0.0)
|
| 117 |
+
{
|
| 118 |
+
int64_t j = i + 1;
|
| 119 |
+
for (; j < specLen; ++j)
|
| 120 |
+
{
|
| 121 |
+
if (rf0[j] > 0.0)
|
| 122 |
+
break;
|
| 123 |
+
}
|
| 124 |
+
if (j < specLen - 1)
|
| 125 |
+
{
|
| 126 |
+
if (last_value > 0.0)
|
| 127 |
+
{
|
| 128 |
+
const auto step = (rf0[j] - rf0[i - 1]) / double(j - i);
|
| 129 |
+
for (int64_t k = i; k < j; ++k)
|
| 130 |
+
Of0[k] = float(rf0[i - 1] + step * double(k - i + 1));
|
| 131 |
+
}
|
| 132 |
+
else
|
| 133 |
+
for (int64_t k = i; k < j; ++k)
|
| 134 |
+
Of0[k] = float(rf0[j]);
|
| 135 |
+
i = j;
|
| 136 |
+
}
|
| 137 |
+
else
|
| 138 |
+
{
|
| 139 |
+
for (int64_t k = i; k < specLen; ++k)
|
| 140 |
+
Of0[k] = float(last_value);
|
| 141 |
+
i = specLen;
|
| 142 |
+
}
|
| 143 |
+
}
|
| 144 |
+
else
|
| 145 |
+
{
|
| 146 |
+
Of0[i] = float(rf0[i - 1]);
|
| 147 |
+
last_value = rf0[i];
|
| 148 |
+
}
|
| 149 |
+
}
|
| 150 |
+
delete[] rf0;
|
| 151 |
+
rf0 = nullptr;
|
| 152 |
+
return Of0;
|
| 153 |
+
}
|
CppDataProcess/F0Preprocess.hpp
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
#include "world/dio.h"
#include "world/stonemask.h"
#include "world/matlabfunctions.h"
#include <cmath>
#include <cstdint>
#include <string>
#include <vector>

// Cpp F0 Preprocess
//
// Wraps the WORLD vocoder (DIO + StoneMask) to extract an F0 contour and
// convert it into the coarse / continuous log2-F0 features used by the model.

class F0PreProcess
{
public:
	int fs;     // sample rate of the incoming audio
	short hop;  // hop size in samples; one F0 frame per hop
	const int f0_bin = 256;
	const double f0_max = 1100.0;
	const double f0_min = 50.0;
	const double f0_mel_min = 1127.0 * log(1.0 + f0_min / 700.0);
	const double f0_mel_max = 1127.0 * log(1.0 + f0_max / 700.0);
	F0PreProcess(int sr = 16000, short h = 160) :fs(sr), hop(h) {}
	~F0PreProcess()
	{
		delete[] rf0;
		rf0 = nullptr;
	}
	// BUGFIX: rf0 is an owning raw pointer; the implicitly-generated copy
	// operations would double-free it.  Copying is therefore disabled.
	F0PreProcess(const F0PreProcess&) = delete;
	F0PreProcess& operator=(const F0PreProcess&) = delete;
	// Run DIO + StoneMask over `audio`; fills rf0 / f0Len.
	void compute_f0(const double* audio, int64_t len);
	// Resample rf0 to `len` frames by linear interpolation (NaN -> 0).
	void InterPf0(int64_t len);
	// Quantize rf0 into f0_bin coarse levels; returns a caller-owned array.
	long long* f0Log();
	int64_t getLen()const { return f0Len; }
	std::vector<long long> GetF0AndOtherInput(const double* audio, int64_t audioLen, int64_t hubLen, int64_t tran);
	std::vector<float> GetF0AndOtherInputF0(const double* audio, int64_t audioLen, int64_t tran);
private:
	double* rf0 = nullptr;  // owning; current F0 track, f0Len entries
	int64_t f0Len = 0;
};

// Map spectrogram frames to 1-based hubert-unit indices (see F0Preprocess.cpp).
std::vector<long long> getAligments(size_t specLen, size_t hubertLen);
|
CppDataProcess/Slicer.hpp
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <string>
|
| 2 |
+
#include <vector>
|
| 3 |
+
#include "Wav.hpp"
|
| 4 |
+
|
| 5 |
+
// Result of slicing a wav: byte offsets of each segment boundary within the
// data chunk, and one flag per segment telling whether it is above the
// volume threshold (i.e. worth keeping).
struct SliceResult
{
	std::vector<unsigned long long> SliceOffset;
	std::vector<bool> SliceTag;
	// BUGFIX: the constructor was misnamed `cutResult`, which does not compile.
	SliceResult(std::vector<unsigned long long>&& O, std::vector<bool>&& T) :SliceOffset(O), SliceTag(T) {}
};
|
| 11 |
+
|
| 12 |
+
// Mean absolute amplitude of the samples in [start, end] (inclusive),
// computed with the incremental-mean formula to avoid a large running sum.
// `inline` because this lives in a header included from multiple TUs.
inline double getAvg(const short* start, const short* end)
{
	const auto size = end - start + 1;
	// BUGFIX: the first sample was averaged signed while all the others were
	// rectified; also use std::fabs explicitly so a C-library `abs(int)` can
	// never truncate the double.
	auto avg = std::fabs((double)(*start));
	for (auto i = 1; i < size; i++)
	{
		avg = avg + (std::fabs((double)start[i]) - avg) / (double)(i + 1ull);
	}
	return avg;
}
|
| 22 |
+
|
| 23 |
+
// Slice a wav into segments at quiet points.
//   threshold   : mean absolute amplitude below which a frame counts as quiet
//   minLen      : minimum segment length in seconds (checked via bytesPerSec)
//   frame_len   : analysis window length
//   frame_shift : hop between analysis windows
// NOTE(review): ptr advances in raw bytes but windows are read as short*, so
// frame_len/frame_shift mix byte and sample units — confirm intended units.
inline SliceResult SliceWav(Wav& input, double threshold, unsigned long minLen, unsigned short frame_len, unsigned short frame_shift)
{
	const auto header = input.getHeader();
	// Too short to slice: return the whole file as one kept segment.
	if (header.Subchunk2Size < minLen * header.bytesPerSec)
		return { {0,header.Subchunk2Size},{true} };
	auto ptr = input.getData();
	std::vector<unsigned long long> output;
	std::vector<bool> tag;
	// Number of analysis frames, leaving one window of margin at each end.
	auto n = (header.Subchunk2Size / frame_shift) - 2 * (frame_len / frame_shift);
	unsigned long nn = 0;  // bytes consumed since the last cut
	bool cutTag = true;    // whether the previous frame was loud
	output.emplace_back(0);
	while (n--)
	{
		//if (nn > minLen * header.bytesPerSec)
		if (cutTag)
		{
			// Previous frame was loud: look for a loud -> quiet transition.
			const auto vol = abs(getAvg((short*)ptr, (short*)ptr + frame_len));
			if (vol < threshold)
			{
				cutTag = false;
				// Cut only if the running segment is already long enough.
				if (nn > minLen * header.bytesPerSec)
				{
					nn = 0;
					// Place the cut in the middle of the quiet window.
					output.emplace_back((ptr - input.getData()) + (frame_len / 2));
				}
			}
			else
			{
				cutTag = true;
			}
		}
		else
		{
			// Previous frame was quiet: look for a quiet -> loud transition.
			const auto vol = abs(getAvg((short*)ptr, (short*)ptr + frame_len));
			if (vol < threshold)
			{
				cutTag = false;
			}
			else
			{
				cutTag = true;
				if (nn > minLen * header.bytesPerSec)
				{
					nn = 0;
					output.emplace_back((ptr - input.getData()) + (frame_len / 2));
				}
			}
		}
		nn += frame_shift;
		ptr += frame_shift;
	}
	// Terminate with the end of the data chunk so offsets bracket every segment.
	output.push_back(header.Subchunk2Size);
	// Tag each segment by whether its mean absolute level clears the threshold;
	// segments tagged false are effectively silence and can be dropped.
	for (size_t i = 1; i < output.size(); i++)
	{
		tag.push_back(abs(getAvg((short*)(input.getData() + output[i - 1]), (short*)(input.getData() + output[i]))) > threshold);
	}
	return { std::move(output),std::move(tag) };
}
|
| 82 |
+
|
CppDataProcess/Wav.cpp
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "Wav.hpp"
|
| 2 |
+
|
| 3 |
+
// Load a wav from a wide-character path by scanning the first HEAD_LENGTH
// bytes for the RIFF / fmt / data chunk tags and then reading the data chunk.
// NOTE(review): _wfreopen_s reopens the *stderr* stream slot onto the file —
// an MSVC-specific trick that clobbers stderr for the process; confirm this
// is intentional.  fread return values are unchecked.
Wav::Wav(const wchar_t* Path) :header(WAV_HEADER()) {
	char buf[1024];
	FILE* stream;
	_wfreopen_s(&stream, Path, L"rb", stderr);
	if (stream == nullptr) {
		throw (std::exception("File not exists"));
	}
	// Read the header region; chunk tags are located by linear scan rather
	// than fixed offsets, so files with extra chunks before "data" still load.
	fread(buf, 1, HEAD_LENGTH, stream);
	int pos = 0;
	while (pos < HEAD_LENGTH) {
		if ((buf[pos] == 'R') && (buf[pos + 1] == 'I') && (buf[pos + 2] == 'F') && (buf[pos + 3] == 'F')) {
			pos += 4;
			break;
		}
		++pos;
	}
	if (pos >= HEAD_LENGTH)
		throw (std::exception("Don't order fried rice (annoyed)"));
	header.ChunkSize = *(int*)&buf[pos];
	// Skip the ChunkSize field (4) plus the "WAVE" form tag (4).
	pos += 8;
	while (pos < HEAD_LENGTH) {
		if ((buf[pos] == 'f') && (buf[pos + 1] == 'm') && (buf[pos + 2] == 't')) {
			pos += 4;
			break;
		}
		++pos;
	}
	if (pos >= HEAD_LENGTH)
		throw (std::exception("Don't order fried rice (annoyed)"));
	// fmt chunk fields, read in declaration order via unaligned casts.
	header.Subchunk1Size = *(int*)&buf[pos];
	pos += 4;
	header.AudioFormat = *(short*)&buf[pos];
	pos += 2;
	header.NumOfChan = *(short*)&buf[pos];
	pos += 2;
	header.SamplesPerSec = *(int*)&buf[pos];
	pos += 4;
	header.bytesPerSec = *(int*)&buf[pos];
	pos += 4;
	header.blockAlign = *(short*)&buf[pos];
	pos += 2;
	header.bitsPerSample = *(short*)&buf[pos];
	pos += 2;
	while (pos < HEAD_LENGTH) {
		if ((buf[pos] == 'd') && (buf[pos + 1] == 'a') && (buf[pos + 2] == 't') && (buf[pos + 3] == 'a')) {
			pos += 4;
			break;
		}
		++pos;
	}
	if (pos >= HEAD_LENGTH)
		throw (std::exception("Don't order fried rice (annoyed)"));
	header.Subchunk2Size = *(int*)&buf[pos];
	pos += 4;
	// StartPos is the file offset of the first PCM byte (after the size field).
	StartPos = pos;
	Data = new char[header.Subchunk2Size + 1];
	fseek(stream, StartPos, SEEK_SET);
	fread(Data, 1, header.Subchunk2Size, stream);
	if (stream != nullptr) {
		fclose(stream);
	}
	// 16-bit sample view over the raw byte buffer.
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2;
}
|
| 67 |
+
|
| 68 |
+
// Deep copy: duplicates the header and the raw PCM byte buffer.
Wav::Wav(const Wav& input) :header(WAV_HEADER()) {
	Data = new char[(input.header.Subchunk2Size + 1)];
	if (Data == nullptr) { throw std::exception("OOM"); }
	// WAV_HEADER is a plain aggregate; a single assignment copies every field,
	// including the four chunk-tag arrays.
	header = input.header;
	StartPos = input.StartPos;
	memcpy(Data, input.Data, input.header.Subchunk2Size);
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2;
}
|
| 89 |
+
|
| 90 |
+
// Move construction: steal the data buffer, copy the header metadata.
Wav::Wav(Wav&& input) noexcept
{
	Data = input.Data;
	input.Data = nullptr;  // source no longer owns the buffer
	// Single aggregate assignment replaces the field-by-field copy.
	header = input.header;
	StartPos = input.StartPos;
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2;
}
|
| 111 |
+
|
| 112 |
+
// Move assignment: release our buffer, then adopt the source's.
Wav& Wav::operator=(Wav&& input) noexcept
{
	// BUGFIX: guard against self-move.  The old code called destory() first
	// and then adopted the just-freed buffer when `input` aliased *this.
	if (this != &input)
	{
		destory();
		Data = input.Data;
		input.Data = nullptr;
		header = input.header;  // aggregate copy of all header fields
		StartPos = input.StartPos;
		SData = reinterpret_cast<int16_t*>(Data);
		dataSize = header.Subchunk2Size / 2;
	}
	return *this;
}
|
| 135 |
+
|
| 136 |
+
// Append `input`'s PCM data to this wav.  Only PCM (AudioFormat == 1) with a
// matching sample rate and channel count is concatenated; otherwise *this is
// returned unchanged.
Wav& Wav::cat(const Wav& input)
{
	if (header.AudioFormat != 1) return *this;
	// BUGFIX: the old check compared SamplesPerSec against bitsPerSample, so
	// the sample-rate guard never did what it claimed.
	if (header.SamplesPerSec != input.header.SamplesPerSec || header.NumOfChan != input.header.NumOfChan) return *this;
	char* buffer = new char[(int64_t)header.Subchunk2Size + (int64_t)input.header.Subchunk2Size + 1];
	if (buffer == nullptr)return *this;
	memcpy(buffer, Data, header.Subchunk2Size);
	memcpy(buffer + header.Subchunk2Size, input.Data, input.header.Subchunk2Size);
	header.ChunkSize += input.header.Subchunk2Size;
	header.Subchunk2Size += input.header.Subchunk2Size;
	delete[] Data;
	Data = buffer;
	SData = reinterpret_cast<int16_t*>(Data);
	dataSize = header.Subchunk2Size / 2;
	return *this;
}
|
CppDataProcess/Wav.hpp
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
#include <cstddef>
#include <cstdint>
#include <cstring>

// Minimal RIFF/WAV container: owns the raw data chunk as a byte buffer and
// exposes it as 16-bit samples.  16-bit PCM is assumed throughout.
class Wav {
public:

	struct WAV_HEADER {
		char RIFF[4] = { 'R','I','F','F' };        // RIFF chunk id
		unsigned long ChunkSize;                   // file size - 8
		char WAVE[4] = { 'W','A','V','E' };        // WAVE form type
		char fmt[4] = { 'f','m','t',' ' };         // fmt sub-chunk id
		unsigned long Subchunk1Size;               // fmt chunk size
		unsigned short AudioFormat;                // 1 = PCM
		unsigned short NumOfChan;                  // channel count
		unsigned long SamplesPerSec;               // sample rate
		unsigned long bytesPerSec;                 // bytes per second
		unsigned short blockAlign;                 // bytes per sample frame
		unsigned short bitsPerSample;              // bit depth
		char Subchunk2ID[4] = { 'd','a','t','a' }; // data sub-chunk id
		unsigned long Subchunk2Size;               // data chunk size in bytes
		WAV_HEADER(unsigned long cs = 36, unsigned long sc1s = 16, unsigned short af = 1, unsigned short nc = 1, unsigned long sr = 22050, unsigned long bps = 44100, unsigned short ba = 2, unsigned short bips = 16, unsigned long sc2s = 0) :ChunkSize(cs), Subchunk1Size(sc1s), AudioFormat(af), NumOfChan(nc), SamplesPerSec(sr), bytesPerSec(bps), blockAlign(ba), bitsPerSample(bips), Subchunk2Size(sc2s) {}
	};
	using iterator = int16_t*;
	// Empty wav with a caller-specified header and no data.
	Wav(unsigned long cs = 36, unsigned long sc1s = 16, unsigned short af = 1, unsigned short nc = 1, unsigned long sr = 22050, unsigned long bps = 44100, unsigned short ba = 2, unsigned short bips = 16, unsigned long sc2s = 0) :header({
		cs,
		sc1s,
		af,
		nc,
		sr,
		bps,
		ba,
		bips,
		sc2s
		}), Data(nullptr), StartPos(44) {
		dataSize = 0;
		SData = nullptr;
	}
	// Mono 16-bit PCM wav built from `length` bytes of raw sample data.
	Wav(unsigned long sr, unsigned long length, const void* data) :header({
		36,
		16,
		1,
		1,
		sr,
		sr * 2,
		2,
		16,
		length
		}), Data(new char[length + 1]), StartPos(44)
	{
		header.ChunkSize = 36 + length;
		memcpy(Data, data, length);
		SData = reinterpret_cast<int16_t*>(Data);
		dataSize = length / 2;
	}
	Wav(const wchar_t* Path);   // load from file (defined in Wav.cpp)
	Wav(const Wav& input);      // deep copy (defined in Wav.cpp)
	Wav(Wav&& input) noexcept;  // steals the buffer (defined in Wav.cpp)
	Wav& operator=(const Wav& input) = delete;
	Wav& operator=(Wav&& input) noexcept;
	~Wav() { destory(); }
	// Append another wav's data chunk (PCM, matching rate/channels).
	Wav& cat(const Wav& input);
	bool isEmpty() const { return this->header.Subchunk2Size == 0; }
	const char* getData() const { return Data; }
	char* getData() { return Data; }
	WAV_HEADER getHeader() const { return header; }
	WAV_HEADER& Header() { return header; }
	void destory() const { delete[] Data; }
	// Replace the data chunk with `length` bytes of `indata` at rate `sr`.
	void changeData(const void* indata,long length,int sr)
	{
		delete[] Data;
		Data = new char[length];
		memcpy(Data, indata, length);
		header.ChunkSize = 36 + length;
		header.Subchunk2Size = length;
		header.SamplesPerSec = sr;
		header.bytesPerSec = 2 * sr;
		// BUGFIX: SData/dataSize previously still referred to the freed
		// buffer, so operator[]/begin()/getDataLen() dangled after changeData.
		SData = reinterpret_cast<int16_t*>(Data);
		dataSize = static_cast<size_t>(length) / 2;
	}
	// Sample access; out-of-range indices clamp to the last sample.
	int16_t& operator[](const size_t index) const
	{
		if (index < dataSize)
			return *(SData + index);
		return *(SData + dataSize - 1);
	}
	iterator begin() const
	{
		return reinterpret_cast<int16_t*>(Data);
	}
	iterator end() const
	{
		return reinterpret_cast<int16_t*>(Data + header.Subchunk2Size);
	}
	// Number of 16-bit samples in the data chunk.
	int64_t getDataLen()const
	{
		return static_cast<int64_t>(dataSize);
	}
private:
	WAV_HEADER header;
	char* Data;        // owning raw byte buffer (Subchunk2Size bytes)
	int16_t* SData;    // 16-bit view over Data
	size_t dataSize;   // sample count (Subchunk2Size / 2)
	int StartPos;      // file offset of the first data byte when loaded
};
|
CppDataProcess/readme.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## F0Preprocess
|
| 2 |
+
请前往 https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder 下载PyWorld的源代码并编译出静态库并链接到你的项目之中,然后调用此头文件
|
| 3 |
+
|
| 4 |
+
## Slicer
|
| 5 |
+
一个简单的切片机
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
~~上面的东西是直接从MoeSS的代码里面抽出来的,可以作为预置预处理的替代品()~~
|
cluster/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from sklearn.cluster import KMeans
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def get_cluster_model(ckpt_path):
    """Load per-speaker KMeans models from a checkpoint saved by km_train.

    The checkpoint maps speaker name -> dict of the sklearn KMeans attributes
    needed for prediction; each model is rebuilt by injecting those attributes
    directly into a fresh KMeans instance, bypassing fit().
    """
    checkpoint = torch.load(ckpt_path)
    kmeans_dict = {}
    for spk, ckpt in checkpoint.items():
        # NOTE(review): n_features_in_ is passed as the n_clusters argument
        # here; harmless for predict() since the real state is injected below,
        # but confirm it is intentional.
        km = KMeans(ckpt["n_features_in_"])
        km.__dict__["n_features_in_"] = ckpt["n_features_in_"]
        km.__dict__["_n_threads"] = ckpt["_n_threads"]
        km.__dict__["cluster_centers_"] = ckpt["cluster_centers_"]
        kmeans_dict[spk] = km
    return kmeans_dict
|
| 15 |
+
|
| 16 |
+
def get_cluster_result(model, x, speaker):
    """Predict the cluster index of each frame.

    model: dict mapping speaker name -> fitted KMeans
    x: np.array [t, 256]
    return cluster class result
    """
    km = model[speaker]
    return km.predict(x)
|
| 22 |
+
|
| 23 |
+
def get_cluster_center_result(model, x, speaker):
    """Map each frame of x (np.array [t, 256]) to its nearest cluster center."""
    km = model[speaker]
    labels = km.predict(x)
    return km.cluster_centers_[labels]
|
| 27 |
+
|
| 28 |
+
def get_center(model, x, speaker):
    """Return the cluster center row(s) at index/indices ``x`` for ``speaker``."""
    centers = model[speaker].cluster_centers_
    return centers[x]
|
cluster/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (1.09 kB). View file
|
|
|
cluster/__pycache__/kmeans.cpython-38.pyc
ADDED
|
Binary file (6.95 kB). View file
|
|
|
cluster/km_train.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time,pdb
|
| 2 |
+
import tqdm
|
| 3 |
+
from time import time as ttime
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import logging
|
| 7 |
+
import argparse
|
| 8 |
+
from cluster.kmeans import KMeansGPU
|
| 9 |
+
import torch
|
| 10 |
+
import numpy as np
|
| 11 |
+
from sklearn.cluster import KMeans,MiniBatchKMeans
|
| 12 |
+
|
| 13 |
+
logging.basicConfig(level=logging.INFO)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
from time import time as ttime
|
| 16 |
+
import pynvml,torch
|
| 17 |
+
|
| 18 |
+
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):  # gpu_minibatch真拉,虽然库支持但是也不考虑
    """Fit a KMeans model over all *.soft.pt feature files in ``in_dir``.

    in_dir: Path to one speaker's feature directory.
    n_clusters: number of clusters to fit.
    use_minibatch: use sklearn MiniBatchKMeans (CPU path only).
    use_gpu: fit with the torch-based KMeansGPU implementation instead.
    Returns the dict of attributes get_cluster_model needs to rebuild the
    model for predict(): n_features_in_, _n_threads, cluster_centers_.
    """
    logger.info(f"Loading features from {in_dir}")
    features = []
    nums = 0
    for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
        features.append(torch.load(path, map_location="cpu").squeeze(0).numpy().T)
        nums += 1  # BUGFIX: the file counter was never incremented
    features = np.concatenate(features, axis=0)
    print(nums, features.nbytes / 1024**2, "MB , shape:", features.shape, features.dtype)
    features = features.astype(np.float32)
    logger.info(f"Clustering features of shape: {features.shape}")
    t = time.time()
    if use_gpu == False:
        if use_minibatch:
            kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
        else:
            kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
    else:
        kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0, max_iter=500, tol=1e-2)
        features = torch.from_numpy(features)
        labels = kmeans.fit_predict(features)

    print(time.time() - t, "s")

    x = {
        # BUGFIX: the GPU path stored the sample count (shape[0]) instead of
        # the feature dimension (shape[1]); sklearn's predict() validates its
        # input against n_features_in_, so the wrong value breaks inference in
        # get_cluster_model/get_cluster_result.
        "n_features_in_": kmeans.n_features_in_ if use_gpu == False else features.shape[1],
        "_n_threads": kmeans._n_threads if use_gpu == False else 4,
        "cluster_centers_": kmeans.cluster_centers_ if use_gpu == False else kmeans.centroids.cpu().numpy(),
    }
    print("end")

    return x
|
| 52 |
+
|
| 53 |
+
if __name__ == "__main__":
    # CLI entry point: fit one KMeans model per speaker sub-directory of the
    # dataset and bundle all of them into a single checkpoint file.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=Path, default="./dataset/44k",
                        help='path of training data directory')
    parser.add_argument('--output', type=Path, default="logs/44k",
                        help='path of model output directory')

    args = parser.parse_args()

    checkpoint_dir = args.output
    dataset = args.dataset
    n_clusters = 1000

    ckpt = {}
    for spk in os.listdir(dataset):
        # Each sub-directory is treated as a speaker with its own features.
        if os.path.isdir(dataset/spk):
            print(f"train kmeans for {spk}...")
            in_dir = dataset/spk
            x = train_cluster(in_dir, n_clusters,use_minibatch=False,verbose=False,use_gpu=True)
            ckpt[spk] = x

    # Saved as {speaker: {n_features_in_, _n_threads, cluster_centers_}};
    # consumed by cluster.get_cluster_model.
    checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
    checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
    torch.save(
        ckpt,
        checkpoint_path,
    )
|
| 80 |
+
|
cluster/kmeans.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from time import time
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pynvml
|
| 5 |
+
import torch
|
| 6 |
+
from torch.nn.functional import normalize
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# device=torch.device("cuda:0")
|
| 10 |
+
def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
|
| 11 |
+
""" Picks k points in the data based on the kmeans++ method.
|
| 12 |
+
|
| 13 |
+
Parameters
|
| 14 |
+
----------
|
| 15 |
+
data : torch.Tensor
|
| 16 |
+
Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
|
| 17 |
+
data, rank 2 multidimensional data, in which case one
|
| 18 |
+
row is one observation.
|
| 19 |
+
k : int
|
| 20 |
+
Number of samples to generate.
|
| 21 |
+
sample_size : int
|
| 22 |
+
sample data to avoid memory overflow during calculation
|
| 23 |
+
|
| 24 |
+
Returns
|
| 25 |
+
-------
|
| 26 |
+
init : ndarray
|
| 27 |
+
A 'k' by 'N' containing the initial centroids.
|
| 28 |
+
|
| 29 |
+
References
|
| 30 |
+
----------
|
| 31 |
+
.. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
|
| 32 |
+
careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
|
| 33 |
+
on Discrete Algorithms, 2007.
|
| 34 |
+
.. [2] scipy/cluster/vq.py: _kpp
|
| 35 |
+
"""
|
| 36 |
+
batch_size=data.shape[0]
|
| 37 |
+
if batch_size>sample_size:
|
| 38 |
+
data = data[torch.randint(0, batch_size,[sample_size], device=data.device)]
|
| 39 |
+
dims = data.shape[1] if len(data.shape) > 1 else 1
|
| 40 |
+
init = torch.zeros((k, dims)).to(data.device)
|
| 41 |
+
r = torch.distributions.uniform.Uniform(0, 1)
|
| 42 |
+
for i in range(k):
|
| 43 |
+
if i == 0:
|
| 44 |
+
init[i, :] = data[torch.randint(data.shape[0], [1])]
|
| 45 |
+
else:
|
| 46 |
+
D2 = torch.cdist(init[:i, :][None, :], data[None, :], p=2)[0].amin(dim=0)
|
| 47 |
+
probs = D2 / torch.sum(D2)
|
| 48 |
+
cumprobs = torch.cumsum(probs, dim=0)
|
| 49 |
+
init[i, :] = data[torch.searchsorted(cumprobs, r.sample([1]).to(data.device))]
|
| 50 |
+
return init
|
| 51 |
+
class KMeansGPU:
|
| 52 |
+
'''
|
| 53 |
+
Kmeans clustering algorithm implemented with PyTorch
|
| 54 |
+
|
| 55 |
+
Parameters:
|
| 56 |
+
n_clusters: int,
|
| 57 |
+
Number of clusters
|
| 58 |
+
|
| 59 |
+
max_iter: int, default: 100
|
| 60 |
+
Maximum number of iterations
|
| 61 |
+
|
| 62 |
+
tol: float, default: 0.0001
|
| 63 |
+
Tolerance
|
| 64 |
+
|
| 65 |
+
verbose: int, default: 0
|
| 66 |
+
Verbosity
|
| 67 |
+
|
| 68 |
+
mode: {'euclidean', 'cosine'}, default: 'euclidean'
|
| 69 |
+
Type of distance measure
|
| 70 |
+
|
| 71 |
+
init_method: {'random', 'point', '++'}
|
| 72 |
+
Type of initialization
|
| 73 |
+
|
| 74 |
+
minibatch: {None, int}, default: None
|
| 75 |
+
Batch size of MinibatchKmeans algorithm
|
| 76 |
+
if None perform full KMeans algorithm
|
| 77 |
+
|
| 78 |
+
Attributes:
|
| 79 |
+
centroids: torch.Tensor, shape: [n_clusters, n_features]
|
| 80 |
+
cluster centroids
|
| 81 |
+
'''
|
| 82 |
+
def __init__(self, n_clusters, max_iter=200, tol=1e-4, verbose=0, mode="euclidean",device=torch.device("cuda:0")):
|
| 83 |
+
self.n_clusters = n_clusters
|
| 84 |
+
self.max_iter = max_iter
|
| 85 |
+
self.tol = tol
|
| 86 |
+
self.verbose = verbose
|
| 87 |
+
self.mode = mode
|
| 88 |
+
self.device=device
|
| 89 |
+
pynvml.nvmlInit()
|
| 90 |
+
gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(device.index)
|
| 91 |
+
info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
|
| 92 |
+
self.minibatch=int(33e6/self.n_clusters*info.free/ 1024 / 1024 / 1024)
|
| 93 |
+
print("free_mem/GB:",info.free/ 1024 / 1024 / 1024,"minibatch:",self.minibatch)
|
| 94 |
+
|
| 95 |
+
@staticmethod
|
| 96 |
+
def cos_sim(a, b):
|
| 97 |
+
"""
|
| 98 |
+
Compute cosine similarity of 2 sets of vectors
|
| 99 |
+
|
| 100 |
+
Parameters:
|
| 101 |
+
a: torch.Tensor, shape: [m, n_features]
|
| 102 |
+
|
| 103 |
+
b: torch.Tensor, shape: [n, n_features]
|
| 104 |
+
"""
|
| 105 |
+
return normalize(a, dim=-1) @ normalize(b, dim=-1).transpose(-2, -1)
|
| 106 |
+
|
| 107 |
+
@staticmethod
|
| 108 |
+
def euc_sim(a, b):
|
| 109 |
+
"""
|
| 110 |
+
Compute euclidean similarity of 2 sets of vectors
|
| 111 |
+
Parameters:
|
| 112 |
+
a: torch.Tensor, shape: [m, n_features]
|
| 113 |
+
b: torch.Tensor, shape: [n, n_features]
|
| 114 |
+
"""
|
| 115 |
+
return 2 * a @ b.transpose(-2, -1) -(a**2).sum(dim=1)[..., :, None] - (b**2).sum(dim=1)[..., None, :]
|
| 116 |
+
|
| 117 |
+
def max_sim(self, a, b):
|
| 118 |
+
"""
|
| 119 |
+
Compute maximum similarity (or minimum distance) of each vector
|
| 120 |
+
in a with all of the vectors in b
|
| 121 |
+
Parameters:
|
| 122 |
+
a: torch.Tensor, shape: [m, n_features]
|
| 123 |
+
b: torch.Tensor, shape: [n, n_features]
|
| 124 |
+
"""
|
| 125 |
+
if self.mode == 'cosine':
|
| 126 |
+
sim_func = self.cos_sim
|
| 127 |
+
elif self.mode == 'euclidean':
|
| 128 |
+
sim_func = self.euc_sim
|
| 129 |
+
sim = sim_func(a, b)
|
| 130 |
+
max_sim_v, max_sim_i = sim.max(dim=-1)
|
| 131 |
+
return max_sim_v, max_sim_i
|
| 132 |
+
|
| 133 |
+
def fit_predict(self, X):
|
| 134 |
+
"""
|
| 135 |
+
Combination of fit() and predict() methods.
|
| 136 |
+
This is faster than calling fit() and predict() seperately.
|
| 137 |
+
Parameters:
|
| 138 |
+
X: torch.Tensor, shape: [n_samples, n_features]
|
| 139 |
+
centroids: {torch.Tensor, None}, default: None
|
| 140 |
+
if given, centroids will be initialized with given tensor
|
| 141 |
+
if None, centroids will be randomly chosen from X
|
| 142 |
+
Return:
|
| 143 |
+
labels: torch.Tensor, shape: [n_samples]
|
| 144 |
+
|
| 145 |
+
mini_=33kk/k*remain
|
| 146 |
+
mini=min(mini_,fea_shape)
|
| 147 |
+
offset=log2(k/1000)*1.5
|
| 148 |
+
kpp_all=min(mini_*10/offset,fea_shape)
|
| 149 |
+
kpp_sample=min(mini_/12/offset,fea_shape)
|
| 150 |
+
"""
|
| 151 |
+
assert isinstance(X, torch.Tensor), "input must be torch.Tensor"
|
| 152 |
+
assert X.dtype in [torch.half, torch.float, torch.double], "input must be floating point"
|
| 153 |
+
assert X.ndim == 2, "input must be a 2d tensor with shape: [n_samples, n_features] "
|
| 154 |
+
# print("verbose:%s"%self.verbose)
|
| 155 |
+
|
| 156 |
+
offset = np.power(1.5,np.log(self.n_clusters / 1000))/np.log(2)
|
| 157 |
+
with torch.no_grad():
|
| 158 |
+
batch_size= X.shape[0]
|
| 159 |
+
# print(self.minibatch, int(self.minibatch * 10 / offset), batch_size)
|
| 160 |
+
start_time = time()
|
| 161 |
+
if (self.minibatch*10//offset< batch_size):
|
| 162 |
+
x = X[torch.randint(0, batch_size,[int(self.minibatch*10/offset)])].to(self.device)
|
| 163 |
+
else:
|
| 164 |
+
x = X.to(self.device)
|
| 165 |
+
# print(x.device)
|
| 166 |
+
self.centroids = _kpp(x, self.n_clusters, min(int(self.minibatch/12/offset),batch_size))
|
| 167 |
+
del x
|
| 168 |
+
torch.cuda.empty_cache()
|
| 169 |
+
# self.centroids = self.centroids.to(self.device)
|
| 170 |
+
num_points_in_clusters = torch.ones(self.n_clusters, device=self.device, dtype=X.dtype)#全1
|
| 171 |
+
closest = None#[3098036]#int64
|
| 172 |
+
if(self.minibatch>=batch_size//2 and self.minibatch<batch_size):
|
| 173 |
+
X = X[torch.randint(0, batch_size,[self.minibatch])].to(self.device)
|
| 174 |
+
elif(self.minibatch>=batch_size):
|
| 175 |
+
X=X.to(self.device)
|
| 176 |
+
for i in range(self.max_iter):
|
| 177 |
+
iter_time = time()
|
| 178 |
+
if self.minibatch<batch_size//2:#可用minibatch数太小,每次都得从内存倒腾到显存
|
| 179 |
+
x = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
|
| 180 |
+
else:#否则直接全部缓存
|
| 181 |
+
x = X
|
| 182 |
+
|
| 183 |
+
closest = self.max_sim(a=x, b=self.centroids)[1].to(torch.int16)#[3098036]#int64#0~999
|
| 184 |
+
matched_clusters, counts = closest.unique(return_counts=True)#int64#1k
|
| 185 |
+
expanded_closest = closest[None].expand(self.n_clusters, -1)#[1000, 3098036]#int16#0~999
|
| 186 |
+
mask = (expanded_closest==torch.arange(self.n_clusters, device=self.device)[:, None]).to(X.dtype)#==后者是int64*1000
|
| 187 |
+
c_grad = mask @ x / mask.sum(-1)[..., :, None]
|
| 188 |
+
c_grad[c_grad!=c_grad] = 0 # remove NaNs
|
| 189 |
+
error = (c_grad - self.centroids).pow(2).sum()
|
| 190 |
+
if self.minibatch is not None:
|
| 191 |
+
lr = 1/num_points_in_clusters[:,None] * 0.9 + 0.1
|
| 192 |
+
else:
|
| 193 |
+
lr = 1
|
| 194 |
+
matched_clusters=matched_clusters.long()
|
| 195 |
+
num_points_in_clusters[matched_clusters] += counts#IndexError: tensors used as indices must be long, byte or bool tensors
|
| 196 |
+
self.centroids = self.centroids * (1-lr) + c_grad * lr
|
| 197 |
+
if self.verbose >= 2:
|
| 198 |
+
print('iter:', i, 'error:', error.item(), 'time spent:', round(time()-iter_time, 4))
|
| 199 |
+
if error <= self.tol:
|
| 200 |
+
break
|
| 201 |
+
|
| 202 |
+
if self.verbose >= 1:
|
| 203 |
+
print(f'used {i+1} iterations ({round(time()-start_time, 4)}s) to cluster {batch_size} items into {self.n_clusters} clusters')
|
| 204 |
+
return closest
|
cluster/train_cluster.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
import tqdm
|
| 10 |
+
from kmeans import KMeansGPU
|
| 11 |
+
from sklearn.cluster import KMeans, MiniBatchKMeans
|
| 12 |
+
|
| 13 |
+
logging.basicConfig(level=logging.INFO)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=False):#gpu_minibatch真拉,虽然库支持但是也不考虑
|
| 17 |
+
if str(in_dir).endswith(".ipynb_checkpoints"):
|
| 18 |
+
logger.info(f"Ignore {in_dir}")
|
| 19 |
+
|
| 20 |
+
logger.info(f"Loading features from {in_dir}")
|
| 21 |
+
features = []
|
| 22 |
+
nums = 0
|
| 23 |
+
for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
|
| 24 |
+
# for name in os.listdir(in_dir):
|
| 25 |
+
# path="%s/%s"%(in_dir,name)
|
| 26 |
+
features.append(torch.load(path,map_location="cpu").squeeze(0).numpy().T)
|
| 27 |
+
# print(features[-1].shape)
|
| 28 |
+
features = np.concatenate(features, axis=0)
|
| 29 |
+
print(nums, features.nbytes/ 1024**2, "MB , shape:",features.shape, features.dtype)
|
| 30 |
+
features = features.astype(np.float32)
|
| 31 |
+
logger.info(f"Clustering features of shape: {features.shape}")
|
| 32 |
+
t = time.time()
|
| 33 |
+
if(use_gpu is False):
|
| 34 |
+
if use_minibatch:
|
| 35 |
+
kmeans = MiniBatchKMeans(n_clusters=n_clusters,verbose=verbose, batch_size=4096, max_iter=80).fit(features)
|
| 36 |
+
else:
|
| 37 |
+
kmeans = KMeans(n_clusters=n_clusters,verbose=verbose).fit(features)
|
| 38 |
+
else:
|
| 39 |
+
kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0,max_iter=500,tol=1e-2)#
|
| 40 |
+
features=torch.from_numpy(features)#.to(device)
|
| 41 |
+
kmeans.fit_predict(features)#
|
| 42 |
+
|
| 43 |
+
print(time.time()-t, "s")
|
| 44 |
+
|
| 45 |
+
x = {
|
| 46 |
+
"n_features_in_": kmeans.n_features_in_ if use_gpu is False else features.shape[1],
|
| 47 |
+
"_n_threads": kmeans._n_threads if use_gpu is False else 4,
|
| 48 |
+
"cluster_centers_": kmeans.cluster_centers_ if use_gpu is False else kmeans.centroids.cpu().numpy(),
|
| 49 |
+
}
|
| 50 |
+
print("end")
|
| 51 |
+
|
| 52 |
+
return x
|
| 53 |
+
|
| 54 |
+
if __name__ == "__main__":
|
| 55 |
+
parser = argparse.ArgumentParser()
|
| 56 |
+
parser.add_argument('--dataset', type=Path, default="./dataset/44k",
|
| 57 |
+
help='path of training data directory')
|
| 58 |
+
parser.add_argument('--output', type=Path, default="logs/44k",
|
| 59 |
+
help='path of model output directory')
|
| 60 |
+
parser.add_argument('--gpu',action='store_true', default=False ,
|
| 61 |
+
help='to use GPU')
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
args = parser.parse_args()
|
| 65 |
+
|
| 66 |
+
checkpoint_dir = args.output
|
| 67 |
+
dataset = args.dataset
|
| 68 |
+
use_gpu = args.gpu
|
| 69 |
+
n_clusters = 10000
|
| 70 |
+
|
| 71 |
+
ckpt = {}
|
| 72 |
+
for spk in os.listdir(dataset):
|
| 73 |
+
if os.path.isdir(dataset/spk):
|
| 74 |
+
print(f"train kmeans for {spk}...")
|
| 75 |
+
in_dir = dataset/spk
|
| 76 |
+
x = train_cluster(in_dir, n_clusters,use_minibatch=False,verbose=False,use_gpu=use_gpu)
|
| 77 |
+
ckpt[spk] = x
|
| 78 |
+
|
| 79 |
+
checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
|
| 80 |
+
checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
|
| 81 |
+
torch.save(
|
| 82 |
+
ckpt,
|
| 83 |
+
checkpoint_path,
|
| 84 |
+
)
|
| 85 |
+
|
configs/config.json
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 20,
|
| 4 |
+
"eval_interval": 20,
|
| 5 |
+
"seed": 1234,
|
| 6 |
+
"epochs": 10000,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 6,
|
| 14 |
+
"fp16_run": false,
|
| 15 |
+
"lr_decay": 0.999875,
|
| 16 |
+
"segment_size": 10240,
|
| 17 |
+
"init_lr_ratio": 1,
|
| 18 |
+
"warmup_epochs": 0,
|
| 19 |
+
"c_mel": 45,
|
| 20 |
+
"c_kl": 1.0,
|
| 21 |
+
"use_sr": true,
|
| 22 |
+
"max_speclen": 512,
|
| 23 |
+
"port": "8001",
|
| 24 |
+
"keep_ckpts": 3
|
| 25 |
+
},
|
| 26 |
+
"data": {
|
| 27 |
+
"training_files": "filelists/train.txt",
|
| 28 |
+
"validation_files": "filelists/val.txt",
|
| 29 |
+
"max_wav_value": 32768.0,
|
| 30 |
+
"sampling_rate": 44100,
|
| 31 |
+
"filter_length": 2048,
|
| 32 |
+
"hop_length": 512,
|
| 33 |
+
"win_length": 2048,
|
| 34 |
+
"n_mel_channels": 80,
|
| 35 |
+
"mel_fmin": 0.0,
|
| 36 |
+
"mel_fmax": 22050
|
| 37 |
+
},
|
| 38 |
+
"model": {
|
| 39 |
+
"inter_channels": 192,
|
| 40 |
+
"hidden_channels": 192,
|
| 41 |
+
"filter_channels": 768,
|
| 42 |
+
"n_heads": 2,
|
| 43 |
+
"n_layers": 6,
|
| 44 |
+
"kernel_size": 3,
|
| 45 |
+
"p_dropout": 0.1,
|
| 46 |
+
"resblock": "1",
|
| 47 |
+
"resblock_kernel_sizes": [
|
| 48 |
+
3,
|
| 49 |
+
7,
|
| 50 |
+
11
|
| 51 |
+
],
|
| 52 |
+
"resblock_dilation_sizes": [
|
| 53 |
+
[
|
| 54 |
+
1,
|
| 55 |
+
3,
|
| 56 |
+
5
|
| 57 |
+
],
|
| 58 |
+
[
|
| 59 |
+
1,
|
| 60 |
+
3,
|
| 61 |
+
5
|
| 62 |
+
],
|
| 63 |
+
[
|
| 64 |
+
1,
|
| 65 |
+
3,
|
| 66 |
+
5
|
| 67 |
+
]
|
| 68 |
+
],
|
| 69 |
+
"upsample_rates": [
|
| 70 |
+
8,
|
| 71 |
+
8,
|
| 72 |
+
2,
|
| 73 |
+
2,
|
| 74 |
+
2
|
| 75 |
+
],
|
| 76 |
+
"upsample_initial_channel": 512,
|
| 77 |
+
"upsample_kernel_sizes": [
|
| 78 |
+
16,
|
| 79 |
+
16,
|
| 80 |
+
4,
|
| 81 |
+
4,
|
| 82 |
+
4
|
| 83 |
+
],
|
| 84 |
+
"n_layers_q": 3,
|
| 85 |
+
"use_spectral_norm": false,
|
| 86 |
+
"gin_channels": 256,
|
| 87 |
+
"ssl_dim": 256,
|
| 88 |
+
"n_speakers": 200,
|
| 89 |
+
"speech_encoder": "vec256l9"
|
| 90 |
+
},
|
| 91 |
+
"spk": {
|
| 92 |
+
"Shengshuyan": 0
|
| 93 |
+
}
|
| 94 |
+
}
|
configs/diffusion.yaml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data:
|
| 2 |
+
sampling_rate: 44100
|
| 3 |
+
block_size: 512 # Equal to hop_length
|
| 4 |
+
duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip
|
| 5 |
+
encoder: 'vec768l12' # 'hubertsoft', 'vec256l9', 'vec768l12'
|
| 6 |
+
cnhubertsoft_gate: 10
|
| 7 |
+
encoder_sample_rate: 16000
|
| 8 |
+
encoder_hop_size: 320
|
| 9 |
+
encoder_out_channels: 768 # 256 if using 'hubertsoft'
|
| 10 |
+
training_files: "filelists/train.txt"
|
| 11 |
+
validation_files: "filelists/val.txt"
|
| 12 |
+
extensions: # List of extension included in the data collection
|
| 13 |
+
- wav
|
| 14 |
+
model:
|
| 15 |
+
type: 'Diffusion'
|
| 16 |
+
n_layers: 20
|
| 17 |
+
n_chans: 512
|
| 18 |
+
n_hidden: 256
|
| 19 |
+
use_pitch_aug: true
|
| 20 |
+
n_spk: 1 # max number of different speakers
|
| 21 |
+
device: cuda
|
| 22 |
+
vocoder:
|
| 23 |
+
type: 'nsf-hifigan'
|
| 24 |
+
ckpt: 'pretrain/nsf_hifigan/model'
|
| 25 |
+
infer:
|
| 26 |
+
speedup: 10
|
| 27 |
+
method: 'dpm-solver' # 'pndm' or 'dpm-solver'
|
| 28 |
+
env:
|
| 29 |
+
expdir: logs/44k/diffusion
|
| 30 |
+
gpu_id: 0
|
| 31 |
+
train:
|
| 32 |
+
num_workers: 2 # If your cpu and gpu are both very strong, set to 0 may be faster!
|
| 33 |
+
amp_dtype: fp32 # fp32, fp16 or bf16 (fp16 or bf16 may be faster if it is supported by your gpu)
|
| 34 |
+
batch_size: 48
|
| 35 |
+
cache_all_data: true # Save Internal-Memory or Graphics-Memory if it is false, but may be slow
|
| 36 |
+
cache_device: 'cpu' # Set to 'cuda' to cache the data into the Graphics-Memory, fastest speed for strong gpu
|
| 37 |
+
cache_fp16: true
|
| 38 |
+
epochs: 100000
|
| 39 |
+
interval_log: 10
|
| 40 |
+
interval_val: 2000
|
| 41 |
+
interval_force_save: 10000
|
| 42 |
+
lr: 0.0002
|
| 43 |
+
decay_step: 100000
|
| 44 |
+
gamma: 0.5
|
| 45 |
+
weight_decay: 0
|
| 46 |
+
save_opt: false
|
| 47 |
+
spk:
|
| 48 |
+
'nyaru': 0
|
configs_template/config_template.json
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 800,
|
| 5 |
+
"seed": 1234,
|
| 6 |
+
"epochs": 10000,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 6,
|
| 14 |
+
"fp16_run": false,
|
| 15 |
+
"half_type": "fp16",
|
| 16 |
+
"lr_decay": 0.999875,
|
| 17 |
+
"segment_size": 10240,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"use_sr": true,
|
| 23 |
+
"max_speclen": 512,
|
| 24 |
+
"port": "8001",
|
| 25 |
+
"keep_ckpts": 3,
|
| 26 |
+
"all_in_mem": false,
|
| 27 |
+
"vol_aug":false
|
| 28 |
+
},
|
| 29 |
+
"data": {
|
| 30 |
+
"training_files": "filelists/train.txt",
|
| 31 |
+
"validation_files": "filelists/val.txt",
|
| 32 |
+
"max_wav_value": 32768.0,
|
| 33 |
+
"sampling_rate": 44100,
|
| 34 |
+
"filter_length": 2048,
|
| 35 |
+
"hop_length": 512,
|
| 36 |
+
"win_length": 2048,
|
| 37 |
+
"n_mel_channels": 80,
|
| 38 |
+
"mel_fmin": 0.0,
|
| 39 |
+
"mel_fmax": 22050,
|
| 40 |
+
"unit_interpolate_mode":"nearest"
|
| 41 |
+
},
|
| 42 |
+
"model": {
|
| 43 |
+
"inter_channels": 192,
|
| 44 |
+
"hidden_channels": 192,
|
| 45 |
+
"filter_channels": 768,
|
| 46 |
+
"n_heads": 2,
|
| 47 |
+
"n_layers": 6,
|
| 48 |
+
"kernel_size": 3,
|
| 49 |
+
"p_dropout": 0.1,
|
| 50 |
+
"resblock": "1",
|
| 51 |
+
"resblock_kernel_sizes": [3,7,11],
|
| 52 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
| 53 |
+
"upsample_rates": [ 8, 8, 2, 2, 2],
|
| 54 |
+
"upsample_initial_channel": 512,
|
| 55 |
+
"upsample_kernel_sizes": [16,16, 4, 4, 4],
|
| 56 |
+
"n_layers_q": 3,
|
| 57 |
+
"n_flow_layer": 4,
|
| 58 |
+
"use_spectral_norm": false,
|
| 59 |
+
"gin_channels": 768,
|
| 60 |
+
"ssl_dim": 768,
|
| 61 |
+
"n_speakers": 200,
|
| 62 |
+
"vocoder_name":"nsf-hifigan",
|
| 63 |
+
"speech_encoder":"vec768l12",
|
| 64 |
+
"speaker_embedding":false,
|
| 65 |
+
"vol_embedding":false,
|
| 66 |
+
"use_depthwise_conv":false,
|
| 67 |
+
"flow_share_parameter": false,
|
| 68 |
+
"use_automatic_f0_prediction": true
|
| 69 |
+
},
|
| 70 |
+
"spk": {
|
| 71 |
+
"nyaru": 0,
|
| 72 |
+
"huiyu": 1,
|
| 73 |
+
"nen": 2,
|
| 74 |
+
"paimon": 3,
|
| 75 |
+
"yunhao": 4
|
| 76 |
+
}
|
| 77 |
+
}
|
configs_template/config_tiny_template.json
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 800,
|
| 5 |
+
"seed": 1234,
|
| 6 |
+
"epochs": 10000,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 6,
|
| 14 |
+
"fp16_run": false,
|
| 15 |
+
"half_type": "fp16",
|
| 16 |
+
"lr_decay": 0.999875,
|
| 17 |
+
"segment_size": 10240,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"use_sr": true,
|
| 23 |
+
"max_speclen": 512,
|
| 24 |
+
"port": "8001",
|
| 25 |
+
"keep_ckpts": 3,
|
| 26 |
+
"all_in_mem": false,
|
| 27 |
+
"vol_aug":false
|
| 28 |
+
},
|
| 29 |
+
"data": {
|
| 30 |
+
"training_files": "filelists/train.txt",
|
| 31 |
+
"validation_files": "filelists/val.txt",
|
| 32 |
+
"max_wav_value": 32768.0,
|
| 33 |
+
"sampling_rate": 44100,
|
| 34 |
+
"filter_length": 2048,
|
| 35 |
+
"hop_length": 512,
|
| 36 |
+
"win_length": 2048,
|
| 37 |
+
"n_mel_channels": 80,
|
| 38 |
+
"mel_fmin": 0.0,
|
| 39 |
+
"mel_fmax": 22050,
|
| 40 |
+
"unit_interpolate_mode":"nearest"
|
| 41 |
+
},
|
| 42 |
+
"model": {
|
| 43 |
+
"inter_channels": 192,
|
| 44 |
+
"hidden_channels": 192,
|
| 45 |
+
"filter_channels": 512,
|
| 46 |
+
"n_heads": 2,
|
| 47 |
+
"n_layers": 6,
|
| 48 |
+
"kernel_size": 3,
|
| 49 |
+
"p_dropout": 0.1,
|
| 50 |
+
"resblock": "1",
|
| 51 |
+
"resblock_kernel_sizes": [3,7,11],
|
| 52 |
+
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
| 53 |
+
"upsample_rates": [ 8, 8, 2, 2, 2],
|
| 54 |
+
"upsample_initial_channel": 400,
|
| 55 |
+
"upsample_kernel_sizes": [16,16, 4, 4, 4],
|
| 56 |
+
"n_layers_q": 3,
|
| 57 |
+
"n_flow_layer": 4,
|
| 58 |
+
"use_spectral_norm": false,
|
| 59 |
+
"gin_channels": 768,
|
| 60 |
+
"ssl_dim": 768,
|
| 61 |
+
"n_speakers": 200,
|
| 62 |
+
"vocoder_name":"nsf-hifigan",
|
| 63 |
+
"speech_encoder":"vec768l12",
|
| 64 |
+
"speaker_embedding":false,
|
| 65 |
+
"vol_embedding":false,
|
| 66 |
+
"use_depthwise_conv":true,
|
| 67 |
+
"flow_share_parameter": true,
|
| 68 |
+
"use_automatic_f0_prediction": true
|
| 69 |
+
},
|
| 70 |
+
"spk": {
|
| 71 |
+
"nyaru": 0,
|
| 72 |
+
"huiyu": 1,
|
| 73 |
+
"nen": 2,
|
| 74 |
+
"paimon": 3,
|
| 75 |
+
"yunhao": 4
|
| 76 |
+
}
|
| 77 |
+
}
|
configs_template/diffusion_template.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data:
|
| 2 |
+
sampling_rate: 44100
|
| 3 |
+
block_size: 512 # Equal to hop_length
|
| 4 |
+
duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip
|
| 5 |
+
encoder: 'vec768l12' # 'hubertsoft', 'vec256l9', 'vec768l12'
|
| 6 |
+
cnhubertsoft_gate: 10
|
| 7 |
+
encoder_sample_rate: 16000
|
| 8 |
+
encoder_hop_size: 320
|
| 9 |
+
encoder_out_channels: 768 # 256 if using 'hubertsoft'
|
| 10 |
+
training_files: "filelists/train.txt"
|
| 11 |
+
validation_files: "filelists/val.txt"
|
| 12 |
+
extensions: # List of extension included in the data collection
|
| 13 |
+
- wav
|
| 14 |
+
unit_interpolate_mode: "nearest"
|
| 15 |
+
model:
|
| 16 |
+
type: 'Diffusion'
|
| 17 |
+
n_layers: 20
|
| 18 |
+
n_chans: 512
|
| 19 |
+
n_hidden: 256
|
| 20 |
+
use_pitch_aug: true
|
| 21 |
+
timesteps : 1000
|
| 22 |
+
k_step_max: 0 # must <= timesteps, If it is 0, train all
|
| 23 |
+
n_spk: 1 # max number of different speakers
|
| 24 |
+
device: cuda
|
| 25 |
+
vocoder:
|
| 26 |
+
type: 'nsf-hifigan'
|
| 27 |
+
ckpt: 'pretrain/nsf_hifigan/model'
|
| 28 |
+
infer:
|
| 29 |
+
speedup: 10
|
| 30 |
+
method: 'dpm-solver++' # 'pndm' or 'dpm-solver' or 'ddim' or 'unipc' or 'dpm-solver++'
|
| 31 |
+
env:
|
| 32 |
+
expdir: logs/44k/diffusion
|
| 33 |
+
gpu_id: 0
|
| 34 |
+
train:
|
| 35 |
+
num_workers: 4 # If your cpu and gpu are both very strong, set to 0 may be faster!
|
| 36 |
+
amp_dtype: fp32 # fp32, fp16 or bf16 (fp16 or bf16 may be faster if it is supported by your gpu)
|
| 37 |
+
batch_size: 48
|
| 38 |
+
cache_all_data: true # Save Internal-Memory or Graphics-Memory if it is false, but may be slow
|
| 39 |
+
cache_device: 'cpu' # Set to 'cuda' to cache the data into the Graphics-Memory, fastest speed for strong gpu
|
| 40 |
+
cache_fp16: true
|
| 41 |
+
epochs: 100000
|
| 42 |
+
interval_log: 10
|
| 43 |
+
interval_val: 2000
|
| 44 |
+
interval_force_save: 5000
|
| 45 |
+
lr: 0.0001
|
| 46 |
+
decay_step: 100000
|
| 47 |
+
gamma: 0.5
|
| 48 |
+
weight_decay: 0
|
| 49 |
+
save_opt: false
|
| 50 |
+
spk:
|
| 51 |
+
'nyaru': 0
|
dataset_raw/wav_structure.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
数据集准备
|
| 2 |
+
|
| 3 |
+
raw
|
| 4 |
+
├───speaker0
|
| 5 |
+
│ ├───xxx1-xxx1.wav
|
| 6 |
+
│ ├───...
|
| 7 |
+
│ └───Lxx-0xx8.wav
|
| 8 |
+
└───speaker1
|
| 9 |
+
├───xx2-0xxx2.wav
|
| 10 |
+
├───...
|
| 11 |
+
└───xxx7-xxx007.wav
|
| 12 |
+
|
| 13 |
+
此外还需要编辑config.json
|
| 14 |
+
|
| 15 |
+
"n_speakers": 10
|
| 16 |
+
|
| 17 |
+
"spk":{
|
| 18 |
+
"speaker0": 0,
|
| 19 |
+
"speaker1": 1,
|
| 20 |
+
}
|