Harshil748 committed on
Commit
9e5ede4
·
0 Parent(s):

Add all TTS models for 11 Indian languages

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. README.md +50 -0
  3. bho_female/.gitattributes +35 -0
  4. bho_female/README.md +3 -0
  5. bho_female/checkpoint_340000.pth +3 -0
  6. bho_female/config.json +257 -0
  7. bho_male/.gitattributes +35 -0
  8. bho_male/README.md +3 -0
  9. bho_male/checkpoint_200000.pth +3 -0
  10. bho_male/config.json +257 -0
  11. bn_female/bn_female_vits_30hrs.pt +3 -0
  12. bn_female/chars.txt +1 -0
  13. bn_female/jit_infer.py +32 -0
  14. bn_male/bn_male_vits_30hrs.pt +3 -0
  15. bn_male/chars.txt +1 -0
  16. bn_male/extra.py +787 -0
  17. bn_male/jit_infer.py +32 -0
  18. en_female/.gitattributes +35 -0
  19. en_female/README.md +3 -0
  20. en_female/chars.txt +1 -0
  21. en_female/en_female_vits_30hrs.pt +3 -0
  22. en_female/extra.py +787 -0
  23. en_female/jit_infer.py +33 -0
  24. en_male/.gitattributes +35 -0
  25. en_male/README.md +3 -0
  26. en_male/chars.txt +1 -0
  27. en_male/en_male_vits_30hrs.pt +3 -0
  28. en_male/extra.py +787 -0
  29. en_male/jit_infer.py +32 -0
  30. gu_mms/config.json +82 -0
  31. gu_mms/special_tokens_map.json +4 -0
  32. gu_mms/tokenizer_config.json +12 -0
  33. gu_mms/vocab.json +62 -0
  34. hi_female/chars.txt +1 -0
  35. hi_female/extra.py +787 -0
  36. hi_female/hi_female_vits_30hrs.pt +3 -0
  37. hi_female/jit_infer.py +32 -0
  38. hi_male/chars.txt +1 -0
  39. hi_male/extra.py +787 -0
  40. hi_male/hi_male_vits_30hrs.pt +3 -0
  41. hi_male/jit_infer.py +32 -0
  42. hne_female/.gitattributes +35 -0
  43. hne_female/README.md +3 -0
  44. hne_female/ch_female_vits_30hrs.pt +3 -0
  45. hne_female/chars.txt +1 -0
  46. hne_female/extra.py +787 -0
  47. hne_female/jit_infer.py +31 -0
  48. hne_male/.gitattributes +35 -0
  49. hne_male/README.md +3 -0
  50. hne_male/ch_male_vits_30hrs.pt +3 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.pt filter=lfs diff=lfs merge=lfs -text
2
+ *.pth filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - tts
5
+ - text-to-speech
6
+ - indian-languages
7
+ - vits
8
+ language:
9
+ - hi
10
+ - bn
11
+ - mr
12
+ - te
13
+ - kn
14
+ - en
15
+ - bho
16
+ - mai
17
+ - mag
18
+ - hne
19
+ - gu
20
+ ---
21
+
22
+ # VoiceAPI Models
23
+
24
+ TTS models for 11 Indian languages, 21 voices total.
25
+
26
+ ## Languages & Voices
27
+
28
+ | Language | Code | Female | Male |
29
+ |----------|------|--------|------|
30
+ | Hindi | hi | ✅ | ✅ |
31
+ | Bengali | bn | ✅ | ✅ |
32
+ | Marathi | mr | ✅ | ✅ |
33
+ | Telugu | te | ✅ | ✅ |
34
+ | Kannada | kn | ✅ | ✅ |
35
+ | English | en | ✅ | ✅ |
36
+ | Bhojpuri | bho | ✅ | ✅ |
37
+ | Maithili | mai | ✅ | ✅ |
38
+ | Magahi | mag | ✅ | ✅ |
39
+ | Chhattisgarhi | hne | ✅ | ✅ |
40
+ | Gujarati | gu | MMS | - |
41
+
42
+ ## Model Types
43
+
44
+ - **JIT Models** (.pt): SYSPIN VITS models (most languages)
45
+ - **Coqui Models** (.pth): Bhojpuri male/female
46
+ - **MMS**: Facebook MMS for Gujarati
47
+
48
+ ## Usage
49
+
50
+ These models are used by the [VoiceAPI](https://huggingface.co/spaces/Harshil748/VoiceAPI) TTS service.
bho_female/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bho_female/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ ---
bho_female/checkpoint_340000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2182258024b05f739bf79002cb52cfa863605d54ee2eee5b4a5cd1fbaac797ab
3
+ size 997764677
bho_female/config.json ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": ".",
3
+ "logger_uri": null,
4
+ "run_name": "vits_Bhojpuri_Female_30hrs",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "log_model_step": null,
13
+ "save_step": 20000,
14
+ "save_n_checkpoints": 1000,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": true,
20
+ "test_delay_epochs": -1,
21
+ "run_eval": true,
22
+ "run_eval_steps": null,
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "mixed_precision": true,
26
+ "epochs": 1000,
27
+ "batch_size": 40,
28
+ "eval_batch_size": 16,
29
+ "grad_clip": [
30
+ 1000,
31
+ 1000
32
+ ],
33
+ "scheduler_after_epoch": true,
34
+ "lr": 0.001,
35
+ "optimizer": "AdamW",
36
+ "optimizer_params": {
37
+ "betas": [
38
+ 0.8,
39
+ 0.99
40
+ ],
41
+ "eps": 1e-09,
42
+ "weight_decay": 0.01
43
+ },
44
+ "lr_scheduler": null,
45
+ "lr_scheduler_params": {},
46
+ "use_grad_scaler": false,
47
+ "cudnn_enable": true,
48
+ "cudnn_deterministic": false,
49
+ "cudnn_benchmark": false,
50
+ "training_seed": 54321,
51
+ "model": "vits",
52
+ "num_loader_workers": 8,
53
+ "num_eval_loader_workers": 4,
54
+ "use_noise_augment": false,
55
+ "audio": {
56
+ "fft_size": 1024,
57
+ "sample_rate": 22050,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "num_mels": 80,
61
+ "mel_fmin": 0,
62
+ "mel_fmax": null
63
+ },
64
+ "use_phonemes": false,
65
+ "phonemizer": null,
66
+ "phoneme_language": "en-us",
67
+ "compute_input_seq_cache": true,
68
+ "text_cleaner": "multilingual_cleaners",
69
+ "enable_eos_bos_chars": false,
70
+ "test_sentences_file": "",
71
+ "phoneme_cache_path": "./phoneme_cache",
72
+ "characters": {
73
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
74
+ "vocab_dict": null,
75
+ "pad": "<PAD>",
76
+ "eos": "<EOS>",
77
+ "bos": "<BOS>",
78
+ "blank": "<BLNK>",
79
+ "characters": "\u091a.\u0947\u0910\u0925\u092e\u0959\u091d\u0906\u0949?\u092d\u092a \u0939\u0928\u093d\u091f\u0940\u0938\u0935\u091b\u0923\u0921\u091e\u0926\u094b\u0915\u0924\u0948\u0943\u095b\u0941\u095e\u092c\u0908\u094c\u0927\u090b\u093e\u0922\u0907\u093c\u0902\u0937\u0920\u0905\u095c\u0913\u092f,\u093f\u0930\u0914\u0901\u092b\u0909\u0916\u0911\u094d\u0932\u091c\u090f\u090a\u0917\u0936\u095d\u0919\u0918\u0942",
80
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
81
+ "phonemes": null,
82
+ "is_unique": true,
83
+ "is_sorted": true
84
+ },
85
+ "add_blank": true,
86
+ "batch_group_size": 5,
87
+ "loss_masking": null,
88
+ "min_audio_len": 1,
89
+ "max_audio_len": Infinity,
90
+ "min_text_len": 1,
91
+ "max_text_len": Infinity,
92
+ "compute_f0": false,
93
+ "compute_energy": false,
94
+ "compute_linear_spec": true,
95
+ "precompute_num_workers": 0,
96
+ "start_by_longest": false,
97
+ "shuffle": false,
98
+ "drop_last": false,
99
+ "datasets": [
100
+ {
101
+ "formatter": "syspin",
102
+ "dataset_name": "",
103
+ "path": ".",
104
+ "meta_file_train": "../manifests/Bhojpuri_Female/30hrs.tsv",
105
+ "ignored_speakers": null,
106
+ "language": "",
107
+ "phonemizer": "",
108
+ "meta_file_val": "",
109
+ "meta_file_attn_mask": ""
110
+ }
111
+ ],
112
+ "test_sentences": [
113
+ [
114
+ "\u090f\u0928\u094d\u091f\u094d\u0930\u093e\u092a\u0940 \u0915\u0902\u092a\u094d\u092f\u0942\u091f\u093f\u0902\u0917 \u092e\u0947\u0902 \u090f\u0928\u094d\u091f\u094d\u0930\u094b\u092a\u0940 \u090a \u0911\u092a\u0930\u0947\u091f\u093f\u0902\u0917 \u0938\u093f\u0938\u094d\u091f\u092e \u0939 \u091c\u0947 \u092a\u0947 \u0938\u0930\u093e \u0915\u094d\u0930\u093f\u092a\u094d\u091f\u094b\u0917\u094d\u0930\u093e\u092b\u093f\u0915 \u092b\u0902\u0915\u094d\u0936\u0928 \u0938\u092c \u0915\u093e\u092e \u0915\u0930\u0947 \u0932\u0947\u0902",
115
+ "Bhojpuri_Female",
116
+ null,
117
+ "bh"
118
+ ]
119
+ ],
120
+ "eval_split_max_size": null,
121
+ "eval_split_size": 0.01,
122
+ "use_speaker_weighted_sampler": false,
123
+ "speaker_weighted_sampler_alpha": 1.0,
124
+ "use_language_weighted_sampler": false,
125
+ "language_weighted_sampler_alpha": 1.0,
126
+ "use_length_weighted_sampler": false,
127
+ "length_weighted_sampler_alpha": 1.0,
128
+ "model_args": {
129
+ "num_chars": 85,
130
+ "out_channels": 513,
131
+ "spec_segment_size": 32,
132
+ "hidden_channels": 192,
133
+ "hidden_channels_ffn_text_encoder": 768,
134
+ "num_heads_text_encoder": 2,
135
+ "num_layers_text_encoder": 6,
136
+ "kernel_size_text_encoder": 3,
137
+ "dropout_p_text_encoder": 0.1,
138
+ "dropout_p_duration_predictor": 0.5,
139
+ "kernel_size_posterior_encoder": 5,
140
+ "dilation_rate_posterior_encoder": 1,
141
+ "num_layers_posterior_encoder": 16,
142
+ "kernel_size_flow": 5,
143
+ "dilation_rate_flow": 1,
144
+ "num_layers_flow": 4,
145
+ "resblock_type_decoder": "1",
146
+ "resblock_kernel_sizes_decoder": [
147
+ 3,
148
+ 7,
149
+ 11
150
+ ],
151
+ "resblock_dilation_sizes_decoder": [
152
+ [
153
+ 1,
154
+ 3,
155
+ 5
156
+ ],
157
+ [
158
+ 1,
159
+ 3,
160
+ 5
161
+ ],
162
+ [
163
+ 1,
164
+ 3,
165
+ 5
166
+ ]
167
+ ],
168
+ "upsample_rates_decoder": [
169
+ 8,
170
+ 8,
171
+ 2,
172
+ 2
173
+ ],
174
+ "upsample_initial_channel_decoder": 512,
175
+ "upsample_kernel_sizes_decoder": [
176
+ 16,
177
+ 16,
178
+ 4,
179
+ 4
180
+ ],
181
+ "periods_multi_period_discriminator": [
182
+ 2,
183
+ 3,
184
+ 5,
185
+ 7,
186
+ 11
187
+ ],
188
+ "use_sdp": true,
189
+ "noise_scale": 1.0,
190
+ "inference_noise_scale": 0.667,
191
+ "length_scale": 1,
192
+ "noise_scale_dp": 1.0,
193
+ "inference_noise_scale_dp": 1.0,
194
+ "max_inference_len": null,
195
+ "init_discriminator": true,
196
+ "use_spectral_norm_disriminator": false,
197
+ "use_speaker_embedding": false,
198
+ "num_speakers": 0,
199
+ "speakers_file": null,
200
+ "d_vector_file": null,
201
+ "speaker_embedding_channels": 256,
202
+ "use_d_vector_file": false,
203
+ "d_vector_dim": 0,
204
+ "detach_dp_input": true,
205
+ "use_language_embedding": false,
206
+ "embedded_language_dim": 4,
207
+ "num_languages": 0,
208
+ "language_ids_file": null,
209
+ "use_speaker_encoder_as_loss": false,
210
+ "speaker_encoder_config_path": "",
211
+ "speaker_encoder_model_path": "",
212
+ "condition_dp_on_speaker": true,
213
+ "freeze_encoder": false,
214
+ "freeze_DP": false,
215
+ "freeze_PE": false,
216
+ "freeze_flow_decoder": false,
217
+ "freeze_waveform_decoder": false,
218
+ "encoder_sample_rate": null,
219
+ "interpolate_z": true,
220
+ "reinit_DP": false,
221
+ "reinit_text_encoder": false
222
+ },
223
+ "lr_gen": 0.0002,
224
+ "lr_disc": 0.0002,
225
+ "lr_scheduler_gen": "ExponentialLR",
226
+ "lr_scheduler_gen_params": {
227
+ "gamma": 0.999875,
228
+ "last_epoch": -1
229
+ },
230
+ "lr_scheduler_disc": "ExponentialLR",
231
+ "lr_scheduler_disc_params": {
232
+ "gamma": 0.999875,
233
+ "last_epoch": -1
234
+ },
235
+ "kl_loss_alpha": 1.0,
236
+ "disc_loss_alpha": 1.0,
237
+ "gen_loss_alpha": 1.0,
238
+ "feat_loss_alpha": 1.0,
239
+ "mel_loss_alpha": 45.0,
240
+ "dur_loss_alpha": 1.0,
241
+ "speaker_encoder_loss_alpha": 1.0,
242
+ "return_wav": true,
243
+ "use_weighted_sampler": false,
244
+ "weighted_sampler_attrs": {},
245
+ "weighted_sampler_multipliers": {},
246
+ "r": 1,
247
+ "num_speakers": 0,
248
+ "use_speaker_embedding": false,
249
+ "speakers_file": null,
250
+ "speaker_embedding_channels": 256,
251
+ "language_ids_file": null,
252
+ "use_language_embedding": false,
253
+ "use_d_vector_file": false,
254
+ "d_vector_file": null,
255
+ "d_vector_dim": 0,
256
+ "github_branch": "* dev"
257
+ }
bho_male/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
bho_male/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ ---
bho_male/checkpoint_200000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4fb6ce54092c79ab526d4e9bc70514d7ea7f820b0184ef99e6ad3a7b9b72abc
3
+ size 997766981
bho_male/config.json ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": ".",
3
+ "logger_uri": null,
4
+ "run_name": "vits_Bhojpuri_Male_30hrs",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "log_model_step": null,
13
+ "save_step": 20000,
14
+ "save_n_checkpoints": 1000,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": true,
20
+ "test_delay_epochs": -1,
21
+ "run_eval": true,
22
+ "run_eval_steps": null,
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "mixed_precision": true,
26
+ "epochs": 500,
27
+ "batch_size": 40,
28
+ "eval_batch_size": 16,
29
+ "grad_clip": [
30
+ 1000,
31
+ 1000
32
+ ],
33
+ "scheduler_after_epoch": true,
34
+ "lr": 0.001,
35
+ "optimizer": "AdamW",
36
+ "optimizer_params": {
37
+ "betas": [
38
+ 0.8,
39
+ 0.99
40
+ ],
41
+ "eps": 1e-09,
42
+ "weight_decay": 0.01
43
+ },
44
+ "lr_scheduler": null,
45
+ "lr_scheduler_params": {},
46
+ "use_grad_scaler": false,
47
+ "cudnn_enable": true,
48
+ "cudnn_deterministic": false,
49
+ "cudnn_benchmark": false,
50
+ "training_seed": 54321,
51
+ "model": "vits",
52
+ "num_loader_workers": 8,
53
+ "num_eval_loader_workers": 4,
54
+ "use_noise_augment": false,
55
+ "audio": {
56
+ "fft_size": 1024,
57
+ "sample_rate": 22050,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "num_mels": 80,
61
+ "mel_fmin": 0,
62
+ "mel_fmax": null
63
+ },
64
+ "use_phonemes": false,
65
+ "phonemizer": null,
66
+ "phoneme_language": "en-us",
67
+ "compute_input_seq_cache": true,
68
+ "text_cleaner": "multilingual_cleaners",
69
+ "enable_eos_bos_chars": false,
70
+ "test_sentences_file": "",
71
+ "phoneme_cache_path": "./phoneme_cache",
72
+ "characters": {
73
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
74
+ "vocab_dict": null,
75
+ "pad": "<PAD>",
76
+ "eos": "<EOS>",
77
+ "bos": "<BOS>",
78
+ "blank": "<BLNK>",
79
+ "characters": "\u091a.\u0947\u0910\u0925\u092e\u0959\u091d\u0906\u0949?\u092d \u092a\u0939\u0928\u093d\u091f\u0938\u0935\u0940\u091b\u0923\u0921\u091e\u0926\u094b\u0915\u0924\u0948\u0943\u095b\u0941\u095e\u092c\u0908\u0946\u094c\u0927\u090b\u093e\u0922\u0907\u093c\u0902\u0905\u0937\u0920\u095c\u0913\u092f,\u093f\u0930\u0901\u0914\u092b\u0909\u0916\u0911\u094d\u0932\u091c\u090f\u090a\u0917\u0936\u095d\u0919\u0918\u0942",
80
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
81
+ "phonemes": null,
82
+ "is_unique": true,
83
+ "is_sorted": true
84
+ },
85
+ "add_blank": true,
86
+ "batch_group_size": 5,
87
+ "loss_masking": null,
88
+ "min_audio_len": 1,
89
+ "max_audio_len": Infinity,
90
+ "min_text_len": 1,
91
+ "max_text_len": Infinity,
92
+ "compute_f0": false,
93
+ "compute_energy": false,
94
+ "compute_linear_spec": true,
95
+ "precompute_num_workers": 0,
96
+ "start_by_longest": false,
97
+ "shuffle": false,
98
+ "drop_last": false,
99
+ "datasets": [
100
+ {
101
+ "formatter": "syspin",
102
+ "dataset_name": "",
103
+ "path": ".",
104
+ "meta_file_train": "../manifests/Bhojpuri_Male/30hrs.tsv",
105
+ "ignored_speakers": null,
106
+ "language": "",
107
+ "phonemizer": "",
108
+ "meta_file_val": "",
109
+ "meta_file_attn_mask": ""
110
+ }
111
+ ],
112
+ "test_sentences": [
113
+ [
114
+ "\u090f\u0928\u094d\u091f\u094d\u0930\u093e\u092a\u0940 \u0915\u0902\u092a\u094d\u092f\u0942\u091f\u093f\u0902\u0917 \u092e\u0947\u0902 \u090f\u0928\u094d\u091f\u094d\u0930\u094b\u092a\u0940 \u090a \u0911\u092a\u0930\u0947\u091f\u093f\u0902\u0917 \u0938\u093f\u0938\u094d\u091f\u092e \u0939 \u091c\u0947 \u092a\u0947 \u0938\u0930\u093e \u0915\u094d\u0930\u093f\u092a\u094d\u091f\u094b\u0917\u094d\u0930\u093e\u092b\u093f\u0915 \u092b\u0902\u0915\u094d\u0936\u0928 \u0938\u092c \u0915\u093e\u092e \u0915\u0930\u0947 \u0932\u0947\u0902",
115
+ "Bhojpuri_Male",
116
+ null,
117
+ "bh"
118
+ ]
119
+ ],
120
+ "eval_split_max_size": null,
121
+ "eval_split_size": 0.01,
122
+ "use_speaker_weighted_sampler": false,
123
+ "speaker_weighted_sampler_alpha": 1.0,
124
+ "use_language_weighted_sampler": false,
125
+ "language_weighted_sampler_alpha": 1.0,
126
+ "use_length_weighted_sampler": false,
127
+ "length_weighted_sampler_alpha": 1.0,
128
+ "model_args": {
129
+ "num_chars": 86,
130
+ "out_channels": 513,
131
+ "spec_segment_size": 32,
132
+ "hidden_channels": 192,
133
+ "hidden_channels_ffn_text_encoder": 768,
134
+ "num_heads_text_encoder": 2,
135
+ "num_layers_text_encoder": 6,
136
+ "kernel_size_text_encoder": 3,
137
+ "dropout_p_text_encoder": 0.1,
138
+ "dropout_p_duration_predictor": 0.5,
139
+ "kernel_size_posterior_encoder": 5,
140
+ "dilation_rate_posterior_encoder": 1,
141
+ "num_layers_posterior_encoder": 16,
142
+ "kernel_size_flow": 5,
143
+ "dilation_rate_flow": 1,
144
+ "num_layers_flow": 4,
145
+ "resblock_type_decoder": "1",
146
+ "resblock_kernel_sizes_decoder": [
147
+ 3,
148
+ 7,
149
+ 11
150
+ ],
151
+ "resblock_dilation_sizes_decoder": [
152
+ [
153
+ 1,
154
+ 3,
155
+ 5
156
+ ],
157
+ [
158
+ 1,
159
+ 3,
160
+ 5
161
+ ],
162
+ [
163
+ 1,
164
+ 3,
165
+ 5
166
+ ]
167
+ ],
168
+ "upsample_rates_decoder": [
169
+ 8,
170
+ 8,
171
+ 2,
172
+ 2
173
+ ],
174
+ "upsample_initial_channel_decoder": 512,
175
+ "upsample_kernel_sizes_decoder": [
176
+ 16,
177
+ 16,
178
+ 4,
179
+ 4
180
+ ],
181
+ "periods_multi_period_discriminator": [
182
+ 2,
183
+ 3,
184
+ 5,
185
+ 7,
186
+ 11
187
+ ],
188
+ "use_sdp": true,
189
+ "noise_scale": 1.0,
190
+ "inference_noise_scale": 0.667,
191
+ "length_scale": 1,
192
+ "noise_scale_dp": 1.0,
193
+ "inference_noise_scale_dp": 1.0,
194
+ "max_inference_len": null,
195
+ "init_discriminator": true,
196
+ "use_spectral_norm_disriminator": false,
197
+ "use_speaker_embedding": false,
198
+ "num_speakers": 0,
199
+ "speakers_file": null,
200
+ "d_vector_file": null,
201
+ "speaker_embedding_channels": 256,
202
+ "use_d_vector_file": false,
203
+ "d_vector_dim": 0,
204
+ "detach_dp_input": true,
205
+ "use_language_embedding": false,
206
+ "embedded_language_dim": 4,
207
+ "num_languages": 0,
208
+ "language_ids_file": null,
209
+ "use_speaker_encoder_as_loss": false,
210
+ "speaker_encoder_config_path": "",
211
+ "speaker_encoder_model_path": "",
212
+ "condition_dp_on_speaker": true,
213
+ "freeze_encoder": false,
214
+ "freeze_DP": false,
215
+ "freeze_PE": false,
216
+ "freeze_flow_decoder": false,
217
+ "freeze_waveform_decoder": false,
218
+ "encoder_sample_rate": null,
219
+ "interpolate_z": true,
220
+ "reinit_DP": false,
221
+ "reinit_text_encoder": false
222
+ },
223
+ "lr_gen": 0.0002,
224
+ "lr_disc": 0.0002,
225
+ "lr_scheduler_gen": "ExponentialLR",
226
+ "lr_scheduler_gen_params": {
227
+ "gamma": 0.999875,
228
+ "last_epoch": -1
229
+ },
230
+ "lr_scheduler_disc": "ExponentialLR",
231
+ "lr_scheduler_disc_params": {
232
+ "gamma": 0.999875,
233
+ "last_epoch": -1
234
+ },
235
+ "kl_loss_alpha": 1.0,
236
+ "disc_loss_alpha": 1.0,
237
+ "gen_loss_alpha": 1.0,
238
+ "feat_loss_alpha": 1.0,
239
+ "mel_loss_alpha": 45.0,
240
+ "dur_loss_alpha": 1.0,
241
+ "speaker_encoder_loss_alpha": 1.0,
242
+ "return_wav": true,
243
+ "use_weighted_sampler": false,
244
+ "weighted_sampler_attrs": {},
245
+ "weighted_sampler_multipliers": {},
246
+ "r": 1,
247
+ "num_speakers": 0,
248
+ "use_speaker_embedding": false,
249
+ "speakers_file": null,
250
+ "speaker_embedding_channels": 256,
251
+ "language_ids_file": null,
252
+ "use_language_embedding": false,
253
+ "use_d_vector_file": false,
254
+ "d_vector_file": null,
255
+ "d_vector_dim": 0,
256
+ "github_branch": "* dev"
257
+ }
bn_female/bn_female_vits_30hrs.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53208e056050bb485df9192a0d444d3fa72eefe15b2c04840e9a500e4ac1bbf4
3
+ size 333255366
bn_female/chars.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ূঞংঘঔদলৌআডখরথটোৗঙঐানষঝবছঅঢ়ঁপউধঢশগয়।?িক,যঈস্ত়ফঋৈজ'ীঠৰণওৎঃমচঊড়ইুভে এ"ৃহ
bn_female/jit_infer.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
3
+ import torch
4
+ import numpy as np
5
+
6
+ # bn female
7
+ with open("chars.txt", 'r') as f:
8
+ letters = f.read().strip('\n')
9
+ model="bn_female_vits_30hrs.pt"
10
+ text = " হলেও আমাদের সবার সার্বিক শৃঙ্খলা বোধের উন্নতি হবে"
11
+
12
+ config = VitsConfig(
13
+ text_cleaner="multilingual_cleaners",
14
+ characters=CharactersConfig(
15
+ characters_class=VitsCharacters,
16
+ pad="<PAD>",
17
+ eos="<EOS>",
18
+ bos="<BOS>",
19
+ blank="<BLNK>",
20
+ characters=letters,
21
+ punctuations="!¡'(),-.:;¿? ",
22
+ phonemes=None)
23
+ )
24
+ tokenizer, config = TTSTokenizer.init_from_config(config)
25
+
26
+ x = tokenizer.text_to_ids(text)
27
+ x = torch.from_numpy(np.array(x)).unsqueeze(0)
28
+ net = torch.jit.load(model)
29
+ with torch.no_grad():
30
+ out2 = net(x)
31
+ import soundfile as sf
32
+ sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)
bn_male/bn_male_vits_30hrs.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9d8d52f0bc33ef01d733eef36fb00f1e17192b8c86123a0ccf84a24dbb80d0e
3
+ size 333249868
bn_male/chars.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ূঞংঘঔদলৌআডখরঃটোৗঙঐনাঝষবঅছঢ়ঁপউধঢশগয়।?িক,যঈসত্ৈফ়ঊজ'ীঠৎণওঋৰমচড়ভুইে থএ"ৃহ
bn_male/extra.py ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, List, Union
2
+ from dataclasses import asdict, dataclass, field
3
+
4
+
5
+ import re
6
+ from dataclasses import replace
7
+ from typing import Dict
8
+ _whitespace_re = re.compile(r"\s+")
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import List
12
+
13
+ # from TTS.tts.configs.shared_configs import BaseTTSConfig
14
+ # from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
15
+
16
+ @dataclass
17
+ class CharactersConfig():
18
+
19
+ characters_class: str = None
20
+
21
+ # using BaseVocabulary
22
+ vocab_dict: Dict = None
23
+
24
+ # used by BaseCharacters
25
+ pad: str = None
26
+ eos: str = None
27
+ bos: str = None
28
+ blank: str = None
29
+ characters: str = None
30
+ punctuations: str = None
31
+ phonemes: str = None
32
+ is_unique: bool = True # for backwards compatibility of models trained with char sets with duplicates
33
+ is_sorted: bool = True
34
+
35
+
36
+ @dataclass
37
+ class BaseTTSConfig():
38
+
39
+ # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
40
+ # phoneme settings
41
+ use_phonemes: bool = False
42
+ phonemizer: str = None
43
+ phoneme_language: str = None
44
+ compute_input_seq_cache: bool = False
45
+ text_cleaner: str = None
46
+ enable_eos_bos_chars: bool = False
47
+ test_sentences_file: str = ""
48
+ phoneme_cache_path: str = None
49
+ # vocabulary parameters
50
+ characters: CharactersConfig = None
51
+ add_blank: bool = False
52
+ # training params
53
+ batch_group_size: int = 0
54
+ loss_masking: bool = None
55
+ # dataloading
56
+ min_audio_len: int = 1
57
+ max_audio_len: int = float("inf")
58
+ min_text_len: int = 1
59
+ max_text_len: int = float("inf")
60
+ compute_f0: bool = False
61
+ compute_energy: bool = False
62
+ compute_linear_spec: bool = False
63
+ precompute_num_workers: int = 0
64
+ use_noise_augment: bool = False
65
+ start_by_longest: bool = False
66
+ shuffle: bool = False
67
+ drop_last: bool = False
68
+ # dataset
69
+ datasets: str = None
70
+ # optimizer
71
+ optimizer: str = "radam"
72
+ optimizer_params: dict = None
73
+ # scheduler
74
+ lr_scheduler: str = None
75
+ lr_scheduler_params: dict = field(default_factory=lambda: {})
76
+ # testing
77
+ test_sentences: List[str] = field(default_factory=lambda: [])
78
+ # evaluation
79
+ eval_split_max_size: int = None
80
+ eval_split_size: float = 0.01
81
+ # weighted samplers
82
+ use_speaker_weighted_sampler: bool = False
83
+ speaker_weighted_sampler_alpha: float = 1.0
84
+ use_language_weighted_sampler: bool = False
85
+ language_weighted_sampler_alpha: float = 1.0
86
+ use_length_weighted_sampler: bool = False
87
+ length_weighted_sampler_alpha: float = 1.0
88
+
89
+
90
+ @dataclass
91
+ class VitsAudioConfig():
92
+ fft_size: int = 1024
93
+ sample_rate: int = 22050
94
+ win_length: int = 1024
95
+ hop_length: int = 256
96
+ num_mels: int = 80
97
+ mel_fmin: int = 0
98
+ mel_fmax: int = None
99
+
100
@dataclass
class VitsArgs():
    """Model hyperparameter container used as ``VitsConfig.model_args``.

    All fields carry defaults; list-valued defaults go through
    ``default_factory`` so that instances never share a list object.
    Field groupings below follow the field-name suffixes only.
    """

    num_chars: int = 100
    out_channels: int = 513
    spec_segment_size: int = 32
    hidden_channels: int = 192
    # fields suffixed ``_text_encoder`` / ``_duration_predictor``
    hidden_channels_ffn_text_encoder: int = 768
    num_heads_text_encoder: int = 2
    num_layers_text_encoder: int = 6
    kernel_size_text_encoder: int = 3
    dropout_p_text_encoder: float = 0.1
    dropout_p_duration_predictor: float = 0.5
    # fields suffixed ``_posterior_encoder``
    kernel_size_posterior_encoder: int = 5
    dilation_rate_posterior_encoder: int = 1
    num_layers_posterior_encoder: int = 16
    # fields suffixed ``_flow``
    kernel_size_flow: int = 5
    dilation_rate_flow: int = 1
    num_layers_flow: int = 4
    # fields suffixed ``_decoder``
    resblock_type_decoder: str = "1"
    resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
    resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
    upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
    upsample_initial_channel_decoder: int = 512
    upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
    periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
    use_sdp: bool = True
    noise_scale: float = 1.0
    inference_noise_scale: float = 0.667
    # NOTE(review): annotated ``float`` but the default is the int literal 1.
    length_scale: float = 1
    noise_scale_dp: float = 1.0
    inference_noise_scale_dp: float = 1.0
    # NOTE(review): annotated ``int`` but defaults to None.
    max_inference_len: int = None
    init_discriminator: bool = True
    # The misspelling ("disriminator") is part of the public field name; do not
    # rename without updating every config that sets it.
    use_spectral_norm_disriminator: bool = False
    # speaker / d-vector related fields
    use_speaker_embedding: bool = False
    num_speakers: int = 0
    speakers_file: str = None
    d_vector_file: List[str] = None
    speaker_embedding_channels: int = 256
    use_d_vector_file: bool = False
    d_vector_dim: int = 0
    detach_dp_input: bool = True
    # language related fields
    use_language_embedding: bool = False
    embedded_language_dim: int = 4
    num_languages: int = 0
    language_ids_file: str = None
    use_speaker_encoder_as_loss: bool = False
    speaker_encoder_config_path: str = ""
    speaker_encoder_model_path: str = ""
    condition_dp_on_speaker: bool = True
    # ``freeze_*`` switches
    freeze_encoder: bool = False
    freeze_DP: bool = False
    freeze_PE: bool = False
    freeze_flow_decoder: bool = False
    freeze_waveform_decoder: bool = False
    # NOTE(review): annotated ``int`` but defaults to None.
    encoder_sample_rate: int = None
    interpolate_z: bool = True
    reinit_DP: bool = False
    reinit_text_encoder: bool = False
159
@dataclass
class VitsConfig(BaseTTSConfig):
    """Top-level VITS configuration built on ``BaseTTSConfig``.

    Adds the nested ``model_args`` / ``audio`` containers plus optimizer,
    loss-weight, sampler and multi-speaker fields. Several fields
    (``optimizer``, ``optimizer_params``, ``compute_linear_spec``,
    ``add_blank``, ``test_sentences``) override defaults declared on the base.
    """

    model: str = "vits"
    # model specific params
    model_args: VitsArgs = field(default_factory=VitsArgs)
    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)

    # optimizer
    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
    lr_gen: float = 0.0002
    lr_disc: float = 0.0002
    lr_scheduler_gen: str = "ExponentialLR"
    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    lr_scheduler_disc: str = "ExponentialLR"
    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    scheduler_after_epoch: bool = True
    optimizer: str = "AdamW"
    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})

    # loss params
    kl_loss_alpha: float = 1.0
    disc_loss_alpha: float = 1.0
    gen_loss_alpha: float = 1.0
    feat_loss_alpha: float = 1.0
    mel_loss_alpha: float = 45.0
    dur_loss_alpha: float = 1.0
    speaker_encoder_loss_alpha: float = 1.0

    # data loader params
    return_wav: bool = True
    compute_linear_spec: bool = True

    # sampler params
    use_weighted_sampler: bool = False  # TODO: move it to the base config
    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})

    # overrides
    r: int = 1  # DO NOT CHANGE
    add_blank: bool = True

    # testing
    # Each entry is a one-element list holding the sentence text.
    test_sentences: List[List] = field(
        default_factory=lambda: [
            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
            ["Be a voice, not an echo."],
            ["I'm sorry Dave. I'm afraid I can't do that."],
            ["This cake is great. It's so delicious and moist."],
            ["Prior to November 22, 1963."],
        ]
    )

    # multi-speaker settings
    # use speaker embedding layer
    # NOTE(review): these names also exist on ``VitsArgs``; nothing in this
    # class keeps the two copies in sync (``__post_init__`` is a no-op).
    num_speakers: int = 0
    use_speaker_embedding: bool = False
    speakers_file: str = None
    speaker_embedding_channels: int = 256
    language_ids_file: str = None
    use_language_embedding: bool = False

    # use d-vectors
    use_d_vector_file: bool = False
    d_vector_file: List[str] = None
    # NOTE(review): annotated ``int`` but defaults to None (``VitsArgs`` uses 0).
    d_vector_dim: int = None

    def __post_init__(self):
        """Intentionally does nothing; the model_args sync below is disabled."""
        pass
        # Disabled: would copy every ``model_args`` entry onto the matching
        # attribute of this config.
        # for key, val in self.model_args.items():
        #     if hasattr(self, key):
        #         self[key] = val
231
+
232
+
233
+
234
+
235
+
236
def parse_symbols():
    """Return the module-level default symbol sets as a dict keyed by role.

    Keys, in order: ``pad``, ``eos``, ``bos``, ``characters``,
    ``punctuations``, ``phonemes``. The values are looked up at call time.
    """
    roles = ("pad", "eos", "bos", "characters", "punctuations", "phonemes")
    defaults = (_pad, _eos, _bos, _characters, _punctuations, _phonemes)
    return dict(zip(roles, defaults))
245
+
246
+
247
# DEFAULT SET OF GRAPHEMES
# Special tokens are multi-character strings that are stored as single
# vocabulary entries by the character classes in this module.
_pad = "<PAD>"
_eos = "<EOS>"
_bos = "<BOS>"
_blank = "<BLNK>"  # TODO: check whether this is needed alongside PAD
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
# The trailing space is part of the punctuation set.
_punctuations = "!'(),-.:;? "


# DEFAULT SET OF IPA PHONEMES
# Phonemes definition (All IPA characters)
_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
_suprasegmentals = "ˈˌːˑ"
_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
# NOTE(review): the name is presumably a misspelling of "diacritics"; it is
# kept as-is because it is a module-level name.
_diacrilics = "ɚ˞ɫ"
# Full default phoneme inventory: plain concatenation of the groups above.
_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
265
+
266
+
267
class BaseVocabulary:
    """Base Vocabulary class.

    This class only needs a vocabulary container without specifying the characters.

    Token IDs are the *enumeration positions* of the entries in ``vocab`` (for a
    dict, the positions of its keys in iteration order); dict values are not
    consulted.

    Args:
        vocab (Dict): Vocabulary entries; may be None, in which case no lookup
            tables are built.
        pad (str): Padding token, or None.
        blank (str): Blank token, or None.
        bos (str): Beginning-of-sequence token, or None.
        eos (str): End-of-sequence token, or None.
    """

    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
        self.vocab = vocab
        self.pad = pad
        self.blank = blank
        self.bos = bos
        self.eos = eos

    @property
    def pad_id(self) -> int:
        """Return the index of the padding character. If the padding character is not specified, return the length
        of the vocabulary."""
        return self.char_to_id(self.pad) if self.pad else len(self.vocab)

    @property
    def blank_id(self) -> int:
        """Return the index of the blank character. If the blank character is not specified, return the length of
        the vocabulary."""
        return self.char_to_id(self.blank) if self.blank else len(self.vocab)

    @property
    def bos_id(self) -> int:
        """Return the index of the bos character. If the bos character is not specified, return the length of the
        vocabulary."""
        return self.char_to_id(self.bos) if self.bos else len(self.vocab)

    @property
    def eos_id(self) -> int:
        """Return the index of the eos character. If the eos character is not specified, return the length of the
        vocabulary."""
        return self.char_to_id(self.eos) if self.eos else len(self.vocab)

    @property
    def vocab(self):
        """Return the vocabulary dictionary."""
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Set the vocabulary dictionary and character mapping dictionaries."""
        self._vocab, self._char_to_id, self._id_to_char = None, None, None
        if vocab is not None:
            self._vocab = vocab
            self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
            self._id_to_char = dict(enumerate(self._vocab))

    @staticmethod
    def init_from_config(config, **kwargs):
        """Initialize from the given config.

        When ``config.characters`` carries a non-empty ``vocab_dict`` the
        vocabulary is built from it together with the config's pad / blank /
        bos / eos tokens; otherwise ``BaseVocabulary(**kwargs)`` is used.

        Returns:
            Tuple of ``(vocabulary, config)``; ``config`` is returned unchanged.
        """
        chars_cfg = config.characters
        # Attribute lookup instead of ``"vocab_dict" in chars_cfg``: the
        # characters config may be a plain dataclass, which does not support
        # the ``in`` operator and would raise TypeError.
        vocab_dict = getattr(chars_cfg, "vocab_dict", None) if chars_cfg is not None else None
        if vocab_dict:
            return (
                BaseVocabulary(
                    vocab_dict,
                    chars_cfg.pad,
                    chars_cfg.blank,
                    chars_cfg.bos,
                    chars_cfg.eos,
                ),
                config,
            )
        return BaseVocabulary(**kwargs), config

    def to_config(self):
        """Export this vocabulary as a ``CharactersConfig``."""
        return CharactersConfig(
            vocab_dict=self._vocab,
            pad=self.pad,
            eos=self.eos,
            bos=self.bos,
            blank=self.blank,
            is_unique=False,
            is_sorted=False,
        )

    @property
    def num_chars(self):
        """Return number of tokens in the vocabulary."""
        return len(self._vocab)

    def char_to_id(self, char: str) -> int:
        """Map a character to an token ID.

        Raises:
            KeyError: if ``char`` is not in the vocabulary.
        """
        try:
            return self._char_to_id[char]
        except KeyError as e:
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        """Map an token ID to a character."""
        return self._id_to_char[idx]
365
+
366
+
367
class BaseCharacters:
    """Character-level vocabulary built from characters, punctuations and special tokens.

    The vocabulary order is: pad, eos, bos, blank (each only when set to a
    non-empty string), then the characters (optionally de-duplicated and/or
    sorted), then the punctuations. Changing any of these through its property
    setter rebuilds the vocabulary.

    Args:
        characters (str): Main character set. None is treated as empty.
        punctuations (str): Punctuation characters appended after the characters.
            None is treated as empty.
        pad (str): Padding token.
        eos (str): End-of-sequence token.
        bos (str): Beginning-of-sequence token.
        blank (str): Blank token.
        is_unique (bool): De-duplicate ``characters`` and assert that the final
            vocabulary has no duplicates.
        is_sorted (bool): Sort ``characters`` before building the vocabulary.
    """

    # Constructor arguments that may be read from a characters config.
    _CONFIG_FIELDS = ("characters", "punctuations", "pad", "eos", "bos", "blank", "is_unique", "is_sorted")

    def __init__(
        self,
        characters: str = None,
        punctuations: str = None,
        pad: str = None,
        eos: str = None,
        bos: str = None,
        blank: str = None,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        self._characters = characters
        self._punctuations = punctuations
        self._pad = pad
        self._eos = eos
        self._bos = bos
        self._blank = blank
        self.is_unique = is_unique
        self.is_sorted = is_sorted
        self._create_vocab()

    @property
    def pad_id(self) -> int:
        """ID of the pad token, or ``len(vocab)`` when no pad token is set."""
        return self.char_to_id(self.pad) if self.pad else len(self.vocab)

    @property
    def blank_id(self) -> int:
        """ID of the blank token, or ``len(vocab)`` when no blank token is set."""
        return self.char_to_id(self.blank) if self.blank else len(self.vocab)

    @property
    def eos_id(self) -> int:
        """ID of the eos token, or ``len(vocab)`` when no eos token is set."""
        return self.char_to_id(self.eos) if self.eos else len(self.vocab)

    @property
    def bos_id(self) -> int:
        """ID of the bos token, or ``len(vocab)`` when no bos token is set."""
        return self.char_to_id(self.bos) if self.bos else len(self.vocab)

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, characters):
        self._characters = characters
        self._create_vocab()

    @property
    def punctuations(self):
        return self._punctuations

    @punctuations.setter
    def punctuations(self, punctuations):
        self._punctuations = punctuations
        self._create_vocab()

    @property
    def pad(self):
        return self._pad

    @pad.setter
    def pad(self, pad):
        self._pad = pad
        self._create_vocab()

    @property
    def eos(self):
        return self._eos

    @eos.setter
    def eos(self, eos):
        self._eos = eos
        self._create_vocab()

    @property
    def bos(self):
        return self._bos

    @bos.setter
    def bos(self, bos):
        self._bos = bos
        self._create_vocab()

    @property
    def blank(self):
        return self._blank

    @blank.setter
    def blank(self, blank):
        self._blank = blank
        self._create_vocab()

    @property
    def vocab(self):
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Store ``vocab`` and rebuild both lookup tables from its enumeration order."""
        self._vocab = vocab
        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self._id_to_char = dict(enumerate(self.vocab))

    @property
    def num_chars(self):
        """Number of entries in the vocabulary."""
        return len(self._vocab)

    def _duplicated_symbols(self):
        """Return the set of vocabulary entries that occur more than once."""
        return {x for x in self.vocab if self.vocab.count(x) > 1}

    def _create_vocab(self):
        """(Re)build ``vocab`` and the lookup tables from the current settings."""
        # None means "no characters"; without this guard the default
        # constructor would crash in sorted()/list().
        _vocab = self._characters if self._characters is not None else ""
        if self.is_unique:
            _vocab = list(set(_vocab))
        if self.is_sorted:
            _vocab = sorted(_vocab)
        _vocab = list(_vocab)
        # Each special token is prepended in turn, so the final order is
        # pad, eos, bos, blank.
        for special in (self._blank, self._bos, self._eos, self._pad):
            if special is not None and len(special) > 0:
                _vocab = [special] + _vocab
        punctuations = self._punctuations if self._punctuations is not None else ""
        self.vocab = _vocab + list(punctuations)
        if self.is_unique:
            # The (quadratic) duplicate scan only runs when the check fails.
            assert (
                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
            ), f" [!] There are duplicate characters in the character set. {self._duplicated_symbols()}"

    def char_to_id(self, char: str) -> int:
        """Map a vocabulary entry to its ID.

        Raises:
            KeyError: if ``char`` is not in the vocabulary.
        """
        try:
            return self._char_to_id[char]
        except KeyError as e:
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        """Map an ID back to its vocabulary entry."""
        return self._id_to_char[idx]

    def print_log(self, level: int = 0):
        """
        Prints the vocabulary in a nice format.
        """
        indent = "\t" * level
        print(f"{indent}| > Characters: {self._characters}")
        print(f"{indent}| > Punctuations: {self._punctuations}")
        print(f"{indent}| > Pad: {self._pad}")
        print(f"{indent}| > EOS: {self._eos}")
        print(f"{indent}| > BOS: {self._bos}")
        print(f"{indent}| > Blank: {self._blank}")
        print(f"{indent}| > Vocab: {self.vocab}")
        print(f"{indent}| > Num chars: {self.num_chars}")

    @staticmethod
    def _kwargs_from_config(chars_cfg):
        """Collect constructor kwargs from a dict-style or attribute-style characters config.

        Only the fields the constructor accepts are read; entries that are
        missing or None are dropped so the constructor defaults apply.
        """
        if isinstance(chars_cfg, dict):
            lookup = chars_cfg.get
        else:
            def lookup(key):
                return getattr(chars_cfg, key, None)
        collected = {}
        for key in BaseCharacters._CONFIG_FIELDS:
            value = lookup(key)
            if value is not None:
                collected[key] = value
        return collected

    @staticmethod
    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
        """Init your character class from a config.

        Implement this method for your subclass.

        Returns:
            Tuple of ``(characters, config)``. When ``config.characters`` is
            None, a default instance is created and a copy of ``config`` with
            the exported characters config is returned instead.
        """
        # use character set from config
        if config.characters is not None:
            # ``**config.characters`` only works for mappings and would also
            # pass fields the constructor does not take, so read fields explicitly.
            return BaseCharacters(**BaseCharacters._kwargs_from_config(config.characters)), config
        # return default character set
        characters = BaseCharacters()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config

    def to_config(self) -> "CharactersConfig":
        """Export the current settings as a ``CharactersConfig``."""
        return CharactersConfig(
            characters=self._characters,
            punctuations=self._punctuations,
            pad=self._pad,
            eos=self._eos,
            bos=self._bos,
            blank=self._blank,
            is_unique=self.is_unique,
            is_sorted=self.is_sorted,
        )
543
+
544
+
545
class IPAPhonemes(BaseCharacters):
    """``BaseCharacters`` preset whose defaults are the module's IPA phoneme set,
    default punctuations and default special tokens."""

    def __init__(
        self,
        characters: str = _phonemes,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def _read_field(chars_cfg, key):
        """Read a required field from a dict-style or attribute-style characters config."""
        if isinstance(chars_cfg, dict):
            return chars_cfg[key]
        return getattr(chars_cfg, key)

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Init a IPAPhonemes object from a model config

        If characters are not defined in the config, it will be set to the default characters and the config
        will be updated.

        Returns:
            Tuple of ``(characters, config)``.
        """
        # Attribute lookups are used throughout: the config objects may be plain
        # dataclasses, which support neither ``in`` nor ``[]``.
        chars_cfg = getattr(config, "characters", None)
        if chars_cfg is not None:
            # band-aid for compatibility with old models: a ``phonemes`` entry,
            # when present, replaces ``characters`` in the config itself.
            if isinstance(chars_cfg, dict):
                phonemes = chars_cfg.get("phonemes")
                if phonemes is not None:
                    chars_cfg["characters"] = phonemes
            else:
                phonemes = getattr(chars_cfg, "phonemes", None)
                if phonemes is not None:
                    setattr(chars_cfg, "characters", phonemes)
            read = IPAPhonemes._read_field
            return (
                IPAPhonemes(
                    characters=read(chars_cfg, "characters"),
                    punctuations=read(chars_cfg, "punctuations"),
                    pad=read(chars_cfg, "pad"),
                    eos=read(chars_cfg, "eos"),
                    bos=read(chars_cfg, "bos"),
                    blank=read(chars_cfg, "blank"),
                    is_unique=read(chars_cfg, "is_unique"),
                    is_sorted=read(chars_cfg, "is_sorted"),
                ),
                config,
            )
        # return default character set
        characters = IPAPhonemes()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config
592
+
593
+
594
class Graphemes(BaseCharacters):
    """``BaseCharacters`` preset whose defaults are the module's default grapheme
    set, default punctuations and default special tokens."""

    def __init__(
        self,
        characters: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def _read_field(chars_cfg, key):
        """Read a required field from a dict-style or attribute-style characters config."""
        if isinstance(chars_cfg, dict):
            return chars_cfg[key]
        return getattr(chars_cfg, key)

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Init a Graphemes object from a model config

        If characters are not defined in the config, it will be set to the default characters and the config
        will be updated.

        Returns:
            Tuple of ``(characters, config)``.
        """
        chars_cfg = config.characters
        if chars_cfg is None:
            characters = Graphemes()
            new_config = replace(config, characters=characters.to_config())
            return characters, new_config
        # A plain dict without a ``phonemes`` key holds exactly the constructor
        # kwargs, so it can be splatted directly.
        if isinstance(chars_cfg, dict) and "phonemes" not in chars_cfg:
            return Graphemes(**chars_cfg), config
        # band-aid for compatibility with old models: configs that carry extra
        # fields such as ``phonemes`` are read field by field. Attribute access
        # is used because the config may be a plain dataclass, which supports
        # neither ``in`` nor ``[]``.
        read = Graphemes._read_field
        return (
            Graphemes(
                characters=read(chars_cfg, "characters"),
                punctuations=read(chars_cfg, "punctuations"),
                pad=read(chars_cfg, "pad"),
                eos=read(chars_cfg, "eos"),
                bos=read(chars_cfg, "bos"),
                blank=read(chars_cfg, "blank"),
                is_unique=read(chars_cfg, "is_unique"),
                is_sorted=read(chars_cfg, "is_sorted"),
            ),
            config,
        )
637
+
638
+
639
if __name__ == "__main__":
    # Smoke check: build both default character sets first, then dump each one.
    for charset in (Graphemes(), IPAPhonemes()):
        charset.print_log()
644
+
645
+
646
class VitsCharacters(BaseCharacters):
    """Characters class for VITs model for compatibility with pre-trained models"""

    def __init__(
        self,
        graphemes: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        ipa_characters: str = _phonemes,
    ) -> None:
        # IPA symbols, when given, are appended to the grapheme set.
        if ipa_characters is not None:
            graphemes += ipa_characters
        super().__init__(
            graphemes,
            punctuations,
            pad,
            None,
            None,
            "<BLNK>",
            is_unique=False,
            is_sorted=True,
        )

    def _create_vocab(self):
        """Build the vocabulary as: pad, punctuations, characters, blank."""
        self._vocab = [self._pad, *self._punctuations, *self._characters, self._blank]
        forward = {}
        for position, symbol in enumerate(self.vocab):
            forward[symbol] = position
        self._char_to_id = forward
        self._id_to_char = dict(enumerate(self.vocab))

    @staticmethod
    def init_from_config(config):
        """Create an instance from ``config.characters``; returns ``(characters, config)``."""
        chars_cfg = config.characters
        instance = VitsCharacters(
            graphemes=chars_cfg.characters,
            ipa_characters=chars_cfg.phonemes,
            punctuations=chars_cfg.punctuations,
            pad=chars_cfg.pad,
        )
        return instance, config

    def to_config(self) -> "CharactersConfig":
        """Export the current settings as a ``CharactersConfig`` (eos/bos are always None)."""
        exported = dict(
            characters=self._characters,
            punctuations=self._punctuations,
            pad=self._pad,
            eos=None,
            bos=None,
            blank=self._blank,
            is_unique=False,
            is_sorted=True,
        )
        return CharactersConfig(**exported)
688
+
689
class TTSTokenizer:
    """Turns text into a list of token IDs using a characters object.

    Args:
        text_cleaner (Callable): Function applied to the text before encoding.
            When None, the text is encoded as-is.
        characters (BaseCharacters): Object providing ``char_to_id``, ``pad``,
            ``blank`` and the ``*_id`` properties. May be None and assigned later.
    """

    def __init__(
        self,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
    ):
        self.text_cleaner = text_cleaner
        self.characters = characters
        self.not_found_characters = []

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        """Store the characters object and cache its pad / blank IDs (None when unset)."""
        self._characters = new_characters
        if new_characters is None:
            # Allows ``TTSTokenizer()`` with the default arguments.
            self.pad_id = None
            self.blank_id = None
            return
        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None

    def encode(self, text: str) -> List[int]:
        """Encodes a string of text as a sequence of IDs.

        Characters missing from the vocabulary are dropped; each one is
        reported once and remembered in ``not_found_characters``.
        """
        token_ids = []
        for char in text:
            try:
                idx = self.characters.char_to_id(char)
                token_ids.append(idx)
            except KeyError:
                # discard but store not found characters
                if char not in self.not_found_characters:
                    self.not_found_characters.append(char)
                    print(text)
                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
        return token_ids

    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
        """Clean ``text``, encode it, and intersperse the blank ID between tokens.

        ``language`` is accepted for interface compatibility and is not used.
        """
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
        text = self.encode(text)
        text = self.intersperse_blank_char(text, True)
        return text

    def pad_with_bos_eos(self, char_sequence: List[str]):
        """Pads a sequence with the special BOS and EOS characters."""
        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]

    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
        """Intersperses the blank character between characters in a sequence.

        Use the ```blank``` character if defined else use the ```pad``` character.

        The result has ``2 * len(char_sequence) + 1`` entries: the filler ID at
        every even position and the original items at the odd positions.
        """
        # The filler must be an ID in both cases; inserting the pad *string*
        # would produce a sequence mixing ints and str.
        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad_id
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    @staticmethod
    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
        """Build a tokenizer from ``config``; returns ``(tokenizer, new_config)``.

        The cleaner is always ``multilingual_cleaners`` and the characters are
        always built with ``VitsCharacters``.
        NOTE(review): the ``characters`` argument is currently ignored, and
        ``characters_class`` on the returned config is overwritten in place.
        """
        text_cleaner = multilingual_cleaners
        CharactersClass = VitsCharacters
        characters, new_config = CharactersClass.init_from_config(config)
        # new_config.characters.characters_class = get_import_path(characters)
        new_config.characters.characters_class = VitsCharacters
        return TTSTokenizer(text_cleaner, characters), new_config
753
+
754
+
755
def multilingual_cleaners(text):
    """Pipeline for multilingual text"""
    # Order: lowercase -> symbol replacement (no language) -> strip auxiliary
    # symbols -> collapse whitespace.
    lowered = lowercase(text)
    substituted = replace_symbols(lowered, lang=None)
    stripped = remove_aux_symbols(substituted)
    return collapse_whitespace(stripped)
762
+
763
def lowercase(text):
    """Return ``text`` lower-cased via its ``lower()`` method."""
    lowered = text.lower()
    return lowered
765
+
766
def collapse_whitespace(text):
    """Replace every match of ``_whitespace_re`` with one space, then strip both ends."""
    collapsed = re.sub(_whitespace_re, " ", text)
    return collapsed.strip()
768
+
769
def replace_symbols(text, lang="en"):
    """Normalise a few punctuation symbols in ``text``.

    ``;`` and ``:`` become ``,``; ``-`` becomes a space (or is removed when
    ``lang`` is ``"ca"``); ``&`` is spelled out for ``en`` / ``fr`` / ``pt`` /
    ``ca`` and left alone otherwise; apostrophes are removed last.
    """
    hyphen_replacement = "" if lang == "ca" else " "
    for symbol, substitute in ((";", ","), ("-", hyphen_replacement), (":", ",")):
        text = text.replace(symbol, substitute)
    ampersand_words = (("en", " and "), ("fr", " et "), ("pt", " e "), ("ca", " i "))
    for code, word in ampersand_words:
        if lang == code:
            text = text.replace("&", word)
            break
    return text.replace("'", "")
784
+
785
def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets and double quotes from ``text``."""
    cleaned = re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
    return cleaned
bn_male/jit_infer.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
import torch
import numpy as np
import soundfile as sf

# Inference demo for the Bengali male TorchScript model: read the character
# set, tokenize one sentence, run the model and write the audio to jit.wav.

# chars.txt holds non-ASCII characters, so the encoding is given explicitly
# instead of relying on the platform default.
with open("chars.txt", 'r', encoding="utf-8") as f:
    letters = f.read().strip('\n')
model="bn_male_vits_30hrs.pt"
text = " হলেও আমাদের সবার সার্বিক শৃঙ্খলা বোধের উন্নতি হবে"

config = VitsConfig(
    text_cleaner="multilingual_cleaners",
    characters=CharactersConfig(
        characters_class=VitsCharacters,
        pad="<PAD>",
        eos="<EOS>",
        bos="<BOS>",
        blank="<BLNK>",
        characters=letters,
        punctuations="!¡'(),-.:;¿? ",
        phonemes=None)
    )
tokenizer, config = TTSTokenizer.init_from_config(config)

x = tokenizer.text_to_ids(text)
# Fix the dtype to int64 so the tensor does not depend on the platform's
# default NumPy integer width; unsqueeze adds a leading dimension of size 1.
x = torch.from_numpy(np.array(x, dtype=np.int64)).unsqueeze(0)
net = torch.jit.load(model)
with torch.no_grad():
    out2 = net(x)
# 22050 is the sample rate passed to soundfile for the output file.
sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)
en_female/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
en_female/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ ---
en_female/chars.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ pqw'"sgufmxre?d!lcab,zk.iytoh jvn
en_female/en_female_vits_30hrs.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dfa80f08da6ca7222a16cb6d919251fb733d3f03042848a20201fa6ae0d0b9c
3
+ size 333229574
en_female/extra.py ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, List, Union
2
+ from dataclasses import asdict, dataclass, field
3
+
4
+
5
+ import re
6
+ from dataclasses import replace
7
+ from typing import Dict
8
+ _whitespace_re = re.compile(r"\s+")
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import List
12
+
13
+ # from TTS.tts.configs.shared_configs import BaseTTSConfig
14
+ # from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
15
+
16
@dataclass
class CharactersConfig():
    """Settings describing a character set / vocabulary.

    Plain dataclass: fields are read with attribute access; it does not
    support ``in`` or ``[]`` lookups.
    """

    # NOTE(review): annotated ``str``; confirm whether callers pass a class
    # object or an import path here.
    characters_class: str = None

    # using BaseVocabulary
    vocab_dict: Dict = None

    # using on BaseCharacters
    pad: str = None
    eos: str = None
    bos: str = None
    blank: str = None
    characters: str = None
    punctuations: str = None
    phonemes: str = None
    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
    is_sorted: bool = True
34
+
35
+
36
@dataclass
class BaseTTSConfig():
    """Base configuration dataclass shared by TTS model configs.

    Every field has a default, so subclasses only override what they need.
    """

    # Disabled field kept from the original source:
    # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    # phoneme settings
    use_phonemes: bool = False
    phonemizer: str = None
    phoneme_language: str = None
    compute_input_seq_cache: bool = False
    text_cleaner: str = None
    enable_eos_bos_chars: bool = False
    test_sentences_file: str = ""
    phoneme_cache_path: str = None
    # vocabulary parameters
    characters: CharactersConfig = None
    add_blank: bool = False
    # training params
    batch_group_size: int = 0
    # NOTE(review): annotated ``bool`` but defaults to None.
    loss_masking: bool = None
    # dataloading
    min_audio_len: int = 1
    # NOTE(review): annotated ``int`` but the default is the float ``inf``.
    max_audio_len: int = float("inf")
    min_text_len: int = 1
    # NOTE(review): annotated ``int`` but the default is the float ``inf``.
    max_text_len: int = float("inf")
    compute_f0: bool = False
    compute_energy: bool = False
    compute_linear_spec: bool = False
    precompute_num_workers: int = 0
    use_noise_augment: bool = False
    start_by_longest: bool = False
    shuffle: bool = False
    drop_last: bool = False
    # dataset
    datasets: str = None
    # optimizer
    optimizer: str = "radam"
    optimizer_params: dict = None
    # scheduler
    lr_scheduler: str = None
    lr_scheduler_params: dict = field(default_factory=lambda: {})
    # testing
    test_sentences: List[str] = field(default_factory=lambda: [])
    # evaluation
    eval_split_max_size: int = None
    eval_split_size: float = 0.01
    # weighted samplers
    use_speaker_weighted_sampler: bool = False
    speaker_weighted_sampler_alpha: float = 1.0
    use_language_weighted_sampler: bool = False
    language_weighted_sampler_alpha: float = 1.0
    use_length_weighted_sampler: bool = False
    length_weighted_sampler_alpha: float = 1.0
88
+
89
+
90
@dataclass
class VitsAudioConfig():
    """Container for the audio settings attached to a VITS config.

    Every field has a default, so the class can be instantiated without
    arguments (``VitsConfig.audio`` does so through ``default_factory``).
    """

    fft_size: int = 1024
    sample_rate: int = 22050
    win_length: int = 1024
    hop_length: int = 256
    num_mels: int = 80
    mel_fmin: int = 0
    # NOTE(review): annotated ``int`` but defaults to None; presumably None
    # means "no explicit upper mel frequency" -- confirm with the consumer.
    mel_fmax: int = None
99
+
100
+ @dataclass
101
+ class VitsArgs():
102
+ num_chars: int = 100
103
+ out_channels: int = 513
104
+ spec_segment_size: int = 32
105
+ hidden_channels: int = 192
106
+ hidden_channels_ffn_text_encoder: int = 768
107
+ num_heads_text_encoder: int = 2
108
+ num_layers_text_encoder: int = 6
109
+ kernel_size_text_encoder: int = 3
110
+ dropout_p_text_encoder: float = 0.1
111
+ dropout_p_duration_predictor: float = 0.5
112
+ kernel_size_posterior_encoder: int = 5
113
+ dilation_rate_posterior_encoder: int = 1
114
+ num_layers_posterior_encoder: int = 16
115
+ kernel_size_flow: int = 5
116
+ dilation_rate_flow: int = 1
117
+ num_layers_flow: int = 4
118
+ resblock_type_decoder: str = "1"
119
+ resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
120
+ resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
121
+ upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
122
+ upsample_initial_channel_decoder: int = 512
123
+ upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
124
+ periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
125
+ use_sdp: bool = True
126
+ noise_scale: float = 1.0
127
+ inference_noise_scale: float = 0.667
128
+ length_scale: float = 1
129
+ noise_scale_dp: float = 1.0
130
+ inference_noise_scale_dp: float = 1.0
131
+ max_inference_len: int = None
132
+ init_discriminator: bool = True
133
+ use_spectral_norm_disriminator: bool = False
134
+ use_speaker_embedding: bool = False
135
+ num_speakers: int = 0
136
+ speakers_file: str = None
137
+ d_vector_file: List[str] = None
138
+ speaker_embedding_channels: int = 256
139
+ use_d_vector_file: bool = False
140
+ d_vector_dim: int = 0
141
+ detach_dp_input: bool = True
142
+ use_language_embedding: bool = False
143
+ embedded_language_dim: int = 4
144
+ num_languages: int = 0
145
+ language_ids_file: str = None
146
+ use_speaker_encoder_as_loss: bool = False
147
+ speaker_encoder_config_path: str = ""
148
+ speaker_encoder_model_path: str = ""
149
+ condition_dp_on_speaker: bool = True
150
+ freeze_encoder: bool = False
151
+ freeze_DP: bool = False
152
+ freeze_PE: bool = False
153
+ freeze_flow_decoder: bool = False
154
+ freeze_waveform_decoder: bool = False
155
+ encoder_sample_rate: int = None
156
+ interpolate_z: bool = True
157
+ reinit_DP: bool = False
158
+ reinit_text_encoder: bool = False
159
@dataclass
class VitsConfig(BaseTTSConfig):
    """Dataclass of VITS settings layered on top of ``BaseTTSConfig``.

    Holds the nested ``model_args`` / ``audio`` sub-configs plus optimizer,
    loss-weight, data-loader, sampler, test-sentence and multi-speaker fields.
    Every field declared here has a default value.
    """

    model: str = "vits"
    # model specific params
    model_args: VitsArgs = field(default_factory=VitsArgs)
    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)

    # optimizer
    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
    lr_gen: float = 0.0002
    lr_disc: float = 0.0002
    lr_scheduler_gen: str = "ExponentialLR"
    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    lr_scheduler_disc: str = "ExponentialLR"
    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    scheduler_after_epoch: bool = True
    optimizer: str = "AdamW"
    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})

    # loss params
    kl_loss_alpha: float = 1.0
    disc_loss_alpha: float = 1.0
    gen_loss_alpha: float = 1.0
    feat_loss_alpha: float = 1.0
    mel_loss_alpha: float = 45.0
    dur_loss_alpha: float = 1.0
    speaker_encoder_loss_alpha: float = 1.0

    # data loader params
    return_wav: bool = True
    compute_linear_spec: bool = True

    # sampler params
    use_weighted_sampler: bool = False  # TODO: move it to the base config
    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})

    # overrides
    r: int = 1  # DO NOT CHANGE
    add_blank: bool = True

    # testing
    # Each entry is a one-element list holding the sentence text.
    test_sentences: List[List] = field(
        default_factory=lambda: [
            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
            ["Be a voice, not an echo."],
            ["I'm sorry Dave. I'm afraid I can't do that."],
            ["This cake is great. It's so delicious and moist."],
            ["Prior to November 22, 1963."],
        ]
    )

    # multi-speaker settings
    # use speaker embedding layer
    num_speakers: int = 0
    use_speaker_embedding: bool = False
    speakers_file: str = None
    speaker_embedding_channels: int = 256
    language_ids_file: str = None
    use_language_embedding: bool = False

    # use d-vectors
    use_d_vector_file: bool = False
    d_vector_file: List[str] = None
    d_vector_dim: int = None  # default is None despite the int annotation

    def __post_init__(self):
        # Intentionally a no-op. The disabled code below would have copied
        # entries of ``model_args`` onto same-named attributes of this config.
        pass
        # for key, val in self.model_args.items():
        #     if hasattr(self, key):
        #         self[key] = val
231
+
232
+
233
+
234
+
235
+
236
def parse_symbols():
    """Return the module's default symbol sets keyed by their role.

    Returns:
        dict: keys ``pad``, ``eos``, ``bos``, ``characters``, ``punctuations``
        and ``phonemes``, each mapped to the matching module-level default.
    """
    symbols = dict(
        pad=_pad,
        eos=_eos,
        bos=_bos,
        characters=_characters,
        punctuations=_punctuations,
        phonemes=_phonemes,
    )
    return symbols
245
+
246
+
247
# DEFAULT SET OF GRAPHEMES
# Special tokens and the default Latin character / punctuation inventories.
# These are used as default arguments by the character classes below.
_pad = "<PAD>"
_eos = "<EOS>"
_bos = "<BOS>"
_blank = "<BLNK>"  # TODO: check if we need this alongside with PAD
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_punctuations = "!'(),-.:;? "


# DEFAULT SET OF IPA PHONEMES
# Phonemes definition (All IPA characters)
_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
_suprasegmentals = "ˈˌːˑ"
_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
_diacrilics = "ɚ˞ɫ"  # name is spelled this way throughout the file; keep it in sync with the line below
# Full phoneme inventory: plain concatenation of the groups above, in this order.
_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
265
+
266
+
267
class BaseVocabulary:
    """Base Vocabulary class.

    This class only needs a vocabulary dictionary without specifying the characters.

    Token IDs are assigned by enumerating ``vocab`` in iteration order; when a
    dict is passed, its values are not consulted.

    Args:
        vocab (Dict): A dictionary of characters and their corresponding indices.
        pad (str): Padding token, or None if there is none.
        blank (str): Blank token, or None if there is none.
        bos (str): Beginning-of-sequence token, or None if there is none.
        eos (str): End-of-sequence token, or None if there is none.
    """

    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
        self.vocab = vocab
        self.pad = pad
        self.blank = blank
        self.bos = bos
        self.eos = eos

    @property
    def pad_id(self) -> int:
        """Return the index of the padding character. If the padding character is not specified, return the length
        of the vocabulary."""
        return self.char_to_id(self.pad) if self.pad else self.num_chars

    @property
    def blank_id(self) -> int:
        """Return the index of the blank character. If the blank character is not specified, return the length of
        the vocabulary."""
        return self.char_to_id(self.blank) if self.blank else self.num_chars

    @property
    def bos_id(self) -> int:
        """Return the index of the bos character. If the bos character is not specified, return the length of the
        vocabulary."""
        return self.char_to_id(self.bos) if self.bos else self.num_chars

    @property
    def eos_id(self) -> int:
        """Return the index of the eos character. If the eos character is not specified, return the length of the
        vocabulary."""
        return self.char_to_id(self.eos) if self.eos else self.num_chars

    @property
    def vocab(self):
        """Return the vocabulary dictionary."""
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Set the vocabulary dictionary and character mapping dictionaries."""
        self._vocab, self._char_to_id, self._id_to_char = None, None, None
        if vocab is not None:
            self._vocab = vocab
            self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
            self._id_to_char = dict(enumerate(self._vocab))

    @staticmethod
    def init_from_config(config, **kwargs):
        """Initialize from the given config.

        Uses ``config.characters.vocab_dict`` when it is set and non-empty;
        otherwise builds the vocabulary from ``kwargs``.

        Returns:
            tuple: ``(BaseVocabulary, config)``; the config is returned unchanged.
        """
        chars_cfg = getattr(config, "characters", None)
        # Attribute access works for plain dataclasses as well as for config
        # objects that also implement the mapping protocol; ``in`` does not.
        vocab_dict = getattr(chars_cfg, "vocab_dict", None) if chars_cfg is not None else None
        if vocab_dict:
            return (
                BaseVocabulary(
                    vocab_dict,
                    chars_cfg.pad,
                    chars_cfg.blank,
                    chars_cfg.bos,
                    chars_cfg.eos,
                ),
                config,
            )
        return BaseVocabulary(**kwargs), config

    def to_config(self):
        """Export this vocabulary as a ``CharactersConfig``."""
        return CharactersConfig(
            vocab_dict=self._vocab,
            pad=self.pad,
            eos=self.eos,
            bos=self.bos,
            blank=self.blank,
            is_unique=False,
            is_sorted=False,
        )

    @property
    def num_chars(self):
        """Return number of tokens in the vocabulary (0 when no vocabulary is set)."""
        return len(self._vocab) if self._vocab is not None else 0

    def char_to_id(self, char: str) -> int:
        """Map a character to an token ID.

        Raises:
            KeyError: if ``char`` is not in the vocabulary.
        """
        try:
            return self._char_to_id[char]
        except KeyError as e:
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        """Map an token ID to a character."""
        return self._id_to_char[idx]
365
+
366
+
367
class BaseCharacters:
    """Character set that maps characters, punctuation and special tokens to IDs.

    The vocabulary is laid out as ``[pad, eos, bos, blank] + characters +
    punctuations``; special tokens that are None or empty are left out. It is
    rebuilt whenever one of the constituent properties is assigned.

    Args:
        characters (str): Characters of the vocabulary. None is treated as empty.
        punctuations (str): Punctuation characters, appended after the characters.
            None is treated as empty.
        pad (str): Padding token.
        eos (str): End-of-sequence token.
        bos (str): Beginning-of-sequence token.
        blank (str): Blank token.
        is_unique (bool): Drop duplicate characters and assert that the final
            vocabulary has no duplicates.
        is_sorted (bool): Sort the characters before building the vocabulary.
    """

    # Constructor arguments that can be read off a characters config.
    _CONFIG_FIELDS = ("characters", "punctuations", "pad", "eos", "bos", "blank", "is_unique", "is_sorted")

    def __init__(
        self,
        characters: str = None,
        punctuations: str = None,
        pad: str = None,
        eos: str = None,
        bos: str = None,
        blank: str = None,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        self._characters = characters
        self._punctuations = punctuations
        self._pad = pad
        self._eos = eos
        self._bos = bos
        self._blank = blank
        self.is_unique = is_unique
        self.is_sorted = is_sorted
        self._create_vocab()

    @property
    def pad_id(self) -> int:
        """ID of the pad token, or ``len(vocab)`` when no pad token is set."""
        return self.char_to_id(self.pad) if self.pad else len(self.vocab)

    @property
    def blank_id(self) -> int:
        """ID of the blank token, or ``len(vocab)`` when no blank token is set."""
        return self.char_to_id(self.blank) if self.blank else len(self.vocab)

    @property
    def eos_id(self) -> int:
        """ID of the eos token, or ``len(vocab)`` when no eos token is set."""
        return self.char_to_id(self.eos) if self.eos else len(self.vocab)

    @property
    def bos_id(self) -> int:
        """ID of the bos token, or ``len(vocab)`` when no bos token is set."""
        return self.char_to_id(self.bos) if self.bos else len(self.vocab)

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, characters):
        self._characters = characters
        self._create_vocab()

    @property
    def punctuations(self):
        return self._punctuations

    @punctuations.setter
    def punctuations(self, punctuations):
        self._punctuations = punctuations
        self._create_vocab()

    @property
    def pad(self):
        return self._pad

    @pad.setter
    def pad(self, pad):
        self._pad = pad
        self._create_vocab()

    @property
    def eos(self):
        return self._eos

    @eos.setter
    def eos(self, eos):
        self._eos = eos
        self._create_vocab()

    @property
    def bos(self):
        return self._bos

    @bos.setter
    def bos(self, bos):
        self._bos = bos
        self._create_vocab()

    @property
    def blank(self):
        return self._blank

    @blank.setter
    def blank(self, blank):
        self._blank = blank
        self._create_vocab()

    @property
    def vocab(self):
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Store ``vocab`` and rebuild both lookup tables from its order."""
        self._vocab = vocab
        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self._id_to_char = dict(enumerate(self.vocab))

    @property
    def num_chars(self):
        """Number of entries in the vocabulary."""
        return len(self._vocab)

    def _create_vocab(self):
        """Rebuild the vocabulary and lookup tables from the current fields."""
        # None is treated as "no symbols" so that BaseCharacters() is constructible.
        chars = list(self._characters) if self._characters is not None else []
        if self.is_unique:
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # resulting IDs do not depend on set iteration order.
            chars = list(dict.fromkeys(chars))
        if self.is_sorted:
            chars = sorted(chars)
        specials = [
            token
            for token in (self._pad, self._eos, self._bos, self._blank)
            if token is not None and len(token) > 0
        ]
        puncts = list(self._punctuations) if self._punctuations is not None else []
        self.vocab = specials + chars + puncts
        if self.is_unique:
            # A duplicate collapses into one key, making the table shorter than the list.
            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
            assert (
                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
            ), f" [!] There are duplicate characters in the character set. {duplicates}"

    def char_to_id(self, char: str) -> int:
        """Return the ID of ``char``.

        Raises:
            KeyError: if ``char`` is not in the vocabulary.
        """
        try:
            return self._char_to_id[char]
        except KeyError as e:
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        """Return the character stored at ID ``idx``."""
        return self._id_to_char[idx]

    def print_log(self, level: int = 0):
        """
        Prints the vocabulary in a nice format.
        """
        indent = "\t" * level
        print(f"{indent}| > Characters: {self._characters}")
        print(f"{indent}| > Punctuations: {self._punctuations}")
        print(f"{indent}| > Pad: {self._pad}")
        print(f"{indent}| > EOS: {self._eos}")
        print(f"{indent}| > BOS: {self._bos}")
        print(f"{indent}| > Blank: {self._blank}")
        print(f"{indent}| > Vocab: {self.vocab}")
        print(f"{indent}| > Num chars: {self.num_chars}")

    @staticmethod
    def _kwargs_from_config(chars_cfg) -> dict:
        """Collect the constructor arguments present on a characters config.

        Accepts a mapping or an attribute-style object such as a dataclass.
        Entries that the constructor does not take are ignored.
        """
        fields = BaseCharacters._CONFIG_FIELDS
        if isinstance(chars_cfg, dict):
            return {name: chars_cfg[name] for name in fields if name in chars_cfg}
        return {name: getattr(chars_cfg, name) for name in fields if hasattr(chars_cfg, name)}

    @staticmethod
    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
        """Init your character class from a config.

        Implement this method for your subclass.

        Returns:
            tuple: ``(characters, config)``. When ``config.characters`` is None, a
            default character set is built and a copy of the config carrying it
            is returned instead of the original.
        """
        # use character set from config
        if config.characters is not None:
            return BaseCharacters(**BaseCharacters._kwargs_from_config(config.characters)), config
        # return default character set
        characters = BaseCharacters()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config

    def to_config(self) -> "CharactersConfig":
        """Export the current settings as a ``CharactersConfig``."""
        return CharactersConfig(
            characters=self._characters,
            punctuations=self._punctuations,
            pad=self._pad,
            eos=self._eos,
            bos=self._bos,
            blank=self._blank,
            is_unique=self.is_unique,
            is_sorted=self.is_sorted,
        )
543
+
544
+
545
class IPAPhonemes(BaseCharacters):
    """Character set whose defaults are the module's IPA phoneme inventory.

    Takes the same arguments as ``BaseCharacters``; only the default values differ.
    """

    def __init__(
        self,
        characters: str = _phonemes,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Init a IPAPhonemes object from a model config

        If characters are not defined in the config, it will be set to the default characters and the config
        will be updated.

        ``config.characters`` may be a mapping or an attribute-style object such
        as a dataclass.

        Returns:
            tuple: ``(IPAPhonemes, config)``.
        """
        chars_cfg = getattr(config, "characters", None)
        if chars_cfg is not None:
            is_mapping = isinstance(chars_cfg, dict)

            def read(name):
                # Same lookup for mapping and attribute-style configs.
                return chars_cfg[name] if is_mapping else getattr(chars_cfg, name)

            # band-aid for compatibility with old models
            phonemes = chars_cfg.get("phonemes") if is_mapping else getattr(chars_cfg, "phonemes", None)
            if phonemes is not None:
                # The phoneme set replaces the character set on the config itself.
                if is_mapping:
                    chars_cfg["characters"] = phonemes
                else:
                    setattr(chars_cfg, "characters", phonemes)
            return (
                IPAPhonemes(
                    characters=read("characters"),
                    punctuations=read("punctuations"),
                    pad=read("pad"),
                    eos=read("eos"),
                    bos=read("bos"),
                    blank=read("blank"),
                    is_unique=read("is_unique"),
                    is_sorted=read("is_sorted"),
                ),
                config,
            )
        # return default character set
        characters = IPAPhonemes()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config
592
+
593
+
594
class Graphemes(BaseCharacters):
    """Character set whose defaults are the module's grapheme inventory.

    Takes the same arguments as ``BaseCharacters``; only the default values differ.
    """

    def __init__(
        self,
        characters: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Init a Graphemes object from a model config

        If characters are not defined in the config, it will be set to the default characters and the config
        will be updated.

        ``config.characters`` may be a mapping or an attribute-style object such
        as a dataclass. Only the arguments this class accepts are read from it,
        so extra entries (e.g. ``phonemes``) are ignored.

        Returns:
            tuple: ``(Graphemes, config)``.
        """
        chars_cfg = config.characters
        if chars_cfg is not None:
            wanted = ("characters", "punctuations", "pad", "eos", "bos", "blank", "is_unique", "is_sorted")
            if isinstance(chars_cfg, dict):
                kwargs = {name: chars_cfg[name] for name in wanted if name in chars_cfg}
            else:
                kwargs = {name: getattr(chars_cfg, name) for name in wanted if hasattr(chars_cfg, name)}
            return Graphemes(**kwargs), config
        characters = Graphemes()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config
637
+
638
+
639
if __name__ == "__main__":
    # Build both default character sets first, then print each one's vocabulary.
    for charset in (Graphemes(), IPAPhonemes()):
        charset.print_log()
644
+
645
+
646
class VitsCharacters(BaseCharacters):
    """Characters class for VITs model for compatibility with pre-trained models

    The vocabulary layout is ``[pad] + punctuations + characters + [blank]``.
    Characters are kept in the order given; the ``is_sorted`` flag passed to
    the base class is not applied by this class's ``_create_vocab``.
    """

    def __init__(
        self,
        graphemes: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        ipa_characters: str = _phonemes,
    ) -> None:
        # IPA symbols, when supplied, go after the graphemes.
        if ipa_characters is not None:
            graphemes += ipa_characters
        super().__init__(graphemes, punctuations, pad, None, None, "<BLNK>", is_unique=False, is_sorted=True)

    def _create_vocab(self):
        """Build the vocabulary list and both lookup tables."""
        ordered = [self._pad]
        ordered.extend(list(self._punctuations))
        ordered.extend(list(self._characters))
        ordered.append(self._blank)
        self._vocab = ordered
        # On a repeated symbol the later index wins in the char -> id table.
        self._char_to_id = dict(zip(ordered, range(len(ordered))))
        self._id_to_char = dict(enumerate(ordered))

    @staticmethod
    def init_from_config(config):
        """Build a ``VitsCharacters`` from ``config.characters``.

        Returns:
            tuple: ``(VitsCharacters, config)``; the config is returned unchanged.
        """
        chars_cfg = config.characters
        pad_token = chars_cfg.pad
        punctuation_set = chars_cfg.punctuations
        letters = chars_cfg.characters
        ipa_letters = chars_cfg.phonemes
        charset = VitsCharacters(
            graphemes=letters,
            ipa_characters=ipa_letters,
            punctuations=punctuation_set,
            pad=pad_token,
        )
        return charset, config

    def to_config(self) -> "CharactersConfig":
        """Export the current settings as a ``CharactersConfig`` (eos/bos are always None)."""
        exported = dict(
            characters=self._characters,
            punctuations=self._punctuations,
            pad=self._pad,
            eos=None,
            bos=None,
            blank=self._blank,
            is_unique=False,
            is_sorted=True,
        )
        return CharactersConfig(**exported)
688
+
689
class TTSTokenizer:
    """Turn raw text into the token-ID sequence fed to the model.

    Args:
        text_cleaner (Callable): Function applied to the text before encoding.
            When None, the text is encoded as given.
        characters (BaseCharacters): Character set providing ``char_to_id`` and
            the pad/blank tokens.

    Attributes:
        not_found_characters (list): Characters seen by ``encode`` that are not
            in the vocabulary, each recorded once.
    """

    def __init__(
        self,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
    ):
        self.text_cleaner = text_cleaner
        self.characters = characters
        self.not_found_characters = []

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        """Store the character set and cache its pad/blank IDs (None when unset)."""
        self._characters = new_characters
        if new_characters is None:
            # Without a character set there is nothing to look up.
            self.pad_id = None
            self.blank_id = None
            return
        self.pad_id = new_characters.char_to_id(new_characters.pad) if new_characters.pad else None
        self.blank_id = new_characters.char_to_id(new_characters.blank) if new_characters.blank else None

    def encode(self, text: str) -> List[int]:
        """Encodes a string of text as a sequence of IDs."""
        token_ids = []
        for char in text:
            try:
                idx = self.characters.char_to_id(char)
                token_ids.append(idx)
            except KeyError:
                # discard but store not found characters
                if char not in self.not_found_characters:
                    self.not_found_characters.append(char)
                    print(text)
                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
        return token_ids

    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
        """Clean ``text``, encode it, and intersperse the blank token ID.

        Args:
            text (str): Input text.
            language (str): Unused.

        Returns:
            List[int]: Token IDs with the blank ID before, between and after them.
        """
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
        token_ids = self.encode(text)
        return self.intersperse_blank_char(token_ids, True)

    def pad_with_bos_eos(self, char_sequence: List[str]):
        """Pads a sequence with the special BOS and EOS characters."""
        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]

    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
        """Intersperses the blank character between characters in a sequence.

        Use the ```blank``` character if defined else use the ```pad``` character.
        """
        # Always an ID, never the token string, so the result stays a list of ints.
        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad_id
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    @staticmethod
    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
        """Build a tokenizer that uses ``multilingual_cleaners`` and ``VitsCharacters``.

        NOTE(review): the ``characters`` argument is ignored; the character set
        is always rebuilt from ``config``.

        Returns:
            tuple: ``(TTSTokenizer, config)``, where the config's
            ``characters.characters_class`` has been set to ``VitsCharacters``.
        """
        text_cleaner = multilingual_cleaners
        CharactersClass = VitsCharacters
        characters, new_config = CharactersClass.init_from_config(config)
        # new_config.characters.characters_class = get_import_path(characters)
        new_config.characters.characters_class = VitsCharacters
        return TTSTokenizer(text_cleaner, characters), new_config
753
+
754
+
755
def multilingual_cleaners(text):
    """Pipeline for multilingual text"""
    # Order: lowercase -> symbol replacement (no language) -> aux-symbol removal -> whitespace collapse.
    cleaned = remove_aux_symbols(replace_symbols(lowercase(text), lang=None))
    return collapse_whitespace(cleaned)
762
+
763
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
765
+
766
def collapse_whitespace(text):
    """Replace every match of ``_whitespace_re`` with one space and trim both ends."""
    squeezed = re.sub(_whitespace_re, " ", text)
    return squeezed.strip()
768
+
769
def replace_symbols(text, lang="en"):
    """Normalise a few punctuation symbols, with language-specific handling of ``&``.

    ``;`` and ``:`` become ``,``. ``-`` becomes a space, or is removed when
    ``lang`` is ``"ca"``. ``&`` is spelled out for ``en``/``fr``/``pt``/``ca``
    and left alone for any other ``lang``. For ``"ca"`` apostrophes are removed.
    """
    ampersand_words = {"en": " and ", "fr": " et ", "pt": " e ", "ca": " i "}
    is_catalan = lang == "ca"

    text = text.replace(";", ",")
    text = text.replace("-", "" if is_catalan else " ")
    text = text.replace(":", ",")
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])
    if is_catalan:
        text = text.replace("'", "")
    return text
784
+
785
def remove_aux_symbols(text):
    """Strip angle brackets, parentheses, square brackets and double quotes from ``text``."""
    aux_symbols = re.compile(r"[\<\>\(\)\[\]\"]+")
    return aux_symbols.sub("", text)
en_female/jit_infer.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
import torch
import numpy as np
import soundfile as sf

# English female voice: read the character inventory that defines the vocabulary.
# Read as UTF-8 explicitly so the result does not depend on the platform's locale encoding.
with open("chars.txt", "r", encoding="utf-8") as f:
    letters = f.read().strip("\n")
model = "en_female_vits_30hrs.pt"
# text = " হলেও আমাদের সবার সার্বিক শৃঙ্খলা বোধের উন্নতি হবে"
text = "My name is g p t, chat g p t"

config = VitsConfig(
    text_cleaner="multilingual_cleaners",
    characters=CharactersConfig(
        characters_class=VitsCharacters,
        pad="<PAD>",
        eos="<EOS>",
        bos="<BOS>",
        blank="<BLNK>",
        characters=letters,
        punctuations="!¡'(),-.:;¿? ",
        phonemes=None,
    ),
)
tokenizer, config = TTSTokenizer.init_from_config(config)

# Token IDs as a (1, num_tokens) tensor. The dtype is pinned to int64 because
# NumPy's default integer width is platform dependent.
# NOTE(review): presumably the scripted model expects int64 indices - confirm.
x = tokenizer.text_to_ids(text)
x = torch.from_numpy(np.array(x, dtype=np.int64)).unsqueeze(0)

# Load the TorchScript model and run it without tracking gradients.
net = torch.jit.load(model)
with torch.no_grad():
    out2 = net(x)

# 22050 is also the default of VitsAudioConfig.sample_rate.
sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)
en_male/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
en_male/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ ---
en_male/chars.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ pqw'"sgufmxre?d!lcab,zk.iytoh jvn
en_male/en_male_vits_30hrs.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffa1099438a58c8a13e437d39ec304b530644156ef445032e64422d83e558666
3
+ size 333224012
en_male/extra.py ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, List, Union
2
+ from dataclasses import asdict, dataclass, field
3
+
4
+
5
+ import re
6
+ from dataclasses import replace
7
+ from typing import Dict
8
+ _whitespace_re = re.compile(r"\s+")
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import List
12
+
13
+ # from TTS.tts.configs.shared_configs import BaseTTSConfig
14
+ # from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
15
+
16
@dataclass
class CharactersConfig():
    """Settings that describe a character set / vocabulary.

    All fields are optional. ``vocab_dict`` is the field read by
    ``BaseVocabulary``; the remaining fields are the ones used by the
    ``BaseCharacters`` family.
    """

    # Annotated as str, but the field is also assigned a class object elsewhere in this file.
    characters_class: str = None

    # using BaseVocabulary
    vocab_dict: Dict = None

    # using on BaseCharacters
    pad: str = None
    eos: str = None
    bos: str = None
    blank: str = None
    characters: str = None
    punctuations: str = None
    phonemes: str = None
    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
    is_sorted: bool = True
34
+
35
+
36
@dataclass
class BaseTTSConfig():
    """Base dataclass of TTS settings shared by model configs.

    Every field has a default. NOTE(review): presumably a trimmed stand-in for
    ``TTS.tts.configs.shared_configs.BaseTTSConfig`` (see the commented-out
    import above) - confirm against the upstream definition.
    """

    # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    # phoneme settings
    use_phonemes: bool = False
    phonemizer: str = None
    phoneme_language: str = None
    compute_input_seq_cache: bool = False
    text_cleaner: str = None
    enable_eos_bos_chars: bool = False
    test_sentences_file: str = ""
    phoneme_cache_path: str = None
    # vocabulary parameters
    characters: CharactersConfig = None
    add_blank: bool = False
    # training params
    batch_group_size: int = 0
    loss_masking: bool = None
    # dataloading
    min_audio_len: int = 1
    max_audio_len: int = float("inf")  # default is a float infinity despite the int annotation
    min_text_len: int = 1
    max_text_len: int = float("inf")  # default is a float infinity despite the int annotation
    compute_f0: bool = False
    compute_energy: bool = False
    compute_linear_spec: bool = False
    precompute_num_workers: int = 0
    use_noise_augment: bool = False
    start_by_longest: bool = False
    shuffle: bool = False
    drop_last: bool = False
    # dataset
    datasets: str = None
    # optimizer
    optimizer: str = "radam"
    optimizer_params: dict = None
    # scheduler
    lr_scheduler: str = None
    lr_scheduler_params: dict = field(default_factory=lambda: {})
    # testing
    test_sentences: List[str] = field(default_factory=lambda: [])
    # evaluation
    eval_split_max_size: int = None
    eval_split_size: float = 0.01
    # weighted samplers
    use_speaker_weighted_sampler: bool = False
    speaker_weighted_sampler_alpha: float = 1.0
    use_language_weighted_sampler: bool = False
    language_weighted_sampler_alpha: float = 1.0
    use_length_weighted_sampler: bool = False
    length_weighted_sampler_alpha: float = 1.0
88
+
89
+
90
@dataclass
class VitsAudioConfig():
    """Audio settings for VITS; every field has a default.

    NOTE(review): presumably mirrors ``TTS.tts.models.vits.VitsAudioConfig``
    (see the commented-out import above) - confirm against upstream.
    """

    fft_size: int = 1024
    sample_rate: int = 22050
    win_length: int = 1024
    hop_length: int = 256
    num_mels: int = 80
    mel_fmin: int = 0
    mel_fmax: int = None  # default is None despite the int annotation
99
+
100
@dataclass
class VitsArgs():
    """Container of VITS model hyper-parameters; every field has a default.

    NOTE(review): presumably mirrors ``TTS.tts.models.vits.VitsArgs`` (see the
    commented-out import above); field semantics are not established by this
    file - confirm against upstream before relying on them.
    """

    num_chars: int = 100
    out_channels: int = 513
    spec_segment_size: int = 32
    hidden_channels: int = 192
    # fields suffixed *_text_encoder
    hidden_channels_ffn_text_encoder: int = 768
    num_heads_text_encoder: int = 2
    num_layers_text_encoder: int = 6
    kernel_size_text_encoder: int = 3
    dropout_p_text_encoder: float = 0.1
    dropout_p_duration_predictor: float = 0.5
    # fields suffixed *_posterior_encoder
    kernel_size_posterior_encoder: int = 5
    dilation_rate_posterior_encoder: int = 1
    num_layers_posterior_encoder: int = 16
    # fields suffixed *_flow
    kernel_size_flow: int = 5
    dilation_rate_flow: int = 1
    num_layers_flow: int = 4
    # fields suffixed *_decoder
    resblock_type_decoder: str = "1"
    resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
    resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
    upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
    upsample_initial_channel_decoder: int = 512
    upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
    periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
    use_sdp: bool = True
    noise_scale: float = 1.0
    inference_noise_scale: float = 0.667
    length_scale: float = 1
    noise_scale_dp: float = 1.0
    inference_noise_scale_dp: float = 1.0
    max_inference_len: int = None
    init_discriminator: bool = True
    # Field name is misspelled ("disriminator"); it is part of the interface, so do not rename.
    use_spectral_norm_disriminator: bool = False
    # speaker / d-vector settings
    use_speaker_embedding: bool = False
    num_speakers: int = 0
    speakers_file: str = None
    d_vector_file: List[str] = None
    speaker_embedding_channels: int = 256
    use_d_vector_file: bool = False
    d_vector_dim: int = 0
    detach_dp_input: bool = True
    # language settings
    use_language_embedding: bool = False
    embedded_language_dim: int = 4
    num_languages: int = 0
    language_ids_file: str = None
    use_speaker_encoder_as_loss: bool = False
    speaker_encoder_config_path: str = ""
    speaker_encoder_model_path: str = ""
    condition_dp_on_speaker: bool = True
    # freeze_* flags
    freeze_encoder: bool = False
    freeze_DP: bool = False
    freeze_PE: bool = False
    freeze_flow_decoder: bool = False
    freeze_waveform_decoder: bool = False
    encoder_sample_rate: int = None
    interpolate_z: bool = True
    # reinit_* flags
    reinit_DP: bool = False
    reinit_text_encoder: bool = False
159
@dataclass
class VitsConfig(BaseTTSConfig):
    """Dataclass of VITS settings layered on top of ``BaseTTSConfig``.

    Holds the nested ``model_args`` / ``audio`` sub-configs plus optimizer,
    loss-weight, data-loader, sampler, test-sentence and multi-speaker fields.
    Every field declared here has a default value.
    """

    model: str = "vits"
    # model specific params
    model_args: VitsArgs = field(default_factory=VitsArgs)
    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)

    # optimizer
    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
    lr_gen: float = 0.0002
    lr_disc: float = 0.0002
    lr_scheduler_gen: str = "ExponentialLR"
    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    lr_scheduler_disc: str = "ExponentialLR"
    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    scheduler_after_epoch: bool = True
    optimizer: str = "AdamW"  # replaces the base-class default "radam"
    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})

    # loss params
    kl_loss_alpha: float = 1.0
    disc_loss_alpha: float = 1.0
    gen_loss_alpha: float = 1.0
    feat_loss_alpha: float = 1.0
    mel_loss_alpha: float = 45.0
    dur_loss_alpha: float = 1.0
    speaker_encoder_loss_alpha: float = 1.0

    # data loader params
    return_wav: bool = True
    compute_linear_spec: bool = True  # base-class default is False

    # sampler params
    use_weighted_sampler: bool = False  # TODO: move it to the base config
    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})

    # overrides
    r: int = 1  # DO NOT CHANGE
    add_blank: bool = True  # base-class default is False

    # testing
    # Each entry is a one-element list holding the sentence text.
    test_sentences: List[List] = field(
        default_factory=lambda: [
            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
            ["Be a voice, not an echo."],
            ["I'm sorry Dave. I'm afraid I can't do that."],
            ["This cake is great. It's so delicious and moist."],
            ["Prior to November 22, 1963."],
        ]
    )

    # multi-speaker settings
    # use speaker embedding layer
    num_speakers: int = 0
    use_speaker_embedding: bool = False
    speakers_file: str = None
    speaker_embedding_channels: int = 256
    language_ids_file: str = None
    use_language_embedding: bool = False

    # use d-vectors
    use_d_vector_file: bool = False
    d_vector_file: List[str] = None
    d_vector_dim: int = None  # default is None despite the int annotation

    def __post_init__(self):
        # Intentionally a no-op. The disabled code below would have copied
        # entries of ``model_args`` onto same-named attributes of this config.
        pass
        # for key, val in self.model_args.items():
        #     if hasattr(self, key):
        #         self[key] = val
231
+
232
+
233
+
234
+
235
+
236
def parse_symbols():
    """Return the module's default symbol sets keyed by role.

    Keys, in order: ``pad``, ``eos``, ``bos``, ``characters``, ``punctuations``, ``phonemes``.
    The values are read from the module-level defaults at call time.
    """
    symbols = dict(
        pad=_pad,
        eos=_eos,
        bos=_bos,
        characters=_characters,
        punctuations=_punctuations,
        phonemes=_phonemes,
    )
    return symbols
245
+
246
+
247
# DEFAULT SET OF GRAPHEMES
# Special tokens used as defaults by the character classes below.
_pad = "<PAD>"
_eos = "<EOS>"
_bos = "<BOS>"
_blank = "<BLNK>"  # TODO: check if we need this alongside with PAD
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_punctuations = "!'(),-.:;? "


# DEFAULT SET OF IPA PHONEMES
# Phonemes definition (All IPA characters)
_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
_suprasegmentals = "ˈˌːˑ"
_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
# NOTE: the name is misspelled ("diacrilics"); it is only referenced on the next line.
_diacrilics = "ɚ˞ɫ"
# Full default phoneme inventory: the groups above concatenated in this fixed order.
_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
265
+
266
+
267
class BaseVocabulary:
    """Base Vocabulary class.

    This class only needs a vocabulary dictionary without specifying the characters.

    Args:
        vocab (Dict): A dictionary of characters and their corresponding indices.
        pad (str): Padding token, or None when there is none.
        blank (str): Blank token, or None when there is none.
        bos (str): Beginning-of-sequence token, or None when there is none.
        eos (str): End-of-sequence token, or None when there is none.
    """

    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
        self.vocab = vocab
        self.pad = pad
        self.blank = blank
        self.bos = bos
        self.eos = eos

    @property
    def pad_id(self) -> int:
        """Return the index of the padding character. If the padding character is not specified, return the length
        of the vocabulary."""
        return self.char_to_id(self.pad) if self.pad else self.num_chars

    @property
    def blank_id(self) -> int:
        """Return the index of the blank character. If the blank character is not specified, return the length of
        the vocabulary."""
        return self.char_to_id(self.blank) if self.blank else self.num_chars

    @property
    def bos_id(self) -> int:
        """Return the index of the bos character. If the bos character is not specified, return the length of the
        vocabulary."""
        return self.char_to_id(self.bos) if self.bos else self.num_chars

    @property
    def eos_id(self) -> int:
        """Return the index of the eos character. If the eos character is not specified, return the length of the
        vocabulary."""
        return self.char_to_id(self.eos) if self.eos else self.num_chars

    @property
    def vocab(self):
        """Return the vocabulary dictionary."""
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Set the vocabulary dictionary and character mapping dictionaries."""
        self._vocab, self._char_to_id, self._id_to_char = None, None, None
        if vocab is not None:
            self._vocab = vocab
            # IDs follow the iteration order of ``vocab`` (for a dict: key insertion
            # order); the dict's own values are not consulted.
            self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
            self._id_to_char = dict(enumerate(self._vocab))

    @staticmethod
    def init_from_config(config, **kwargs):
        """Initialize from the given config.

        Uses ``config.characters.vocab_dict`` when it is present and non-empty;
        otherwise builds the vocabulary from ``kwargs``.

        Returns:
            Tuple of the new ``BaseVocabulary`` and the unchanged ``config``.
        """
        characters = config.characters
        # getattr instead of ``"vocab_dict" in characters``: the membership test raises
        # TypeError when the section is a plain dataclass rather than a mapping-like config.
        if characters is not None and getattr(characters, "vocab_dict", None):
            return (
                BaseVocabulary(
                    characters.vocab_dict,
                    characters.pad,
                    characters.blank,
                    characters.bos,
                    characters.eos,
                ),
                config,
            )
        return BaseVocabulary(**kwargs), config

    def to_config(self):
        """Export the vocabulary and special tokens as a ``CharactersConfig``."""
        return CharactersConfig(
            vocab_dict=self._vocab,
            pad=self.pad,
            eos=self.eos,
            bos=self.bos,
            blank=self.blank,
            is_unique=False,
            is_sorted=False,
        )

    @property
    def num_chars(self):
        """Return number of tokens in the vocabulary (0 when no vocabulary is set)."""
        return len(self._vocab) if self._vocab is not None else 0

    def char_to_id(self, char: str) -> int:
        """Map a character to a token ID.

        Raises:
            KeyError: if ``char`` is not in the vocabulary.
        """
        try:
            return self._char_to_id[char]
        except KeyError as e:
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        """Map a token ID to a character."""
        return self._id_to_char[idx]
365
+
366
+
367
class BaseCharacters:
    """Character set that maps symbols to integer IDs and back.

    The vocabulary is laid out as ``[pad, eos, bos, blank] + characters + punctuations``;
    special tokens that are ``None`` or empty are left out.

    Args:
        characters (str): Main symbols. ``None`` is treated as empty.
        punctuations (str): Punctuation symbols, appended after the characters. ``None`` is treated as empty.
        pad (str): Padding token.
        eos (str): End-of-sequence token.
        bos (str): Beginning-of-sequence token.
        blank (str): Blank token.
        is_unique (bool): De-duplicate ``characters`` and assert the final vocabulary has no duplicates.
        is_sorted (bool): Sort ``characters`` before building the vocabulary.
    """

    # Constructor arguments that may be read from a config section.
    _CONFIG_FIELDS = ("characters", "punctuations", "pad", "eos", "bos", "blank", "is_unique", "is_sorted")

    def __init__(
        self,
        characters: str = None,
        punctuations: str = None,
        pad: str = None,
        eos: str = None,
        bos: str = None,
        blank: str = None,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        self._characters = characters
        self._punctuations = punctuations
        self._pad = pad
        self._eos = eos
        self._bos = bos
        self._blank = blank
        self.is_unique = is_unique
        self.is_sorted = is_sorted
        self._create_vocab()

    @property
    def pad_id(self) -> int:
        """ID of the pad token, or ``len(vocab)`` when no pad token is set."""
        return self.char_to_id(self.pad) if self.pad else len(self.vocab)

    @property
    def blank_id(self) -> int:
        """ID of the blank token, or ``len(vocab)`` when no blank token is set."""
        return self.char_to_id(self.blank) if self.blank else len(self.vocab)

    @property
    def eos_id(self) -> int:
        """ID of the eos token, or ``len(vocab)`` when no eos token is set."""
        return self.char_to_id(self.eos) if self.eos else len(self.vocab)

    @property
    def bos_id(self) -> int:
        """ID of the bos token, or ``len(vocab)`` when no bos token is set."""
        return self.char_to_id(self.bos) if self.bos else len(self.vocab)

    # Every setter below rebuilds the vocabulary so the ID maps stay in sync.
    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, characters):
        self._characters = characters
        self._create_vocab()

    @property
    def punctuations(self):
        return self._punctuations

    @punctuations.setter
    def punctuations(self, punctuations):
        self._punctuations = punctuations
        self._create_vocab()

    @property
    def pad(self):
        return self._pad

    @pad.setter
    def pad(self, pad):
        self._pad = pad
        self._create_vocab()

    @property
    def eos(self):
        return self._eos

    @eos.setter
    def eos(self, eos):
        self._eos = eos
        self._create_vocab()

    @property
    def bos(self):
        return self._bos

    @bos.setter
    def bos(self, bos):
        self._bos = bos
        self._create_vocab()

    @property
    def blank(self):
        return self._blank

    @blank.setter
    def blank(self, blank):
        self._blank = blank
        self._create_vocab()

    @property
    def vocab(self):
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Store the vocabulary list and rebuild both lookup tables from its order."""
        self._vocab = vocab
        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self._id_to_char = dict(enumerate(self.vocab))

    @property
    def num_chars(self):
        """Number of entries in the vocabulary."""
        return len(self._vocab)

    def _create_vocab(self):
        """Rebuild the vocabulary and lookup tables from the current symbol sets."""
        # The constructor defaults are None; treat that as "no symbols" instead of crashing.
        symbols = self._characters if self._characters is not None else ""
        punctuations = self._punctuations if self._punctuations is not None else ""
        if self.is_unique:
            # dict.fromkeys keeps first-seen order; set() order for strings varies between
            # processes, which would assign different IDs when is_sorted is False.
            symbols = list(dict.fromkeys(symbols))
        if self.is_sorted:
            symbols = sorted(symbols)
        entries = list(symbols)
        # Prepend in this order so the final layout is pad, eos, bos, blank.
        for token in (self._blank, self._bos, self._eos, self._pad):
            if token is not None and len(token) > 0:
                entries = [token] + entries
        self.vocab = entries + list(punctuations)
        if self.is_unique:
            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
            assert (
                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
            ), f" [!] There are duplicate characters in the character set. {duplicates}"

    def char_to_id(self, char: str) -> int:
        """Map a symbol to its ID.

        Raises:
            KeyError: if ``char`` is not in the vocabulary.
        """
        try:
            return self._char_to_id[char]
        except KeyError as e:
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        """Map an ID back to its symbol."""
        return self._id_to_char[idx]

    def print_log(self, level: int = 0):
        """
        Prints the vocabulary in a nice format.
        """
        indent = "\t" * level
        print(f"{indent}| > Characters: {self._characters}")
        print(f"{indent}| > Punctuations: {self._punctuations}")
        print(f"{indent}| > Pad: {self._pad}")
        print(f"{indent}| > EOS: {self._eos}")
        print(f"{indent}| > BOS: {self._bos}")
        print(f"{indent}| > Blank: {self._blank}")
        print(f"{indent}| > Vocab: {self.vocab}")
        print(f"{indent}| > Num chars: {self.num_chars}")

    @staticmethod
    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
        """Init your character class from a config.

        Implement this method for your subclass.

        Returns:
            Tuple ``(characters, config)``. When ``config.characters`` is None a default
            instance is built and a copy of ``config`` carrying its settings is returned.
        """
        # use character set from config
        section = config.characters
        if section is not None:
            # Pick only the constructor arguments. Splatting the whole section fails on a
            # plain dataclass, and on sections carrying extra fields such as ``phonemes``.
            if isinstance(section, dict):
                kwargs = {key: section[key] for key in BaseCharacters._CONFIG_FIELDS if key in section}
            else:
                kwargs = {
                    key: getattr(section, key) for key in BaseCharacters._CONFIG_FIELDS if hasattr(section, key)
                }
            return BaseCharacters(**kwargs), config
        # return default character set
        characters = BaseCharacters()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config

    def to_config(self) -> "CharactersConfig":
        """Export the current settings as a ``CharactersConfig``."""
        return CharactersConfig(
            characters=self._characters,
            punctuations=self._punctuations,
            pad=self._pad,
            eos=self._eos,
            bos=self._bos,
            blank=self._blank,
            is_unique=self.is_unique,
            is_sorted=self.is_sorted,
        )
543
+
544
+
545
class IPAPhonemes(BaseCharacters):
    """Character set whose defaults are the module's IPA phonemes, punctuations and special tokens."""

    def __init__(
        self,
        characters: str = _phonemes,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Init a IPAPhonemes object from a model config

        If characters are not defined in the config, it will be set to the default characters and the config
        will be updated.

        Returns:
            Tuple ``(characters, config)``.
        """
        section = getattr(config, "characters", None)
        if section is not None:
            # Read and write through one accessor so the section may be a dict, a
            # mapping-like config or a plain dataclass; membership and item access
            # raise TypeError on the latter.
            is_dict = isinstance(section, dict)

            def read(key):
                return section.get(key) if is_dict else getattr(section, key, None)

            # band-aid for compatibility with old models
            phonemes = read("phonemes")
            if phonemes is not None:
                if is_dict:
                    section["characters"] = phonemes
                else:
                    setattr(section, "characters", phonemes)
            return (
                IPAPhonemes(
                    characters=read("characters"),
                    punctuations=read("punctuations"),
                    pad=read("pad"),
                    eos=read("eos"),
                    bos=read("bos"),
                    blank=read("blank"),
                    is_unique=read("is_unique"),
                    is_sorted=read("is_sorted"),
                ),
                config,
            )
        # return default character set
        characters = IPAPhonemes()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config
592
+
593
+
594
class Graphemes(BaseCharacters):
    """Character set whose defaults are the module's Latin letters, punctuations and special tokens."""

    def __init__(
        self,
        characters: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Init a Graphemes object from a model config

        If characters are not defined in the config, it will be set to the default characters and the config
        will be updated.

        Returns:
            Tuple ``(characters, config)``.
        """
        section = config.characters
        if section is None:
            characters = Graphemes()
            new_config = replace(config, characters=characters.to_config())
            return characters, new_config

        # Support dict sections as well as attribute-style ones; a plain dataclass
        # supports neither ``in`` nor ``[...]``.
        is_dict = isinstance(section, dict)
        has_phonemes = ("phonemes" in section) if is_dict else hasattr(section, "phonemes")
        if not has_phonemes:
            return Graphemes(**section), config

        def read(key):
            return section[key] if is_dict else getattr(section, key)

        # band-aid for compatibility with old models: pass only the constructor
        # arguments so the extra ``phonemes`` entry is not forwarded.
        return (
            Graphemes(
                characters=read("characters"),
                punctuations=read("punctuations"),
                pad=read("pad"),
                eos=read("eos"),
                bos=read("bos"),
                blank=read("blank"),
                is_unique=read("is_unique"),
                is_sorted=read("is_sorted"),
            ),
            config,
        )
637
+
638
+
639
# Manual smoke check: when the module is run as a script, build the default
# grapheme and IPA character sets and print their vocabularies.
# NOTE(review): this guard sits mid-module, so it runs before the definitions
# that follow it are created; it only uses classes defined above.
if __name__ == "__main__":
    gr = Graphemes()
    ph = IPAPhonemes()
    gr.print_log()
    ph.print_log()
644
+
645
+
646
class VitsCharacters(BaseCharacters):
    """Characters class for VITs model for compatibility with pre-trained models"""

    def __init__(
        self,
        graphemes: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        ipa_characters: str = _phonemes,
    ) -> None:
        # Graphemes and (optional) IPA symbols form one combined symbol string.
        if ipa_characters is not None:
            graphemes += ipa_characters
        super().__init__(
            graphemes,
            punctuations,
            pad,
            None,
            None,
            "<BLNK>",
            is_unique=False,
            is_sorted=True,
        )

    def _create_vocab(self):
        """Lay the vocabulary out as pad, punctuations, characters, blank."""
        self._vocab = [self._pad, *self._punctuations, *self._characters, self._blank]
        forward, backward = {}, {}
        for position, symbol in enumerate(self.vocab):
            forward[symbol] = position
            backward[position] = symbol
        self._char_to_id = forward
        self._id_to_char = backward

    @staticmethod
    def init_from_config(config):
        """Build a ``VitsCharacters`` from ``config.characters`` and return it with ``config``."""
        section = config.characters
        pad_token = section.pad
        punctuation_set = section.punctuations
        letters = section.characters
        ipa_letters = section.phonemes
        built = VitsCharacters(
            graphemes=letters,
            ipa_characters=ipa_letters,
            punctuations=punctuation_set,
            pad=pad_token,
        )
        return built, config

    def to_config(self) -> "CharactersConfig":
        """Export the current settings as a ``CharactersConfig`` (eos and bos are always None)."""
        settings = {
            "characters": self._characters,
            "punctuations": self._punctuations,
            "pad": self._pad,
            "eos": None,
            "bos": None,
            "blank": self._blank,
            "is_unique": False,
            "is_sorted": True,
        }
        return CharactersConfig(**settings)
688
+
689
class TTSTokenizer:
    """Turn raw text into the sequence of token IDs built from a character set.

    Args:
        text_cleaner (Callable): Applied to the text before encoding; skipped when ``None``.
        characters (BaseCharacters): Character set providing ``char_to_id`` and the
            pad/blank tokens. May be ``None`` and assigned later.
    """

    def __init__(
        self,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
    ):
        self.text_cleaner = text_cleaner
        self.characters = characters
        self.not_found_characters = []

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        """Store the character set and cache its pad/blank IDs (``None`` when unavailable)."""
        self._characters = new_characters
        if new_characters is None:
            # The constructor default is None; do not dereference it.
            self.pad_id = None
            self.blank_id = None
            return
        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None

    def encode(self, text: str) -> List[int]:
        """Encodes a string of text as a sequence of IDs."""
        token_ids = []
        for char in text:
            try:
                idx = self.characters.char_to_id(char)
                token_ids.append(idx)
            except KeyError:
                # discard but store not found characters
                if char not in self.not_found_characters:
                    self.not_found_characters.append(char)
                    print(text)
                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
        return token_ids

    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
        """Clean ``text``, encode it and intersperse the blank ID between tokens."""
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
        text = self.encode(text)
        text = self.intersperse_blank_char(text, True)
        return text

    def pad_with_bos_eos(self, char_sequence: List[str]):
        """Pads a sequence with the special BOS and EOS characters."""
        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]

    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
        """Intersperses the blank character between characters in a sequence.

        Use the ```blank``` character if defined else use the ```pad``` character.
        """
        # Use pad_id, not the pad *string*: the result must be a list of IDs only.
        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad_id
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    @staticmethod
    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
        """Build a tokenizer from ``config``.

        Always uses ``multilingual_cleaners`` and ``VitsCharacters``; the ``characters``
        argument is accepted but not used.

        Returns:
            Tuple ``(tokenizer, config)``; ``config.characters.characters_class`` is set
            to the ``VitsCharacters`` class.
        """
        text_cleaner = multilingual_cleaners
        CharactersClass = VitsCharacters
        characters, new_config = CharactersClass.init_from_config(config)
        # new_config.characters.characters_class = get_import_path(characters)
        new_config.characters.characters_class = VitsCharacters
        return (TTSTokenizer(text_cleaner, characters), new_config)
753
+
754
+
755
def multilingual_cleaners(text):
    """Pipeline for multilingual text"""
    # Steps run in this order: lowercase, symbol replacement (no language-specific
    # rules), auxiliary-symbol removal, whitespace collapsing.
    lowered = lowercase(text)
    normalized = replace_symbols(lowered, lang=None)
    stripped = remove_aux_symbols(normalized)
    return collapse_whitespace(stripped)
762
+
763
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
765
+
766
def collapse_whitespace(text):
    """Replace each match of the module's whitespace pattern with one space, then strip both ends."""
    squeezed = re.sub(_whitespace_re, " ", text)
    trimmed = squeezed.strip()
    return trimmed
768
+
769
def replace_symbols(text, lang="en"):
    """Normalise separators and spell out ``&`` for the given language.

    ``;`` and ``:`` become ``,``. ``-`` becomes a space, or is removed for ``"ca"``.
    ``&`` is replaced by the word for "and" in ``en``/``fr``/``pt``/``ca`` and left
    alone for any other ``lang``. For ``"ca"`` apostrophes are removed as well.
    """
    ampersand_words = {"en": " and ", "fr": " et ", "pt": " e ", "ca": " i "}
    is_catalan = lang == "ca"

    text = text.replace(";", ",")
    text = text.replace("-", "" if is_catalan else " ")
    text = text.replace(":", ",")
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])
    if is_catalan:
        text = text.replace("'", "")
    return text
784
+
785
def remove_aux_symbols(text):
    """Strip angle brackets, parentheses, square brackets and double quotes from ``text``."""
    aux_pattern = r"[\<\>\(\)\[\]\"]+"
    cleaned = re.sub(aux_pattern, "", text)
    return cleaned
en_male/jit_infer.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
import torch
import numpy as np
import soundfile as sf

# English male voice: synthesise one sentence with the TorchScript model and
# write the audio to jit.wav.

# chars.txt holds the model's character set on a single line. Read it as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
with open("chars.txt", 'r', encoding="utf-8") as f:
    letters = f.read().strip('\n')
model="en_male_vits_30hrs.pt"
text = "This is a text to b spoken"

config = VitsConfig(
    text_cleaner="multilingual_cleaners",
    characters=CharactersConfig(
        characters_class=VitsCharacters,
        pad="<PAD>",
        eos="<EOS>",
        bos="<BOS>",
        blank="<BLNK>",
        characters=letters,
        punctuations="!¡'(),-.:;¿? ",
        phonemes=None)
    )
tokenizer, config = TTSTokenizer.init_from_config(config)

x = tokenizer.text_to_ids(text)
# Force int64 so the ID tensor has the same dtype on every platform.
x = torch.from_numpy(np.array(x, dtype=np.int64)).unsqueeze(0)
net = torch.jit.load(model)
with torch.no_grad():
    out2 = net(x)
# NOTE(review): 22050 Hz is assumed to be the model's output rate — confirm against the checkpoint.
sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)
gu_mms/config.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "architectures": [
4
+ "VitsModel"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "depth_separable_channels": 2,
8
+ "depth_separable_num_layers": 3,
9
+ "duration_predictor_dropout": 0.5,
10
+ "duration_predictor_filter_channels": 256,
11
+ "duration_predictor_flow_bins": 10,
12
+ "duration_predictor_kernel_size": 3,
13
+ "duration_predictor_num_flows": 4,
14
+ "duration_predictor_tail_bound": 5.0,
15
+ "ffn_dim": 768,
16
+ "ffn_kernel_size": 3,
17
+ "flow_size": 192,
18
+ "hidden_act": "relu",
19
+ "hidden_dropout": 0.1,
20
+ "hidden_size": 192,
21
+ "initializer_range": 0.02,
22
+ "layer_norm_eps": 1e-05,
23
+ "layerdrop": 0.1,
24
+ "leaky_relu_slope": 0.1,
25
+ "model_type": "vits",
26
+ "noise_scale": 0.667,
27
+ "noise_scale_duration": 0.8,
28
+ "num_attention_heads": 2,
29
+ "num_hidden_layers": 6,
30
+ "num_speakers": 1,
31
+ "posterior_encoder_num_wavenet_layers": 16,
32
+ "prior_encoder_num_flows": 4,
33
+ "prior_encoder_num_wavenet_layers": 4,
34
+ "resblock_dilation_sizes": [
35
+ [
36
+ 1,
37
+ 3,
38
+ 5
39
+ ],
40
+ [
41
+ 1,
42
+ 3,
43
+ 5
44
+ ],
45
+ [
46
+ 1,
47
+ 3,
48
+ 5
49
+ ]
50
+ ],
51
+ "resblock_kernel_sizes": [
52
+ 3,
53
+ 7,
54
+ 11
55
+ ],
56
+ "sampling_rate": 16000,
57
+ "speaker_embedding_size": 0,
58
+ "speaking_rate": 1.0,
59
+ "spectrogram_bins": 513,
60
+ "torch_dtype": "float32",
61
+ "transformers_version": "4.33.0.dev0",
62
+ "upsample_initial_channel": 512,
63
+ "upsample_kernel_sizes": [
64
+ 16,
65
+ 16,
66
+ 4,
67
+ 4
68
+ ],
69
+ "upsample_rates": [
70
+ 8,
71
+ 8,
72
+ 2,
73
+ 2
74
+ ],
75
+ "use_bias": true,
76
+ "use_stochastic_duration_prediction": true,
77
+ "vocab_size": 60,
78
+ "wavenet_dilation_rate": 1,
79
+ "wavenet_dropout": 0.0,
80
+ "wavenet_kernel_size": 5,
81
+ "window_size": 4
82
+ }
gu_mms/special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "pad_token": "|",
3
+ "unk_token": "<unk>"
4
+ }
gu_mms/tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_blank": true,
3
+ "clean_up_tokenization_spaces": true,
4
+ "is_uroman": false,
5
+ "language": "guj",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "normalize": true,
8
+ "pad_token": "|",
9
+ "phonemize": false,
10
+ "tokenizer_class": "VitsTokenizer",
11
+ "unk_token": "<unk>"
12
+ }
gu_mms/vocab.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " ": 59,
3
+ "'": 47,
4
+ "-": 56,
5
+ "|": 0,
6
+ "ં": 10,
7
+ "ઃ": 54,
8
+ "અ": 28,
9
+ "આ": 26,
10
+ "ઇ": 49,
11
+ "ઈ": 30,
12
+ "ઉ": 42,
13
+ "ઊ": 48,
14
+ "ઋ": 57,
15
+ "એ": 29,
16
+ "ઐ": 58,
17
+ "ઓ": 27,
18
+ "ક": 9,
19
+ "ખ": 33,
20
+ "ગ": 32,
21
+ "ઘ": 44,
22
+ "ચ": 39,
23
+ "છ": 23,
24
+ "જ": 18,
25
+ "ઝ": 51,
26
+ "ઞ": 50,
27
+ "ટ": 36,
28
+ "ઠ": 45,
29
+ "ડ": 40,
30
+ "ઢ": 52,
31
+ "ણ": 22,
32
+ "ત": 3,
33
+ "થ": 19,
34
+ "દ": 25,
35
+ "ધ": 34,
36
+ "ન": 4,
37
+ "પ": 12,
38
+ "ફ": 43,
39
+ "બ": 31,
40
+ "ભ": 35,
41
+ "મ": 7,
42
+ "ય": 16,
43
+ "ર": 5,
44
+ "લ": 24,
45
+ "ળ": 37,
46
+ "વ": 13,
47
+ "શ": 21,
48
+ "ષ": 41,
49
+ "સ": 15,
50
+ "હ": 17,
51
+ "ા": 1,
52
+ "િ": 20,
53
+ "ી": 8,
54
+ "ુ": 14,
55
+ "ૂ": 38,
56
+ "ૃ": 46,
57
+ "ે": 2,
58
+ "ૈ": 53,
59
+ "ો": 11,
60
+ "ૌ": 55,
61
+ "્": 6
62
+ }
hi_female/chars.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ शदऊतसओषमऱढै?ख़ौक़ड़ःिअनठय़ज़फ़्खँे।ंऋउ'हछङझ" ुणऔघयञृएईॆीपचॉॠवगडटइ,बॅूऐफकजलग़आधोथाभढ़ऑ
hi_female/extra.py ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, List, Union
2
+ from dataclasses import asdict, dataclass, field
3
+
4
+
5
+ import re
6
+ from dataclasses import replace
7
+ from typing import Dict
8
+ _whitespace_re = re.compile(r"\s+")
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import List
12
+
13
+ # from TTS.tts.configs.shared_configs import BaseTTSConfig
14
+ # from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
15
+
16
@dataclass
class CharactersConfig():
    """Plain container describing a character set: special tokens, symbols and build flags.

    All fields are optional defaults; the class performs no validation.
    """

    # NOTE(review): annotated ``str``; confirm whether callers store a class object here instead.
    characters_class: str = None

    # using BaseVocabulary
    vocab_dict: Dict = None

    # using on BaseCharacters
    pad: str = None
    eos: str = None
    bos: str = None
    blank: str = None
    characters: str = None
    punctuations: str = None
    phonemes: str = None
    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
    is_sorted: bool = True
34
+
35
+
36
@dataclass
class BaseTTSConfig():
    """Base set of TTS configuration defaults shared by model configs.

    A plain dataclass: the fields below only declare defaults, grouped by the
    section comments; nothing here validates or consumes them.
    """

    # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    # phoneme settings
    use_phonemes: bool = False
    phonemizer: str = None
    phoneme_language: str = None
    compute_input_seq_cache: bool = False
    text_cleaner: str = None
    enable_eos_bos_chars: bool = False
    test_sentences_file: str = ""
    phoneme_cache_path: str = None
    # vocabulary parameters
    characters: CharactersConfig = None
    add_blank: bool = False
    # training params
    batch_group_size: int = 0
    loss_masking: bool = None
    # dataloading
    min_audio_len: int = 1
    # NOTE: annotated ``int`` but the default is the float infinity.
    max_audio_len: int = float("inf")
    min_text_len: int = 1
    # NOTE: annotated ``int`` but the default is the float infinity.
    max_text_len: int = float("inf")
    compute_f0: bool = False
    compute_energy: bool = False
    compute_linear_spec: bool = False
    precompute_num_workers: int = 0
    use_noise_augment: bool = False
    start_by_longest: bool = False
    shuffle: bool = False
    drop_last: bool = False
    # dataset
    datasets: str = None
    # optimizer
    optimizer: str = "radam"
    optimizer_params: dict = None
    # scheduler
    lr_scheduler: str = None
    lr_scheduler_params: dict = field(default_factory=lambda: {})
    # testing
    test_sentences: List[str] = field(default_factory=lambda: [])
    # evaluation
    eval_split_max_size: int = None
    eval_split_size: float = 0.01
    # weighted samplers
    use_speaker_weighted_sampler: bool = False
    speaker_weighted_sampler_alpha: float = 1.0
    use_language_weighted_sampler: bool = False
    language_weighted_sampler_alpha: float = 1.0
    use_length_weighted_sampler: bool = False
    length_weighted_sampler_alpha: float = 1.0
88
+
89
+
90
@dataclass
class VitsAudioConfig():
    """Default audio and spectrogram parameters for VITS (plain dataclass, no validation)."""

    fft_size: int = 1024
    sample_rate: int = 22050
    win_length: int = 1024
    hop_length: int = 256
    num_mels: int = 80
    mel_fmin: int = 0
    # NOTE: annotated ``int`` but defaults to None.
    mel_fmax: int = None
99
+
100
@dataclass
class VitsArgs():
    """Default arguments for the VITS network (plain dataclass, no validation).

    Several fields are annotated with a concrete type but default to ``None``
    (``max_inference_len``, ``speakers_file``, ``d_vector_file``,
    ``language_ids_file``, ``encoder_sample_rate``).
    """

    num_chars: int = 100
    out_channels: int = 513
    spec_segment_size: int = 32
    hidden_channels: int = 192
    hidden_channels_ffn_text_encoder: int = 768
    num_heads_text_encoder: int = 2
    num_layers_text_encoder: int = 6
    kernel_size_text_encoder: int = 3
    dropout_p_text_encoder: float = 0.1
    dropout_p_duration_predictor: float = 0.5
    kernel_size_posterior_encoder: int = 5
    dilation_rate_posterior_encoder: int = 1
    num_layers_posterior_encoder: int = 16
    kernel_size_flow: int = 5
    dilation_rate_flow: int = 1
    num_layers_flow: int = 4
    resblock_type_decoder: str = "1"
    resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
    resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
    upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
    upsample_initial_channel_decoder: int = 512
    upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
    periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
    use_sdp: bool = True
    noise_scale: float = 1.0
    inference_noise_scale: float = 0.667
    length_scale: float = 1
    noise_scale_dp: float = 1.0
    inference_noise_scale_dp: float = 1.0
    max_inference_len: int = None
    init_discriminator: bool = True
    # NOTE: "disriminator" is misspelled; the field name is kept as is because renaming changes the interface.
    use_spectral_norm_disriminator: bool = False
    use_speaker_embedding: bool = False
    num_speakers: int = 0
    speakers_file: str = None
    d_vector_file: List[str] = None
    speaker_embedding_channels: int = 256
    use_d_vector_file: bool = False
    d_vector_dim: int = 0
    detach_dp_input: bool = True
    use_language_embedding: bool = False
    embedded_language_dim: int = 4
    num_languages: int = 0
    language_ids_file: str = None
    use_speaker_encoder_as_loss: bool = False
    speaker_encoder_config_path: str = ""
    speaker_encoder_model_path: str = ""
    condition_dp_on_speaker: bool = True
    freeze_encoder: bool = False
    freeze_DP: bool = False
    freeze_PE: bool = False
    freeze_flow_decoder: bool = False
    freeze_waveform_decoder: bool = False
    encoder_sample_rate: int = None
    interpolate_z: bool = True
    reinit_DP: bool = False
    reinit_text_encoder: bool = False
159
@dataclass
class VitsConfig(BaseTTSConfig):
    """Configuration for the VITS model.

    Adds the VITS model arguments, audio settings, generator/discriminator
    optimizer and scheduler settings, loss weights and multi-speaker options
    on top of the fields inherited from ``BaseTTSConfig``.
    """

    model: str = "vits"

    # --- model specific params ---
    model_args: VitsArgs = field(default_factory=VitsArgs)
    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)

    # --- optimizer (separate generator / discriminator settings) ---
    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
    lr_gen: float = 0.0002
    lr_disc: float = 0.0002
    lr_scheduler_gen: str = "ExponentialLR"
    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    lr_scheduler_disc: str = "ExponentialLR"
    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    scheduler_after_epoch: bool = True
    optimizer: str = "AdamW"
    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})

    # --- loss weights ---
    kl_loss_alpha: float = 1.0
    disc_loss_alpha: float = 1.0
    gen_loss_alpha: float = 1.0
    feat_loss_alpha: float = 1.0
    mel_loss_alpha: float = 45.0
    dur_loss_alpha: float = 1.0
    speaker_encoder_loss_alpha: float = 1.0

    # --- data loader params ---
    return_wav: bool = True
    compute_linear_spec: bool = True

    # --- sampler params ---
    use_weighted_sampler: bool = False  # TODO: move it to the base config
    weighted_sampler_attrs: dict = field(default_factory=dict)
    weighted_sampler_multipliers: dict = field(default_factory=dict)

    # --- overrides of base-config fields ---
    r: int = 1  # DO NOT CHANGE
    add_blank: bool = True

    # --- testing: each entry is a one-element list holding the sentence ---
    test_sentences: List[List] = field(
        default_factory=lambda: [
            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
            ["Be a voice, not an echo."],
            ["I'm sorry Dave. I'm afraid I can't do that."],
            ["This cake is great. It's so delicious and moist."],
            ["Prior to November 22, 1963."],
        ]
    )

    # --- multi-speaker settings: speaker embedding layer ---
    num_speakers: int = 0
    use_speaker_embedding: bool = False
    speakers_file: str = None
    speaker_embedding_channels: int = 256
    language_ids_file: str = None
    use_language_embedding: bool = False

    # --- multi-speaker settings: d-vectors ---
    use_d_vector_file: bool = False
    d_vector_file: List[str] = None
    d_vector_dim: int = None

    def __post_init__(self):
        # Deliberately a no-op. The propagation of ``model_args`` values onto
        # same-named config attributes is disabled in this trimmed copy:
        #     for key, val in self.model_args.items():
        #         if hasattr(self, key):
        #             self[key] = val
        return None
231
+
232
+
233
+
234
+
235
+
236
def parse_symbols():
    """Return the default symbol inventory as a dict keyed by role.

    Keys are ``pad``, ``eos``, ``bos``, ``characters``, ``punctuations`` and
    ``phonemes``; values are the module-level defaults defined below.
    """
    return dict(
        pad=_pad,
        eos=_eos,
        bos=_bos,
        characters=_characters,
        punctuations=_punctuations,
        phonemes=_phonemes,
    )


# DEFAULT SET OF GRAPHEMES
_pad = "<PAD>"
_eos = "<EOS>"
_bos = "<BOS>"
_blank = "<BLNK>"  # TODO: check if we need this alongside with PAD
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_punctuations = "!'(),-.:;? "


# DEFAULT SET OF IPA PHONEMES
# Phonemes definition (All IPA characters)
_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
_suprasegmentals = "ˈˌːˑ"
_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
_diacrilics = "ɚ˞ɫ"
# The full phoneme inventory is the concatenation of the groups above, in this order.
_phonemes = "".join(
    (
        _vowels,
        _non_pulmonic_consonants,
        _pulmonic_consonants,
        _suprasegmentals,
        _other_symbols,
        _diacrilics,
    )
)
265
+
266
+
267
class BaseVocabulary:
    """Base Vocabulary class.

    This class only needs a vocabulary dictionary without specifying the characters.

    Token ids are assigned by enumerating ``vocab`` in iteration order (for a
    dict: key insertion order); the dict's own values are not consulted.

    Args:
        vocab (Dict): A dictionary of characters and their corresponding indices.
        pad (str): Padding symbol, or None if unused.
        blank (str): Blank symbol, or None if unused.
        bos (str): Beginning-of-sequence symbol, or None if unused.
        eos (str): End-of-sequence symbol, or None if unused.
    """

    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
        self.vocab = vocab  # goes through the property setter, which builds the lookup tables
        self.pad = pad
        self.blank = blank
        self.bos = bos
        self.eos = eos

    @property
    def pad_id(self) -> int:
        """Return the index of the padding character. If the padding character is not specified, return the length
        of the vocabulary."""
        return self.char_to_id(self.pad) if self.pad else self.num_chars

    @property
    def blank_id(self) -> int:
        """Return the index of the blank character. If the blank character is not specified, return the length of
        the vocabulary."""
        return self.char_to_id(self.blank) if self.blank else self.num_chars

    @property
    def bos_id(self) -> int:
        """Return the index of the bos character. If the bos character is not specified, return the length of the
        vocabulary."""
        return self.char_to_id(self.bos) if self.bos else self.num_chars

    @property
    def eos_id(self) -> int:
        """Return the index of the eos character. If the eos character is not specified, return the length of the
        vocabulary."""
        return self.char_to_id(self.eos) if self.eos else self.num_chars

    @property
    def vocab(self):
        """Return the vocabulary dictionary."""
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Set the vocabulary dictionary and character mapping dictionaries."""
        self._vocab, self._char_to_id, self._id_to_char = None, None, None
        if vocab is not None:
            self._vocab = vocab
            self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
            self._id_to_char = dict(enumerate(self._vocab))

    @staticmethod
    def init_from_config(config, **kwargs):
        """Initialize from the given config.

        If ``config.characters`` carries a non-empty ``vocab_dict`` the
        vocabulary is built from it; otherwise it is built from ``kwargs``.

        Returns:
            Tuple of (BaseVocabulary, config); the config is returned unchanged.
        """
        # Attribute access (instead of `"vocab_dict" in config.characters`) so that
        # plain dataclass configs, which do not support `in`, work as well.
        chars_cfg = getattr(config, "characters", None)
        vocab_dict = getattr(chars_cfg, "vocab_dict", None) if chars_cfg is not None else None
        if vocab_dict:
            return (
                BaseVocabulary(
                    vocab_dict,
                    chars_cfg.pad,
                    chars_cfg.blank,
                    chars_cfg.bos,
                    chars_cfg.eos,
                ),
                config,
            )
        return BaseVocabulary(**kwargs), config

    def to_config(self):
        """Export this vocabulary as a ``CharactersConfig``."""
        return CharactersConfig(
            vocab_dict=self._vocab,
            pad=self.pad,
            eos=self.eos,
            bos=self.bos,
            blank=self.blank,
            is_unique=False,
            is_sorted=False,
        )

    @property
    def num_chars(self):
        """Return number of tokens in the vocabulary (0 when no vocabulary is set)."""
        return len(self._vocab) if self._vocab is not None else 0

    def char_to_id(self, char: str) -> int:
        """Map a character to a token ID.

        Raises:
            KeyError: if ``char`` is not in the vocabulary.
        """
        try:
            return self._char_to_id[char]
        except KeyError as e:
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        """Map a token ID to a character."""
        return self._id_to_char[idx]
365
+
366
+
367
class BaseCharacters:
    """Character set that maps symbols to integer token ids.

    The vocabulary is laid out as the special symbols that are set, in the order
    pad, eos, bos, blank, followed by the characters and then the punctuations.
    It is rebuilt whenever one of those attributes is reassigned.

    Args:
        characters (str): Main set of characters. None is treated as empty.
        punctuations (str): Punctuation characters, appended after ``characters``. None is treated as empty.
        pad (str): Padding symbol; omitted from the vocabulary when None or empty.
        eos (str): End-of-sequence symbol; omitted when None or empty.
        bos (str): Beginning-of-sequence symbol; omitted when None or empty.
        blank (str): Blank symbol; omitted when None or empty.
        is_unique (bool): Drop duplicate characters and assert the final vocabulary has no duplicates.
        is_sorted (bool): Sort ``characters`` before building the vocabulary.
    """

    # Constructor arguments that can be read from a characters config.
    _CONFIG_FIELDS = ("characters", "punctuations", "pad", "eos", "bos", "blank", "is_unique", "is_sorted")

    def __init__(
        self,
        characters: str = None,
        punctuations: str = None,
        pad: str = None,
        eos: str = None,
        bos: str = None,
        blank: str = None,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        self._characters = characters
        self._punctuations = punctuations
        self._pad = pad
        self._eos = eos
        self._bos = bos
        self._blank = blank
        self.is_unique = is_unique
        self.is_sorted = is_sorted
        self._create_vocab()

    # For each special symbol, the id falls back to ``len(vocab)`` when the symbol is not set.
    @property
    def pad_id(self) -> int:
        return self.char_to_id(self.pad) if self.pad else len(self.vocab)

    @property
    def blank_id(self) -> int:
        return self.char_to_id(self.blank) if self.blank else len(self.vocab)

    @property
    def eos_id(self) -> int:
        return self.char_to_id(self.eos) if self.eos else len(self.vocab)

    @property
    def bos_id(self) -> int:
        return self.char_to_id(self.bos) if self.bos else len(self.vocab)

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, characters):
        self._characters = characters
        self._create_vocab()

    @property
    def punctuations(self):
        return self._punctuations

    @punctuations.setter
    def punctuations(self, punctuations):
        self._punctuations = punctuations
        self._create_vocab()

    @property
    def pad(self):
        return self._pad

    @pad.setter
    def pad(self, pad):
        self._pad = pad
        self._create_vocab()

    @property
    def eos(self):
        return self._eos

    @eos.setter
    def eos(self, eos):
        self._eos = eos
        self._create_vocab()

    @property
    def bos(self):
        return self._bos

    @bos.setter
    def bos(self, bos):
        self._bos = bos
        self._create_vocab()

    @property
    def blank(self):
        return self._blank

    @blank.setter
    def blank(self, blank):
        self._blank = blank
        self._create_vocab()

    @property
    def vocab(self):
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Store the vocabulary list and rebuild both lookup tables from it."""
        self._vocab = vocab
        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self._id_to_char = dict(enumerate(self.vocab))

    @property
    def num_chars(self):
        return len(self._vocab)

    def _create_vocab(self):
        """Rebuild the vocabulary from the current characters, punctuations and special symbols."""
        # None means "no characters": the constructor defaults must not crash.
        symbols = list(self._characters or "")
        if self.is_unique:
            # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
            # ids are reproducible across runs even when is_sorted is False
            # (set() iteration order for strings depends on hash randomisation).
            symbols = list(dict.fromkeys(symbols))
        if self.is_sorted:
            symbols = sorted(symbols)
        # Each special symbol is prepended in turn, which yields the final
        # order pad, eos, bos, blank at the front of the vocabulary.
        for special in (self._blank, self._bos, self._eos, self._pad):
            if special is not None and len(special) > 0:
                symbols = [special] + symbols
        self.vocab = symbols + list(self._punctuations or "")
        if self.is_unique:
            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
            assert (
                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
            ), f" [!] There are duplicate characters in the character set. {duplicates}"

    def char_to_id(self, char: str) -> int:
        """Map a character to its token id; raises KeyError for unknown characters."""
        try:
            return self._char_to_id[char]
        except KeyError as e:
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        """Map a token id back to its character."""
        return self._id_to_char[idx]

    def print_log(self, level: int = 0):
        """
        Prints the vocabulary in a nice format.
        """
        indent = "\t" * level
        print(f"{indent}| > Characters: {self._characters}")
        print(f"{indent}| > Punctuations: {self._punctuations}")
        print(f"{indent}| > Pad: {self._pad}")
        print(f"{indent}| > EOS: {self._eos}")
        print(f"{indent}| > BOS: {self._bos}")
        print(f"{indent}| > Blank: {self._blank}")
        print(f"{indent}| > Vocab: {self.vocab}")
        print(f"{indent}| > Num chars: {self.num_chars}")

    @staticmethod
    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
        """Init your character class from a config.

        Implement this method for your subclass.

        Returns:
            Tuple of (characters, config). When ``config.characters`` is None a
            default character set is created and a copy of the config carrying
            its ``to_config()`` is returned instead of the original.
        """
        chars_cfg = config.characters
        # use character set from config
        if chars_cfg is not None:
            # Pick only the constructor arguments: the config also carries fields
            # (characters_class, vocab_dict, phonemes) that __init__ does not accept,
            # and a dataclass config cannot be **-unpacked at all.
            if isinstance(chars_cfg, dict):
                kwargs = {name: chars_cfg[name] for name in BaseCharacters._CONFIG_FIELDS if name in chars_cfg}
            else:
                kwargs = {
                    name: getattr(chars_cfg, name)
                    for name in BaseCharacters._CONFIG_FIELDS
                    if hasattr(chars_cfg, name)
                }
            return BaseCharacters(**kwargs), config
        # return default character set
        characters = BaseCharacters()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config

    def to_config(self) -> "CharactersConfig":
        """Export this character set as a ``CharactersConfig``."""
        return CharactersConfig(
            characters=self._characters,
            punctuations=self._punctuations,
            pad=self._pad,
            eos=self._eos,
            bos=self._bos,
            blank=self._blank,
            is_unique=self.is_unique,
            is_sorted=self.is_sorted,
        )
543
+
544
+
545
class IPAPhonemes(BaseCharacters):
    """Character set whose default characters are the module's IPA phoneme inventory.

    Arguments are forwarded unchanged to ``BaseCharacters``; only the defaults differ.
    """

    def __init__(
        self,
        characters: str = _phonemes,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Init a IPAPhonemes object from a model config

        If characters are not defined in the config, it will be set to the default characters and the config
        will be updated.

        Returns:
            Tuple of (IPAPhonemes, config).
        """
        # Fields are read through attribute access rather than `in` / `[...]` so
        # that plain dataclass configs (which support neither) work too.
        chars_cfg = getattr(config, "characters", None)
        if chars_cfg is not None:
            # band-aid for compatibility with old models: when a phoneme set is
            # given it replaces the `characters` field of the config in place.
            if getattr(chars_cfg, "phonemes", None) is not None:
                chars_cfg.characters = chars_cfg.phonemes
            return (
                IPAPhonemes(
                    characters=chars_cfg.characters,
                    punctuations=chars_cfg.punctuations,
                    pad=chars_cfg.pad,
                    eos=chars_cfg.eos,
                    bos=chars_cfg.bos,
                    blank=chars_cfg.blank,
                    is_unique=chars_cfg.is_unique,
                    is_sorted=chars_cfg.is_sorted,
                ),
                config,
            )
        # return default character set
        characters = IPAPhonemes()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config
592
+
593
+
594
class Graphemes(BaseCharacters):
    """Character set whose default characters are the module's default graphemes.

    Arguments are forwarded unchanged to ``BaseCharacters``; only the defaults differ.
    """

    def __init__(
        self,
        characters: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Init a Graphemes object from a model config

        If characters are not defined in the config, it will be set to the default characters and the config
        will be updated.

        Returns:
            Tuple of (Graphemes, config).
        """
        chars_cfg = config.characters
        if chars_cfg is not None:
            # Pass only the constructor arguments the config actually provides.
            # Attribute access keeps plain dataclass configs working (they support
            # neither `in`, `[...]` nor `**`), and skipping extra fields such as
            # `phonemes` avoids handing __init__ arguments it does not accept.
            wanted = ("characters", "punctuations", "pad", "eos", "bos", "blank", "is_unique", "is_sorted")
            kwargs = {name: getattr(chars_cfg, name) for name in wanted if hasattr(chars_cfg, name)}
            return Graphemes(**kwargs), config
        # return default character set
        characters = Graphemes()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config
637
+
638
+
639
if __name__ == "__main__":
    # Smoke check when run as a script: build both default character sets,
    # then print each one's vocabulary.
    for charset in (Graphemes(), IPAPhonemes()):
        charset.print_log()
644
+
645
+
646
class VitsCharacters(BaseCharacters):
    """Characters class for VITs model for compatibility with pre-trained models

    The vocabulary layout differs from ``BaseCharacters``: pad first, then the
    punctuations, then the characters, and the blank symbol last.
    """

    def __init__(
        self,
        graphemes: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        ipa_characters: str = _phonemes,
    ) -> None:
        # IPA characters, when given, are appended after the graphemes.
        symbols = graphemes if ipa_characters is None else graphemes + ipa_characters
        super().__init__(symbols, punctuations, pad, None, None, "<BLNK>", is_unique=False, is_sorted=True)

    def _create_vocab(self):
        """Build the vocabulary list and both lookup tables (no sorting, no de-duplication)."""
        layout = [self._pad, *self._punctuations, *self._characters, self._blank]
        self._vocab = layout
        # A symbol that occurs more than once keeps the id of its last occurrence.
        self._char_to_id = {symbol: position for position, symbol in enumerate(layout)}
        self._id_to_char = dict(enumerate(layout))

    @staticmethod
    def init_from_config(config):
        """Build a ``VitsCharacters`` from ``config.characters``; returns (characters, config)."""
        source = config.characters
        charset = VitsCharacters(
            graphemes=source.characters,
            ipa_characters=source.phonemes,
            punctuations=source.punctuations,
            pad=source.pad,
        )
        return charset, config

    def to_config(self) -> "CharactersConfig":
        """Export as a ``CharactersConfig`` (eos/bos are always None for this class)."""
        return CharactersConfig(
            characters=self._characters,
            punctuations=self._punctuations,
            pad=self._pad,
            eos=None,
            bos=None,
            blank=self._blank,
            is_unique=False,
            is_sorted=True,
        )
688
+
689
class TTSTokenizer:
    """Convert raw text into the sequence of token ids fed to the model.

    Args:
        text_cleaner (Callable): Function applied to the text before encoding. Skipped when None.
        characters (BaseCharacters): Character set providing ``char_to_id`` and the special-symbol ids.
    """

    def __init__(
        self,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
    ):
        self.text_cleaner = text_cleaner
        self.characters = characters
        # Characters seen by `encode` that are missing from the vocabulary (each stored once).
        self.not_found_characters = []

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        """Swap the character set and refresh the cached pad/blank ids (None when unset)."""
        self._characters = new_characters
        if new_characters is None:
            # The constructor default is None; do not crash before a character set is assigned.
            self.pad_id = None
            self.blank_id = None
            return
        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None

    def encode(self, text: str) -> List[int]:
        """Encodes a string of text as a sequence of IDs.

        Characters missing from the vocabulary are dropped; the first time each
        one is seen it is recorded in ``not_found_characters`` and reported.
        """
        token_ids = []
        for char in text:
            try:
                idx = self.characters.char_to_id(char)
                token_ids.append(idx)
            except KeyError:
                # discard but store not found characters
                if char not in self.not_found_characters:
                    self.not_found_characters.append(char)
                    print(text)
                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
        return token_ids

    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
        """Clean ``text``, encode it, and intersperse the blank id between the tokens."""
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
        text = self.encode(text)
        text = self.intersperse_blank_char(text, True)
        return text

    def pad_with_bos_eos(self, char_sequence: List[str]):
        """Pads a sequence with the special BOS and EOS characters."""
        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]

    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
        """Intersperses the blank character between characters in a sequence.

        Use the ```blank``` character if defined else use the ```pad``` character.

        The result has ``2 * len(char_sequence) + 1`` entries: the filler id at
        every even position and the original ids at the odd positions.
        """
        # `char_sequence` holds ids, so the filler must be an id as well
        # (pad_id, not the pad string).
        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad_id
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    @staticmethod
    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
        """Build a tokenizer from ``config`` using ``multilingual_cleaners`` and ``VitsCharacters``.

        The ``characters`` argument is accepted for interface compatibility but
        is not used; the character set is always built from ``config``.

        Returns:
            Tuple of (TTSTokenizer, config) where ``config.characters.characters_class``
            has been set to ``VitsCharacters``.
        """
        text_cleaner = multilingual_cleaners
        CharactersClass = VitsCharacters
        characters, new_config = CharactersClass.init_from_config(config)
        # new_config.characters.characters_class = get_import_path(characters)
        new_config.characters.characters_class = VitsCharacters
        return TTSTokenizer(text_cleaner, characters), new_config
753
+
754
+
755
def multilingual_cleaners(text):
    """Pipeline for multilingual text

    Steps, in order: lowercase, symbol replacement (with ``lang=None``),
    auxiliary-symbol removal, whitespace collapsing.
    """
    normalized = replace_symbols(lowercase(text), lang=None)
    return collapse_whitespace(remove_aux_symbols(normalized))
762
+
763
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
765
+
766
def collapse_whitespace(text):
    """Replace every whitespace run with one space and strip both ends."""
    squeezed = _whitespace_re.sub(" ", text)
    return squeezed.strip()
768
+
769
def replace_symbols(text, lang="en"):
    """Normalise a few punctuation symbols in ``text``.

    ``;`` and ``:`` become ``,``. ``-`` becomes a space (removed for ``ca``).
    ``&`` is spelled out for ``en``, ``fr``, ``pt`` and ``ca`` and left alone
    for any other ``lang``. For ``ca`` apostrophes are removed as well.
    """
    ampersand_words = {"en": " and ", "fr": " et ", "pt": " e ", "ca": " i "}
    is_catalan = lang == "ca"

    text = text.replace(";", ",")
    text = text.replace("-", "" if is_catalan else " ")
    text = text.replace(":", ",")
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])
    if is_catalan:
        text = text.replace("'", "")
    return text
784
+
785
def remove_aux_symbols(text):
    """Strip angle brackets, parentheses, square brackets and double quotes from ``text``."""
    stripped = re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
    return stripped
hi_female/hi_female_vits_30hrs.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bcfb47f599b36e7cbfec27142604c366e538c17e89980a40519291f92a46327
3
+ size 333261446
hi_female/jit_infer.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
import torch
import numpy as np
import soundfile as sf  # imported up front so a missing dependency fails before inference runs

# Hindi female voice
# chars.txt holds the character set passed to the tokenizer. It contains
# Devanagari text, so decode it as UTF-8 explicitly instead of relying on the
# platform's default encoding.
with open("chars.txt", "r", encoding="utf-8") as f:
    letters = f.read().strip('\n')
model = "hi_female_vits_30hrs.pt"
text = "फिल्म गर्दिश में अमरीश पुरी के साथ जैकी श्रॉफ, ऐश्वर्या, डिंपल कपाड़िया"

config = VitsConfig(
    text_cleaner="multilingual_cleaners",
    characters=CharactersConfig(
        characters_class=VitsCharacters,
        pad="<PAD>",
        eos="<EOS>",
        bos="<BOS>",
        blank="<BLNK>",
        characters=letters,
        punctuations="!¡'(),-.:;¿? ",
        phonemes=None,
    ),
)
tokenizer, config = TTSTokenizer.init_from_config(config)

# Text -> token ids -> tensor with a leading batch dimension of 1.
x = tokenizer.text_to_ids(text)
# Pin the dtype: NumPy's default integer width is platform-dependent.
# NOTE(review): int64 is what the original produced on Linux; confirm the
# exported model expects int64 ids.
x = torch.from_numpy(np.array(x, dtype=np.int64)).unsqueeze(0)
net = torch.jit.load(model)
with torch.no_grad():
    out2 = net(x)
# 22050 equals the default sample_rate in VitsAudioConfig.
sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)
hi_male/chars.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ शदऊतओसषमऱढै?ख़ौक़ड़ःिअनठय़ज़फ़्खँे।ंऋउ'हछङझ" ुणऔयघञृएईॆीपचॉॠवगडटइ,बॅूऐफजकलग़आधोथाभढ़ऑ
hi_male/extra.py ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, List, Union
2
+ from dataclasses import asdict, dataclass, field
3
+
4
+
5
+ import re
6
+ from dataclasses import replace
7
+ from typing import Dict
8
+ _whitespace_re = re.compile(r"\s+")
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import List
12
+
13
+ # from TTS.tts.configs.shared_configs import BaseTTSConfig
14
+ # from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
15
+
16
@dataclass
class CharactersConfig():
    """Settings describing the character set / vocabulary used by the tokenizer."""

    # Class used to build the character set.
    characters_class: str = None

    # Used by BaseVocabulary.
    vocab_dict: Dict = None

    # Used by BaseCharacters and its subclasses.
    pad: str = None
    eos: str = None
    bos: str = None
    blank: str = None
    characters: str = None
    punctuations: str = None
    phonemes: str = None
    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
    is_sorted: bool = True
34
+
35
+
36
@dataclass
class BaseTTSConfig():
    """Base configuration fields for TTS models in this file."""

    # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)

    # --- phoneme settings ---
    use_phonemes: bool = False
    phonemizer: str = None
    phoneme_language: str = None
    compute_input_seq_cache: bool = False
    text_cleaner: str = None
    enable_eos_bos_chars: bool = False
    test_sentences_file: str = ""
    phoneme_cache_path: str = None

    # --- vocabulary parameters ---
    characters: CharactersConfig = None
    add_blank: bool = False

    # --- training params ---
    batch_group_size: int = 0
    loss_masking: bool = None

    # --- dataloading ---
    min_audio_len: int = 1
    max_audio_len: int = float("inf")  # annotated int, but the default is float infinity
    min_text_len: int = 1
    max_text_len: int = float("inf")  # annotated int, but the default is float infinity
    compute_f0: bool = False
    compute_energy: bool = False
    compute_linear_spec: bool = False
    precompute_num_workers: int = 0
    use_noise_augment: bool = False
    start_by_longest: bool = False
    shuffle: bool = False
    drop_last: bool = False

    # --- dataset ---
    datasets: str = None

    # --- optimizer ---
    optimizer: str = "radam"
    optimizer_params: dict = None

    # --- scheduler ---
    lr_scheduler: str = None
    lr_scheduler_params: dict = field(default_factory=dict)

    # --- testing ---
    test_sentences: List[str] = field(default_factory=list)

    # --- evaluation ---
    eval_split_max_size: int = None
    eval_split_size: float = 0.01

    # --- weighted samplers ---
    use_speaker_weighted_sampler: bool = False
    speaker_weighted_sampler_alpha: float = 1.0
    use_language_weighted_sampler: bool = False
    language_weighted_sampler_alpha: float = 1.0
    use_length_weighted_sampler: bool = False
    length_weighted_sampler_alpha: float = 1.0
88
+
89
+
90
@dataclass
class VitsAudioConfig():
    """Audio / spectrogram settings used by the VITS configuration."""

    fft_size: int = 1024
    sample_rate: int = 22050
    win_length: int = 1024
    hop_length: int = 256
    num_mels: int = 80
    mel_fmin: int = 0
    mel_fmax: int = None  # no explicit upper mel frequency by default
99
+
100
@dataclass
class VitsArgs():
    """Model arguments (architecture and inference settings) for VITS."""

    # --- sizes ---
    num_chars: int = 100
    out_channels: int = 513
    spec_segment_size: int = 32
    hidden_channels: int = 192

    # --- text encoder ---
    hidden_channels_ffn_text_encoder: int = 768
    num_heads_text_encoder: int = 2
    num_layers_text_encoder: int = 6
    kernel_size_text_encoder: int = 3
    dropout_p_text_encoder: float = 0.1
    dropout_p_duration_predictor: float = 0.5

    # --- posterior encoder ---
    kernel_size_posterior_encoder: int = 5
    dilation_rate_posterior_encoder: int = 1
    num_layers_posterior_encoder: int = 16

    # --- flow ---
    kernel_size_flow: int = 5
    dilation_rate_flow: int = 1
    num_layers_flow: int = 4

    # --- waveform decoder ---
    resblock_type_decoder: str = "1"
    resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
    resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5] for _ in range(3)])
    upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
    upsample_initial_channel_decoder: int = 512
    upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])

    # --- discriminator / duration predictor / noise ---
    periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
    use_sdp: bool = True
    noise_scale: float = 1.0
    inference_noise_scale: float = 0.667
    length_scale: float = 1
    noise_scale_dp: float = 1.0
    inference_noise_scale_dp: float = 1.0
    max_inference_len: int = None
    init_discriminator: bool = True
    use_spectral_norm_disriminator: bool = False

    # --- speakers ---
    use_speaker_embedding: bool = False
    num_speakers: int = 0
    speakers_file: str = None
    d_vector_file: List[str] = None
    speaker_embedding_channels: int = 256
    use_d_vector_file: bool = False
    d_vector_dim: int = 0
    detach_dp_input: bool = True

    # --- languages ---
    use_language_embedding: bool = False
    embedded_language_dim: int = 4
    num_languages: int = 0
    language_ids_file: str = None

    # --- speaker encoder ---
    use_speaker_encoder_as_loss: bool = False
    speaker_encoder_config_path: str = ""
    speaker_encoder_model_path: str = ""
    condition_dp_on_speaker: bool = True

    # --- freezing / re-initialisation switches ---
    freeze_encoder: bool = False
    freeze_DP: bool = False
    freeze_PE: bool = False
    freeze_flow_decoder: bool = False
    freeze_waveform_decoder: bool = False
    encoder_sample_rate: int = None
    interpolate_z: bool = True
    reinit_DP: bool = False
    reinit_text_encoder: bool = False
159
@dataclass
class VitsConfig(BaseTTSConfig):
    """Configuration for the VITS model.

    Adds the VITS model arguments, audio settings, generator/discriminator
    optimizer and scheduler settings, loss weights and multi-speaker options
    on top of the fields inherited from ``BaseTTSConfig``.
    """

    model: str = "vits"

    # --- model specific params ---
    model_args: VitsArgs = field(default_factory=VitsArgs)
    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)

    # --- optimizer (separate generator / discriminator settings) ---
    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
    lr_gen: float = 0.0002
    lr_disc: float = 0.0002
    lr_scheduler_gen: str = "ExponentialLR"
    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    lr_scheduler_disc: str = "ExponentialLR"
    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    scheduler_after_epoch: bool = True
    optimizer: str = "AdamW"
    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})

    # --- loss weights ---
    kl_loss_alpha: float = 1.0
    disc_loss_alpha: float = 1.0
    gen_loss_alpha: float = 1.0
    feat_loss_alpha: float = 1.0
    mel_loss_alpha: float = 45.0
    dur_loss_alpha: float = 1.0
    speaker_encoder_loss_alpha: float = 1.0

    # --- data loader params ---
    return_wav: bool = True
    compute_linear_spec: bool = True

    # --- sampler params ---
    use_weighted_sampler: bool = False  # TODO: move it to the base config
    weighted_sampler_attrs: dict = field(default_factory=dict)
    weighted_sampler_multipliers: dict = field(default_factory=dict)

    # --- overrides of base-config fields ---
    r: int = 1  # DO NOT CHANGE
    add_blank: bool = True

    # --- testing: each entry is a one-element list holding the sentence ---
    test_sentences: List[List] = field(
        default_factory=lambda: [
            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
            ["Be a voice, not an echo."],
            ["I'm sorry Dave. I'm afraid I can't do that."],
            ["This cake is great. It's so delicious and moist."],
            ["Prior to November 22, 1963."],
        ]
    )

    # --- multi-speaker settings: speaker embedding layer ---
    num_speakers: int = 0
    use_speaker_embedding: bool = False
    speakers_file: str = None
    speaker_embedding_channels: int = 256
    language_ids_file: str = None
    use_language_embedding: bool = False

    # --- multi-speaker settings: d-vectors ---
    use_d_vector_file: bool = False
    d_vector_file: List[str] = None
    d_vector_dim: int = None

    def __post_init__(self):
        # Deliberately a no-op. The propagation of ``model_args`` values onto
        # same-named config attributes is disabled in this trimmed copy:
        #     for key, val in self.model_args.items():
        #         if hasattr(self, key):
        #             self[key] = val
        return None
231
+
232
+
233
+
234
+
235
+
236
def parse_symbols():
    """Return the default symbol inventory as a dict keyed by role.

    Keys are ``pad``, ``eos``, ``bos``, ``characters``, ``punctuations`` and
    ``phonemes``; values are the module-level defaults defined below.
    """
    return dict(
        pad=_pad,
        eos=_eos,
        bos=_bos,
        characters=_characters,
        punctuations=_punctuations,
        phonemes=_phonemes,
    )


# DEFAULT SET OF GRAPHEMES
_pad = "<PAD>"
_eos = "<EOS>"
_bos = "<BOS>"
_blank = "<BLNK>"  # TODO: check if we need this alongside with PAD
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
_punctuations = "!'(),-.:;? "


# DEFAULT SET OF IPA PHONEMES
# Phonemes definition (All IPA characters)
_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
_suprasegmentals = "ˈˌːˑ"
_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
_diacrilics = "ɚ˞ɫ"
# The full phoneme inventory is the concatenation of the groups above, in this order.
_phonemes = "".join(
    (
        _vowels,
        _non_pulmonic_consonants,
        _pulmonic_consonants,
        _suprasegmentals,
        _other_symbols,
        _diacrilics,
    )
)
265
+
266
+
267
class BaseVocabulary:
    """Base Vocabulary class.

    This class only needs a vocabulary dictionary without specifying the characters.

    Args:
        vocab (Dict): A dictionary of characters and their corresponding indices. Token IDs are assigned by
            enumerating this container in iteration order; the values stored in it are not consulted.
        pad (str): Padding token, or None if the vocabulary has none.
        blank (str): Blank token, or None if the vocabulary has none.
        bos (str): Beginning-of-sequence token, or None if the vocabulary has none.
        eos (str): End-of-sequence token, or None if the vocabulary has none.
    """

    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
        self.vocab = vocab
        self.pad = pad
        self.blank = blank
        self.bos = bos
        self.eos = eos

    @property
    def pad_id(self) -> int:
        """Return the index of the padding character. If the padding character is not specified, return the length
        of the vocabulary."""
        return self.char_to_id(self.pad) if self.pad else self.num_chars

    @property
    def blank_id(self) -> int:
        """Return the index of the blank character. If the blank character is not specified, return the length of
        the vocabulary."""
        return self.char_to_id(self.blank) if self.blank else self.num_chars

    @property
    def bos_id(self) -> int:
        """Return the index of the bos character. If the bos character is not specified, return the length of the
        vocabulary."""
        return self.char_to_id(self.bos) if self.bos else self.num_chars

    @property
    def eos_id(self) -> int:
        """Return the index of the eos character. If the eos character is not specified, return the length of the
        vocabulary."""
        return self.char_to_id(self.eos) if self.eos else self.num_chars

    @property
    def vocab(self):
        """Return the vocabulary dictionary."""
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Set the vocabulary dictionary and character mapping dictionaries."""
        self._vocab, self._char_to_id, self._id_to_char = None, None, None
        if vocab is not None:
            self._vocab = vocab
            self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
            self._id_to_char = dict(enumerate(self._vocab))

    @staticmethod
    def init_from_config(config, **kwargs):
        """Initialize from the given config.

        The config and its ``characters`` entry may be mapping-style (supporting ``cfg[key]``) or plain
        attribute-style objects such as a standard ``@dataclass``; both are read the same way.

        Args:
            config: Model config exposing a ``characters`` entry.
            **kwargs: Forwarded to ``BaseVocabulary`` when the config carries no ``vocab_dict``.

        Returns:
            tuple: ``(BaseVocabulary, config)``; the config is returned unchanged.
        """

        def read(cfg, key):
            # Item access first (mapping-style configs), attribute access otherwise (plain dataclasses).
            try:
                return cfg[key]
            except (TypeError, KeyError):
                return getattr(cfg, key, None)

        chars_cfg = read(config, "characters")
        vocab_dict = read(chars_cfg, "vocab_dict") if chars_cfg is not None else None
        if vocab_dict:
            return (
                BaseVocabulary(
                    vocab_dict,
                    read(chars_cfg, "pad"),
                    read(chars_cfg, "blank"),
                    read(chars_cfg, "bos"),
                    read(chars_cfg, "eos"),
                ),
                config,
            )
        return BaseVocabulary(**kwargs), config

    def to_config(self):
        """Export the vocabulary and its special tokens as a ``CharactersConfig``."""
        return CharactersConfig(
            vocab_dict=self._vocab,
            pad=self.pad,
            eos=self.eos,
            bos=self.bos,
            blank=self.blank,
            is_unique=False,
            is_sorted=False,
        )

    @property
    def num_chars(self):
        """Return number of tokens in the vocabulary (0 when no vocabulary is set)."""
        return len(self._vocab) if self._vocab is not None else 0

    def char_to_id(self, char: str) -> int:
        """Map a character to an token ID.

        Raises:
            KeyError: If ``char`` is not in the vocabulary, or no vocabulary is set.
        """
        try:
            return self._char_to_id[char]
        except (KeyError, TypeError) as e:
            # TypeError covers the "vocab is None" case, where the lookup table does not exist.
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        """Map an token ID to a character."""
        return self._id_to_char[idx]
365
+
366
+
367
class BaseCharacters:
    """Character set that derives a token vocabulary from characters, punctuations and special tokens.

    The vocabulary is laid out as ``[pad, eos, bos, blank] + characters + punctuations``, where each special
    token is included only if it is a non-empty string. Changing any of the ingredients through its property
    setter rebuilds the vocabulary.

    Args:
        characters (str): Main set of characters. ``None`` is treated as empty.
        punctuations (str): Punctuation characters appended after the main set. ``None`` is treated as empty.
        pad (str): Padding token.
        eos (str): End-of-sequence token.
        bos (str): Beginning-of-sequence token.
        blank (str): Blank token.
        is_unique (bool): Drop repeated characters (keeping first occurrences) and require the final vocabulary
            to be duplicate free.
        is_sorted (bool): Sort the characters before the special tokens are prepended.
    """

    def __init__(
        self,
        characters: str = None,
        punctuations: str = None,
        pad: str = None,
        eos: str = None,
        bos: str = None,
        blank: str = None,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        self._characters = characters
        self._punctuations = punctuations
        self._pad = pad
        self._eos = eos
        self._bos = bos
        self._blank = blank
        self.is_unique = is_unique
        self.is_sorted = is_sorted
        self._create_vocab()

    @property
    def pad_id(self) -> int:
        """Index of the pad token, or the vocabulary length when no pad token is set."""
        return self.char_to_id(self.pad) if self.pad else len(self.vocab)

    @property
    def blank_id(self) -> int:
        """Index of the blank token, or the vocabulary length when no blank token is set."""
        return self.char_to_id(self.blank) if self.blank else len(self.vocab)

    @property
    def eos_id(self) -> int:
        """Index of the eos token, or the vocabulary length when no eos token is set."""
        return self.char_to_id(self.eos) if self.eos else len(self.vocab)

    @property
    def bos_id(self) -> int:
        """Index of the bos token, or the vocabulary length when no bos token is set."""
        return self.char_to_id(self.bos) if self.bos else len(self.vocab)

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, characters):
        self._characters = characters
        self._create_vocab()

    @property
    def punctuations(self):
        return self._punctuations

    @punctuations.setter
    def punctuations(self, punctuations):
        self._punctuations = punctuations
        self._create_vocab()

    @property
    def pad(self):
        return self._pad

    @pad.setter
    def pad(self, pad):
        self._pad = pad
        self._create_vocab()

    @property
    def eos(self):
        return self._eos

    @eos.setter
    def eos(self, eos):
        self._eos = eos
        self._create_vocab()

    @property
    def bos(self):
        return self._bos

    @bos.setter
    def bos(self, bos):
        self._bos = bos
        self._create_vocab()

    @property
    def blank(self):
        return self._blank

    @blank.setter
    def blank(self, blank):
        self._blank = blank
        self._create_vocab()

    @property
    def vocab(self):
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Store the vocabulary and rebuild both lookup tables from it."""
        self._vocab = vocab
        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self._id_to_char = dict(enumerate(self.vocab))

    @property
    def num_chars(self):
        """Number of entries in the vocabulary."""
        return len(self._vocab)

    def _create_vocab(self):
        """Rebuild the vocabulary from the current characters, punctuations and special tokens.

        Raises:
            AssertionError: If ``is_unique`` is set and the resulting vocabulary still contains duplicates.
        """
        # The constructor defaults are None; treat that as "no characters" instead of crashing.
        chars = self._characters if self._characters is not None else ""
        puncs = self._punctuations if self._punctuations is not None else ""
        if self.is_unique:
            # dict.fromkeys keeps first occurrences in order, so the ids are reproducible across runs
            # (a plain set() would order the characters by their randomized hash).
            chars = list(dict.fromkeys(chars))
        if self.is_sorted:
            chars = sorted(chars)
        vocab = list(chars)
        # Prepending in this order yields [pad, eos, bos, blank] at the front of the vocabulary.
        for special in (self._blank, self._bos, self._eos, self._pad):
            if special is not None and len(special) > 0:
                vocab = [special] + vocab
        self.vocab = vocab + list(puncs)
        if self.is_unique and not (len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)):
            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
            # Raised explicitly so the check is not stripped when Python runs with -O.
            raise AssertionError(f" [!] There are duplicate characters in the character set. {duplicates}")

    def char_to_id(self, char: str) -> int:
        """Map a character to its token ID.

        Raises:
            KeyError: If ``char`` is not part of the vocabulary.
        """
        try:
            return self._char_to_id[char]
        except KeyError as e:
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        """Map a token ID back to its character."""
        return self._id_to_char[idx]

    def print_log(self, level: int = 0):
        """
        Prints the vocabulary in a nice format.
        """
        indent = "\t" * level
        print(f"{indent}| > Characters: {self._characters}")
        print(f"{indent}| > Punctuations: {self._punctuations}")
        print(f"{indent}| > Pad: {self._pad}")
        print(f"{indent}| > EOS: {self._eos}")
        print(f"{indent}| > BOS: {self._bos}")
        print(f"{indent}| > Blank: {self._blank}")
        print(f"{indent}| > Vocab: {self.vocab}")
        print(f"{indent}| > Num chars: {self.num_chars}")

    @staticmethod
    def _read_fields(chars_cfg, names):
        """Collect the entries of ``names`` present on a mapping-style or attribute-style config."""
        found = {}
        for name in names:
            try:
                found[name] = chars_cfg[name]
            except KeyError:
                continue
            except TypeError:
                # Not subscriptable (e.g. a plain dataclass): fall back to attribute access.
                if hasattr(chars_cfg, name):
                    found[name] = getattr(chars_cfg, name)
        return found

    @staticmethod
    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
        """Init your character class from a config.

        Implement this method for your subclass.

        Returns:
            tuple: ``(characters, config)``. When ``config.characters`` is None a default character set is
            created and a copy of the config carrying it is returned instead.
        """
        # use character set from config
        if config.characters is not None:
            init_keys = ("characters", "punctuations", "pad", "eos", "bos", "blank", "is_unique", "is_sorted")
            # Only constructor arguments are forwarded; other config entries are ignored.
            return BaseCharacters(**BaseCharacters._read_fields(config.characters, init_keys)), config
        # return default character set
        characters = BaseCharacters()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config

    def to_config(self) -> "CharactersConfig":
        """Export the current settings as a ``CharactersConfig``."""
        return CharactersConfig(
            characters=self._characters,
            punctuations=self._punctuations,
            pad=self._pad,
            eos=self._eos,
            bos=self._bos,
            blank=self._blank,
            is_unique=self.is_unique,
            is_sorted=self.is_sorted,
        )
543
+
544
+
545
class IPAPhonemes(BaseCharacters):
    """Character set whose defaults are the module's IPA phoneme inventory and default special tokens."""

    def __init__(
        self,
        characters: str = _phonemes,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def _read_fields(chars_cfg, names):
        """Collect the entries of ``names`` present on a mapping-style or attribute-style config."""
        found = {}
        for name in names:
            try:
                found[name] = chars_cfg[name]
            except KeyError:
                continue
            except TypeError:
                # Not subscriptable (e.g. a plain dataclass): fall back to attribute access.
                if hasattr(chars_cfg, name):
                    found[name] = getattr(chars_cfg, name)
        return found

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Init a IPAPhonemes object from a model config

        If characters are not defined in the config, it will be set to the default characters and the config
        will be updated.

        The config and its ``characters`` entry may be mapping-style or plain attribute-style objects.

        Returns:
            tuple: ``(IPAPhonemes, config)``.
        """
        chars_cfg = IPAPhonemes._read_fields(config, ("characters",)).get("characters")
        if chars_cfg is not None:
            init_keys = ("characters", "punctuations", "pad", "eos", "bos", "blank", "is_unique", "is_sorted")
            kwargs = IPAPhonemes._read_fields(chars_cfg, init_keys)
            # band-aid for compatibility with old models: a `phonemes` entry takes the place of `characters`
            # and is also written back into the config, as before.
            phonemes = IPAPhonemes._read_fields(chars_cfg, ("phonemes",)).get("phonemes")
            if phonemes is not None:
                try:
                    chars_cfg["characters"] = phonemes
                except TypeError:
                    setattr(chars_cfg, "characters", phonemes)
                kwargs["characters"] = phonemes
            return IPAPhonemes(**kwargs), config
        # return default character set
        characters = IPAPhonemes()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config
592
+
593
+
594
class Graphemes(BaseCharacters):
    """Character set whose defaults are the module's default graphemes and default special tokens."""

    def __init__(
        self,
        characters: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def _read_fields(chars_cfg, names):
        """Collect the entries of ``names`` present on a mapping-style or attribute-style config."""
        found = {}
        for name in names:
            try:
                found[name] = chars_cfg[name]
            except KeyError:
                continue
            except TypeError:
                # Not subscriptable (e.g. a plain dataclass): fall back to attribute access.
                if hasattr(chars_cfg, name):
                    found[name] = getattr(chars_cfg, name)
        return found

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Init a Graphemes object from a model config

        If characters are not defined in the config, it will be set to the default characters and the config
        will be updated.

        The ``characters`` entry may be mapping-style or a plain attribute-style object.

        Returns:
            tuple: ``(Graphemes, config)``.
        """
        if config.characters is not None:
            # Forward only the constructor arguments; this also ignores a legacy `phonemes` entry,
            # which is what the former compatibility branch did.
            init_keys = ("characters", "punctuations", "pad", "eos", "bos", "blank", "is_unique", "is_sorted")
            return Graphemes(**Graphemes._read_fields(config.characters, init_keys)), config
        characters = Graphemes()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config
637
+
638
+
639
+ if __name__ == "__main__":
640
+ gr = Graphemes()
641
+ ph = IPAPhonemes()
642
+ gr.print_log()
643
+ ph.print_log()
644
+
645
+
646
class VitsCharacters(BaseCharacters):
    """Characters class for VITs model for compatibility with pre-trained models

    The vocabulary layout is ``[pad] + punctuations + characters + [blank]``. A symbol occurring more than
    once keeps every slot in the vocabulary, while its ID is that of the last occurrence.
    """

    def __init__(
        self,
        graphemes: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        ipa_characters: str = _phonemes,
    ) -> None:
        # IPA characters, when given, are appended to the graphemes
        all_characters = graphemes if ipa_characters is None else graphemes + ipa_characters
        super().__init__(all_characters, punctuations, pad, None, None, "<BLNK>", is_unique=False, is_sorted=True)

    def _create_vocab(self):
        """Build the vocabulary and both lookup tables without sorting or de-duplicating."""
        symbols = [self._pad]
        symbols.extend(self._punctuations)
        symbols.extend(self._characters)
        symbols.append(self._blank)
        self._vocab = symbols
        char_to_id = {}
        id_to_char = {}
        for position, symbol in enumerate(self.vocab):
            char_to_id[symbol] = position  # later occurrences overwrite earlier ones
            id_to_char[position] = symbol
        self._char_to_id = char_to_id
        self._id_to_char = id_to_char

    @staticmethod
    def init_from_config(config):
        """Create the character set from ``config.characters`` and return it with the unchanged config."""
        chars_cfg = config.characters
        vits_characters = VitsCharacters(
            graphemes=chars_cfg.characters,
            ipa_characters=chars_cfg.phonemes,
            punctuations=chars_cfg.punctuations,
            pad=chars_cfg.pad,
        )
        return vits_characters, config

    def to_config(self) -> "CharactersConfig":
        """Export the settings as a ``CharactersConfig`` (no eos/bos, not unique, sorted)."""
        exported = CharactersConfig(
            characters=self._characters,
            punctuations=self._punctuations,
            pad=self._pad,
            eos=None,
            bos=None,
            blank=self._blank,
            is_unique=False,
            is_sorted=True,
        )
        return exported
688
+
689
class TTSTokenizer:
    """Turn raw text into token IDs: clean the text, map characters to IDs, intersperse the blank token.

    Args:
        text_cleaner (Callable): Function applied to the text before encoding. ``None`` skips cleaning.
        characters (BaseCharacters): Character set providing ``char_to_id`` and the special token IDs.
    """

    def __init__(
        self,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
    ):
        self.text_cleaner = text_cleaner
        self.characters = characters
        self.not_found_characters = []

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        """Store the character set and cache its pad/blank IDs (None when the token is not defined)."""
        self._characters = new_characters
        if new_characters is None:
            # No character set yet (the constructor default): nothing to look up.
            self.pad_id = None
            self.blank_id = None
            return
        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None

    def encode(self, text: str) -> List[int]:
        """Encodes a string of text as a sequence of IDs.

        Characters missing from the vocabulary are dropped; each one is reported once and remembered in
        ``not_found_characters``.
        """
        token_ids = []
        for char in text:
            try:
                idx = self.characters.char_to_id(char)
                token_ids.append(idx)
            except KeyError:
                # discard but store not found characters
                if char not in self.not_found_characters:
                    self.not_found_characters.append(char)
                    print(text)
                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
        return token_ids

    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
        """Clean ``text``, encode it and intersperse the blank token between all IDs."""
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
        text = self.encode(text)
        text = self.intersperse_blank_char(text, True)
        return text

    def pad_with_bos_eos(self, char_sequence: List[str]):
        """Pads a sequence with the special BOS and EOS characters."""
        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]

    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
        """Intersperses the blank character between characters in a sequence.

        Use the ```blank``` character if defined else use the ```pad``` character.
        """
        # Both branches must yield a token ID; the sequence being padded holds IDs, not strings.
        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad_id
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    @staticmethod
    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
        """Build a tokenizer from a model config.

        The cleaner is always ``multilingual_cleaners``; ``config.text_cleaner`` is not consulted.

        Args:
            config: Model config with a ``characters`` entry.
            characters: Optional ready-made character set. When omitted, a ``VitsCharacters`` is created
                from the config.

        Returns:
            tuple: ``(TTSTokenizer, config)`` with ``config.characters.characters_class`` set to
            ``VitsCharacters``.
        """
        text_cleaner = multilingual_cleaners
        if characters is None:
            characters, new_config = VitsCharacters.init_from_config(config)
        else:
            new_config = config
        # new_config.characters.characters_class = get_import_path(characters)
        new_config.characters.characters_class = VitsCharacters
        return TTSTokenizer(text_cleaner, characters), new_config
753
+
754
+
755
def multilingual_cleaners(text):
    """Pipeline for multilingual text"""
    # lowercase -> symbol replacement (no language) -> auxiliary symbol removal -> whitespace collapse
    lowered = lowercase(text)
    without_symbols = replace_symbols(lowered, lang=None)
    without_aux = remove_aux_symbols(without_symbols)
    return collapse_whitespace(without_aux)
762
+
763
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
765
+
766
def collapse_whitespace(text):
    """Replace every whitespace run with one space and trim both ends."""
    squeezed = _whitespace_re.sub(" ", text)
    return squeezed.strip()
768
+
769
def replace_symbols(text, lang="en"):
    """Normalize a few symbols before tokenization.

    ``;`` and ``:`` become ``,``; ``-`` becomes a space (or is removed for ``ca``); ``&`` is spelled out for
    ``en``, ``fr``, ``pt`` and ``ca``; apostrophes are removed for ``ca``.

    Args:
        text (str): Input text.
        lang (str): Language code selecting the language-specific replacements.

    Returns:
        str: The text with the replacements applied.
    """
    is_catalan = lang == "ca"
    text = text.replace(";", ",")
    text = text.replace("-", "" if is_catalan else " ")
    text = text.replace(":", ",")
    ampersand_words = {"en": " and ", "fr": " et ", "pt": " e ", "ca": " i "}
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])
    if is_catalan:
        text = text.replace("'", "")
    return text
784
+
785
def remove_aux_symbols(text):
    """Strip angle brackets, parentheses, square brackets and double quotes from ``text``."""
    aux_symbols = r"[\<\>\(\)\[\]\"]+"
    return re.sub(aux_symbols, "", text)
hi_male/hi_male_vits_30hrs.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb36eca2d90214662f1647e83eb6979ead93b72f269606c6411f52959acf77a8
3
+ size 333256012
hi_male/jit_infer.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Synthesize one Hindi sentence with the TorchScript-exported male VITS voice and save it as jit.wav."""
import os
from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
import torch
import numpy as np
import soundfile as sf

# Look the bundled assets up next to this script so it also runs from another working directory.
_HERE = os.path.dirname(os.path.abspath(__file__))
SAMPLE_RATE = 22050  # sample rate handed to soundfile when writing the result

# hi male
# Decode explicitly as UTF-8: the locale default encoding is not reliable for non-ASCII character
# inventories (the sample text below is Devanagari).
with open(os.path.join(_HERE, "chars.txt"), 'r', encoding="utf-8") as f:
    letters = f.read().strip('\n')
model = os.path.join(_HERE, "hi_male_vits_30hrs.pt")
text = "फिल्म गर्दिश में अमरीश पुरी के साथ जैकी श्रॉफ, ऐश्वर्या, डिंपल कपाड़िया"

config = VitsConfig(
    text_cleaner="multilingual_cleaners",
    characters=CharactersConfig(
        characters_class=VitsCharacters,
        pad="<PAD>",
        eos="<EOS>",
        bos="<BOS>",
        blank="<BLNK>",
        characters=letters,
        punctuations="!¡'(),-.:;¿? ",
        phonemes=None)
    )
tokenizer, config = TTSTokenizer.init_from_config(config)

x = tokenizer.text_to_ids(text)
# Pin int64 so the id tensor does not depend on NumPy's platform default integer type;
# unsqueeze adds a leading dimension of size 1.
x = torch.from_numpy(np.asarray(x, dtype=np.int64)).unsqueeze(0)
net = torch.jit.load(model)
with torch.no_grad():
    out2 = net(x)
# The output file is still written to the current working directory.
sf.write("jit.wav", out2.squeeze().cpu().numpy(), SAMPLE_RATE)
hne_female/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
hne_female/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ ---
hne_female/ch_female_vits_30hrs.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3393916262f03807d8338aa8dce79379582c71a0ada346457e36ea6f72a6635
3
+ size 333255366
hne_female/chars.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ खछगचऊुलशौढ़इणज़झैठढजफ़औ्ड़फूेानटॅयव़ऋदप.थअँऑआघहतषरसभउञडएईऐक़ िओ?धी,ॉंख़कोबमृ
hne_female/extra.py ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, List, Union
2
+ from dataclasses import asdict, dataclass, field
3
+
4
+
5
+ import re
6
+ from dataclasses import replace
7
+ from typing import Dict
8
+ _whitespace_re = re.compile(r"\s+")
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import List
12
+
13
+ # from TTS.tts.configs.shared_configs import BaseTTSConfig
14
+ # from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
15
+
16
@dataclass
class CharactersConfig():
    """Settings describing the character set / vocabulary of a model.

    Every field except ``is_unique`` and ``is_sorted`` defaults to None.
    """

    # Class used to build the character set from this config.
    # NOTE(review): annotated as str; confirm whether callers store a class object here instead.
    characters_class: str = None

    # using BaseVocabulary
    vocab_dict: Dict = None

    # using on BaseCharacters
    pad: str = None
    eos: str = None
    bos: str = None
    blank: str = None
    characters: str = None
    punctuations: str = None
    phonemes: str = None
    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
    is_sorted: bool = True
34
+
35
+
36
@dataclass
class BaseTTSConfig():
    """Settings shared by TTS model configs; ``VitsConfig`` in this file derives from it.

    This is a plain dataclass: it only stores values and defines no behavior of its own.
    """

    # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    # phoneme settings
    use_phonemes: bool = False
    phonemizer: str = None
    phoneme_language: str = None
    compute_input_seq_cache: bool = False
    text_cleaner: str = None
    enable_eos_bos_chars: bool = False
    test_sentences_file: str = ""
    phoneme_cache_path: str = None
    # vocabulary parameters
    characters: CharactersConfig = None
    add_blank: bool = False
    # training params
    batch_group_size: int = 0
    loss_masking: bool = None
    # dataloading
    min_audio_len: int = 1
    max_audio_len: int = float("inf")  # NOTE: the default is a float despite the int annotation
    min_text_len: int = 1
    max_text_len: int = float("inf")  # NOTE: the default is a float despite the int annotation
    compute_f0: bool = False
    compute_energy: bool = False
    compute_linear_spec: bool = False
    precompute_num_workers: int = 0
    use_noise_augment: bool = False
    start_by_longest: bool = False
    shuffle: bool = False
    drop_last: bool = False
    # dataset
    datasets: str = None
    # optimizer
    optimizer: str = "radam"
    optimizer_params: dict = None
    # scheduler
    lr_scheduler: str = None
    lr_scheduler_params: dict = field(default_factory=lambda: {})
    # testing
    test_sentences: List[str] = field(default_factory=lambda: [])
    # evaluation
    eval_split_max_size: int = None
    eval_split_size: float = 0.01
    # weighted samplers
    use_speaker_weighted_sampler: bool = False
    speaker_weighted_sampler_alpha: float = 1.0
    use_language_weighted_sampler: bool = False
    language_weighted_sampler_alpha: float = 1.0
    use_length_weighted_sampler: bool = False
    length_weighted_sampler_alpha: float = 1.0
88
+
89
+
90
@dataclass
class VitsAudioConfig():
    """Audio settings stored by ``VitsConfig.audio``; a plain value container."""
    fft_size: int = 1024
    sample_rate: int = 22050
    win_length: int = 1024
    hop_length: int = 256
    num_mels: int = 80
    mel_fmin: int = 0
    mel_fmax: int = None  # None by default despite the int annotation
99
+
100
@dataclass
class VitsArgs():
    """Model arguments stored by ``VitsConfig.model_args``.

    A plain value container: list-valued fields use ``default_factory`` so that each instance gets its own
    list.
    """
    num_chars: int = 100
    out_channels: int = 513
    spec_segment_size: int = 32
    hidden_channels: int = 192
    hidden_channels_ffn_text_encoder: int = 768
    num_heads_text_encoder: int = 2
    num_layers_text_encoder: int = 6
    kernel_size_text_encoder: int = 3
    dropout_p_text_encoder: float = 0.1
    dropout_p_duration_predictor: float = 0.5
    kernel_size_posterior_encoder: int = 5
    dilation_rate_posterior_encoder: int = 1
    num_layers_posterior_encoder: int = 16
    kernel_size_flow: int = 5
    dilation_rate_flow: int = 1
    num_layers_flow: int = 4
    resblock_type_decoder: str = "1"
    resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
    resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
    upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
    upsample_initial_channel_decoder: int = 512
    upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
    periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
    use_sdp: bool = True
    noise_scale: float = 1.0
    inference_noise_scale: float = 0.667
    length_scale: float = 1
    noise_scale_dp: float = 1.0
    inference_noise_scale_dp: float = 1.0
    max_inference_len: int = None
    init_discriminator: bool = True
    use_spectral_norm_disriminator: bool = False
    use_speaker_embedding: bool = False
    num_speakers: int = 0
    speakers_file: str = None
    d_vector_file: List[str] = None
    speaker_embedding_channels: int = 256
    use_d_vector_file: bool = False
    d_vector_dim: int = 0
    detach_dp_input: bool = True
    use_language_embedding: bool = False
    embedded_language_dim: int = 4
    num_languages: int = 0
    language_ids_file: str = None
    use_speaker_encoder_as_loss: bool = False
    speaker_encoder_config_path: str = ""
    speaker_encoder_model_path: str = ""
    condition_dp_on_speaker: bool = True
    freeze_encoder: bool = False
    freeze_DP: bool = False
    freeze_PE: bool = False
    freeze_flow_decoder: bool = False
    freeze_waveform_decoder: bool = False
    encoder_sample_rate: int = None
    interpolate_z: bool = True
    reinit_DP: bool = False
    reinit_text_encoder: bool = False
159
@dataclass
class VitsConfig(BaseTTSConfig):
    """Configuration of a VITS model, extending ``BaseTTSConfig``.

    It redeclares ``optimizer``, ``optimizer_params``, ``compute_linear_spec``, ``add_blank`` and
    ``test_sentences`` with different defaults than the base class and adds the VITS-specific fields below.
    """

    model: str = "vits"
    # model specific params
    model_args: VitsArgs = field(default_factory=VitsArgs)
    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)

    # optimizer
    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
    lr_gen: float = 0.0002
    lr_disc: float = 0.0002
    lr_scheduler_gen: str = "ExponentialLR"
    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    lr_scheduler_disc: str = "ExponentialLR"
    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
    scheduler_after_epoch: bool = True
    optimizer: str = "AdamW"
    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})

    # loss params
    kl_loss_alpha: float = 1.0
    disc_loss_alpha: float = 1.0
    gen_loss_alpha: float = 1.0
    feat_loss_alpha: float = 1.0
    mel_loss_alpha: float = 45.0
    dur_loss_alpha: float = 1.0
    speaker_encoder_loss_alpha: float = 1.0

    # data loader params
    return_wav: bool = True
    compute_linear_spec: bool = True

    # sampler params
    use_weighted_sampler: bool = False  # TODO: move it to the base config
    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})

    # overrides
    r: int = 1  # DO NOT CHANGE
    add_blank: bool = True

    # testing
    test_sentences: List[List] = field(
        default_factory=lambda: [
            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
            ["Be a voice, not an echo."],
            ["I'm sorry Dave. I'm afraid I can't do that."],
            ["This cake is great. It's so delicious and moist."],
            ["Prior to November 22, 1963."],
        ]
    )

    # multi-speaker settings
    # use speaker embedding layer
    num_speakers: int = 0
    use_speaker_embedding: bool = False
    speakers_file: str = None
    speaker_embedding_channels: int = 256
    language_ids_file: str = None
    use_language_embedding: bool = False

    # use d-vectors
    use_d_vector_file: bool = False
    d_vector_file: List[str] = None
    d_vector_dim: int = None

    def __post_init__(self):
        # Intentionally a no-op: the copying of `model_args` entries onto this config is disabled below.
        pass
        # for key, val in self.model_args.items():
        #     if hasattr(self, key):
        #         self[key] = val
231
+
232
+
233
+
234
+
235
+
236
def parse_symbols():
    """Return the module-level default symbol sets keyed by their role.

    Returns:
        dict: Keys ``pad``, ``eos``, ``bos``, ``characters``, ``punctuations`` and ``phonemes``, each mapped to
        the corresponding module constant.
    """
    symbols = dict(
        pad=_pad,
        eos=_eos,
        bos=_bos,
        characters=_characters,
        punctuations=_punctuations,
        phonemes=_phonemes,
    )
    return symbols
245
+
246
+
247
+ # DEFAULT SET OF GRAPHEMES
248
+ _pad = "<PAD>"
249
+ _eos = "<EOS>"
250
+ _bos = "<BOS>"
251
+ _blank = "<BLNK>" # TODO: check if we need this alongside with PAD
252
+ _characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
253
+ _punctuations = "!'(),-.:;? "
254
+
255
+
256
+ # DEFAULT SET OF IPA PHONEMES
257
+ # Phonemes definition (All IPA characters)
258
+ _vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
259
+ _non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
260
+ _pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
261
+ _suprasegmentals = "ˈˌːˑ"
262
+ _other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
263
+ _diacrilics = "ɚ˞ɫ"
264
+ _phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
265
+
266
+
267
class BaseVocabulary:
    """Vocabulary defined directly by a collection of tokens.

    Nothing is derived from character strings here: the tokens are used as
    given, and IDs follow the iteration order of ``vocab``.

    Args:
        vocab (Dict): A dictionary of characters and their corresponding indices.
        pad (str): Optional padding token.
        blank (str): Optional blank token.
        bos (str): Optional beginning-of-sequence token.
        eos (str): Optional end-of-sequence token.
    """

    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
        self.vocab = vocab
        self.pad = pad
        self.blank = blank
        self.bos = bos
        self.eos = eos

    def _special_token_id(self, token) -> int:
        """ID of a special token, or the vocabulary length when the token is unset."""
        if token:
            return self.char_to_id(token)
        return len(self.vocab)

    @property
    def pad_id(self) -> int:
        """Index of the padding token; the vocabulary length if no padding token is set."""
        return self._special_token_id(self.pad)

    @property
    def blank_id(self) -> int:
        """Index of the blank token; the vocabulary length if no blank token is set."""
        return self._special_token_id(self.blank)

    @property
    def bos_id(self) -> int:
        """Index of the bos token; the vocabulary length if no bos token is set."""
        return self._special_token_id(self.bos)

    @property
    def eos_id(self) -> int:
        """Index of the eos token; the vocabulary length if no eos token is set."""
        return self._special_token_id(self.eos)

    @property
    def vocab(self):
        """The vocabulary as it was supplied."""
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Store the vocabulary and rebuild both lookup tables from its iteration order."""
        self._vocab = None
        self._char_to_id = None
        self._id_to_char = None
        if vocab is None:
            return
        self._vocab = vocab
        forward = {}
        for position, token in enumerate(self._vocab):
            forward[token] = position
        self._char_to_id = forward
        self._id_to_char = dict(enumerate(self._vocab))

    @staticmethod
    def init_from_config(config, **kwargs):
        """Build a vocabulary from ``config.characters.vocab_dict`` when present, else from ``kwargs``.

        Returns:
            A ``(vocabulary, config)`` tuple; ``config`` is returned unchanged.
        """
        has_vocab_dict = (
            config.characters is not None
            and "vocab_dict" in config.characters
            and config.characters.vocab_dict
        )
        if not has_vocab_dict:
            return BaseVocabulary(**kwargs), config
        vocabulary = BaseVocabulary(
            config.characters.vocab_dict,
            config.characters.pad,
            config.characters.blank,
            config.characters.bos,
            config.characters.eos,
        )
        return vocabulary, config

    def to_config(self):
        """Export the vocabulary and its special tokens as a ``CharactersConfig``."""
        settings = {
            "vocab_dict": self._vocab,
            "pad": self.pad,
            "eos": self.eos,
            "bos": self.bos,
            "blank": self.blank,
            "is_unique": False,
            "is_sorted": False,
        }
        return CharactersConfig(**settings)

    @property
    def num_chars(self):
        """Number of tokens in the vocabulary."""
        return len(self._vocab)

    def char_to_id(self, char: str) -> int:
        """Look up the ID of a token.

        Raises:
            KeyError: If ``char`` is not part of the vocabulary.
        """
        try:
            token_id = self._char_to_id[char]
        except KeyError as missing:
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from missing
        return token_id

    def id_to_char(self, idx: int) -> str:
        """Look up the token stored at the given ID."""
        return self._id_to_char[idx]
365
+
366
+
367
class BaseCharacters:
    """Character inventory that maps symbols to integer IDs and back.

    ``_create_vocab`` lays the vocabulary out as the special tokens in the
    order pad, eos, bos, blank (each one only if it is a non-empty string),
    followed by the characters and then the punctuations. Assigning to
    ``characters``, ``punctuations``, ``pad``, ``eos``, ``bos`` or ``blank``
    rebuilds the vocabulary.

    Args:
        characters (str): Main symbols. ``None`` is treated as "no symbols".
        punctuations (str): Symbols appended after the characters. ``None`` is
            treated as "no punctuations".
        pad (str): Padding token, or ``None``.
        eos (str): End-of-sequence token, or ``None``.
        bos (str): Beginning-of-sequence token, or ``None``.
        blank (str): Blank token, or ``None``.
        is_unique (bool): Deduplicate ``characters`` and assert that the final
            vocabulary contains no duplicates.
        is_sorted (bool): Sort ``characters`` before building the vocabulary.
    """

    def __init__(
        self,
        characters: str = None,
        punctuations: str = None,
        pad: str = None,
        eos: str = None,
        bos: str = None,
        blank: str = None,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        self._characters = characters
        self._punctuations = punctuations
        self._pad = pad
        self._eos = eos
        self._bos = bos
        self._blank = blank
        self.is_unique = is_unique
        self.is_sorted = is_sorted
        self._create_vocab()

    @property
    def pad_id(self) -> int:
        """ID of the pad token, or the vocabulary length when no pad token is set."""
        return self.char_to_id(self.pad) if self.pad else len(self.vocab)

    @property
    def blank_id(self) -> int:
        """ID of the blank token, or the vocabulary length when no blank token is set."""
        return self.char_to_id(self.blank) if self.blank else len(self.vocab)

    @property
    def eos_id(self) -> int:
        """ID of the eos token, or the vocabulary length when no eos token is set."""
        return self.char_to_id(self.eos) if self.eos else len(self.vocab)

    @property
    def bos_id(self) -> int:
        """ID of the bos token, or the vocabulary length when no bos token is set."""
        return self.char_to_id(self.bos) if self.bos else len(self.vocab)

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, characters):
        self._characters = characters
        self._create_vocab()

    @property
    def punctuations(self):
        return self._punctuations

    @punctuations.setter
    def punctuations(self, punctuations):
        self._punctuations = punctuations
        self._create_vocab()

    @property
    def pad(self):
        return self._pad

    @pad.setter
    def pad(self, pad):
        self._pad = pad
        self._create_vocab()

    @property
    def eos(self):
        return self._eos

    @eos.setter
    def eos(self, eos):
        self._eos = eos
        self._create_vocab()

    @property
    def bos(self):
        return self._bos

    @bos.setter
    def bos(self, bos):
        self._bos = bos
        self._create_vocab()

    @property
    def blank(self):
        return self._blank

    @blank.setter
    def blank(self, blank):
        self._blank = blank
        self._create_vocab()

    @property
    def vocab(self):
        return self._vocab

    @vocab.setter
    def vocab(self, vocab):
        """Store the token list and rebuild both lookup tables from its order."""
        self._vocab = vocab
        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
        self._id_to_char = {
            idx: char for idx, char in enumerate(self.vocab)  # pylint: disable=unnecessary-comprehension
        }

    @property
    def num_chars(self):
        """Number of tokens in the vocabulary."""
        return len(self._vocab)

    def _create_vocab(self):
        """Rebuild the vocabulary and lookup tables from the current settings."""
        # `None` means "no symbols". Without these fallbacks the all-default
        # constructor, which `init_from_config` uses when the config carries no
        # character set, fails with TypeError in sorted()/list().
        _vocab = self._characters if self._characters is not None else ""
        _punctuations = self._punctuations if self._punctuations is not None else ""
        if self.is_unique:
            _vocab = list(set(_vocab))
        if self.is_sorted:
            _vocab = sorted(_vocab)
        _vocab = list(_vocab)
        # Each token is prepended in turn, so the final order is pad, eos, bos, blank.
        for special in (self._blank, self._bos, self._eos, self._pad):
            if special is not None and len(special) > 0:
                _vocab = [special] + _vocab
        self.vocab = _vocab + list(_punctuations)
        if self.is_unique:
            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
            assert (
                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
            ), f" [!] There are duplicate characters in the character set. {duplicates}"

    def char_to_id(self, char: str) -> int:
        """Return the ID of ``char``.

        Raises:
            KeyError: If ``char`` is not in the vocabulary.
        """
        try:
            return self._char_to_id[char]
        except KeyError as e:
            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        """Return the token stored at ID ``idx``."""
        return self._id_to_char[idx]

    def print_log(self, level: int = 0):
        """
        Prints the vocabulary in a nice format.
        """
        indent = "\t" * level
        print(f"{indent}| > Characters: {self._characters}")
        print(f"{indent}| > Punctuations: {self._punctuations}")
        print(f"{indent}| > Pad: {self._pad}")
        print(f"{indent}| > EOS: {self._eos}")
        print(f"{indent}| > BOS: {self._bos}")
        print(f"{indent}| > Blank: {self._blank}")
        print(f"{indent}| > Vocab: {self.vocab}")
        print(f"{indent}| > Num chars: {self.num_chars}")

    @staticmethod
    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
        """Init your character class from a config.

        Implement this method for your subclass.

        Returns:
            A ``(characters, config)`` tuple. When the config has no character
            set, an empty default instance is created and a copy of the config
            carrying its settings is returned instead.
        """
        # use character set from config
        if config.characters is not None:
            return BaseCharacters(**config.characters), config
        # return default character set
        characters = BaseCharacters()
        new_config = replace(config, characters=characters.to_config())
        return characters, new_config

    def to_config(self) -> "CharactersConfig":
        """Export the current settings as a ``CharactersConfig``."""
        return CharactersConfig(
            characters=self._characters,
            punctuations=self._punctuations,
            pad=self._pad,
            eos=self._eos,
            bos=self._bos,
            blank=self._blank,
            is_unique=self.is_unique,
            is_sorted=self.is_sorted,
        )
543
+
544
+
545
class IPAPhonemes(BaseCharacters):
    """Character set whose default symbols are the module's IPA phoneme inventory."""

    def __init__(
        self,
        characters: str = _phonemes,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(
            characters=characters,
            punctuations=punctuations,
            pad=pad,
            eos=eos,
            bos=bos,
            blank=blank,
            is_unique=is_unique,
            is_sorted=is_sorted,
        )

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Create an IPAPhonemes instance from a model config.

        When the config defines no characters, the defaults are used and a
        copy of the config carrying those defaults is returned.
        """
        # Compatibility path for old models: a non-None `phonemes` entry
        # overrides `characters` in the config before the instance is built.
        if "characters" in config and config.characters is not None:
            if "phonemes" in config.characters and config.characters.phonemes is not None:
                config.characters["characters"] = config.characters["phonemes"]
            field_names = (
                "characters",
                "punctuations",
                "pad",
                "eos",
                "bos",
                "blank",
                "is_unique",
                "is_sorted",
            )
            selected = {name: config.characters[name] for name in field_names}
            return IPAPhonemes(**selected), config
        # Character set taken wholesale from the config.
        if config.characters is not None:
            return IPAPhonemes(**config.characters), config
        # Nothing configured: fall back to the defaults.
        default_set = IPAPhonemes()
        updated_config = replace(config, characters=default_set.to_config())
        return default_set, updated_config
592
+
593
+
594
class Graphemes(BaseCharacters):
    """Character set whose default symbols are the module's grapheme inventory."""

    def __init__(
        self,
        characters: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(
            characters=characters,
            punctuations=punctuations,
            pad=pad,
            eos=eos,
            bos=bos,
            blank=blank,
            is_unique=is_unique,
            is_sorted=is_sorted,
        )

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Create a Graphemes instance from a model config.

        When the config defines no characters, the defaults are used and a
        copy of the config carrying those defaults is returned.
        """
        if config.characters is None:
            default_set = Graphemes()
            updated_config = replace(config, characters=default_set.to_config())
            return default_set, updated_config

        # Compatibility path for old models: when the config also has a
        # `phonemes` entry, pass only the fields this class accepts.
        if "phonemes" in config.characters:
            field_names = (
                "characters",
                "punctuations",
                "pad",
                "eos",
                "bos",
                "blank",
                "is_unique",
                "is_sorted",
            )
            selected = {name: config.characters[name] for name in field_names}
            return Graphemes(**selected), config
        return Graphemes(**config.characters), config
637
+
638
+
639
if __name__ == "__main__":
    # Manual smoke check: build both default character sets, then print each one.
    for charset in (Graphemes(), IPAPhonemes()):
        charset.print_log()
644
+
645
+
646
class VitsCharacters(BaseCharacters):
    """Characters class for VITs model for compatibility with pre-trained models"""

    def __init__(
        self,
        graphemes: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        ipa_characters: str = _phonemes,
    ) -> None:
        # IPA symbols, when given, are appended to the graphemes.
        if ipa_characters is not None:
            graphemes += ipa_characters
        super().__init__(
            graphemes,
            punctuations,
            pad,
            None,
            None,
            "<BLNK>",
            is_unique=False,
            is_sorted=True,
        )

    def _create_vocab(self):
        """Build the vocabulary as pad, punctuations, characters, blank (no sorting)."""
        tokens = [self._pad]
        tokens.extend(self._punctuations)
        tokens.extend(self._characters)
        tokens.append(self._blank)
        self._vocab = tokens
        forward = {}
        for position, token in enumerate(self.vocab):
            forward[token] = position
        self._char_to_id = forward
        self._id_to_char = dict(enumerate(self.vocab))

    @staticmethod
    def init_from_config(config):
        """Build a VitsCharacters from ``config.characters``; returns ``(characters, config)``."""
        char_cfg = config.characters
        pad_token = char_cfg.pad
        punctuation_set = char_cfg.punctuations
        letters = char_cfg.characters
        ipa_letters = char_cfg.phonemes
        instance = VitsCharacters(
            graphemes=letters,
            ipa_characters=ipa_letters,
            punctuations=punctuation_set,
            pad=pad_token,
        )
        return instance, config

    def to_config(self) -> "CharactersConfig":
        """Export the settings as a ``CharactersConfig`` (no eos/bos, sorted, not unique)."""
        settings = {
            "characters": self._characters,
            "punctuations": self._punctuations,
            "pad": self._pad,
            "eos": None,
            "bos": None,
            "blank": self._blank,
            "is_unique": False,
            "is_sorted": True,
        }
        return CharactersConfig(**settings)
688
+
689
class TTSTokenizer:
    """Turn raw text into the sequence of token IDs fed to the model.

    Args:
        text_cleaner (Callable): Function applied to the raw text before it is
            encoded. When ``None``, the text is encoded as given.
        characters (BaseCharacters): Character set providing ``char_to_id`` and
            the special-token IDs. May be ``None`` and assigned later.

    Attributes:
        pad_id: ID of the pad token, or ``None`` if there is no pad token.
        blank_id: ID of the blank token, or ``None`` if there is no blank token.
        not_found_characters: Characters seen by ``encode`` that were not in
            the vocabulary (each recorded once).
    """

    def __init__(
        self,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
    ):
        self.text_cleaner = text_cleaner
        self.characters = characters
        self.not_found_characters = []

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, new_characters):
        """Store the character set and cache its pad/blank IDs."""
        self._characters = new_characters
        if new_characters is None:
            # The constructor default is None; there is nothing to look up yet.
            self.pad_id = None
            self.blank_id = None
            return
        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None

    def encode(self, text: str) -> List[int]:
        """Encodes a string of text as a sequence of IDs.

        Characters missing from the vocabulary are dropped; the first time a
        given character is missing it is recorded and a warning is printed.
        """
        token_ids = []
        for char in text:
            try:
                idx = self.characters.char_to_id(char)
                token_ids.append(idx)
            except KeyError:
                # discard but store not found characters
                if char not in self.not_found_characters:
                    self.not_found_characters.append(char)
                    print(text)
                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
        return token_ids

    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
        """Clean ``text``, encode it, and intersperse the blank ID between tokens.

        ``language`` is accepted for interface compatibility and is not used.
        """
        if self.text_cleaner is not None:
            text = self.text_cleaner(text)
        text = self.encode(text)
        text = self.intersperse_blank_char(text, True)
        return text

    def pad_with_bos_eos(self, char_sequence: List[str]):
        """Pads a sequence with the special BOS and EOS characters."""
        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]

    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
        """Intersperses a filler ID between (and around) the items of a sequence.

        The filler is the ```blank``` ID when ``use_blank_char`` is true,
        otherwise the ```pad``` ID. A sequence of ``n`` items becomes
        ``2 * n + 1`` items.
        """
        # The filler must be an ID. This used to insert the pad *string*
        # (`characters.pad`) into the ID sequence when use_blank_char was false.
        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad_id
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    @staticmethod
    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
        """Build a tokenizer that uses ``multilingual_cleaners`` and ``VitsCharacters``.

        The ``characters`` argument is ignored: the character set is always
        rebuilt from ``config``.

        Returns:
            A ``(tokenizer, config)`` tuple; ``config.characters.characters_class``
            is set to ``VitsCharacters``.
        """
        text_cleaner = multilingual_cleaners
        CharactersClass = VitsCharacters
        characters, new_config = CharactersClass.init_from_config(config)
        # new_config.characters.characters_class = get_import_path(characters)
        new_config.characters.characters_class = VitsCharacters
        return TTSTokenizer(text_cleaner, characters), new_config
753
+
754
+
755
def multilingual_cleaners(text):
    """Pipeline for multilingual text"""
    # Applied innermost-first: lowercase, language-neutral symbol replacement,
    # auxiliary-symbol removal, whitespace collapsing.
    return collapse_whitespace(
        remove_aux_symbols(
            replace_symbols(lowercase(text), lang=None)
        )
    )
762
+
763
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
765
+
766
def collapse_whitespace(text):
    """Replace every match of ``_whitespace_re`` with one space and strip both ends."""
    collapsed = re.sub(_whitespace_re, " ", text)
    return collapsed.strip()
768
+
769
def replace_symbols(text, lang="en"):
    """Replace or drop symbols depending on the language tag.

    ``;`` and ``:`` become ``,``. ``-`` becomes a space, except for ``"ca"``
    where it is removed. ``&`` is spelled out for ``"en"``, ``"fr"``, ``"pt"``
    and ``"ca"`` and left untouched for any other tag (including ``None``).
    For ``"ca"`` apostrophes are removed as well.
    """
    is_catalan = lang == "ca"
    hyphen_substitute = "" if is_catalan else " "
    text = text.replace(";", ",")
    text = text.replace("-", hyphen_substitute)
    text = text.replace(":", ",")

    ampersand_words = (
        ("en", " and "),
        ("fr", " et "),
        ("pt", " e "),
        ("ca", " i "),
    )
    for tag, word in ampersand_words:
        if lang == tag:
            text = text.replace("&", word)
            break
    if is_catalan:
        text = text.replace("'", "")
    return text
784
+
785
def remove_aux_symbols(text):
    """Strip angle brackets, parentheses, square brackets and double quotes from ``text``."""
    aux_symbols = r"[\<\>\(\)\[\]\"]+"
    return re.sub(aux_symbols, "", text)
hne_female/jit_infer.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
3
+ import torch
4
+ import numpy as np
5
+
6
# ch female
# Character inventory, TorchScript model file and the sentence to synthesize.
letters = "खछगचऊुलशौढ़इणज़झैठढजफ़औ्ड़फूेानटॅयव़ऋदप.थअँऑआघहतषरसभउञडएईऐक़ िओ?धी,ॉंख़कोबमृ"
model = "ch_female_vits_30hrs.pt"
text = "पेरिविंकल के जड़, उपजी अउ पत्त्ता मन ह बिकट उपयोगी हे"

# Tokenizer configuration: graphemes only (no phoneme set).
character_config = CharactersConfig(
    characters_class=VitsCharacters,
    pad="<PAD>",
    eos="<EOS>",
    bos="<BOS>",
    blank="<BLNK>",
    characters=letters,
    punctuations="!¡'(),-.:;¿? ",
    phonemes=None,
)
config = VitsConfig(
    text_cleaner="multilingual_cleaners",
    characters=character_config,
)
tokenizer, config = TTSTokenizer.init_from_config(config)

# Text -> token IDs -> tensor with a leading dimension of size 1.
token_ids = tokenizer.text_to_ids(text)
x = torch.from_numpy(np.array(token_ids)).unsqueeze(0)

# Run the scripted model without tracking gradients.
net = torch.jit.load(model)
with torch.no_grad():
    out2 = net(x)

# Write the model output to disk; 22050 is passed to sf.write as the sample rate.
import soundfile as sf

waveform = out2.squeeze().cpu().numpy()
sf.write("jit.wav", waveform, 22050)
hne_male/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
hne_male/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ ---
hne_male/ch_male_vits_30hrs.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ef1fb370e1a8bc844aca171316a909461521ce6afa1371d15d5f8c765cda4d9
3
+ size 333250252