Harshil748 committed 12 days ago

Commit

9e5ede4

0 Parent(s):

Add all TTS models for 11 Indian languages

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
README.md +50 -0
bho_female/.gitattributes +35 -0
bho_female/README.md +3 -0
bho_female/checkpoint_340000.pth +3 -0
bho_female/config.json +257 -0
bho_male/.gitattributes +35 -0
bho_male/README.md +3 -0
bho_male/checkpoint_200000.pth +3 -0
bho_male/config.json +257 -0
bn_female/bn_female_vits_30hrs.pt +3 -0
bn_female/chars.txt +1 -0
bn_female/jit_infer.py +32 -0
bn_male/bn_male_vits_30hrs.pt +3 -0
bn_male/chars.txt +1 -0
bn_male/extra.py +787 -0
bn_male/jit_infer.py +32 -0
en_female/.gitattributes +35 -0
en_female/README.md +3 -0
en_female/chars.txt +1 -0
en_female/en_female_vits_30hrs.pt +3 -0
en_female/extra.py +787 -0
en_female/jit_infer.py +33 -0
en_male/.gitattributes +35 -0
en_male/README.md +3 -0
en_male/chars.txt +1 -0
en_male/en_male_vits_30hrs.pt +3 -0
en_male/extra.py +787 -0
en_male/jit_infer.py +32 -0
gu_mms/config.json +82 -0
gu_mms/special_tokens_map.json +4 -0
gu_mms/tokenizer_config.json +12 -0
gu_mms/vocab.json +62 -0
hi_female/chars.txt +1 -0
hi_female/extra.py +787 -0
hi_female/hi_female_vits_30hrs.pt +3 -0
hi_female/jit_infer.py +32 -0
hi_male/chars.txt +1 -0
hi_male/extra.py +787 -0
hi_male/hi_male_vits_30hrs.pt +3 -0
hi_male/jit_infer.py +32 -0
hne_female/.gitattributes +35 -0
hne_female/README.md +3 -0
hne_female/ch_female_vits_30hrs.pt +3 -0
hne_female/chars.txt +1 -0
hne_female/extra.py +787 -0
hne_female/jit_infer.py +31 -0
hne_male/.gitattributes +35 -0
hne_male/README.md +3 -0
hne_male/ch_male_vits_30hrs.pt +3 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.pt filter=lfs diff=lfs merge=lfs -text
2	+ *.pth filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,50 @@

+---
+license: mit
+tags:
+- tts
+- text-to-speech
+- indian-languages
+- vits
+language:
+- hi
+- bn
+- mr
+- te
+- kn
+- en
+- bho
+- mai
+- mag
+- hne
+- gu
+---
+# VoiceAPI Models
+TTS models for 11 Indian languages, 21 voices total.
+## Languages & Voices
+| Language | Code | Female | Male |
+|----------|------|--------|------|
+| Hindi | hi | ✅ | ✅ |
+| Bengali | bn | ✅ | ✅ |
+| Marathi | mr | ✅ | ✅ |
+| Telugu | te | ✅ | ✅ |
+| Kannada | kn | ✅ | ✅ |
+| English | en | ✅ | ✅ |
+| Bhojpuri | bho | ✅ | ✅ |
+| Maithili | mai | ✅ | ✅ |
+| Magahi | mag | ✅ | ✅ |
+| Chhattisgarhi | hne | ✅ | ✅ |
+| Gujarati | gu | MMS | - |
+## Model Types
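+- **JIT models** (`.pt`): TorchScript exports of the SYSPIN VITS models, loaded with `torch.jit.load` (used for most languages)
+- **Coqui models** (`.pth`): Coqui TTS VITS checkpoints with an accompanying `config.json` (Bhojpuri male and female)
+- **MMS**: Facebook MMS for Gujarati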
+## Usage
+These models are used by the [VoiceAPI](https://huggingface.co/spaces/Harshil748/VoiceAPI) TTS service.

bho_female/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

bho_female/README.md ADDED Viewed

	@@ -0,0 +1,3 @@

+---
+license: cc-by-4.0
+---

bho_female/checkpoint_340000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2182258024b05f739bf79002cb52cfa863605d54ee2eee5b4a5cd1fbaac797ab
+size 997764677

bho_female/config.json ADDED Viewed

	@@ -0,0 +1,257 @@

+{
+    "output_path": ".",
+    "logger_uri": null,
+    "run_name": "vits_Bhojpuri_Female_30hrs",
+    "project_name": null,
+    "run_description": "\ud83d\udc38Coqui trainer run.",
+    "print_step": 25,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "tensorboard",
+    "log_model_step": null,
+    "save_step": 20000,
+    "save_n_checkpoints": 1000,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": true,
+    "test_delay_epochs": -1,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": true,
+    "epochs": 1000,
+    "batch_size": 40,
+    "eval_batch_size": 16,
+    "grad_clip": [
+        1000,
+        1000
+    ],
+    "scheduler_after_epoch": true,
+    "lr": 0.001,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": null,
+    "lr_scheduler_params": {},
+    "use_grad_scaler": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "vits",
+    "num_loader_workers": 8,
+    "num_eval_loader_workers": 4,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "sample_rate": 22050,
+        "win_length": 1024,
+        "hop_length": 256,
+        "num_mels": 80,
+        "mel_fmin": 0,
+        "mel_fmax": null
+    },
+    "use_phonemes": false,
+    "phonemizer": null,
+    "phoneme_language": "en-us",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": "./phoneme_cache",
+    "characters": {
+        "characters_class": "TTS.tts.models.vits.VitsCharacters",
+        "vocab_dict": null,
+        "pad": "<PAD>",
+        "eos": "<EOS>",
+        "bos": "<BOS>",
+        "blank": "<BLNK>",
+        "characters": "\u091a.\u0947\u0910\u0925\u092e\u0959\u091d\u0906\u0949?\u092d\u092a \u0939\u0928\u093d\u091f\u0940\u0938\u0935\u091b\u0923\u0921\u091e\u0926\u094b\u0915\u0924\u0948\u0943\u095b\u0941\u095e\u092c\u0908\u094c\u0927\u090b\u093e\u0922\u0907\u093c\u0902\u0937\u0920\u0905\u095c\u0913\u092f,\u093f\u0930\u0914\u0901\u092b\u0909\u0916\u0911\u094d\u0932\u091c\u090f\u090a\u0917\u0936\u095d\u0919\u0918\u0942",
+        "punctuations": "!\u00a1'(),-.:;\u00bf? ",
+        "phonemes": null,
+        "is_unique": true,
+        "is_sorted": true
+    },
+    "add_blank": true,
+    "batch_group_size": 5,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": false,
+    "compute_energy": false,
+    "compute_linear_spec": true,
+    "precompute_num_workers": 0,
+    "start_by_longest": false,
+    "shuffle": false,
+    "drop_last": false,
+    "datasets": [
+        {
+            "formatter": "syspin",
+            "dataset_name": "",
+            "path": ".",
+            "meta_file_train": "../manifests/Bhojpuri_Female/30hrs.tsv",
+            "ignored_speakers": null,
+            "language": "",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        [
+            "\u090f\u0928\u094d\u091f\u094d\u0930\u093e\u092a\u0940 \u0915\u0902\u092a\u094d\u092f\u0942\u091f\u093f\u0902\u0917 \u092e\u0947\u0902 \u090f\u0928\u094d\u091f\u094d\u0930\u094b\u092a\u0940 \u090a \u0911\u092a\u0930\u0947\u091f\u093f\u0902\u0917 \u0938\u093f\u0938\u094d\u091f\u092e \u0939 \u091c\u0947 \u092a\u0947 \u0938\u0930\u093e \u0915\u094d\u0930\u093f\u092a\u094d\u091f\u094b\u0917\u094d\u0930\u093e\u092b\u093f\u0915 \u092b\u0902\u0915\u094d\u0936\u0928 \u0938\u092c \u0915\u093e\u092e \u0915\u0930\u0947 \u0932\u0947\u0902",
+            "Bhojpuri_Female",
+            null,
+            "bh"
+        ]
+    ],
+    "eval_split_max_size": null,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "model_args": {
+        "num_chars": 85,
+        "out_channels": 513,
+        "spec_segment_size": 32,
+        "hidden_channels": 192,
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 6,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": "1",
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "periods_multi_period_discriminator": [
+            2,
+            3,
+            5,
+            7,
+            11
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 1.0,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": false,
+        "num_speakers": 0,
+        "speakers_file": null,
+        "d_vector_file": null,
+        "speaker_embedding_channels": 256,
+        "use_d_vector_file": false,
+        "d_vector_dim": 0,
+        "detach_dp_input": true,
+        "use_language_embedding": false,
+        "embedded_language_dim": 4,
+        "num_languages": 0,
+        "language_ids_file": null,
+        "use_speaker_encoder_as_loss": false,
+        "speaker_encoder_config_path": "",
+        "speaker_encoder_model_path": "",
+        "condition_dp_on_speaker": true,
+        "freeze_encoder": false,
+        "freeze_DP": false,
+        "freeze_PE": false,
+        "freeze_flow_decoder": false,
+        "freeze_waveform_decoder": false,
+        "encoder_sample_rate": null,
+        "interpolate_z": true,
+        "reinit_DP": false,
+        "reinit_text_encoder": false
+    },
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "speaker_encoder_loss_alpha": 1.0,
+    "return_wav": true,
+    "use_weighted_sampler": false,
+    "weighted_sampler_attrs": {},
+    "weighted_sampler_multipliers": {},
+    "r": 1,
+    "num_speakers": 0,
+    "use_speaker_embedding": false,
+    "speakers_file": null,
+    "speaker_embedding_channels": 256,
+    "language_ids_file": null,
+    "use_language_embedding": false,
+    "use_d_vector_file": false,
+    "d_vector_file": null,
+    "d_vector_dim": 0,
+    "github_branch": "* dev"
+}

bho_male/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

bho_male/README.md ADDED Viewed

	@@ -0,0 +1,3 @@

+---
+license: cc-by-4.0
+---

bho_male/checkpoint_200000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c4fb6ce54092c79ab526d4e9bc70514d7ea7f820b0184ef99e6ad3a7b9b72abc
+size 997766981

bho_male/config.json ADDED Viewed

	@@ -0,0 +1,257 @@

+{
+    "output_path": ".",
+    "logger_uri": null,
+    "run_name": "vits_Bhojpuri_Male_30hrs",
+    "project_name": null,
+    "run_description": "\ud83d\udc38Coqui trainer run.",
+    "print_step": 25,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "tensorboard",
+    "log_model_step": null,
+    "save_step": 20000,
+    "save_n_checkpoints": 1000,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": true,
+    "test_delay_epochs": -1,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": true,
+    "epochs": 500,
+    "batch_size": 40,
+    "eval_batch_size": 16,
+    "grad_clip": [
+        1000,
+        1000
+    ],
+    "scheduler_after_epoch": true,
+    "lr": 0.001,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": null,
+    "lr_scheduler_params": {},
+    "use_grad_scaler": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "vits",
+    "num_loader_workers": 8,
+    "num_eval_loader_workers": 4,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "sample_rate": 22050,
+        "win_length": 1024,
+        "hop_length": 256,
+        "num_mels": 80,
+        "mel_fmin": 0,
+        "mel_fmax": null
+    },
+    "use_phonemes": false,
+    "phonemizer": null,
+    "phoneme_language": "en-us",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": "./phoneme_cache",
+    "characters": {
+        "characters_class": "TTS.tts.models.vits.VitsCharacters",
+        "vocab_dict": null,
+        "pad": "<PAD>",
+        "eos": "<EOS>",
+        "bos": "<BOS>",
+        "blank": "<BLNK>",
+        "characters": "\u091a.\u0947\u0910\u0925\u092e\u0959\u091d\u0906\u0949?\u092d \u092a\u0939\u0928\u093d\u091f\u0938\u0935\u0940\u091b\u0923\u0921\u091e\u0926\u094b\u0915\u0924\u0948\u0943\u095b\u0941\u095e\u092c\u0908\u0946\u094c\u0927\u090b\u093e\u0922\u0907\u093c\u0902\u0905\u0937\u0920\u095c\u0913\u092f,\u093f\u0930\u0901\u0914\u092b\u0909\u0916\u0911\u094d\u0932\u091c\u090f\u090a\u0917\u0936\u095d\u0919\u0918\u0942",
+        "punctuations": "!\u00a1'(),-.:;\u00bf? ",
+        "phonemes": null,
+        "is_unique": true,
+        "is_sorted": true
+    },
+    "add_blank": true,
+    "batch_group_size": 5,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": false,
+    "compute_energy": false,
+    "compute_linear_spec": true,
+    "precompute_num_workers": 0,
+    "start_by_longest": false,
+    "shuffle": false,
+    "drop_last": false,
+    "datasets": [
+        {
+            "formatter": "syspin",
+            "dataset_name": "",
+            "path": ".",
+            "meta_file_train": "../manifests/Bhojpuri_Male/30hrs.tsv",
+            "ignored_speakers": null,
+            "language": "",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        [
+            "\u090f\u0928\u094d\u091f\u094d\u0930\u093e\u092a\u0940 \u0915\u0902\u092a\u094d\u092f\u0942\u091f\u093f\u0902\u0917 \u092e\u0947\u0902 \u090f\u0928\u094d\u091f\u094d\u0930\u094b\u092a\u0940 \u090a \u0911\u092a\u0930\u0947\u091f\u093f\u0902\u0917 \u0938\u093f\u0938\u094d\u091f\u092e \u0939 \u091c\u0947 \u092a\u0947 \u0938\u0930\u093e \u0915\u094d\u0930\u093f\u092a\u094d\u091f\u094b\u0917\u094d\u0930\u093e\u092b\u093f\u0915 \u092b\u0902\u0915\u094d\u0936\u0928 \u0938\u092c \u0915\u093e\u092e \u0915\u0930\u0947 \u0932\u0947\u0902",
+            "Bhojpuri_Male",
+            null,
+            "bh"
+        ]
+    ],
+    "eval_split_max_size": null,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "model_args": {
+        "num_chars": 86,
+        "out_channels": 513,
+        "spec_segment_size": 32,
+        "hidden_channels": 192,
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 6,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": "1",
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "periods_multi_period_discriminator": [
+            2,
+            3,
+            5,
+            7,
+            11
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 1.0,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": false,
+        "num_speakers": 0,
+        "speakers_file": null,
+        "d_vector_file": null,
+        "speaker_embedding_channels": 256,
+        "use_d_vector_file": false,
+        "d_vector_dim": 0,
+        "detach_dp_input": true,
+        "use_language_embedding": false,
+        "embedded_language_dim": 4,
+        "num_languages": 0,
+        "language_ids_file": null,
+        "use_speaker_encoder_as_loss": false,
+        "speaker_encoder_config_path": "",
+        "speaker_encoder_model_path": "",
+        "condition_dp_on_speaker": true,
+        "freeze_encoder": false,
+        "freeze_DP": false,
+        "freeze_PE": false,
+        "freeze_flow_decoder": false,
+        "freeze_waveform_decoder": false,
+        "encoder_sample_rate": null,
+        "interpolate_z": true,
+        "reinit_DP": false,
+        "reinit_text_encoder": false
+    },
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "speaker_encoder_loss_alpha": 1.0,
+    "return_wav": true,
+    "use_weighted_sampler": false,
+    "weighted_sampler_attrs": {},
+    "weighted_sampler_multipliers": {},
+    "r": 1,
+    "num_speakers": 0,
+    "use_speaker_embedding": false,
+    "speakers_file": null,
+    "speaker_embedding_channels": 256,
+    "language_ids_file": null,
+    "use_language_embedding": false,
+    "use_d_vector_file": false,
+    "d_vector_file": null,
+    "d_vector_dim": 0,
+    "github_branch": "* dev"
+}

bn_female/bn_female_vits_30hrs.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53208e056050bb485df9192a0d444d3fa72eefe15b2c04840e9a500e4ac1bbf4
+size 333255366

bn_female/chars.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ূঞংঘঔদলৌআডখরথটোৗঙঐানষঝবছঅঢ়ঁপউধঢশগয়।?িক,যঈস্ত়ফঋৈজ'ীঠৰণওৎঃমচঊড়ইুভে এ"ৃহ

bn_female/jit_infer.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import os
+from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
+import torch
+import numpy as np
+#ch female
+with open("chars.txt", 'r') as f:
+    letters = f.read().strip('\n')
+model="bn_female_vits_30hrs.pt"
+text = " হলেও আমাদের সবার সার্বিক শৃঙ্খলা বোধের উন্নতি হবে"
+config = VitsConfig(
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class=VitsCharacters,
+        pad="<PAD>",
+        eos="<EOS>",
+        bos="<BOS>",
+        blank="<BLNK>",
+        characters=letters,
+        punctuations="!¡'(),-.:;¿? ",
+        phonemes=None)
+    )
+tokenizer, config = TTSTokenizer.init_from_config(config)
+x = tokenizer.text_to_ids(text)
+x = torch.from_numpy(np.array(x)).unsqueeze(0)
+net = torch.jit.load(model)
+with torch.no_grad():
+    out2 = net(x)
+import soundfile as sf
+sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)

bn_male/bn_male_vits_30hrs.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9d8d52f0bc33ef01d733eef36fb00f1e17192b8c86123a0ccf84a24dbb80d0e
+size 333249868

bn_male/chars.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ূঞংঘঔদলৌআডখরঃটোৗঙঐনাঝষবঅছঢ়ঁপউধঢশগয়।?িক,যঈসত্ৈফ়ঊজ'ীঠৎণওঋৰমচড়ভুইে থএ"ৃহ

bn_male/extra.py ADDED Viewed

	@@ -0,0 +1,787 @@

+from typing import Callable, Dict, List, Union
+from dataclasses import asdict, dataclass, field
+import re
+from dataclasses import replace
+from typing import Dict
+_whitespace_re = re.compile(r"\s+")
+from dataclasses import dataclass, field
+from typing import List
+# from TTS.tts.configs.shared_configs import BaseTTSConfig
+# from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
+@dataclass
+class CharactersConfig():
+    """Settings describing the symbol inventory handed to the tokenizer.
+
+    Every field defaults to ``None`` except the two boolean flags, so callers
+    only set what they need.
+    """
+    # NOTE(review): annotated as ``str``, but the jit_infer.py script in this
+    # repository passes the VitsCharacters class object here - confirm which
+    # form the consumer expects.
+    characters_class: str = None
+    # used when the vocabulary is built by BaseVocabulary
+    vocab_dict: Dict = None
+    # used when the vocabulary is built by BaseCharacters
+    pad: str = None
+    eos: str = None
+    bos: str = None
+    blank: str = None
+    characters: str = None
+    punctuations: str = None
+    phonemes: str = None
+    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
+    is_sorted: bool = True
+@dataclass
+class BaseTTSConfig():
+    """Shared TTS settings; ``VitsConfig`` in this file subclasses it.
+
+    The fields are plain dataclass attributes with defaults; nothing here is
+    validated or coerced.
+    """
+    # The audio field is disabled here; VitsConfig declares its own ``audio``.
+    # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
+    # phoneme settings
+    use_phonemes: bool = False
+    phonemizer: str = None
+    phoneme_language: str = None
+    compute_input_seq_cache: bool = False
+    text_cleaner: str = None
+    enable_eos_bos_chars: bool = False
+    test_sentences_file: str = ""
+    phoneme_cache_path: str = None
+    # vocabulary parameters
+    characters: CharactersConfig = None
+    add_blank: bool = False
+    # training params
+    batch_group_size: int = 0
+    loss_masking: bool = None
+    # dataloading
+    min_audio_len: int = 1
+    # annotated ``int`` but the default is the float infinity (no upper bound)
+    max_audio_len: int = float("inf")
+    min_text_len: int = 1
+    # annotated ``int`` but the default is the float infinity (no upper bound)
+    max_text_len: int = float("inf")
+    compute_f0: bool = False
+    compute_energy: bool = False
+    compute_linear_spec: bool = False
+    precompute_num_workers: int = 0
+    use_noise_augment: bool = False
+    start_by_longest: bool = False
+    shuffle: bool = False
+    drop_last: bool = False
+    # dataset
+    # NOTE(review): annotated ``str``; the bundled config.json files store a
+    # list of dataset dicts under this key - confirm the intended type.
+    datasets: str = None
+    # optimizer
+    optimizer: str = "radam"
+    optimizer_params: dict = None
+    # scheduler
+    lr_scheduler: str = None
+    lr_scheduler_params: dict = field(default_factory=lambda: {})
+    # testing
+    test_sentences: List[str] = field(default_factory=lambda: [])
+    # evaluation
+    eval_split_max_size: int = None
+    eval_split_size: float = 0.01
+    # weighted samplers
+    use_speaker_weighted_sampler: bool = False
+    speaker_weighted_sampler_alpha: float = 1.0
+    use_language_weighted_sampler: bool = False
+    language_weighted_sampler_alpha: float = 1.0
+    use_length_weighted_sampler: bool = False
+    length_weighted_sampler_alpha: float = 1.0
+@dataclass
+class VitsAudioConfig():
+    """Audio front-end parameters for VITS.
+
+    ``VitsConfig.audio`` is built from this class via ``default_factory``.
+    """
+    fft_size: int = 1024
+    # the jit_infer.py script in this repository writes its output wav at this same rate
+    sample_rate: int = 22050
+    win_length: int = 1024
+    hop_length: int = 256
+    num_mels: int = 80
+    mel_fmin: int = 0
+    # None by default; presumably means "no upper mel frequency limit" - confirm against the consumer
+    mel_fmax: int = None
+@dataclass
+class VitsArgs():
+    """Model hyper-parameters for VITS.
+
+    ``VitsConfig.model_args`` is built from this class via ``default_factory``.
+    List-valued fields use ``default_factory`` so each instance gets its own
+    list. The comments below group the fields by the component named in their
+    suffix.
+    """
+    # size of the symbol table; the bundled bho_* config.json files set 85 / 86
+    num_chars: int = 100
+    out_channels: int = 513
+    spec_segment_size: int = 32
+    hidden_channels: int = 192
+    # text encoder
+    hidden_channels_ffn_text_encoder: int = 768
+    num_heads_text_encoder: int = 2
+    num_layers_text_encoder: int = 6
+    kernel_size_text_encoder: int = 3
+    dropout_p_text_encoder: float = 0.1
+    # duration predictor
+    dropout_p_duration_predictor: float = 0.5
+    # posterior encoder
+    kernel_size_posterior_encoder: int = 5
+    dilation_rate_posterior_encoder: int = 1
+    num_layers_posterior_encoder: int = 16
+    # flow
+    kernel_size_flow: int = 5
+    dilation_rate_flow: int = 1
+    num_layers_flow: int = 4
+    # waveform decoder
+    resblock_type_decoder: str = "1"
+    resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
+    resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
+    upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
+    upsample_initial_channel_decoder: int = 512
+    upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
+    # discriminator
+    periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
+    # sampling / inference knobs
+    use_sdp: bool = True
+    noise_scale: float = 1.0
+    inference_noise_scale: float = 0.667
+    length_scale: float = 1
+    noise_scale_dp: float = 1.0
+    inference_noise_scale_dp: float = 1.0
+    max_inference_len: int = None
+    init_discriminator: bool = True
+    # the misspelling ("disriminator") is part of the field name and also the
+    # key used in the bundled config.json files; do not rename it here
+    use_spectral_norm_disriminator: bool = False
+    # speaker / d-vector conditioning
+    use_speaker_embedding: bool = False
+    num_speakers: int = 0
+    speakers_file: str = None
+    d_vector_file: List[str] = None
+    speaker_embedding_channels: int = 256
+    use_d_vector_file: bool = False
+    d_vector_dim: int = 0
+    detach_dp_input: bool = True
+    # language conditioning
+    use_language_embedding: bool = False
+    embedded_language_dim: int = 4
+    num_languages: int = 0
+    language_ids_file: str = None
+    # speaker-encoder loss
+    use_speaker_encoder_as_loss: bool = False
+    speaker_encoder_config_path: str = ""
+    speaker_encoder_model_path: str = ""
+    condition_dp_on_speaker: bool = True
+    # freezing / re-initialisation switches
+    freeze_encoder: bool = False
+    freeze_DP: bool = False
+    freeze_PE: bool = False
+    freeze_flow_decoder: bool = False
+    freeze_waveform_decoder: bool = False
+    encoder_sample_rate: int = None
+    interpolate_z: bool = True
+    reinit_DP: bool = False
+    reinit_text_encoder: bool = False
+@dataclass
+class VitsConfig(BaseTTSConfig):
+    """Top-level VITS configuration: ``BaseTTSConfig`` plus VITS-specific fields.
+
+    Several inherited defaults are overridden here (``optimizer``,
+    ``optimizer_params``, ``compute_linear_spec``, ``add_blank``,
+    ``test_sentences``).
+    """
+    model: str = "vits"
+    # model specific params
+    model_args: VitsArgs = field(default_factory=VitsArgs)
+    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)
+    # optimizer
+    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
+    lr_gen: float = 0.0002
+    lr_disc: float = 0.0002
+    lr_scheduler_gen: str = "ExponentialLR"
+    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    lr_scheduler_disc: str = "ExponentialLR"
+    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    scheduler_after_epoch: bool = True
+    optimizer: str = "AdamW"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
+    # loss params
+    kl_loss_alpha: float = 1.0
+    disc_loss_alpha: float = 1.0
+    gen_loss_alpha: float = 1.0
+    feat_loss_alpha: float = 1.0
+    mel_loss_alpha: float = 45.0
+    dur_loss_alpha: float = 1.0
+    speaker_encoder_loss_alpha: float = 1.0
+    # data loader params
+    return_wav: bool = True
+    compute_linear_spec: bool = True
+    # sampler params
+    use_weighted_sampler: bool = False  # TODO: move it to the base config
+    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
+    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
+    # overrides
+    r: int = 1  # DO NOT CHANGE
+    add_blank: bool = True
+    # testing
+    test_sentences: List[List] = field(
+        default_factory=lambda: [
+            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
+            ["Be a voice, not an echo."],
+            ["I'm sorry Dave. I'm afraid I can't do that."],
+            ["This cake is great. It's so delicious and moist."],
+            ["Prior to November 22, 1963."],
+        ]
+    )
+    # multi-speaker settings
+    # use speaker embedding layer
+    num_speakers: int = 0
+    use_speaker_embedding: bool = False
+    speakers_file: str = None
+    speaker_embedding_channels: int = 256
+    language_ids_file: str = None
+    use_language_embedding: bool = False
+    # use d-vectors
+    use_d_vector_file: bool = False
+    d_vector_file: List[str] = None
+    # NOTE(review): defaults to None here, whereas VitsArgs.d_vector_dim defaults to 0
+    d_vector_dim: int = None
+    def __post_init__(self):
+        # Intentionally a no-op: the loop that would copy ``model_args`` values
+        # onto same-named attributes of this config is disabled below.
+        pass
+        # for key, val in self.model_args.items():
+        #     if hasattr(self, key):
+        #         self[key] = val
+def parse_symbols():
+    return {
+        "pad": _pad,
+        "eos": _eos,
+        "bos": _bos,
+        "characters": _characters,
+        "punctuations": _punctuations,
+        "phonemes": _phonemes,
+    }
+# DEFAULT SET OF GRAPHEMES
+_pad = "<PAD>"
+_eos = "<EOS>"
+_bos = "<BOS>"
+_blank = "<BLNK>"  # TODO: check if we need this alongside with PAD
+_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+_punctuations = "!'(),-.:;? "
+# DEFAULT SET OF IPA PHONEMES
+# Phonemes definition (All IPA characters)
+_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
+_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
+_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
+_suprasegmentals = "ˈˌːˑ"
+_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
+_diacrilics = "ɚ˞ɫ"
+_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
+class BaseVocabulary:
+    """Base Vocabulary class.
+    This class only needs a vocabulary dictionary without specifying the characters.
+    Args:
+        vocab (Dict): A dictionary of characters and their corresponding indices.
+    """
+    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
+        self.vocab = vocab
+        self.pad = pad
+        self.blank = blank
+        self.bos = bos
+        self.eos = eos
+    @property
+    def pad_id(self) -> int:
+        """Return the index of the padding character. If the padding character is not specified, return the length
+        of the vocabulary."""
+        return self.char_to_id(self.pad) if self.pad else len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        """Return the index of the blank character. If the blank character is not specified, return the length of
+        the vocabulary."""
+        return self.char_to_id(self.blank) if self.blank else len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        """Return the index of the bos character. If the bos character is not specified, return the length of the
+        vocabulary."""
+        return self.char_to_id(self.bos) if self.bos else len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        """Return the index of the eos character. If the eos character is not specified, return the length of the
+        vocabulary."""
+        return self.char_to_id(self.eos) if self.eos else len(self.vocab)
+    @property
+    def vocab(self):
+        """Return the vocabulary dictionary."""
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        """Set the vocabulary dictionary and character mapping dictionaries."""
+        self._vocab, self._char_to_id, self._id_to_char = None, None, None
+        if vocab is not None:
+            self._vocab = vocab
+            self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
+            self._id_to_char = {
+                idx: char for idx, char in enumerate(self._vocab)  # pylint: disable=unnecessary-comprehension
+            }
+    @staticmethod
+    def init_from_config(config, **kwargs):
+        """Initialize from the given config."""
+        if config.characters is not None and "vocab_dict" in config.characters and config.characters.vocab_dict:
+            return (
+                BaseVocabulary(
+                    config.characters.vocab_dict,
+                    config.characters.pad,
+                    config.characters.blank,
+                    config.characters.bos,
+                    config.characters.eos,
+                ),
+                config,
+            )
+        return BaseVocabulary(**kwargs), config
+    def to_config(self):
+        return CharactersConfig(
+            vocab_dict=self._vocab,
+            pad=self.pad,
+            eos=self.eos,
+            bos=self.bos,
+            blank=self.blank,
+            is_unique=False,
+            is_sorted=False,
+        )
+    @property
+    def num_chars(self):
+        """Return number of tokens in the vocabulary."""
+        return len(self._vocab)
+    def char_to_id(self, char: str) -> int:
+        """Map a character to an token ID."""
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
+    def id_to_char(self, idx: int) -> str:
+        """Map an token ID to a character."""
+        return self._id_to_char[idx]
+class BaseCharacters:
+    def __init__(
+        self,
+        characters: str = None,
+        punctuations: str = None,
+        pad: str = None,
+        eos: str = None,
+        bos: str = None,
+        blank: str = None,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        self._characters = characters
+        self._punctuations = punctuations
+        self._pad = pad
+        self._eos = eos
+        self._bos = bos
+        self._blank = blank
+        self.is_unique = is_unique
+        self.is_sorted = is_sorted
+        self._create_vocab()
+    @property
+    def pad_id(self) -> int:
+        return self.char_to_id(self.pad) if self.pad else len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        return self.char_to_id(self.blank) if self.blank else len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        return self.char_to_id(self.eos) if self.eos else len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        return self.char_to_id(self.bos) if self.bos else len(self.vocab)
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, characters):
+        self._characters = characters
+        self._create_vocab()
+    @property
+    def punctuations(self):
+        return self._punctuations
+    @punctuations.setter
+    def punctuations(self, punctuations):
+        self._punctuations = punctuations
+        self._create_vocab()
+    @property
+    def pad(self):
+        return self._pad
+    @pad.setter
+    def pad(self, pad):
+        self._pad = pad
+        self._create_vocab()
+    @property
+    def eos(self):
+        return self._eos
+    @eos.setter
+    def eos(self, eos):
+        self._eos = eos
+        self._create_vocab()
+    @property
+    def bos(self):
+        return self._bos
+    @bos.setter
+    def bos(self, bos):
+        self._bos = bos
+        self._create_vocab()
+    @property
+    def blank(self):
+        return self._blank
+    @blank.setter
+    def blank(self, blank):
+        self._blank = blank
+        self._create_vocab()
+    @property
+    def vocab(self):
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        self._vocab = vocab
+        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
+        self._id_to_char = {
+            idx: char for idx, char in enumerate(self.vocab)  # pylint: disable=unnecessary-comprehension
+        }
+    @property
+    def num_chars(self):
+        return len(self._vocab)
+    def _create_vocab(self):
+        _vocab = self._characters
+        if self.is_unique:
+            _vocab = list(set(_vocab))
+        if self.is_sorted:
+            _vocab = sorted(_vocab)
+        _vocab = list(_vocab)
+        _vocab = [self._blank] + _vocab if self._blank is not None and len(self._blank) > 0 else _vocab
+        _vocab = [self._bos] + _vocab if self._bos is not None and len(self._bos) > 0 else _vocab
+        _vocab = [self._eos] + _vocab if self._eos is not None and len(self._eos) > 0 else _vocab
+        _vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
+        self.vocab = _vocab + list(self._punctuations)
+        if self.is_unique:
+            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
+            assert (
+                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
+            ), f" [!] There are duplicate characters in the character set. {duplicates}"
+    def char_to_id(self, char: str) -> int:
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
+    def id_to_char(self, idx: int) -> str:
+        return self._id_to_char[idx]
+    def print_log(self, level: int = 0):
+        """
+        Prints the vocabulary in a nice format.
+        """
+        indent = "\t" * level
+        print(f"{indent}| > Characters: {self._characters}")
+        print(f"{indent}| > Punctuations: {self._punctuations}")
+        print(f"{indent}| > Pad: {self._pad}")
+        print(f"{indent}| > EOS: {self._eos}")
+        print(f"{indent}| > BOS: {self._bos}")
+        print(f"{indent}| > Blank: {self._blank}")
+        print(f"{indent}| > Vocab: {self.vocab}")
+        print(f"{indent}| > Num chars: {self.num_chars}")
+    @staticmethod
+    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
+        """Init your character class from a config.
+        Implement this method for your subclass.
+        """
+        # use character set from config
+        if config.characters is not None:
+            return BaseCharacters(**config.characters), config
+        # return default character set
+        characters = BaseCharacters()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=self._eos,
+            bos=self._bos,
+            blank=self._blank,
+            is_unique=self.is_unique,
+            is_sorted=self.is_sorted,
+        )
+class IPAPhonemes(BaseCharacters):
+    def __init__(
+        self,
+        characters: str = _phonemes,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Init a IPAPhonemes object from a model config
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        # band-aid for compatibility with old models
+        if "characters" in config and config.characters is not None:
+            if "phonemes" in config.characters and config.characters.phonemes is not None:
+                config.characters["characters"] = config.characters["phonemes"]
+            return (
+                IPAPhonemes(
+                    characters=config.characters["characters"],
+                    punctuations=config.characters["punctuations"],
+                    pad=config.characters["pad"],
+                    eos=config.characters["eos"],
+                    bos=config.characters["bos"],
+                    blank=config.characters["blank"],
+                    is_unique=config.characters["is_unique"],
+                    is_sorted=config.characters["is_sorted"],
+                ),
+                config,
+            )
+        # use character set from config
+        if config.characters is not None:
+            return IPAPhonemes(**config.characters), config
+        # return default character set
+        characters = IPAPhonemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+class Graphemes(BaseCharacters):
+    def __init__(
+        self,
+        characters: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Init a Graphemes object from a model config
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        if config.characters is not None:
+            # band-aid for compatibility with old models
+            if "phonemes" in config.characters:
+                return (
+                    Graphemes(
+                        characters=config.characters["characters"],
+                        punctuations=config.characters["punctuations"],
+                        pad=config.characters["pad"],
+                        eos=config.characters["eos"],
+                        bos=config.characters["bos"],
+                        blank=config.characters["blank"],
+                        is_unique=config.characters["is_unique"],
+                        is_sorted=config.characters["is_sorted"],
+                    ),
+                    config,
+                )
+            return Graphemes(**config.characters), config
+        characters = Graphemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+if __name__ == "__main__":
+    gr = Graphemes()
+    ph = IPAPhonemes()
+    gr.print_log()
+    ph.print_log()
+class VitsCharacters(BaseCharacters):
+    """Characters class for VITs model for compatibility with pre-trained models"""
+    def __init__(
+        self,
+        graphemes: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        ipa_characters: str = _phonemes,
+    ) -> None:
+        if ipa_characters is not None:
+            graphemes += ipa_characters
+        super().__init__(graphemes, punctuations, pad, None, None, "<BLNK>", is_unique=False, is_sorted=True)
+    def _create_vocab(self):
+        self._vocab = [self._pad] + list(self._punctuations) + list(self._characters) + [self._blank]
+        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
+        # pylint: disable=unnecessary-comprehension
+        self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
+    @staticmethod
+    def init_from_config(config):
+        _pad = config.characters.pad
+        _punctuations = config.characters.punctuations
+        _letters = config.characters.characters
+        _letters_ipa = config.characters.phonemes
+        return (
+            VitsCharacters(graphemes=_letters, ipa_characters=_letters_ipa, punctuations=_punctuations, pad=_pad),
+            config,
+        )
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=None,
+            bos=None,
+            blank=self._blank,
+            is_unique=False,
+            is_sorted=True,
+        )
+class TTSTokenizer:
+    def __init__(
+        self,
+        text_cleaner: Callable = None,
+        characters: "BaseCharacters" = None,
+    ):
+        self.text_cleaner = text_cleaner
+        self.characters = characters
+        self.not_found_characters = []
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, new_characters):
+        self._characters = new_characters
+        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
+        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None
+    def encode(self, text: str) -> List[int]:
+        """Encodes a string of text as a sequence of IDs."""
+        token_ids = []
+        for char in text:
+            try:
+                idx = self.characters.char_to_id(char)
+                token_ids.append(idx)
+            except KeyError:
+                # discard but store not found characters
+                if char not in self.not_found_characters:
+                    self.not_found_characters.append(char)
+                    print(text)
+                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
+        return token_ids
+    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
+        text = self.text_cleaner(text)
+        text = self.encode(text)
+        text = self.intersperse_blank_char(text, True)
+        return text
+    def pad_with_bos_eos(self, char_sequence: List[str]):
+        """Pads a sequence with the special BOS and EOS characters."""
+        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]
+    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
+        """Intersperses the blank character between characters in a sequence.
+        Use the ```blank``` character if defined else use the ```pad``` character.
+        """
+        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad
+        result = [char_to_use] * (len(char_sequence) * 2 + 1)
+        result[1::2] = char_sequence
+        return result
+    @staticmethod
+    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
+        text_cleaner = multilingual_cleaners
+        CharactersClass = VitsCharacters
+        characters, new_config = CharactersClass.init_from_config(config)
+        # new_config.characters.characters_class = get_import_path(characters)
+        new_config.characters.characters_class = VitsCharacters
+        return (
+            TTSTokenizer(text_cleaner, characters),new_config)
+def multilingual_cleaners(text):
+    """Pipeline for multilingual text"""
+    text = lowercase(text)
+    text = replace_symbols(text, lang=None)
+    text = remove_aux_symbols(text)
+    text = collapse_whitespace(text)
+    return text
+def lowercase(text):
+    return text.lower()
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text).strip()
+def replace_symbols(text, lang="en"):
+    text = text.replace(";", ",")
+    text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
+    text = text.replace(":", ",")
+    if lang == "en":
+        text = text.replace("&", " and ")
+    elif lang == "fr":
+        text = text.replace("&", " et ")
+    elif lang == "pt":
+        text = text.replace("&", " e ")
+    elif lang == "ca":
+        text = text.replace("&", " i ")
+        text = text.replace("'", "")
+    return text
+def remove_aux_symbols(text):
+    text = re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
+    return text

bn_male/jit_infer.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import os
+from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
+import torch
+import numpy as np
+#ch female
+with open("chars.txt", 'r') as f:
+    letters = f.read().strip('\n')
+model="bn_male_vits_30hrs.pt"
+text = " হলেও আমাদের সবার সার্বিক শৃঙ্খলা বোধের উন্নতি হবে"
+config = VitsConfig(
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class=VitsCharacters,
+        pad="<PAD>",
+        eos="<EOS>",
+        bos="<BOS>",
+        blank="<BLNK>",
+        characters=letters,
+        punctuations="!¡'(),-.:;¿? ",
+        phonemes=None)
+    )
+tokenizer, config = TTSTokenizer.init_from_config(config)
+x = tokenizer.text_to_ids(text)
+x = torch.from_numpy(np.array(x)).unsqueeze(0)
+net = torch.jit.load(model)
+with torch.no_grad():
+    out2 = net(x)
+import soundfile as sf
+sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)

en_female/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

en_female/README.md ADDED Viewed

	@@ -0,0 +1,3 @@

+---
+license: cc-by-4.0
+---

en_female/chars.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ pqw'"sgufmxre?d!lcab,zk.iytoh jvn

en_female/en_female_vits_30hrs.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9dfa80f08da6ca7222a16cb6d919251fb733d3f03042848a20201fa6ae0d0b9c
+size 333229574

en_female/extra.py ADDED Viewed

	@@ -0,0 +1,787 @@

+from typing import Callable, Dict, List, Union
+from dataclasses import asdict, dataclass, field
+import re
+from dataclasses import replace
+from typing import Dict
+# Pre-compiled pattern that matches any run of one or more whitespace characters.
+_whitespace_re = re.compile(r"\s+")
+from dataclasses import dataclass, field
+from typing import List
+# from TTS.tts.configs.shared_configs import BaseTTSConfig
+# from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
+@dataclass
+class CharactersConfig():
+    characters_class: str = None
+    # using BaseVocabulary
+    vocab_dict: Dict = None
+    # using on BaseCharacters
+    pad: str = None
+    eos: str = None
+    bos: str = None
+    blank: str = None
+    characters: str = None
+    punctuations: str = None
+    phonemes: str = None
+    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
+    is_sorted: bool = True
+@dataclass
+class BaseTTSConfig():
+    # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
+    # phoneme settings
+    use_phonemes: bool = False
+    phonemizer: str = None
+    phoneme_language: str = None
+    compute_input_seq_cache: bool = False
+    text_cleaner: str = None
+    enable_eos_bos_chars: bool = False
+    test_sentences_file: str = ""
+    phoneme_cache_path: str = None
+    # vocabulary parameters
+    characters: CharactersConfig = None
+    add_blank: bool = False
+    # training params
+    batch_group_size: int = 0
+    loss_masking: bool = None
+    # dataloading
+    min_audio_len: int = 1
+    max_audio_len: int = float("inf")
+    min_text_len: int = 1
+    max_text_len: int = float("inf")
+    compute_f0: bool = False
+    compute_energy: bool = False
+    compute_linear_spec: bool = False
+    precompute_num_workers: int = 0
+    use_noise_augment: bool = False
+    start_by_longest: bool = False
+    shuffle: bool = False
+    drop_last: bool = False
+    # dataset
+    datasets: str = None
+    # optimizer
+    optimizer: str = "radam"
+    optimizer_params: dict = None
+    # scheduler
+    lr_scheduler: str = None
+    lr_scheduler_params: dict = field(default_factory=lambda: {})
+    # testing
+    test_sentences: List[str] = field(default_factory=lambda: [])
+    # evaluation
+    eval_split_max_size: int = None
+    eval_split_size: float = 0.01
+    # weighted samplers
+    use_speaker_weighted_sampler: bool = False
+    speaker_weighted_sampler_alpha: float = 1.0
+    use_language_weighted_sampler: bool = False
+    language_weighted_sampler_alpha: float = 1.0
+    use_length_weighted_sampler: bool = False
+    length_weighted_sampler_alpha: float = 1.0
+@dataclass
+class VitsAudioConfig():
+    fft_size: int = 1024
+    sample_rate: int = 22050
+    win_length: int = 1024
+    hop_length: int = 256
+    num_mels: int = 80
+    mel_fmin: int = 0
+    mel_fmax: int = None
+@dataclass
+class VitsArgs():
+    num_chars: int = 100
+    out_channels: int = 513
+    spec_segment_size: int = 32
+    hidden_channels: int = 192
+    hidden_channels_ffn_text_encoder: int = 768
+    num_heads_text_encoder: int = 2
+    num_layers_text_encoder: int = 6
+    kernel_size_text_encoder: int = 3
+    dropout_p_text_encoder: float = 0.1
+    dropout_p_duration_predictor: float = 0.5
+    kernel_size_posterior_encoder: int = 5
+    dilation_rate_posterior_encoder: int = 1
+    num_layers_posterior_encoder: int = 16
+    kernel_size_flow: int = 5
+    dilation_rate_flow: int = 1
+    num_layers_flow: int = 4
+    resblock_type_decoder: str = "1"
+    resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
+    resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
+    upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
+    upsample_initial_channel_decoder: int = 512
+    upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
+    periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
+    use_sdp: bool = True
+    noise_scale: float = 1.0
+    inference_noise_scale: float = 0.667
+    length_scale: float = 1
+    noise_scale_dp: float = 1.0
+    inference_noise_scale_dp: float = 1.0
+    max_inference_len: int = None
+    init_discriminator: bool = True
+    use_spectral_norm_disriminator: bool = False
+    use_speaker_embedding: bool = False
+    num_speakers: int = 0
+    speakers_file: str = None
+    d_vector_file: List[str] = None
+    speaker_embedding_channels: int = 256
+    use_d_vector_file: bool = False
+    d_vector_dim: int = 0
+    detach_dp_input: bool = True
+    use_language_embedding: bool = False
+    embedded_language_dim: int = 4
+    num_languages: int = 0
+    language_ids_file: str = None
+    use_speaker_encoder_as_loss: bool = False
+    speaker_encoder_config_path: str = ""
+    speaker_encoder_model_path: str = ""
+    condition_dp_on_speaker: bool = True
+    freeze_encoder: bool = False
+    freeze_DP: bool = False
+    freeze_PE: bool = False
+    freeze_flow_decoder: bool = False
+    freeze_waveform_decoder: bool = False
+    encoder_sample_rate: int = None
+    interpolate_z: bool = True
+    reinit_DP: bool = False
+    reinit_text_encoder: bool = False
+@dataclass
+class VitsConfig(BaseTTSConfig):
+    model: str = "vits"
+    # model specific params
+    model_args: VitsArgs = field(default_factory=VitsArgs)
+    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)
+    # optimizer
+    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
+    lr_gen: float = 0.0002
+    lr_disc: float = 0.0002
+    lr_scheduler_gen: str = "ExponentialLR"
+    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    lr_scheduler_disc: str = "ExponentialLR"
+    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    scheduler_after_epoch: bool = True
+    optimizer: str = "AdamW"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
+    # loss params
+    kl_loss_alpha: float = 1.0
+    disc_loss_alpha: float = 1.0
+    gen_loss_alpha: float = 1.0
+    feat_loss_alpha: float = 1.0
+    mel_loss_alpha: float = 45.0
+    dur_loss_alpha: float = 1.0
+    speaker_encoder_loss_alpha: float = 1.0
+    # data loader params
+    return_wav: bool = True
+    compute_linear_spec: bool = True
+    # sampler params
+    use_weighted_sampler: bool = False  # TODO: move it to the base config
+    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
+    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
+    # overrides
+    r: int = 1  # DO NOT CHANGE
+    add_blank: bool = True
+    # testing
+    test_sentences: List[List] = field(
+        default_factory=lambda: [
+            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
+            ["Be a voice, not an echo."],
+            ["I'm sorry Dave. I'm afraid I can't do that."],
+            ["This cake is great. It's so delicious and moist."],
+            ["Prior to November 22, 1963."],
+        ]
+    )
+    # multi-speaker settings
+    # use speaker embedding layer
+    num_speakers: int = 0
+    use_speaker_embedding: bool = False
+    speakers_file: str = None
+    speaker_embedding_channels: int = 256
+    language_ids_file: str = None
+    use_language_embedding: bool = False
+    # use d-vectors
+    use_d_vector_file: bool = False
+    d_vector_file: List[str] = None
+    d_vector_dim: int = None
+    def __post_init__(self):
+        pass
+        # for key, val in self.model_args.items():
+        #     if hasattr(self, key):
+        #         self[key] = val
+def parse_symbols():
+    return {
+        "pad": _pad,
+        "eos": _eos,
+        "bos": _bos,
+        "characters": _characters,
+        "punctuations": _punctuations,
+        "phonemes": _phonemes,
+    }
+# DEFAULT SET OF GRAPHEMES
+_pad = "<PAD>"
+_eos = "<EOS>"
+_bos = "<BOS>"
+_blank = "<BLNK>"  # TODO: check if we need this alongside with PAD
+_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+_punctuations = "!'(),-.:;? "
+# DEFAULT SET OF IPA PHONEMES
+# Phonemes definition (All IPA characters)
+_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
+_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
+_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
+_suprasegmentals = "ˈˌːˑ"
+_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
+_diacrilics = "ɚ˞ɫ"
+_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
+class BaseVocabulary:
+    """Base Vocabulary class.
+    This class only needs a vocabulary dictionary without specifying the characters.
+    Args:
+        vocab (Dict): A dictionary of characters and their corresponding indices.
+    """
+    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
+        self.vocab = vocab
+        self.pad = pad
+        self.blank = blank
+        self.bos = bos
+        self.eos = eos
+    @property
+    def pad_id(self) -> int:
+        """Return the index of the padding character. If the padding character is not specified, return the length
+        of the vocabulary."""
+        return self.char_to_id(self.pad) if self.pad else len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        """Return the index of the blank character. If the blank character is not specified, return the length of
+        the vocabulary."""
+        return self.char_to_id(self.blank) if self.blank else len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        """Return the index of the bos character. If the bos character is not specified, return the length of the
+        vocabulary."""
+        return self.char_to_id(self.bos) if self.bos else len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        """Return the index of the eos character. If the eos character is not specified, return the length of the
+        vocabulary."""
+        return self.char_to_id(self.eos) if self.eos else len(self.vocab)
+    @property
+    def vocab(self):
+        """Return the vocabulary dictionary."""
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        """Set the vocabulary dictionary and character mapping dictionaries."""
+        self._vocab, self._char_to_id, self._id_to_char = None, None, None
+        if vocab is not None:
+            self._vocab = vocab
+            self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
+            self._id_to_char = {
+                idx: char for idx, char in enumerate(self._vocab)  # pylint: disable=unnecessary-comprehension
+            }
+    @staticmethod
+    def init_from_config(config, **kwargs):
+        """Initialize from the given config."""
+        if config.characters is not None and "vocab_dict" in config.characters and config.characters.vocab_dict:
+            return (
+                BaseVocabulary(
+                    config.characters.vocab_dict,
+                    config.characters.pad,
+                    config.characters.blank,
+                    config.characters.bos,
+                    config.characters.eos,
+                ),
+                config,
+            )
+        return BaseVocabulary(**kwargs), config
+    def to_config(self):
+        return CharactersConfig(
+            vocab_dict=self._vocab,
+            pad=self.pad,
+            eos=self.eos,
+            bos=self.bos,
+            blank=self.blank,
+            is_unique=False,
+            is_sorted=False,
+        )
+    @property
+    def num_chars(self):
+        """Return number of tokens in the vocabulary."""
+        return len(self._vocab)
+    def char_to_id(self, char: str) -> int:
+        """Map a character to an token ID."""
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
+    def id_to_char(self, idx: int) -> str:
+        """Map an token ID to a character."""
+        return self._id_to_char[idx]
+class BaseCharacters:
+    def __init__(
+        self,
+        characters: str = None,
+        punctuations: str = None,
+        pad: str = None,
+        eos: str = None,
+        bos: str = None,
+        blank: str = None,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        self._characters = characters
+        self._punctuations = punctuations
+        self._pad = pad
+        self._eos = eos
+        self._bos = bos
+        self._blank = blank
+        self.is_unique = is_unique
+        self.is_sorted = is_sorted
+        self._create_vocab()
+    @property
+    def pad_id(self) -> int:
+        return self.char_to_id(self.pad) if self.pad else len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        return self.char_to_id(self.blank) if self.blank else len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        return self.char_to_id(self.eos) if self.eos else len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        return self.char_to_id(self.bos) if self.bos else len(self.vocab)
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, characters):
+        self._characters = characters
+        self._create_vocab()
+    @property
+    def punctuations(self):
+        return self._punctuations
+    @punctuations.setter
+    def punctuations(self, punctuations):
+        self._punctuations = punctuations
+        self._create_vocab()
+    @property
+    def pad(self):
+        return self._pad
+    @pad.setter
+    def pad(self, pad):
+        self._pad = pad
+        self._create_vocab()
+    @property
+    def eos(self):
+        return self._eos
+    @eos.setter
+    def eos(self, eos):
+        self._eos = eos
+        self._create_vocab()
+    @property
+    def bos(self):
+        return self._bos
+    @bos.setter
+    def bos(self, bos):
+        self._bos = bos
+        self._create_vocab()
+    @property
+    def blank(self):
+        return self._blank
+    @blank.setter
+    def blank(self, blank):
+        self._blank = blank
+        self._create_vocab()
+    @property
+    def vocab(self):
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        self._vocab = vocab
+        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
+        self._id_to_char = {
+            idx: char for idx, char in enumerate(self.vocab)  # pylint: disable=unnecessary-comprehension
+        }
+    @property
+    def num_chars(self):
+        return len(self._vocab)
+    def _create_vocab(self):
+        _vocab = self._characters
+        if self.is_unique:
+            _vocab = list(set(_vocab))
+        if self.is_sorted:
+            _vocab = sorted(_vocab)
+        _vocab = list(_vocab)
+        _vocab = [self._blank] + _vocab if self._blank is not None and len(self._blank) > 0 else _vocab
+        _vocab = [self._bos] + _vocab if self._bos is not None and len(self._bos) > 0 else _vocab
+        _vocab = [self._eos] + _vocab if self._eos is not None and len(self._eos) > 0 else _vocab
+        _vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
+        self.vocab = _vocab + list(self._punctuations)
+        if self.is_unique:
+            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
+            assert (
+                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
+            ), f" [!] There are duplicate characters in the character set. {duplicates}"
+    def char_to_id(self, char: str) -> int:
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
+    def id_to_char(self, idx: int) -> str:
+        return self._id_to_char[idx]
+    def print_log(self, level: int = 0):
+        """
+        Prints the vocabulary in a nice format.
+        """
+        indent = "\t" * level
+        print(f"{indent}| > Characters: {self._characters}")
+        print(f"{indent}| > Punctuations: {self._punctuations}")
+        print(f"{indent}| > Pad: {self._pad}")
+        print(f"{indent}| > EOS: {self._eos}")
+        print(f"{indent}| > BOS: {self._bos}")
+        print(f"{indent}| > Blank: {self._blank}")
+        print(f"{indent}| > Vocab: {self.vocab}")
+        print(f"{indent}| > Num chars: {self.num_chars}")
+    @staticmethod
+    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
+        """Init your character class from a config.
+        Implement this method for your subclass.
+        """
+        # use character set from config
+        if config.characters is not None:
+            return BaseCharacters(**config.characters), config
+        # return default character set
+        characters = BaseCharacters()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=self._eos,
+            bos=self._bos,
+            blank=self._blank,
+            is_unique=self.is_unique,
+            is_sorted=self.is_sorted,
+        )
+class IPAPhonemes(BaseCharacters):
+    def __init__(
+        self,
+        characters: str = _phonemes,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Init a IPAPhonemes object from a model config
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        # band-aid for compatibility with old models
+        if "characters" in config and config.characters is not None:
+            if "phonemes" in config.characters and config.characters.phonemes is not None:
+                config.characters["characters"] = config.characters["phonemes"]
+            return (
+                IPAPhonemes(
+                    characters=config.characters["characters"],
+                    punctuations=config.characters["punctuations"],
+                    pad=config.characters["pad"],
+                    eos=config.characters["eos"],
+                    bos=config.characters["bos"],
+                    blank=config.characters["blank"],
+                    is_unique=config.characters["is_unique"],
+                    is_sorted=config.characters["is_sorted"],
+                ),
+                config,
+            )
+        # use character set from config
+        if config.characters is not None:
+            return IPAPhonemes(**config.characters), config
+        # return default character set
+        characters = IPAPhonemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+class Graphemes(BaseCharacters):
+    def __init__(
+        self,
+        characters: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Init a Graphemes object from a model config
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        if config.characters is not None:
+            # band-aid for compatibility with old models
+            if "phonemes" in config.characters:
+                return (
+                    Graphemes(
+                        characters=config.characters["characters"],
+                        punctuations=config.characters["punctuations"],
+                        pad=config.characters["pad"],
+                        eos=config.characters["eos"],
+                        bos=config.characters["bos"],
+                        blank=config.characters["blank"],
+                        is_unique=config.characters["is_unique"],
+                        is_sorted=config.characters["is_sorted"],
+                    ),
+                    config,
+                )
+            return Graphemes(**config.characters), config
+        characters = Graphemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+if __name__ == "__main__":
+    gr = Graphemes()
+    ph = IPAPhonemes()
+    gr.print_log()
+    ph.print_log()
+class VitsCharacters(BaseCharacters):
+    """Characters class for VITs model for compatibility with pre-trained models"""
+    def __init__(
+        self,
+        graphemes: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        ipa_characters: str = _phonemes,
+    ) -> None:
+        if ipa_characters is not None:
+            graphemes += ipa_characters
+        super().__init__(graphemes, punctuations, pad, None, None, "<BLNK>", is_unique=False, is_sorted=True)
+    def _create_vocab(self):
+        self._vocab = [self._pad] + list(self._punctuations) + list(self._characters) + [self._blank]
+        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
+        # pylint: disable=unnecessary-comprehension
+        self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
+    @staticmethod
+    def init_from_config(config):
+        _pad = config.characters.pad
+        _punctuations = config.characters.punctuations
+        _letters = config.characters.characters
+        _letters_ipa = config.characters.phonemes
+        return (
+            VitsCharacters(graphemes=_letters, ipa_characters=_letters_ipa, punctuations=_punctuations, pad=_pad),
+            config,
+        )
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=None,
+            bos=None,
+            blank=self._blank,
+            is_unique=False,
+            is_sorted=True,
+        )
+class TTSTokenizer:
+    def __init__(
+        self,
+        text_cleaner: Callable = None,
+        characters: "BaseCharacters" = None,
+    ):
+        self.text_cleaner = text_cleaner
+        self.characters = characters
+        self.not_found_characters = []
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, new_characters):
+        self._characters = new_characters
+        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
+        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None
+    def encode(self, text: str) -> List[int]:
+        """Encodes a string of text as a sequence of IDs."""
+        token_ids = []
+        for char in text:
+            try:
+                idx = self.characters.char_to_id(char)
+                token_ids.append(idx)
+            except KeyError:
+                # discard but store not found characters
+                if char not in self.not_found_characters:
+                    self.not_found_characters.append(char)
+                    print(text)
+                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
+        return token_ids
+    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
+        text = self.text_cleaner(text)
+        text = self.encode(text)
+        text = self.intersperse_blank_char(text, True)
+        return text
+    def pad_with_bos_eos(self, char_sequence: List[str]):
+        """Pads a sequence with the special BOS and EOS characters."""
+        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]
+    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
+        """Intersperses the blank character between characters in a sequence.
+        Use the ```blank``` character if defined else use the ```pad``` character.
+        """
+        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad
+        result = [char_to_use] * (len(char_sequence) * 2 + 1)
+        result[1::2] = char_sequence
+        return result
+    @staticmethod
+    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
+        text_cleaner = multilingual_cleaners
+        CharactersClass = VitsCharacters
+        characters, new_config = CharactersClass.init_from_config(config)
+        # new_config.characters.characters_class = get_import_path(characters)
+        new_config.characters.characters_class = VitsCharacters
+        return (
+            TTSTokenizer(text_cleaner, characters),new_config)
+def multilingual_cleaners(text):
+    """Pipeline for multilingual text"""
+    text = lowercase(text)
+    text = replace_symbols(text, lang=None)
+    text = remove_aux_symbols(text)
+    text = collapse_whitespace(text)
+    return text
+def lowercase(text):
+    return text.lower()
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text).strip()
+def replace_symbols(text, lang="en"):
+    text = text.replace(";", ",")
+    text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
+    text = text.replace(":", ",")
+    if lang == "en":
+        text = text.replace("&", " and ")
+    elif lang == "fr":
+        text = text.replace("&", " et ")
+    elif lang == "pt":
+        text = text.replace("&", " e ")
+    elif lang == "ca":
+        text = text.replace("&", " i ")
+        text = text.replace("'", "")
+    return text
+def remove_aux_symbols(text):
+    text = re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
+    return text

en_female/jit_infer.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import os
+from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
+import torch
+import numpy as np
+#ch female
+with open("chars.txt", 'r') as f:
+    letters = f.read().strip('\n')
+model="en_female_vits_30hrs.pt"
+# text = " হলেও আমাদের সবার সার্বিক শৃঙ্খলা বোধের উন্নতি হবে"
+text = "My name is g p t, chat g p t"
+config = VitsConfig(
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class=VitsCharacters,
+        pad="<PAD>",
+        eos="<EOS>",
+        bos="<BOS>",
+        blank="<BLNK>",
+        characters=letters,
+        punctuations="!¡'(),-.:;¿? ",
+        phonemes=None)
+    )
+tokenizer, config = TTSTokenizer.init_from_config(config)
+x = tokenizer.text_to_ids(text)
+x = torch.from_numpy(np.array(x)).unsqueeze(0)
+net = torch.jit.load(model)
+with torch.no_grad():
+    out2 = net(x)
+import soundfile as sf
+sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)

en_male/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

en_male/README.md ADDED Viewed

	@@ -0,0 +1,3 @@

+---
+license: cc-by-4.0
+---

en_male/chars.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ pqw'"sgufmxre?d!lcab,zk.iytoh jvn

en_male/en_male_vits_30hrs.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffa1099438a58c8a13e437d39ec304b530644156ef445032e64422d83e558666
+size 333224012

en_male/extra.py ADDED Viewed

	@@ -0,0 +1,787 @@

+from typing import Callable, Dict, List, Union
+from dataclasses import asdict, dataclass, field
+import re
+from dataclasses import replace
+from typing import Dict
+_whitespace_re = re.compile(r"\s+")
+from dataclasses import dataclass, field
+from typing import List
+# from TTS.tts.configs.shared_configs import BaseTTSConfig
+# from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
+@dataclass
+class CharactersConfig():
+    """Settings that describe the character set (vocabulary) of a model.
+    Every field defaults to ``None`` except the two boolean flags, which default to ``True``.
+    """
+    # NOTE(review): annotated as ``str`` — presumably the import path of the characters class; confirm with callers.
+    characters_class: str = None
+    # using BaseVocabulary
+    vocab_dict: Dict = None
+    # using on BaseCharacters
+    pad: str = None
+    eos: str = None
+    bos: str = None
+    blank: str = None
+    characters: str = None
+    punctuations: str = None
+    phonemes: str = None
+    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
+    is_sorted: bool = True
+@dataclass
+class BaseTTSConfig():
+    """Plain dataclass with the settings shared by TTS model configs.
+    The fields are grouped by the section comments below; ``VitsConfig`` extends this class.
+    """
+    # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
+    # phoneme settings
+    use_phonemes: bool = False
+    phonemizer: str = None
+    phoneme_language: str = None
+    compute_input_seq_cache: bool = False
+    text_cleaner: str = None
+    enable_eos_bos_chars: bool = False
+    test_sentences_file: str = ""
+    phoneme_cache_path: str = None
+    # vocabulary parameters
+    characters: CharactersConfig = None
+    add_blank: bool = False
+    # training params
+    batch_group_size: int = 0
+    loss_masking: bool = None
+    # dataloading
+    min_audio_len: int = 1
+    # NOTE: the default is float infinity although the field is annotated as ``int``
+    max_audio_len: int = float("inf")
+    min_text_len: int = 1
+    # NOTE: the default is float infinity although the field is annotated as ``int``
+    max_text_len: int = float("inf")
+    compute_f0: bool = False
+    compute_energy: bool = False
+    compute_linear_spec: bool = False
+    precompute_num_workers: int = 0
+    use_noise_augment: bool = False
+    start_by_longest: bool = False
+    shuffle: bool = False
+    drop_last: bool = False
+    # dataset
+    datasets: str = None
+    # optimizer
+    optimizer: str = "radam"
+    optimizer_params: dict = None
+    # scheduler
+    lr_scheduler: str = None
+    # mutable defaults go through ``default_factory`` so that every instance gets its own object
+    lr_scheduler_params: dict = field(default_factory=lambda: {})
+    # testing
+    test_sentences: List[str] = field(default_factory=lambda: [])
+    # evaluation
+    eval_split_max_size: int = None
+    eval_split_size: float = 0.01
+    # weighted samplers
+    use_speaker_weighted_sampler: bool = False
+    speaker_weighted_sampler_alpha: float = 1.0
+    use_language_weighted_sampler: bool = False
+    language_weighted_sampler_alpha: float = 1.0
+    use_length_weighted_sampler: bool = False
+    length_weighted_sampler_alpha: float = 1.0
+@dataclass
+class VitsAudioConfig():
+    """Audio settings; the default value of ``VitsConfig.audio`` is an instance of this class."""
+    fft_size: int = 1024
+    sample_rate: int = 22050
+    win_length: int = 1024
+    hop_length: int = 256
+    num_mels: int = 80
+    mel_fmin: int = 0
+    # no upper limit is configured by default
+    mel_fmax: int = None
+@dataclass
+class VitsArgs():
+    """Model arguments; the default value of ``VitsConfig.model_args`` is an instance of this class.
+    List-valued defaults go through ``default_factory`` so that every instance gets its own list.
+    """
+    num_chars: int = 100
+    out_channels: int = 513
+    spec_segment_size: int = 32
+    hidden_channels: int = 192
+    hidden_channels_ffn_text_encoder: int = 768
+    num_heads_text_encoder: int = 2
+    num_layers_text_encoder: int = 6
+    kernel_size_text_encoder: int = 3
+    dropout_p_text_encoder: float = 0.1
+    dropout_p_duration_predictor: float = 0.5
+    kernel_size_posterior_encoder: int = 5
+    dilation_rate_posterior_encoder: int = 1
+    num_layers_posterior_encoder: int = 16
+    kernel_size_flow: int = 5
+    dilation_rate_flow: int = 1
+    num_layers_flow: int = 4
+    resblock_type_decoder: str = "1"
+    resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
+    resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
+    upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
+    upsample_initial_channel_decoder: int = 512
+    upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
+    periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
+    use_sdp: bool = True
+    noise_scale: float = 1.0
+    inference_noise_scale: float = 0.667
+    length_scale: float = 1
+    noise_scale_dp: float = 1.0
+    inference_noise_scale_dp: float = 1.0
+    max_inference_len: int = None
+    init_discriminator: bool = True
+    # NOTE: "disriminator" is misspelled in the field name; it is part of the interface and kept as is
+    use_spectral_norm_disriminator: bool = False
+    use_speaker_embedding: bool = False
+    num_speakers: int = 0
+    speakers_file: str = None
+    d_vector_file: List[str] = None
+    speaker_embedding_channels: int = 256
+    use_d_vector_file: bool = False
+    d_vector_dim: int = 0
+    detach_dp_input: bool = True
+    use_language_embedding: bool = False
+    embedded_language_dim: int = 4
+    num_languages: int = 0
+    language_ids_file: str = None
+    use_speaker_encoder_as_loss: bool = False
+    speaker_encoder_config_path: str = ""
+    speaker_encoder_model_path: str = ""
+    condition_dp_on_speaker: bool = True
+    freeze_encoder: bool = False
+    freeze_DP: bool = False
+    freeze_PE: bool = False
+    freeze_flow_decoder: bool = False
+    freeze_waveform_decoder: bool = False
+    encoder_sample_rate: int = None
+    interpolate_z: bool = True
+    reinit_DP: bool = False
+    reinit_text_encoder: bool = False
+@dataclass
+class VitsConfig(BaseTTSConfig):
+    """Config of the "vits" model.
+    Extends ``BaseTTSConfig`` with the model arguments, the audio settings and the optimizer, loss, sampler and
+    multi-speaker settings below. ``optimizer``, ``optimizer_params``, ``compute_linear_spec``, ``add_blank`` and
+    ``test_sentences`` override the defaults of the base class.
+    """
+    model: str = "vits"
+    # model specific params
+    model_args: VitsArgs = field(default_factory=VitsArgs)
+    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)
+    # optimizer
+    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
+    lr_gen: float = 0.0002
+    lr_disc: float = 0.0002
+    lr_scheduler_gen: str = "ExponentialLR"
+    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    lr_scheduler_disc: str = "ExponentialLR"
+    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    scheduler_after_epoch: bool = True
+    optimizer: str = "AdamW"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
+    # loss params
+    kl_loss_alpha: float = 1.0
+    disc_loss_alpha: float = 1.0
+    gen_loss_alpha: float = 1.0
+    feat_loss_alpha: float = 1.0
+    mel_loss_alpha: float = 45.0
+    dur_loss_alpha: float = 1.0
+    speaker_encoder_loss_alpha: float = 1.0
+    # data loader params
+    return_wav: bool = True
+    compute_linear_spec: bool = True
+    # sampler params
+    use_weighted_sampler: bool = False  # TODO: move it to the base config
+    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
+    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
+    # overrides
+    r: int = 1  # DO NOT CHANGE
+    add_blank: bool = True
+    # testing
+    test_sentences: List[List] = field(
+        default_factory=lambda: [
+            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
+            ["Be a voice, not an echo."],
+            ["I'm sorry Dave. I'm afraid I can't do that."],
+            ["This cake is great. It's so delicious and moist."],
+            ["Prior to November 22, 1963."],
+        ]
+    )
+    # multi-speaker settings
+    # use speaker embedding layer
+    num_speakers: int = 0
+    use_speaker_embedding: bool = False
+    speakers_file: str = None
+    speaker_embedding_channels: int = 256
+    language_ids_file: str = None
+    use_language_embedding: bool = False
+    # use d-vectors
+    use_d_vector_file: bool = False
+    d_vector_file: List[str] = None
+    d_vector_dim: int = None
+    def __post_init__(self):
+        """Do nothing; the propagation of ``model_args`` values onto the config is disabled (see below)."""
+        pass
+        # disabled: copy every ``model_args`` entry onto the config attribute of the same name
+        # for key, val in self.model_args.items():
+        #     if hasattr(self, key):
+        #         self[key] = val
+def parse_symbols():
+    return {
+        "pad": _pad,
+        "eos": _eos,
+        "bos": _bos,
+        "characters": _characters,
+        "punctuations": _punctuations,
+        "phonemes": _phonemes,
+    }
+# DEFAULT SET OF GRAPHEMES
+_pad = "<PAD>"
+_eos = "<EOS>"
+_bos = "<BOS>"
+_blank = "<BLNK>"  # TODO: check if we need this alongside with PAD
+_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+_punctuations = "!'(),-.:;? "
+# DEFAULT SET OF IPA PHONEMES
+# Phonemes definition (All IPA characters)
+_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
+_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
+_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
+_suprasegmentals = "ˈˌːˑ"
+_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
+_diacrilics = "ɚ˞ɫ"
+_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
+class BaseVocabulary:
+    """Base Vocabulary class.
+    This class only needs a vocabulary dictionary without specifying the characters.
+    Args:
+        vocab (Dict): A dictionary of characters and their corresponding indices.
+    """
+    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
+        self.vocab = vocab
+        self.pad = pad
+        self.blank = blank
+        self.bos = bos
+        self.eos = eos
+    @property
+    def pad_id(self) -> int:
+        """Return the index of the padding character. If the padding character is not specified, return the length
+        of the vocabulary."""
+        return self.char_to_id(self.pad) if self.pad else len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        """Return the index of the blank character. If the blank character is not specified, return the length of
+        the vocabulary."""
+        return self.char_to_id(self.blank) if self.blank else len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        """Return the index of the bos character. If the bos character is not specified, return the length of the
+        vocabulary."""
+        return self.char_to_id(self.bos) if self.bos else len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        """Return the index of the eos character. If the eos character is not specified, return the length of the
+        vocabulary."""
+        return self.char_to_id(self.eos) if self.eos else len(self.vocab)
+    @property
+    def vocab(self):
+        """Return the vocabulary dictionary."""
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        """Set the vocabulary dictionary and character mapping dictionaries."""
+        self._vocab, self._char_to_id, self._id_to_char = None, None, None
+        if vocab is not None:
+            self._vocab = vocab
+            self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
+            self._id_to_char = {
+                idx: char for idx, char in enumerate(self._vocab)  # pylint: disable=unnecessary-comprehension
+            }
+    @staticmethod
+    def init_from_config(config, **kwargs):
+        """Build a vocabulary from ``config.characters.vocab_dict`` when it is present and non-empty.
+        Otherwise the vocabulary is built from ``kwargs``. The config is returned unchanged either way.
+        """
+        chars_cfg = config.characters
+        if chars_cfg is not None and "vocab_dict" in chars_cfg and chars_cfg.vocab_dict:
+            vocabulary = BaseVocabulary(
+                chars_cfg.vocab_dict,
+                chars_cfg.pad,
+                chars_cfg.blank,
+                chars_cfg.bos,
+                chars_cfg.eos,
+            )
+            return vocabulary, config
+        return BaseVocabulary(**kwargs), config
+    def to_config(self):
+        """Export the vocabulary and special tokens as a ``CharactersConfig`` (uniqueness and sorting off)."""
+        settings = {
+            "vocab_dict": self._vocab,
+            "pad": self.pad,
+            "eos": self.eos,
+            "bos": self.bos,
+            "blank": self.blank,
+            "is_unique": False,
+            "is_sorted": False,
+        }
+        return CharactersConfig(**settings)
+    @property
+    def num_chars(self):
+        """Number of entries in the stored vocabulary."""
+        vocab_size = len(self._vocab)
+        return vocab_size
+    def char_to_id(self, char: str) -> int:
+        """Return the token ID of ``char``.
+        Raises:
+            KeyError: If ``char`` is not in the vocabulary.
+        """
+        try:
+            token_id = self._char_to_id[char]
+        except KeyError as missing:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from missing
+        return token_id
+    def id_to_char(self, idx: int) -> str:
+        """Return the token stored under ID ``idx``; an unknown ID propagates ``KeyError``."""
+        token = self._id_to_char[idx]
+        return token
+class BaseCharacters:
+    """Character set that builds a token vocabulary from characters, punctuations and special tokens.
+    The vocabulary is laid out as ``[pad, eos, bos, blank] + characters + punctuations``; special tokens
+    that are ``None`` or empty are skipped. It is rebuilt every time one of the components is reassigned
+    through its property setter.
+    Args:
+        characters (str): Main character set. ``None`` is treated as empty.
+        punctuations (str): Characters appended after the main set. ``None`` is treated as empty.
+        pad (str): Padding token.
+        eos (str): End-of-sequence token.
+        bos (str): Beginning-of-sequence token.
+        blank (str): Blank token.
+        is_unique (bool): Drop repeated entries of ``characters`` and reject a vocabulary that still
+            contains duplicates.
+        is_sorted (bool): Sort ``characters`` before building the vocabulary.
+    """
+    def __init__(
+        self,
+        characters: str = None,
+        punctuations: str = None,
+        pad: str = None,
+        eos: str = None,
+        bos: str = None,
+        blank: str = None,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        self._characters = characters
+        self._punctuations = punctuations
+        self._pad = pad
+        self._eos = eos
+        self._bos = bos
+        self._blank = blank
+        self.is_unique = is_unique
+        self.is_sorted = is_sorted
+        self._create_vocab()
+    @property
+    def pad_id(self) -> int:
+        """Index of the pad token, or ``len(self.vocab)`` when no pad token is set."""
+        return self.char_to_id(self.pad) if self.pad else len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        """Index of the blank token, or ``len(self.vocab)`` when no blank token is set."""
+        return self.char_to_id(self.blank) if self.blank else len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        """Index of the eos token, or ``len(self.vocab)`` when no eos token is set."""
+        return self.char_to_id(self.eos) if self.eos else len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        """Index of the bos token, or ``len(self.vocab)`` when no bos token is set."""
+        return self.char_to_id(self.bos) if self.bos else len(self.vocab)
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, characters):
+        self._characters = characters
+        self._create_vocab()
+    @property
+    def punctuations(self):
+        return self._punctuations
+    @punctuations.setter
+    def punctuations(self, punctuations):
+        self._punctuations = punctuations
+        self._create_vocab()
+    @property
+    def pad(self):
+        return self._pad
+    @pad.setter
+    def pad(self, pad):
+        self._pad = pad
+        self._create_vocab()
+    @property
+    def eos(self):
+        return self._eos
+    @eos.setter
+    def eos(self, eos):
+        self._eos = eos
+        self._create_vocab()
+    @property
+    def bos(self):
+        return self._bos
+    @bos.setter
+    def bos(self, bos):
+        self._bos = bos
+        self._create_vocab()
+    @property
+    def blank(self):
+        return self._blank
+    @blank.setter
+    def blank(self, blank):
+        self._blank = blank
+        self._create_vocab()
+    @property
+    def vocab(self):
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        """Store ``vocab`` and rebuild the token <-> ID tables from its order."""
+        self._vocab = vocab
+        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
+        self._id_to_char = {
+            idx: char for idx, char in enumerate(self.vocab)  # pylint: disable=unnecessary-comprehension
+        }
+    @property
+    def num_chars(self):
+        """Number of tokens in the vocabulary."""
+        return len(self._vocab)
+    def _create_vocab(self):
+        """Rebuild ``vocab`` and the lookup tables from the current components.
+        Raises:
+            AssertionError: If ``is_unique`` is set and the resulting vocabulary contains duplicates.
+        """
+        # A missing component contributes no tokens instead of crashing ``sorted``/``list`` on ``None``.
+        _vocab = self._characters if self._characters is not None else ""
+        if self.is_unique:
+            # ``dict.fromkeys`` keeps first-occurrence order, so the result does not depend on set
+            # iteration order (which varies between runs for strings) when ``is_sorted`` is off.
+            _vocab = list(dict.fromkeys(_vocab))
+        if self.is_sorted:
+            _vocab = sorted(_vocab)
+        _vocab = list(_vocab)
+        # Prepend in reverse so the final order is pad, eos, bos, blank.
+        for special in (self._blank, self._bos, self._eos, self._pad):
+            if special is not None and len(special) > 0:
+                _vocab = [special] + _vocab
+        punctuations = self._punctuations if self._punctuations is not None else ""
+        self.vocab = _vocab + list(punctuations)
+        # ``_char_to_id`` collapses repeated tokens, so a size mismatch means the vocabulary has duplicates.
+        if self.is_unique and len(self._char_to_id) != len(self.vocab):
+            # Only scan for the offending tokens on the failure path.
+            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
+            raise AssertionError(f" [!] There are duplicate characters in the character set. {duplicates}")
+    def char_to_id(self, char: str) -> int:
+        """Return the token ID of ``char``.
+        Raises:
+            KeyError: If ``char`` is not in the vocabulary.
+        """
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
+    def id_to_char(self, idx: int) -> str:
+        """Return the token stored under ID ``idx``."""
+        return self._id_to_char[idx]
+    def print_log(self, level: int = 0):
+        """
+        Prints the vocabulary in a nice format.
+        Args:
+            level (int): Number of tab characters used to indent every printed line.
+        """
+        indent = "\t" * level
+        print(f"{indent}| > Characters: {self._characters}")
+        print(f"{indent}| > Punctuations: {self._punctuations}")
+        print(f"{indent}| > Pad: {self._pad}")
+        print(f"{indent}| > EOS: {self._eos}")
+        print(f"{indent}| > BOS: {self._bos}")
+        print(f"{indent}| > Blank: {self._blank}")
+        print(f"{indent}| > Vocab: {self.vocab}")
+        print(f"{indent}| > Num chars: {self.num_chars}")
+    @staticmethod
+    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
+        """Init your character class from a config.
+        Implement this method for your subclass.
+        Returns:
+            A ``(characters, config)`` tuple. When ``config.characters`` is ``None`` the returned config is
+            a copy whose ``characters`` field is filled from the default (empty) character set.
+        """
+        # use character set from config
+        # NOTE(review): ``**config.characters`` needs a mapping-like config object — confirm the type in use.
+        if config.characters is not None:
+            return BaseCharacters(**config.characters), config
+        # return default character set
+        characters = BaseCharacters()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+    def to_config(self) -> "CharactersConfig":
+        """Export the current components and flags as a ``CharactersConfig``."""
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=self._eos,
+            bos=self._bos,
+            blank=self._blank,
+            is_unique=self.is_unique,
+            is_sorted=self.is_sorted,
+        )
+class IPAPhonemes(BaseCharacters):
+    """Phoneme character set whose constructor defaults to the module-level default symbols."""
+    def __init__(
+        self,
+        characters: str = _phonemes,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(
+            characters=characters,
+            punctuations=punctuations,
+            pad=pad,
+            eos=eos,
+            bos=bos,
+            blank=blank,
+            is_unique=is_unique,
+            is_sorted=is_sorted,
+        )
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Create an IPAPhonemes object from a model config.
+        When the config defines no characters, the defaults are used and a config copy carrying them
+        is returned instead of the original.
+        """
+        # band-aid for compatibility with old models
+        if "characters" in config and config.characters is not None:
+            chars_cfg = config.characters
+            if "phonemes" in chars_cfg and chars_cfg.phonemes is not None:
+                # old configs keep the symbol set under "phonemes"; copy it into "characters"
+                chars_cfg["characters"] = chars_cfg["phonemes"]
+            field_names = (
+                "characters",
+                "punctuations",
+                "pad",
+                "eos",
+                "bos",
+                "blank",
+                "is_unique",
+                "is_sorted",
+            )
+            phonemes = IPAPhonemes(**{name: chars_cfg[name] for name in field_names})
+            return phonemes, config
+        # use character set from config
+        if config.characters is not None:
+            return IPAPhonemes(**config.characters), config
+        # return default character set
+        default_set = IPAPhonemes()
+        return default_set, replace(config, characters=default_set.to_config())
+class Graphemes(BaseCharacters):
+    """Grapheme character set whose constructor defaults to the module-level default symbols."""
+    def __init__(
+        self,
+        characters: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(
+            characters=characters,
+            punctuations=punctuations,
+            pad=pad,
+            eos=eos,
+            bos=bos,
+            blank=blank,
+            is_unique=is_unique,
+            is_sorted=is_sorted,
+        )
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Create a Graphemes object from a model config.
+        When the config defines no characters, the defaults are used and a config copy carrying them
+        is returned instead of the original.
+        """
+        if config.characters is None:
+            default_set = Graphemes()
+            return default_set, replace(config, characters=default_set.to_config())
+        chars_cfg = config.characters
+        # band-aid for compatibility with old models
+        if "phonemes" in chars_cfg:
+            field_names = (
+                "characters",
+                "punctuations",
+                "pad",
+                "eos",
+                "bos",
+                "blank",
+                "is_unique",
+                "is_sorted",
+            )
+            graphemes = Graphemes(**{name: chars_cfg[name] for name in field_names})
+            return graphemes, config
+        return Graphemes(**chars_cfg), config
+if __name__ == "__main__":
+    # Manual smoke check: build both default character sets, then print their vocabularies.
+    gr, ph = Graphemes(), IPAPhonemes()
+    for charset in (gr, ph):
+        charset.print_log()
+class VitsCharacters(BaseCharacters):
+    """Characters class for VITs model for compatibility with pre-trained models"""
+    def __init__(
+        self,
+        graphemes: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        ipa_characters: str = _phonemes,
+    ) -> None:
+        # Append the IPA symbols (when given) after the graphemes to form one character set.
+        if ipa_characters is not None:
+            graphemes += ipa_characters
+        super().__init__(
+            characters=graphemes,
+            punctuations=punctuations,
+            pad=pad,
+            eos=None,
+            bos=None,
+            blank="<BLNK>",
+            is_unique=False,
+            is_sorted=True,
+        )
+    def _create_vocab(self):
+        """Build the vocabulary as ``[pad] + punctuations + characters + [blank]``, without sorting."""
+        symbols = [self._pad]
+        symbols.extend(self._punctuations)
+        symbols.extend(self._characters)
+        symbols.append(self._blank)
+        self._vocab = symbols
+        self._char_to_id = {symbol: index for index, symbol in enumerate(symbols)}
+        self._id_to_char = dict(enumerate(symbols))
+    @staticmethod
+    def init_from_config(config):
+        """Create a VitsCharacters object from ``config.characters``; the config is returned unchanged."""
+        chars_cfg = config.characters
+        vits_characters = VitsCharacters(
+            pad=chars_cfg.pad,
+            punctuations=chars_cfg.punctuations,
+            graphemes=chars_cfg.characters,
+            ipa_characters=chars_cfg.phonemes,
+        )
+        return vits_characters, config
+    def to_config(self) -> "CharactersConfig":
+        """Export the character set as a ``CharactersConfig`` with eos/bos unset."""
+        settings = {
+            "characters": self._characters,
+            "punctuations": self._punctuations,
+            "pad": self._pad,
+            "eos": None,
+            "bos": None,
+            "blank": self._blank,
+            "is_unique": False,
+            "is_sorted": True,
+        }
+        return CharactersConfig(**settings)
+class TTSTokenizer:
+    """Convert raw text into token ID sequences using a text cleaner and a character set.
+    Args:
+        text_cleaner (Callable): Function applied to the raw text before encoding. ``None`` skips cleaning.
+        characters (BaseCharacters): Character set providing the token <-> ID mapping. May be ``None``
+            and assigned later through the ``characters`` property.
+    """
+    def __init__(
+        self,
+        text_cleaner: Callable = None,
+        characters: "BaseCharacters" = None,
+    ):
+        self.text_cleaner = text_cleaner
+        self.characters = characters
+        # Characters already reported as missing, so each one is only logged once.
+        self.not_found_characters = []
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, new_characters):
+        """Store the character set and cache its pad/blank IDs (``None`` when the token is unset)."""
+        self._characters = new_characters
+        if new_characters is None:
+            # No character set yet (the constructor default): nothing to look up.
+            self.pad_id = None
+            self.blank_id = None
+            return
+        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
+        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None
+    def encode(self, text: str) -> List[int]:
+        """Encodes a string of text as a sequence of IDs.
+        Characters missing from the vocabulary are dropped; each distinct one is reported once.
+        """
+        token_ids = []
+        for char in text:
+            try:
+                idx = self.characters.char_to_id(char)
+                token_ids.append(idx)
+            except KeyError:
+                # discard but store not found characters
+                if char not in self.not_found_characters:
+                    self.not_found_characters.append(char)
+                    print(text)
+                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
+        return token_ids
+    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
+        """Clean ``text``, encode it, and intersperse the blank token ID between all token IDs."""
+        if self.text_cleaner is not None:
+            text = self.text_cleaner(text)
+        text = self.encode(text)
+        text = self.intersperse_blank_char(text, True)
+        return text
+    def pad_with_bos_eos(self, char_sequence: List[int]):
+        """Pads a sequence with the special BOS and EOS characters."""
+        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]
+    def intersperse_blank_char(self, char_sequence: List[int], use_blank_char: bool = False):
+        """Intersperses the blank character between characters in a sequence.
+        Use the ```blank``` character if defined else use the ```pad``` character.
+        Returns:
+            A list of ``2 * len(char_sequence) + 1`` IDs that starts and ends with the filler ID.
+        """
+        # The sequence holds token IDs, so the filler must be an ID as well (pad_id, not the pad string).
+        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad_id
+        result = [char_to_use] * (len(char_sequence) * 2 + 1)
+        result[1::2] = char_sequence
+        return result
+    @staticmethod
+    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
+        """Build a tokenizer that uses ``multilingual_cleaners`` and ``VitsCharacters`` from ``config``.
+        The ``characters`` argument is accepted for interface compatibility but is not used: the
+        character set is always created from ``config``.
+        Returns:
+            A ``(tokenizer, config)`` tuple; ``config.characters.characters_class`` is set to ``VitsCharacters``.
+        """
+        text_cleaner = multilingual_cleaners
+        CharactersClass = VitsCharacters
+        characters, new_config = CharactersClass.init_from_config(config)
+        # new_config.characters.characters_class = get_import_path(characters)
+        new_config.characters.characters_class = VitsCharacters
+        return (
+            TTSTokenizer(text_cleaner, characters),new_config)
+def multilingual_cleaners(text):
+    """Pipeline for multilingual text.
+    Steps, in order: lowercase, replace symbols (with ``lang=None``), remove auxiliary symbols,
+    collapse whitespace.
+    """
+    lowered = lowercase(text)
+    symbols_replaced = replace_symbols(lowered, lang=None)
+    return collapse_whitespace(remove_aux_symbols(symbols_replaced))
+def lowercase(text):
+    """Return ``text`` converted to lower case."""
+    lowered = text.lower()
+    return lowered
+def collapse_whitespace(text):
+    """Replace every match of ``_whitespace_re`` with a single space and strip both ends."""
+    single_spaced = _whitespace_re.sub(" ", text)
+    return single_spaced.strip()
+def replace_symbols(text, lang="en"):
+    """Normalize punctuation symbols, with a few language-specific rules.
+    ``;`` and ``:`` become ``,``. ``-`` becomes a space (removed for ``lang="ca"``). ``&`` is spelled
+    out for ``en``/``fr``/``pt``/``ca`` and left untouched otherwise. For ``ca`` apostrophes are removed too.
+    """
+    ampersand_words = {"en": " and ", "fr": " et ", "pt": " e ", "ca": " i "}
+    text = text.replace(";", ",")
+    text = text.replace("-", "" if lang == "ca" else " ")
+    text = text.replace(":", ",")
+    if lang in ampersand_words:
+        text = text.replace("&", ampersand_words[lang])
+    if lang == "ca":
+        text = text.replace("'", "")
+    return text
+def remove_aux_symbols(text):
+    """Strip angle brackets, parentheses, square brackets and double quotes from ``text``."""
+    return re.sub(r"[\<\>\(\)\[\]\"]+", "", text)

en_male/jit_infer.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import os
+from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
+import torch
+import numpy as np
+# Imported up front so a missing dependency is reported before the model is loaded and run.
+import soundfile as sf
+# Demo: synthesize one sentence with the TorchScript export of the English male voice.
+# chars.txt holds the character set passed to the tokenizer. Read it as UTF-8 so the result does
+# not depend on the platform's default text encoding.
+with open("chars.txt", "r", encoding="utf-8") as f:
+    letters = f.read().strip('\n')
+model="en_male_vits_30hrs.pt"
+text = "This is a text to b spoken"
+config = VitsConfig(
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class=VitsCharacters,
+        pad="<PAD>",
+        eos="<EOS>",
+        bos="<BOS>",
+        blank="<BLNK>",
+        characters=letters,
+        punctuations="!¡'(),-.:;¿? ",
+        phonemes=None)
+    )
+tokenizer, config = TTSTokenizer.init_from_config(config)
+x = tokenizer.text_to_ids(text)
+# Build the (1, num_tokens) batch as int64 explicitly: NumPy's default integer width is platform
+# dependent, and int64 is what the previous np.array(x) produced on Linux/macOS.
+x = torch.from_numpy(np.asarray(x, dtype=np.int64)).unsqueeze(0)
+net = torch.jit.load(model)
+with torch.no_grad():
+    out2 = net(x)
+# NOTE(review): 22050 is assumed to be the sample rate the model was trained with — confirm.
+sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)

gu_mms/config.json ADDED Viewed

	@@ -0,0 +1,82 @@

+{
+  "activation_dropout": 0.1,
+  "architectures": [
+    "VitsModel"
+  ],
+  "attention_dropout": 0.1,
+  "depth_separable_channels": 2,
+  "depth_separable_num_layers": 3,
+  "duration_predictor_dropout": 0.5,
+  "duration_predictor_filter_channels": 256,
+  "duration_predictor_flow_bins": 10,
+  "duration_predictor_kernel_size": 3,
+  "duration_predictor_num_flows": 4,
+  "duration_predictor_tail_bound": 5.0,
+  "ffn_dim": 768,
+  "ffn_kernel_size": 3,
+  "flow_size": 192,
+  "hidden_act": "relu",
+  "hidden_dropout": 0.1,
+  "hidden_size": 192,
+  "initializer_range": 0.02,
+  "layer_norm_eps": 1e-05,
+  "layerdrop": 0.1,
+  "leaky_relu_slope": 0.1,
+  "model_type": "vits",
+  "noise_scale": 0.667,
+  "noise_scale_duration": 0.8,
+  "num_attention_heads": 2,
+  "num_hidden_layers": 6,
+  "num_speakers": 1,
+  "posterior_encoder_num_wavenet_layers": 16,
+  "prior_encoder_num_flows": 4,
+  "prior_encoder_num_wavenet_layers": 4,
+  "resblock_dilation_sizes": [
+    [
+      1,
+      3,
+      5
+    ],
+    [
+      1,
+      3,
+      5
+    ],
+    [
+      1,
+      3,
+      5
+    ]
+  ],
+  "resblock_kernel_sizes": [
+    3,
+    7,
+    11
+  ],
+  "sampling_rate": 16000,
+  "speaker_embedding_size": 0,
+  "speaking_rate": 1.0,
+  "spectrogram_bins": 513,
+  "torch_dtype": "float32",
+  "transformers_version": "4.33.0.dev0",
+  "upsample_initial_channel": 512,
+  "upsample_kernel_sizes": [
+    16,
+    16,
+    4,
+    4
+  ],
+  "upsample_rates": [
+    8,
+    8,
+    2,
+    2
+  ],
+  "use_bias": true,
+  "use_stochastic_duration_prediction": true,
+  "vocab_size": 60,
+  "wavenet_dilation_rate": 1,
+  "wavenet_dropout": 0.0,
+  "wavenet_kernel_size": 5,
+  "window_size": 4
+}

gu_mms/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "pad_token": "|",
+  "unk_token": "<unk>"
+}

gu_mms/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "add_blank": true,
+  "clean_up_tokenization_spaces": true,
+  "is_uroman": false,
+  "language": "guj",
+  "model_max_length": 1000000000000000019884624838656,
+  "normalize": true,
+  "pad_token": "|",
+  "phonemize": false,
+  "tokenizer_class": "VitsTokenizer",
+  "unk_token": "<unk>"
+}

gu_mms/vocab.json ADDED Viewed

	@@ -0,0 +1,62 @@

+{
+  " ": 59,
+  "'": 47,
+  "-": 56,
+  "|": 0,
+  "ં": 10,
+  "ઃ": 54,
+  "અ": 28,
+  "આ": 26,
+  "ઇ": 49,
+  "ઈ": 30,
+  "ઉ": 42,
+  "ઊ": 48,
+  "ઋ": 57,
+  "એ": 29,
+  "ઐ": 58,
+  "ઓ": 27,
+  "ક": 9,
+  "ખ": 33,
+  "ગ": 32,
+  "ઘ": 44,
+  "ચ": 39,
+  "છ": 23,
+  "જ": 18,
+  "ઝ": 51,
+  "ઞ": 50,
+  "ટ": 36,
+  "ઠ": 45,
+  "ડ": 40,
+  "ઢ": 52,
+  "ણ": 22,
+  "ત": 3,
+  "થ": 19,
+  "દ": 25,
+  "ધ": 34,
+  "ન": 4,
+  "પ": 12,
+  "ફ": 43,
+  "બ": 31,
+  "ભ": 35,
+  "મ": 7,
+  "ય": 16,
+  "ર": 5,
+  "લ": 24,
+  "ળ": 37,
+  "વ": 13,
+  "શ": 21,
+  "ષ": 41,
+  "સ": 15,
+  "હ": 17,
+  "ા": 1,
+  "િ": 20,
+  "ી": 8,
+  "ુ": 14,
+  "ૂ": 38,
+  "ૃ": 46,
+  "ે": 2,
+  "ૈ": 53,
+  "ો": 11,
+  "ૌ": 55,
+  "્": 6
+}

hi_female/chars.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ शदऊतसओषमऱढै?ख़ौक़ड़ःिअनठय़ज़फ़्खँे।ंऋउ'हछङझ" ुणऔघयञृएईॆीपचॉॠवगडटइ,बॅूऐफकजलग़आधोथाभढ़ऑ

hi_female/extra.py ADDED Viewed

	@@ -0,0 +1,787 @@

+from typing import Callable, Dict, List, Union
+from dataclasses import asdict, dataclass, field
+import re
+from dataclasses import replace
+from typing import Dict
+_whitespace_re = re.compile(r"\s+")
+from dataclasses import dataclass, field
+from typing import List
+# from TTS.tts.configs.shared_configs import BaseTTSConfig
+# from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
+@dataclass
+class CharactersConfig():
+    """Settings describing the vocabulary / character set consumed by the character classes in this file.
+    ``vocab_dict`` is read by ``BaseVocabulary``; the remaining fields are read by ``BaseCharacters``
+    and its subclasses.
+    """
+    # NOTE(review): annotated as str, but callers appear to store a class object here — confirm.
+    characters_class: str = None
+    # used by BaseVocabulary
+    vocab_dict: Dict = None
+    # used by BaseCharacters
+    pad: str = None
+    eos: str = None
+    bos: str = None
+    blank: str = None
+    characters: str = None
+    punctuations: str = None
+    phonemes: str = None
+    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
+    is_sorted: bool = True
+@dataclass
+class BaseTTSConfig():
+    """Shared text-to-speech configuration fields with their default values.
+    The fields are grouped by the section comments below (phonemes, vocabulary, training, data loading,
+    dataset, optimizer, scheduler, testing, evaluation, weighted samplers).
+    NOTE(review): this looks like a trimmed stand-in for the upstream ``BaseTTSConfig`` referenced by
+    the commented-out import above — confirm field semantics against that class.
+    """
+    # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
+    # phoneme settings
+    use_phonemes: bool = False
+    phonemizer: str = None
+    phoneme_language: str = None
+    compute_input_seq_cache: bool = False
+    text_cleaner: str = None
+    enable_eos_bos_chars: bool = False
+    test_sentences_file: str = ""
+    phoneme_cache_path: str = None
+    # vocabulary parameters
+    characters: CharactersConfig = None
+    add_blank: bool = False
+    # training params
+    batch_group_size: int = 0
+    loss_masking: bool = None
+    # dataloading
+    min_audio_len: int = 1
+    # annotated int, but the default is float infinity (i.e. no upper bound)
+    max_audio_len: int = float("inf")
+    min_text_len: int = 1
+    # annotated int, but the default is float infinity (i.e. no upper bound)
+    max_text_len: int = float("inf")
+    compute_f0: bool = False
+    compute_energy: bool = False
+    compute_linear_spec: bool = False
+    precompute_num_workers: int = 0
+    use_noise_augment: bool = False
+    start_by_longest: bool = False
+    shuffle: bool = False
+    drop_last: bool = False
+    # dataset
+    datasets: str = None
+    # optimizer
+    optimizer: str = "radam"
+    optimizer_params: dict = None
+    # scheduler
+    lr_scheduler: str = None
+    lr_scheduler_params: dict = field(default_factory=lambda: {})
+    # testing
+    test_sentences: List[str] = field(default_factory=lambda: [])
+    # evaluation
+    eval_split_max_size: int = None
+    eval_split_size: float = 0.01
+    # weighted samplers
+    use_speaker_weighted_sampler: bool = False
+    speaker_weighted_sampler_alpha: float = 1.0
+    use_language_weighted_sampler: bool = False
+    language_weighted_sampler_alpha: float = 1.0
+    use_length_weighted_sampler: bool = False
+    length_weighted_sampler_alpha: float = 1.0
+@dataclass
+class VitsAudioConfig():
+    """Audio / spectrogram settings for VITS with their default values.
+    NOTE(review): units are not stated in this file; sizes are presumably in samples and frequencies
+    in Hz — confirm against the upstream ``VitsAudioConfig``.
+    """
+    fft_size: int = 1024
+    sample_rate: int = 22050
+    win_length: int = 1024
+    hop_length: int = 256
+    num_mels: int = 80
+    mel_fmin: int = 0
+    mel_fmax: int = None
+@dataclass
+class VitsArgs():
+    """Model hyper-parameters and feature switches for VITS, with their default values.
+    NOTE(review): the meaning of each field is not defined in this file; it looks like a copy of the
+    upstream ``VitsArgs`` named in the commented-out import above — confirm against that class.
+    """
+    num_chars: int = 100
+    out_channels: int = 513
+    spec_segment_size: int = 32
+    hidden_channels: int = 192
+    hidden_channels_ffn_text_encoder: int = 768
+    num_heads_text_encoder: int = 2
+    num_layers_text_encoder: int = 6
+    kernel_size_text_encoder: int = 3
+    dropout_p_text_encoder: float = 0.1
+    dropout_p_duration_predictor: float = 0.5
+    kernel_size_posterior_encoder: int = 5
+    dilation_rate_posterior_encoder: int = 1
+    num_layers_posterior_encoder: int = 16
+    kernel_size_flow: int = 5
+    dilation_rate_flow: int = 1
+    num_layers_flow: int = 4
+    resblock_type_decoder: str = "1"
+    # list defaults go through default_factory so instances do not share one mutable list
+    resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
+    resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
+    upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
+    upsample_initial_channel_decoder: int = 512
+    upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
+    periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
+    use_sdp: bool = True
+    noise_scale: float = 1.0
+    inference_noise_scale: float = 0.667
+    length_scale: float = 1
+    noise_scale_dp: float = 1.0
+    inference_noise_scale_dp: float = 1.0
+    max_inference_len: int = None
+    init_discriminator: bool = True
+    # the spelling "disriminator" is part of the field name; renaming it would change the interface
+    use_spectral_norm_disriminator: bool = False
+    use_speaker_embedding: bool = False
+    num_speakers: int = 0
+    speakers_file: str = None
+    d_vector_file: List[str] = None
+    speaker_embedding_channels: int = 256
+    use_d_vector_file: bool = False
+    d_vector_dim: int = 0
+    detach_dp_input: bool = True
+    use_language_embedding: bool = False
+    embedded_language_dim: int = 4
+    num_languages: int = 0
+    language_ids_file: str = None
+    use_speaker_encoder_as_loss: bool = False
+    speaker_encoder_config_path: str = ""
+    speaker_encoder_model_path: str = ""
+    condition_dp_on_speaker: bool = True
+    freeze_encoder: bool = False
+    freeze_DP: bool = False
+    freeze_PE: bool = False
+    freeze_flow_decoder: bool = False
+    freeze_waveform_decoder: bool = False
+    encoder_sample_rate: int = None
+    interpolate_z: bool = True
+    reinit_DP: bool = False
+    reinit_text_encoder: bool = False
+@dataclass
+class VitsConfig(BaseTTSConfig):
+    """VITS configuration: extends ``BaseTTSConfig`` with model arguments, audio settings and training defaults.
+    Several inherited defaults are overridden here (``optimizer``, ``optimizer_params``,
+    ``compute_linear_spec``, ``add_blank``, ``test_sentences``).
+    """
+    model: str = "vits"
+    # model specific params
+    model_args: VitsArgs = field(default_factory=VitsArgs)
+    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)
+    # optimizer
+    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
+    lr_gen: float = 0.0002
+    lr_disc: float = 0.0002
+    lr_scheduler_gen: str = "ExponentialLR"
+    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    lr_scheduler_disc: str = "ExponentialLR"
+    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    scheduler_after_epoch: bool = True
+    optimizer: str = "AdamW"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
+    # loss params
+    kl_loss_alpha: float = 1.0
+    disc_loss_alpha: float = 1.0
+    gen_loss_alpha: float = 1.0
+    feat_loss_alpha: float = 1.0
+    mel_loss_alpha: float = 45.0
+    dur_loss_alpha: float = 1.0
+    speaker_encoder_loss_alpha: float = 1.0
+    # data loader params
+    return_wav: bool = True
+    compute_linear_spec: bool = True
+    # sampler params
+    use_weighted_sampler: bool = False  # TODO: move it to the base config
+    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
+    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
+    # overrides
+    r: int = 1  # DO NOT CHANGE
+    add_blank: bool = True
+    # testing
+    test_sentences: List[List] = field(
+        default_factory=lambda: [
+            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
+            ["Be a voice, not an echo."],
+            ["I'm sorry Dave. I'm afraid I can't do that."],
+            ["This cake is great. It's so delicious and moist."],
+            ["Prior to November 22, 1963."],
+        ]
+    )
+    # multi-speaker settings
+    # use speaker embedding layer
+    num_speakers: int = 0
+    use_speaker_embedding: bool = False
+    speakers_file: str = None
+    speaker_embedding_channels: int = 256
+    language_ids_file: str = None
+    use_language_embedding: bool = False
+    # use d-vectors
+    use_d_vector_file: bool = False
+    d_vector_file: List[str] = None
+    d_vector_dim: int = None
+    def __post_init__(self):
+        """Intentionally a no-op; the commented-out lines below are a disabled copy of model_args onto the config."""
+        pass
+        # for key, val in self.model_args.items():
+        #     if hasattr(self, key):
+        #         self[key] = val
+def parse_symbols():
+    """Return the module's default symbol sets keyed by role (there is no ``blank`` entry)."""
+    symbol_sets = dict(
+        pad=_pad,
+        eos=_eos,
+        bos=_bos,
+        characters=_characters,
+        punctuations=_punctuations,
+        phonemes=_phonemes,
+    )
+    return symbol_sets
+# DEFAULT SET OF GRAPHEMES
+# Special tokens (multi-character strings that each occupy one vocabulary slot).
+_pad = "<PAD>"
+_eos = "<EOS>"
+_bos = "<BOS>"
+_blank = "<BLNK>"  # TODO: check if we need this alongside with PAD
+# Upper- and lower-case ASCII letters.
+_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+# Default punctuation set; the trailing space is part of the set.
+_punctuations = "!'(),-.:;? "
+# DEFAULT SET OF IPA PHONEMES
+# Phonemes definition (All IPA characters)
+_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
+_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
+_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
+_suprasegmentals = "ˈˌːˑ"
+_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
+# (sic) the identifier is spelled "_diacrilics"; it is kept because it is referenced by name below.
+_diacrilics = "ɚ˞ɫ"
+# Full default phoneme set: all the IPA groups above, concatenated in this order.
+_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
+class BaseVocabulary:
+    """Base Vocabulary class.
+    Wraps a ready-made vocabulary instead of building one from character strings.
+    Token IDs are the positions obtained by iterating ``vocab`` (for a dict: its keys, in order).
+    Args:
+        vocab (Dict): A dictionary of characters and their corresponding indices.
+        pad (str): Padding token, if any.
+        blank (str): Blank token, if any.
+        bos (str): Beginning-of-sequence token, if any.
+        eos (str): End-of-sequence token, if any.
+    """
+    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
+        self.vocab = vocab
+        self.pad, self.blank, self.bos, self.eos = pad, blank, bos, eos
+    @property
+    def pad_id(self) -> int:
+        """Index of the padding token, or ``len(self.vocab)`` when no padding token is configured."""
+        if self.pad:
+            return self.char_to_id(self.pad)
+        return len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        """Index of the blank token, or ``len(self.vocab)`` when no blank token is configured."""
+        if self.blank:
+            return self.char_to_id(self.blank)
+        return len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        """Index of the bos token, or ``len(self.vocab)`` when no bos token is configured."""
+        if self.bos:
+            return self.char_to_id(self.bos)
+        return len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        """Index of the eos token, or ``len(self.vocab)`` when no eos token is configured."""
+        if self.eos:
+            return self.char_to_id(self.eos)
+        return len(self.vocab)
+    @property
+    def vocab(self):
+        """Vocabulary object most recently stored by the ``vocab`` setter."""
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        """Store ``vocab`` and rebuild both lookup tables; ``None`` clears all three."""
+        if vocab is None:
+            self._vocab = None
+            self._char_to_id = None
+            self._id_to_char = None
+            return
+        self._vocab = vocab
+        self._char_to_id = {token: position for position, token in enumerate(vocab)}
+        self._id_to_char = dict(enumerate(vocab))
+    @staticmethod
+    def init_from_config(config, **kwargs):
+        """Build a vocabulary from ``config.characters.vocab_dict`` when present and non-empty, else from ``kwargs``.
+        The config is returned unchanged either way.
+        """
+        chars_cfg = config.characters
+        if chars_cfg is not None and "vocab_dict" in chars_cfg and chars_cfg.vocab_dict:
+            vocabulary = BaseVocabulary(
+                chars_cfg.vocab_dict,
+                chars_cfg.pad,
+                chars_cfg.blank,
+                chars_cfg.bos,
+                chars_cfg.eos,
+            )
+            return vocabulary, config
+        return BaseVocabulary(**kwargs), config
+    def to_config(self):
+        """Export the vocabulary and special tokens as a ``CharactersConfig`` (uniqueness and sorting off)."""
+        settings = {
+            "vocab_dict": self._vocab,
+            "pad": self.pad,
+            "eos": self.eos,
+            "bos": self.bos,
+            "blank": self.blank,
+            "is_unique": False,
+            "is_sorted": False,
+        }
+        return CharactersConfig(**settings)
+    @property
+    def num_chars(self):
+        """Number of entries in the stored vocabulary."""
+        return len(self._vocab)
+    def char_to_id(self, char: str) -> int:
+        """Return the token ID of ``char``.
+        Raises:
+            KeyError: If ``char`` is not in the vocabulary.
+        """
+        try:
+            token_id = self._char_to_id[char]
+        except KeyError as missing:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from missing
+        return token_id
+    def id_to_char(self, idx: int) -> str:
+        """Return the token stored under ID ``idx``; an unknown ID propagates ``KeyError``."""
+        return self._id_to_char[idx]
+class BaseCharacters:
+    def __init__(
+        self,
+        characters: str = None,
+        punctuations: str = None,
+        pad: str = None,
+        eos: str = None,
+        bos: str = None,
+        blank: str = None,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        self._characters = characters
+        self._punctuations = punctuations
+        self._pad = pad
+        self._eos = eos
+        self._bos = bos
+        self._blank = blank
+        self.is_unique = is_unique
+        self.is_sorted = is_sorted
+        self._create_vocab()
+    @property
+    def pad_id(self) -> int:
+        return self.char_to_id(self.pad) if self.pad else len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        return self.char_to_id(self.blank) if self.blank else len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        return self.char_to_id(self.eos) if self.eos else len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        return self.char_to_id(self.bos) if self.bos else len(self.vocab)
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, characters):
+        self._characters = characters
+        self._create_vocab()
+    @property
+    def punctuations(self):
+        return self._punctuations
+    @punctuations.setter
+    def punctuations(self, punctuations):
+        self._punctuations = punctuations
+        self._create_vocab()
+    @property
+    def pad(self):
+        return self._pad
+    @pad.setter
+    def pad(self, pad):
+        self._pad = pad
+        self._create_vocab()
+    @property
+    def eos(self):
+        return self._eos
+    @eos.setter
+    def eos(self, eos):
+        self._eos = eos
+        self._create_vocab()
+    @property
+    def bos(self):
+        return self._bos
+    @bos.setter
+    def bos(self, bos):
+        self._bos = bos
+        self._create_vocab()
+    @property
+    def blank(self):
+        return self._blank
+    @blank.setter
+    def blank(self, blank):
+        self._blank = blank
+        self._create_vocab()
+    @property
+    def vocab(self):
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        self._vocab = vocab
+        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
+        self._id_to_char = {
+            idx: char for idx, char in enumerate(self.vocab)  # pylint: disable=unnecessary-comprehension
+        }
+    @property
+    def num_chars(self):
+        return len(self._vocab)
+    def _create_vocab(self):
+        _vocab = self._characters
+        if self.is_unique:
+            _vocab = list(set(_vocab))
+        if self.is_sorted:
+            _vocab = sorted(_vocab)
+        _vocab = list(_vocab)
+        _vocab = [self._blank] + _vocab if self._blank is not None and len(self._blank) > 0 else _vocab
+        _vocab = [self._bos] + _vocab if self._bos is not None and len(self._bos) > 0 else _vocab
+        _vocab = [self._eos] + _vocab if self._eos is not None and len(self._eos) > 0 else _vocab
+        _vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
+        self.vocab = _vocab + list(self._punctuations)
+        if self.is_unique:
+            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
+            assert (
+                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
+            ), f" [!] There are duplicate characters in the character set. {duplicates}"
+    def char_to_id(self, char: str) -> int:
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
+    def id_to_char(self, idx: int) -> str:
+        return self._id_to_char[idx]
+    def print_log(self, level: int = 0):
+        """
+        Prints the vocabulary in a nice format.
+        """
+        indent = "\t" * level
+        print(f"{indent}| > Characters: {self._characters}")
+        print(f"{indent}| > Punctuations: {self._punctuations}")
+        print(f"{indent}| > Pad: {self._pad}")
+        print(f"{indent}| > EOS: {self._eos}")
+        print(f"{indent}| > BOS: {self._bos}")
+        print(f"{indent}| > Blank: {self._blank}")
+        print(f"{indent}| > Vocab: {self.vocab}")
+        print(f"{indent}| > Num chars: {self.num_chars}")
+    @staticmethod
+    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
+        """Init your character class from a config.
+        Implement this method for your subclass.
+        """
+        # use character set from config
+        if config.characters is not None:
+            return BaseCharacters(**config.characters), config
+        # return default character set
+        characters = BaseCharacters()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=self._eos,
+            bos=self._bos,
+            blank=self._blank,
+            is_unique=self.is_unique,
+            is_sorted=self.is_sorted,
+        )
+class IPAPhonemes(BaseCharacters):
+    def __init__(
+        self,
+        characters: str = _phonemes,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Init a IPAPhonemes object from a model config
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        # band-aid for compatibility with old models
+        if "characters" in config and config.characters is not None:
+            if "phonemes" in config.characters and config.characters.phonemes is not None:
+                config.characters["characters"] = config.characters["phonemes"]
+            return (
+                IPAPhonemes(
+                    characters=config.characters["characters"],
+                    punctuations=config.characters["punctuations"],
+                    pad=config.characters["pad"],
+                    eos=config.characters["eos"],
+                    bos=config.characters["bos"],
+                    blank=config.characters["blank"],
+                    is_unique=config.characters["is_unique"],
+                    is_sorted=config.characters["is_sorted"],
+                ),
+                config,
+            )
+        # use character set from config
+        if config.characters is not None:
+            return IPAPhonemes(**config.characters), config
+        # return default character set
+        characters = IPAPhonemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+class Graphemes(BaseCharacters):
+    def __init__(
+        self,
+        characters: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Init a Graphemes object from a model config
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        if config.characters is not None:
+            # band-aid for compatibility with old models
+            if "phonemes" in config.characters:
+                return (
+                    Graphemes(
+                        characters=config.characters["characters"],
+                        punctuations=config.characters["punctuations"],
+                        pad=config.characters["pad"],
+                        eos=config.characters["eos"],
+                        bos=config.characters["bos"],
+                        blank=config.characters["blank"],
+                        is_unique=config.characters["is_unique"],
+                        is_sorted=config.characters["is_sorted"],
+                    ),
+                    config,
+                )
+            return Graphemes(**config.characters), config
+        characters = Graphemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+if __name__ == "__main__":
+    gr = Graphemes()
+    ph = IPAPhonemes()
+    gr.print_log()
+    ph.print_log()
+class VitsCharacters(BaseCharacters):
+    """Characters class for VITs model for compatibility with pre-trained models"""
+    def __init__(
+        self,
+        graphemes: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        ipa_characters: str = _phonemes,
+    ) -> None:
+        if ipa_characters is not None:
+            graphemes += ipa_characters
+        super().__init__(graphemes, punctuations, pad, None, None, "<BLNK>", is_unique=False, is_sorted=True)
+    def _create_vocab(self):
+        self._vocab = [self._pad] + list(self._punctuations) + list(self._characters) + [self._blank]
+        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
+        # pylint: disable=unnecessary-comprehension
+        self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
+    @staticmethod
+    def init_from_config(config):
+        _pad = config.characters.pad
+        _punctuations = config.characters.punctuations
+        _letters = config.characters.characters
+        _letters_ipa = config.characters.phonemes
+        return (
+            VitsCharacters(graphemes=_letters, ipa_characters=_letters_ipa, punctuations=_punctuations, pad=_pad),
+            config,
+        )
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=None,
+            bos=None,
+            blank=self._blank,
+            is_unique=False,
+            is_sorted=True,
+        )
+class TTSTokenizer:
+    def __init__(
+        self,
+        text_cleaner: Callable = None,
+        characters: "BaseCharacters" = None,
+    ):
+        self.text_cleaner = text_cleaner
+        self.characters = characters
+        self.not_found_characters = []
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, new_characters):
+        self._characters = new_characters
+        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
+        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None
+    def encode(self, text: str) -> List[int]:
+        """Encodes a string of text as a sequence of IDs."""
+        token_ids = []
+        for char in text:
+            try:
+                idx = self.characters.char_to_id(char)
+                token_ids.append(idx)
+            except KeyError:
+                # discard but store not found characters
+                if char not in self.not_found_characters:
+                    self.not_found_characters.append(char)
+                    print(text)
+                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
+        return token_ids
+    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
+        text = self.text_cleaner(text)
+        text = self.encode(text)
+        text = self.intersperse_blank_char(text, True)
+        return text
+    def pad_with_bos_eos(self, char_sequence: List[str]):
+        """Pads a sequence with the special BOS and EOS characters."""
+        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]
+    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
+        """Intersperses the blank character between characters in a sequence.
+        Use the ```blank``` character if defined else use the ```pad``` character.
+        """
+        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad
+        result = [char_to_use] * (len(char_sequence) * 2 + 1)
+        result[1::2] = char_sequence
+        return result
+    @staticmethod
+    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
+        text_cleaner = multilingual_cleaners
+        CharactersClass = VitsCharacters
+        characters, new_config = CharactersClass.init_from_config(config)
+        # new_config.characters.characters_class = get_import_path(characters)
+        new_config.characters.characters_class = VitsCharacters
+        return (
+            TTSTokenizer(text_cleaner, characters),new_config)
+def multilingual_cleaners(text):
+    """Pipeline for multilingual text"""
+    text = lowercase(text)
+    text = replace_symbols(text, lang=None)
+    text = remove_aux_symbols(text)
+    text = collapse_whitespace(text)
+    return text
+def lowercase(text):
+    return text.lower()
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text).strip()
+def replace_symbols(text, lang="en"):
+    text = text.replace(";", ",")
+    text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
+    text = text.replace(":", ",")
+    if lang == "en":
+        text = text.replace("&", " and ")
+    elif lang == "fr":
+        text = text.replace("&", " et ")
+    elif lang == "pt":
+        text = text.replace("&", " e ")
+    elif lang == "ca":
+        text = text.replace("&", " i ")
+        text = text.replace("'", "")
+    return text
+def remove_aux_symbols(text):
+    text = re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
+    return text

hi_female/hi_female_vits_30hrs.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2bcfb47f599b36e7cbfec27142604c366e538c17e89980a40519291f92a46327
+size 333261446

hi_female/jit_infer.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import os
+from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
+import torch
+import numpy as np
+#ch female
+with open("chars.txt", 'r') as f:
+    letters = f.read().strip('\n')
+model="hi_female_vits_30hrs.pt"
+text = "फिल्म गर्दिश में अमरीश पुरी के साथ जैकी श्रॉफ, ऐश्वर्या, डिंपल कपाड़िया"
+config = VitsConfig(
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class=VitsCharacters,
+        pad="<PAD>",
+        eos="<EOS>",
+        bos="<BOS>",
+        blank="<BLNK>",
+        characters=letters,
+        punctuations="!¡'(),-.:;¿? ",
+        phonemes=None)
+    )
+tokenizer, config = TTSTokenizer.init_from_config(config)
+x = tokenizer.text_to_ids(text)
+x = torch.from_numpy(np.array(x)).unsqueeze(0)
+net = torch.jit.load(model)
+with torch.no_grad():
+    out2 = net(x)
+import soundfile as sf
+sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)

hi_male/chars.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ शदऊतओसषमऱढै?ख़ौक़ड़ःिअनठय़ज़फ़्खँे।ंऋउ'हछङझ" ुणऔयघञृएईॆीपचॉॠवगडटइ,बॅूऐफजकलग़आधोथाभढ़ऑ

hi_male/extra.py ADDED Viewed

	@@ -0,0 +1,787 @@

+from typing import Callable, Dict, List, Union
+from dataclasses import asdict, dataclass, field
+import re
+from dataclasses import replace
+from typing import Dict
+_whitespace_re = re.compile(r"\s+")
+from dataclasses import dataclass, field
+from typing import List
+# from TTS.tts.configs.shared_configs import BaseTTSConfig
+# from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
+@dataclass
+class CharactersConfig():
+    """Character-set settings consumed by the vocabulary and character classes in this module."""
+    # Annotated as ``str``, but ``TTSTokenizer.init_from_config`` stores the ``VitsCharacters`` class here.
+    characters_class: str = None
+    # used by BaseVocabulary
+    vocab_dict: Dict = None
+    # used by BaseCharacters and its subclasses
+    pad: str = None
+    eos: str = None
+    bos: str = None
+    blank: str = None
+    characters: str = None
+    punctuations: str = None
+    phonemes: str = None
+    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
+    is_sorted: bool = True
+@dataclass
+class BaseTTSConfig():
+    """Settings shared by TTS model configs, kept as a plain dataclass.
+    NOTE(review): appears to be a trimmed stand-in for ``TTS.tts.configs.shared_configs.BaseTTSConfig``
+    (see the commented-out import at the top of the file) - confirm before relying on parity.
+    """
+    # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
+    # phoneme settings
+    use_phonemes: bool = False
+    phonemizer: str = None
+    phoneme_language: str = None
+    compute_input_seq_cache: bool = False
+    text_cleaner: str = None
+    enable_eos_bos_chars: bool = False
+    test_sentences_file: str = ""
+    phoneme_cache_path: str = None
+    # vocabulary parameters
+    characters: CharactersConfig = None
+    add_blank: bool = False
+    # training params
+    batch_group_size: int = 0
+    loss_masking: bool = None
+    # dataloading
+    min_audio_len: int = 1
+    # annotated ``int`` but defaults to float infinity, i.e. no upper bound
+    max_audio_len: int = float("inf")
+    min_text_len: int = 1
+    # annotated ``int`` but defaults to float infinity, i.e. no upper bound
+    max_text_len: int = float("inf")
+    compute_f0: bool = False
+    compute_energy: bool = False
+    compute_linear_spec: bool = False
+    precompute_num_workers: int = 0
+    use_noise_augment: bool = False
+    start_by_longest: bool = False
+    shuffle: bool = False
+    drop_last: bool = False
+    # dataset
+    datasets: str = None
+    # optimizer
+    optimizer: str = "radam"
+    optimizer_params: dict = None
+    # scheduler
+    lr_scheduler: str = None
+    lr_scheduler_params: dict = field(default_factory=lambda: {})
+    # testing
+    test_sentences: List[str] = field(default_factory=lambda: [])
+    # evaluation
+    eval_split_max_size: int = None
+    eval_split_size: float = 0.01
+    # weighted samplers
+    use_speaker_weighted_sampler: bool = False
+    speaker_weighted_sampler_alpha: float = 1.0
+    use_language_weighted_sampler: bool = False
+    language_weighted_sampler_alpha: float = 1.0
+    use_length_weighted_sampler: bool = False
+    length_weighted_sampler_alpha: float = 1.0
+@dataclass
+class VitsAudioConfig():
+    """Audio and spectrogram settings used as the default ``audio`` field of ``VitsConfig``."""
+    fft_size: int = 1024
+    sample_rate: int = 22050
+    win_length: int = 1024
+    hop_length: int = 256
+    num_mels: int = 80
+    mel_fmin: int = 0
+    mel_fmax: int = None
+@dataclass
+class VitsArgs():
+    """Model arguments used as the default ``model_args`` field of ``VitsConfig``.
+    NOTE(review): field names and defaults appear to mirror ``TTS.tts.models.vits.VitsArgs`` (see the
+    commented-out import at the top of the file) - confirm before relying on parity.
+    """
+    num_chars: int = 100
+    out_channels: int = 513
+    spec_segment_size: int = 32
+    hidden_channels: int = 192
+    hidden_channels_ffn_text_encoder: int = 768
+    num_heads_text_encoder: int = 2
+    num_layers_text_encoder: int = 6
+    kernel_size_text_encoder: int = 3
+    dropout_p_text_encoder: float = 0.1
+    dropout_p_duration_predictor: float = 0.5
+    kernel_size_posterior_encoder: int = 5
+    dilation_rate_posterior_encoder: int = 1
+    num_layers_posterior_encoder: int = 16
+    kernel_size_flow: int = 5
+    dilation_rate_flow: int = 1
+    num_layers_flow: int = 4
+    resblock_type_decoder: str = "1"
+    resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
+    resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
+    upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
+    upsample_initial_channel_decoder: int = 512
+    upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
+    periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
+    use_sdp: bool = True
+    noise_scale: float = 1.0
+    inference_noise_scale: float = 0.667
+    length_scale: float = 1
+    noise_scale_dp: float = 1.0
+    inference_noise_scale_dp: float = 1.0
+    max_inference_len: int = None
+    init_discriminator: bool = True
+    # "disriminator" is misspelled in the field name; left as is because the name is part of the interface
+    use_spectral_norm_disriminator: bool = False
+    use_speaker_embedding: bool = False
+    num_speakers: int = 0
+    speakers_file: str = None
+    d_vector_file: List[str] = None
+    speaker_embedding_channels: int = 256
+    use_d_vector_file: bool = False
+    d_vector_dim: int = 0
+    detach_dp_input: bool = True
+    use_language_embedding: bool = False
+    embedded_language_dim: int = 4
+    num_languages: int = 0
+    language_ids_file: str = None
+    use_speaker_encoder_as_loss: bool = False
+    speaker_encoder_config_path: str = ""
+    speaker_encoder_model_path: str = ""
+    condition_dp_on_speaker: bool = True
+    freeze_encoder: bool = False
+    freeze_DP: bool = False
+    freeze_PE: bool = False
+    freeze_flow_decoder: bool = False
+    freeze_waveform_decoder: bool = False
+    encoder_sample_rate: int = None
+    interpolate_z: bool = True
+    reinit_DP: bool = False
+    reinit_text_encoder: bool = False
+@dataclass
+class VitsConfig(BaseTTSConfig):
+    """VITS configuration: ``BaseTTSConfig`` plus model, audio, optimizer, loss and speaker settings.
+    Several inherited defaults are overridden here (``optimizer``, ``optimizer_params``,
+    ``compute_linear_spec``, ``add_blank``, ``test_sentences``).
+    """
+    model: str = "vits"
+    # model specific params
+    model_args: VitsArgs = field(default_factory=VitsArgs)
+    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)
+    # optimizer
+    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
+    lr_gen: float = 0.0002
+    lr_disc: float = 0.0002
+    lr_scheduler_gen: str = "ExponentialLR"
+    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    lr_scheduler_disc: str = "ExponentialLR"
+    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    scheduler_after_epoch: bool = True
+    optimizer: str = "AdamW"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
+    # loss params
+    kl_loss_alpha: float = 1.0
+    disc_loss_alpha: float = 1.0
+    gen_loss_alpha: float = 1.0
+    feat_loss_alpha: float = 1.0
+    mel_loss_alpha: float = 45.0
+    dur_loss_alpha: float = 1.0
+    speaker_encoder_loss_alpha: float = 1.0
+    # data loader params
+    return_wav: bool = True
+    compute_linear_spec: bool = True
+    # sampler params
+    use_weighted_sampler: bool = False  # TODO: move it to the base config
+    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
+    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
+    # overrides
+    r: int = 1  # DO NOT CHANGE
+    add_blank: bool = True
+    # testing
+    test_sentences: List[List] = field(
+        default_factory=lambda: [
+            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
+            ["Be a voice, not an echo."],
+            ["I'm sorry Dave. I'm afraid I can't do that."],
+            ["This cake is great. It's so delicious and moist."],
+            ["Prior to November 22, 1963."],
+        ]
+    )
+    # multi-speaker settings
+    # use speaker embedding layer
+    num_speakers: int = 0
+    use_speaker_embedding: bool = False
+    speakers_file: str = None
+    speaker_embedding_channels: int = 256
+    language_ids_file: str = None
+    use_language_embedding: bool = False
+    # use d-vectors
+    use_d_vector_file: bool = False
+    d_vector_file: List[str] = None
+    d_vector_dim: int = None
+    def __post_init__(self):
+        """No-op: the propagation of ``model_args`` values onto the config is commented out below."""
+        pass
+        # for key, val in self.model_args.items():
+        #     if hasattr(self, key):
+        #         self[key] = val
+def parse_symbols():
+    return {
+        "pad": _pad,
+        "eos": _eos,
+        "bos": _bos,
+        "characters": _characters,
+        "punctuations": _punctuations,
+        "phonemes": _phonemes,
+    }
+# DEFAULT SET OF GRAPHEMES
+# These constants are the default arguments of IPAPhonemes, Graphemes and VitsCharacters below.
+_pad = "<PAD>"
+_eos = "<EOS>"
+_bos = "<BOS>"
+_blank = "<BLNK>"  # TODO: check if we need this alongside with PAD
+_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+_punctuations = "!'(),-.:;? "
+# DEFAULT SET OF IPA PHONEMES
+# Phonemes definition (All IPA characters)
+_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
+_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
+_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
+_suprasegmentals = "ˈˌːˑ"
+_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
+# "diacrilics" is a misspelling of "diacritics"; the name is kept because it is a module-level identifier.
+_diacrilics = "ɚ˞ɫ"
+# Concatenation order matters: VitsCharacters keeps this order when assigning token IDs.
+_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
+class BaseVocabulary:
+    """Base Vocabulary class.
+    This class only needs a vocabulary dictionary without specifying the characters.
+    Args:
+        vocab (Dict): A dictionary of characters and their corresponding indices.
+    """
+    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
+        self.vocab = vocab
+        self.pad = pad
+        self.blank = blank
+        self.bos = bos
+        self.eos = eos
+    @property
+    def pad_id(self) -> int:
+        """Return the index of the padding character. If the padding character is not specified, return the length
+        of the vocabulary."""
+        return self.char_to_id(self.pad) if self.pad else len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        """Return the index of the blank character. If the blank character is not specified, return the length of
+        the vocabulary."""
+        return self.char_to_id(self.blank) if self.blank else len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        """Return the index of the bos character. If the bos character is not specified, return the length of the
+        vocabulary."""
+        return self.char_to_id(self.bos) if self.bos else len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        """Return the index of the eos character. If the eos character is not specified, return the length of the
+        vocabulary."""
+        return self.char_to_id(self.eos) if self.eos else len(self.vocab)
+    @property
+    def vocab(self):
+        """Return the vocabulary dictionary."""
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        """Set the vocabulary dictionary and character mapping dictionaries."""
+        self._vocab, self._char_to_id, self._id_to_char = None, None, None
+        if vocab is not None:
+            self._vocab = vocab
+            self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
+            self._id_to_char = {
+                idx: char for idx, char in enumerate(self._vocab)  # pylint: disable=unnecessary-comprehension
+            }
+    @staticmethod
+    def init_from_config(config, **kwargs):
+        """Initialize from the given config."""
+        if config.characters is not None and "vocab_dict" in config.characters and config.characters.vocab_dict:
+            return (
+                BaseVocabulary(
+                    config.characters.vocab_dict,
+                    config.characters.pad,
+                    config.characters.blank,
+                    config.characters.bos,
+                    config.characters.eos,
+                ),
+                config,
+            )
+        return BaseVocabulary(**kwargs), config
+    def to_config(self):
+        return CharactersConfig(
+            vocab_dict=self._vocab,
+            pad=self.pad,
+            eos=self.eos,
+            bos=self.bos,
+            blank=self.blank,
+            is_unique=False,
+            is_sorted=False,
+        )
+    @property
+    def num_chars(self):
+        """Return number of tokens in the vocabulary."""
+        return len(self._vocab)
+    def char_to_id(self, char: str) -> int:
+        """Map a character to an token ID."""
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
+    def id_to_char(self, idx: int) -> str:
+        """Map an token ID to a character."""
+        return self._id_to_char[idx]
+class BaseCharacters:
+    def __init__(
+        self,
+        characters: str = None,
+        punctuations: str = None,
+        pad: str = None,
+        eos: str = None,
+        bos: str = None,
+        blank: str = None,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        self._characters = characters
+        self._punctuations = punctuations
+        self._pad = pad
+        self._eos = eos
+        self._bos = bos
+        self._blank = blank
+        self.is_unique = is_unique
+        self.is_sorted = is_sorted
+        self._create_vocab()
+    @property
+    def pad_id(self) -> int:
+        return self.char_to_id(self.pad) if self.pad else len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        return self.char_to_id(self.blank) if self.blank else len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        return self.char_to_id(self.eos) if self.eos else len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        return self.char_to_id(self.bos) if self.bos else len(self.vocab)
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, characters):
+        self._characters = characters
+        self._create_vocab()
+    @property
+    def punctuations(self):
+        return self._punctuations
+    @punctuations.setter
+    def punctuations(self, punctuations):
+        self._punctuations = punctuations
+        self._create_vocab()
+    @property
+    def pad(self):
+        return self._pad
+    @pad.setter
+    def pad(self, pad):
+        self._pad = pad
+        self._create_vocab()
+    @property
+    def eos(self):
+        return self._eos
+    @eos.setter
+    def eos(self, eos):
+        self._eos = eos
+        self._create_vocab()
+    @property
+    def bos(self):
+        return self._bos
+    @bos.setter
+    def bos(self, bos):
+        self._bos = bos
+        self._create_vocab()
+    @property
+    def blank(self):
+        return self._blank
+    @blank.setter
+    def blank(self, blank):
+        self._blank = blank
+        self._create_vocab()
+    @property
+    def vocab(self):
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        self._vocab = vocab
+        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
+        self._id_to_char = {
+            idx: char for idx, char in enumerate(self.vocab)  # pylint: disable=unnecessary-comprehension
+        }
+    @property
+    def num_chars(self):
+        return len(self._vocab)
+    def _create_vocab(self):
+        _vocab = self._characters
+        if self.is_unique:
+            _vocab = list(set(_vocab))
+        if self.is_sorted:
+            _vocab = sorted(_vocab)
+        _vocab = list(_vocab)
+        _vocab = [self._blank] + _vocab if self._blank is not None and len(self._blank) > 0 else _vocab
+        _vocab = [self._bos] + _vocab if self._bos is not None and len(self._bos) > 0 else _vocab
+        _vocab = [self._eos] + _vocab if self._eos is not None and len(self._eos) > 0 else _vocab
+        _vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
+        self.vocab = _vocab + list(self._punctuations)
+        if self.is_unique:
+            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
+            assert (
+                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
+            ), f" [!] There are duplicate characters in the character set. {duplicates}"
+    def char_to_id(self, char: str) -> int:
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
+    def id_to_char(self, idx: int) -> str:
+        return self._id_to_char[idx]
+    def print_log(self, level: int = 0):
+        """
+        Prints the vocabulary in a nice format.
+        """
+        indent = "\t" * level
+        print(f"{indent}| > Characters: {self._characters}")
+        print(f"{indent}| > Punctuations: {self._punctuations}")
+        print(f"{indent}| > Pad: {self._pad}")
+        print(f"{indent}| > EOS: {self._eos}")
+        print(f"{indent}| > BOS: {self._bos}")
+        print(f"{indent}| > Blank: {self._blank}")
+        print(f"{indent}| > Vocab: {self.vocab}")
+        print(f"{indent}| > Num chars: {self.num_chars}")
+    @staticmethod
+    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
+        """Init your character class from a config.
+        Implement this method for your subclass.
+        """
+        # use character set from config
+        if config.characters is not None:
+            return BaseCharacters(**config.characters), config
+        # return default character set
+        characters = BaseCharacters()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=self._eos,
+            bos=self._bos,
+            blank=self._blank,
+            is_unique=self.is_unique,
+            is_sorted=self.is_sorted,
+        )
+class IPAPhonemes(BaseCharacters):
+    def __init__(
+        self,
+        characters: str = _phonemes,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Init a IPAPhonemes object from a model config
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        # band-aid for compatibility with old models
+        if "characters" in config and config.characters is not None:
+            if "phonemes" in config.characters and config.characters.phonemes is not None:
+                config.characters["characters"] = config.characters["phonemes"]
+            return (
+                IPAPhonemes(
+                    characters=config.characters["characters"],
+                    punctuations=config.characters["punctuations"],
+                    pad=config.characters["pad"],
+                    eos=config.characters["eos"],
+                    bos=config.characters["bos"],
+                    blank=config.characters["blank"],
+                    is_unique=config.characters["is_unique"],
+                    is_sorted=config.characters["is_sorted"],
+                ),
+                config,
+            )
+        # use character set from config
+        if config.characters is not None:
+            return IPAPhonemes(**config.characters), config
+        # return default character set
+        characters = IPAPhonemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+class Graphemes(BaseCharacters):
+    def __init__(
+        self,
+        characters: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Init a Graphemes object from a model config
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        if config.characters is not None:
+            # band-aid for compatibility with old models
+            if "phonemes" in config.characters:
+                return (
+                    Graphemes(
+                        characters=config.characters["characters"],
+                        punctuations=config.characters["punctuations"],
+                        pad=config.characters["pad"],
+                        eos=config.characters["eos"],
+                        bos=config.characters["bos"],
+                        blank=config.characters["blank"],
+                        is_unique=config.characters["is_unique"],
+                        is_sorted=config.characters["is_sorted"],
+                    ),
+                    config,
+                )
+            return Graphemes(**config.characters), config
+        characters = Graphemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+if __name__ == "__main__":
+    gr = Graphemes()
+    ph = IPAPhonemes()
+    gr.print_log()
+    ph.print_log()
+class VitsCharacters(BaseCharacters):
+    """Characters class for VITs model for compatibility with pre-trained models"""
+    def __init__(
+        self,
+        graphemes: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        ipa_characters: str = _phonemes,
+    ) -> None:
+        if ipa_characters is not None:
+            graphemes += ipa_characters
+        super().__init__(graphemes, punctuations, pad, None, None, "<BLNK>", is_unique=False, is_sorted=True)
+    def _create_vocab(self):
+        self._vocab = [self._pad] + list(self._punctuations) + list(self._characters) + [self._blank]
+        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
+        # pylint: disable=unnecessary-comprehension
+        self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
+    @staticmethod
+    def init_from_config(config):
+        _pad = config.characters.pad
+        _punctuations = config.characters.punctuations
+        _letters = config.characters.characters
+        _letters_ipa = config.characters.phonemes
+        return (
+            VitsCharacters(graphemes=_letters, ipa_characters=_letters_ipa, punctuations=_punctuations, pad=_pad),
+            config,
+        )
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=None,
+            bos=None,
+            blank=self._blank,
+            is_unique=False,
+            is_sorted=True,
+        )
+class TTSTokenizer:
+    def __init__(
+        self,
+        text_cleaner: Callable = None,
+        characters: "BaseCharacters" = None,
+    ):
+        self.text_cleaner = text_cleaner
+        self.characters = characters
+        self.not_found_characters = []
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, new_characters):
+        self._characters = new_characters
+        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
+        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None
+    def encode(self, text: str) -> List[int]:
+        """Encodes a string of text as a sequence of IDs."""
+        token_ids = []
+        for char in text:
+            try:
+                idx = self.characters.char_to_id(char)
+                token_ids.append(idx)
+            except KeyError:
+                # discard but store not found characters
+                if char not in self.not_found_characters:
+                    self.not_found_characters.append(char)
+                    print(text)
+                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
+        return token_ids
+    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
+        text = self.text_cleaner(text)
+        text = self.encode(text)
+        text = self.intersperse_blank_char(text, True)
+        return text
+    def pad_with_bos_eos(self, char_sequence: List[str]):
+        """Pads a sequence with the special BOS and EOS characters."""
+        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]
+    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
+        """Intersperses the blank character between characters in a sequence.
+        Use the ```blank``` character if defined else use the ```pad``` character.
+        """
+        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad
+        result = [char_to_use] * (len(char_sequence) * 2 + 1)
+        result[1::2] = char_sequence
+        return result
+    @staticmethod
+    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
+        text_cleaner = multilingual_cleaners
+        CharactersClass = VitsCharacters
+        characters, new_config = CharactersClass.init_from_config(config)
+        # new_config.characters.characters_class = get_import_path(characters)
+        new_config.characters.characters_class = VitsCharacters
+        return (
+            TTSTokenizer(text_cleaner, characters),new_config)
+def multilingual_cleaners(text):
+    """Pipeline for multilingual text"""
+    text = lowercase(text)
+    text = replace_symbols(text, lang=None)
+    text = remove_aux_symbols(text)
+    text = collapse_whitespace(text)
+    return text
+def lowercase(text):
+    """Return ``text`` converted to lower case."""
+    return text.lower()
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text).strip()
+def replace_symbols(text, lang="en"):
+    text = text.replace(";", ",")
+    text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
+    text = text.replace(":", ",")
+    if lang == "en":
+        text = text.replace("&", " and ")
+    elif lang == "fr":
+        text = text.replace("&", " et ")
+    elif lang == "pt":
+        text = text.replace("&", " e ")
+    elif lang == "ca":
+        text = text.replace("&", " i ")
+        text = text.replace("'", "")
+    return text
+def remove_aux_symbols(text):
+    text = re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
+    return text

hi_male/hi_male_vits_30hrs.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb36eca2d90214662f1647e83eb6979ead93b72f269606c6411f52959acf77a8
+size 333256012

hi_male/jit_infer.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import os
+from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
+import torch
+import numpy as np
+#ch female
+with open("chars.txt", 'r') as f:
+    letters = f.read().strip('\n')
+model="hi_male_vits_30hrs.pt"
+text = "फिल्म गर्दिश में अमरीश पुरी के साथ जैकी श्रॉफ, ऐश्वर्या, डिंपल कपाड़िया"
+config = VitsConfig(
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class=VitsCharacters,
+        pad="<PAD>",
+        eos="<EOS>",
+        bos="<BOS>",
+        blank="<BLNK>",
+        characters=letters,
+        punctuations="!¡'(),-.:;¿? ",
+        phonemes=None)
+    )
+tokenizer, config = TTSTokenizer.init_from_config(config)
+x = tokenizer.text_to_ids(text)
+x = torch.from_numpy(np.array(x)).unsqueeze(0)
+net = torch.jit.load(model)
+with torch.no_grad():
+    out2 = net(x)
+import soundfile as sf
+sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)

hne_female/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

hne_female/README.md ADDED Viewed

	@@ -0,0 +1,3 @@

+---
+license: cc-by-4.0
+---

hne_female/ch_female_vits_30hrs.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c3393916262f03807d8338aa8dce79379582c71a0ada346457e36ea6f72a6635
+size 333255366

hne_female/chars.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ खछगचऊुलशौढ़इणज़झैठढजफ़औ्ड़फूेानटॅयव़ऋदप.थअँऑआघहतषरसभउञडएईऐक़ िओ?धी,ॉंख़कोबमृ

hne_female/extra.py ADDED Viewed

	@@ -0,0 +1,787 @@

+from typing import Callable, Dict, List, Union
+from dataclasses import asdict, dataclass, field
+import re
+from dataclasses import replace
+from typing import Dict
+_whitespace_re = re.compile(r"\s+")
+from dataclasses import dataclass, field
+from typing import List
+# from TTS.tts.configs.shared_configs import BaseTTSConfig
+# from TTS.tts.models.vits import VitsArgs, VitsAudioConfig
+@dataclass
+class CharactersConfig():
+    characters_class: str = None
+    # using BaseVocabulary
+    vocab_dict: Dict = None
+    # using on BaseCharacters
+    pad: str = None
+    eos: str = None
+    bos: str = None
+    blank: str = None
+    characters: str = None
+    punctuations: str = None
+    phonemes: str = None
+    is_unique: bool = True  # for backwards compatibility of models trained with char sets with duplicates
+    is_sorted: bool = True
+@dataclass
+class BaseTTSConfig():
+    """Plain dataclass of shared TTS configuration fields and their default values.
+    NOTE(review): presumably a dependency-free stand-in for the ``BaseTTSConfig`` named in
+    the commented-out ``TTS.tts.configs.shared_configs`` import - confirm.
+    """
+    # audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
+    # phoneme settings
+    use_phonemes: bool = False
+    phonemizer: str = None
+    phoneme_language: str = None
+    compute_input_seq_cache: bool = False
+    text_cleaner: str = None
+    enable_eos_bos_chars: bool = False
+    test_sentences_file: str = ""
+    phoneme_cache_path: str = None
+    # vocabulary parameters
+    characters: CharactersConfig = None
+    add_blank: bool = False
+    # training params
+    batch_group_size: int = 0
+    loss_masking: bool = None
+    # dataloading
+    min_audio_len: int = 1
+    max_audio_len: int = float("inf")  # annotated int, but the default is a float (no upper bound)
+    min_text_len: int = 1
+    max_text_len: int = float("inf")  # annotated int, but the default is a float (no upper bound)
+    compute_f0: bool = False
+    compute_energy: bool = False
+    compute_linear_spec: bool = False
+    precompute_num_workers: int = 0
+    use_noise_augment: bool = False
+    start_by_longest: bool = False
+    shuffle: bool = False
+    drop_last: bool = False
+    # dataset
+    datasets: str = None
+    # optimizer
+    optimizer: str = "radam"
+    optimizer_params: dict = None
+    # scheduler
+    lr_scheduler: str = None
+    lr_scheduler_params: dict = field(default_factory=lambda: {})
+    # testing
+    test_sentences: List[str] = field(default_factory=lambda: [])
+    # evaluation
+    eval_split_max_size: int = None
+    eval_split_size: float = 0.01
+    # weighted samplers
+    use_speaker_weighted_sampler: bool = False
+    speaker_weighted_sampler_alpha: float = 1.0
+    use_language_weighted_sampler: bool = False
+    language_weighted_sampler_alpha: float = 1.0
+    use_length_weighted_sampler: bool = False
+    length_weighted_sampler_alpha: float = 1.0
+@dataclass
+class VitsAudioConfig():
+    """Audio parameters of the VITS model with their default values."""
+    fft_size: int = 1024
+    sample_rate: int = 22050
+    win_length: int = 1024
+    hop_length: int = 256
+    num_mels: int = 80
+    mel_fmin: int = 0
+    mel_fmax: int = None
+@dataclass
+class VitsArgs():
+    """Model arguments of the VITS model with their default values.
+    List-valued defaults use ``default_factory`` so every instance gets its own list.
+    """
+    num_chars: int = 100
+    out_channels: int = 513
+    spec_segment_size: int = 32
+    hidden_channels: int = 192
+    hidden_channels_ffn_text_encoder: int = 768
+    num_heads_text_encoder: int = 2
+    num_layers_text_encoder: int = 6
+    kernel_size_text_encoder: int = 3
+    dropout_p_text_encoder: float = 0.1
+    dropout_p_duration_predictor: float = 0.5
+    kernel_size_posterior_encoder: int = 5
+    dilation_rate_posterior_encoder: int = 1
+    num_layers_posterior_encoder: int = 16
+    kernel_size_flow: int = 5
+    dilation_rate_flow: int = 1
+    num_layers_flow: int = 4
+    resblock_type_decoder: str = "1"
+    resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
+    resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
+    upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
+    upsample_initial_channel_decoder: int = 512
+    upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
+    periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
+    use_sdp: bool = True
+    noise_scale: float = 1.0
+    inference_noise_scale: float = 0.667
+    length_scale: float = 1
+    noise_scale_dp: float = 1.0
+    inference_noise_scale_dp: float = 1.0
+    max_inference_len: int = None
+    init_discriminator: bool = True
+    use_spectral_norm_disriminator: bool = False
+    use_speaker_embedding: bool = False
+    num_speakers: int = 0
+    speakers_file: str = None
+    d_vector_file: List[str] = None
+    speaker_embedding_channels: int = 256
+    use_d_vector_file: bool = False
+    d_vector_dim: int = 0
+    detach_dp_input: bool = True
+    use_language_embedding: bool = False
+    embedded_language_dim: int = 4
+    num_languages: int = 0
+    language_ids_file: str = None
+    use_speaker_encoder_as_loss: bool = False
+    speaker_encoder_config_path: str = ""
+    speaker_encoder_model_path: str = ""
+    condition_dp_on_speaker: bool = True
+    freeze_encoder: bool = False
+    freeze_DP: bool = False
+    freeze_PE: bool = False
+    freeze_flow_decoder: bool = False
+    freeze_waveform_decoder: bool = False
+    encoder_sample_rate: int = None
+    interpolate_z: bool = True
+    reinit_DP: bool = False
+    reinit_text_encoder: bool = False
+@dataclass
+class VitsConfig(BaseTTSConfig):
+    """Configuration of the VITS model.
+    Extends ``BaseTTSConfig`` and overrides some of its defaults (``optimizer``,
+    ``optimizer_params``, ``compute_linear_spec``, ``add_blank``, ``test_sentences``).
+    """
+    model: str = "vits"
+    # model specific params
+    model_args: VitsArgs = field(default_factory=VitsArgs)
+    audio: VitsAudioConfig = field(default_factory=VitsAudioConfig)
+    # optimizer
+    grad_clip: List[float] = field(default_factory=lambda: [1000, 1000])
+    lr_gen: float = 0.0002
+    lr_disc: float = 0.0002
+    lr_scheduler_gen: str = "ExponentialLR"
+    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    lr_scheduler_disc: str = "ExponentialLR"
+    lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
+    scheduler_after_epoch: bool = True
+    optimizer: str = "AdamW"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
+    # loss params
+    kl_loss_alpha: float = 1.0
+    disc_loss_alpha: float = 1.0
+    gen_loss_alpha: float = 1.0
+    feat_loss_alpha: float = 1.0
+    mel_loss_alpha: float = 45.0
+    dur_loss_alpha: float = 1.0
+    speaker_encoder_loss_alpha: float = 1.0
+    # data loader params
+    return_wav: bool = True
+    compute_linear_spec: bool = True
+    # sampler params
+    use_weighted_sampler: bool = False  # TODO: move it to the base config
+    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
+    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
+    # overrides
+    r: int = 1  # DO NOT CHANGE
+    add_blank: bool = True
+    # testing
+    test_sentences: List[List] = field(
+        default_factory=lambda: [
+            ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."],
+            ["Be a voice, not an echo."],
+            ["I'm sorry Dave. I'm afraid I can't do that."],
+            ["This cake is great. It's so delicious and moist."],
+            ["Prior to November 22, 1963."],
+        ]
+    )
+    # multi-speaker settings
+    # use speaker embedding layer
+    num_speakers: int = 0
+    use_speaker_embedding: bool = False
+    speakers_file: str = None
+    speaker_embedding_channels: int = 256
+    language_ids_file: str = None
+    use_language_embedding: bool = False
+    # use d-vectors
+    use_d_vector_file: bool = False
+    d_vector_file: List[str] = None
+    d_vector_dim: int = None
+    def __post_init__(self):
+        """Intentionally a no-op; the propagation of ``model_args`` values shown below is disabled."""
+        pass
+        # for key, val in self.model_args.items():
+        #     if hasattr(self, key):
+        #         self[key] = val
+def parse_symbols():
+    return {
+        "pad": _pad,
+        "eos": _eos,
+        "bos": _bos,
+        "characters": _characters,
+        "punctuations": _punctuations,
+        "phonemes": _phonemes,
+    }
+# DEFAULT SET OF GRAPHEMES
+# Special tokens used as defaults by the character classes below.
+_pad = "<PAD>"
+_eos = "<EOS>"
+_bos = "<BOS>"
+_blank = "<BLNK>"  # TODO: check if we need this alongside with PAD
+_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+_punctuations = "!'(),-.:;? "
+# DEFAULT SET OF IPA PHONEMES
+# Phonemes definition (All IPA characters)
+_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
+_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
+_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
+_suprasegmentals = "ˈˌːˑ"
+_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
+_diacrilics = "ɚ˞ɫ"  # name is a misspelling of "diacritics"; kept because it is referenced below
+# Full default phoneme inventory: concatenation of all groups above, in this order.
+_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics
+class BaseVocabulary:
+    """Base Vocabulary class.
+    This class only needs a vocabulary dictionary without specifying the characters.
+    Args:
+        vocab (Dict): A dictionary of characters and their corresponding indices.
+    """
+    def __init__(self, vocab: Dict, pad: str = None, blank: str = None, bos: str = None, eos: str = None):
+        self.vocab = vocab
+        self.pad = pad
+        self.blank = blank
+        self.bos = bos
+        self.eos = eos
+    @property
+    def pad_id(self) -> int:
+        """Return the index of the padding character. If the padding character is not specified, return the length
+        of the vocabulary."""
+        return self.char_to_id(self.pad) if self.pad else len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        """Return the index of the blank character. If the blank character is not specified, return the length of
+        the vocabulary."""
+        return self.char_to_id(self.blank) if self.blank else len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        """Return the index of the bos character. If the bos character is not specified, return the length of the
+        vocabulary."""
+        return self.char_to_id(self.bos) if self.bos else len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        """Return the index of the eos character. If the eos character is not specified, return the length of the
+        vocabulary."""
+        return self.char_to_id(self.eos) if self.eos else len(self.vocab)
+    @property
+    def vocab(self):
+        """Return the vocabulary dictionary."""
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        """Set the vocabulary dictionary and character mapping dictionaries."""
+        self._vocab, self._char_to_id, self._id_to_char = None, None, None
+        if vocab is not None:
+            self._vocab = vocab
+            self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
+            self._id_to_char = {
+                idx: char for idx, char in enumerate(self._vocab)  # pylint: disable=unnecessary-comprehension
+            }
+    @staticmethod
+    def init_from_config(config, **kwargs):
+        """Initialize from the given config."""
+        if config.characters is not None and "vocab_dict" in config.characters and config.characters.vocab_dict:
+            return (
+                BaseVocabulary(
+                    config.characters.vocab_dict,
+                    config.characters.pad,
+                    config.characters.blank,
+                    config.characters.bos,
+                    config.characters.eos,
+                ),
+                config,
+            )
+        return BaseVocabulary(**kwargs), config
+    def to_config(self):
+        return CharactersConfig(
+            vocab_dict=self._vocab,
+            pad=self.pad,
+            eos=self.eos,
+            bos=self.bos,
+            blank=self.blank,
+            is_unique=False,
+            is_sorted=False,
+        )
+    @property
+    def num_chars(self):
+        """Return number of tokens in the vocabulary."""
+        return len(self._vocab)
+    def char_to_id(self, char: str) -> int:
+        """Map a character to an token ID."""
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
+    def id_to_char(self, idx: int) -> str:
+        """Map an token ID to a character."""
+        return self._id_to_char[idx]
+class BaseCharacters:
+    def __init__(
+        self,
+        characters: str = None,
+        punctuations: str = None,
+        pad: str = None,
+        eos: str = None,
+        bos: str = None,
+        blank: str = None,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        self._characters = characters
+        self._punctuations = punctuations
+        self._pad = pad
+        self._eos = eos
+        self._bos = bos
+        self._blank = blank
+        self.is_unique = is_unique
+        self.is_sorted = is_sorted
+        self._create_vocab()
+    @property
+    def pad_id(self) -> int:
+        return self.char_to_id(self.pad) if self.pad else len(self.vocab)
+    @property
+    def blank_id(self) -> int:
+        return self.char_to_id(self.blank) if self.blank else len(self.vocab)
+    @property
+    def eos_id(self) -> int:
+        return self.char_to_id(self.eos) if self.eos else len(self.vocab)
+    @property
+    def bos_id(self) -> int:
+        return self.char_to_id(self.bos) if self.bos else len(self.vocab)
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, characters):
+        self._characters = characters
+        self._create_vocab()
+    @property
+    def punctuations(self):
+        return self._punctuations
+    @punctuations.setter
+    def punctuations(self, punctuations):
+        self._punctuations = punctuations
+        self._create_vocab()
+    @property
+    def pad(self):
+        return self._pad
+    @pad.setter
+    def pad(self, pad):
+        self._pad = pad
+        self._create_vocab()
+    @property
+    def eos(self):
+        return self._eos
+    @eos.setter
+    def eos(self, eos):
+        self._eos = eos
+        self._create_vocab()
+    @property
+    def bos(self):
+        return self._bos
+    @bos.setter
+    def bos(self, bos):
+        self._bos = bos
+        self._create_vocab()
+    @property
+    def blank(self):
+        return self._blank
+    @blank.setter
+    def blank(self, blank):
+        self._blank = blank
+        self._create_vocab()
+    @property
+    def vocab(self):
+        return self._vocab
+    @vocab.setter
+    def vocab(self, vocab):
+        self._vocab = vocab
+        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
+        self._id_to_char = {
+            idx: char for idx, char in enumerate(self.vocab)  # pylint: disable=unnecessary-comprehension
+        }
+    @property
+    def num_chars(self):
+        return len(self._vocab)
+    def _create_vocab(self):
+        _vocab = self._characters
+        if self.is_unique:
+            _vocab = list(set(_vocab))
+        if self.is_sorted:
+            _vocab = sorted(_vocab)
+        _vocab = list(_vocab)
+        _vocab = [self._blank] + _vocab if self._blank is not None and len(self._blank) > 0 else _vocab
+        _vocab = [self._bos] + _vocab if self._bos is not None and len(self._bos) > 0 else _vocab
+        _vocab = [self._eos] + _vocab if self._eos is not None and len(self._eos) > 0 else _vocab
+        _vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
+        self.vocab = _vocab + list(self._punctuations)
+        if self.is_unique:
+            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
+            assert (
+                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
+            ), f" [!] There are duplicate characters in the character set. {duplicates}"
+    def char_to_id(self, char: str) -> int:
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e
+    def id_to_char(self, idx: int) -> str:
+        return self._id_to_char[idx]
+    def print_log(self, level: int = 0):
+        """
+        Prints the vocabulary in a nice format.
+        """
+        indent = "\t" * level
+        print(f"{indent}| > Characters: {self._characters}")
+        print(f"{indent}| > Punctuations: {self._punctuations}")
+        print(f"{indent}| > Pad: {self._pad}")
+        print(f"{indent}| > EOS: {self._eos}")
+        print(f"{indent}| > BOS: {self._bos}")
+        print(f"{indent}| > Blank: {self._blank}")
+        print(f"{indent}| > Vocab: {self.vocab}")
+        print(f"{indent}| > Num chars: {self.num_chars}")
+    @staticmethod
+    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
+        """Init your character class from a config.
+        Implement this method for your subclass.
+        """
+        # use character set from config
+        if config.characters is not None:
+            return BaseCharacters(**config.characters), config
+        # return default character set
+        characters = BaseCharacters()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=self._eos,
+            bos=self._bos,
+            blank=self._blank,
+            is_unique=self.is_unique,
+            is_sorted=self.is_sorted,
+        )
+class IPAPhonemes(BaseCharacters):
+    def __init__(
+        self,
+        characters: str = _phonemes,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Init a IPAPhonemes object from a model config
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        # band-aid for compatibility with old models
+        if "characters" in config and config.characters is not None:
+            if "phonemes" in config.characters and config.characters.phonemes is not None:
+                config.characters["characters"] = config.characters["phonemes"]
+            return (
+                IPAPhonemes(
+                    characters=config.characters["characters"],
+                    punctuations=config.characters["punctuations"],
+                    pad=config.characters["pad"],
+                    eos=config.characters["eos"],
+                    bos=config.characters["bos"],
+                    blank=config.characters["blank"],
+                    is_unique=config.characters["is_unique"],
+                    is_sorted=config.characters["is_sorted"],
+                ),
+                config,
+            )
+        # use character set from config
+        if config.characters is not None:
+            return IPAPhonemes(**config.characters), config
+        # return default character set
+        characters = IPAPhonemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+class Graphemes(BaseCharacters):
+    def __init__(
+        self,
+        characters: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        eos: str = _eos,
+        bos: str = _bos,
+        blank: str = _blank,
+        is_unique: bool = False,
+        is_sorted: bool = True,
+    ) -> None:
+        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)
+    @staticmethod
+    def init_from_config(config: "Coqpit"):
+        """Init a Graphemes object from a model config
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        if config.characters is not None:
+            # band-aid for compatibility with old models
+            if "phonemes" in config.characters:
+                return (
+                    Graphemes(
+                        characters=config.characters["characters"],
+                        punctuations=config.characters["punctuations"],
+                        pad=config.characters["pad"],
+                        eos=config.characters["eos"],
+                        bos=config.characters["bos"],
+                        blank=config.characters["blank"],
+                        is_unique=config.characters["is_unique"],
+                        is_sorted=config.characters["is_sorted"],
+                    ),
+                    config,
+                )
+            return Graphemes(**config.characters), config
+        characters = Graphemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config
+if __name__ == "__main__":
+    gr = Graphemes()
+    ph = IPAPhonemes()
+    gr.print_log()
+    ph.print_log()
+class VitsCharacters(BaseCharacters):
+    """Characters class for VITs model for compatibility with pre-trained models"""
+    def __init__(
+        self,
+        graphemes: str = _characters,
+        punctuations: str = _punctuations,
+        pad: str = _pad,
+        ipa_characters: str = _phonemes,
+    ) -> None:
+        if ipa_characters is not None:
+            graphemes += ipa_characters
+        super().__init__(graphemes, punctuations, pad, None, None, "<BLNK>", is_unique=False, is_sorted=True)
+    def _create_vocab(self):
+        self._vocab = [self._pad] + list(self._punctuations) + list(self._characters) + [self._blank]
+        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
+        # pylint: disable=unnecessary-comprehension
+        self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
+    @staticmethod
+    def init_from_config(config):
+        _pad = config.characters.pad
+        _punctuations = config.characters.punctuations
+        _letters = config.characters.characters
+        _letters_ipa = config.characters.phonemes
+        return (
+            VitsCharacters(graphemes=_letters, ipa_characters=_letters_ipa, punctuations=_punctuations, pad=_pad),
+            config,
+        )
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=None,
+            bos=None,
+            blank=self._blank,
+            is_unique=False,
+            is_sorted=True,
+        )
+class TTSTokenizer:
+    def __init__(
+        self,
+        text_cleaner: Callable = None,
+        characters: "BaseCharacters" = None,
+    ):
+        self.text_cleaner = text_cleaner
+        self.characters = characters
+        self.not_found_characters = []
+    @property
+    def characters(self):
+        return self._characters
+    @characters.setter
+    def characters(self, new_characters):
+        self._characters = new_characters
+        self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
+        self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None
+    def encode(self, text: str) -> List[int]:
+        """Encodes a string of text as a sequence of IDs."""
+        token_ids = []
+        for char in text:
+            try:
+                idx = self.characters.char_to_id(char)
+                token_ids.append(idx)
+            except KeyError:
+                # discard but store not found characters
+                if char not in self.not_found_characters:
+                    self.not_found_characters.append(char)
+                    print(text)
+                    print(f" [!] Character {repr(char)} not found in the vocabulary. Discarding it.")
+        return token_ids
+    def text_to_ids(self, text: str, language: str = None) -> List[int]:  # pylint: disable=unused-argument
+        text = self.text_cleaner(text)
+        text = self.encode(text)
+        text = self.intersperse_blank_char(text, True)
+        return text
+    def pad_with_bos_eos(self, char_sequence: List[str]):
+        """Pads a sequence with the special BOS and EOS characters."""
+        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]
+    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
+        """Intersperses the blank character between characters in a sequence.
+        Use the ```blank``` character if defined else use the ```pad``` character.
+        """
+        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad
+        result = [char_to_use] * (len(char_sequence) * 2 + 1)
+        result[1::2] = char_sequence
+        return result
+    @staticmethod
+    def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
+        text_cleaner = multilingual_cleaners
+        CharactersClass = VitsCharacters
+        characters, new_config = CharactersClass.init_from_config(config)
+        # new_config.characters.characters_class = get_import_path(characters)
+        new_config.characters.characters_class = VitsCharacters
+        return (
+            TTSTokenizer(text_cleaner, characters),new_config)
+def multilingual_cleaners(text):
+    """Pipeline for multilingual text"""
+    text = lowercase(text)
+    text = replace_symbols(text, lang=None)
+    text = remove_aux_symbols(text)
+    text = collapse_whitespace(text)
+    return text
+def lowercase(text):
+    return text.lower()
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text).strip()
+def replace_symbols(text, lang="en"):
+    text = text.replace(";", ",")
+    text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
+    text = text.replace(":", ",")
+    if lang == "en":
+        text = text.replace("&", " and ")
+    elif lang == "fr":
+        text = text.replace("&", " et ")
+    elif lang == "pt":
+        text = text.replace("&", " e ")
+    elif lang == "ca":
+        text = text.replace("&", " i ")
+        text = text.replace("'", "")
+    return text
+def remove_aux_symbols(text):
+    text = re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
+    return text

hne_female/jit_infer.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import os
+from extra import TTSTokenizer, VitsConfig, CharactersConfig, VitsCharacters
+import torch
+import numpy as np
+# ch female: Chhattisgarhi (hne) female voice — character inventory, checkpoint file and sample sentence
+letters="खछगचऊुलशौढ़इणज़झैठढजफ़औ्ड़फूेानटॅयव़ऋदप.थअँऑआघहतषरसभउञडएईऐक़ िओ?धी,ॉंख़कोबमृ"
+model="ch_female_vits_30hrs.pt"
+text = "पेरिविंकल के जड़, उपजी अउ पत्त्ता मन ह बिकट उपयोगी हे"
+# Describe the text front end: the cleaner name plus the character inventory
+# (special tokens, the letters defined above, punctuation; no phoneme set).
+config = VitsConfig(
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class=VitsCharacters,
+        pad="<PAD>",
+        eos="<EOS>",
+        bos="<BOS>",
+        blank="<BLNK>",
+        characters=letters,
+        punctuations="!¡'(),-.:;¿? ",
+        phonemes=None)
+    )
+# Build the tokenizer from the config; the call also hands back a config object.
+tokenizer, config = TTSTokenizer.init_from_config(config)
+# Convert the sample sentence to a list of token IDs.
+x = tokenizer.text_to_ids(text)
+# Wrap the IDs in a tensor and add a leading dimension of size 1
+# (presumably the batch axis the model expects — TODO confirm).
+x = torch.from_numpy(np.array(x)).unsqueeze(0)
+# Load the TorchScript module stored at the path in `model`.
+net = torch.jit.load(model)
+# Run the model with gradient tracking disabled.
+with torch.no_grad():
+    out2 = net(x)
+# NOTE(review): soundfile is imported late; it is only needed to write the result.
+import soundfile as sf
+# Drop size-1 dimensions, move the output to CPU and write it to jit.wav
+# with a sample rate of 22050 Hz.
+sf.write("jit.wav", out2.squeeze().cpu().numpy(), 22050)

hne_male/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

hne_male/README.md ADDED Viewed

	@@ -0,0 +1,3 @@

+---
+license: cc-by-4.0
+---

hne_male/ch_male_vits_30hrs.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6ef1fb370e1a8bc844aca171316a909461521ce6afa1371d15d5f8c765cda4d9
+size 333250252