thanks to fishaudio ❤
- README.md +61 -0
- config.json +21 -0
- firefly-gan-vq-fsq-8x1024-21hz-generator.pth +3 -0
- model.pth +3 -0
- special_tokens_map.json +23 -0
- tokenizer.json +0 -0
- tokenizer_config.json +82 -0
README.md
ADDED
@@ -0,0 +1,61 @@
---
tags:
- text-to-speech
license: cc-by-nc-sa-4.0
language:
- zh
- en
- de
- ja
- fr
- es
- ko
- ar
pipeline_tag: text-to-speech
inference: false
extra_gated_prompt: >-
  You agree not to use the model to generate content that violates the DMCA or
  local laws.
extra_gated_fields:
  Country: country
  Specific date: date_picker
  I agree to use this model for non-commercial use ONLY: checkbox
---

# Fish Speech V1.4

**Fish Speech V1.4** is a leading text-to-speech (TTS) model trained on 700k hours of audio data in multiple languages.

Supported languages:
- English (en): ~300k hours
- Chinese (zh): ~300k hours
- German (de): ~20k hours
- Japanese (ja): ~20k hours
- French (fr): ~20k hours
- Spanish (es): ~20k hours
- Korean (ko): ~20k hours
- Arabic (ar): ~20k hours

Please refer to the [Fish Speech GitHub repository](https://github.com/fishaudio/fish-speech) for more information.
A demo is available at [Fish Audio](https://fish.audio/).

## Citation

If you find this repository useful, please consider citing this work:

```
@misc{fish-speech-v1.4,
  author = {Shijia Liao and Tianyu Li and others},
  title = {Fish Speech V1.4},
  year = {2024},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/fishaudio/fish-speech}}
}
```

## License

This model is released under the CC-BY-NC-SA-4.0 license.
The source code is released under the BSD-3-Clause license.
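As a practical note, everything this commit adds can be fetched in one call with `huggingface_hub`. A minimal sketch; the `repo_id` below is an assumption based on where this card is published:

```python
# Download sketch: pulls model.pth, config.json, the tokenizer files, and
# the firefly vocoder in one call. The repo_id is an assumption.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(repo_id="fishaudio/fish-speech-1.4")
print(local_dir)
```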
config.json
ADDED
@@ -0,0 +1,21 @@
{
  "attention_qkv_bias": false,
  "codebook_size": 1024,
  "dim": 1024,
  "dropout": 0.1,
  "head_dim": 64,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_seq_len": 4096,
  "model_type": "dual_ar",
  "n_fast_layer": 4,
  "n_head": 16,
  "n_layer": 24,
  "n_local_heads": 2,
  "norm_eps": 1e-06,
  "num_codebooks": 8,
  "rope_base": 1000000.0,
  "tie_word_embeddings": false,
  "use_gradient_checkpointing": true,
  "vocab_size": 32000
}
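The config describes a `dual_ar` transformer: a 24-layer, 1024-wide backbone with a 4-layer fast head over 8 codebooks of 1024 entries each. A stdlib-only sanity check of the quantities it implies (the file path is an assumption):

```python
# Config sanity-check sketch; reads config.json from the working directory.
import json

with open("config.json") as f:
    cfg = json.load(f)

# Model width must factor into attention heads: 16 heads * 64 dims = 1024.
assert cfg["n_head"] * cfg["head_dim"] == cfg["dim"]

# n_local_heads < n_head suggests grouped-query attention for keys/values.
print("query heads per KV head:", cfg["n_head"] // cfg["n_local_heads"])  # 8
print("semantic vocab:", cfg["num_codebooks"], "x", cfg["codebook_size"])  # 8 x 1024
```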
firefly-gan-vq-fsq-8x1024-21hz-generator.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:01b81dbf753224a156c3fe139b88bf0b9a0f54b11bee864f95e66511c3ccd754
size 188518579
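Both `.pth` entries are Git LFS pointers: the repository stores only the spec version, the blob's SHA-256 (`oid`), and its byte size, while the weights live in LFS storage. That makes integrity checks easy after download; a minimal sketch:

```python
# LFS verification sketch: recompute SHA-256 and size of a fetched blob
# and compare them with the pointer's oid/size fields.
import hashlib

def verify_lfs_blob(path: str, oid: str, size: int) -> bool:
    h, n = hashlib.sha256(), 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
            n += len(chunk)
    return h.hexdigest() == oid and n == size

print(verify_lfs_blob(
    "firefly-gan-vq-fsq-8x1024-21hz-generator.pth",
    "01b81dbf753224a156c3fe139b88bf0b9a0f54b11bee864f95e66511c3ccd754",
    188518579,
))
```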
model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7d1cfa4b59c37f58d22e0626a53cec61db79390d7d0733b6402bf6f69fe58b93
size 988988542
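Once Git LFS has materialized the blobs (`git lfs pull`), both files load as ordinary PyTorch checkpoints, roughly 189 MB for the vocoder generator and 989 MB for the language model. A loading sketch; the checkpoint layout is an assumption:

```python
# Checkpoint loading sketch; both files are standard torch serializations.
import torch

lm_ckpt = torch.load("model.pth", map_location="cpu")
vocoder_ckpt = torch.load(
    "firefly-gan-vq-fsq-8x1024-21hz-generator.pth", map_location="cpu"
)
# Typically dicts / nested state dicts of tensors; inspect before wiring in.
print(type(lm_ckpt), type(vocoder_ckpt))
```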
special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
{
  "bos_token": {
    "content": "<|begin_of_sequence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|end_of_sequence|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|pad|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render; see the raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,82 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<|begin_of_sequence|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<|end_of_sequence|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "<|pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "4": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "5": {
      "content": "<|semantic|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "6": {
      "content": "<|mel|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "7": {
      "content": "<|reserve_0|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "8": {
      "content": "<|reserve_1|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|begin_of_sequence|>",
  "clean_up_tokenization_spaces": true,
  "eos_token": "<|end_of_sequence|>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<|pad|>",
  "tokenizer_class": "PreTrainedTokenizerFast"
}
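Since `tokenizer_class` is `PreTrainedTokenizerFast`, the three tokenizer files load through the generic `transformers` entry point. A minimal sketch, assuming the files sit in the current directory:

```python
# Tokenizer loading sketch; the local path "." is an assumption.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
print(tok.bos_token, tok.eos_token, tok.pad_token)
# ids 0-8 are the special tokens declared in added_tokens_decoder:
print(tok.convert_tokens_to_ids("<|semantic|>"))  # -> 5
```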