Upload 116 files
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- .gitattributes +47 -43
- .gitignore +28 -0
- README.md +12 -12
- app_v1v2.py +175 -0
- configs/astral_quantization/default_2048.yml +40 -0
- configs/astral_quantization/default_32.yml +40 -0
- configs/config.json +1 -0
- configs/inuse/.gitignore +0 -0
- configs/inuse/config.json +1 -0
- configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml +98 -0
- configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml +91 -0
- configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml +82 -0
- configs/v2/ar_base.yaml +0 -0
- configs/v2/dit_small.yaml +17 -0
- configs/v2/vc_wrapper.yaml +105 -0
- hf_utils.py +1 -1
- modules/__pycache__/audio.cpython-310.pyc +0 -0
- modules/__pycache__/commons.cpython-310.pyc +0 -0
- modules/__pycache__/commons.cpython-38.pyc +0 -0
- modules/__pycache__/diffusion_transformer.cpython-310.pyc +0 -0
- modules/__pycache__/flow_matching.cpython-310.pyc +0 -0
- modules/__pycache__/length_regulator.cpython-310.pyc +0 -0
- modules/__pycache__/rmvpe.cpython-310.pyc +0 -0
- modules/astral_quantization/__pycache__/bsq.cpython-310.pyc +0 -0
- modules/astral_quantization/__pycache__/convnext.cpython-310.pyc +0 -0
- modules/astral_quantization/__pycache__/default_model.cpython-310.pyc +0 -0
- modules/astral_quantization/bsq.py +569 -0
- modules/astral_quantization/convnext.py +209 -0
- modules/astral_quantization/default_model.py +73 -0
- modules/astral_quantization/transformer.py +254 -0
- modules/audio.py +82 -82
- modules/bigvgan/__pycache__/activations.cpython-310.pyc +0 -0
- modules/bigvgan/__pycache__/bigvgan.cpython-310.pyc +0 -0
- modules/bigvgan/__pycache__/env.cpython-310.pyc +0 -0
- modules/bigvgan/__pycache__/meldataset.cpython-310.pyc +0 -0
- modules/bigvgan/__pycache__/utils.cpython-310.pyc +0 -0
- modules/bigvgan/alias_free_activation/cuda/__pycache__/__init__.cpython-310.pyc +0 -0
- modules/bigvgan/alias_free_activation/cuda/__pycache__/activation1d.cpython-310.pyc +0 -0
- modules/bigvgan/alias_free_activation/cuda/__pycache__/load.cpython-310.pyc +0 -0
- modules/bigvgan/alias_free_activation/cuda/activation1d.py +2 -2
- modules/bigvgan/alias_free_activation/cuda/build/.ninja_deps +3 -0
- modules/bigvgan/alias_free_activation/cuda/build/.ninja_log +7 -0
- modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation.o +3 -0
- modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation_cuda.cuda.o +3 -0
- modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation_cuda.exp +0 -0
- modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation_cuda.lib +0 -0
- modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation_cuda.pyd +3 -0
- modules/bigvgan/alias_free_activation/cuda/build/build.ninja +38 -0
- modules/bigvgan/alias_free_activation/torch/__pycache__/__init__.cpython-310.pyc +0 -0
- modules/bigvgan/alias_free_activation/torch/__pycache__/act.cpython-310.pyc +0 -0
.gitattributes
CHANGED
@@ -1,43 +1,47 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
examples/reference/dingzhen_0.wav filter=lfs diff=lfs merge=lfs -text
examples/reference/s3p2.wav filter=lfs diff=lfs merge=lfs -text
examples/source/source_s3.wav filter=lfs diff=lfs merge=lfs -text
examples/source/source_s4.wav filter=lfs diff=lfs merge=lfs -text
examples/source/Wiz[[:space:]]Khalifa,Charlie[[:space:]]Puth[[:space:]]-[[:space:]]See[[:space:]]You[[:space:]]Again[[:space:]]\[vocals\]_\[cut_28sec\].wav filter=lfs diff=lfs merge=lfs -text
examples/reference/trump_0.wav filter=lfs diff=lfs merge=lfs -text
examples/source/jay_0.wav filter=lfs diff=lfs merge=lfs -text
examples/source/TECHNOPOLIS[[:space:]]-[[:space:]]2085[[:space:]]\[vocals\]_\[cut_14sec\].wav filter=lfs diff=lfs merge=lfs -text
+modules/bigvgan/alias_free_activation/cuda/build/.ninja_deps filter=lfs diff=lfs merge=lfs -text
+modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation_cuda.cuda.o filter=lfs diff=lfs merge=lfs -text
+modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation_cuda.pyd filter=lfs diff=lfs merge=lfs -text
+modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation.o filter=lfs diff=lfs merge=lfs -text

.gitignore
ADDED
@@ -0,0 +1,28 @@
# general things to ignore
.DS_Store
build/
build_contrib/
dist/
.cache/
*.egg-info/
*.egg
*.py[cod]
__pycache__/
*.so
*~

# IDE
.vscode/
.idea/

# misc
checkpoints/
test_waves/
reconstructed/
.python-version
ruff.log
/configs/inuse/
runs/
/garbages/
/flagged/
/experimental/

README.md
CHANGED
@@ -1,13 +1,13 @@
---
title: Seed Voice Conversion
emoji: 🎤🔄
colorFrom: green
colorTo: green
sdk: gradio
-sdk_version:
-app_file:
+sdk_version: 5.23.0
+app_file: app_v1v2.py
pinned: false
license: gpl-3.0
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app_v1v2.py
ADDED
@@ -0,0 +1,175 @@
import spaces
import gradio as gr
import torch
import yaml
import argparse
from seed_vc_wrapper import SeedVCWrapper

# Set up device and torch configurations
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

torch._inductor.config.coordinate_descent_tuning = True
torch._inductor.config.triton.unique_kernel_names = True

if hasattr(torch._inductor.config, "fx_graph_cache"):
    # Experimental feature to reduce compilation times, will be on by default in future
    torch._inductor.config.fx_graph_cache = True

dtype = torch.float16

def load_v2_models(args):
    from hydra.utils import instantiate
    from omegaconf import DictConfig
    cfg = DictConfig(yaml.safe_load(open("configs/v2/vc_wrapper.yaml", "r")))
    vc_wrapper = instantiate(cfg)
    vc_wrapper.load_checkpoints()
    vc_wrapper.to(device)
    vc_wrapper.eval()

    vc_wrapper.setup_ar_caches(max_batch_size=1, max_seq_len=4096, dtype=dtype, device=device)

    if args.compile:
        vc_wrapper.compile_ar()
        # vc_wrapper.compile_cfm()

    return vc_wrapper

def create_v1_interface():
    # Initialize the V1 wrapper
    vc_wrapper = SeedVCWrapper()

    # Set up Gradio interface
    description = ("Zero-shot voice conversion with in-context learning. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
                   "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
                   "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
                   "无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
                   "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。")

    inputs = [
        gr.Audio(type="filepath", label="Source Audio / 源音频"),
        gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
        gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Diffusion Steps / 扩散步数",
                  info="10 by default, 50~100 for best quality / 默认为 10,50~100 为最佳质量"),
        gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整",
                  info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate",
                  info="has subtle influence / 有微小影响"),
        gr.Checkbox(label="Use F0 conditioned model / 启用F0输入", value=False,
                    info="Must set to true for singing voice conversion / 歌声转换时必须勾选"),
        gr.Checkbox(label="Auto F0 adjust / 自动F0调整", value=True,
                    info="Roughly adjust F0 to match target voice. Only works when F0 conditioned model is used. / 粗略调整 F0 以匹配目标音色,仅在勾选 '启用F0输入' 时生效"),
        gr.Slider(label='Pitch shift / 音调变换', minimum=-24, maximum=24, step=1, value=0,
                  info="Pitch shift in semitones, only works when F0 conditioned model is used / 半音数的音高变换,仅在勾选 '启用F0输入' 时生效"),
    ]

    examples = [
        ["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
        ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, True, True, 0],
        ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
         "examples/reference/teio_0.wav", 100, 1.0, 0.7, True, False, 0],
        ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
         "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
    ]

    outputs = [
        gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
        gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')
    ]

    return gr.Interface(
        fn=vc_wrapper.convert_voice,
        description=description,
        inputs=inputs,
        outputs=outputs,
        title="Seed Voice Conversion V1 (Voice & Singing Voice Conversion)",
        examples=examples,
        cache_examples=False,
    )

def create_v2_interface(vc_wrapper):
    # Set up Gradio interface
    description = ("Zero-shot voice/style conversion with in-context learning. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
                   "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
                   "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
                   "Please click the 'convert style/emotion/accent' checkbox to convert the style, emotion, or accent of the source audio, or else only timbre conversion will be performed.<br> "
                   "Click the 'anonymization only' checkbox will ignore reference audio but convert source to an 'average voice' determined by model itself.<br> "
                   "无需训练的 zero-shot 语音/口音转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
                   "请注意,参考音频若超过 25 秒,则会被自动裁剪至此长度。<br>若源音频和参考音频的总时长超过 30 秒,源音频将被分段处理。"
                   "<br>请勾选 'convert style/emotion/accent' 以转换源音频的风格、情感或口音,否则仅执行音色转换。<br>"
                   "勾选 'anonymization only' 会无视参考音频而将源音频转换为某种由模型自身决定的 '平均音色'。<br>"

                   "Credits to [Vevo](https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo)"
                   )
    inputs = [
        gr.Audio(type="filepath", label="Source Audio / 源音频"),
        gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
        gr.Slider(minimum=1, maximum=200, value=30, step=1, label="Diffusion Steps / 扩散步数",
                  info="30 by default, 50~100 for best quality / 默认为 30,50~100 为最佳质量"),
        gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整",
                  info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.0, label="Intelligibility CFG Rate",
                  info="controls pronunciation intelligibility / 控制发音清晰度"),
        gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Similarity CFG Rate",
                  info="controls similarity to reference audio / 控制与参考音频的相似度"),
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.9, label="Top-p",
                  info="AR model sampling top P"),
        gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature",
                  info="AR model sampling temperature"),
        gr.Slider(minimum=1.0, maximum=3.0, step=0.1, value=1.0, label="Repetition Penalty",
                  info="AR model sampling repetition penalty"),
        gr.Checkbox(label="convert style/emotion/accent", value=False),
        gr.Checkbox(label="anonymization only", value=False),
    ]

    examples = [
        ["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 50, 1.0, 0.0, 0.7, 0.9, 1.0, 1.0, False, False],
        ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 50, 1.0, 0.0, 0.7, 0.9, 1.0, 1.0, False, False],
    ]

    outputs = [
        gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
        gr.Audio(label="Full Output Audio / 完整输出", streaming=False, format='wav')
    ]

    return gr.Interface(
        fn=vc_wrapper.convert_voice_with_streaming,
        description=description,
        inputs=inputs,
        outputs=outputs,
        title="Seed Voice Conversion V2 (Voice & Style Conversion)",
        examples=examples,
        cache_examples=False,
    )

def main(args):
    # Load V2 models
    vc_wrapper_v2 = load_v2_models(args)

    # Create interfaces
    v1_interface = create_v1_interface()
    v2_interface = create_v2_interface(vc_wrapper_v2)

    # Create tabs
    with gr.Blocks(title="Seed Voice Conversion") as demo:
        gr.Markdown("# Seed Voice Conversion")
        gr.Markdown("Choose between V1 (Voice & Singing Voice Conversion) or V2 (Voice & Style Conversion)")

        with gr.Tabs():
            with gr.TabItem("V2 - Voice & Style Conversion"):
                v2_interface.render()
            with gr.TabItem("V1 - Voice & Singing Voice Conversion"):
                v1_interface.render()

    # Launch the combined interface
    demo.launch()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--compile", type=bool, default=True)
    args = parser.parse_args()
    main(args)

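Since the V2 tab binds fn=vc_wrapper.convert_voice_with_streaming to the eleven inputs above, the same function could in principle be driven outside Gradio. The sketch below is a hedged illustration only: the positional argument order is taken from the inputs list in create_v2_interface(), but the parameter names, the generator behaviour, and the shape of the yielded audio are assumptions not confirmed by this diff.

# Hedged sketch: call the V2 wrapper directly, passing arguments in the same
# positional order as the Gradio inputs list in create_v2_interface().
# Everything about the yield/return format here is an assumption.
vc_wrapper = load_v2_models(args)  # as in main()

stream = vc_wrapper.convert_voice_with_streaming(
    "examples/source/yae_0.wav",          # source audio
    "examples/reference/dingzhen_0.wav",  # reference audio
    30,     # diffusion steps
    1.0,    # length adjust
    0.0,    # intelligibility CFG rate
    0.7,    # similarity CFG rate
    0.9,    # top-p
    1.0,    # temperature
    1.0,    # repetition penalty
    False,  # convert style/emotion/accent
    False,  # anonymization only
)
for chunk in stream:  # assumed to be a generator feeding the streaming output
    pass              # the final value would correspond to the full output audio
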
configs/astral_quantization/default_2048.yml
ADDED
@@ -0,0 +1,40 @@
_target_: modules.astral_quantization.default_model.AstralQuantizer
tokenizer_name: "openai/whisper-small"
ssl_model_name: "facebook/hubert-large-ll60k"
ssl_output_layer: 18
encoder:
  _target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
  dim: 512
  num_blocks: 12
  intermediate_dim: 1536
  dilation: 1
  input_dim: 1024
quantizer:
  _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
  codebook_size: 2048 # codebook size, must be a power of 2
  dim: 512
  entropy_loss_weight: 0.1
  diversity_gamma: 1.0
  spherical: True
  enable_entropy_loss: True
  soft_entropy_loss: True
decoder:
  _target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
  dim: 512
  num_blocks: 12
  intermediate_dim: 1536
  dilation: 1
  output_dim: 1024
  gin_channels: 192
asr_decoder:
  _target_: modules.astral_quantization.asr_decoder.ASRDecoder
  hidden_dim: 768
  num_heads: 12
  depth: 12
  block_size: 4096
  in_channels: 512
  n_vocab: 51866
  bos_id: 50528
  eos_id: 50527
  dropout_rate: 0.0
  attn_dropout_rate: 0.0

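These astral_quantization configs follow the Hydra `_target_` convention, and app_v1v2.py loads configs/v2/vc_wrapper.yaml the same way. A minimal sketch of how a config like this could be instantiated, assuming hydra-core and omegaconf are installed and the referenced modules are importable from the repository root:

# Minimal sketch: instantiate a Hydra-style config such as
# configs/astral_quantization/default_2048.yml. This mirrors what
# app_v1v2.py does for configs/v2/vc_wrapper.yaml; the config path and the
# availability of the target modules are assumptions.
import yaml
from omegaconf import DictConfig
from hydra.utils import instantiate

with open("configs/astral_quantization/default_2048.yml", "r") as f:
    cfg = DictConfig(yaml.safe_load(f))

# Recursively builds AstralQuantizer with its encoder/quantizer/decoder
# sub-modules as declared by the _target_ entries.
quantizer_model = instantiate(cfg)
print(type(quantizer_model))
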
configs/astral_quantization/default_32.yml
ADDED
@@ -0,0 +1,40 @@
_target_: default_model.AstralQuantizer
tokenizer_name: "openai/whisper-small"
ssl_model_name: "facebook/hubert-large-ll60k"
ssl_output_layer: 18
encoder:
  _target_: modules.convnext.ConvNeXtV2Stage
  dim: 512
  num_blocks: 12
  intermediate_dim: 1536
  dilation: 1
  input_dim: 1024
quantizer:
  _target_: modules.bsq.BinarySphericalQuantize
  codebook_size: 32 # codebook size, must be a power of 2
  dim: 512
  entropy_loss_weight: 0.1
  diversity_gamma: 1.0
  spherical: True
  enable_entropy_loss: True
  soft_entropy_loss: True
decoder:
  _target_: modules.convnext.ConvNeXtV2Stage
  dim: 512
  num_blocks: 12
  intermediate_dim: 1536
  dilation: 1
  output_dim: 1024
  gin_channels: 192
asr_decoder:
  _target_: modules.asr_decoder.ASRDecoder
  hidden_dim: 768
  num_heads: 12
  depth: 12
  block_size: 4096
  in_channels: 512
  n_vocab: 51866
  bos_id: 50528
  eos_id: 50527
  dropout_rate: 0.0
  attn_dropout_rate: 0.0

configs/config.json
ADDED
@@ -0,0 +1 @@
{"reference_audio_path": "D:/FAcodec/test_waves/kobe_0.wav", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "\u9ea6\u514b\u98ce (Razer BlackShark V2 HS 2.4", "sg_output_device": "\u626c\u58f0\u5668 (Razer BlackShark V2 HS 2.4", "sr_type": "sr_model", "diffusion_steps": 10.0, "inference_cfg_rate": 0.0, "max_prompt_length": 3.0, "block_time": 0.7, "crossfade_length": 0.04, "extra_time": 0.5, "extra_time_right": 0.02}

configs/inuse/.gitignore
ADDED
File without changes

configs/inuse/config.json
ADDED
@@ -0,0 +1 @@
{"reference_audio_path": "D:/seed-vc/examples/reference/trump_0.wav", "sg_hostapi": "MME", "sg_wasapi_exclusive": false, "sg_input_device": "\u9ea6\u514b\u98ce (Razer BlackShark V2 HS USB", "sg_output_device": "\u626c\u58f0\u5668 (Razer BlackShark V2 HS USB", "sr_type": "sr_model", "diffusion_steps": 8.0, "inference_cfg_rate": 0.7, "max_prompt_length": 3.0, "block_time": 0.58, "crossfade_length": 0.04, "extra_time_ce": 2.5, "extra_time": 0.5, "extra_time_right": 0.02}

configs/presets/config_dit_mel_seed_uvit_whisper_base_f0_44k.yml
ADDED
@@ -0,0 +1,98 @@
log_dir: "./runs"
save_freq: 1
log_interval: 10
save_interval: 1000
device: "cuda"
epochs: 1000 # number of epochs for first stage training (pre-training)
batch_size: 1
batch_length: 100 # maximum duration of audio in a batch (in seconds)
max_len: 80 # maximum number of frames
pretrained_model: "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth"
pretrained_encoder: ""
load_only_params: False # set to true if do not want to load epoch numbers and optimizer parameters

preprocess_params:
  sr: 44100
  spect_params:
    n_fft: 2048
    win_length: 2048
    hop_length: 512
    n_mels: 128
    fmin: 0
    fmax: "None"

model_params:
  dit_type: "DiT" # uDiT or DiT
  reg_loss_type: "l1" # l1 or l2

  timbre_shifter:
    se_db_path: "./modules/openvoice/checkpoints_v2/converter/se_db.pt"
    ckpt_path: './modules/openvoice/checkpoints_v2/converter'

  vocoder:
    type: "bigvgan"
    name: "nvidia/bigvgan_v2_44khz_128band_512x"

  speech_tokenizer:
    type: 'whisper'
    name: "openai/whisper-small"

  style_encoder:
    dim: 192
    campplus_path: "campplus_cn_common.bin"

  DAC:
    encoder_dim: 64
    encoder_rates: [2, 5, 5, 6]
    decoder_dim: 1536
    decoder_rates: [ 6, 5, 5, 2 ]
    sr: 24000

  length_regulator:
    channels: 768
    is_discrete: false
    in_channels: 768
    content_codebook_size: 2048
    sampling_ratios: [1, 1, 1, 1]
    vector_quantize: false
    n_codebooks: 1
    quantizer_dropout: 0.0
    f0_condition: true
    n_f0_bins: 256

  DiT:
    hidden_dim: 768
    num_heads: 12
    depth: 17
    class_dropout_prob: 0.1
    block_size: 8192
    in_channels: 128
    style_condition: true
    final_layer_type: 'mlp'
    target: 'mel' # mel or codec
    content_dim: 768
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: true
    n_f0_bins: 256
    content_codebooks: 1
    is_causal: false
    long_skip_connection: false
    zero_prompt_speech_token: false # for prompt component, do not input corresponding speech token
    time_as_token: false
    style_as_token: false
    uvit_skip_connection: true
    add_resblock_in_transformer: false

  wavenet:
    hidden_dim: 768
    num_layers: 8
    kernel_size: 5
    dilation_rate: 1
    p_dropout: 0.2
    style_condition: true

loss_params:
  base_lr: 0.0001
  lambda_mel: 45
  lambda_kl: 1.0

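For orientation, the mel frame rate implied by these presets follows directly from sr and hop_length. The check below is plain arithmetic on the values shown in this preset and in the 22.05 kHz presets that follow (nothing repo-specific is assumed):

# Arithmetic check on the preset spectrogram settings (values copied from the configs).
sr_44k, hop_44k = 44100, 512
sr_22k, hop_22k = 22050, 256
print(sr_44k / hop_44k)  # ~86.1 mel frames per second for the 44.1 kHz f0 preset above
print(sr_22k / hop_22k)  # ~86.1 mel frames per second for the 22.05 kHz presets below

Both sample-rate/hop-length pairs therefore produce roughly 86 mel frames per second.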
configs/presets/config_dit_mel_seed_uvit_whisper_small_wavenet.yml
ADDED
@@ -0,0 +1,91 @@
log_dir: "./runs"
save_freq: 1
log_interval: 10
save_interval: 1000
device: "cuda"
epochs: 1000 # number of epochs for first stage training (pre-training)
batch_size: 2
batch_length: 100 # maximum duration of audio in a batch (in seconds)
max_len: 80 # maximum number of frames
pretrained_model: "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth"
pretrained_encoder: ""
load_only_params: False # set to true if do not want to load epoch numbers and optimizer parameters

preprocess_params:
  sr: 22050
  spect_params:
    n_fft: 1024
    win_length: 1024
    hop_length: 256
    n_mels: 80
    fmin: 0
    fmax: "None"

model_params:
  dit_type: "DiT" # uDiT or DiT
  reg_loss_type: "l1" # l1 or l2

  timbre_shifter:
    se_db_path: "./modules/openvoice/checkpoints_v2/converter/se_db.pt"
    ckpt_path: './modules/openvoice/checkpoints_v2/converter'

  speech_tokenizer:
    type: 'whisper'
    name: "openai/whisper-small"

  style_encoder:
    dim: 192
    campplus_path: "campplus_cn_common.bin"

  vocoder:
    type: "bigvgan"
    name: "nvidia/bigvgan_v2_22khz_80band_256x"

  length_regulator:
    channels: 512
    is_discrete: false
    in_channels: 768
    content_codebook_size: 2048
    sampling_ratios: [1, 1, 1, 1]
    vector_quantize: false
    n_codebooks: 1
    quantizer_dropout: 0.0
    f0_condition: false
    n_f0_bins: 512

  DiT:
    hidden_dim: 512
    num_heads: 8
    depth: 13
    class_dropout_prob: 0.1
    block_size: 8192
    in_channels: 80
    style_condition: true
    final_layer_type: 'wavenet'
    target: 'mel' # mel or codec
    content_dim: 512
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: false
    n_f0_bins: 512
    content_codebooks: 1
    is_causal: false
    long_skip_connection: true
    zero_prompt_speech_token: false # for prompt component, do not input corresponding speech token
    time_as_token: false
    style_as_token: false
    uvit_skip_connection: true
    add_resblock_in_transformer: false

  wavenet:
    hidden_dim: 512
    num_layers: 8
    kernel_size: 5
    dilation_rate: 1
    p_dropout: 0.2
    style_condition: true

loss_params:
  base_lr: 0.0001
  lambda_mel: 45
  lambda_kl: 1.0

configs/presets/config_dit_mel_seed_uvit_xlsr_tiny.yml
ADDED
@@ -0,0 +1,82 @@
log_dir: "./runs/"
save_freq: 1
log_interval: 10
save_interval: 500
device: "cuda"
epochs: 1000 # number of epochs for first stage training (pre-training)
batch_size: 2
batch_length: 100 # maximum duration of audio in a batch (in seconds)
max_len: 80 # maximum number of frames
pretrained_model: "DiT_uvit_tat_xlsr_ema.pth"
pretrained_encoder: ""
load_only_params: False # set to true if do not want to load epoch numbers and optimizer parameters

preprocess_params:
  sr: 22050
  spect_params:
    n_fft: 1024
    win_length: 1024
    hop_length: 256
    n_mels: 80
    fmin: 0
    fmax: 8000

model_params:
  dit_type: "DiT" # uDiT or DiT
  reg_loss_type: "l1" # l1 or l2
  diffusion_type: "flow"

  timbre_shifter:
    se_db_path: "./modules/openvoice/checkpoints_v2/converter/se_db.pt"
    ckpt_path: './modules/openvoice/checkpoints_v2/converter'

  vocoder:
    type: "hifigan"

  speech_tokenizer:
    type: 'xlsr'
    output_layer: 12
    name: 'facebook/wav2vec2-xls-r-300m'

  style_encoder:
    dim: 192
    campplus_path: "campplus_cn_common.bin"

  length_regulator:
    channels: 384
    is_discrete: false
    in_channels: 1024
    content_codebook_size: 1024
    sampling_ratios: [1, 1, 1, 1]
    vector_quantize: false
    n_codebooks: 2
    quantizer_dropout: 0.0
    f0_condition: false
    n_f0_bins: 512

  DiT:
    hidden_dim: 384
    num_heads: 6
    depth: 9
    class_dropout_prob: 0.1
    block_size: 8192
    in_channels: 80
    style_condition: true
    final_layer_type: 'mlp'
    target: 'mel' # mel or betavae
    content_dim: 384
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: false
    n_f0_bins: 512
    content_codebooks: 1
    is_causal: false
    long_skip_connection: false
    zero_prompt_speech_token: false # for prompt component, do not input corresponding speech token
    time_as_token: true
    style_as_token: true
    uvit_skip_connection: true
    add_resblock_in_transformer: false

loss_params:
  base_lr: 0.0001

configs/v2/ar_base.yaml
ADDED
File without changes

configs/v2/dit_small.yaml
ADDED
@@ -0,0 +1,17 @@
_target_: modules.v2.cfm.CFM
estimator:
  _target_: modules.v2.dit_wrapper.DiT
  time_as_token: true
  style_as_token: true
  uvit_skip_connection: false
  block_size: 8192
  depth: 13
  num_heads: 8
  hidden_dim: 512
  in_channels: 80
  content_dim: 512
  style_encoder_dim: 192
  class_dropout_prob: 0.1
  dropout_rate: 0.0
  attn_dropout_rate: 0.0

configs/v2/vc_wrapper.yaml
ADDED
@@ -0,0 +1,105 @@
_target_: modules.v2.vc_wrapper.VoiceConversionWrapper
sr: 22050
hop_size: 256
mel_fn:
  _target_: modules.audio.mel_spectrogram
  _partial_: true
  n_fft: 1024
  win_size: 1024
  hop_size: 256
  num_mels: 80
  sampling_rate: 22050
  fmin: 0
  fmax: null
  center: False
cfm:
  _target_: modules.v2.cfm.CFM
  estimator:
    _target_: modules.v2.dit_wrapper.DiT
    time_as_token: true
    style_as_token: true
    uvit_skip_connection: false
    block_size: 8192
    depth: 13
    num_heads: 8
    hidden_dim: 512
    in_channels: 80
    content_dim: 512
    style_encoder_dim: 192
    class_dropout_prob: 0.1
    dropout_rate: 0.0
    attn_dropout_rate: 0.0
cfm_length_regulator:
  _target_: modules.v2.length_regulator.InterpolateRegulator
  channels: 512
  is_discrete: true
  codebook_size: 2048
  sampling_ratios: [ 1, 1, 1, 1 ]
  f0_condition: false
ar:
  _target_: modules.v2.ar.NaiveWrapper
  model:
    _target_: modules.v2.ar.NaiveTransformer
    config:
      _target_: modules.v2.ar.NaiveModelArgs
      dropout: 0.0
      rope_base: 10000.0
      dim: 768
      head_dim: 64
      n_local_heads: 2
      intermediate_size: 2304
      n_head: 12
      n_layer: 12
      vocab_size: 2049 # 1 + 1 for eos
ar_length_regulator:
  _target_: modules.v2.length_regulator.InterpolateRegulator
  channels: 768
  is_discrete: true
  codebook_size: 32
  sampling_ratios: [ ]
  f0_condition: false
style_encoder:
  _target_: modules.campplus.DTDNN.CAMPPlus
  feat_dim: 80
  embedding_size: 192
content_extractor_narrow:
  _target_: modules.astral_quantization.default_model.AstralQuantizer
  tokenizer_name: "openai/whisper-small"
  ssl_model_name: "facebook/hubert-large-ll60k"
  ssl_output_layer: 18
  skip_ssl: true
  encoder: &bottleneck_encoder
    _target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
    dim: 512
    num_blocks: 12
    intermediate_dim: 1536
    dilation: 1
    input_dim: 1024
  quantizer:
    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
    codebook_size: 32 # codebook size, must be a power of 2
    dim: 512
    entropy_loss_weight: 0.1
    diversity_gamma: 1.0
    spherical: True
    enable_entropy_loss: True
    soft_entropy_loss: True
content_extractor_wide:
  _target_: modules.astral_quantization.default_model.AstralQuantizer
  tokenizer_name: "openai/whisper-small"
  ssl_model_name: "facebook/hubert-large-ll60k"
  ssl_output_layer: 18
  encoder: *bottleneck_encoder
  quantizer:
    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
    codebook_size: 2048 # codebook size, must be a power of 2
    dim: 512
    entropy_loss_weight: 0.1
    diversity_gamma: 1.0
    spherical: True
    enable_entropy_loss: True
    soft_entropy_loss: True
vocoder:
  _target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
  pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"
  use_cuda_kernel: false

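One detail worth noting in vc_wrapper.yaml: the &bottleneck_encoder anchor under content_extractor_narrow is reused via *bottleneck_encoder for content_extractor_wide, so both extractors are declared with identical ConvNeXtV2Stage settings. A small, self-contained sketch of how a YAML loader resolves such an anchor/alias pair (the keys below are illustrative, not the full config):

# Illustrative only: shows how a YAML anchor/alias pair resolves to the same
# mapping when loaded, as used by &bottleneck_encoder / *bottleneck_encoder above.
import yaml

doc = """
narrow:
  encoder: &bottleneck_encoder
    dim: 512
    num_blocks: 12
wide:
  encoder: *bottleneck_encoder
"""
cfg = yaml.safe_load(doc)
assert cfg["narrow"]["encoder"] == cfg["wide"]["encoder"]
print(cfg["wide"]["encoder"])  # {'dim': 512, 'num_blocks': 12}
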
hf_utils.py
CHANGED
@@ -2,7 +2,7 @@ import os
from huggingface_hub import hf_hub_download


-def load_custom_model_from_hf(repo_id, model_filename="pytorch_model.bin", config_filename=
+def load_custom_model_from_hf(repo_id, model_filename="pytorch_model.bin", config_filename=None):
    os.makedirs("./checkpoints", exist_ok=True)
    model_path = hf_hub_download(repo_id=repo_id, filename=model_filename, cache_dir="./checkpoints")
    if config_filename is None:

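For reference, a hedged usage sketch of the completed helper; the repo id below is a placeholder, not a value taken from this commit, and the exact return shape is not shown in the excerpt above.

# Hypothetical usage of hf_utils.load_custom_model_from_hf; the repo id is an
# illustrative placeholder. The helper downloads into ./checkpoints via
# huggingface_hub.hf_hub_download.
from hf_utils import load_custom_model_from_hf

result = load_custom_model_from_hf(
    repo_id="some-user/some-seed-vc-checkpoint",  # placeholder repo id
    model_filename="pytorch_model.bin",
)
# The diff excerpt cuts off after the config_filename check, so the exact
# return value is not shown here; the downloaded file lands in ./checkpoints.
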
modules/__pycache__/audio.cpython-310.pyc
CHANGED
Binary files a/modules/__pycache__/audio.cpython-310.pyc and b/modules/__pycache__/audio.cpython-310.pyc differ

modules/__pycache__/commons.cpython-310.pyc
CHANGED
Binary files a/modules/__pycache__/commons.cpython-310.pyc and b/modules/__pycache__/commons.cpython-310.pyc differ

modules/__pycache__/commons.cpython-38.pyc
ADDED
Binary file (14.2 kB)

modules/__pycache__/diffusion_transformer.cpython-310.pyc
CHANGED
Binary files a/modules/__pycache__/diffusion_transformer.cpython-310.pyc and b/modules/__pycache__/diffusion_transformer.cpython-310.pyc differ

modules/__pycache__/flow_matching.cpython-310.pyc
CHANGED
Binary files a/modules/__pycache__/flow_matching.cpython-310.pyc and b/modules/__pycache__/flow_matching.cpython-310.pyc differ

modules/__pycache__/length_regulator.cpython-310.pyc
CHANGED
Binary files a/modules/__pycache__/length_regulator.cpython-310.pyc and b/modules/__pycache__/length_regulator.cpython-310.pyc differ

modules/__pycache__/rmvpe.cpython-310.pyc
ADDED
Binary file (17.6 kB)

modules/astral_quantization/__pycache__/bsq.cpython-310.pyc
ADDED
Binary file (12.7 kB)

modules/astral_quantization/__pycache__/convnext.cpython-310.pyc
ADDED
Binary file (6.87 kB)

modules/astral_quantization/__pycache__/default_model.cpython-310.pyc
ADDED
Binary file (2.8 kB)

modules/astral_quantization/bsq.py
ADDED
@@ -0,0 +1,569 @@
"""
Lookup Free Quantization
Proposed in https://arxiv.org/abs/2310.05737

In the simplest setup, each dimension is quantized into {-1, 1}.
An entropy penalty is used to encourage utilization.
"""

from math import log2, ceil
from functools import partial, cache
from collections import namedtuple
from contextlib import nullcontext

import torch.distributed as dist
from torch.distributed import nn as dist_nn

import torch
from torch import nn, einsum
import torch.nn.functional as F
from torch.nn import Module
from torch.amp import autocast

from einops import rearrange, reduce, pack, unpack

# constants

Return = namedtuple('Return', ['quantized', 'indices', 'entropy_aux_loss'])

LossBreakdown = namedtuple('LossBreakdown', ['per_sample_entropy', 'batch_entropy', 'commitment'])

# distributed helpers

@cache
def is_distributed():
    return dist.is_initialized() and dist.get_world_size() > 1

def maybe_distributed_mean(t):
    if not is_distributed():
        return t

    dist_nn.all_reduce(t)
    t = t / dist.get_world_size()
    return t

# helper functions

def exists(v):
    return v is not None

def identity(t):
    return t

def default(*args):
    for arg in args:
        if exists(arg):
            return arg() if callable(arg) else arg
    return None

def pack_one(t, pattern):
    return pack([t], pattern)

def unpack_one(t, ps, pattern):
    return unpack(t, ps, pattern)[0]

def l2norm(t):
    return F.normalize(t, dim = -1)

# entropy

def log(t, eps = 1e-5):
    return t.clamp(min = eps).log()

def entropy(prob):
    return (-prob * log(prob)).sum(dim=-1)

# cosine sim linear

class CosineSimLinear(Module):
    def __init__(
        self,
        dim_in,
        dim_out,
        scale = 1.
    ):
        super().__init__()
        self.scale = scale
        self.weight = nn.Parameter(torch.randn(dim_in, dim_out))

    def forward(self, x):
        x = F.normalize(x, dim = -1)
        w = F.normalize(self.weight, dim = 0)
        return (x @ w) * self.scale

def soft_entropy_loss(u, tau=1.0, gamma=1.0):
    """
    Compute the soft entropy loss for Binary Spherical Quantization (BSQ).

    Args:
        u (torch.Tensor): Input latent embeddings of shape (batch_size, L).
        tau (float): Temperature scaling factor.
        gamma (float): Weight for the second entropy term.

    Returns:
        torch.Tensor: Soft entropy loss.
    """
    # Binary quantization: Generate implicit codebook corners
    L = u.size(1)  # Dimensionality of codebook
    corners = torch.tensor([-1.0, 1.0], device=u.device) / (L**0.5)

    # Compute soft quantization probabilities for all dimensions
    # q_hat(c|u) for each dimension
    prob_matrix = torch.sigmoid(2 * tau * corners.unsqueeze(1) * u.unsqueeze(2))  # Shape: (batch_size, L, 2)

    # Entropy of q_hat(c|u) (independent along each dimension)
    entropy_per_dim = -torch.sum(prob_matrix * prob_matrix.log(), dim=-1)  # Shape: (batch_size, L)
    entropy_term1 = entropy_per_dim.mean()

    # Expected probabilities for dataset entropy (approximation)
    expected_probs = prob_matrix.mean(dim=0)  # Mean across batch, shape: (L, 2)
    entropy_term2 = -torch.sum(expected_probs * expected_probs.log(), dim=-1).mean()

    # Final entropy loss
    loss = entropy_term1 - gamma * entropy_term2
    return loss

# class

class BinarySphericalQuantize(Module):
    def __init__(
        self,
        *,
        dim = None,
        codebook_size = None,
        entropy_loss_weight = 0.1,
        commitment_loss_weight = 0.,
        diversity_gamma = 1.,
        straight_through_activation = nn.Identity(),
        num_codebooks = 1,
        keep_num_codebooks_dim = None,
        codebook_scale = 1.,  # for residual LFQ, codebook scaled down by 2x at each layer
        frac_per_sample_entropy = 0.25,  # make less than 1. to only use a random fraction of the probs for per sample entropy
        has_projections = None,
        projection_has_bias = True,
        soft_clamp_input_value = None,
        cosine_sim_project_in = False,
        cosine_sim_project_in_scale = None,
        channel_first = None,
        experimental_softplus_entropy_loss = False,
        entropy_loss_offset = 5.,  # how much to shift the loss before softplus
        spherical = True,  # from https://arxiv.org/abs/2406.07548
        force_quantization_f32 = True,  # will force the quantization step to be full precision
        enable_entropy_loss = True,
        soft_entropy_loss = True,
    ):
        super().__init__()

        # some assert validations

        assert exists(dim) or exists(codebook_size), 'either dim or codebook_size must be specified for LFQ'
        assert not exists(codebook_size) or log2(codebook_size).is_integer(), f'your codebook size must be a power of 2 for lookup free quantization (suggested {2 ** ceil(log2(codebook_size))})'

        codebook_size = default(codebook_size, lambda: 2 ** dim)
        self.codebook_size = codebook_size

        codebook_dim = int(log2(codebook_size))
        codebook_dims = codebook_dim * num_codebooks
        dim = default(dim, codebook_dims)

        has_projections = default(has_projections, dim != codebook_dims)

        if cosine_sim_project_in:
            cosine_sim_project_in = default(cosine_sim_project_in_scale, codebook_scale)
            project_in_klass = partial(CosineSimLinear, scale = cosine_sim_project_in)
        else:
            project_in_klass = partial(nn.Linear, bias = projection_has_bias)

        self.project_in = project_in_klass(dim, codebook_dims) if has_projections else nn.Identity()
        self.project_out = nn.Linear(codebook_dims, dim, bias = projection_has_bias) if has_projections else nn.Identity()
        self.has_projections = has_projections

        self.dim = dim
        self.codebook_dim = codebook_dim
        self.num_codebooks = num_codebooks

        keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
        assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
        self.keep_num_codebooks_dim = keep_num_codebooks_dim

        # channel first

        self.channel_first = channel_first

        # straight through activation

        self.activation = straight_through_activation

        # whether to use BSQ (binary spherical quantization)

        self.spherical = spherical
        self.maybe_l2norm = (lambda t: l2norm(t) * self.codebook_scale) if spherical else identity

        # entropy aux loss related weights

        assert 0 < frac_per_sample_entropy <= 1.
        self.frac_per_sample_entropy = frac_per_sample_entropy

        self.diversity_gamma = diversity_gamma
        self.entropy_loss_weight = entropy_loss_weight

        # codebook scale

        self.codebook_scale = codebook_scale

        # commitment loss

        self.commitment_loss_weight = commitment_loss_weight

        # whether to soft clamp the input value from -value to value

        self.soft_clamp_input_value = soft_clamp_input_value
        assert not exists(soft_clamp_input_value) or soft_clamp_input_value >= codebook_scale

        # whether to make the entropy loss positive through a softplus (experimental, please report if this worked or not in discussions)

        self.entropy_loss_offset = entropy_loss_offset
        self.experimental_softplus_entropy_loss = experimental_softplus_entropy_loss

        # for no auxiliary loss, during inference

        self.register_buffer('mask', 2 ** torch.arange(codebook_dim - 1, -1, -1))
        self.register_buffer('zero', torch.tensor(0.), persistent = False)

        # whether to force quantization step to be f32

        self.force_quantization_f32 = force_quantization_f32

        # codes
        self.enable_entropy_loss = enable_entropy_loss
        self.soft_entropy_loss = soft_entropy_loss
        if codebook_size <= 100000:
            all_codes = torch.arange(codebook_size)
            bits = ((all_codes[..., None].int() & self.mask) != 0).float()
            codebook = self.bits_to_codes(bits)

            self.register_buffer('codebook', codebook.float(), persistent = False)
        else:
            all_codes = torch.arange(pow(2, 16))
            mask = 2 ** torch.arange(16 - 1, -1, -1)
            bits = ((all_codes[..., None].int() & mask) != 0).float()
            codebook = self.bits_to_codes(bits)

            self.register_buffer('codebook', codebook.float(), persistent = False)

    def bits_to_codes(self, bits):
        return bits * self.codebook_scale * 2 - self.codebook_scale

    @property
    def dtype(self):
        return self.codebook.dtype

    def indices_to_codes(
        self,
        indices,
        project_out = True
    ):
        is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
        should_transpose = default(self.channel_first, is_img_or_video)

        if not self.keep_num_codebooks_dim:
            indices = rearrange(indices, '... -> ... 1')

        # indices to codes, which are bits of either -1 or 1

        bits = ((indices[..., None].int() & self.mask) != 0).to(self.dtype)

        codes = self.bits_to_codes(bits)

        codes = self.maybe_l2norm(codes)

        codes = rearrange(codes, '... c d -> ... (c d)')

        # whether to project codes out to original dimensions
        # if the input feature dimensions were not log2(codebook size)

        if project_out:
            codes = self.project_out(codes)

        # rearrange codes back to original shape

        if should_transpose:
            codes = rearrange(codes, 'b ... d -> b d ...')

        return codes

    def bits_to_z(self, bits):
        # assert bits must contain only -1 and 1
        assert torch.all(bits.abs() == 1)
        quantized = bits.float()
        quantized = self.maybe_l2norm(quantized)
        z = self.project_out(quantized)
        return z

    def forward(
        self,
        x,
        inv_temperature = 100.,
        return_loss_breakdown = False,
        mask = None,
        return_bits = False
    ):
        """
        einstein notation
        b - batch
        n - sequence (or flattened spatial dimensions)
        d - feature dimension, which is also log2(codebook size)
        c - number of codebook dim
        """

        is_img_or_video = x.ndim >= 4
        should_transpose = default(self.channel_first, is_img_or_video)

        # standardize image or video into (batch, seq, dimension)

        if should_transpose:
            x = rearrange(x, 'b d ... -> b ... d')
            x, ps = pack_one(x, 'b * d')

        assert x.shape[-1] == self.dim, f'expected dimension of {self.dim} but received {x.shape[-1]}'

        x = self.project_in(x)

        # maybe soft clamp

        if exists(self.soft_clamp_input_value):
            clamp_value = self.soft_clamp_input_value
            x = (x / clamp_value).tanh() * clamp_value

        # split out number of codebooks

        x = rearrange(x, 'b n (c d) -> b n c d', c = self.num_codebooks)
|
| 341 |
+
|
| 342 |
+
# maybe l2norm
|
| 343 |
+
|
| 344 |
+
x = self.maybe_l2norm(x)
|
| 345 |
+
|
| 346 |
+
# whether to force quantization step to be full precision or not
|
| 347 |
+
|
| 348 |
+
force_f32 = self.force_quantization_f32
|
| 349 |
+
|
| 350 |
+
quantization_context = partial(autocast, 'cuda', enabled = False) if force_f32 else nullcontext
|
| 351 |
+
|
| 352 |
+
with quantization_context():
|
| 353 |
+
|
| 354 |
+
if force_f32:
|
| 355 |
+
orig_dtype = x.dtype
|
| 356 |
+
x = x.float()
|
| 357 |
+
|
| 358 |
+
# quantize by eq 3.
|
| 359 |
+
|
| 360 |
+
original_input = x
|
| 361 |
+
|
| 362 |
+
codebook_value = torch.ones_like(x) * self.codebook_scale
|
| 363 |
+
quantized = torch.where(x > 0, codebook_value, -codebook_value)
|
| 364 |
+
if return_bits:
|
| 365 |
+
return quantized
|
| 366 |
+
|
| 367 |
+
# calculate indices
|
| 368 |
+
|
| 369 |
+
indices = reduce((quantized > 0).int() * self.mask.int(), 'b n c d -> b n c', 'sum')
|
| 370 |
+
|
| 371 |
+
# maybe l2norm
|
| 372 |
+
|
| 373 |
+
quantized = self.maybe_l2norm(quantized)
|
| 374 |
+
|
| 375 |
+
# use straight-through gradients (optionally with custom activation fn) if training
|
| 376 |
+
|
| 377 |
+
if self.training:
|
| 378 |
+
x = self.activation(x)
|
| 379 |
+
x = x + (quantized - x).detach()
|
| 380 |
+
else:
|
| 381 |
+
x = quantized
|
| 382 |
+
|
| 383 |
+
# entropy aux loss
|
| 384 |
+
if self.soft_entropy_loss:
|
| 385 |
+
entropy_aux_loss = soft_entropy_loss(x, tau=1.0, gamma=1.0)
|
| 386 |
+
elif self.training and self.enable_entropy_loss:
|
| 387 |
+
|
| 388 |
+
if force_f32:
|
| 389 |
+
codebook = self.codebook.float()
|
| 390 |
+
|
| 391 |
+
codebook = self.maybe_l2norm(codebook)
|
| 392 |
+
|
| 393 |
+
# whether to only use a fraction of probs, for reducing memory
|
| 394 |
+
|
| 395 |
+
if self.frac_per_sample_entropy < 1.:
|
| 396 |
+
# account for mask
|
| 397 |
+
if exists(mask):
|
| 398 |
+
original_input = original_input[mask]
|
| 399 |
+
original_input = rearrange(original_input, 'b n ... -> (b n) ...')
|
| 400 |
+
|
| 401 |
+
rand_mask = torch.randn(self.codebook_dim).argsort(dim = -1) < 16
|
| 402 |
+
|
| 403 |
+
sampled_input = original_input[..., rand_mask]
|
| 404 |
+
|
| 405 |
+
sampled_distance = -2 * einsum('... i d, j d -> ... i j', sampled_input, codebook)
|
| 406 |
+
|
| 407 |
+
sampled_prob = (-sampled_distance * inv_temperature).softmax(dim = -1)
|
| 408 |
+
|
| 409 |
+
per_sample_probs = sampled_prob
|
| 410 |
+
else:
|
| 411 |
+
if exists(mask):
|
| 412 |
+
original_input = original_input[mask]
|
| 413 |
+
original_input = rearrange(original_input, 'b n ... -> (b n) ...')
|
| 414 |
+
# the same as euclidean distance up to a constant
|
| 415 |
+
distance = -2 * einsum('... i d, j d -> ... i j', original_input, codebook)
|
| 416 |
+
|
| 417 |
+
prob = (-distance * inv_temperature).softmax(dim = -1)
|
| 418 |
+
|
| 419 |
+
per_sample_probs = prob
|
| 420 |
+
|
| 421 |
+
# calculate per sample entropy
|
| 422 |
+
|
| 423 |
+
per_sample_entropy = entropy(per_sample_probs).mean()
|
| 424 |
+
|
| 425 |
+
# distribution over all available tokens in the batch
|
| 426 |
+
|
| 427 |
+
avg_prob = reduce(per_sample_probs, '... c d -> c d', 'mean')
|
| 428 |
+
|
| 429 |
+
avg_prob = maybe_distributed_mean(avg_prob)
|
| 430 |
+
|
| 431 |
+
codebook_entropy = entropy(avg_prob).mean()
|
| 432 |
+
|
| 433 |
+
# 1. entropy will be nudged to be low for each code, to encourage the network to output confident predictions
|
| 434 |
+
# 2. codebook entropy will be nudged to be high, to encourage all codes to be uniformly used within the batch
|
| 435 |
+
|
| 436 |
+
entropy_aux_loss = per_sample_entropy - self.diversity_gamma * codebook_entropy
|
| 437 |
+
else:
|
| 438 |
+
# if not training, just return dummy 0
|
| 439 |
+
entropy_aux_loss = per_sample_entropy = codebook_entropy = self.zero
|
| 440 |
+
|
| 441 |
+
# whether to make the entropy loss positive or not through a (shifted) softplus
|
| 442 |
+
|
| 443 |
+
if self.training and self.experimental_softplus_entropy_loss:
|
| 444 |
+
entropy_aux_loss = F.softplus(entropy_aux_loss + self.entropy_loss_offset)
|
| 445 |
+
|
| 446 |
+
# commit loss
|
| 447 |
+
|
| 448 |
+
if self.training and self.commitment_loss_weight > 0.:
|
| 449 |
+
|
| 450 |
+
commit_loss = F.mse_loss(original_input, quantized.detach(), reduction = 'none')
|
| 451 |
+
|
| 452 |
+
if exists(mask):
|
| 453 |
+
commit_loss = commit_loss[mask]
|
| 454 |
+
|
| 455 |
+
commit_loss = commit_loss.mean()
|
| 456 |
+
else:
|
| 457 |
+
commit_loss = self.zero
|
| 458 |
+
|
| 459 |
+
# input back to original dtype if needed
|
| 460 |
+
|
| 461 |
+
if force_f32:
|
| 462 |
+
x = x.type(orig_dtype)
|
| 463 |
+
|
| 464 |
+
# merge back codebook dim
|
| 465 |
+
|
| 466 |
+
x = rearrange(x, 'b n c d -> b n (c d)')
|
| 467 |
+
|
| 468 |
+
# project out to feature dimension if needed
|
| 469 |
+
|
| 470 |
+
x = self.project_out(x)
|
| 471 |
+
|
| 472 |
+
# reconstitute image or video dimensions
|
| 473 |
+
|
| 474 |
+
if should_transpose:
|
| 475 |
+
x = unpack_one(x, ps, 'b * d')
|
| 476 |
+
x = rearrange(x, 'b ... d -> b d ...')
|
| 477 |
+
|
| 478 |
+
indices = unpack_one(indices, ps, 'b * c')
|
| 479 |
+
|
| 480 |
+
# whether to remove single codebook dim
|
| 481 |
+
|
| 482 |
+
if not self.keep_num_codebooks_dim:
|
| 483 |
+
indices = rearrange(indices, '... 1 -> ...')
|
| 484 |
+
|
| 485 |
+
# complete aux loss
|
| 486 |
+
|
| 487 |
+
aux_loss = entropy_aux_loss * self.entropy_loss_weight + commit_loss * self.commitment_loss_weight
|
| 488 |
+
|
| 489 |
+
# returns
|
| 490 |
+
|
| 491 |
+
ret = Return(x, indices, aux_loss)
|
| 492 |
+
|
| 493 |
+
if not return_loss_breakdown:
|
| 494 |
+
return ret
|
| 495 |
+
|
| 496 |
+
return ret, LossBreakdown(per_sample_entropy, codebook_entropy, commit_loss)
|
| 497 |
+
|
| 498 |
+
class GroupedResidualBSQ(Module):
|
| 499 |
+
def __init__(
|
| 500 |
+
self,
|
| 501 |
+
*,
|
| 502 |
+
dim,
|
| 503 |
+
groups = 1,
|
| 504 |
+
accept_image_fmap = False,
|
| 505 |
+
**kwargs
|
| 506 |
+
):
|
| 507 |
+
super().__init__()
|
| 508 |
+
self.dim = dim
|
| 509 |
+
self.groups = groups
|
| 510 |
+
assert (dim % groups) == 0
|
| 511 |
+
dim_per_group = dim // groups
|
| 512 |
+
|
| 513 |
+
self.accept_image_fmap = accept_image_fmap
|
| 514 |
+
|
| 515 |
+
self.rvqs = nn.ModuleList([])
|
| 516 |
+
|
| 517 |
+
for _ in range(groups):
|
| 518 |
+
self.rvqs.append(LFQ(
|
| 519 |
+
dim = dim_per_group,
|
| 520 |
+
**kwargs
|
| 521 |
+
))
|
| 522 |
+
|
| 523 |
+
self.codebook_size = self.rvqs[0].codebook_size
|
| 524 |
+
|
| 525 |
+
@property
|
| 526 |
+
def codebooks(self):
|
| 527 |
+
return torch.stack(tuple(rvq.codebooks for rvq in self.rvqs))
|
| 528 |
+
|
| 529 |
+
@property
|
| 530 |
+
def split_dim(self):
|
| 531 |
+
return 1 if self.accept_image_fmap else -1
|
| 532 |
+
|
| 533 |
+
def get_codes_from_indices(self, indices):
|
| 534 |
+
codes = tuple(rvq.get_codes_from_indices(chunk_indices) for rvq, chunk_indices in zip(self.rvqs, indices))
|
| 535 |
+
return torch.stack(codes)
|
| 536 |
+
|
| 537 |
+
def get_output_from_indices(self, indices):
|
| 538 |
+
outputs = tuple(rvq.get_output_from_indices(chunk_indices) for rvq, chunk_indices in zip(self.rvqs, indices))
|
| 539 |
+
return torch.cat(outputs, dim = self.split_dim)
|
| 540 |
+
|
| 541 |
+
def forward(
|
| 542 |
+
self,
|
| 543 |
+
x,
|
| 544 |
+
return_all_codes = False
|
| 545 |
+
):
|
| 546 |
+
shape, split_dim = x.shape, self.split_dim
|
| 547 |
+
assert shape[split_dim] == self.dim
|
| 548 |
+
|
| 549 |
+
# split the feature dimension into groups
|
| 550 |
+
|
| 551 |
+
x = x.chunk(self.groups, dim = split_dim)
|
| 552 |
+
|
| 553 |
+
forward_kwargs = dict(
|
| 554 |
+
)
|
| 555 |
+
|
| 556 |
+
# invoke residual vq on each group
|
| 557 |
+
|
| 558 |
+
out = tuple(rvq(chunk, **forward_kwargs) for rvq, chunk in zip(self.rvqs, x))
|
| 559 |
+
out = tuple(zip(*out))
|
| 560 |
+
|
| 561 |
+
# otherwise, get all the zipped outputs and combine them
|
| 562 |
+
|
| 563 |
+
quantized, all_indices, *maybe_aux_loss = out
|
| 564 |
+
|
| 565 |
+
quantized = torch.cat(quantized, dim = split_dim)
|
| 566 |
+
all_indices = torch.stack(all_indices)
|
| 567 |
+
|
| 568 |
+
ret = (quantized, all_indices, *maybe_aux_loss)
|
| 569 |
+
return ret
|
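A minimal usage sketch of the quantizer above. It assumes the class is exported as LFQ from modules.astral_quantization.bsq (the GroupedResidualBSQ wrapper constructs LFQ directly); the sizes are illustrative, not the values used by this repository's configs.

# Hedged sketch; constructor argument names are taken from the __init__ body above.
import torch
from modules.astral_quantization.bsq import LFQ

quantizer = LFQ(
    codebook_size=2 ** 16,   # implies codebook_dim = log2(codebook_size) = 16 bits
    dim=512,                 # feature dim != 16, so project_in / project_out are active
    num_codebooks=1,
    spherical=True,          # BSQ behaviour: codes are l2-normalized and scaled
).eval()

x = torch.randn(2, 100, 512)                     # (batch, sequence, feature)
quantized, indices, aux_loss = quantizer(x)      # Return namedtuple unpacks to 3 fields
codes = quantizer.indices_to_codes(indices)      # decode indices back to feature space
print(quantized.shape, indices.shape, codes.shape)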
modules/astral_quantization/convnext.py
ADDED
@@ -0,0 +1,209 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import List


class ConvNextV2LayerNorm(nn.Module):
    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError(f"Unsupported data format: {self.data_format}")
        self.normalized_shape = (normalized_shape,)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.data_format == "channels_last":
            x = torch.nn.functional.layer_norm(
                x, self.normalized_shape, self.weight, self.bias, self.eps
            )
        elif self.data_format == "channels_first":
            input_dtype = x.dtype
            x = x.float()
            u = x.mean(1, keepdim=True)
            s = (x - u).pow(2).mean(1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.eps)
            x = x.to(dtype=input_dtype)
            x = self.weight[None, :, None] * x + self.bias[None, :, None]
        return x


class GRN(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, dim))

    def forward(self, x):
        Gx = torch.norm(x, p=2, dim=1, keepdim=True)
        Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
        return self.gamma * (x * Nx) + self.beta + x

class InterpolationLayer(nn.Module):
    def __init__(self, ):  # this is a default of 1 / 50 * (44100 / 512) / 4
        super().__init__()
        pass

    def forward(self, x: torch.Tensor, target_len: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        x = F.interpolate(x, size=target_len, mode='linear')
        return x

class ConvNeXtV2Stage(nn.Module):
    def __init__(
        self,
        dim: int = 512,
        intermediate_dim: int = 2048,
        num_blocks: int = 1,
        dilation: int = 1,
        downsample_layer_indices: List[int] = None,
        downsample_factors: List[int] = None,
        upsample_layer_indices: List[int] = None,
        upsample_factors: List[int] = None,
        interpolation_layer_indices: List[int] = None,
        input_dim: int = None,
        output_dim: int = None,
        gin_channels: int = 0,
    ):
        super().__init__()
        # maybe downsample layers
        if downsample_layer_indices is not None:
            assert downsample_factors is not None
            self.downsample_blocks = nn.ModuleList(
                [
                    nn.Sequential(
                        ConvNextV2LayerNorm(dim, data_format="channels_first"),
                        nn.Conv1d(
                            dim, dim, kernel_size=downsample_factor, stride=downsample_factor
                        ),
                    ) for _, downsample_factor in zip(downsample_layer_indices, downsample_factors)
                ]
            )
            self.downsample_layer_indices = downsample_layer_indices
        else:
            self.downsample_blocks = nn.ModuleList()
            self.downsample_layer_indices = []

        # maybe upsample layers
        if upsample_layer_indices is not None:
            assert upsample_factors is not None
            self.upsample_blocks = nn.ModuleList(
                [
                    nn.Sequential(
                        ConvNextV2LayerNorm(dim, data_format="channels_first"),
                        nn.ConvTranspose1d(
                            dim, dim, kernel_size=upsample_factor, stride=upsample_factor
                        ),
                    ) for _, upsample_factor in zip(upsample_layer_indices, upsample_factors)
                ]
            )
            self.upsample_layer_indices = upsample_layer_indices
        else:
            self.upsample_blocks = nn.ModuleList()
            self.upsample_layer_indices = []

        # maybe interpolation layers
        if interpolation_layer_indices is not None:
            self.interpolation_blocks = nn.ModuleList(
                [
                    InterpolationLayer()
                    for _ in interpolation_layer_indices
                ]
            )
            self.interpolation_layer_indices = interpolation_layer_indices
        else:
            self.interpolation_blocks = nn.ModuleList()
            self.interpolation_layer_indices = []

        # main blocks
        self.blocks = nn.ModuleList(
            [
                ConvNeXtV2Block(
                    dim=dim,
                    intermediate_dim=intermediate_dim,
                    dilation=dilation,
                )
                for _ in range(num_blocks)
            ]
        )
        # maybe input and output projections
        if input_dim is not None and input_dim != dim:
            self.input_projection = nn.Conv1d(input_dim, dim, kernel_size=1)
        else:
            self.input_projection = nn.Identity()
        if output_dim is not None and output_dim != dim:
            self.output_projection = nn.Conv1d(dim, output_dim, kernel_size=1)
        else:
            self.output_projection = nn.Identity()

        if gin_channels > 0:
            self.gin = nn.Conv1d(gin_channels, dim, kernel_size=1)

    def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        x = self.input_projection(x)  # B, D, T
        if hasattr(self, 'gin'):
            g = kwargs['g']
            x = x + self.gin(g)
        # pad to a multiple of cumprod(downsample_factors)
        if len(self.downsample_blocks) > 0:
            downsample_factor = 1
            for factor in self.downsample_blocks:
                downsample_factor *= factor[1].stride[0]
            pad_len = downsample_factor - x.size(-1) % downsample_factor
            if pad_len > 0:
                x = torch.cat([x, torch.zeros_like(x[:, :, :pad_len])], dim=-1)

        # main blocks
        for layer_idx, block in enumerate(self.blocks):
            if layer_idx in self.downsample_layer_indices:
                x = self.downsample_blocks[self.downsample_layer_indices.index(layer_idx)](x)
            if layer_idx in self.upsample_layer_indices:
                x = self.upsample_blocks[self.upsample_layer_indices.index(layer_idx)](x)
            if layer_idx in self.interpolation_layer_indices:
                x = self.interpolation_blocks[self.interpolation_layer_indices.index(layer_idx)](x, target_len=kwargs['target_len'])
            x = block(x)
        x = self.output_projection(x)
        return x

    def setup_caches(self, *args, **kwargs):
        pass


class ConvNeXtV2Block(nn.Module):
    def __init__(
        self,
        dim: int,
        intermediate_dim: int,
        dilation: int = 1,
    ):
        super().__init__()
        padding = (dilation * (7 - 1)) // 2
        self.dwconv = nn.Conv1d(
            dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
        )  # depthwise conv
        self.norm = ConvNextV2LayerNorm(dim, data_format="channels_first")
        self.pwconv1 = nn.Linear(
            dim, intermediate_dim
        )  # pointwise/1x1 convs, implemented with linear layers
        self.act = nn.GELU()
        self.grn = GRN(intermediate_dim)
        self.pwconv2 = nn.Linear(intermediate_dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        x = self.dwconv(x)
        x = self.norm(x)
        x = x.transpose(1, 2)  # b d n -> b n d
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.grn(x)
        x = self.pwconv2(x)
        x = x.transpose(1, 2)  # b n d -> b d n
        return residual + x
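A minimal sketch of running the 1D ConvNeXt-V2 stage above in isolation; the shapes and constructor values are illustrative, not the ones set in this repo's YAML configs.

# Hedged sketch; channels-first (B, C, T) tensors, as the forward above expects.
import torch
from modules.astral_quantization.convnext import ConvNeXtV2Stage

stage = ConvNeXtV2Stage(
    dim=512,
    intermediate_dim=2048,
    num_blocks=4,
    input_dim=768,    # projected to 512 by a 1x1 conv
    output_dim=512,
)

x = torch.randn(2, 768, 200)   # (batch, channels, time)
y = stage(x)
print(y.shape)                 # (2, 512, 200) when no down/up-sampling indices are configured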
modules/astral_quantization/default_model.py
ADDED
@@ -0,0 +1,73 @@
import torch
from transformers import AutoTokenizer, AutoModel, Wav2Vec2FeatureExtractor

class AstralQuantizer(torch.nn.Module):
    def __init__(
        self,
        tokenizer_name: str,
        ssl_model_name: str,
        ssl_output_layer: int,
        encoder: torch.nn.Module,
        quantizer: torch.nn.Module,
        skip_ssl: bool = False,
    ):
        super().__init__()
        self.encoder = encoder
        self.quantizer = quantizer
        self.tokenizer_name = tokenizer_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        # Load SSL model from Huggingface
        self.ssl_model_name = ssl_model_name
        self.ssl_output_layer = ssl_output_layer
        self.ssl_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(ssl_model_name)

        if skip_ssl:  # in case the same SSL model has been loaded somewhere else
            self.ssl_model = None
        else:
            self.ssl_model = AutoModel.from_pretrained(ssl_model_name).eval()
            self.ssl_model.encoder.layers = self.ssl_model.encoder.layers[:ssl_output_layer]
            self.ssl_model.encoder.layer_norm = torch.nn.Identity()

    def load_separate_checkpoint(self, checkpoint_path):
        params = torch.load(checkpoint_path, map_location='cpu')['net']
        for key in params.keys():
            for k in list(params[key].keys()):
                if k.startswith("module."):
                    params[key][k[len("module."):]] = params[key][k]
                    del params[key][k]
        self.encoder.load_state_dict(params['encoder'])
        self.quantizer.load_state_dict(params['vq'])
        # note: `decoder` / `asr_decoder` are not created in __init__; these branches assume
        # they have been attached to the module externally before this method is called
        if self.decoder is not None:
            self.decoder.load_state_dict(params['decoder'])
        if self.asr_decoder is not None:
            self.asr_decoder.load_state_dict(params['predictor'], strict=False)

    def forward(self, waves_16k, wave_16k_lens, ssl_model=None):
        ssl_fn = self.ssl_model if self.ssl_model else ssl_model
        assert ssl_fn is not None, "In case in-class SSL model loading is skipped, external ssl_model must be provided"
        waves_16k_input_list = [
            waves_16k[bib, :wave_16k_lens[bib]].cpu().numpy()
            for bib in range(len(waves_16k))
        ]
        alt_inputs = self.ssl_feature_extractor(
            waves_16k_input_list,
            return_tensors='pt',
            return_attention_mask=True,
            padding=True,
            sampling_rate=16000
        ).to(waves_16k.device)
        feature_lens = alt_inputs.data['attention_mask'].sum(-1) // 320  # frame rate of hubert is 50 Hz

        outputs = ssl_fn(
            alt_inputs.input_values,
            attention_mask=alt_inputs.attention_mask,
        )
        last_hidden_states = outputs.last_hidden_state
        last_hidden_states = last_hidden_states[:, :feature_lens.max(), :]
        feature_lens = feature_lens.clamp(max=last_hidden_states.size(1))
        last_hidden_states = last_hidden_states.transpose(1, 2)
        x_hidden = self.encoder(last_hidden_states, feature_lens)
        x_hidden = x_hidden.transpose(1, 2)
        x_quantized, indices = self.quantizer(x_hidden)[:2]
        return x_quantized, indices, feature_lens
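A hedged sketch of the calling convention for AstralQuantizer.forward. The HF checkpoint names and the stand-in encoder/quantizer below are placeholders chosen only to satisfy the interfaces used in forward(); they are not the modules or checkpoints configured by this Space's YAML files.

import torch
import torch.nn as nn
from modules.astral_quantization.default_model import AstralQuantizer

class _Encoder(nn.Module):          # stand-in: (B, C_ssl, T), lens -> (B, D, T)
    def __init__(self, c_in=768, d=512):
        super().__init__()
        self.proj = nn.Conv1d(c_in, d, 1)
    def forward(self, x, lens):
        return self.proj(x)

class _Quantizer(nn.Module):        # stand-in: (B, T, D) -> (quantized, indices)
    def forward(self, x):
        return x, x.argmax(dim=-1)

aq = AstralQuantizer(
    tokenizer_name="gpt2",                        # placeholder tokenizer
    ssl_model_name="facebook/hubert-base-ls960",  # placeholder 16 kHz SSL model (hidden size 768)
    ssl_output_layer=9,
    encoder=_Encoder(),
    quantizer=_Quantizer(),
)

waves_16k = torch.randn(1, 16000 * 2)             # 2 s of padded 16 kHz audio
lens = torch.tensor([waves_16k.shape[-1]])
x_quantized, indices, feature_lens = aq(waves_16k, lens)  # roughly 50 frames per second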
modules/astral_quantization/transformer.py
ADDED
@@ -0,0 +1,254 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import functional as F
import time

def find_multiple(n: int, k: int) -> int:
    if n % k == 0:
        return n
    return n + k - (n % k)

class AdaptiveLayerNorm(nn.Module):
    r"""Adaptive Layer Normalization"""

    def __init__(self, d_model, norm) -> None:
        super(AdaptiveLayerNorm, self).__init__()
        self.project_layer = nn.Linear(d_model, 2 * d_model)
        self.norm = norm
        self.d_model = d_model
        self.eps = self.norm.eps

    def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor:
        if embedding is None:
            return self.norm(input)
        weight, bias = torch.split(
            self.project_layer(embedding),
            split_size_or_sections=self.d_model,
            dim=-1,
        )
        return weight * self.norm(input) + bias


@dataclass
class ModelArgs:
    block_size: int = 2048
    vocab_size: int = 32000
    n_layer: int = 32
    n_head: int = 32
    dim: int = 4096
    intermediate_size: int = None
    n_local_heads: int = -1
    head_dim: int = 64
    rope_base: float = 10000
    norm_eps: float = 1e-5
    has_cross_attention: bool = False
    context_dim: int = 0
    is_causal: bool = False
    dropout_rate: float = 0.1
    attn_dropout_rate: float = 0.1

    def __post_init__(self):
        if self.n_local_heads == -1:
            self.n_local_heads = self.n_head
        if self.intermediate_size is None:
            hidden_dim = 4 * self.dim
            n_hidden = int(2 * hidden_dim / 3)
            self.intermediate_size = find_multiple(n_hidden, 256)
        # self.head_dim = self.dim // self.n_head

class Transformer(nn.Module):
    def __init__(self, config: ModelArgs) -> None:
        super().__init__()
        self.config = config

        self.layers = nn.ModuleList(TransformerBlock(config) for _ in range(config.n_layer))
        self.norm = AdaptiveLayerNorm(config.dim, RMSNorm(config.dim, eps=config.norm_eps))

        self.max_batch_size = -1
        self.max_seq_length = config.block_size
        freqs_cis = precompute_freqs_cis(self.config.block_size, self.config.head_dim,
                                         self.config.rope_base)
        self.register_buffer("freqs_cis", freqs_cis)

        causal_mask = torch.tril(
            torch.ones(self.max_seq_length, self.max_seq_length, dtype=torch.bool)
        )
        self.register_buffer("causal_mask", causal_mask)

    def forward(self,
                x: Tensor,
                c: Tensor,
                input_pos: Optional[Tensor] = None,
                mask: Optional[Tensor] = None,
                context: Optional[Tensor] = None,
                context_input_pos: Optional[Tensor] = None,
                cross_attention_mask: Optional[Tensor] = None,
                ) -> Tensor:
        if mask is None:
            mask = self.causal_mask[:x.size(1), :x.size(1)]
        else:
            mask = mask[..., input_pos]
        freqs_cis = self.freqs_cis[input_pos]
        if context is not None:
            context_freqs_cis = self.freqs_cis[context_input_pos]
        else:
            context_freqs_cis = None
        skip_in_x_list = []
        for i, layer in enumerate(self.layers):
            x = layer(x, c, freqs_cis, mask, context, context_freqs_cis, cross_attention_mask)
        x = self.norm(x, c)
        return x


class TransformerBlock(nn.Module):
    def __init__(self, config: ModelArgs) -> None:
        super().__init__()
        self.attention = Attention(config)
        self.feed_forward = FeedForward(config)
        self.ffn_norm = AdaptiveLayerNorm(config.dim, RMSNorm(config.dim, eps=config.norm_eps))
        self.attention_norm = AdaptiveLayerNorm(config.dim, RMSNorm(config.dim, eps=config.norm_eps))

        if config.has_cross_attention:
            self.has_cross_attention = True
            self.cross_attention = Attention(config, is_cross_attention=True)
            self.cross_attention_norm = AdaptiveLayerNorm(config.dim, RMSNorm(config.dim, eps=config.norm_eps))
        else:
            self.has_cross_attention = False

    def forward(self,
                x: Tensor,
                c: Tensor,
                freqs_cis: Tensor,
                mask: Tensor,
                context: Optional[Tensor] = None,
                context_freqs_cis: Optional[Tensor] = None,
                cross_attention_mask: Optional[Tensor] = None,
                ) -> Tensor:
        #time_attn_start = time.time()
        h = x + self.attention(self.attention_norm(x, c), freqs_cis, mask)
        #print(f"time take for attention of sequence length {x.shape[1]} is {time.time() - time_attn_start}")
        if self.has_cross_attention:
            h = h + self.cross_attention(self.cross_attention_norm(h, c), freqs_cis, cross_attention_mask, context, context_freqs_cis)
        out = h + self.feed_forward(self.ffn_norm(h, c))
        return out


class Attention(nn.Module):
    def __init__(self, config: ModelArgs, is_cross_attention: bool = False):
        super().__init__()
        assert config.dim % config.n_head == 0

        total_head_dim = (config.n_head + 2 * config.n_local_heads) * config.head_dim
        # key, query, value projections for all heads, but in a batch
        if is_cross_attention:
            self.wq = nn.Linear(config.dim, config.n_head * config.head_dim, bias=False)
            self.wkv = nn.Linear(config.context_dim, 2 * config.n_local_heads * config.head_dim, bias=False)
        else:
            self.wqkv = nn.Linear(config.dim, total_head_dim, bias=False)
        self.wo = nn.Linear(config.head_dim * config.n_head, config.dim, bias=False)
        self.kv_cache = None

        self.n_head = config.n_head
        self.head_dim = config.head_dim
        self.n_local_heads = config.n_local_heads
        self.dim = config.dim
        self.attn_dropout_rate = config.attn_dropout_rate

    def forward(self,
                x: Tensor,
                freqs_cis: Tensor,
                mask: Tensor,
                context: Optional[Tensor] = None,
                context_freqs_cis: Optional[Tensor] = None,
                ) -> Tensor:
        bsz, seqlen, _ = x.shape

        kv_size = self.n_local_heads * self.head_dim
        if context is None:
            q, k, v = self.wqkv(x).split([kv_size, kv_size, kv_size], dim=-1)
            context_seqlen = seqlen
        else:
            q = self.wq(x)
            k, v = self.wkv(context).split([kv_size, kv_size], dim=-1)
            context_seqlen = context.shape[1]

        q = q.view(bsz, seqlen, self.n_head, self.head_dim)
        k = k.view(bsz, context_seqlen, self.n_local_heads, self.head_dim)
        v = v.view(bsz, context_seqlen, self.n_local_heads, self.head_dim)

        q = apply_rotary_emb(q, freqs_cis)
        k = apply_rotary_emb(k, context_freqs_cis if context_freqs_cis is not None else freqs_cis)

        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))

        k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
        v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
        y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=self.attn_dropout_rate if self.training else 0.0)

        y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.head_dim * self.n_head)

        y = self.wo(y)
        return y


class FeedForward(nn.Module):
    def __init__(self, config: ModelArgs) -> None:
        super().__init__()
        self.w1 = nn.Linear(config.dim, config.intermediate_size, bias=False)
        self.w3 = nn.Linear(config.dim, config.intermediate_size, bias=False)
        self.w2 = nn.Linear(config.intermediate_size, config.dim, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x: Tensor) -> Tensor:
        return self.w2(self.dropout(F.silu(self.w1(x)) * self.w3(x)))


class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)

    def forward(self, x: Tensor) -> Tensor:
        output = self._norm(x.float()).type_as(x)
        return output * self.weight


def precompute_freqs_cis(
    seq_len: int, n_elem: int, base: int = 10000,
    dtype: torch.dtype = torch.bfloat16
) -> Tensor:
    freqs = 1.0 / (base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem))
    t = torch.arange(seq_len, device=freqs.device)
    freqs = torch.outer(t, freqs)
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
    cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
    return cache.to(dtype=dtype)


def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
    xshaped = x.float().reshape(*x.shape[:-1], -1, 2)
    freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2)
    x_out2 = torch.stack(
        [
            xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
            xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
        ],
        -1,
    )

    x_out2 = x_out2.flatten(3)
    return x_out2.type_as(x)
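A hedged sketch of driving the AdaLN transformer above: `c` is the conditioning embedding consumed by AdaptiveLayerNorm, `input_pos` selects rotary positions from the precomputed cache, and the lower-triangular mask is applied when no mask is passed. The sizes are illustrative, not those in configs/v2/*.yaml.

import torch
from modules.astral_quantization.transformer import ModelArgs, Transformer

args = ModelArgs(block_size=1024, n_layer=4, n_head=8, dim=512, head_dim=64)
model = Transformer(args)

b, t = 2, 128
x = torch.randn(b, t, args.dim)          # token features
c = torch.randn(b, 1, args.dim)          # conditioning vector broadcast by AdaptiveLayerNorm
input_pos = torch.arange(t)              # rotary position indices
out = model(x, c, input_pos=input_pos)   # (b, t, dim)
print(out.shape)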
modules/audio.py
CHANGED
@@ -1,82 +1,82 @@
(The before and after listings in this view are textually identical, so the change is most likely a whitespace or line-ending normalization only. The file reads as follows.)
import numpy as np
import torch
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn
from scipy.io.wavfile import read

MAX_WAV_VALUE = 32768.0


def load_wav(full_path):
    sampling_rate, data = read(full_path)
    return data, sampling_rate


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)


def dynamic_range_decompression(x, C=1):
    return np.exp(x) / C


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


mel_basis = {}
hann_window = {}


def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global mel_basis, hann_window  # pylint: disable=global-statement
    if f"{str(sampling_rate)}_{str(fmax)}_{str(y.device)}" not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[str(sampling_rate) + "_" + str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
        hann_window[str(sampling_rate) + "_" + str(y.device)] = torch.hann_window(win_size).to(y.device)

    y = torch.nn.functional.pad(
        y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
    )
    y = y.squeeze(1)

    spec = torch.view_as_real(
        torch.stft(
            y,
            n_fft,
            hop_length=hop_size,
            win_length=win_size,
            window=hann_window[str(sampling_rate) + "_" + str(y.device)],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )

    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    spec = torch.matmul(mel_basis[str(sampling_rate) + "_" + str(fmax) + "_" + str(y.device)], spec)
    spec = spectral_normalize_torch(spec)

    return spec
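A hedged usage sketch of mel_spectrogram above. The STFT/mel values shown are illustrative for 44.1 kHz audio; the values actually used by this Space come from the preset YAMLs under configs/, not from this example.

import torch
from modules.audio import mel_spectrogram

wav = torch.randn(1, 44100)            # (batch, samples), expected roughly in [-1, 1]
mel = mel_spectrogram(
    wav, n_fft=2048, num_mels=128, sampling_rate=44100,
    hop_size=512, win_size=2048, fmin=0, fmax=None, center=False,
)
print(mel.shape)                       # (1, 128, frames), log-compressed mel magnitudes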
modules/bigvgan/__pycache__/activations.cpython-310.pyc                               ADDED    Binary file (4 kB)
modules/bigvgan/__pycache__/bigvgan.cpython-310.pyc                                   ADDED    Binary file (11.8 kB)
modules/bigvgan/__pycache__/env.cpython-310.pyc                                       ADDED    Binary file (796 Bytes)
modules/bigvgan/__pycache__/meldataset.cpython-310.pyc                                ADDED    Binary file (8.54 kB)
modules/bigvgan/__pycache__/utils.cpython-310.pyc                                     ADDED    Binary file (2.84 kB)
modules/bigvgan/alias_free_activation/cuda/__pycache__/__init__.cpython-310.pyc       ADDED    Binary file (158 Bytes)
modules/bigvgan/alias_free_activation/cuda/__pycache__/activation1d.cpython-310.pyc   ADDED    Binary file (2.34 kB)
modules/bigvgan/alias_free_activation/cuda/__pycache__/load.cpython-310.pyc           ADDED    Binary file (1.99 kB)
modules/bigvgan/alias_free_activation/cuda/activation1d.py
CHANGED
@@ -3,10 +3,10 @@
 
 import torch
 import torch.nn as nn
-from …                                                  (original absolute import; truncated in this view)
+from ..torch.resample import UpSample1d, DownSample1d
 
 # load fused CUDA kernel: this enables importing anti_alias_activation_cuda
-from …                                                  (original absolute import; truncated in this view)
+from ..cuda import load
 
 anti_alias_activation_cuda = load.load()
 
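The change above swaps absolute imports for package-relative ones, so the vendored BigVGAN code resolves through its parent packages instead of requiring modules/bigvgan on sys.path. A hedged illustration (import path assumed from this repo's layout); note the module-level load.load() call means importing it JIT-compiles the fused CUDA kernel, which needs nvcc and a matching toolchain (see the captured build.ninja below).

# Hedged illustration only; requires a working CUDA build environment.
from modules.bigvgan.alias_free_activation.cuda import activation1d  # noqa: F401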
modules/bigvgan/alias_free_activation/cuda/build/.ninja_deps
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e233713716a5778577f244b0f310944ff26d3079ce0e42491791da7d42e363c1
size 522068
modules/bigvgan/alias_free_activation/cuda/build/.ninja_log
ADDED
@@ -0,0 +1,7 @@
# ninja log v5
9 39554 7516864785377831 anti_alias_activation.o 3a177f31dd72c43c
13 152601 7516865914203767 anti_alias_activation_cuda.cuda.o 2d613e7382d803fd
152628 153062 7516865920541751 anti_alias_activation_cuda.pyd f6366e9bdfb27f7
128 50503 7654004565901584 anti_alias_activation.o 9ed3213f2e0d0858
133 176837 7654005827401976 anti_alias_activation_cuda.cuda.o a679b6661c609136
176839 177401 7654005835005523 anti_alias_activation_cuda.pyd f6366e9bdfb27f7
modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation.o
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:74c2824b05582070b69f51ec588aadb268c4fddf18fbb4590f901d1cdf32185c
size 3246655
modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation_cuda.cuda.o
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:86c48de557041de7ebaff7926b5f346cc5e4e2dddc6cf5b88409f6cb161db0f4
size 4724513
modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation_cuda.exp    ADDED    Binary file (25.1 kB)
modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation_cuda.lib    ADDED    Binary file (43.7 kB)
modules/bigvgan/alias_free_activation/cuda/build/anti_alias_activation_cuda.pyd
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:db37ea2dd31dfe67e68ee6019877d14638c41724ff9342c55f638f4d2cda3d03
size 2454528
modules/bigvgan/alias_free_activation/cuda/build/build.ninja
ADDED
@@ -0,0 +1,38 @@
ninja_required_version = 1.3
cxx = cl
nvcc = C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin\nvcc

cflags = -DTORCH_EXTENSION_NAME=anti_alias_activation_cuda -DTORCH_API_INCLUDE_EXTENSION_H -ID:\Anaconda\envs\vocos\lib\site-packages\torch\include -ID:\Anaconda\envs\vocos\lib\site-packages\torch\include\torch\csrc\api\include "-IC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include" -ID:\Anaconda\envs\vocos\Include /std:c++17 -O3 /MD /wd4819 /wd4251 /wd4244 /wd4267 /wd4275 /wd4018 /wd4190 /wd4624 /wd4067 /wd4068 /EHsc
post_cflags =
cuda_cflags = -Xcudafe --diag_suppress=dll_interface_conflict_dllexport_assumed -Xcudafe --diag_suppress=dll_interface_conflict_none_assumed -Xcudafe --diag_suppress=field_without_dll_interface -Xcudafe --diag_suppress=base_class_has_different_dll_interface -Xcompiler /EHsc -Xcompiler /wd4068 -Xcompiler /wd4067 -Xcompiler /wd4624 -Xcompiler /wd4190 -Xcompiler /wd4018 -Xcompiler /wd4275 -Xcompiler /wd4267 -Xcompiler /wd4244 -Xcompiler /wd4251 -Xcompiler /wd4819 -Xcompiler /MD -DTORCH_EXTENSION_NAME=anti_alias_activation_cuda -DTORCH_API_INCLUDE_EXTENSION_H -ID:\Anaconda\envs\vocos\lib\site-packages\torch\include -ID:\Anaconda\envs\vocos\lib\site-packages\torch\include\torch\csrc\api\include "-IC:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\include" -ID:\Anaconda\envs\vocos\Include -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_86,code=sm_86 -std=c++17 -O3 -gencode arch=compute_70,code=sm_70 --use_fast_math -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda -gencode arch=compute_80,code=sm_80
cuda_post_cflags =
cuda_dlink_post_cflags =
sycl_dlink_post_cflags =
ldflags = /DLL c10.lib c10_cuda.lib torch_cpu.lib torch_cuda.lib -INCLUDE:?warp_size@cuda@at@@YAHXZ torch.lib /LIBPATH:D:\Anaconda\envs\vocos\lib\site-packages\torch\lib torch_python.lib /LIBPATH:D:\Anaconda\envs\vocos\libs "/LIBPATH:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\lib\x64" cudart.lib

rule compile
  command = cl /showIncludes $cflags -c $in /Fo$out $post_cflags
  deps = msvc

rule cuda_compile
  depfile = $out.d
  deps = gcc
  command = $nvcc --generate-dependencies-with-compile --dependency-output $out.d $cuda_cflags -c $in -o $out $cuda_post_cflags

rule link
  command = "D$:\Visual Studio\VC\Tools\MSVC\14.29.30133\bin\Hostx86\x64/link.exe" $in /nologo $ldflags /out:$out

build anti_alias_activation.o: compile D$:\seed-vc\modules\bigvgan\alias_free_activation\cuda\anti_alias_activation.cpp
build anti_alias_activation_cuda.cuda.o: cuda_compile D$:\seed-vc\modules\bigvgan\alias_free_activation\cuda\anti_alias_activation_cuda.cu

build anti_alias_activation_cuda.pyd: link anti_alias_activation.o anti_alias_activation_cuda.cuda.o

default anti_alias_activation_cuda.pyd
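This build.ninja and the .o/.pyd artifacts above look like the files torch's JIT extension loader writes into its build directory. A hedged sketch of the kind of call that produces them; the real invocation lives in alias_free_activation/cuda/load.py, and the name and source list here merely mirror the targets shown above.

from torch.utils.cpp_extension import load

anti_alias_activation_cuda = load(
    name="anti_alias_activation_cuda",
    sources=[
        "modules/bigvgan/alias_free_activation/cuda/anti_alias_activation.cpp",
        "modules/bigvgan/alias_free_activation/cuda/anti_alias_activation_cuda.cu",
    ],
    extra_cuda_cflags=["-O3", "--use_fast_math"],  # illustrative flags
    verbose=True,
)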
modules/bigvgan/alias_free_activation/torch/__pycache__/__init__.cpython-310.pyc    ADDED    Binary file (217 Bytes)
modules/bigvgan/alias_free_activation/torch/__pycache__/act.cpython-310.pyc         ADDED    Binary file (1.05 kB)