Shen Feiyu committed on
Commit
71cd91e
·
1 Parent(s): a4ec42e

init at 250916

README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: FireRedTTS2
3
- emoji: 🐨
4
- colorFrom: gray
5
- colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.45.0
8
  app_file: app.py
 
1
  ---
2
+ title: Tts2 Test
3
+ emoji: 🌖
4
+ colorFrom: pink
5
+ colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.45.0
8
  app_file: app.py
app.py ADDED
@@ -0,0 +1,357 @@
1
+ import re
2
+ import spaces
3
+ import gradio as gr
4
+ from tqdm import tqdm
5
+ from huggingface_hub import snapshot_download
6
+ from argparse import ArgumentParser
7
+ from typing import Literal, List, Tuple
8
+ from fireredtts2.fireredtts2 import FireRedTTS2
9
+
10
+
11
+ # ================================================
12
+ # FireRedTTS2 Model
13
+ # ================================================
14
+ # Global model instance
15
+ model: FireRedTTS2 = None
16
+
17
+
18
+ def initiate_model(pretrained_dir: str, device="cuda"):
19
+ global model
20
+ if model is None:
21
+ model = FireRedTTS2(
22
+ pretrained_dir=pretrained_dir,
23
+ gen_type="dialogue",
24
+ device=device,
25
+ )
26
+
27
+
28
+ # ================================================
29
+ # Gradio
30
+ # ================================================
31
+
32
+ # i18n
33
+ _i18n_key2lang_dict = dict(
34
+ # Title markdown
35
+ title_md_desc=dict(
36
+ en="FireRedTTS-2 🔥 Dialogue Generation",
37
+ zh="FireRedTTS-2 🔥 对话生成",
38
+ ),
39
+ # Voice mode radio
40
+ voice_mode_label=dict(
41
+ en="Voice Mode",
42
+ zh="音色模式",
43
+ ),
44
+ voice_model_choice1=dict(
45
+ en="Voice Clone",
46
+ zh="音色克隆",
47
+ ),
48
+ voice_model_choice2=dict(
49
+ en="Random Voice",
50
+ zh="随机音色",
51
+ ),
52
+ # Speaker1 Prompt
53
+ spk1_prompt_audio_label=dict(
54
+ en="Speaker 1 Prompt Audio",
55
+ zh="说话人 1 参考语音",
56
+ ),
57
+ spk1_prompt_text_label=dict(
58
+ en="Speaker 1 Prompt Text",
59
+ zh="说话人 1 参考文本",
60
+ ),
61
+ spk1_prompt_text_placeholder=dict(
62
+ en="[S1] text of speaker 1 prompt audio.",
63
+ zh="[S1] 说话人 1 参考文本",
64
+ ),
65
+ # Speaker2 Prompt
66
+ spk2_prompt_audio_label=dict(
67
+ en="Speaker 2 Prompt Audio",
68
+ zh="说话人 2 参考语音",
69
+ ),
70
+ spk2_prompt_text_label=dict(
71
+ en="Speaker 2 Prompt Text",
72
+ zh="说话人 2 参考文本",
73
+ ),
74
+ spk2_prompt_text_placeholder=dict(
75
+ en="[S2] text of speaker 2 prompt audio.",
76
+ zh="[S2] 说话人 2 参考文本",
77
+ ),
78
+ # Dialogue input textbox
79
+ dialogue_text_input_label=dict(
80
+ en="Dialogue Text Input",
81
+ zh="对话文本输入",
82
+ ),
83
+ dialogue_text_input_placeholder=dict(
84
+ en="[S1]text[S2]text[S1]text...",
85
+ zh="[S1]文本[S2]文本[S1]文本...",
86
+ ),
87
+ # Generate button
88
+ generate_btn_label=dict(
89
+ en="Generate Audio",
90
+ zh="合成",
91
+ ),
92
+ # Generated audio
93
+ generated_audio_label=dict(
94
+ en="Generated Dialogue Audio",
95
+ zh="合成的对话音频",
96
+ ),
97
+ # Warning 1: invalid prompt text
98
+ warn_invalid_spk1_prompt_text=dict(
99
+ en='Invalid speaker 1 prompt text, should strictly follow: "[S1]xxx"',
100
+ zh='说话人 1 参考文本不合规,格式:"[S1]xxx"',
101
+ ),
102
+ warn_invalid_spk2_prompt_text=dict(
103
+ en='Invalid speaker 2 prompt text, should strictly follow: "[S2]xxx"',
104
+ zh='说话人 2 参考文本不合规,格式:"[S2]xxx"',
105
+ ),
106
+ # Warning 2: invalid dialogue input text
107
+ warn_invalid_dialogue_text=dict(
108
+ en='Invalid dialogue input text, should strictly follow: "[S1]xxx[S2]xxx..."',
109
+ zh='对话文本输入不合规,格式:"[S1]xxx[S2]xxx..."',
110
+ ),
111
+ # Warning 3: incomplete prompt info
112
+ warn_incomplete_prompt=dict(
113
+ en="Please provide prompt audio and text for both speaker 1 and speaker 2",
114
+ zh="请提供说话人 1 与说话人 2 的参考语音与参考文本",
115
+ ),
116
+ )
117
+
118
+ global_lang: Literal["zh", "en"] = "zh"
119
+
120
+
121
+ def i18n(key):
122
+ global global_lang
123
+ return _i18n_key2lang_dict[key][global_lang]
124
+
125
+
126
+ def check_monologue_text(text: str, prefix: str = None) -> bool:
127
+ text = text.strip()
128
+ # Check speaker tags
129
+ if prefix is not None and (not text.startswith(prefix)):
130
+ return False
131
+ # Remove prefix
132
+ if prefix is not None:
133
+ text = text.removeprefix(prefix)
134
+ text = text.strip()
135
+ # If empty?
136
+ if len(text) == 0:
137
+ return False
138
+ return True
139
+
140
+
141
+ def check_dialogue_text(text_list: List[str]) -> bool:
142
+ if len(text_list) == 0:
143
+ return False
144
+ for text in text_list:
145
+ if not (
146
+ check_monologue_text(text, "[S1]")
147
+ or check_monologue_text(text, "[S2]")
148
+ or check_monologue_text(text, "[S3]")
149
+ or check_monologue_text(text, "[S4]")
150
+ ):
151
+ return False
152
+ return True
153
+
154
+
155
+ @spaces.GPU(duration=200)
156
+ def dialogue_synthesis_function(
157
+ target_text: str,
158
+ voice_mode: Literal[0, 1] = 0, # 0 means voice clone
159
+ spk1_prompt_text: str | None = "",
160
+ spk1_prompt_audio: str | None = None,
161
+ spk2_prompt_text: str | None = "",
162
+ spk2_prompt_audio: str | None = None,
163
+ ):
164
+ # Voice clone mode, check prompt info
165
+ if voice_mode == 0:
166
+ prompt_has_value = [
167
+ spk1_prompt_text != "",
168
+ spk1_prompt_audio is not None,
169
+ spk2_prompt_text != "",
170
+ spk2_prompt_audio is not None,
171
+ ]
172
+ if not all(prompt_has_value):
173
+ gr.Warning(message=i18n("warn_incomplete_prompt"))
174
+ return None
175
+ if not check_monologue_text(spk1_prompt_text, "[S1]"):
176
+ gr.Warning(message=i18n("warn_invalid_spk1_prompt_text"))
177
+ return None
178
+ if not check_monologue_text(spk2_prompt_text, "[S2]"):
179
+ gr.Warning(message=i18n("warn_invalid_spk2_prompt_text"))
180
+ return None
181
+ # Check dialogue text
182
+ target_text_list: List[str] = re.findall(r"(\[S[0-9]\][^\[\]]*)", target_text)
183
+ target_text_list = [text.strip() for text in target_text_list]
184
+ if not check_dialogue_text(target_text_list):
185
+ gr.Warning(message=i18n("warn_invalid_dialogue_text"))
186
+ return None
187
+
188
+ # Go synthesis
189
+ progress_bar = gr.Progress(track_tqdm=True)
190
+ prompt_wav_list = (
191
+ None if voice_mode != 0 else [spk1_prompt_audio, spk2_prompt_audio]
192
+ )
193
+ prompt_text_list = None if voice_mode != 0 else [spk1_prompt_text, spk2_prompt_text]
194
+ target_audio = model.generate_dialogue(
195
+ text_list=target_text_list,
196
+ prompt_wav_list=prompt_wav_list,
197
+ prompt_text_list=prompt_text_list,
198
+ temperature=0.9,
199
+ topk=30,
200
+ )
201
+ return (24000, target_audio.squeeze(0).numpy())
202
+
203
+
204
+ # UI rendering
205
+ def render_interface() -> gr.Blocks:
206
+ with gr.Blocks(title="FireRedTTS-2", theme=gr.themes.Default()) as page:
207
+ # ======================== UI ========================
208
+ # A large title
209
+ title_desc = gr.Markdown(value="# {}".format(i18n("title_md_desc")))
210
+ with gr.Row():
211
+ lang_choice = gr.Radio(
212
+ choices=["中文", "English"],
213
+ value="中文",
214
+ label="Display Language/显示语言",
215
+ type="index",
216
+ interactive=True,
217
+ )
218
+ voice_mode_choice = gr.Radio(
219
+ choices=[i18n("voice_model_choice1"), i18n("voice_model_choice2")],
220
+ value=i18n("voice_model_choice1"),
221
+ label=i18n("voice_mode_label"),
222
+ type="index",
223
+ interactive=True,
224
+ )
225
+ with gr.Row():
226
+ # ==== Speaker1 Prompt ====
227
+ with gr.Column(scale=1):
228
+ with gr.Group(visible=True) as spk1_prompt_group:
229
+ spk1_prompt_audio = gr.Audio(
230
+ label=i18n("spk1_prompt_audio_label"),
231
+ type="filepath",
232
+ editable=False,
233
+ interactive=True,
234
+ ) # Audio component returns tmp audio path
235
+ spk1_prompt_text = gr.Textbox(
236
+ label=i18n("spk1_prompt_text_label"),
237
+ placeholder=i18n("spk1_prompt_text_placeholder"),
238
+ lines=3,
239
+ )
240
+ # ==== Speaker2 Prompt ====
241
+ with gr.Column(scale=1):
242
+ with gr.Group(visible=True) as spk2_prompt_group:
243
+ spk2_prompt_audio = gr.Audio(
244
+ label=i18n("spk2_prompt_audio_label"),
245
+ type="filepath",
246
+ editable=False,
247
+ interactive=True,
248
+ )
249
+ spk2_prompt_text = gr.Textbox(
250
+ label=i18n("spk2_prompt_text_label"),
251
+ placeholder=i18n("spk2_prompt_text_placeholder"),
252
+ lines=3,
253
+ )
254
+ # ==== Text input ====
255
+ with gr.Column(scale=2):
256
+ dialogue_text_input = gr.Textbox(
257
+ label=i18n("dialogue_text_input_label"),
258
+ placeholder=i18n("dialogue_text_input_placeholder"),
259
+ lines=18,
260
+ )
261
+ # Generate button
262
+ generate_btn = gr.Button(
263
+ value=i18n("generate_btn_label"), variant="primary", size="lg"
264
+ )
265
+ # Long output audio
266
+ generate_audio = gr.Audio(
267
+ label=i18n("generated_audio_label"),
268
+ interactive=False,
269
+ )
270
+
271
+ # ======================== Action ========================
272
+ # Language action
273
+ def _change_component_language(lang):
274
+ global global_lang
275
+ global_lang = ["zh", "en"][lang]
276
+ return [
277
+ # title_desc
278
+ gr.update(value="# {}".format(i18n("title_md_desc"))),
279
+ # voice_mode_choice
280
+ gr.update(
281
+ choices=[i18n("voice_model_choice1"), i18n("voice_model_choice2")],
282
+ value=i18n("voice_model_choice1"),
283
+ label=i18n("voice_mode_label"),
284
+ ),
285
+ # spk1_prompt_{audio,text}
286
+ gr.update(label=i18n("spk1_prompt_audio_label")),
287
+ gr.update(
288
+ label=i18n("spk1_prompt_text_label"),
289
+ placeholder=i18n("spk1_prompt_text_placeholder"),
290
+ ),
291
+ # spk2_prompt_{audio,text}
292
+ gr.update(label=i18n("spk2_prompt_audio_label")),
293
+ gr.update(
294
+ label=i18n("spk2_prompt_text_label"),
295
+ placeholder=i18n("spk2_prompt_text_placeholder"),
296
+ ),
297
+ # dialogue_text_input
298
+ gr.update(
299
+ label=i18n("dialogue_text_input_label"),
300
+ placeholder=i18n("dialogue_text_input_placeholder"),
301
+ ),
302
+ # generate_btn
303
+ gr.update(value=i18n("generate_btn_label")),
304
+ # generate_audio
305
+ gr.update(label=i18n("generated_audio_label")),
306
+ ]
307
+
308
+ lang_choice.change(
309
+ fn=_change_component_language,
310
+ inputs=[lang_choice],
311
+ outputs=[
312
+ title_desc,
313
+ voice_mode_choice,
314
+ spk1_prompt_audio,
315
+ spk1_prompt_text,
316
+ spk2_prompt_audio,
317
+ spk2_prompt_text,
318
+ dialogue_text_input,
319
+ generate_btn,
320
+ generate_audio,
321
+ ],
322
+ )
323
+
324
+ # Voice clone mode action
325
+ def _change_prompt_input_visibility(voice_mode):
326
+ enable = voice_mode == 0
327
+ return [gr.update(visible=enable), gr.update(visible=enable)]
328
+
329
+ voice_mode_choice.change(
330
+ fn=_change_prompt_input_visibility,
331
+ inputs=[voice_mode_choice],
332
+ outputs=[spk1_prompt_group, spk2_prompt_group],
333
+ )
334
+ generate_btn.click(
335
+ fn=dialogue_synthesis_function,
336
+ inputs=[
337
+ dialogue_text_input,
338
+ voice_mode_choice,
339
+ spk1_prompt_text,
340
+ spk1_prompt_audio,
341
+ spk2_prompt_text,
342
+ spk2_prompt_audio,
343
+ ],
344
+ outputs=[generate_audio],
345
+ )
346
+ return page
347
+
348
+
349
+ if __name__ == "__main__":
350
+ # Download model
351
+ snapshot_download(repo_id='FireRedTeam/FireRedTTS2', local_dir='pretrained_models/FireRedTTS2')
352
+ # Initiate model
353
+ initiate_model('pretrained_models/FireRedTTS2')
354
+ # UI
355
+ page = render_interface()
356
+ page.queue()
357
+ page.launch()
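Not part of the committed files — a short sketch of how app.py parses the dialogue input (the example sentence is made up; the regex and check_dialogue_text are the ones defined above):

import re
text = "[S1]Hello there.[S2]Hi, how are you?[S1]Doing well."
chunks = [c.strip() for c in re.findall(r"(\[S[0-9]\][^\[\]]*)", text)]
# chunks == ['[S1]Hello there.', '[S2]Hi, how are you?', '[S1]Doing well.']
assert check_dialogue_text(chunks)  # every chunk starts with an [S1]-[S4] tag and carries non-empty text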
fireredtts2/__init__.py ADDED
File without changes
fireredtts2/codec/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from fireredtts2.codec.model import RedCodecInfer
fireredtts2/codec/audio.py ADDED
@@ -0,0 +1,148 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team and the librosa & torchaudio authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks
17
+ and remove unnecessary dependencies.
18
+ """
19
+ import warnings
20
+ import numpy as np
21
+ from typing import Union, Optional
22
+
23
+
24
+ def hertz_to_mel(
25
+ freq: Union[float, np.ndarray], mel_scale: str = "htk"
26
+ ) -> Union[float, np.ndarray]:
27
+ if mel_scale not in ["slaney", "htk", "kaldi"]:
28
+ raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')
29
+
30
+ if mel_scale == "htk":
31
+ return 2595.0 * np.log10(1.0 + (freq / 700.0))
32
+ elif mel_scale == "kaldi":
33
+ return 1127.0 * np.log(1.0 + (freq / 700.0))
34
+
35
+ min_log_hertz = 1000.0
36
+ min_log_mel = 15.0
37
+ logstep = 27.0 / np.log(6.4)
38
+ mels = 3.0 * freq / 200.0
39
+
40
+ if isinstance(freq, np.ndarray):
41
+ log_region = freq >= min_log_hertz
42
+ mels[log_region] = (
43
+ min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep
44
+ )
45
+ elif freq >= min_log_hertz:
46
+ mels = min_log_mel + np.log(freq / min_log_hertz) * logstep
47
+
48
+ return mels
49
+
50
+
51
+ def mel_to_hertz(
52
+ mels: Union[float, np.ndarray], mel_scale: str = "htk"
53
+ ) -> Union[float, np.ndarray]:
54
+ if mel_scale not in ["slaney", "htk", "kaldi"]:
55
+ raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')
56
+
57
+ if mel_scale == "htk":
58
+ return 700.0 * (np.power(10, mels / 2595.0) - 1.0)
59
+ elif mel_scale == "kaldi":
60
+ return 700.0 * (np.exp(mels / 1127.0) - 1.0)
61
+
62
+ min_log_hertz = 1000.0
63
+ min_log_mel = 15.0
64
+ logstep = np.log(6.4) / 27.0
65
+ freq = 200.0 * mels / 3.0
66
+
67
+ if isinstance(mels, np.ndarray):
68
+ log_region = mels >= min_log_mel
69
+ freq[log_region] = min_log_hertz * np.exp(
70
+ logstep * (mels[log_region] - min_log_mel)
71
+ )
72
+ elif mels >= min_log_mel:
73
+ freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel))
74
+
75
+ return freq
76
+
77
+
78
+ def _create_triangular_filter_bank(
79
+ fft_freqs: np.ndarray, filter_freqs: np.ndarray
80
+ ) -> np.ndarray:
81
+ """
82
+ Creates a triangular filter bank.
83
+
84
+ Adapted from *torchaudio* and *librosa*.
85
+
86
+ Args:
87
+ fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`):
88
+ Discrete frequencies of the FFT bins in Hz.
89
+ filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`):
90
+ Center frequencies of the triangular filters to create, in Hz.
91
+
92
+ Returns:
93
+ `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)`
94
+ """
95
+ filter_diff = np.diff(filter_freqs)
96
+ slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1)
97
+ down_slopes = -slopes[:, :-2] / filter_diff[:-1]
98
+ up_slopes = slopes[:, 2:] / filter_diff[1:]
99
+ return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes))
100
+
101
+
102
+ def mel_filter_bank(
103
+ num_frequency_bins: int,
104
+ num_mel_filters: int,
105
+ min_frequency: float,
106
+ max_frequency: float,
107
+ sampling_rate: int,
108
+ norm: Optional[str] = None,
109
+ mel_scale: str = "htk",
110
+ triangularize_in_mel_space: bool = False,
111
+ ) -> np.ndarray:
112
+ if norm is not None and norm != "slaney":
113
+ raise ValueError('norm must be one of None or "slaney"')
114
+
115
+ # center points of the triangular mel filters
116
+ mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale)
117
+ mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale)
118
+ mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2)
119
+ filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale)
120
+
121
+ if triangularize_in_mel_space:
122
+ # frequencies of FFT bins in Hz, but filters triangularized in mel space
123
+ fft_bin_width = sampling_rate / (num_frequency_bins * 2)
124
+ fft_freqs = hertz_to_mel(
125
+ fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale
126
+ )
127
+ filter_freqs = mel_freqs
128
+ else:
129
+ # frequencies of FFT bins in Hz
130
+ fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins)
131
+
132
+ mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs)
133
+
134
+ if norm is not None and norm == "slaney":
135
+ # Slaney-style mel is scaled to be approx constant energy per channel
136
+ enorm = 2.0 / (
137
+ filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters]
138
+ )
139
+ mel_filters *= np.expand_dims(enorm, 0)
140
+
141
+ if (mel_filters.max(axis=0) == 0.0).any():
142
+ warnings.warn(
143
+ "At least one mel filter has all zero values. "
144
+ f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. "
145
+ f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low."
146
+ )
147
+
148
+ return mel_filters
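Not part of the committed files — a sketch of calling mel_filter_bank; the n_fft, band count, and frequency range are assumed values, not taken from this repo:

filters = mel_filter_bank(
    num_frequency_bins=201,   # n_fft // 2 + 1 for an assumed n_fft of 400
    num_mel_filters=80,
    min_frequency=0.0,
    max_frequency=8000.0,
    sampling_rate=16000,
    norm="slaney",
    mel_scale="slaney",
)
print(filters.shape)  # (201, 80): projects a magnitude spectrogram of shape (frames, 201) onto 80 mel bands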
fireredtts2/codec/decoder.py ADDED
@@ -0,0 +1,700 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from fireredtts2.codec.whisper import WhisperEncoderLayer
5
+ from fireredtts2.codec.utils import make_nonpad_mask, make_block_causal_mask
6
+
7
+
8
+ class ResnetBlock(nn.Module):
9
+ def __init__(
10
+ self,
11
+ in_channels: int,
12
+ out_channels: int = None,
13
+ conv_shortcut: bool = False,
14
+ dropout: float = 0.0,
15
+ ):
16
+ super().__init__()
17
+ self.in_channels = in_channels
18
+ out_channels = in_channels if out_channels is None else out_channels
19
+ self.out_channels = out_channels
20
+ self.use_conv_shortcut = conv_shortcut
21
+
22
+ self.block1 = nn.Sequential(
23
+ nn.GroupNorm(
24
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
25
+ ),
26
+ nn.SiLU(),
27
+ nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
28
+ )
29
+
30
+ self.block2 = nn.Sequential(
31
+ nn.GroupNorm(
32
+ num_groups=32, num_channels=out_channels, eps=1e-6, affine=True
33
+ ),
34
+ nn.SiLU(),
35
+ nn.Dropout(dropout),
36
+ nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
37
+ )
38
+
39
+ if self.in_channels != self.out_channels:
40
+ if self.use_conv_shortcut:
41
+ self.conv_shortcut = torch.nn.Conv1d(
42
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
43
+ )
44
+ else:
45
+ self.nin_shortcut = torch.nn.Conv1d(
46
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0
47
+ )
48
+
49
+ def forward(self, x: torch.Tensor):
50
+ """
51
+ Args:
52
+ x: shape (b, c, t)
53
+ """
54
+ h = x
55
+ h = self.block1(h)
56
+ h = self.block2(h)
57
+
58
+ if self.in_channels != self.out_channels:
59
+ if self.use_conv_shortcut:
60
+ x = self.conv_shortcut(x)
61
+ else:
62
+ x = self.nin_shortcut(x)
63
+ return x + h
64
+
65
+
66
+ class Transpose(torch.nn.Module):
67
+ def __init__(self, dim0: int, dim1: int):
68
+ super().__init__()
69
+ self.dim0 = dim0
70
+ self.dim1 = dim1
71
+
72
+ def forward(self, x: torch.Tensor):
73
+ x = torch.transpose(x, self.dim0, self.dim1)
74
+ return x
75
+
76
+
77
+ # A causal variant of Conv1d
78
+ class CausalConv1d(torch.nn.Conv1d):
79
+ def __init__(
80
+ self,
81
+ in_channels: int,
82
+ out_channels: int,
83
+ kernel_size: int,
84
+ ) -> None:
85
+ super(CausalConv1d, self).__init__(in_channels, out_channels, kernel_size)
86
+ self.causal_padding = (kernel_size - 1, 0)
87
+
88
+ def forward(self, x: torch.Tensor):
89
+ x = F.pad(x, self.causal_padding)
90
+ x = super(CausalConv1d, self).forward(x)
91
+ return x
92
+
93
+ def forward_chunk(self, x: torch.Tensor, cnn_cache: torch.Tensor = None):
94
+ if cnn_cache is None:
95
+ cnn_cache = x.new_zeros(
96
+ (x.shape[0], self.in_channels, self.causal_padding[0])
97
+ )
98
+ x = torch.cat([cnn_cache, x], dim=2)
99
+ new_cnn_cache = x[..., -self.causal_padding[0] :]
100
+ x = super(CausalConv1d, self).forward(x)
101
+ return x, new_cnn_cache
102
+
103
+
104
+ # A causal variant of ResnetBlock
105
+ class CausalResnetBlock(nn.Module):
106
+ def __init__(
107
+ self,
108
+ in_channels: int,
109
+ out_channels: int = None,
110
+ dropout: float = 0.0,
111
+ ):
112
+ super().__init__()
113
+ self.in_channels = in_channels
114
+ out_channels = in_channels if out_channels is None else out_channels
115
+ self.out_channels = out_channels
116
+
117
+ self.block1 = nn.Sequential(
118
+ Transpose(1, 2),
119
+ nn.LayerNorm(in_channels),
120
+ Transpose(1, 2),
121
+ nn.SiLU(),
122
+ CausalConv1d(in_channels, out_channels, kernel_size=3),
123
+ )
124
+
125
+ self.block2 = nn.Sequential(
126
+ Transpose(1, 2),
127
+ nn.LayerNorm(out_channels),
128
+ Transpose(1, 2),
129
+ nn.SiLU(),
130
+ nn.Dropout(dropout),
131
+ CausalConv1d(out_channels, out_channels, kernel_size=3),
132
+ )
133
+ if self.in_channels != self.out_channels:
134
+ self.nin_shortcut = torch.nn.Conv1d(
135
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0
136
+ )
137
+
138
+ def forward(self, x: torch.Tensor):
139
+ """
140
+ Args:
141
+ x: shape (b, c, t)
142
+ """
143
+ h = x
144
+ h = self.block1(h)
145
+ h = self.block2(h)
146
+ if self.in_channels != self.out_channels:
147
+ x = self.nin_shortcut(x)
148
+ return x + h
149
+
150
+ def forward_chunk(self, x: torch.Tensor, cache: torch.Tensor = None):
151
+ """
152
+ Args:
153
+ x: shape (b, c, t)
154
+ cache: shape (b, c_in+c_out, t=2)
155
+ """
156
+ cache1, cache2 = (
157
+ (None, None)
158
+ if cache is None
159
+ else cache.split((self.in_channels, self.out_channels), dim=1)
160
+ )
161
+ h = x
162
+ # block1
163
+ h = self.block1[:4](h)
164
+ h, new_cache1 = self.block1[4].forward_chunk(h, cache1)
165
+ # block2
166
+ h = self.block2[:5](h)
167
+ h, new_cache2 = self.block2[5].forward_chunk(h, cache2)
168
+ if self.in_channels != self.out_channels:
169
+ x = self.nin_shortcut(x)
170
+ new_cache = torch.cat([new_cache1, new_cache2], dim=1)
171
+ return x + h, new_cache
172
+
173
+
174
+ # Nonstreaming Vocos backbone based on Transformer layers
175
+ class VocosBackbone(nn.Module):
176
+ def __init__(
177
+ self,
178
+ embed_dim: int = 1024,
179
+ num_layers: int = 12,
180
+ num_heads: int = 16,
181
+ dropout: float = 0.1,
182
+ ):
183
+ super().__init__()
184
+ self.in_proj = nn.Conv1d(embed_dim, embed_dim, kernel_size=7, padding=3)
185
+ self.prior_net = nn.Sequential(
186
+ ResnetBlock(embed_dim, embed_dim, dropout=dropout),
187
+ ResnetBlock(embed_dim, embed_dim, dropout=dropout),
188
+ )
189
+ self.transformers = nn.ModuleList(
190
+ [WhisperEncoderLayer(embed_dim, num_heads) for _ in range(num_layers)]
191
+ )
192
+ self.post_net = nn.Sequential(
193
+ ResnetBlock(embed_dim, embed_dim, dropout=dropout),
194
+ ResnetBlock(embed_dim, embed_dim, dropout=dropout),
195
+ )
196
+ self.final_norm = nn.LayerNorm(embed_dim, eps=1e-6)
197
+
198
+ def forward(
199
+ self,
200
+ x: torch.Tensor,
201
+ x_lens: torch.Tensor,
202
+ ):
203
+ """
204
+ Args:
205
+ x: shape (b, t, c)
206
+ x_lens: shape (b,)
207
+ """
208
+ x = x.transpose(1, 2)
209
+ x = self.in_proj(x)
210
+ x = self.prior_net(x)
211
+ x = x.transpose(1, 2)
212
+
213
+ attention_mask = make_nonpad_mask(x_lens).unsqueeze(1) # (b, 1, t)
214
+ # NOTE(sfy): I think positional embedding is unnecessary
215
+ for layer in self.transformers:
216
+ x = layer(x, attention_mask)
217
+ x = x.transpose(1, 2)
218
+ x = self.post_net(x)
219
+ x = x.transpose(1, 2)
220
+ x = self.final_norm(x)
221
+ return x
222
+
223
+
224
+ # Streaming Vocos backbone based on Transformer layers
225
+ class CausalVocosBackbone(nn.Module):
226
+ def __init__(
227
+ self,
228
+ embed_dim: int = 1024,
229
+ num_layers: int = 12,
230
+ num_heads: int = 16,
231
+ dropout: float = 0.1,
232
+ ):
233
+ super().__init__()
234
+ self.in_proj = CausalConv1d(embed_dim, embed_dim, kernel_size=7)
235
+ self.prior_net = nn.Sequential(
236
+ CausalResnetBlock(embed_dim, embed_dim, dropout=dropout),
237
+ CausalResnetBlock(embed_dim, embed_dim, dropout=dropout),
238
+ )
239
+ self.transformers = nn.ModuleList(
240
+ [WhisperEncoderLayer(embed_dim, num_heads) for _ in range(num_layers)]
241
+ )
242
+ self.post_net = nn.Sequential(
243
+ CausalResnetBlock(embed_dim, embed_dim, dropout=dropout),
244
+ CausalResnetBlock(embed_dim, embed_dim, dropout=dropout),
245
+ )
246
+ self.final_norm = nn.LayerNorm(embed_dim, eps=1e-6)
247
+
248
+ def forward(
249
+ self,
250
+ x: torch.Tensor,
251
+ x_lens: torch.Tensor,
252
+ ):
253
+ """
254
+ Args:
255
+ x: shape (b, t, c)
256
+ x_lens: shape (b,)
257
+ """
258
+ x = x.transpose(1, 2)
259
+ x = self.in_proj(x)
260
+ x = self.prior_net(x)
261
+ x = x.transpose(1, 2)
262
+
263
+ # NOTE(sfy): There is no padding during training, so SDPA attention is safe (no NaN).
264
+ # Also, 1 token(12.5Hz) -> 4 latents(50Hz) -> 8 latents(100Hz),
265
+ # so we design an 8-block causal attention mask instead of a fully causal one to improve performance
266
+ attention_mask = make_block_causal_mask(x_lens, chunk_size=8)
267
+ for layer in self.transformers:
268
+ x = layer(x, attention_mask)
269
+
270
+ x = x.transpose(1, 2)
271
+ x = self.post_net(x)
272
+ x = x.transpose(1, 2)
273
+ x = self.final_norm(x)
274
+ return x
275
+
276
+ def forward_chunk(
277
+ self,
278
+ x: torch.Tensor,
279
+ conv_cache1: torch.Tensor = None,
280
+ conv_cache2: torch.Tensor = None,
281
+ kv_cache: torch.Tensor = None,
282
+ ):
283
+ # Unpack cache
284
+ cache1 = conv_cache1
285
+ cache2, cache3, cache4, cache5 = (
286
+ (None, None, None, None)
287
+ if conv_cache2 is None
288
+ else conv_cache2.chunk(4, dim=1)
289
+ )
290
+
291
+ # cache1: shape (b, c=embed_dim, t=6)
292
+ x = x.transpose(1, 2)
293
+ x, new_cache1 = self.in_proj.forward_chunk(x, cache1)
294
+ # cache2: shape (b, c=embed_dim*2, t=2)
295
+ x, new_cache2 = self.prior_net[0].forward_chunk(x, cache2)
296
+ # cache3: shape (b, c=embed_dim*2, t=2)
297
+ x, new_cache3 = self.prior_net[1].forward_chunk(x, cache3)
298
+ x = x.transpose(1, 2)
299
+
300
+ # k,v-cache: shape (b, nlayer, nh, t, c*2)
301
+ new_kv_cache = []
302
+ for idx, layer in enumerate(self.transformers):
303
+ kv_cache_i = None if kv_cache is None else kv_cache[:, idx]
304
+ x, new_kv_cache_i = layer.forward_chunk(x, kv_cache=kv_cache_i)
305
+ new_kv_cache.append(new_kv_cache_i)
306
+ new_kv_cache = torch.stack(new_kv_cache, dim=1)
307
+
308
+ x = x.transpose(1, 2)
309
+ # cache4: shape (b, c=embed_dim*2, t=2)
310
+ x, new_cache4 = self.post_net[0].forward_chunk(x, cache4)
311
+ # cache5: shape (b, c=embed_dim*2, t=2)
312
+ x, new_cache5 = self.post_net[1].forward_chunk(x, cache5)
313
+ x = x.transpose(1, 2)
314
+ x = self.final_norm(x)
315
+
316
+ new_conv_cache1 = new_cache1
317
+ new_conv_cache2 = torch.cat(
318
+ [new_cache2, new_cache3, new_cache4, new_cache5], dim=1
319
+ )
320
+ return x, new_conv_cache1, new_conv_cache2, new_kv_cache
321
+
322
+
323
+ class ISTFT(nn.Module):
324
+ """
325
+ Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
326
+ windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
327
+ See issue: https://github.com/pytorch/pytorch/issues/62323
328
+ Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
329
+ The NOLA constraint is met as we trim padded samples anyway.
330
+
331
+ Args:
332
+ n_fft (int): Size of Fourier transform.
333
+ hop_length (int): The distance between neighboring sliding window frames.
334
+ win_length (int): The size of window frame and STFT filter.
335
+ padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
336
+ """
337
+
338
+ def __init__(
339
+ self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"
340
+ ):
341
+ super().__init__()
342
+ assert padding in ["center", "same"], "Padding must be 'center' or 'same'."
343
+ self.padding = padding
344
+ self.n_fft = n_fft
345
+ self.hop_length = hop_length
346
+ self.win_length = win_length
347
+ window = torch.hann_window(win_length)
348
+ self.register_buffer("window", window)
349
+
350
+ def forward(self, spec: torch.Tensor) -> torch.Tensor:
351
+ """
352
+ Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
353
+
354
+ Args:
355
+ spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
356
+ N is the number of frequency bins, and T is the number of time frames.
357
+
358
+ Returns:
359
+ Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
360
+ """
361
+ if self.padding == "center":
362
+ # Fallback to pytorch native implementation
363
+ return torch.istft(
364
+ spec,
365
+ self.n_fft,
366
+ self.hop_length,
367
+ self.win_length,
368
+ self.window,
369
+ center=True,
370
+ )
371
+ elif self.padding == "same":
372
+ pad = (self.win_length - self.hop_length) // 2
373
+ else:
374
+ raise ValueError("Padding must be 'center' or 'same'.")
375
+
376
+ assert spec.dim() == 3, "Expected a 3D tensor as input"
377
+ B, N, T = spec.shape
378
+
379
+ # Inverse FFT
380
+ ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
381
+ ifft = ifft * self.window[None, :, None]
382
+
383
+ # Overlap and Add
384
+ output_size = (T - 1) * self.hop_length + self.win_length
385
+ y = torch.nn.functional.fold(
386
+ ifft,
387
+ output_size=(1, output_size),
388
+ kernel_size=(1, self.win_length),
389
+ stride=(1, self.hop_length),
390
+ )[:, 0, 0, pad:-pad]
391
+
392
+ # Window envelope
393
+ window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
394
+ window_envelope = torch.nn.functional.fold(
395
+ window_sq,
396
+ output_size=(1, output_size),
397
+ kernel_size=(1, self.win_length),
398
+ stride=(1, self.hop_length),
399
+ ).squeeze()[pad:-pad]
400
+
401
+ # Normalize
402
+ assert (window_envelope > 1e-11).all()
403
+ y = y / window_envelope
404
+
405
+ return y
406
+
407
+ def forward_chunk(
408
+ self, spec: torch.Tensor, cache: torch.Tensor = None, last_chunk: bool = False
409
+ ):
410
+ """Forward one chunk of spectrogram frames.
411
+
412
+ Args:
413
+ spec: shape (B, N, T=chunk_size)
414
+ cache: previous chunk's last ifft frame, shape (B, N, T=3)
415
+ last_chunk: if last_chunk, will not trim the last (win-hop) segment
416
+ Returns:
417
+ y: shape (B, T=effective_length)
418
+ """
419
+ assert self.padding == "same", "Padding must be same."
420
+ assert (
421
+ self.win_length % self.hop_length == 0
422
+ ), f"{self.win_length} {self.hop_length}"
423
+ pad = (self.win_length - self.hop_length) // 2
424
+
425
+ # Inverse FFT
426
+ ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
427
+ ifft = ifft * self.window[None, :, None] # (B, N, T=chunk_size)
428
+
429
+ # Append previous cache
430
+ if cache is not None:
431
+ ifft = torch.cat([cache, ifft], dim=-1)
432
+ new_cache_t = self.win_length // self.hop_length - 1
433
+ new_cache = ifft[..., -new_cache_t:]
434
+
435
+ # Overlap and Add
436
+ output_size = (ifft.shape[-1] - 1) * self.hop_length + self.win_length
437
+ y = torch.nn.functional.fold(
438
+ ifft,
439
+ output_size=(1, output_size),
440
+ kernel_size=(1, self.win_length),
441
+ stride=(1, self.hop_length),
442
+ )[:, 0, 0, :]
443
+
444
+ # Window envelope
445
+ window_sq = (
446
+ self.window.square().expand(1, ifft.shape[-1], -1).transpose(1, 2)
447
+ ) # (B=1, N, T)
448
+ window_envelope = torch.nn.functional.fold(
449
+ window_sq,
450
+ output_size=(1, output_size),
451
+ kernel_size=(1, self.win_length),
452
+ stride=(1, self.hop_length),
453
+ ).squeeze()
454
+
455
+ # Normalize
456
+ # assert (window_envelope > 1e-11).all()
457
+ y = y / window_envelope
458
+
459
+ # Only take effective part
460
+ if cache is None:
461
+ y = y[:, pad:]
462
+ else:
463
+ y = y[:, (self.win_length - self.hop_length) :]
464
+ if last_chunk:
465
+ y = y[:, :-pad]
466
+ else:
467
+ y = y[:, : -(self.win_length - self.hop_length)]
468
+ return y, new_cache
469
+
470
+
471
+ class ISTFTHead(nn.Module):
472
+ """
473
+ ISTFT Head module for predicting STFT complex coefficients.
474
+
475
+ Args:
476
+ dim (int): Hidden dimension of the model.
477
+ n_fft (int): Size of Fourier transform.
478
+ hop_length (int): The distance between neighboring sliding window frames, which should align with
479
+ the resolution of the input features.
480
+ padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
481
+ """
482
+
483
+ def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
484
+ super().__init__()
485
+ self.hop_length = hop_length
486
+ out_dim = n_fft + 2
487
+ self.out = torch.nn.Linear(dim, out_dim)
488
+ self.istft = ISTFT(
489
+ n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding
490
+ )
491
+
492
+ def forward(self, x: torch.Tensor, x_len: torch.Tensor) -> torch.Tensor:
493
+ """
494
+ Forward pass of the ISTFTHead module.
495
+
496
+ Args:
497
+ x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
498
+ L is the sequence length, and H denotes the model dimension.
499
+
500
+ Returns:
501
+ Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
502
+ """
503
+ x_pred = self.out(x)
504
+ x_pred = x_pred.transpose(1, 2)
505
+ mag, p = x_pred.chunk(2, dim=1)
506
+ mag = torch.exp(mag)
507
+ mag = torch.clip(
508
+ mag, max=1e2
509
+ ) # safeguard to prevent excessively large magnitudes
510
+ # wrapping happens here. These two lines produce real and imaginary value
511
+ x = torch.cos(p)
512
+ y = torch.sin(p)
513
+ # recalculating phase here does not produce anything new
514
+ # only costs time
515
+ # phase = torch.atan2(y, x)
516
+ # S = mag * torch.exp(phase * 1j)
517
+ # better directly produce the complex value
518
+ S = mag * (x + 1j * y)
519
+ audio = self.istft(S)
520
+ audio_length = x_len * self.hop_length
521
+ return audio, audio_length
522
+
523
+ def forward_chunk(
524
+ self, x: torch.Tensor, cache: torch.Tensor = None, last_chunk: bool = False
525
+ ):
526
+ """ISTFTHead can be adapted to streaming inference without retraining.
527
+
528
+ Args:
529
+ x: shape (B, T, C)
530
+ cache: shape (B, N, T=3), istft cache
531
+ Returns:
532
+ audio: shape (B, t)
533
+ """
534
+ x_pred = self.out(x)
535
+ x_pred = x_pred.transpose(1, 2)
536
+ mag, p = x_pred.chunk(2, dim=1)
537
+ mag = torch.exp(mag) # (B, C, T)
538
+ mag = torch.clip(
539
+ mag, max=1e2
540
+ ) # safeguard to prevent excessively large magnitudes
541
+ # wrapping happens here. These two lines produce real and imaginary value
542
+ x = torch.cos(p)
543
+ y = torch.sin(p)
544
+ S = mag * (x + 1j * y) # (B, C, T)
545
+ audio, new_cache = self.istft.forward_chunk(S, cache, last_chunk)
546
+ return audio, new_cache
547
+
548
+
549
+ # UpsampleConv(50->100Hz) + VocosBackbone + ISTFTHead
550
+ class AcousticDecoder(nn.Module):
551
+ def __init__(
552
+ self,
553
+ # Transformer
554
+ embed_dim: int,
555
+ num_layers: int,
556
+ num_heads: int,
557
+ dropout: float = 0.0,
558
+ # iSTFT
559
+ hop_length: int = 240,
560
+ # Causal
561
+ causal: bool = False,
562
+ ):
563
+ super().__init__()
564
+ self.embed_dim = embed_dim
565
+ self.num_layers = num_layers
566
+ self.num_heads = num_heads
567
+ self.hop_length = hop_length
568
+ self.causal = causal
569
+
570
+ # Output upsample
571
+ self.upsample_conv = nn.Sequential(
572
+ nn.ConvTranspose1d(
573
+ embed_dim,
574
+ embed_dim,
575
+ kernel_size=3,
576
+ stride=2,
577
+ padding=0,  # no padding on the input side
578
+ output_padding=0, # Can be adjusted to precisely control length
579
+ ),
580
+ nn.GELU(),
581
+ nn.ConvTranspose1d(
582
+ embed_dim,
583
+ embed_dim,
584
+ kernel_size=3,
585
+ stride=1,
586
+ padding=0,  # no padding on the input side
587
+ ),
588
+ nn.GELU(),
589
+ )
590
+ self.backbone = (
591
+ CausalVocosBackbone(embed_dim, num_layers, num_heads, dropout)
592
+ if causal
593
+ else VocosBackbone(embed_dim, num_layers, num_heads, dropout)
594
+ )
595
+ self.isift = ISTFTHead(embed_dim, hop_length * 4, hop_length, padding="same")
596
+ # Init weights
597
+ self.apply(self._init_weights)
598
+
599
+ def _init_weights(self, m):
600
+ if isinstance(m, nn.Conv1d):
601
+ nn.init.trunc_normal_(m.weight, std=0.02)
602
+ nn.init.constant_(m.bias, 0)
603
+
604
+ def forward(self, x: torch.Tensor, x_lens: torch.Tensor):
605
+ """
606
+ Args:
607
+ x: shape (b, t, c)
608
+ x_lens: shape (b,)
609
+ """
610
+ # Upsample
611
+ target_length = x.shape[1] * 2
612
+ x = x.transpose(1, 2)
613
+ x = self.upsample_conv(x)
614
+ x = x.transpose(1, 2)
615
+ # NOTE strict upsampling, trim the last 3 elements
616
+ x = x[:, :target_length]
617
+ x_lens = x_lens * 2
618
+ # Backbone
619
+ x = self.backbone(x, x_lens)
620
+ # iSTFT
621
+ y, y_lens = self.isift(x, x_lens)
622
+ return y, y_lens
623
+
624
+ def forward_upsample_conv_chunk(self, x: torch.Tensor, cache: torch.Tensor = None):
625
+ """Stream forward upsample_conv module with previous block cache.
626
+
627
+ Args:
628
+ x: shape (B, C, T)
629
+ cache: shape (B, C, 3): 1 history frame for the 1st conv and 2 for the 2nd conv.
630
+ """
631
+ # Unpack cache
632
+ cache1, cache2 = (
633
+ (None, None) if cache is None else torch.split(cache, [1, 2], dim=2)
634
+ )
635
+ # 1st conv cache
636
+ if cache1 is not None:
637
+ x = torch.cat([cache1, x], dim=2)
638
+ new_cache1 = x[..., -1:]
639
+ # 1st conv
640
+ x = self.upsample_conv[0](x)[..., :-1] # remove 1 extra frame
641
+ if cache1 is not None:
642
+ x = x[..., 2:] # remove cache1 part
643
+ x = self.upsample_conv[1](x)
644
+ # 2nd conv cache
645
+ if cache2 is not None:
646
+ x = torch.cat([cache2, x], dim=2)
647
+ new_cache2 = x[..., -2:]
648
+ # 2nd conv
649
+ x = self.upsample_conv[2](x)[..., :-2] # remove 2 extra frames
650
+ if cache2 is not None:
651
+ x = x[..., 2:] # remove cache2 part
652
+ x = self.upsample_conv[3](x)
653
+
654
+ new_cache = torch.cat([new_cache1, new_cache2], dim=2)
655
+ return x, new_cache
656
+
657
+ def forward_chunk(
658
+ self,
659
+ x: torch.Tensor,
660
+ # Upsample conv cache
661
+ up_conv_cache: torch.Tensor = None,
662
+ # Backbone conv cache
663
+ bb_conv_cache1: torch.Tensor = None,
664
+ bb_conv_cache2: torch.Tensor = None,
665
+ # Backbone attention cache
666
+ bb_kv_cache: torch.Tensor = None,
667
+ # iSTFT cache
668
+ is_cache: torch.Tensor = None,
669
+ last_chunk: bool = False,
670
+ ):
671
+ """
672
+ Args:
673
+ x: input sequence at 50Hz, length should be multiples of 4
674
+ """
675
+ assert (
676
+ self.causal
677
+ ), "Only AcousticDecoder with causal=True supports forward_chunk method."
678
+
679
+ x = x.transpose(1, 2)
680
+ x, new_up_conv_cache = self.forward_upsample_conv_chunk(x, up_conv_cache)
681
+ x = x.transpose(1, 2)
682
+ # Backbone
683
+ x, new_bb_conv_cache1, new_bb_conv_cache2, new_bb_kv_cache = (
684
+ self.backbone.forward_chunk(
685
+ x,
686
+ bb_conv_cache1,
687
+ bb_conv_cache2,
688
+ bb_kv_cache,
689
+ )
690
+ )
691
+ # iSTFT
692
+ y, new_is_cache = self.isift.forward_chunk(x, is_cache, last_chunk)
693
+ return (
694
+ y,
695
+ new_up_conv_cache,
696
+ new_bb_conv_cache1,
697
+ new_bb_conv_cache2,
698
+ new_bb_kv_cache,
699
+ new_is_cache,
700
+ )
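Not part of the committed files — a shape check for the standalone ISTFT module; the values mirror AcousticDecoder's default hop_length=240 with n_fft = 4 * hop_length, and the 100-frame spectrogram is random test input:

import torch
istft = ISTFT(n_fft=960, hop_length=240, win_length=960, padding="same")
spec = torch.randn(1, 481, 100, dtype=torch.complex64)  # (B, n_fft // 2 + 1, T) complex spectrogram
audio = istft(spec)
print(audio.shape)  # torch.Size([1, 24000]); "same" padding yields T * hop_length samples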
fireredtts2/codec/model.py ADDED
@@ -0,0 +1,376 @@
1
+ import math
2
+ import json
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from typing import List, Dict
7
+ from torch.nn.utils.rnn import pad_sequence
8
+
9
+ from fireredtts2.codec.rvq import ResidualVQ
10
+ from fireredtts2.codec.decoder import AcousticDecoder
11
+ from fireredtts2.codec.utils import make_nonpad_mask
12
+ from fireredtts2.codec.whisper import (
13
+ WhisperEncoderLayer,
14
+ PretrainedWhisperEncoder,
15
+ WhisperAcousticEncoder,
16
+ )
17
+
18
+
19
+ class SslAdaptor(nn.Module):
20
+ def __init__(
21
+ self,
22
+ in_dim: int,
23
+ embed_dim: int,
24
+ out_dim: int,
25
+ num_layers: int,
26
+ num_heads: int,
27
+ ffn_dim: int = None,
28
+ attn_dropout: float = 0.0,
29
+ dropout: float = 0.0,
30
+ ):
31
+ super().__init__()
32
+ self.in_dim = in_dim
33
+ self.embed_dim = embed_dim
34
+ self.dropout = dropout
35
+ # Input Projection
36
+ self.in_proj = nn.Linear(in_dim, embed_dim)
37
+ # Transformer
38
+ self.layers = nn.ModuleList(
39
+ [
40
+ WhisperEncoderLayer(
41
+ embed_dim, num_heads, ffn_dim, attn_dropout, dropout
42
+ )
43
+ for _ in range(num_layers)
44
+ ]
45
+ )
46
+ # Output norm
47
+ self.layer_norm = nn.LayerNorm(embed_dim)
48
+ # Output projection
49
+ self.out_proj = nn.Linear(embed_dim, out_dim)
50
+ # Init weight
51
+ self.apply(self._init_weights)
52
+
53
+ def forward(
54
+ self,
55
+ hidden_states: torch.Tensor,
56
+ hidden_length: torch.Tensor,
57
+ ):
58
+ # Downsampling
59
+ hidden_states = self.in_proj(hidden_states)
60
+ # Transformer
61
+ attention_mask = make_nonpad_mask(hidden_length).unsqueeze(1) # (b, 1, t)
62
+ for layer in self.layers:
63
+ hidden_states = layer(hidden_states, attention_mask)
64
+ hidden_states = self.layer_norm(hidden_states)
65
+ hidden_states = self.out_proj(hidden_states)
66
+ return hidden_states, hidden_length
67
+
68
+ def _init_weights(self, module):
69
+ std = 0.02
70
+ if isinstance(module, (nn.Linear, nn.Conv1d)):
71
+ module.weight.data.normal_(mean=0.0, std=std)
72
+ if module.bias is not None:
73
+ module.bias.data.zero_()
74
+ elif isinstance(module, nn.Embedding):
75
+ module.weight.data.normal_(mean=0.0, std=std)
76
+ if module.padding_idx is not None:
77
+ module.weight.data[module.padding_idx].zero_()
78
+
79
+
80
+ class ResidualDownConv(nn.Module):
81
+ def __init__(
82
+ self,
83
+ embed_dim: int = 768,
84
+ avg_pooler=4,
85
+ ):
86
+ super().__init__()
87
+ self.embed_dim = embed_dim
88
+ self.avg_pooler = avg_pooler
89
+ self.intermediate_dim = embed_dim * avg_pooler
90
+ # Convolution layer for downsampling
91
+ self.gate_proj = nn.Conv1d(
92
+ embed_dim, self.intermediate_dim, avg_pooler, avg_pooler, bias=False
93
+ )
94
+ self.up_proj = nn.Conv1d(
95
+ embed_dim, self.intermediate_dim, avg_pooler, avg_pooler, bias=False
96
+ )
97
+ # Downsampled linear projection
98
+ self.down_proj = nn.Linear(
99
+ self.intermediate_dim, self.intermediate_dim, bias=False
100
+ )
101
+ # Activation function and layer normalization
102
+ self.act_fn = nn.SiLU()
103
+ self.layer_norm = nn.LayerNorm(self.intermediate_dim)
104
+ # Final output projection
105
+ self.out_proj = nn.Linear(self.intermediate_dim, embed_dim)
106
+
107
+ def forward(self, x: torch.Tensor, input_length: torch.Tensor):
108
+ output_length = input_length // self.avg_pooler
109
+ batch_size, seq_len, _ = x.shape # (B, T, D)
110
+
111
+ xt = x.permute(0, 2, 1) # (B, D, T)
112
+ g = self.gate_proj(xt).permute(0, 2, 1) # (B, T//4, D*4)
113
+ u = self.up_proj(xt).permute(0, 2, 1) # (B, T//4, D*4)
114
+ x = x.reshape(batch_size, -1, self.intermediate_dim) # (B, T//4, D*4)
115
+
116
+ c = self.down_proj(self.act_fn(g) * u) # (B, T//4, D*4)
117
+ res = self.layer_norm(c + x) # (B, T//4, D*4)
118
+
119
+ res = self.out_proj(res)
120
+ return res, output_length
121
+
122
+
123
+ class UpConv(nn.Module):
124
+ def __init__(
125
+ self,
126
+ embed_dim: int = 768,
127
+ stride: int = 4,
128
+ ):
129
+ super().__init__()
130
+ self.embed_dim = embed_dim
131
+ self.stride = stride
132
+ self.in_proj = nn.Linear(embed_dim, self.stride * embed_dim)
133
+ # Simple transpose convolution layer to keep channel number consistent
134
+ self.up_conv = nn.ConvTranspose1d(
135
+ self.stride * embed_dim,
136
+ embed_dim,
137
+ kernel_size=stride,
138
+ stride=stride,
139
+ bias=False,
140
+ )
141
+
142
+ def forward(self, x: torch.Tensor, input_length: torch.Tensor):
143
+ x = self.in_proj(x)
144
+ x = x.transpose(1, 2)
145
+ res = self.up_conv(x)
146
+ res = res.transpose(1, 2)
147
+ output_length = input_length * self.stride
148
+ return res, output_length
149
+
150
+
151
+ class RedCodec(nn.Module):
152
+ def __init__(
153
+ self,
154
+ ssl: PretrainedWhisperEncoder,
155
+ ssl_adaptor: SslAdaptor,
156
+ acoustic_encoder: WhisperAcousticEncoder,
157
+ downsample: ResidualDownConv,
158
+ rvq: ResidualVQ,
159
+ upsample: UpConv,
160
+ semantic_decoder: SslAdaptor,
161
+ acoustic_decoder: AcousticDecoder,
162
+ ):
163
+ super().__init__()
164
+ self.ssl = ssl
165
+ self.ssl_adaptor = ssl_adaptor
166
+ self.acoustic_encoder = acoustic_encoder
167
+ self.downsample = downsample
168
+ self.rvq = rvq
169
+ self.upsample = upsample
170
+ self.semantic_decoder = semantic_decoder
171
+ self.acoustic_decoder = acoustic_decoder
172
+
173
+ @classmethod
174
+ def from_config(cls, config_json: str) -> "RedCodec":
175
+ with open(config_json, "rb") as f:
176
+ config = json.load(f)["codec"]
177
+ ssl = PretrainedWhisperEncoder.from_pretrained()
178
+ ssl_adaptor = SslAdaptor(**config["ssl_adaptor"])
179
+ acoustic_encoder = WhisperAcousticEncoder(**config["acoustic_encoder"])
180
+ downsample = ResidualDownConv(**config["downsample"])
181
+ rvq = ResidualVQ(**config["rvq"])
182
+ upsample = UpConv(**config["upsample"])
183
+ semantic_decoder = SslAdaptor(**config["semantic_decoder"])
184
+ acoustic_decoder = AcousticDecoder(**config["acoustic_decoder"])
185
+ return cls(
186
+ ssl,
187
+ ssl_adaptor,
188
+ acoustic_encoder,
189
+ downsample,
190
+ rvq,
191
+ upsample,
192
+ semantic_decoder,
193
+ acoustic_decoder,
194
+ )
195
+
196
+
197
+ class RedCodecInfer(RedCodec):
198
+ def __init__(self, codec: RedCodec):
199
+ super().__init__(
200
+ codec.ssl,
201
+ codec.ssl_adaptor,
202
+ codec.acoustic_encoder,
203
+ codec.downsample,
204
+ codec.rvq,
205
+ codec.upsample,
206
+ codec.semantic_decoder,
207
+ codec.acoustic_decoder,
208
+ )
209
+
210
+ @classmethod
211
+ def from_pretrained(cls, conf_path: str, ckpt_path: str) -> "RedCodecInfer":
212
+ with open(conf_path, "r") as f:
213
+ codec = RedCodec.from_config(conf_path)
214
+ ckpt = torch.load(ckpt_path)["generator"]
215
+ codec.load_state_dict(ckpt)
216
+ return cls(codec)
217
+
218
+ def _encode_one_batch(self, audio16k: torch.Tensor):
219
+ B, T = audio16k.shape
220
+ audio16k_length = torch.tensor(
221
+ [T] * B, dtype=torch.long, device=audio16k.device
222
+ )
223
+ # Semantic
224
+ ssl, ssl_length = self.ssl.forward(audio16k, audio16k_length)
225
+ ssl = ssl.clone() # For onnx export
226
+ sem_feats, sem_length = self.ssl_adaptor(ssl, ssl_length)
227
+ # Acoustic
228
+ aco_feats, aco_length = self.acoustic_encoder(audio16k, audio16k_length)
229
+ # VQ
230
+ vq_in_feats = torch.cat([sem_feats, aco_feats], dim=2)
231
+ vq_in_feats, vq_in_length = self.downsample(vq_in_feats, aco_length)
232
+ # RVQ,
233
+ indices = self.rvq.encode_codes(vq_in_feats.transpose(1, 2)) # (nq, B, L)
234
+ indices = indices.permute(1, 0, 2)
235
+ return indices # (B, nq, L)
236
+
237
+ @staticmethod
238
+ def _pad_and_chunk(audio: torch.Tensor, chunk_size: int) -> List[torch.Tensor]:
239
+ pad_len = math.ceil(audio.shape[1] / chunk_size) * chunk_size - audio.shape[1]
240
+ audio = F.pad(audio, (0, pad_len), mode="constant", value=0)
241
+ audio_chunks = audio.split(chunk_size, dim=1)
242
+ return audio_chunks
243
+
244
+ @torch.inference_mode()
245
+ def encode(
246
+ self,
247
+ audio16k: torch.Tensor,
248
+ audio16k_length: torch.Tensor = None,
249
+ batch_size: int = 96,
250
+ ):
251
+ """
252
+ Args:
253
+ audio16k: shape (b, t)
254
+ audio16k_length: (b,)
255
+ Returns:
256
+ token: shape (b, nq, l)
257
+ token_length: (b,)
258
+ """
259
+ if audio16k_length is None:
260
+ assert audio16k.shape[0] == 1
261
+ audio16k_length = torch.tensor(
262
+ [audio16k.shape[1]], dtype=torch.long, device=audio16k.device
263
+ )
264
+
265
+ CHUNK_SIZE = 6 * 16000
266
+ B, T = audio16k.shape
267
+ # Pad, chunk, and batch
268
+ audio16k_batch = []
269
+ batch_size_list = []
270
+ for i in range(B):
271
+ # Remove extra paddings
272
+ one_audio_chunks = self._pad_and_chunk(
273
+ audio16k[i : (i + 1), : audio16k_length[i]], CHUNK_SIZE
274
+ )
275
+ audio16k_batch += one_audio_chunks
276
+ batch_size_list.append(len(one_audio_chunks))
277
+ audio16k_batch = torch.cat(audio16k_batch, dim=0)
278
+ # Batch encode
279
+ token_batch = []
280
+ for i in range(0, audio16k_batch.shape[0], batch_size):
281
+ one_audio_batch = audio16k_batch[i : (i + batch_size)]
282
+ one_token_batch = self._encode_one_batch(one_audio_batch)
283
+ token_batch.append(one_token_batch)
284
+ token_batch = torch.cat(token_batch, dim=0)
285
+ # Recover & concat
286
+ token_list = torch.split(
287
+ token_batch, batch_size_list, dim=0
288
+ ) # [(B=1, nq, l), (B=3, nq, l), ...]
289
+ token_list = [
290
+ torch.cat(token_ts.split(1, dim=0), dim=-1) # (B=1, nq, l)
291
+ for token_ts in token_list
292
+ ]
293
+ # Pad tokens
294
+ token = pad_sequence(
295
+ [ts.squeeze(0).transpose(1, 0) for ts in token_list],
296
+ batch_first=True,
297
+ padding_value=0,
298
+ ).transpose(
299
+ 1, 2
300
+ ) # (B, nq, L)
301
+ token_length = (audio16k_length / 1280).ceil().long()
302
+ token = token[
303
+ ..., : token_length.max()
304
+ ] # Remove extra paddings (we pad to multiples of 6s)
305
+ return token, token_length
306
+
307
+ @torch.inference_mode()
308
+ def decode(self, tokens: torch.Tensor):
309
+ """
310
+ Args:
311
+ tokens: (B=1, nq, L)
312
+ Returns:
313
+ audio: (B=1, t)
314
+ """
315
+ tokens = tokens.permute(1, 0, 2) # (B, nq, L) -> (nq, B, L)
316
+ vq_out_feats = self.rvq.decode_codes(tokens)
317
+ vq_out_feats = vq_out_feats.transpose(1, 2)
318
+ vq_out_length = torch.tensor(
319
+ [vq_out_feats.shape[1]], dtype=torch.long, device=vq_out_feats.device
320
+ )
321
+ vq_out_feats, vq_out_length = self.upsample(vq_out_feats, vq_out_length)
322
+ # audio: (b, t)
323
+ audio, audio_length = self.acoustic_decoder(vq_out_feats, vq_out_length)
324
+ return audio
325
+
326
+ @torch.inference_mode()
327
+ def decode_one_token(
328
+ self, token: torch.Tensor, cache_dict: Dict[str, torch.Tensor], last_token: bool
329
+ ):
330
+ """Decode one single token to audio.
331
+
332
+ Args:
333
+ token: (B=1, nq, L=1)
334
+ Returns:
335
+ audio: (B=1, t)
336
+ """
337
+ # token->latent->upsample, (naturally causal)
338
+ token = token.permute(1, 0, 2) # (B, nq, L) -> (nq, B, L)
339
+ vq_out_feats = self.rvq.decode_codes(token)
340
+ vq_out_feats = vq_out_feats.transpose(1, 2)
341
+ vq_out_length = torch.tensor(
342
+ [vq_out_feats.shape[1]], dtype=torch.long, device=vq_out_feats.device
343
+ )
344
+ vq_out_feats, vq_out_length = self.upsample(vq_out_feats, vq_out_length)
345
+ # acoustic decoder
346
+ up_conv_cache = cache_dict.get("up_conv_cache", None)
347
+ bb_conv_cache1 = cache_dict.get("bb_conv_cache1", None)
348
+ bb_conv_cache2 = cache_dict.get("bb_conv_cache2", None)
349
+ bb_kv_cache = cache_dict.get("bb_kv_cache", None)
350
+ is_cache = cache_dict.get("is_cache", None)
351
+
352
+ (
353
+ audio,
354
+ new_up_conv_cache,
355
+ new_bb_conv_cache1,
356
+ new_bb_conv_cache2,
357
+ new_bb_kv_cache,
358
+ new_is_cache,
359
+ ) = self.acoustic_decoder.forward_chunk(
360
+ vq_out_feats,
361
+ up_conv_cache,
362
+ bb_conv_cache1,
363
+ bb_conv_cache2,
364
+ bb_kv_cache,
365
+ is_cache,
366
+ last_token,
367
+ )
368
+
369
+ new_cache_dict = {
370
+ "up_conv_cache": new_up_conv_cache,
371
+ "bb_conv_cache1": new_bb_conv_cache1,
372
+ "bb_conv_cache2": new_bb_conv_cache2,
373
+ "bb_kv_cache": new_bb_kv_cache,
374
+ "is_cache": new_is_cache,
375
+ }
376
+ return audio, new_cache_dict
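Not part of the committed files — a minimal encode/decode sketch for RedCodecInfer; the config/checkpoint paths are hypothetical and the input is random 16 kHz audio:

import torch
codec = RedCodecInfer.from_pretrained("config.json", "codec.pt").eval()
audio16k = torch.randn(1, 3 * 16000)           # 3 seconds of 16 kHz audio
tokens, token_length = codec.encode(audio16k)  # tokens: (1, nq, L) at 12.5 Hz, L = ceil(48000 / 1280) = 38
recon = codec.decode(tokens)                   # recon: (1, t) waveform at the codec's 24 kHz output rate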
fireredtts2/codec/rvq.py ADDED
@@ -0,0 +1,164 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from einops import rearrange
5
+ from torch.nn.utils.parametrizations import weight_norm
6
+
7
+
8
+ def WNConv1d(*args, **kwargs):
9
+ return weight_norm(nn.Conv1d(*args, **kwargs))
10
+
11
+
12
+ def WNConvTranspose1d(*args, **kwargs):
13
+ return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
14
+
15
+
16
+ class VectorQuantize(nn.Module):
17
+ def __init__(
18
+ self,
19
+ input_dim: int,
20
+ codebook_size: int,
21
+ codebook_dim: int,
22
+ ):
23
+ super().__init__()
24
+ self.input_dim = input_dim
25
+ self.codebook_size = codebook_size
26
+ self.codebook_dim = codebook_dim
27
+
28
+ self.in_project = (
29
+ WNConv1d(
30
+ self.input_dim, self.codebook_dim, kernel_size=1
31
+ ) # (B, D, T) -> (B, D', T)
32
+ if self.input_dim != self.codebook_dim
33
+ else nn.Identity()
34
+ )
35
+ self.out_project = (
36
+ WNConv1d(
37
+ self.codebook_dim, self.input_dim, kernel_size=1
38
+ ) # (B, D', T) -> (B, D, T)
39
+ if self.input_dim != self.codebook_dim
40
+ else nn.Identity()
41
+ )
42
+
43
+ # Initialize codebook and EMA buffers
44
+ self.register_buffer(
45
+ "codebook", torch.zeros(codebook_size, codebook_dim).float()
46
+ ) # (codebook_size, D'), ensure fp32
47
+ # Place holder, not used in inference
48
+ self.register_buffer("inited", torch.tensor([True], dtype=torch.bool)) # (1)
49
+ self.register_buffer(
50
+ "cluster_size", torch.zeros(codebook_size).float()
51
+ ) # (codebook_size), ensure fp32
52
+ self.register_buffer(
53
+ "embed_avg", self.codebook.clone().float()
54
+ ) # (codebook_size, D'), ensure fp32
55
+
56
+ def decode_code(self, embed_id): # embed_id: (B, T)
57
+ embed = (
58
+ F.embedding(embed_id, self.codebook).transpose(1, 2).float()
59
+ ) # (B, D', T), ensure fp32
60
+ return embed
61
+
62
+ def encode_code(self, z: torch.Tensor): # z: (B, D, T)
63
+ # logging.info(f"{self.cluster_size = }, {self.codebook = }, {self.embed_avg = }, {self.inited = }")
64
+ z = z.float() # Ensure fp32
65
+ z_e = self.in_project(z).float() # (B, D', T), ensure fp32
66
+
67
+ # Rearrange for quantization
68
+ encodings = rearrange(z_e, "b d t -> (b t) d").float() # (B*T, D'), ensure fp32
69
+
70
+ # Quantization
71
+ dist = (
72
+ encodings.pow(2).sum(1, keepdim=True) # (B*T, 1)
73
+ - 2 * encodings @ self.codebook.float().t() # (B*T, codebook_size)
74
+ + self.codebook.float().pow(2).sum(1, keepdim=True).t()
75
+ ) # (1, codebook_size)
76
+
77
+ # dist: (B*T, codebook_size)
78
+ indices = (-dist).max(1)[1] # (B*T)
79
+ indices = rearrange(indices, "(b t) -> b t", b=z.size(0)) # (B, T)
80
+
81
+ # Get quantized vectors
82
+ z_q = self.decode_code(indices).float() # (B, D', T), ensure fp32
83
+
84
+ # Straight-through estimator
85
+ z_q = z_e + (z_q - z_e).detach() # (B, D', T)
86
+ z_q = self.out_project(z_q).float() # (B, D, T), ensure fp32
87
+
88
+ # z_q: (B, D, T), indices: (B, T)
89
+ return z_q, indices
90
+
91
+
92
+ class ResidualVQ(nn.Module):
93
+ def __init__(
94
+ self,
95
+ input_dim: int = 768, # Input feature dimension (independent of the RVQ dimension)
96
+ rvq_dim=None, # RVQ dimension; if it differs from input_dim/output_dim, input_dim->rvq_dim and rvq_dim->output_dim projections are added
97
+ output_dim: int = None, # Output feature dimension (independent of the RVQ dimension)
98
+ num_quantizers: int = 8,
99
+ codebook_size: int = 1024,
100
+ codebook_dim: int = 256, # Dimension of each codebook. If different from rvq_dim, will add rvq_dim->codebook_dim and codebook_dim->rvq_dim projections
101
+ ):
102
+ super().__init__()
103
+ self.input_dim = input_dim
104
+
105
+ self.num_quantizers = num_quantizers
106
+ self.codebook_size = codebook_size
107
+ self.codebook_dim = codebook_dim
108
+ self.rvq_dim = rvq_dim
109
+
110
+ self.input_proj = (
111
+ WNConv1d(input_dim, rvq_dim, kernel_size=1)
112
+ if input_dim != rvq_dim
113
+ else nn.Identity()
114
+ )
115
+ self.output_proj = (
116
+ WNConv1d(rvq_dim, output_dim, kernel_size=1)
117
+ if rvq_dim != output_dim
118
+ else nn.Identity()
119
+ )
120
+
121
+ self.quantizers = nn.ModuleList(
122
+ [
123
+ VectorQuantize(
124
+ input_dim=rvq_dim,
125
+ codebook_size=self.codebook_size,
126
+ codebook_dim=codebook_dim,
127
+ )
128
+ for i in range(num_quantizers)
129
+ ]
130
+ )
131
+
132
+ def encode_codes(self, z: torch.Tensor):
133
+ z = self.input_proj(z)
134
+ residual = z.clone().float() # (B, D, T), ensure fp32
135
+ all_indices = []
136
+ # Quantize to tokens
137
+ for i, quantizer in enumerate(self.quantizers):
138
+ # z_q_i: (B, D, T), indices_i: (B, T), ensure fp32
139
+ z_q_i, indices_i = quantizer.encode_code(residual)
140
+ residual = residual - z_q_i
141
+ all_indices.append(indices_i) # (B, T)
142
+ all_indices = torch.stack(all_indices) # (N, B, T)
143
+ return all_indices
144
+
145
+ def decode_codes(self, codes): # codes: (nq, B, T)
146
+ """Decode codes from multiple quantizers to embeddings.
147
+
148
+ Args:
149
+ codes: Tensor of shape (nq, B, T) containing code indices for each quantizer.
150
+
151
+ Returns:
152
+ emb: Tensor of shape (B, D, T) representing the decoded embeddings.
153
+ """
154
+ nq, B, T = codes.shape
155
+ device = codes.device
156
+ emb = torch.zeros(
157
+ B, self.rvq_dim, T, device=device, dtype=torch.float32
158
+ ) # (B, D, T)
159
+ for i, quantizer in enumerate(self.quantizers[:nq]):
160
+ code_i = codes[i] # (B, T)
161
+ quantized_i = quantizer.decode_code(code_i) # (B, D', T)
162
+ emb += quantizer.out_project(quantized_i) # Accumulate quantized embeddings
163
+ emb = self.output_proj(emb) # (B, D, T), apply output projection
164
+ return emb # (B, D, T)
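
A minimal round-trip sketch for the ResidualVQ module above; the dimensions are illustrative, and in the released codec the projections and codebooks come from the checkpoint rather than random initialization.

import torch
from fireredtts2.codec.rvq import ResidualVQ

rvq = ResidualVQ(
    input_dim=768, rvq_dim=256, output_dim=768,
    num_quantizers=8, codebook_size=1024, codebook_dim=256,
)
feats = torch.randn(2, 768, 50)   # (B, D, T) continuous features
codes = rvq.encode_codes(feats)   # (nq, B, T) integer code indices
recon = rvq.decode_codes(codes)   # (B, D, T) quantized reconstruction
print(codes.shape, recon.shape)   # torch.Size([8, 2, 50]) torch.Size([2, 768, 50])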
fireredtts2/codec/utils.py ADDED
@@ -0,0 +1,38 @@
1
+ import math
2
+ import torch
3
+
4
+
5
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
6
+ batch_size = lengths.size(0)
7
+ max_len = max_len if max_len > 0 else lengths.max().item()
8
+ seq_range = torch.arange(0, max_len, dtype=torch.int64, device=lengths.device)
9
+ seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
10
+ seq_length_expand = lengths.unsqueeze(-1)
11
+ mask = seq_range_expand >= seq_length_expand
12
+ return mask # (b, t)
13
+
14
+
15
+ def make_nonpad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
16
+ return ~make_pad_mask(lengths, max_len)
17
+
18
+
19
+ def make_block_causal_mask(
20
+ lengths: torch.Tensor, max_len: int = 0, chunk_size: int = 4
21
+ ) -> torch.Tensor:
22
+ mask = make_nonpad_mask(lengths, max_len) # (b, t)
23
+ attn_mask = torch.logical_and(mask.unsqueeze(1), mask.unsqueeze(2)) # (b, t, t)
24
+
25
+ num_blocks = math.ceil(attn_mask.shape[1] / chunk_size)
26
+ block_mask = torch.block_diag(
27
+ *[torch.ones(chunk_size, chunk_size) for _ in range(num_blocks)]
28
+ )
29
+ block_mask = block_mask[: attn_mask.shape[1], : attn_mask.shape[1]].to(
30
+ attn_mask
31
+ ) # (t, t)
32
+
33
+ diag_mask = attn_mask.new_full(
34
+ (1, attn_mask.shape[1], attn_mask.shape[2]), fill_value=True
35
+ ).tril() # (1, t, t)
36
+ diag_mask = diag_mask.logical_or(block_mask)
37
+ attn_mask = attn_mask.logical_and(diag_mask)
38
+ return attn_mask
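
A small worked example of make_block_causal_mask: with chunk_size=2 every frame may attend to all previous frames plus the rest of its own 2-frame block.

import torch
from fireredtts2.codec.utils import make_block_causal_mask

mask = make_block_causal_mask(torch.tensor([4]), chunk_size=2)  # (1, 4, 4) bool
print(mask[0].long())
# tensor([[1, 1, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 1],
#         [1, 1, 1, 1]])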
fireredtts2/codec/whisper.py ADDED
@@ -0,0 +1,420 @@
1
+ # Extracted from transformers' WhisperModel to simplify package dependency
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from typing import Optional, Literal
7
+ from fireredtts2.codec.utils import make_nonpad_mask
8
+ from fireredtts2.codec.audio import mel_filter_bank
9
+
10
+
11
+ def sinusoids(length: int, channels: int, max_timescale: float = 10000) -> torch.Tensor:
12
+ """Returns sinusoids for positional embedding"""
13
+ if channels % 2 != 0:
14
+ raise ValueError(
15
+ f"Number of channels has to be divisible by 2 for sinusoidal positional embeddings, got {channels} channels."
16
+ )
17
+ log_timescale_increment = math.log(max_timescale) / (channels // 2 - 1)
18
+ inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
19
+ scaled_time = torch.arange(length).view(-1, 1) * inv_timescales.view(1, -1)
20
+ return torch.cat([scaled_time.sin(), scaled_time.cos()], dim=1)
21
+
22
+
23
+ class WhisperSdpaAttention(nn.Module):
24
+ def __init__(
25
+ self,
26
+ embed_dim: int,
27
+ num_heads: int,
28
+ dropout: float = 0.0,
29
+ bias: bool = True,
30
+ ):
31
+ super().__init__()
32
+ self.embed_dim = embed_dim
33
+ self.num_heads = num_heads
34
+ self.dropout = dropout
35
+ self.head_dim = embed_dim // num_heads
36
+ self.bias = bias
37
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
38
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
39
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
40
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
41
+
42
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
43
+ return (
44
+ tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
45
+ .transpose(1, 2)
46
+ .contiguous()
47
+ )
48
+
49
+ def forward(
50
+ self,
51
+ hidden_states: torch.Tensor,
52
+ attention_mask: Optional[torch.Tensor] = None,
53
+ ):
54
+ """
55
+ Args:
56
+ attention_mask: bool or float mask. For a bool mask, True marks positions that may be attended to; a float mask is added to the attention scores.
57
+ """
58
+ bsz, tgt_len, _ = hidden_states.size()
59
+
60
+ query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz)
61
+ key_states = self._shape(self.k_proj(hidden_states), tgt_len, bsz)
62
+ value_states = self._shape(self.v_proj(hidden_states), tgt_len, bsz)
63
+
64
+ # NOTE sdpa needs a 4-dim attention_mask: (b, nh, tq, tv)
65
+ if attention_mask is not None and len(attention_mask.shape) == 3:
66
+ attention_mask = attention_mask.unsqueeze(1)
67
+
68
+ attn_output = F.scaled_dot_product_attention(
69
+ query_states,
70
+ key_states,
71
+ value_states,
72
+ attn_mask=attention_mask,
73
+ dropout_p=self.dropout if self.training else 0.0,
74
+ ) # (bsz, nh, l, d)
75
+ attn_output = attn_output.transpose(1, 2)
76
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
77
+
78
+ attn_output = self.out_proj(attn_output)
79
+ return attn_output
80
+
81
+ def forward_chunk(
82
+ self,
83
+ hidden_states: torch.Tensor,
84
+ kv_cache: torch.Tensor = None,
85
+ ):
86
+ """Forward self-attention with kv cache.
87
+
88
+ Args:
89
+ hidden_states: shape (b, t, c)
90
+ kv_cache: shape (b, nh, t, c*2)
91
+ """
92
+ bsz, tgt_len, _ = hidden_states.size()
93
+
94
+ # shape (b, nh, t, c)
95
+ query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz)
96
+ key_states = self._shape(self.k_proj(hidden_states), tgt_len, bsz)
97
+ value_states = self._shape(self.v_proj(hidden_states), tgt_len, bsz)
98
+
99
+ # unpack cache
100
+ if kv_cache is not None:
101
+ k_cache, v_cache = kv_cache.chunk(2, dim=-1)
102
+ key_states = torch.cat([k_cache, key_states], dim=2)
103
+ value_states = torch.cat([v_cache, value_states], dim=2)
104
+ new_kv_cache = torch.cat([key_states, value_states], dim=-1)
105
+
106
+ # attention
107
+ attn_output = F.scaled_dot_product_attention(
108
+ query_states,
109
+ key_states,
110
+ value_states,
111
+ attn_mask=None,
112
+ dropout_p=0.0,
113
+ ) # (bsz, nh, l, d)
114
+ attn_output = attn_output.transpose(1, 2)
115
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
116
+
117
+ attn_output = self.out_proj(attn_output)
118
+ return attn_output, new_kv_cache
119
+
120
+
121
+ class WhisperEncoderLayer(nn.Module):
122
+ def __init__(
123
+ self,
124
+ embed_dim: int,
125
+ num_heads: int,
126
+ ffn_dim: int = None,
127
+ attn_dropout: float = 0.0,
128
+ dropout: float = 0.0,
129
+ ):
130
+ super().__init__()
131
+ self.dropout = dropout
132
+ # Attention
133
+ self.self_attn = WhisperSdpaAttention(embed_dim, num_heads, attn_dropout)
134
+ self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
135
+ # FFN
136
+ ffn_dim = ffn_dim if ffn_dim is not None else embed_dim * 4
137
+ self.fc1 = nn.Linear(embed_dim, ffn_dim)
138
+ self.fc2 = nn.Linear(ffn_dim, embed_dim)
139
+ # Output norm
140
+ self.final_layer_norm = nn.LayerNorm(embed_dim)
141
+
142
+ def forward(
143
+ self,
144
+ hidden_states: torch.Tensor,
145
+ attention_mask: torch.Tensor,
146
+ ):
147
+ # Attention
148
+ residual = hidden_states
149
+ hidden_states = self.self_attn_layer_norm(hidden_states)
150
+ hidden_states = self.self_attn(hidden_states, attention_mask)
151
+ hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
152
+ hidden_states = residual + hidden_states
153
+
154
+ # FFN
155
+ residual = hidden_states
156
+ hidden_states = self.final_layer_norm(hidden_states)
157
+ hidden_states = F.gelu(self.fc1(hidden_states))
158
+ hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
159
+ hidden_states = self.fc2(hidden_states)
160
+ hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
161
+ hidden_states = residual + hidden_states
162
+ return hidden_states
163
+
164
+ def forward_chunk(
165
+ self,
166
+ hidden_states: torch.Tensor,
167
+ kv_cache: torch.Tensor = None,
168
+ ):
169
+ """Forward self-attention with kv cache.
170
+
171
+ Args:
172
+ hidden_states: shape (b, t, c)
173
+ kv_cache: shape (b, nh, t, c*2)
174
+ """
175
+ # Attention
176
+ residual = hidden_states
177
+ hidden_states = self.self_attn_layer_norm(hidden_states)
178
+ hidden_states, new_kv_cache = self.self_attn.forward_chunk(
179
+ hidden_states, kv_cache
180
+ )
181
+ hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
182
+ hidden_states = residual + hidden_states
183
+
184
+ # FFN
185
+ residual = hidden_states
186
+ hidden_states = self.final_layer_norm(hidden_states)
187
+ hidden_states = F.gelu(self.fc1(hidden_states))
188
+ hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
189
+ hidden_states = self.fc2(hidden_states)
190
+ hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
191
+ hidden_states = residual + hidden_states
192
+ return hidden_states, new_kv_cache
193
+
194
+
195
+ class WhisperEncoder(nn.Module):
196
+ def __init__(
197
+ self,
198
+ in_dim: int,
199
+ embed_dim: int,
200
+ num_layers: int,
201
+ num_heads: int,
202
+ ffn_dim: int = None,
203
+ attn_dropout: float = 0.0,
204
+ dropout: float = 0.0,
205
+ max_positions: int = 1500,
206
+ ):
207
+ super().__init__()
208
+ self.in_dim = in_dim
209
+ self.embed_dim = embed_dim
210
+ self.dropout = dropout
211
+ # Input downsampling
212
+ self.conv1 = nn.Conv1d(in_dim, embed_dim, kernel_size=3, padding=1)
213
+ self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
214
+ # Fixed positional embedding
215
+ self.max_positions = max_positions
216
+ self.embed_positions = nn.Embedding(self.max_positions, embed_dim)
217
+ self.embed_positions.requires_grad_(False)
218
+ # Transformer
219
+ self.layers = nn.ModuleList(
220
+ [
221
+ WhisperEncoderLayer(
222
+ embed_dim, num_heads, ffn_dim, attn_dropout, dropout
223
+ )
224
+ for _ in range(num_layers)
225
+ ]
226
+ )
227
+ # Output norm
228
+ self.layer_norm = nn.LayerNorm(embed_dim)
229
+ # Init weight
230
+ self.apply(self._init_weights)
231
+ # Init position embedding
232
+ self.embed_positions.weight.copy_(sinusoids(*self.embed_positions.weight.shape))
233
+
234
+ def forward(
235
+ self,
236
+ hidden_states: torch.Tensor,
237
+ hidden_length: torch.Tensor,
238
+ apply_position: bool = True,
239
+ ):
240
+ # Downsampling
241
+ hidden_states = hidden_states.transpose(1, 2)
242
+ hidden_states = F.gelu(self.conv1(hidden_states))
243
+ hidden_states = F.gelu(self.conv2(hidden_states))
244
+ hidden_states = hidden_states.transpose(1, 2)
245
+ hidden_length = hidden_length // 2 # from 100Hz -> 50Hz
246
+ # Pos encoding
247
+ if apply_position:
248
+ pos_embed = self.embed_positions(
249
+ torch.arange(0, hidden_states.shape[1], device=hidden_states.device)
250
+ )
251
+ hidden_states = hidden_states + pos_embed
252
+ hidden_states = nn.functional.dropout(
253
+ hidden_states, p=self.dropout, training=self.training
254
+ )
255
+ # Transformer
256
+ attention_mask = make_nonpad_mask(hidden_length).unsqueeze(1) # (b, 1, t)
257
+ for layer in self.layers:
258
+ hidden_states = layer(hidden_states, attention_mask)
259
+
260
+ hidden_states = self.layer_norm(hidden_states)
261
+ return hidden_states, hidden_length
262
+
263
+ def _init_weights(self, module):
264
+ std = 0.02
265
+ if isinstance(module, (nn.Linear, nn.Conv1d)):
266
+ module.weight.data.normal_(mean=0.0, std=std)
267
+ if module.bias is not None:
268
+ module.bias.data.zero_()
269
+ elif isinstance(module, nn.Embedding):
270
+ module.weight.data.normal_(mean=0.0, std=std)
271
+ if module.padding_idx is not None:
272
+ module.weight.data[module.padding_idx].zero_()
273
+
274
+
275
+ class WhisperMelExtractor(nn.Module):
276
+ def __init__(
277
+ self,
278
+ num_mels: int = 128,
279
+ sampling_rate: int = 16000,
280
+ hop_length: int = 160,
281
+ n_fft: int = 400,
282
+ fmin: float = 0,
283
+ fmax: float = 8000,
284
+ padding_value=0.0,
285
+ ):
286
+ super().__init__()
287
+ self.num_mels = num_mels
288
+ self.sampling_rate = sampling_rate
289
+ self.hop_length = hop_length
290
+ self.n_fft = n_fft
291
+ self.fmin = fmin
292
+ self.fmax = fmax
293
+ self.padding_value = padding_value
294
+ self.mel_filters = mel_filter_bank(
295
+ num_frequency_bins=(1 + n_fft // 2),
296
+ num_mel_filters=num_mels,
297
+ min_frequency=fmin,
298
+ max_frequency=fmax,
299
+ sampling_rate=sampling_rate,
300
+ norm="slaney",
301
+ mel_scale="slaney",
302
+ )
303
+
304
+ def extract_fbank(self, audio: torch.Tensor):
305
+ """
306
+ Args:
307
+ audio: batched audio of shape (b, t)
308
+ """
309
+ device = audio.device # compute on cuda if input is on cuda
310
+ # Mel
311
+ window = torch.hann_window(self.n_fft).to(device)
312
+ stft = torch.stft(
313
+ audio, self.n_fft, self.hop_length, window=window, return_complex=True
314
+ )
315
+ magnitudes = stft[..., :-1].abs() ** 2
316
+ mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32).to(device)
317
+ mel_spec = mel_filters.T @ magnitudes
318
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
319
+ # Norm
320
+ max_val = log_spec.max(dim=2, keepdim=True)[0].max(dim=1, keepdim=True)[0]
321
+ log_spec = torch.maximum(log_spec, max_val - 8.0)
322
+ log_spec = (log_spec + 4.0) / 4.0
323
+ return log_spec
324
+
325
+ def __call__(self, audio16k: torch.Tensor, audio16k_length: torch.Tensor):
326
+ mel = self.extract_fbank(audio16k).transpose(1, 2)
327
+ mel_length = audio16k_length // self.hop_length
328
+ # mel: (b, t, c=128)
329
+ return mel, mel_length
330
+
331
+
332
+ # Pretrained encoder from whisper-large-v3
333
+ class PretrainedWhisperEncoder(WhisperEncoder):
334
+ @classmethod
335
+ def from_pretrained(cls, pretrained_path: str = None):
336
+ encoder = cls(
337
+ in_dim=128,
338
+ embed_dim=1280,
339
+ num_layers=32,
340
+ num_heads=20,
341
+ ffn_dim=5120,
342
+ attn_dropout=0.0,
343
+ max_positions=1500,
344
+ )
345
+ if pretrained_path is not None:
346
+ ckpt = torch.load(pretrained_path, map_location="cpu")
347
+ encoder.load_state_dict(ckpt)
348
+ encoder.eval()
349
+ # Disable grad
350
+ for p in encoder.parameters():
351
+ p.requires_grad_(False)
352
+ # Add Mel extractor
353
+ encoder.feature_extractor = WhisperMelExtractor(
354
+ num_mels=128,
355
+ sampling_rate=16000,
356
+ hop_length=160,
357
+ n_fft=400,
358
+ fmin=0,
359
+ fmax=8000,
360
+ )
361
+ return encoder
362
+
363
+ @torch.inference_mode()
364
+ def forward(self, audio16k: torch.Tensor, audio16k_length: torch.Tensor):
365
+ # Extract mel
366
+ mel, mel_length = self.feature_extractor(audio16k, audio16k_length)
367
+ # Forward model
368
+ semantic_feats, semantic_length = super().forward(
369
+ mel, mel_length, apply_position=True
370
+ )
371
+ return semantic_feats, semantic_length
372
+
373
+
374
+ class WhisperAcousticEncoder(WhisperEncoder):
375
+ def __init__(
376
+ self,
377
+ # Mel extraction params
378
+ num_mels: int = 128,
379
+ sampling_rate: int = 16000,
380
+ hop_length: int = 160,
381
+ n_fft: int = 400,
382
+ fmin: float = 0.0,
383
+ fmax: float = 8000,
384
+ # Encoder params
385
+ embed_dim: int = 768,
386
+ num_layers: int = 12,
387
+ num_heads: int = 8,
388
+ ffn_dim: int = None,
389
+ attn_dropout: float = 0.0,
390
+ dropout: float = 0.0,
391
+ max_positions: int = 1500, # 50Hz * 30s
392
+ ):
393
+ super().__init__(
394
+ in_dim=num_mels,
395
+ embed_dim=embed_dim,
396
+ num_layers=num_layers,
397
+ num_heads=num_heads,
398
+ ffn_dim=ffn_dim,
399
+ attn_dropout=attn_dropout,
400
+ dropout=dropout,
401
+ max_positions=max_positions,
402
+ )
403
+ self.feature_extractor = WhisperMelExtractor(
404
+ num_mels=num_mels,
405
+ sampling_rate=sampling_rate,
406
+ hop_length=hop_length,
407
+ n_fft=n_fft,
408
+ fmin=fmin,
409
+ fmax=fmax,
410
+ )
411
+
412
+ def forward(self, audio16k: torch.Tensor, audio16k_length: torch.Tensor):
413
+ # Extract mel
414
+ with torch.no_grad():
415
+ mel, mel_length = self.feature_extractor(audio16k, audio16k_length)
416
+ # Forward model
417
+ hidden_states, hidden_length = super().forward(
418
+ mel, mel_length, apply_position=True
419
+ )
420
+ return hidden_states, hidden_length
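
A minimal sketch of running the acoustic encoder variant above on one second of 16 kHz audio with its default configuration; PretrainedWhisperEncoder.from_pretrained follows the same call pattern once the whisper-large-v3 weights are available.

import torch
from fireredtts2.codec.whisper import WhisperAcousticEncoder

encoder = WhisperAcousticEncoder()        # defaults: 128 mel bins in, 768-dim, 12 layers
audio16k = torch.randn(1, 16000)          # (B, T), one second at 16 kHz
length = torch.tensor([16000], dtype=torch.long)
feats, feat_length = encoder(audio16k, length)
print(feats.shape, feat_length)           # torch.Size([1, 50, 768]) tensor([50]), i.e. 50 Hz frames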
fireredtts2/fireredtts2.py ADDED
@@ -0,0 +1,459 @@
1
+ import os
2
+ import time
3
+ import json
4
+ import torch
5
+ import torchaudio
6
+
7
+ from typing import List, Tuple
8
+ from fireredtts2.codec import RedCodecInfer
9
+ from fireredtts2.llm import load_llm_model, load_custom_tokenizer
10
+ from fireredtts2.llm.utils import Segment
11
+ from fireredtts2.utils.spliter import clean_text, split_text, process_text_list
12
+ from tqdm import tqdm
13
+
14
+
15
+ class FireRedTTS2:
16
+ def __init__(self, pretrained_dir, gen_type, device):
17
+
18
+ assert os.path.exists(pretrained_dir)
19
+ assert gen_type in ["monologue", "dialogue"]
20
+ llm_config_path = os.path.join(pretrained_dir, "config_llm.json")
21
+ if gen_type == "monologue":
22
+ llm_ckpt_path = os.path.join(pretrained_dir, "llm_pretrain.pt")
23
+ # llm_ckpt_path = os.path.join(pretrained_dir, "llm_posttrain.pt")
24
+ else:
25
+ llm_ckpt_path = os.path.join(pretrained_dir, "llm_posttrain.pt")
26
+ codec_config_path = os.path.join(pretrained_dir, "config_codec.json")
27
+ codec_ckpt_path = os.path.join(pretrained_dir, "codec.pt")
28
+ pretrained_qwen_path = os.path.join(pretrained_dir, "Qwen2.5-1.5B")
29
+
30
+ # check
31
+ assert os.path.exists(llm_config_path)
32
+ assert os.path.exists(llm_ckpt_path)
33
+ assert os.path.exists(codec_config_path)
34
+ assert os.path.exists(codec_ckpt_path)
35
+ assert os.path.exists(pretrained_qwen_path)
36
+
37
+ # ==== Load Torch LLM ====
38
+ llm_config = json.load(open(llm_config_path))
39
+ self._model = load_llm_model(
40
+ configs=llm_config, checkpoint_path=llm_ckpt_path, device=device
41
+ )
42
+ self._model.eval()
43
+ self._model.setup_caches(1)
44
+ print("[INFO] LLM Loaded...")
45
+
46
+ # ==== Load Qwen2.5 Text Tokenizer ====
47
+ self._text_tokenizer = load_custom_tokenizer(pretrained_qwen_path)
48
+ print("[INFO] Text Tokenizer Loaded...")
49
+
50
+ # ==== Load Torch Audio Tokenizer ====
51
+ torch_codec = RedCodecInfer.from_pretrained(codec_config_path, codec_ckpt_path)
52
+ torch_codec.eval()
53
+ self._audio_tokenizer = torch_codec.to(device)
54
+ print("[INFO] Codec Loaded...")
55
+
56
+ self.sample_rate = 16000
57
+ self.device = device
58
+ self.max_seq_len = 3100
59
+
60
+ def load_prompt_audio(self, audio_path) -> torch.Tensor:
61
+ audio, audio_sr = torchaudio.load(audio_path)
62
+ # Downmix to a single channel by keeping the first channel
63
+ if audio.shape[0] > 1:
64
+ audio = audio[0, :].unsqueeze(0)
65
+ audio16k = torchaudio.functional.resample(audio, audio_sr, 16000)
66
+ return audio16k
67
+
68
+ def prepare_prompt(self, text, speaker, audio_path) -> Segment:
69
+ audio_tensor = self.load_prompt_audio(audio_path)
70
+ return Segment(text=text, speaker=speaker, audio=audio_tensor)
71
+
72
+ def _tokenize_text_segment(
73
+ self, text: str, speaker: str
74
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
75
+ frame_tokens = []
76
+ frame_masks = []
77
+
78
+ text = speaker + "<|text_start|>" + text + "<|text_end|>"
79
+ text_tokens = self._text_tokenizer.encode(text)
80
+ text_frame = torch.zeros(len(text_tokens), 17).long()
81
+ text_frame_mask = torch.zeros(len(text_tokens), 17).bool()
82
+ text_frame[:, -1] = torch.tensor(text_tokens)
83
+ text_frame_mask[:, -1] = True
84
+
85
+ frame_tokens.append(text_frame.to(self.device))
86
+ frame_masks.append(text_frame_mask.to(self.device))
87
+
88
+ return torch.cat(frame_tokens, dim=0), torch.cat(frame_masks, dim=0)
89
+
90
+ def _tokenize_audio(self, audio: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
91
+ frame_tokens = []
92
+ frame_masks = []
93
+
94
+ # (K, T)
95
+ audio_length = torch.tensor([audio.shape[1]], dtype=torch.long)
96
+ audio_tokens, token_length = self._audio_tokenizer.encode(
97
+ audio.to(self.device),
98
+ audio_length.to(self.device),
99
+ batch_size=48,
100
+ )
101
+
102
+ audio_tokens = audio_tokens.squeeze(0)
103
+ # add EOS frame
104
+ eos_frame = torch.zeros(audio_tokens.size(0), 1).to(self.device)
105
+ audio_tokens = torch.cat([audio_tokens, eos_frame], dim=1)
106
+
107
+ audio_frame = torch.zeros(audio_tokens.size(1), 17).long().to(self.device)
108
+ audio_frame_mask = torch.zeros(audio_tokens.size(1), 17).bool().to(self.device)
109
+ audio_frame[:, :-1] = audio_tokens.transpose(0, 1)
110
+ audio_frame_mask[:, :-1] = True
111
+
112
+ frame_tokens.append(audio_frame)
113
+ frame_masks.append(audio_frame_mask)
114
+
115
+ return torch.cat(frame_tokens, dim=0), torch.cat(frame_masks, dim=0)
116
+
117
+ def _tokenize_segment(self, segment: Segment) -> Tuple[torch.Tensor, torch.Tensor]:
118
+ """
119
+ Returns:
120
+ (seq_len,17), (seq_len, 17)
121
+ """
122
+ text_tokens, text_masks = self._tokenize_text_segment(
123
+ segment.text, segment.speaker
124
+ )
125
+ audio_tokens, audio_masks = self._tokenize_audio(segment.audio)
126
+
127
+ return torch.cat([text_tokens, audio_tokens], dim=0), torch.cat(
128
+ [text_masks, audio_masks], dim=0
129
+ )
130
+
131
+ @torch.inference_mode()
132
+ def generate(
133
+ self,
134
+ text: str,
135
+ speaker: str,
136
+ context: List[Segment],
137
+ max_audio_length_ms: float = 90_000,
138
+ temperature: float = 0.9,
139
+ topk: int = 20,
140
+ ) -> torch.Tensor:
141
+ self._model.reset_caches()
142
+
143
+ max_generation_len = int(max_audio_length_ms / 80)
144
+ tokens, tokens_mask = [], []
145
+ for segment in context:
146
+ segment_tokens, segment_tokens_mask = self._tokenize_segment(segment)
147
+ tokens.append(segment_tokens)
148
+ tokens_mask.append(segment_tokens_mask)
149
+
150
+ gen_segment_tokens, gen_segment_tokens_mask = self._tokenize_text_segment(
151
+ text, speaker
152
+ )
153
+ tokens.append(gen_segment_tokens)
154
+ tokens_mask.append(gen_segment_tokens_mask)
155
+
156
+ prompt_tokens = torch.cat(tokens, dim=0).long().to(self.device)
157
+ prompt_tokens_mask = torch.cat(tokens_mask, dim=0).bool().to(self.device)
158
+
159
+ samples = []
160
+ curr_tokens = prompt_tokens.unsqueeze(0)
161
+ curr_tokens_mask = prompt_tokens_mask.unsqueeze(0)
162
+ curr_pos = (
163
+ torch.arange(0, prompt_tokens.size(0)).unsqueeze(0).long().to(self.device)
164
+ )
165
+
166
+ max_seq_len = 3100
167
+ max_context_len = max_seq_len - max_generation_len
168
+ if curr_tokens.size(1) >= max_context_len:
169
+ raise ValueError(
170
+ f"Inputs too long, must be below max_seq_len - max_generation_len: {max_context_len}"
171
+ )
172
+
173
+ for _ in range(max_generation_len):
174
+ sample = self._model.generate_frame(
175
+ curr_tokens, curr_tokens_mask, curr_pos, temperature, topk
176
+ )
177
+ # eos
178
+ if torch.all(sample == 0):
179
+ break
180
+
181
+ samples.append(sample)
182
+
183
+ curr_tokens = torch.cat(
184
+ [sample, torch.zeros(1, 1).long().to(self.device)], dim=1
185
+ ).unsqueeze(1)
186
+ curr_tokens_mask = torch.cat(
187
+ [
188
+ torch.ones_like(sample).bool(),
189
+ torch.zeros(1, 1).bool().to(self.device),
190
+ ],
191
+ dim=1,
192
+ ).unsqueeze(1)
193
+ curr_pos = curr_pos[:, -1:] + 1
194
+
195
+ audio = (
196
+ self._audio_tokenizer.decode(torch.stack(samples).permute(1, 2, 0))
197
+ .squeeze(0)
198
+ .squeeze(0)
199
+ )
200
+
201
+ return audio
202
+
203
+ def generate_single(
204
+ self, context: List[Segment], temperature: float = 0.9, topk: int = 20
205
+ ):
206
+ self._model.reset_caches()
207
+ max_generation_len = 400
208
+ tokens, tokens_mask = [], []
209
+ for segment in context:
210
+ segment_tokens, segment_tokens_mask = self._tokenize_segment(segment)
211
+ tokens.append(segment_tokens)
212
+ tokens_mask.append(segment_tokens_mask)
213
+
214
+ prompt_tokens = torch.cat(tokens, dim=0).long().to(self.device)
215
+ prompt_tokens_mask = torch.cat(tokens_mask, dim=0).bool().to(self.device)
216
+ prompt_tokens = prompt_tokens[:-3, :]
217
+ prompt_tokens_mask = prompt_tokens_mask[:-3, :]
218
+
219
+ samples = []
220
+ curr_tokens = prompt_tokens.unsqueeze(0)
221
+ curr_tokens_mask = prompt_tokens_mask.unsqueeze(0)
222
+ curr_pos = (
223
+ torch.arange(0, prompt_tokens.size(0)).unsqueeze(0).long().to(self.device)
224
+ )
225
+
226
+ num_token = 0
227
+ start_time = time.time()
228
+ for _ in range(max_generation_len):
229
+ sample = self._model.generate_frame(
230
+ curr_tokens, curr_tokens_mask, curr_pos, temperature, topk
231
+ )
232
+ # eos
233
+ if torch.all(sample == 0):
234
+ break
235
+
236
+ samples.append(sample)
237
+
238
+ curr_tokens = torch.cat(
239
+ [sample, torch.zeros(1, 1).long().to(self.device)], dim=1
240
+ ).unsqueeze(1)
241
+ curr_tokens_mask = torch.cat(
242
+ [
243
+ torch.ones_like(sample).bool(),
244
+ torch.zeros(1, 1).bool().to(self.device),
245
+ ],
246
+ dim=1,
247
+ ).unsqueeze(1)
248
+ curr_pos = curr_pos[:, -1:] + 1
249
+ num_token += 1
250
+ if num_token == 2:
251
+ end_time = time.time()
252
+ duration = end_time - start_time
253
+ print("---first pack duration:", duration)
254
+
255
+ gen_tokens = torch.stack(samples).permute(1, 2, 0)
256
+
257
+ return gen_tokens
258
+
259
+ # @torch.inference_mode()
260
+ # def generate_stream(
261
+ # self,
262
+ # text: str,
263
+ # speaker: str,
264
+ # context: List[Segment],
265
+ # max_audio_length_ms: float = 90_000,
266
+ # temperature: float = 0.9,
267
+ # topk: int = 50,
268
+ # ):
269
+ # self._model.reset_caches()
270
+
271
+ # max_generation_len = int(max_audio_length_ms / 80)
272
+ # tokens, tokens_mask = [], []
273
+ # for segment in context:
274
+ # segment_tokens, segment_tokens_mask = self._tokenize_segment(segment)
275
+ # tokens.append(segment_tokens)
276
+ # tokens_mask.append(segment_tokens_mask)
277
+
278
+ # gen_segment_tokens, gen_segment_tokens_mask = self._tokenize_text_segment(
279
+ # text, speaker
280
+ # )
281
+ # tokens.append(gen_segment_tokens)
282
+ # tokens_mask.append(gen_segment_tokens_mask)
283
+
284
+ # prompt_tokens = torch.cat(tokens, dim=0).long().to(self.device)
285
+ # prompt_tokens_mask = torch.cat(tokens_mask, dim=0).bool().to(self.device)
286
+
287
+ # samples = []
288
+ # curr_tokens = prompt_tokens.unsqueeze(0)
289
+ # curr_tokens_mask = prompt_tokens_mask.unsqueeze(0)
290
+ # curr_pos = (
291
+ # torch.arange(0, prompt_tokens.size(0)).unsqueeze(0).long().to(self.device)
292
+ # )
293
+
294
+ # max_seq_len = 3100
295
+ # max_context_len = max_seq_len - max_generation_len
296
+ # if curr_tokens.size(1) >= max_context_len:
297
+ # raise ValueError(
298
+ # f"Inputs too long, must be below max_seq_len - max_generation_len: {max_context_len}"
299
+ # )
300
+
301
+ # # codec cache
302
+ # codec_cache = {}
303
+ # prev_sample = None
304
+
305
+ # for _ in range(max_generation_len):
306
+ # sample = self._model.generate_frame(
307
+ # curr_tokens, curr_tokens_mask, curr_pos, temperature, topk
308
+ # )
309
+ # # eos
310
+ # if torch.all(sample == 0):
311
+ # break
312
+
313
+ # # decode one token
314
+ # if prev_sample is None:
315
+ # prev_sample = sample # sample: (b, nq)
316
+ # else:
317
+ # audio_chunk, codec_cache = self._audio_tokenizer.decode_one_token(
318
+ # prev_sample.unsqueeze(-1),
319
+ # codec_cache,
320
+ # last_token=False,
321
+ # )
322
+ # yield audio_chunk.squeeze(0)
323
+ # prev_sample = sample
324
+ # samples.append(sample) # sample: (b, nq)
325
+
326
+ # curr_tokens = torch.cat(
327
+ # [sample, torch.zeros(1, 1).long().to(self.device)], dim=1
328
+ # ).unsqueeze(1)
329
+ # curr_tokens_mask = torch.cat(
330
+ # [
331
+ # torch.ones_like(sample).bool(),
332
+ # torch.zeros(1, 1).bool().to(self.device),
333
+ # ],
334
+ # dim=1,
335
+ # ).unsqueeze(1)
336
+ # curr_pos = curr_pos[:, -1:] + 1
337
+
338
+ # audio_chunk, codec_cache = self._audio_tokenizer.decode_one_token(
339
+ # prev_sample.unsqueeze(-1),
340
+ # codec_cache,
341
+ # last_token=True,
342
+ # )
343
+ # yield audio_chunk.squeeze(0)
344
+
345
+ @torch.inference_mode()
346
+ def generate_dialogue(
347
+ self,
348
+ text_list,
349
+ prompt_wav_list=None,
350
+ prompt_text_list=None,
351
+ temperature=0.9,
352
+ topk=20,
353
+ ):
354
+ all_generated_segments = []
355
+ all_storage_segments = []
356
+ prompt_segments = []
357
+ text_list = process_text_list(text_list=text_list)
358
+ if prompt_wav_list is not None:
359
+ assert len(prompt_wav_list) == len(prompt_text_list)
360
+ # Prepare prompts
361
+ for i in range(len(prompt_wav_list)):
362
+ prompt_wav = prompt_wav_list[i]
363
+ prompt_text = prompt_text_list[i]
364
+ speaker = prompt_text[:4]
365
+ assert speaker in ["[S1]", "[S2]", "[S3]", "[S4]"]
366
+ prompt_segments.append(
367
+ self.prepare_prompt(
368
+ text=prompt_text, speaker=speaker, audio_path=prompt_wav
369
+ )
370
+ )
371
+
372
+ for text in tqdm(text_list):
373
+ speaker = text[:4]
374
+ text = text[4:]
375
+ # print("---speaker:", speaker)
376
+ # print("---text:", text)
377
+ assert speaker in ["[S1]", "[S2]", "[S3]", "[S4]"]
378
+
379
+ audio_tensor = self.generate(
380
+ text=text,
381
+ speaker=speaker,
382
+ context=prompt_segments + all_generated_segments,
383
+ max_audio_length_ms=30_000,
384
+ temperature=temperature,
385
+ topk=topk,
386
+ )
387
+
388
+ # Resample the generated audio to 16 kHz before adding it to the context
389
+ audio_16k = torchaudio.functional.resample(
390
+ audio_tensor.unsqueeze(0), 24000, 16000
391
+ )
392
+ all_generated_segments.append(
393
+ Segment(text=text, speaker=speaker, audio=audio_16k)
394
+ )
395
+
396
+ all_storage_segments.append(
397
+ Segment(text=text, speaker=speaker, audio=audio_tensor.unsqueeze(0))
398
+ )
399
+
400
+ # Concatenate all generations
401
+ all_audio = torch.cat([seg.audio for seg in all_storage_segments], dim=1)
402
+ all_audio = all_audio.cpu()
403
+ return all_audio
404
+
405
+ @torch.inference_mode()
406
+ def generate_monologue(
407
+ self, text, prompt_wav=None, prompt_text=None, temperature=0.75, topk=20
408
+ ):
409
+ # step1. construct context
410
+ if prompt_wav is not None:
411
+ assert os.path.exists(prompt_wav)
412
+ assert prompt_text is not None
413
+
414
+ all_generated_segments = []
415
+ all_storage_segments = []
416
+ prompt_segments = []
417
+ prompt_text = clean_text(text=prompt_text)
418
+ text = clean_text(text=text)
419
+ text_list = split_text(text=text, length=400)
420
+
421
+ audio_list = []
422
+ for text in text_list:
423
+ text = clean_text(text=text)
424
+ input_text = prompt_text[:-1] + "," + text
425
+ prompt_a = self.prepare_prompt(
426
+ text=input_text, speaker="[S1]", audio_path=prompt_wav
427
+ )
428
+
429
+ context = [prompt_a]
430
+
431
+ while True:
432
+ gen_tokens = self.generate_single(
433
+ context=context, temperature=temperature, topk=topk
434
+ )
435
+ if gen_tokens.shape[2] > 18:
436
+ break
437
+ # else:
438
+ # print("生成结果小于1s,重新跑")
439
+
440
+ gen_tokens = gen_tokens[:, :, 2:] # cut leading silence
441
+ audio = self._audio_tokenizer.decode(gen_tokens).squeeze(0).squeeze(0)
442
+ audio_list.append(audio.unsqueeze(0))
443
+
444
+ all_audio = torch.cat(tensors=audio_list, dim=1)
445
+
446
+ return all_audio
447
+
448
+ else:
449
+ # random speaker
450
+ text = clean_text(text=text.strip())
451
+ audio_tensor = self.generate(
452
+ text=text,
453
+ speaker="[S1]",
454
+ context=[],
455
+ max_audio_length_ms=30_000,
456
+ temperature=temperature,
457
+ topk=topk,
458
+ )
459
+ return audio_tensor.unsqueeze(0)
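
A minimal end-to-end sketch of the dialogue path defined above; the pretrained directory and prompt files are placeholders for whatever the Space downloads and uploads at runtime.

import torchaudio
from fireredtts2.fireredtts2 import FireRedTTS2

tts = FireRedTTS2(
    pretrained_dir="pretrained_models/FireRedTTS2",  # placeholder path
    gen_type="dialogue",
    device="cuda",
)
audio = tts.generate_dialogue(
    text_list=["[S1]Hello, how was your day?", "[S2]Pretty good, thanks for asking."],
    prompt_wav_list=["spk1_prompt.wav", "spk2_prompt.wav"],  # placeholder prompts
    prompt_text_list=[
        "[S1] transcript of the speaker 1 prompt.",
        "[S2] transcript of the speaker 2 prompt.",
    ],
    temperature=0.9,
    topk=20,
)
torchaudio.save("dialogue.wav", audio, 24000)  # decoded audio is 24 kHz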
fireredtts2/llm/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from fireredtts2.llm.utils import load_llm_model, load_custom_tokenizer
fireredtts2/llm/llm.py ADDED
@@ -0,0 +1,371 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from dataclasses import dataclass
5
+ from huggingface_hub import PyTorchModelHubMixin
6
+ from fireredtts2.llm.modules import FLAVORS
7
+
8
+
9
+ def _prepare_transformer(model):
10
+ embed_dim = model.tok_embeddings.embedding_dim
11
+ model.tok_embeddings = nn.Identity()
12
+ model.output = nn.Identity()
13
+ return model, embed_dim
14
+
15
+
16
+ def _create_causal_mask(seq_len: int, device: torch.device):
17
+ return torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=device))
18
+
19
+
20
+ def _index_causal_mask(mask: torch.Tensor, input_pos: torch.Tensor):
21
+ """
22
+ Args:
23
+ mask: (max_seq_len, max_seq_len)
24
+ input_pos: (batch_size, seq_len)
25
+
26
+ Returns:
27
+ (batch_size, seq_len, max_seq_len)
28
+ """
29
+ r = mask[input_pos, :]
30
+ return r
31
+
32
+
33
+ # Does multinomial sampling without a cuda synchronization
34
+ def _multinomial_sample_one_no_sync(probs):
35
+ q = torch.empty_like(probs).exponential_(1)
36
+ return torch.argmax(probs / q, dim=-1, keepdim=True).to(dtype=torch.int)
37
+
38
+
39
+ def sample_topk(logits: torch.Tensor, topk: int, temperature: float):
40
+ logits = logits / temperature
41
+
42
+ filter_value: float = -float("Inf")
43
+ indices_to_remove = logits < torch.topk(logits, topk)[0][..., -1, None]
44
+ scores_processed = logits.masked_fill(indices_to_remove, filter_value)
45
+ scores_processed = torch.nn.functional.log_softmax(scores_processed, dim=-1)
46
+ probs = torch.nn.functional.softmax(scores_processed, dim=-1)
47
+
48
+ sample_token = _multinomial_sample_one_no_sync(probs)
49
+ return sample_token
50
+
51
+
52
+ def sample_top_nsigma(logits: torch.Tensor, n: float, temperature: float):
53
+ """_summary_
54
+
55
+ Args:
56
+ logits (torch.Tensor): _description_
57
+ n (float): _description_
58
+ temperature (float): _description_
59
+
60
+ Returns:
61
+ _type_: _description_
62
+ """
63
+ logits = logits / temperature
64
+ threshold = logits.max(dim=-1, keepdim=True).values - n * logits.std(
65
+ dim=-1, keepdim=True
66
+ )
67
+ logits[logits < threshold] = float("-inf")
68
+ # scores_processed = torch.nn.functional.log_softmax(logits, dim=-1)
69
+ probs = torch.nn.functional.softmax(logits, dim=-1)
70
+
71
+ sample_token = _multinomial_sample_one_no_sync(probs)
72
+ return sample_token
73
+
74
+
75
+ @dataclass
76
+ class ModelArgs:
77
+ backbone_flavor: str
78
+ decoder_flavor: str
79
+ text_vocab_size: int
80
+ audio_vocab_size: int
81
+ audio_num_codebooks: int
82
+ decoder_loss_weight: float
83
+ use_text_loss: bool
84
+
85
+
86
+ class Model(nn.Module, PyTorchModelHubMixin):
87
+ def __init__(self, config: ModelArgs):
88
+ super().__init__()
89
+ self.config = config
90
+
91
+ self.backbone, backbone_dim = _prepare_transformer(
92
+ FLAVORS[config.backbone_flavor]()
93
+ )
94
+ self.decoder, decoder_dim = _prepare_transformer(
95
+ FLAVORS[config.decoder_flavor]()
96
+ )
97
+
98
+ self.text_embeddings = nn.Embedding(config.text_vocab_size, backbone_dim)
99
+ self.audio_embeddings = nn.Embedding(
100
+ config.audio_vocab_size * config.audio_num_codebooks, backbone_dim
101
+ )
102
+
103
+ self.projection = nn.Linear(backbone_dim, decoder_dim, bias=False)
104
+ self.text_head = nn.Linear(backbone_dim, config.text_vocab_size, bias=False)
105
+ self.codebook0_head = nn.Linear(
106
+ backbone_dim, config.audio_vocab_size, bias=False
107
+ )
108
+ self.audio_head = nn.Parameter(
109
+ torch.empty(
110
+ config.audio_num_codebooks - 1, decoder_dim, config.audio_vocab_size
111
+ )
112
+ )
113
+
114
+ self.decoder_loss_weight = config.decoder_loss_weight
115
+ self.use_text_loss = config.use_text_loss
116
+
117
+ # debug
118
+ # print("---backbone_dim:", backbone_dim)
119
+ # print("---decoder_dim:", decoder_dim)
120
+ # print("---self.decoder_loss_weight:", self.decoder_loss_weight)
121
+ # print("---self.use_text_loss:", self.use_text_loss)
122
+
123
+ def setup_caches(self, max_batch_size: int) -> torch.Tensor:
124
+ """Setup KV caches and return a causal mask."""
125
+ dtype = next(self.parameters()).dtype
126
+ device = next(self.parameters()).device
127
+
128
+ with device:
129
+ self.backbone.setup_caches(max_batch_size, dtype)
130
+ self.decoder.setup_caches(
131
+ max_batch_size,
132
+ dtype,
133
+ decoder_max_seq_len=self.config.audio_num_codebooks,
134
+ )
135
+
136
+ self.register_buffer(
137
+ "backbone_causal_mask",
138
+ _create_causal_mask(self.backbone.max_seq_len, device),
139
+ )
140
+ self.register_buffer(
141
+ "decoder_causal_mask",
142
+ _create_causal_mask(self.config.audio_num_codebooks, device),
143
+ )
144
+
145
+ def forward(self, tokens: torch.Tensor, tokens_mask: torch.Tensor):
146
+ """
147
+ Forward pass of the Sesame CSM-style model; computes the combined training loss.
148
+ The backbone predicts text and the zeroth audio codebook; the decoder predicts the remaining codebooks.
149
+
150
+ Args:
151
+ tokens: (batch_size, seq_len, n_codebooks+1)
152
+ tokens_mask: (batch_size, seq_len, n_codebooks+1)
153
+ """
154
+
155
+ dtype = next(self.parameters()).dtype
156
+ bsz, seq_len, _ = tokens.size()
157
+ device = tokens.device
158
+
159
+ # print("---tokens:\n", tokens, tokens.shape)
160
+ # print("---tokens_mask:\n", tokens_mask, tokens_mask.shape)
161
+ # print("---bsz:", bsz)
162
+ # print("---seq_len:", seq_len)
163
+
164
+ # embed tokens
165
+ embeds = self._embed_tokens(tokens) # (bsz,seq_len,33,2048)
166
+ # print("---embeds:\n", embeds, embeds.shape)
167
+
168
+ # get targets and codebook embeddings corresponding to audio tokens
169
+ audio_mask = tokens_mask[:, :, 0] # [bsz, seq_len]
170
+ target_tokens = tokens[audio_mask][:, :-1] # [audio_len, n_codebooks]
171
+ # [audio_len, n_codebooks, embed_dim]
172
+ c_embeds = embeds[:, :, :-1, :][audio_mask]
173
+ # print("---audio_mask:\n", audio_mask, audio_mask.shape)
174
+ # print("---target_tokens:\n", target_tokens, target_tokens.shape)
175
+
176
+ # get targets corresponding to text tokens
177
+ text_mask = tokens_mask[:, :, -1]
178
+ text_target_mask = torch.roll(input=text_mask, shifts=1, dims=1)
179
+ text_target_tokens = tokens[text_target_mask][:, -1]
180
+
181
+ # print("---text_target_mask:\n", text_target_mask, text_target_mask.shape)
182
+ # print("---target_text_tokens:\n", text_target_tokens, text_target_tokens.shape)
183
+
184
+ # print("\n\n")
185
+
186
+ # retain just non-padding embeddings
187
+ masked_embeds = embeds * tokens_mask.unsqueeze(-1)
188
+ h = masked_embeds.sum(dim=2)
189
+
190
+ # backbone forward pass
191
+ # [bsz, seq_len]
192
+ padding_mask = tokens_mask[:, :, 0] | tokens_mask[:, :, -1]
193
+ # [seq_len, seq_len]
194
+ backbone_attn_mask = _create_causal_mask(seq_len, device)
195
+ # [bsz, seq_len, seq_len]
196
+ padding_3d = padding_mask.unsqueeze(-1) * padding_mask.unsqueeze(1)
197
+ backbone_attn_mask = backbone_attn_mask.unsqueeze(0) * padding_3d
198
+ backbone_attn_mask = backbone_attn_mask | torch.eye(
199
+ seq_len, device=device
200
+ ).bool().unsqueeze(0).expand(bsz, -1, -1)
201
+ input_pos = (
202
+ torch.arange(0, seq_len).unsqueeze(0).expand(bsz, seq_len).long().to(device)
203
+ )
204
+ h = self.backbone(h, input_pos=input_pos, mask=backbone_attn_mask).to(
205
+ dtype=dtype
206
+ )
207
+ # print("---h:\n", h, h.shape)
208
+
209
+ # get backbone embeddings used for audio codebook prediction predict first codebook and compute loss
210
+ audio_mask = torch.roll(audio_mask, -1, 1) # shift audio mask left by 1 so each hidden state predicts the next frame
211
+ audio_h = h[audio_mask] # [audio_len, embed_dim]
212
+ # print("---audio_mask after shift:\n", audio_mask, audio_mask.shape)
213
+ c0_logits = self.codebook0_head(audio_h) # [audio_len, audio_vocab_size]
214
+ c0_target = target_tokens[:, 0] # [audio_len]
215
+ c0_loss = F.cross_entropy(c0_logits, c0_target)
216
+
217
+ # predict text loss
218
+ text_h = h[text_mask]
219
+ text_logits = self.text_head(text_h)
220
+ text_loss = F.cross_entropy(text_logits, text_target_tokens, ignore_index=0)
221
+ # print("---text_h:\n", text_h, text_h.shape)
222
+ # print("---text_logits:\n", text_logits)
223
+ # print("---text_loss:", text_loss)
224
+
225
+ # "compute amortization" (train decoder on random 1/16 subset of audio tokens)
226
+ # important change to 1/8
227
+ # indices = torch.randperm(c_embeds.size(0))[: c_embeds.size(0) // 16]
228
+ indices = torch.randperm(c_embeds.size(0))[: c_embeds.size(0) // 8]
229
+ # [audio_len//16, n_codebooks-1, embed_dim]
230
+ c_embeds = c_embeds[indices][:, :-1, :]
231
+ audio_h = audio_h[indices] # [audio_len//16, embed_dim]
232
+ target_tokens = target_tokens[indices][:, 1:] # [audio_len//16, n_codebooks-1]
233
+
234
+ # concatenate backbone embeddings and codebook embeddings for decoder input
235
+ # [audio_len//16, n_codebooks, embed_dim]
236
+ decoder_embeds = torch.cat([audio_h.unsqueeze(1), c_embeds], dim=1)
237
+ N, n_codebooks, _ = decoder_embeds.size()
238
+ c_pos = (
239
+ torch.arange(0, n_codebooks)
240
+ .unsqueeze(0)
241
+ .expand(N, n_codebooks)
242
+ .long()
243
+ .to(device)
244
+ )
245
+
246
+ decoder_causal_mask = _create_causal_mask(
247
+ decoder_embeds.size(1), device
248
+ ).expand(N, -1, -1)
249
+ decoder_h = self.decoder(
250
+ self.projection(decoder_embeds), input_pos=c_pos, mask=decoder_causal_mask
251
+ ).to(dtype=dtype)
252
+ c_logits = torch.einsum("bsd,sdv->bsv", decoder_h[:, 1:, :], self.audio_head)
253
+
254
+ c_loss = F.cross_entropy(
255
+ c_logits.reshape(-1, c_logits.size(-1)), target_tokens.reshape(-1)
256
+ )
257
+
258
+ if self.use_text_loss:
259
+ loss = (
260
+ 2
261
+ * (
262
+ (1 - self.decoder_loss_weight) * c0_loss
263
+ + self.decoder_loss_weight * c_loss
264
+ )
265
+ + 0.01 * text_loss
266
+ )
267
+ else:
268
+ loss = 2 * (
269
+ (1 - self.decoder_loss_weight) * c0_loss
270
+ + self.decoder_loss_weight * c_loss
271
+ )
272
+ return loss, text_loss, c0_loss, c_loss
273
+
274
+ def generate_frame(
275
+ self,
276
+ tokens: torch.Tensor,
277
+ tokens_mask: torch.Tensor,
278
+ input_pos: torch.Tensor,
279
+ temperature: float,
280
+ topk: int,
281
+ ) -> torch.Tensor:
282
+ """
283
+ Args:
284
+ tokens: (batch_size, seq_len, audio_num_codebooks+1)
285
+ tokens_mask: (batch_size, seq_len, audio_num_codebooks+1)
286
+ input_pos: (batch_size, seq_len) positions for each token
287
+ mask: (batch_size, seq_len, max_seq_len
288
+
289
+ Returns:
290
+ (batch_size, audio_num_codebooks) sampled tokens
291
+ """
292
+ dtype = next(self.parameters()).dtype
293
+ b, s, _ = tokens.size()
294
+
295
+ assert self.backbone.caches_are_enabled(), "backbone caches are not enabled"
296
+ curr_backbone_mask = _index_causal_mask(self.backbone_causal_mask, input_pos)
297
+ embeds = self._embed_tokens(tokens)
298
+ masked_embeds = embeds * tokens_mask.unsqueeze(-1)
299
+ h = masked_embeds.sum(dim=2)
300
+ h = self.backbone(h, input_pos=input_pos, mask=curr_backbone_mask).to(
301
+ dtype=dtype
302
+ )
303
+
304
+ last_h = h[:, -1, :]
305
+ c0_logits = self.codebook0_head(last_h)
306
+ c0_sample = sample_topk(c0_logits, topk, temperature)
307
+ c0_embed = self._embed_audio(0, c0_sample)
308
+ curr_h = torch.cat([last_h.unsqueeze(1), c0_embed], dim=1)
309
+ curr_sample = c0_sample.clone()
310
+ curr_pos = (
311
+ torch.arange(0, curr_h.size(1), device=curr_h.device)
312
+ .unsqueeze(0)
313
+ .repeat(curr_h.size(0), 1)
314
+ )
315
+
316
+ # Decoder caches must be reset every frame.
317
+ self.decoder.reset_caches()
318
+ for i in range(1, self.config.audio_num_codebooks):
319
+ curr_decoder_mask = _index_causal_mask(self.decoder_causal_mask, curr_pos)
320
+ decoder_h = self.decoder(
321
+ self.projection(curr_h), input_pos=curr_pos, mask=curr_decoder_mask
322
+ ).to(dtype=dtype)
323
+ ci_logits = torch.mm(decoder_h[:, -1, :], self.audio_head[i - 1])
324
+ ci_sample = sample_topk(ci_logits, 10, 0.75) # topk/temperature hard-coded to 10 and 0.75 for the remaining codebooks
325
+ ci_embed = self._embed_audio(i, ci_sample)
326
+ curr_h = ci_embed
327
+ curr_sample = torch.cat([curr_sample, ci_sample], dim=1)
328
+ curr_pos = curr_pos[:, -1:] + 1
329
+
330
+ return curr_sample
331
+
332
+ def reset_caches(self):
333
+ self.backbone.reset_caches()
334
+ self.decoder.reset_caches()
335
+
336
+ def _embed_audio(self, codebook: int, tokens: torch.Tensor) -> torch.Tensor:
337
+ return self.audio_embeddings(tokens + codebook * self.config.audio_vocab_size)
338
+
339
+ def _embed_tokens(self, tokens: torch.Tensor) -> torch.Tensor:
340
+ text_embeds = self.text_embeddings(tokens[:, :, -1]).unsqueeze(-2)
341
+
342
+ audio_tokens = tokens[:, :, :-1] + (
343
+ self.config.audio_vocab_size
344
+ * torch.arange(self.config.audio_num_codebooks, device=tokens.device)
345
+ )
346
+ audio_embeds = self.audio_embeddings(audio_tokens.view(-1)).reshape(
347
+ tokens.size(0), tokens.size(1), self.config.audio_num_codebooks, -1
348
+ )
349
+
350
+ return torch.cat([audio_embeds, text_embeds], dim=-2)
351
+
352
+
353
+ if __name__ == "__main__":
354
+
355
+ MIMI_SAMPLE_RATE = 24000
356
+ BACKBONE_FLAVOR = "qwen-3b"
357
+ DECODER_FLAVOR = "qwen-500m"
358
+ TEXT_VOCAB_SIZE = 128256
359
+ AUDIO_VOCAB_SIZE = 2051
360
+ AUDIO_NUM_CODEBOOKS = 32
361
+
362
+ config = ModelArgs(
363
+ backbone_flavor=BACKBONE_FLAVOR,
364
+ decoder_flavor=DECODER_FLAVOR,
365
+ text_vocab_size=TEXT_VOCAB_SIZE,
366
+ audio_vocab_size=AUDIO_VOCAB_SIZE,
367
+ audio_num_codebooks=AUDIO_NUM_CODEBOOKS,
368
+ decoder_loss_weight=0.5,
369
+ use_text_loss=True,
370
+ )
371
+ model = Model(config)
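
A quick shape check for the two samplers defined above, on dummy logits sized to the audio vocabulary from the __main__ config; both return integer indices of shape (batch, 1).

import torch
from fireredtts2.llm.llm import sample_topk, sample_top_nsigma

logits = torch.randn(2, 2051)              # (batch, audio_vocab_size)
tok_topk = sample_topk(logits, topk=20, temperature=0.9)
tok_nsigma = sample_top_nsigma(logits, n=1.0, temperature=0.9)
print(tok_topk.shape, tok_nsigma.shape)    # torch.Size([2, 1]) torch.Size([2, 1])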
fireredtts2/llm/modules.py ADDED
@@ -0,0 +1,90 @@
1
+ from torchtune.models.qwen2 import qwen2
2
+ from torchtune.modules.transformer import TransformerDecoder
3
+
4
+
5
+ def qwen2_200M() -> TransformerDecoder:
6
+ return qwen2(
7
+ vocab_size=151936,
8
+ num_layers=4,
9
+ num_heads=12,
10
+ num_kv_heads=2,
11
+ embed_dim=1536,
12
+ intermediate_dim=8960,
13
+ max_seq_len=4096,
14
+ attn_dropout=0.0,
15
+ norm_eps=1e-6,
16
+ rope_base=1000000.0,
17
+ tie_word_embeddings=True,
18
+ )
19
+
20
+
21
+ def qwen2_500M() -> TransformerDecoder:
22
+ return qwen2(
23
+ vocab_size=151936,
24
+ num_layers=24,
25
+ num_heads=14,
26
+ num_kv_heads=2,
27
+ embed_dim=896,
28
+ intermediate_dim=4864,
29
+ max_seq_len=4096,
30
+ attn_dropout=0.0,
31
+ norm_eps=1e-6,
32
+ rope_base=1000000.0,
33
+ tie_word_embeddings=True,
34
+ )
35
+
36
+
37
+ def qwen2_1_5B() -> TransformerDecoder:
38
+ return qwen2(
39
+ vocab_size=151936,
40
+ num_layers=28,
41
+ num_heads=12,
42
+ num_kv_heads=2,
43
+ embed_dim=1536,
44
+ intermediate_dim=8960,
45
+ max_seq_len=4096,
46
+ attn_dropout=0.0,
47
+ norm_eps=1e-6,
48
+ rope_base=1000000.0,
49
+ tie_word_embeddings=True,
50
+ )
51
+
52
+
53
+ def qwen2_3B() -> TransformerDecoder:
54
+ return qwen2(
55
+ vocab_size=151936,
56
+ num_layers=36,
57
+ num_heads=16,
58
+ num_kv_heads=2,
59
+ embed_dim=2048,
60
+ intermediate_dim=11008,
61
+ max_seq_len=4096,
62
+ attn_dropout=0.0,
63
+ norm_eps=1e-6,
64
+ rope_base=1000000.0,
65
+ tie_word_embeddings=True,
66
+ )
67
+
68
+
69
+ def qwen2_7B() -> TransformerDecoder:
70
+ return qwen2(
71
+ vocab_size=152064,
72
+ num_layers=28,
73
+ num_heads=28,
74
+ num_kv_heads=4,
75
+ embed_dim=3584,
76
+ intermediate_dim=18944,
77
+ max_seq_len=4096,
78
+ attn_dropout=0.0,
79
+ norm_eps=1e-6,
80
+ rope_base=1000000.0,
81
+ )
82
+
83
+
84
+ FLAVORS = {
85
+ "qwen-200m": qwen2_200M,
86
+ "qwen-500m": qwen2_500M,
87
+ "qwen-1.5b": qwen2_1_5B,
88
+ "qwen-3b": qwen2_3B,
89
+ "qwen-7b": qwen2_7B,
90
+ }
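
The FLAVORS registry above is what llm.py's _prepare_transformer consumes to build the backbone and decoder; a quick smoke test of the smallest flavor (requires torchtune):

from fireredtts2.llm.modules import FLAVORS

backbone = FLAVORS["qwen-200m"]()   # a torchtune TransformerDecoder
n_params = sum(p.numel() for p in backbone.parameters())
print(f"qwen-200m backbone: {n_params / 1e6:.1f}M parameters")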
fireredtts2/llm/utils.py ADDED
@@ -0,0 +1,303 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torch.nn as nn
5
+ from pathlib import Path
6
+ from dataclasses import dataclass
7
+ from typing import Union
8
+ from torch.optim.lr_scheduler import LambdaLR
9
+ from transformers import AutoTokenizer
10
+ from fireredtts2.llm.llm import Model, ModelArgs
11
+
12
+
13
+ @dataclass
14
+ class Segment:
15
+ speaker: str
16
+ text: str
17
+ audio: torch.Tensor
18
+
19
+
20
+ class WarmupDecayLR(LambdaLR):
21
+ """
22
+ Learning rate scheduler with a linear warmup and specificable decay.
23
+ """
24
+
25
+ def __init__(
26
+ self, optimizer, warmup_steps: int, total_steps: int, decay_type: str = "linear"
27
+ ):
28
+ self.warmup_steps = warmup_steps
29
+ self.total_steps = total_steps
30
+ self.decay_type = decay_type
31
+ super().__init__(optimizer, self.lr_lambda, last_epoch=-1)
32
+
33
+ def lr_lambda(self, step: int) -> float:
34
+ if step < self.warmup_steps:
35
+ return step / self.warmup_steps
36
+ else:
37
+ if self.decay_type == "linear":
38
+ return (self.total_steps - step) / (
39
+ self.total_steps - self.warmup_steps
40
+ )
41
+ elif self.decay_type == "constant":
42
+ return 1.0
43
+ elif self.decay_type == "exponential":
44
+ return 0.1 ** (
45
+ (step - self.warmup_steps) / (self.total_steps - self.warmup_steps)
46
+ )
47
+ elif self.decay_type == "cosine":
48
+ return 0.5 * (
49
+ 1
50
+ + torch.cos(
51
+ torch.pi
52
+ * torch.tensor(
53
+ (step - self.warmup_steps)
54
+ / (self.total_steps - self.warmup_steps)
55
+ )
56
+ )
57
+ )
58
+ else:
59
+ raise ValueError(f"Invalid decay type: {self.decay_type}")
+
+
+ # Special tokens added on top of the base Qwen2 tokenizer: speaker tags,
+ # emotion tags, and paralinguistic (non-verbal) event tags.
+ additional_special_tokens = [
+     # Text span markers
+     "<|text_start|>",
+     "<|text_end|>",
+     # Generic speaker tags [S1] ... [S40]
+     "[S1]", "[S2]", "[S3]", "[S4]", "[S5]", "[S6]", "[S7]", "[S8]", "[S9]", "[S10]",
+     "[S11]", "[S12]", "[S13]", "[S14]", "[S15]", "[S16]", "[S17]", "[S18]", "[S19]", "[S20]",
+     "[S21]", "[S22]", "[S23]", "[S24]", "[S25]", "[S26]", "[S27]", "[S28]", "[S29]", "[S30]",
+     "[S31]", "[S32]", "[S33]", "[S34]", "[S35]", "[S36]", "[S37]", "[S38]", "[S39]", "[S40]",
+     # Podcast and dialogue speaker tags
+     "[S_PODCAST_1]", "[S_PODCAST_2]", "[S_PODCAST_3]", "[S_PODCAST_4]", "[S_PODCAST_5]",
+     "[S_PODCAST_6]", "[S_PODCAST_7]", "[S_PODCAST_8]", "[S_PODCAST_9]", "[S_PODCAST_10]",
+     "[S_DIALOG_1]", "[S_DIALOG_2]", "[S_DIALOG_3]", "[S_DIALOG_4]", "[S_DIALOG_5]",
+     "[S_DIALOG_6]", "[S_DIALOG_7]", "[S_DIALOG_8]", "[S_DIALOG_9]", "[S_DIALOG_10]",
+     # Emotion tags
+     "<|emotion_neutral|>", "<|emotion_happy|>", "<|emotion_sad|>", "<|emotion_concern|>",
+     "<|emotion_confuse|>", "<|emotion_angry|>", "<|emotion_surprise|>", "<|emotion_disgust|>",
+     "<|emotion_nervous|>", "<|emotion_apology|>", "<|emotion_understand|>", "<|emotion_fear|>",
+     "<|emotion_comfort|>", "<|emotion_shy|>", "<|emotion_serious|>",
+     "<|emotion_extra1|>", "<|emotion_extra2|>", "<|emotion_extra3|>", "<|emotion_extra4|>",
+     "<|emotion_extra5|>", "<|emotion_extra6|>", "<|emotion_extra7|>", "<|emotion_extra8|>",
+     "<|emotion_extra9|>", "<|emotion_extra10|>",
+     # Paralinguistic event tags
+     "<|breath|>", "<|humph|>", "<|laugh_heng|>", "<|hissing|>", "<|sniff|>", "<|laugh_he|>",
+     "<|sigh|>", "<|laugh|>", "<|laugh_ha|>", "<|quick_breath|>", "<|laugh_hei|>",
+     "<|laugh_speak|>", "<|/laugh_speak|>", "<|cry|>", "<|choking|>", "<|cry_speak|>",
+     "<|/cry_speak|>", "<|slurp|>", "<|clucking|>", "<|yawning|>", "<|cough|>", "<|smack|>",
+     "<|hem|>", "<|stretch|>", "<|sneeze|>",
+     # Reserved paralinguistic extras (there is no <|paralinguistic_extra9|>)
+     "<|paralinguistic_extra1|>", "<|paralinguistic_extra2|>", "<|paralinguistic_extra3|>",
+     "<|paralinguistic_extra4|>", "<|paralinguistic_extra5|>", "<|paralinguistic_extra6|>",
+     "<|paralinguistic_extra7|>", "<|paralinguistic_extra8|>", "<|paralinguistic_extra10|>",
+     "<|paralinguistic_extra11|>", "<|paralinguistic_extra12|>", "<|paralinguistic_extra13|>",
+ ]
+
+
+ def load_custom_tokenizer(qwen2_tokenizer_path: str):
+     tok = AutoTokenizer.from_pretrained(qwen2_tokenizer_path)
+     special_tokens_dict = {
+         "additional_special_tokens": additional_special_tokens,
+     }
+     tok.add_special_tokens(special_tokens_dict)
+     return tok
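+
+ # Illustrative use of the extended tokenizer (the path and text below are
+ # placeholders): because the tags above are registered as special tokens,
+ # speaker / emotion / paralinguistic markers stay atomic after tokenization.
+ #
+ #     tok = load_custom_tokenizer("Qwen/Qwen2.5-1.5B")   # any Qwen2-style tokenizer dir or repo id
+ #     ids = tok("[S1]<|laugh|>Hello there[S2]Hi!").input_ids
+ #     # "[S1]", "<|laugh|>" and "[S2]" each map to a single token id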
+
+
+ def init_weights(model: nn.Module):
+     """
+     Initialize the weights of the model.
+     - Xavier uniform initialization for linear layers
+     - Normal initialization for embeddings
+     - Xavier uniform initialization for plain parameters (e.g. audio_head)
+     """
+
+     def _init_weights(m):
+         if isinstance(m, nn.Linear):
+             nn.init.xavier_uniform_(m.weight)
+             if m.bias is not None:
+                 nn.init.zeros_(m.bias)
+         elif isinstance(m, nn.Embedding):
+             nn.init.normal_(m.weight, mean=0.0, std=0.02)
+         elif isinstance(m, nn.Parameter):
+             nn.init.xavier_uniform_(m.data)
+
+     model.apply(_init_weights)
+
+     # Special handling for audio_head: it is an nn.Parameter rather than a
+     # submodule, so model.apply() above does not reach it.
+     nn.init.xavier_uniform_(model.audio_head)
+
+     return model
+
+
+ def load_llm_model(
+     configs,
+     checkpoint_path: Union[str, Path] = None,
+     device: Union[str, torch.device] = "cuda",
+ ) -> Model:
+     """Build the LLM, optionally restore a checkpoint, and move it to the target device.
+
+     Args:
+         configs: Configuration dict whose "llm_models" section holds the model hyperparameters.
+         checkpoint_path: Optional checkpoint containing a "model" state dict; if absent,
+             the model is randomly initialized via init_weights.
+         device: Device to move the model to.
+     """
+
+     model_arg = ModelArgs(
+         backbone_flavor=configs["llm_models"]["backbone_flavor"],
+         decoder_flavor=configs["llm_models"]["decoder_flavor"],
+         text_vocab_size=configs["llm_models"]["text_vocab_size"],
+         audio_vocab_size=configs["llm_models"]["audio_vocab_size"],
+         audio_num_codebooks=configs["llm_models"]["audio_num_codebooks"],
+         decoder_loss_weight=configs["llm_models"]["decoder_loss_weight"],
+         use_text_loss=True,
+     )
+     model = Model(model_arg)
+
+     if checkpoint_path and os.path.exists(checkpoint_path):
+         state_dict = torch.load(
+             checkpoint_path, map_location="cpu", weights_only=False
+         )["model"]
+         model.load_state_dict(state_dict)
+     else:
+         model = init_weights(model)
+
+     model = model.to(device=device)
+     return model
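+
+ # Sketch of how this loader is typically driven (the key names follow the config
+ # accesses above; the values and checkpoint path are placeholders, not the
+ # shipped configuration):
+ #
+ #     configs = {
+ #         "llm_models": {
+ #             "backbone_flavor": ...,
+ #             "decoder_flavor": ...,
+ #             "text_vocab_size": ...,
+ #             "audio_vocab_size": ...,
+ #             "audio_num_codebooks": ...,
+ #             "decoder_loss_weight": ...,
+ #         }
+ #     }
+ #     llm = load_llm_model(configs, checkpoint_path="<path to checkpoint>.pt", device="cuda")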
+
+
+ def summarize(
+     writer,
+     global_step,
+     scalars={},
+     histograms={},
+     images={},
+     audios={},
+     audio_sampling_rate=22050,
+ ):
+     """Log scalars, histograms, images, and audio clips to a TensorBoard-style summary writer."""
+     for k, v in scalars.items():
+         writer.add_scalar(k, v, global_step)
+     for k, v in histograms.items():
+         writer.add_histogram(k, v, global_step)
+     for k, v in images.items():
+         writer.add_image(k, v, global_step, dataformats="HWC")
+     for k, v in audios.items():
+         writer.add_audio(k, v, global_step, audio_sampling_rate)
+
+
+ def get_grad_norm(model):
+     """Overall L2 norm of the gradients, averaged over the parameters that have one."""
+     total_norm = 0
+     num = 0
+     for name, p in model.named_parameters():
+         try:
+             param_norm = p.grad.data.norm(2)
+             total_norm += param_norm.item() ** 2
+             num += 1
+         except AttributeError:
+             # p.grad is None for parameters that did not receive a gradient
+             print(name)
+     total_norm = total_norm ** (1.0 / 2)
+     total_norm = total_norm / num
+     return total_norm
+
+
+ def read_jsonl(path):
+     """Read a JSONL file into a list of parsed records."""
+     path = os.path.expanduser(path)
+     with open(path, "r") as f:
+         json_str = f.read()
+     data_list = []
+     for line in json_str.splitlines():
+         data = json.loads(line)
+         data_list.append(data)
+     return data_list
fireredtts2/utils/spliter.py ADDED
@@ -0,0 +1,289 @@
+ import re
+ import string
+
+ # Punctuation and symbols normalized away before splitting: most Chinese
+ # punctuation is mapped to an ASCII equivalent; decorative symbols are dropped.
+ SYMBOLS_MAPPING = {
+     "\n": "",
+     "\t": "",
+     "…": ",",
+     "“": "'",
+     "”": "'",
+     "‘": "'",
+     "’": "'",
+     "【": "",
+     "】": "",
+     "[": "",
+     "]": "",
+     "(": "",
+     ")": "",
+     "(": "",
+     ")": "",
+     "・": "",
+     "·": "",
+     "「": "'",
+     "」": "'",
+     "《": "'",
+     "》": "'",
+     "—": "",
+     "~": ",",
+     "~": ",",
+     ":": ",",
+     ";": ",",
+     ";": ",",
+     ":": ",",
+     '"': "",
+     "!": ",",
+     # "!": ".",
+     "————": "",
+     "——": "",
+     "—": "",
+     "……": ",",
+     "*": "",
+ }
+
+ REPLACE_SYMBOL_REGEX = re.compile(
+     "|".join(re.escape(p) for p in SYMBOLS_MAPPING.keys())
+ )
+
+
+ EMOJI_REGEX = re.compile(
+     "["
+     "\U0001f600-\U0001f64f"  # emoticons
+     "\U0001f300-\U0001f5ff"  # symbols & pictographs
+     "\U0001f680-\U0001f6ff"  # transport & map symbols
+     "\U0001f1e0-\U0001f1ff"  # flags (iOS)
+     "]+",
+     flags=re.UNICODE,
+ )
+
+
+ def clean_text(text):
+     # Clean the text
+     text = text.strip()
+     text = text.replace("\xa0", "")
+
+     # Replace Chinese symbols with their English counterparts
+     text = REPLACE_SYMBOL_REGEX.sub(lambda x: SYMBOLS_MAPPING[x.group()], text)
+
+     # Remove emojis
+     text = EMOJI_REGEX.sub(r"", text)
+
+     # Collapse runs of periods (...) and commas (,,,) into a single mark
+     text = re.sub(r"[.,]{2,}", lambda m: m.group()[0], text)
+
+     return text
+
+
+ def utf_8_len(text):
+     return len(text.encode("utf-8"))
+
+
+ def break_text(texts, length, splits: set):
+     # Cut each over-long text after any character in `splits`
+     for text in texts:
+         if utf_8_len(text) <= length:
+             yield text
+             continue
+
+         curr = ""
+         for char in text:
+             curr += char
+
+             if char in splits:
+                 yield curr
+                 curr = ""
+
+         if curr:
+             yield curr
+
+
+ def break_text_by_length(texts, length):
+     # Hard cut each over-long text every `length` UTF-8 bytes
+     for text in texts:
+         if utf_8_len(text) <= length:
+             yield text
+             continue
+
+         curr = ""
+         for char in text:
+             curr += char
+
+             if utf_8_len(curr) >= length:
+                 yield curr
+                 curr = ""
+
+         if curr:
+             yield curr
+
+
+ def add_cleaned(curr, segments):
+     curr = curr.strip()
+     if curr and not all(c.isspace() or c in string.punctuation for c in curr):
+         segments.append(curr)
+
+
+ def protect_float(text):
+     # Turns 3.14 into <3_f_14> to prevent splitting
+     return re.sub(r"(\d+)\.(\d+)", r"<\1_f_\2>", text)
+
+
+ def unprotect_float(text):
+     # Turns <3_f_14> back into 3.14
+     return re.sub(r"<(\d+)_f_(\d+)>", r"\1.\2", text)
+
+
+ def split_text(text, length):
+     text = clean_text(text)
+
+     # Break the text into pieces with the following rules:
+     # 1. Split at ".", "!", "?" if the text is NOT a float
+     # 2. If a piece is longer than `length`, split at ","
+     # 3. If a piece is still longer than `length`, split at " "
+     # 4. If a piece is still longer than `length`, split at any character
+
+     texts = [text]
+     texts = map(protect_float, texts)
+     texts = break_text(texts, length, {".", "!", "?", "。", "!", "?"})
+     texts = map(unprotect_float, texts)
+     texts = break_text(texts, length, {",", ","})
+     texts = break_text(texts, length, {" "})
+     texts = list(break_text_by_length(texts, length))
+
+     # Then, merge the pieces back into segments of at most `length` bytes
+     segments = []
+     curr = ""
+
+     for text in texts:
+         if utf_8_len(curr) + utf_8_len(text) <= length:
+             curr += text
+         else:
+             add_cleaned(curr, segments)
+             curr = text
+
+     if curr:
+         add_cleaned(curr, segments)
+
+     return segments
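+
+ # Illustrative behavior of split_text (lengths are measured in UTF-8 bytes;
+ # the example input is hypothetical):
+ #
+ #     split_text("Hello world. This is a long sentence, with a comma.", 30)
+ #     # -> ["Hello world.", "This is a long sentence,", "with a comma."]
+ #
+ # Splits are attempted at sentence punctuation first, then commas, spaces,
+ # and finally at the raw byte budget.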
+
+
+ def contains_chinese(text):
+     """Return True if the text contains any Chinese character."""
+     return bool(re.search(r"[\u4e00-\u9fff]", text))
+
+
+ def count_words_english(text):
+     """Count whitespace-separated English words."""
+     return len(text.split())
+
+
+ def count_characters_chinese(text):
+     """Count characters (used as the length measure for Chinese text)."""
+     return len(text)
+
+
+ def split_by_punctuation_english(text):
+     """Split on English sentence punctuation (. ! ?), keeping the punctuation mark."""
+     sentences = re.split(r"([.!?])", text)
+     result = []
+     for i in range(0, len(sentences) - 1, 2):
+         sentence = sentences[i].strip()
+         if sentence:
+             if i + 1 < len(sentences):
+                 sentence += sentences[i + 1]
+             result.append(sentence)
+
+     if len(sentences) % 2 == 1 and sentences[-1].strip():
+         result.append(sentences[-1].strip())
+
+     return result
+
+
+ def split_by_punctuation_chinese(text):
+     """Split on Chinese sentence punctuation (。!?), keeping the punctuation mark."""
+     sentences = re.split(r"([。!?])", text)
+     result = []
+     for i in range(0, len(sentences) - 1, 2):
+         sentence = sentences[i].strip()
+         if sentence:
+             if i + 1 < len(sentences):
+                 sentence += sentences[i + 1]
+             result.append(sentence)
+
+     if len(sentences) % 2 == 1 and sentences[-1].strip():
+         result.append(sentences[-1].strip())
+
+     return result
+
+
+ def merge_sentences_english(sentences, max_words=80):
+     """Greedily merge English sentences into chunks of at most max_words words."""
+     result = []
+     current_chunk = ""
+
+     for sentence in sentences:
+         if not current_chunk:
+             current_chunk = sentence
+         else:
+             test_chunk = current_chunk + " " + sentence
+             if count_words_english(test_chunk) <= max_words:
+                 current_chunk = test_chunk
+             else:
+                 result.append(current_chunk)
+                 current_chunk = sentence
+
+     if current_chunk:
+         result.append(current_chunk)
+
+     return result
+
+
+ def merge_sentences_chinese(sentences, max_chars=100):
+     """Greedily merge Chinese sentences into chunks of at most max_chars characters."""
+     result = []
+     current_chunk = ""
+
+     for sentence in sentences:
+         if not current_chunk:
+             current_chunk = sentence
+         else:
+             test_chunk = current_chunk + sentence
+             if count_characters_chinese(test_chunk) <= max_chars:
+                 current_chunk = test_chunk
+             else:
+                 result.append(current_chunk)
+                 current_chunk = sentence
+
+     if current_chunk:
+         result.append(current_chunk)
+
+     return result
+
+
+ def process_text(text):
+     """Split a single dialogue turn into chunks short enough for synthesis."""
+     chinese_max_limit = 150
+     english_max_limit = 80
+     # Strip a leading speaker tag such as [S2]
+     text = re.sub(r"^\[S\d+\]", "", text).strip()
+     is_chinese = contains_chinese(text)
+     if is_chinese:
+         if count_characters_chinese(text) <= chinese_max_limit:
+             return [text]
+         sentences = split_by_punctuation_chinese(text)
+         result = merge_sentences_chinese(sentences, chinese_max_limit)
+     else:
+         if count_words_english(text) <= english_max_limit:
+             return [text]
+         sentences = split_by_punctuation_english(text)
+         result = merge_sentences_english(sentences, english_max_limit)
+
+     return result
+
+
+ def process_text_list(text_list):
+     """Split every turn in a dialogue, re-attaching the speaker tag to each chunk."""
+     new_text_list = []
+     for text in text_list:
+         speaker = text[:4]
+         # print("---speaker:", speaker)
+         assert speaker in ["[S1]", "[S2]", "[S3]", "[S4]"]
+         result = process_text(text=text)
+         # print("---result:\n", result, len(result))
+         for chunk in result:
+             new_text_list.append(speaker + chunk)
+     return new_text_list
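+
+ # Hypothetical round trip for process_text_list: over-long turns are split on
+ # sentence punctuation and every resulting chunk keeps its speaker tag, e.g.
+ #
+ #     process_text_list(["[S1]Short line.", "[S2]" + some_long_english_paragraph])
+ #     # -> ["[S1]Short line.", "[S2]First ~80-word chunk ...", "[S2]Next chunk ..."]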
pretrained_models/README.md ADDED
@@ -0,0 +1 @@
+ Put the pre-trained model in this folder.
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torchaudio
+ torchtune
+ torchao
+ transformers
+ einops
+ gradio