YuE-music-generator-demo

Runtime error

App Files Files Community

KingNish committed on Jan 29

Commit

c022c1a

verified ·

1 Parent(s): 9df60ba

by depseek

Browse files

Files changed (1) hide show

app.py +219 -276

app.py CHANGED Viewed

@@ -7,27 +7,25 @@ import spaces
 from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList
 import torch
 from huggingface_hub import snapshot_download
 import uuid
-import time
-from tqdm import tqdm
-from einops import rearrange
 import torchaudio
 from torchaudio.transforms import Resample
 import soundfile as sf
-from omegaconf import OmegaConf
-import numpy as np
 import re
-import sys
-from collections import Counter
-# --- Constants and Setup ---
-IS_SHARED_UI = "innova-ai/YuE-music-generator-demo" in os.environ.get('SPACE_ID', '')
-OUTPUT_DIR = "./output"
-XCODEC_MINI_INFER_DIR = "./xcodec_mini_infer"
-MODEL_ID = "m-a-p/YuE-s1-7B-anneal-en-cot"
-# Install flash-attn
 def install_flash_attn():
     try:
         print("Installing flash-attn...")
@@ -36,39 +34,56 @@ def install_flash_attn():
             "pip install flash-attn --no-build-isolation",
             env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
             shell=True,
-            check=True  # Use check=True to raise an exception on failure
         )
         print("flash-attn installed successfully!")
     except subprocess.CalledProcessError as e:
         print(f"Failed to install flash-attn: {e}")
         exit(1)
 install_flash_attn()
-# --- Utility Functions ---
-def download_xcodec_resources():
-    """Downloads xcodec inference files."""
-    if not os.path.exists(XCODEC_MINI_INFER_DIR):
-        os.makedirs(XCODEC_MINI_INFER_DIR, exist_ok=True)
-        print(f"Created folder at: {XCODEC_MINI_INFER_DIR}")
-        snapshot_download(repo_id="m-a-p/xcodec_mini_infer", local_dir=XCODEC_MINI_INFER_DIR)
-    else:
-        print(f"Folder already exists at: {XCODEC_MINI_INFER_DIR}")
-download_xcodec_resources()
-# Add xcodec paths
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
-from codecmanipulator import CodecManipulator
-from mmtokenizer import _MMSentencePieceTokenizer
-from models.soundstream_hubert_new import SoundStream
-from vocoder import build_codec_model, process_audio
-from post_process_audio import replace_low_freq_with_energy_matched
 def empty_output_folder(output_dir):
-    """Empties the output folder."""
     for file in os.listdir(output_dir):
         file_path = os.path.join(output_dir, file)
         try:
@@ -79,30 +94,24 @@ def empty_output_folder(output_dir):
         except Exception as e:
             print(f"Error deleting file {file_path}: {e}")
 def create_temp_file(content, prefix, suffix=".txt"):
-    """Creates a temporary file with content."""
     temp_file = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=prefix, suffix=suffix)
     content = content.strip() + "\n\n"
     content = content.replace("\r\n", "\n").replace("\r", "\n")
     temp_file.write(content)
     temp_file.close()
-    print(f"\nContent written to {prefix}{suffix}:\n{content}\n---")
     return temp_file.name
 def get_last_mp3_file(output_dir):
-    """Gets the most recently modified MP3 file in a directory."""
     mp3_files = [file for file in os.listdir(output_dir) if file.endswith('.mp3')]
     if not mp3_files:
         print("No .mp3 files found in the output folder.")
         return None
     mp3_files_with_path = [os.path.join(output_dir, file) for file in mp3_files]
-    mp3_files_with_path.sort(key=os.path.getmtime, reverse=True)
     return mp3_files_with_path[0]
 class BlockTokenRangeProcessor(LogitsProcessor):
     def __init__(self, start_id, end_id):
         self.blocked_token_ids = list(range(start_id, end_id))
@@ -111,9 +120,7 @@ class BlockTokenRangeProcessor(LogitsProcessor):
         scores[:, self.blocked_token_ids] = -float("inf")
         return scores
 def load_audio_mono(filepath, sampling_rate=16000):
-    """Loads an audio file and converts to mono, optionally resamples."""
     audio, sr = torchaudio.load(filepath)
     audio = torch.mean(audio, dim=0, keepdim=True)
     if sr != sampling_rate:
@@ -121,17 +128,13 @@ def load_audio_mono(filepath, sampling_rate=16000):
         audio = resampler(audio)
     return audio
 def split_lyrics(lyrics: str):
-    """Splits lyrics into segments based on bracketed headers."""
     pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
     segments = re.findall(pattern, lyrics, re.DOTALL)
     structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
     return structured_lyrics
 def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
-    """Saves an audio tensor to disk."""
     folder_path = os.path.dirname(path)
     if not os.path.exists(folder_path):
         os.makedirs(folder_path)
@@ -141,226 +144,166 @@ def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False)
     torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
-# --- Music Generation Class ---
-class MusicGenerator:
-    def __init__(self, device="cuda:0", basic_model_config=f'{XCODEC_MINI_INFER_DIR}/final_ckpt/config.yaml', resume_path=f'{XCODEC_MINI_INFER_DIR}/final_ckpt/ckpt_00360000.pth'):
-        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
-        self.mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
-        self.codectool = CodecManipulator("xcodec", 0, 1)
-        model_config = OmegaConf.load(basic_model_config)
-        self.codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(self.device)
-        parameter_dict = torch.load(resume_path, map_location='cpu')
-        self.codec_model.load_state_dict(parameter_dict['codec_model'])
-        self.codec_model.to(self.device)
-        self.codec_model.eval()
-        # load stage1 model to GPU at initial time
-        self.stage1_model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            torch_dtype=torch.float16,
-            attn_implementation="flash_attention_2",
-            ).to(self.device)
-        self.stage1_model.eval()
-    def generate(
-        self,
-        genre_txt=None,
-        lyrics_txt=None,
-        max_new_tokens=3000,
-        run_n_segments=2,
-        use_audio_prompt=False,
-        audio_prompt_path="",
-        prompt_start_time=0.0,
-        prompt_end_time=30.0,
-        output_dir=OUTPUT_DIR,
-        keep_intermediate=False,
-        disable_offload_model=False,
-        rescale=False
-    ):
-        if use_audio_prompt and not audio_prompt_path:
-            raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
-        stage1_output_dir = os.path.join(output_dir, f"stage1")
-        os.makedirs(stage1_output_dir, exist_ok=True)
-        stage1_output_set = []
-        genres = genre_txt.strip()
-        lyrics = split_lyrics(lyrics_txt + "\n")
-        full_lyrics = "\n".join(lyrics)
-        prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
-        prompt_texts += lyrics
-        random_id = uuid.uuid4()
-        output_seq = None
-        top_p = 0.93
-        temperature = 1.0
-        repetition_penalty = 1.2
-        start_of_segment = self.mmtokenizer.tokenize('[start_of_segment]')
-        end_of_segment = self.mmtokenizer.tokenize('[end_of_segment]')
-        raw_output = None
-        run_n_segments = min(run_n_segments + 1, len(lyrics))
-        print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
-        for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
-            section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
-            guidance_scale = 1.5 if i <= 1 else 1.2
-            if i == 0:
-                continue
-            if i == 1:
-                if use_audio_prompt:
-                    audio_prompt = load_audio_mono(audio_prompt_path)
-                    audio_prompt.unsqueeze_(0)
-                    with torch.no_grad():
-                        raw_codes = self.codec_model.encode(audio_prompt.to(self.device), target_bw=0.5)
-                    raw_codes = raw_codes.transpose(0, 1)
-                    raw_codes = raw_codes.cpu().numpy().astype(np.int16)
-                    code_ids = self.codectool.npy2ids(raw_codes[0])
-                    audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]
-                    audio_prompt_codec_ids = [self.mmtokenizer.soa] + self.codectool.sep_ids + audio_prompt_codec + [self.mmtokenizer.eoa]
-                    sentence_ids = self.mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + self.mmtokenizer.tokenize(
-                        "[end_of_reference]")
-                    head_id = self.mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
-                else:
-                    head_id = self.mmtokenizer.tokenize(prompt_texts[0])
-                prompt_ids = head_id + start_of_segment + self.mmtokenizer.tokenize(section_text) + [self.mmtokenizer.soa] + self.codectool.sep_ids
             else:
-                prompt_ids = end_of_segment + start_of_segment + self.mmtokenizer.tokenize(section_text) + [self.mmtokenizer.soa] + self.codectool.sep_ids
-            prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(self.device)
-            input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
-            max_context = 16384 - max_new_tokens - 1
-            if input_ids.shape[-1] > max_context:
-                print(f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
-                input_ids = input_ids[:, -(max_context):]
-            with torch.no_grad():
-                output_seq = self.stage1_model.generate(
-                    input_ids=input_ids,
-                    max_new_tokens=max_new_tokens,
-                    min_new_tokens=100,
-                    do_sample=True,
-                    top_p=top_p,
-                    temperature=temperature,
-                    repetition_penalty=repetition_penalty,
-                    eos_token_id=self.mmtokenizer.eoa,
-                    pad_token_id=self.mmtokenizer.eoa,
-                    logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
-                    guidance_scale=guidance_scale,
                 )
-                if output_seq[0][-1].item() != self.mmtokenizer.eoa:
-                    tensor_eoa = torch.as_tensor([[self.mmtokenizer.eoa]]).to(self.stage1_model.device)
-                    output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
-            if i > 1:
-                raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
-            else:
-                raw_output = output_seq
-            print(len(raw_output))
-        ids = raw_output[0].cpu().numpy()
-        soa_idx = np.where(ids == self.mmtokenizer.soa)[0].tolist()
-        eoa_idx = np.where(ids == self.mmtokenizer.eoa)[0].tolist()
-        if len(soa_idx) != len(eoa_idx):
-            raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
-        vocals = []
-        instrumentals = []
-        range_begin = 1 if use_audio_prompt else 0
-        for i in range(range_begin, len(soa_idx)):
-            codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
-            if codec_ids[0] == 32016:
-                codec_ids = codec_ids[1:]
-            codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
-            vocals_ids = self.codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
-            vocals.append(vocals_ids)
-            instrumentals_ids = self.codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
-            instrumentals.append(instrumentals_ids)
-        vocals = np.concatenate(vocals, axis=1)
-        instrumentals = np.concatenate(instrumentals, axis=1)
-        vocal_save_path = os.path.join(stage1_output_dir,
-                                       f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_vocal_{random_id}".replace(
-                                           '.', '@') + '.npy')
-        inst_save_path = os.path.join(stage1_output_dir,
-                                       f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_instrumental_{random_id}".replace(
-                                           '.', '@') + '.npy')
-        np.save(vocal_save_path, vocals)
-        np.save(inst_save_path, instrumentals)
-        stage1_output_set.append(vocal_save_path)
-        stage1_output_set.append(inst_save_path)
-        print("Converting to Audio...")
-        recons_output_dir = os.path.join(output_dir, "recons")
-        recons_mix_dir = os.path.join(recons_output_dir, 'mix')
-        os.makedirs(recons_mix_dir, exist_ok=True)
-        tracks = []
-        for npy in stage1_output_set:
-            codec_result = np.load(npy)
-            decodec_rlt = []
-            with torch.no_grad():
-                decoded_waveform = self.codec_model.decode(
-                    torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(self.device))
-            decoded_waveform = decoded_waveform.cpu().squeeze(0)
-            decodec_rlt.append(torch.as_tensor(decoded_waveform))
-            decodec_rlt = torch.cat(decodec_rlt, dim=-1)
-            save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
-            tracks.append(save_path)
-            save_audio(decodec_rlt, save_path, 16000)
-        for inst_path in tracks:
-            try:
-                if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
-                        and 'instrumental' in inst_path:
-                    vocal_path = inst_path.replace('instrumental', 'vocal')
-                    if not os.path.exists(vocal_path):
-                        continue
-                    recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
-                    vocal_stem, sr = sf.read(inst_path)
-                    instrumental_stem, _ = sf.read(vocal_path)
-                    mix_stem = (vocal_stem + instrumental_stem) / 1
-                    sf.write(recons_mix, mix_stem, sr)
-            except Exception as e:
-                print(e)
-        return recons_mix
-# --- Gradio Interface ---
-music_generator = MusicGenerator() # Initialize the music generator here to keep the model loaded
-@spaces.GPU(duration=120)
-def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=200):
-    """Inference function for the Gradio interface."""
-    os.makedirs(OUTPUT_DIR, exist_ok=True)
-    print(f"Output folder ensured at: {OUTPUT_DIR}")
-    empty_output_folder(OUTPUT_DIR)
-    try:
-        music = music_generator.generate(
-            genre_txt=genre_txt_content,
-            lyrics_txt=lyrics_txt_content,
-            run_n_segments=num_segments,
-            output_dir=OUTPUT_DIR,
-            max_new_tokens=max_new_tokens
-        )
-        return music
-    except Exception as e:
-        print(f"Error occurred during inference: {e}")
-        return None
-    finally:
-        print("Temporary files deleted.")
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")
@@ -368,7 +311,7 @@ with gr.Blocks() as demo:
         <div style="display:flex;column-gap:4px;">
             <a href="https://github.com/multimodal-art-projection/YuE">
                 <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-            </a>
             <a href="https://map-yue.github.io">
                 <img src='https://img.shields.io/badge/Project-Page-green'>
             </a>
@@ -381,9 +324,9 @@ with gr.Blocks() as demo:
             with gr.Column():
                 genre_txt = gr.Textbox(label="Genre")
                 lyrics_txt = gr.Textbox(label="Lyrics")
             with gr.Column():
-                if IS_SHARED_UI:
                     num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
                     max_new_tokens = gr.Slider(label="Max New Tokens", info="100 tokens equals 1 second long music", minimum=100, maximum="3000", step=100, value=500, interactive=True)
                 else:
@@ -393,7 +336,7 @@ with gr.Blocks() as demo:
                 music_out = gr.Audio(label="Audio Result")
         gr.Examples(
-            examples=[
                 [
                     "female blues airy vocal bright vocal piano sad romantic guitar jazz",
                     """[verse]
@@ -428,17 +371,17 @@ Through the highs and lows, I'mma keep it real
 Living out my dreams with this mic and a deal
                     """
                 ]
-            ],
-            inputs=[genre_txt, lyrics_txt],
-            outputs=[music_out],
-            cache_examples=False,
-            # cache_mode="lazy", # not enable cache yet
-            fn=infer
         )
     submit_btn.click(
-        fn=infer,
-        inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens],
-        outputs=[music_out]
     )
 demo.queue().launch(show_api=False, show_error=True)

 from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList
 import torch
 from huggingface_hub import snapshot_download
+import sys
 import uuid
+import numpy as np
+import json
+from omegaconf import OmegaConf
 import torchaudio
 from torchaudio.transforms import Resample
 import soundfile as sf
+from tqdm import tqdm
+from einops import rearrange
+import time
+from codecmanipulator import CodecManipulator
+from mmtokenizer import _MMSentencePieceTokenizer
 import re
+is_shared_ui = True if "innova-ai/YuE-music-generator-demo" in os.environ.get('SPACE_ID', '') else False
+# Install required package
 def install_flash_attn():
     try:
         print("Installing flash-attn...")
             "pip install flash-attn --no-build-isolation",
             env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
             shell=True,
         )
         print("flash-attn installed successfully!")
     except subprocess.CalledProcessError as e:
         print(f"Failed to install flash-attn: {e}")
         exit(1)
+# Install flash-attn
 install_flash_attn()
+# Download xcodec_mini_infer
+folder_path = './xcodec_mini_infer'
+if not os.path.exists(folder_path):
+    os.makedirs(folder_path, exist_ok=True)
+    print(f"Folder created at: {folder_path}")
+else:
+    print(f"Folder already exists at: {folder_path}")
+snapshot_download(
+    repo_id = "m-a-p/xcodec_mini_infer",
+    local_dir = "./xcodec_mini_infer"
+)
+# Add to path
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
+# Load Model (do this ONCE)
+print("Loading Models...")
+device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")
+model = AutoModelForCausalLM.from_pretrained(
+    "m-a-p/YuE-s1-7B-anneal-en-cot",
+    torch_dtype=torch.float16,
+    attn_implementation="flash_attention_2",
+).to(device).eval()
+mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
+codectool = CodecManipulator("xcodec", 0, 1)
+model_config = OmegaConf.load('./xcodec_mini_infer/final_ckpt/config.yaml')
+codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
+parameter_dict = torch.load('./xcodec_mini_infer/final_ckpt/ckpt_00360000.pth', map_location='cpu')
+codec_model.load_state_dict(parameter_dict['codec_model'])
+codec_model.to(device)
+codec_model.eval()
+print("Models Loaded!")
 def empty_output_folder(output_dir):
     for file in os.listdir(output_dir):
         file_path = os.path.join(output_dir, file)
         try:
         except Exception as e:
             print(f"Error deleting file {file_path}: {e}")
 def create_temp_file(content, prefix, suffix=".txt"):
     """Write ``content`` to a new named temporary file and return its path.

     The text is stripped, given two trailing newlines, and has CRLF / CR line
     endings converted to LF before it is written. The file is created with
     ``delete=False``, so it stays on disk after this function returns.
     """
     normalized = (content.strip() + "\n\n").replace("\r\n", "\n").replace("\r", "\n")
     with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=prefix, suffix=suffix) as handle:
         handle.write(normalized)
         path = handle.name
     return path
 def get_last_mp3_file(output_dir):
     """Return the path of the most recently modified ``.mp3`` file in ``output_dir``.

     Only entries directly inside ``output_dir`` are considered (no recursion).
     Prints a notice and returns ``None`` when the folder holds no ``.mp3`` file.
     """
     mp3_files = [file for file in os.listdir(output_dir) if file.endswith('.mp3')]
     if not mp3_files:
         print("No .mp3 files found in the output folder.")
         return None
     mp3_files_with_path = [os.path.join(output_dir, file) for file in mp3_files]
     # Newest first, by modification time, so index 0 is the latest file.
+    mp3_files_with_path.sort(key=lambda x: os.path.getmtime(x), reverse=True)
     return mp3_files_with_path[0]
 class BlockTokenRangeProcessor(LogitsProcessor):
     def __init__(self, start_id, end_id):
         self.blocked_token_ids = list(range(start_id, end_id))
         scores[:, self.blocked_token_ids] = -float("inf")
         return scores
 def load_audio_mono(filepath, sampling_rate=16000):
     audio, sr = torchaudio.load(filepath)
     audio = torch.mean(audio, dim=0, keepdim=True)
     if sr != sampling_rate:
         audio = resampler(audio)
     return audio
 def split_lyrics(lyrics: str):
     """Break lyrics into one formatted chunk per bracketed section header.

     A section is a ``[word]`` header followed by text that runs up to a newline
     which is immediately followed by the next ``[`` or by the end of the
     string. Each chunk is rebuilt as the header line, the stripped section
     text, and a trailing blank line. Text that matches no section is left out.
     """
     section_re = re.compile(r"\[(\w+)\](.*?)\n(?=\[|\Z)", re.DOTALL)
     formatted = []
     for tag, body in section_re.findall(lyrics):
         formatted.append(f"[{tag}]\n{body.strip()}\n\n")
     return formatted
 def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
     folder_path = os.path.dirname(path)
     if not os.path.exists(folder_path):
         os.makedirs(folder_path)
     torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
+@spaces.GPU(duration=120)
+def generate_music(
+    genre_txt=None,
+    lyrics_txt=None,
+    max_new_tokens=3000,
+    run_n_segments=2,
+    use_audio_prompt=False,
+    audio_prompt_path="",
+    prompt_start_time=0.0,
+    prompt_end_time=30.0,
+    output_dir="./output",
+    keep_intermediate=False,
+    cuda_idx=0,
+    rescale=False,
+):
+    if use_audio_prompt and not audio_prompt_path:
+        raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
+    stage1_output_dir = os.path.join(output_dir, f"stage1")
+    os.makedirs(stage1_output_dir, exist_ok=True)
+    stage1_output_set = []
+    genres = genre_txt.strip()
+    lyrics = split_lyrics(lyrics_txt+"\n")
+    full_lyrics = "\n".join(lyrics)
+    prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
+    prompt_texts += lyrics
+    random_id = uuid.uuid4()
+    output_seq = None
+    top_p = 0.93
+    temperature = 1.0
+    repetition_penalty = 1.2
+    start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
+    end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
+    raw_output = None
+    run_n_segments = min(run_n_segments+1, len(lyrics))
+    print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
+    for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
+        section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
+        guidance_scale = 1.5 if i <=1 else 1.2
+        if i==0:
+            continue
+        if i==1:
+            if use_audio_prompt:
+                audio_prompt = load_audio_mono(audio_prompt_path)
+                audio_prompt.unsqueeze_(0)
+                with torch.no_grad():
+                    raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
+                raw_codes = raw_codes.transpose(0, 1)
+                raw_codes = raw_codes.cpu().numpy().astype(np.int16)
+                code_ids = codectool.npy2ids(raw_codes[0])
+                audio_prompt_codec = code_ids[int(prompt_start_time *50): int(prompt_end_time *50)]
+                audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
+                sentence_ids = mmtokenizer.tokenize("[start_of_reference]") +  audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
+                head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
             else:
+                head_id = mmtokenizer.tokenize(prompt_texts[0])
+            prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
+        else:
+            prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
+        prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
+        input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
+        max_context = 16384-max_new_tokens-1
+        if input_ids.shape[-1] > max_context:
+            print(f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
+            input_ids = input_ids[:, -(max_context):]
+        with torch.no_grad():
+            output_seq = model.generate(
+                input_ids=input_ids,
+                max_new_tokens=max_new_tokens,
+                min_new_tokens=100,
+                do_sample=True,
+                top_p=top_p,
+                temperature=temperature,
+                repetition_penalty=repetition_penalty,
+                eos_token_id=mmtokenizer.eoa,
+                pad_token_id=mmtokenizer.eoa,
+                logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
+                guidance_scale=guidance_scale,
                 )
+            if output_seq[0][-1].item() != mmtokenizer.eoa:
+                tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
+                output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
+        if i > 1:
+            raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
+        else:
+            raw_output = output_seq
+        print(len(raw_output))
+    ids = raw_output[0].cpu().numpy()
+    soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
+    eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
+    if len(soa_idx)!=len(eoa_idx):
+        raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
+    vocals = []
+    instrumentals = []
+    range_begin = 1 if use_audio_prompt else 0
+    for i in range(range_begin, len(soa_idx)):
+        codec_ids = ids[soa_idx[i]+1:eoa_idx[i]]
+        if codec_ids[0] == 32016:
+            codec_ids = codec_ids[1:]
+        codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
+        vocals_ids = codectool.ids2npy(rearrange(codec_ids,"(n b) -> b n", b=2)[0])
+        vocals.append(vocals_ids)
+        instrumentals_ids = codectool.ids2npy(rearrange(codec_ids,"(n b) -> b n", b=2)[1])
+        instrumentals.append(instrumentals_ids)
+    vocals = np.concatenate(vocals, axis=1)
+    instrumentals = np.concatenate(instrumentals, axis=1)
+    vocal_save_path = os.path.join(stage1_output_dir, f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_vocal_{random_id}".replace('.', '@')+'.npy')
+    inst_save_path = os.path.join(stage1_output_dir, f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_instrumental_{random_id}".replace('.', '@')+'.npy')
+    np.save(vocal_save_path, vocals)
+    np.save(inst_save_path, instrumentals)
+    stage1_output_set.append(vocal_save_path)
+    stage1_output_set.append(inst_save_path)
+    print("Converting to Audio...")
+    recons_output_dir = os.path.join(output_dir, "recons")
+    recons_mix_dir = os.path.join(recons_output_dir, 'mix')
+    os.makedirs(recons_mix_dir, exist_ok=True)
+    tracks = []
+    for npy in stage1_output_set:
+        codec_result = np.load(npy)
+        decodec_rlt=[]
+        with torch.no_grad():
+            decoded_waveform = codec_model.decode(torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
+        decoded_waveform = decoded_waveform.cpu().squeeze(0)
+        decodec_rlt.append(torch.as_tensor(decoded_waveform))
+        decodec_rlt = torch.cat(decodec_rlt, dim=-1)
+        save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
+        tracks.append(save_path)
+        save_audio(decodec_rlt, save_path, 16000)
+    # mix tracks
+    for inst_path in tracks:
+        try:
+            if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
+                and 'instrumental' in inst_path:
+                # find pair
+                vocal_path = inst_path.replace('instrumental', 'vocal')
+                if not os.path.exists(vocal_path):
+                    continue
+                # mix
+                recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
+                vocal_stem, sr = sf.read(inst_path)
+                instrumental_stem, _ = sf.read(vocal_path)
+                mix_stem = (vocal_stem + instrumental_stem) / 1
+                sf.write(recons_mix, mix_stem, sr)
+        except Exception as e:
+            print(e)
+    return recons_mix
+# Gradio
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")
         <div style="display:flex;column-gap:4px;">
             <a href="https://github.com/multimodal-art-projection/YuE">
                 <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
+            </a>
             <a href="https://map-yue.github.io">
                 <img src='https://img.shields.io/badge/Project-Page-green'>
             </a>
             with gr.Column():
                 genre_txt = gr.Textbox(label="Genre")
                 lyrics_txt = gr.Textbox(label="Lyrics")
             with gr.Column():
+                if is_shared_ui:
                     num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
                     max_new_tokens = gr.Slider(label="Max New Tokens", info="100 tokens equals 1 second long music", minimum=100, maximum="3000", step=100, value=500, interactive=True)
                 else:
                 music_out = gr.Audio(label="Audio Result")
         gr.Examples(
+            examples = [
                 [
                     "female blues airy vocal bright vocal piano sad romantic guitar jazz",
                     """[verse]
 Living out my dreams with this mic and a deal
                     """
                 ]
+            ],
+             inputs = [genre_txt, lyrics_txt],
+            outputs = [music_out],
+            cache_examples = False,
+            # cache_mode="lazy",
+            fn=generate_music
         )
     submit_btn.click(
+        fn = generate_music,
+        inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
+        outputs = [music_out]
     )
 demo.queue().launch(show_api=False, show_error=True)