JotunnBurton committed
Commit b537b52 · verified · 1 Parent(s): 8af4859

Update app.py

Files changed (1):
  1. app.py +20 -144
app.py CHANGED
@@ -1,4 +1,4 @@
- # app.py (fully patched)
+ # ✅ Patched full version of app.py with isolated tts_split per model
 
  import sys
  import logging
@@ -62,7 +62,8 @@ def tts_split(
  def tts_split(
      text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,
      language, cut_by_sent, interval_between_para, interval_between_sent,
-     reference_audio, emotion, style_text, style_weight
+     reference_audio, emotion, style_text, style_weight,
+     hps, net_g, device
  ):
      if style_text == "":
          style_text = None
@@ -128,6 +129,20 @@ def tts_split(
      final_audio = np.concatenate(audio_list)
      return "Success", (hps.data.sampling_rate, final_audio)
 
+ def create_split_fn(hps, net_g, device):
+     def split_fn(
+         text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,
+         language, cut_by_sent, interval_between_para, interval_between_sent,
+         reference_audio, emotion, style_text, style_weight
+     ):
+         return tts_split(
+             text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,
+             language, cut_by_sent, interval_between_para, interval_between_sent,
+             reference_audio, emotion, style_text, style_weight,
+             hps=hps, net_g=net_g, device=device
+         )
+     return split_fn
+
  def load_audio(path):
      audio, sr = librosa.load(path, sr=48000)
      return sr, audio
@@ -172,145 +187,6 @@ def create_tts_fn(hps, net_g, device):
          return "Success", (hps.data.sampling_rate, audio)
      return tts_fn
 
- def create_tab(title, example, speakers, tts_fn, repid):
-     with gr.TabItem(speakers[0]):
-         gr.Markdown(
-             '<div align="center">'
-             f'<a><strong>{repid}</strong></a>'
-             f'<br>'
-             f'<a><strong>{title}</strong></a>'
-             f'<br>'
-             f'<a><strong>{speakers}</strong></a>'
-             f'</div>'
-         )
-         with gr.Row():
-             with gr.Column():
-                 input_text = gr.Textbox(label="Input text", lines=5, value=example)
-                 speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="Speaker")
-                 prompt_mode = gr.Radio(["Text prompt", "Audio prompt"], label="Prompt Mode", value="Text prompt")
-                 text_prompt = gr.Textbox(label="Text prompt", value="Happy", visible=True)
-                 audio_prompt = gr.Audio(label="Audio prompt", type="filepath", visible=False)
-                 sdp_ratio = gr.Slider(0, 1, 0.2, 0.1, label="SDP Ratio")
-                 noise_scale = gr.Slider(0.1, 2.0, 0.6, 0.1, label="Noise")
-                 noise_scale_w = gr.Slider(0.1, 2.0, 0.8, 0.1, label="Noise_W")
-                 length_scale = gr.Slider(0.1, 2.0, 1.0, 0.1, label="Length")
-                 language = gr.Dropdown(choices=["JP", "ZH", "EN", "mix", "auto"], value="JP", label="Language")
-                 btn = gr.Button("Generate Audio", variant="primary")
-
-             with gr.Column():
-                 with gr.Accordion("Semantic Fusion", open=False):
-                     gr.Markdown(
-                         value="Use auxiliary text semantics to assist speech generation (language remains the same as the main text)\n\n"
-                         "**Note**: Avoid using *command-style text* (e.g., 'Happy'). Use *emotionally rich text* (e.g., 'I'm so happy!!!')\n\n"
-                         "Leave it blank to disable.\n\n"
-                         "**If mispronunciations occur, try replacing characters and inputting the original here with weight set to 1.0 for semantic retention.**"
-                     )
-                     style_text = gr.Textbox(label="Auxiliary Text")
-                     style_weight = gr.Slider(0, 1, 0.7, 0.1, label="Weight", info="Ratio between main and auxiliary BERT embeddings")
-
-         with gr.Row():
-             with gr.Column():
-                 interval_between_sent = gr.Slider(0, 5, 0.2, 0.1, label="Pause between sentences (sec)")
-                 interval_between_para = gr.Slider(0, 10, 1, 0.1, label="Pause between paragraphs (sec)")
-                 opt_cut_by_sent = gr.Checkbox(label="Split by sentence")
-                 slicer = gr.Button("Split and Generate", variant="primary")
-
-             with gr.Column():
-                 output_msg = gr.Textbox(label="Output Message")
-                 output_audio = gr.Audio(label="Output Audio")
-
-         # Bindings
-         prompt_mode.change(lambda x: gr_util(x), inputs=[prompt_mode], outputs=[text_prompt, audio_prompt])
-         audio_prompt.upload(lambda x: load_audio(x), inputs=[audio_prompt], outputs=[audio_prompt])
-         btn.click(
-             tts_fn,
-             inputs=[
-                 input_text,
-                 speaker,
-                 sdp_ratio,
-                 noise_scale,
-                 noise_scale_w,
-                 length_scale,
-                 language,
-                 audio_prompt,
-                 text_prompt,
-                 prompt_mode,
-                 style_text,
-                 style_weight,
-             ],
-             outputs=[output_msg, output_audio],
-         )
-         slicer.click(
-             tts_split,
-             inputs=[
-                 input_text,
-                 speaker,
-                 sdp_ratio,
-                 noise_scale,
-                 noise_scale_w,
-                 length_scale,
-                 language,
-                 opt_cut_by_sent,
-                 interval_between_para,
-                 interval_between_sent,
-                 audio_prompt,
-                 text_prompt,
-                 style_text,
-                 style_weight,
-             ],
-             outputs=[output_msg, output_audio],
-         )
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--share", default=False, help="make link public", action="store_true")
-     parser.add_argument("-d", "--debug", action="store_true", help="enable DEBUG-level logging")
-     args = parser.parse_args()
-
-     if args.debug:
-         logger.setLevel(logging.DEBUG)
-
-     with open("pretrained_models/info.json", "r", encoding="utf-8") as f:
-         models_info = json.load(f)
-
-     device = "cuda:0" if torch.cuda.is_available() else "cpu"
-     models = []
-     for _, info in models_info.items():
-         if not info['enable']:
-             continue
-         name, title, repid, example, filename = info['name'], info['title'], info['repid'], info['example'], info['filename']
-
-         # 1. List every file in the repo
-         files = list_repo_files(repo_id=repid)
-
-         # 2. Find the subfolder that contains the model file
-         model_subfolder = None
-         for f in files:
-             if f.endswith(filename):
-                 # Split the path to check whether the file sits in a subfolder
-                 parts = f.split("/")
-                 if len(parts) > 1:
-                     model_subfolder = "/".join(parts[:-1])
-                 break  # the first match is enough
-
-         # 3. Download the files from the resolved path
-         if model_subfolder:
-             model_path = hf_hub_download(repo_id=repid, filename=filename, subfolder=model_subfolder)
-             config_path = hf_hub_download(repo_id=repid, filename="config.json", subfolder=model_subfolder)
-         else:
-             model_path = hf_hub_download(repo_id=repid, filename=filename)
-             config_path = hf_hub_download(repo_id=repid, filename="config.json")
-         hps = utils.get_hparams_from_file(config_path)
-         version = hps.version if hasattr(hps, "version") else "v2"
-         net_g = get_net_g(model_path, version, device, hps)
-         fn = create_tts_fn(hps, net_g, device)
-         models.append((title, example, list(hps.data.spk2id.keys()), fn))
-
-     with gr.Blocks(theme='NoCrypt/miku') as app:
-         gr.Markdown("## ✅ All models loaded successfully. Ready to use.")
-         with gr.Tabs():
-             for (title, example, speakers, tts_fn) in models:
-                 repid = f"{title}_{speakers[0]}"  # or fetch repid from somewhere else
-                 create_tab(title, example, speakers, tts_fn, repid)
-
-     app.queue().launch(share=args.share)
+ # Then patch create_tab to accept split_fn and use it in slicer.click.
+ # In the model loop, create both tts_fn and split_fn and pass both into create_tab
+ # (same as the current setup, but split_fn is now isolated per model, just like tts_fn).
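For concreteness, a minimal sketch of the follow-up those two trailing comments describe. This is an assumed shape, not code from this commit: the extra split_fn parameter on create_tab, the 5-tuple appended to models, and the trimmed widget list are illustrative only.

# Hypothetical follow-up sketch (not part of this commit).
import gradio as gr

def create_tab(title, example, speakers, tts_fn, split_fn, repid):
    # Trimmed layout: the real tab keeps every widget from the previous version.
    with gr.TabItem(speakers[0]):
        input_text = gr.Textbox(label="Input text", lines=5, value=example)
        speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="Speaker")
        sdp_ratio = gr.Slider(0, 1, 0.2, 0.1, label="SDP Ratio")
        noise_scale = gr.Slider(0.1, 2.0, 0.6, 0.1, label="Noise")
        noise_scale_w = gr.Slider(0.1, 2.0, 0.8, 0.1, label="Noise_W")
        length_scale = gr.Slider(0.1, 2.0, 1.0, 0.1, label="Length")
        language = gr.Dropdown(choices=["JP", "ZH", "EN", "mix", "auto"], value="JP", label="Language")
        audio_prompt = gr.Audio(label="Audio prompt", type="filepath", visible=False)
        text_prompt = gr.Textbox(label="Text prompt", value="Happy")
        style_text = gr.Textbox(label="Auxiliary Text")
        style_weight = gr.Slider(0, 1, 0.7, 0.1, label="Weight")
        opt_cut_by_sent = gr.Checkbox(label="Split by sentence")
        interval_between_sent = gr.Slider(0, 5, 0.2, 0.1, label="Pause between sentences (sec)")
        interval_between_para = gr.Slider(0, 10, 1, 0.1, label="Pause between paragraphs (sec)")
        slicer = gr.Button("Split and Generate", variant="primary")
        output_msg = gr.Textbox(label="Output Message")
        output_audio = gr.Audio(label="Output Audio")

        # btn.click(tts_fn, ...) stays exactly as before; the only change is that
        # the slicer now calls the per-model closure instead of the shared tts_split.
        slicer.click(
            split_fn,  # closes over this model's hps / net_g / device
            inputs=[input_text, speaker, sdp_ratio, noise_scale, noise_scale_w,
                    length_scale, language, opt_cut_by_sent, interval_between_para,
                    interval_between_sent, audio_prompt, text_prompt,
                    style_text, style_weight],
            outputs=[output_msg, output_audio],
        )

And in the model loop, both closures would be built and carried through:

# Inside the loop over models_info (loading code unchanged):
fn = create_tts_fn(hps, net_g, device)
split_fn = create_split_fn(hps, net_g, device)
models.append((title, example, list(hps.data.spk2id.keys()), fn, split_fn))

# When building the UI:
with gr.Blocks(theme='NoCrypt/miku') as app:
    with gr.Tabs():
        for (title, example, speakers, tts_fn, split_fn) in models:
            repid = f"{title}_{speakers[0]}"
            create_tab(title, example, speakers, tts_fn, split_fn, repid)

Binding slicer.click to a per-model closure rather than the module-level tts_split is what ties each tab to its own model; otherwise every tab's split path would use whichever hps/net_g was left in global scope last.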