Rooni committed (verified)
Commit 7c0794f · Parent(s): 8a81f8c

Update app.py

Files changed (1): app.py (+5 -49)
app.py CHANGED
@@ -15,39 +15,6 @@ import spaces
 model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
 processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")
 
-title = "MusicGen Streaming"
-
-description = """
-Stream the outputs of the MusicGen text-to-music model by playing the generated audio as soon as the first chunk is ready.
-Demo uses [MusicGen Small](https://huggingface.co/facebook/musicgen-small) in the 🤗 Transformers library. Note that the
-demo works best on the Chrome browser. If there is no audio output, try switching browser to Chrome.
-"""
-
-article = """
-## How Does It Work?
-
-MusicGen is an auto-regressive transformer-based model, meaning it generates audio codes (tokens) in a causal fashion.
-At each decoding step, the model generates a new set of audio codes, conditional on the text input and all previous audio codes. Given the
-frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to audio waveform,
-each set of generated audio codes corresponds to 0.02 seconds of audio. This means we require a total of 1000 decoding steps to generate
-20 seconds of audio.
-
-Rather than waiting for the entire audio sequence to be generated, which would require the full 1000 decoding steps, we can start
-playing the audio once a specified number of decoding steps have been reached, a technique known as [*streaming*](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming).
-For example, after 250 steps we have the first 5 seconds of audio ready, and can play this without waiting for the remaining
-750 decoding steps to complete. As we continue to generate with the MusicGen model, we append new chunks of generated audio
-to our output waveform on the fly. After the full 1000 decoding steps, the generated audio is complete, and is composed of four
-chunks of audio, each corresponding to 250 tokens.
-
-This method of playing incremental generations reduces the latency of the MusicGen model from the total time taken to generate 1000 tokens
-to the time taken to play the first chunk of audio (250 tokens). This can result in significant improvements to perceived latency,
-particularly when the chunk size is chosen to be small. In practice, the chunk size should be tuned to your device: a
-smaller chunk size means the first chunk is ready faster, but the chunk size should not be so small that the model generates
-more slowly than the audio plays back.
-
-For details on how the streaming class works, check out the source code for the [MusicgenStreamer](https://huggingface.co/spaces/sanchit-gandhi/musicgen-streaming/blob/main/app.py#L52).
-"""
-
 
 class MusicgenStreamer(BaseStreamer):
     def __init__(
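The removed article describes the chunked decoding loop in prose. As a concrete illustration, here is a minimal sketch of that loop. The `MusicgenStreamer` constructor arguments (the model plus a `play_steps` step count) and the assumption that iterating the streamer yields numpy waveform chunks are inferred from the class defined in this app.py; they are not a documented Transformers API:

```python
from threading import Thread

from transformers import MusicgenForConditionalGeneration, MusicgenProcessor

model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")

# 0.02 s of audio per decoding step at the EnCodec frame rate (50 Hz)
frame_rate = model.audio_encoder.config.frame_rate
sampling_rate = model.audio_encoder.config.sampling_rate  # 32 kHz for musicgen-small

# first chunk ready after ~5 s of audio -> 250 decoding steps
play_steps = int(5.0 * frame_rate)

# assumed signature of the MusicgenStreamer class defined in this app.py
streamer = MusicgenStreamer(model, play_steps=play_steps)

inputs = processor(
    text=["80s pop track with synth and instrumentals"],
    padding=True,
    return_tensors="pt",
)

# generate() blocks until all decoding steps finish, so run it in a background
# thread and consume audio chunks from the streamer as they become ready
thread = Thread(
    target=model.generate,
    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1000),
)
thread.start()

for new_audio in streamer:  # assumed: each item is a numpy waveform chunk
    print(f"chunk ready: {new_audio.shape[0] / sampling_rate:.2f} s of audio")
thread.join()
```

The same arithmetic converts the "Streaming interval in seconds" slider below into a step count: play_steps = play_steps_in_s * frame_rate.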
@@ -209,25 +176,14 @@ def generate_audio(text_prompt, audio_length_in_s=10.0, play_steps_in_s=2.0, seed=5):
 demo = gr.Interface(
     fn=generate_audio,
     inputs=[
-        gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
-        gr.Slider(10, 30, value=15, step=5, label="Audio length in seconds"),
-        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
-        gr.Slider(0, 10, value=5, step=1, label="Seed for random generations"),
+        gr.Text(label="Description", value="80s pop track with synth and instrumentals"),
+        gr.Slider(10, 120, value=15, step=5, label="Audio length in seconds"),
+        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
+        gr.Slider(0, 10, value=5, step=1, label="Seed for random generations"),
     ],
     outputs=[
-        gr.Audio(label="Generated Music", streaming=True, autoplay=True)
-    ],
-    examples=[
-        ["An 80s driving pop song with heavy drums and synth pads in the background", 30, 1.5, 5],
-        ["A cheerful country song with acoustic guitars", 30, 1.5, 5],
-        ["90s rock song with electric guitar and heavy drums", 30, 1.5, 5],
-        ["a light and cheerful EDM track, with syncopated drums, airy pads, and strong emotions bpm: 130", 30, 1.5, 5],
-        ["lofi slow bpm electro chill with organic samples", 30, 1.5, 5],
+        gr.Audio(label="Generated music", streaming=True, autoplay=True)
     ],
-    title=title,
-    description=description,
-    article=article,
-    cache_examples=False,
 )
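For reference, the contract between generate_audio and the streaming output: with gr.Audio(streaming=True, autoplay=True), Gradio expects fn to be a generator that yields successive (sampling_rate, waveform) tuples, which the client appends to the playing audio. A minimal self-contained sketch of that contract follows; the synthetic noise chunks are a stand-in, since the real generate_audio in app.py pulls its chunks from MusicgenStreamer:

```python
import numpy as np
import gradio as gr


def generate_audio(text_prompt, audio_length_in_s=10.0, play_steps_in_s=2.0, seed=5):
    sampling_rate = 32_000  # musicgen-small decodes audio at 32 kHz
    rng = np.random.default_rng(int(seed))
    chunk_samples = int(sampling_rate * play_steps_in_s)
    total_samples = int(sampling_rate * audio_length_in_s)
    for start in range(0, total_samples, chunk_samples):
        n = min(chunk_samples, total_samples - start)
        # each yielded (rate, waveform) tuple is appended to the playing audio;
        # low-amplitude noise stands in for the model's decoded chunks
        yield sampling_rate, (0.1 * rng.standard_normal(n)).astype(np.float32)


demo = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Text(label="Description", value="80s pop track with synth and instrumentals"),
        gr.Slider(10, 120, value=15, step=5, label="Audio length in seconds"),
        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds"),
        gr.Slider(0, 10, value=5, step=1, label="Seed for random generations"),
    ],
    outputs=[gr.Audio(label="Generated music", streaming=True, autoplay=True)],
)

if __name__ == "__main__":
    demo.launch()
```

Yielding chunks rather than returning one waveform is what lets playback begin after the first chunk instead of after the full generation.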