Update app.py
app.py CHANGED
@@ -15,39 +15,6 @@ import spaces
 model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
 processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")
 
-title = "MusicGen Streaming"
-
-description = """
-Stream the outputs of the MusicGen text-to-music model by playing the generated audio as soon as the first chunk is ready.
-Demo uses [MusicGen Small](https://huggingface.co/facebook/musicgen-small) in the 🤗 Transformers library. Note that the
-demo works best on the Chrome browser. If there is no audio output, try switching your browser to Chrome.
-"""
-
-article = """
-## How Does It Work?
-
-MusicGen is an auto-regressive transformer-based model, meaning it generates audio codes (tokens) in a causal fashion.
-At each decoding step, the model generates a new set of audio codes, conditional on the text input and all previous audio codes. From the
-frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to audio waveform,
-each set of generated audio codes corresponds to 0.02 seconds. This means we require a total of 1000 decoding steps to generate
-20 seconds of audio.
-
-Rather than waiting for the entire audio sequence to be generated, which would require the full 1000 decoding steps, we can start
-playing the audio after a specified number of decoding steps have been reached, a technique known as [*streaming*](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming).
-For example, after 250 steps we have the first 5 seconds of audio ready, and so can play this without waiting for the remaining
-750 decoding steps to be complete. As we continue to generate with the MusicGen model, we append new chunks of generated audio
-to our output waveform on the fly. After the full 1000 decoding steps, the generated audio is complete, and is composed of four
-chunks of audio, each corresponding to 250 tokens.
-
-This method of playing incremental generations reduces the latency of the MusicGen model from the total time to generate 1000 tokens
-to the time taken to play the first chunk of audio (250 tokens). This can result in significant improvements to perceived latency,
-particularly when the chunk size is chosen to be small. In practice, the chunk size should be tuned to your device: using a
-smaller chunk size will mean that the first chunk is ready faster, but it should not be chosen so small that the model generates more slowly
-than the time it takes to play the audio.
-
-For details on how the streaming class works, check out the source code for the [MusicgenStreamer](https://huggingface.co/spaces/sanchit-gandhi/musicgen-streaming/blob/main/app.py#L52).
-"""
-
 
 class MusicgenStreamer(BaseStreamer):
     def __init__(
@@ -209,25 +176,14 @@ def generate_audio(text_prompt, audio_length_in_s=10.0, play_steps_in_s=2.0, see
 demo = gr.Interface(
     fn=generate_audio,
     inputs=[
-        gr.Text(label="
-        gr.Slider(10,
-        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="
-        gr.Slider(0, 10, value=5, step=1, label="
+        gr.Text(label="Description", value="80s pop track with synth and instrumentals"),
+        gr.Slider(10, 120, value=15, step=5, label="Audio length"),
+        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds", info="Smaller = shorter chunks, lower latency, more codec steps"),
+        gr.Slider(0, 10, value=5, step=1, label="Seed for random generations"),
     ],
     outputs=[
-        gr.Audio(label="
-    ],
-    examples=[
-        ["An 80s driving pop song with heavy drums and synth pads in the background", 30, 1.5, 5],
-        ["A cheerful country song with acoustic guitars", 30, 1.5, 5],
-        ["90s rock song with electric guitar and heavy drums", 30, 1.5, 5],
-        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", 30, 1.5, 5],
-        ["lofi slow bpm electro chill with organic samples", 30, 1.5, 5],
+        gr.Audio(label="Generated music", streaming=True, autoplay=True)
     ],
-    title=title,
-    description=description,
-    article=article,
-    cache_examples=False,
 )
 
 
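For reference, the `article` string removed above describes the chunked-streaming technique that the `MusicgenStreamer` class in this file implements. Below is a minimal sketch of how that pattern is typically wired up, assuming the `MusicgenStreamer` defined earlier in app.py accepts the model and a `play_steps` argument and yields audio chunks when iterated; its exact constructor signature is truncated in this diff, so treat those arguments as assumptions.

from threading import Thread

from transformers import MusicgenForConditionalGeneration, MusicgenProcessor

model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")

sampling_rate = model.audio_encoder.config.sampling_rate
# ~50 codes per second for the 32 kHz EnCodec, i.e. 0.02 s of audio per decoding step
frame_rate = model.audio_encoder.config.frame_rate


def generate_audio(text_prompt, audio_length_in_s=15.0, play_steps_in_s=1.5):
    # One chunk of audio is emitted every `play_steps` decoding steps.
    play_steps = int(frame_rate * play_steps_in_s)
    max_new_tokens = int(frame_rate * audio_length_in_s)

    # MusicgenStreamer is defined earlier in app.py; the arguments here are
    # an assumption based on how the Space uses it.
    streamer = MusicgenStreamer(model, play_steps=play_steps)

    inputs = processor(text=[text_prompt], padding=True, return_tensors="pt")

    # Run generation in a background thread so this generator can yield each
    # chunk as soon as the streamer produces it, rather than waiting for the
    # full sequence of decoding steps to complete.
    generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    for new_audio in streamer:
        # A gr.Audio(streaming=True) output consumes (sampling_rate, waveform)
        # tuples yielded from a generator function.
        yield sampling_rate, new_audio

With this wiring, the first chunk is yielded after `play_steps` decoding steps instead of after the full `max_new_tokens` steps, which is exactly the latency reduction the removed article text describes.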