# Spaces: Sleeping
| import torch | |
| import torchaudio | |
| import voicebox.src.attacks.offline.perturbation.voicebox.voicebox as vb #To access VoiceBox class | |
| #import voicebox.src.attacks.online.voicebox_streamer as streamer #To access VoiceBoxStreamer class | |
| import numpy as np | |
| from voicebox.src.constants import PPG_PRETRAINED_PATH | |
# Default VoiceBox parameters. The dict below is unpacked into
# vb.VoiceBox(...) when the model is built, and the checkpoint is then loaded
# with strict=True, so changing a size here is likely to make loading fail.
LOOKAHEAD = 5  # reused below as 'bottleneck_lookahead_frames'

voicebox_kwargs = {
    'win_length': 256,
    # Phoneme (PPG) / pitch / loudness encoders.
    'ppg_encoder_hidden_size': 256,
    'use_phoneme_encoder': True,
    'use_pitch_encoder': True,
    'use_loudness_encoder': True,
    # Spectral encoder.
    'spec_encoder_lookahead_frames': 0,
    'spec_encoder_type': 'mel',
    'spec_encoder_mlp_depth': 2,
    'bottleneck_lookahead_frames': LOOKAHEAD,
    'ppg_encoder_path': PPG_PRETRAINED_PATH,
    'n_bands': 128,
    'spec_encoder_hidden_size': 512,
    # Bottleneck.
    'bottleneck_skip': True,
    'bottleneck_hidden_size': 512,
    'bottleneck_feedforward_size': 512,
    'bottleneck_type': 'lstm',
    'bottleneck_depth': 2,
    # Remaining settings.
    'control_eps': 0.5,
    'projection_norm': float('inf'),
    'conditioning_dim': 512,
}
# Build the VoiceBox model from the default parameters, then restore the
# pretrained weights. map_location forces every tensor onto the CPU, and
# strict=True makes load_state_dict fail on any missing or unexpected key.
model = vb.VoiceBox(**voicebox_kwargs)
model.load_state_dict(
    torch.load(
        'voicebox/pretrained/voicebox/voicebox_final.pt',
        map_location=torch.device('cpu'),
    ),
    strict=True,
)
# Switch the module to evaluation mode before serving requests.
model.eval()
def float32_to_int16(waveform):
    """Convert a floating-point waveform to the final flat int16 audio format.

    Args:
        waveform: NumPy array of audio samples.

    Returns:
        A 1-D ``np.int16`` array. The input is scaled so that its largest
        absolute sample maps to 32767, cast to int16 (the cast truncates
        the fractional part) and flattened with ``ravel``. Empty or all-zero
        input skips the normalization and comes back as an empty / all-zero
        int16 array.
    """
    # An empty array has no maximum and an all-zero array has a peak of 0;
    # dividing by that peak would fill the signal with NaNs, whose int16 cast
    # is undefined. Skip the normalization in both cases.
    peak = np.abs(waveform).max() if waveform.size else 0
    if peak > 0:
        waveform = waveform / peak
    return (waveform * 32767).astype(np.int16).ravel()
def predict(inp):
    """Run the pretrained VoiceBox model on an audio file for the Gradio UI.

    Args:
        inp: Path of the audio file to process (the Gradio input component
            is configured with ``type="filepath"``). ``None`` means no audio
            was supplied.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``sample_rate`` is the rate
        ``torchaudio.load`` reports for the input file and ``samples`` is the
        model output converted by ``float32_to_int16``. Returns ``None`` when
        ``inp`` is ``None``.
    """
    # Gradio hands over None when the form is submitted without audio;
    # torchaudio.load cannot open that, so produce an empty output instead.
    if inp is None:
        return None

    # Decode the file at the given path into a tensor plus its sample rate.
    source, sample_rate = torchaudio.load(inp)

    # Inference only: run the model without tracking gradients.
    with torch.no_grad():
        converted = model(source)

    # NOTE(review): the audio is fed to the model at the file's own sample
    # rate and the same rate is reported back; no resampling happens here.
    # Confirm the model is meant to receive arbitrary rates.
    # NOTE(review): float32_to_int16 flattens its input, so a multi-channel
    # output would come back as one concatenated 1-D signal - verify.
    return sample_rate, float32_to_int16(converted.numpy())
# Gradio front end: one audio input, delivered to predict() as a file path,
# and one audio output that plays whatever predict() returns.
import gradio as gr

_audio_in = gr.Audio(type="filepath")
_audio_out = gr.Audio()

interface = gr.Interface(fn=predict, inputs=_audio_in, outputs=_audio_out)
interface.launch()