Commit
·
86a2cc3
1
Parent(s):
cf6e216
initial commit
Browse files- .gitignore +0 -0
- Readme.md +112 -0
- backbone.py +62 -0
- config.json +26 -0
- configuration_antispoofing.py +9 -0
- conformer.py +284 -0
- feature_extraction_antispoofing.py +36 -0
- modeling_antispoofing.py +21 -0
- pipeline_antispoofing.py +42 -0
- preprocessor_config.json +4 -0
- pytorch_model.bin +3 -0
.gitignore
ADDED
|
File without changes
|
Readme.md
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
tags:
|
| 5 |
+
- audio
|
| 6 |
+
- audio-classification
|
| 7 |
+
- antispoofing
|
| 8 |
+
- deepfake-detection
|
| 9 |
+
- speech
|
| 10 |
+
license: mit
|
| 11 |
+
pipeline_tag: audio-classification
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# DF Arena 1B - Audio Antispoofing Model
|
| 15 |
+
|
| 16 |
+
A deep learning model for detecting spoofed (fake/synthesized) audio vs. bonafide (genuine) speech.
|
| 17 |
+
|
| 18 |
+
## Model Description
|
| 19 |
+
|
| 20 |
+
This model uses Wav2Vec2-XLS-R-1B as a backbone for audio antispoofing detection. It classifies audio samples as either bonafide (real human speech) or spoofed (synthetic/manipulated audio).
|
| 21 |
+
|
| 22 |
+
**Key Features:**
|
| 23 |
+
- Detects AI-generated speech and voice cloning
|
| 24 |
+
- Works with various audio formats
|
| 25 |
+
- Robust against different spoofing attacks
|
| 26 |
+
|
| 27 |
+
## Usage
|
| 28 |
+
```python
|
| 29 |
+
from transformers import pipeline
|
| 30 |
+
|
| 31 |
+
# Load the pipeline
|
| 32 |
+
pipe = pipeline(
|
| 33 |
+
"antispoofing",
|
| 34 |
+
model="your-username/df-arena-1b-antispoofing",
|
| 35 |
+
trust_remote_code=True
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# Predict from audio file
|
| 39 |
+
result = pipe("audio.wav")
|
| 40 |
+
print(result)
|
| 41 |
+
# Output: {'label': 'bonafide', 'logits': [[-1.47, 1.47]], 'score': 0.95, 'all_scores': {'spoof': 0.05, 'bonafide': 0.95}}
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
### Advanced Usage
|
| 45 |
+
```python
|
| 46 |
+
import librosa
|
| 47 |
+
from transformers import pipeline
|
| 48 |
+
|
| 49 |
+
pipe = pipeline(
|
| 50 |
+
"antispoofing",
|
| 51 |
+
model="your-username/df-arena-1b-antispoofing",
|
| 52 |
+
trust_remote_code=True
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
# From numpy array
|
| 56 |
+
audio, sr = librosa.load("audio.wav", sr=16000)
|
| 57 |
+
result = pipe(audio)
|
| 58 |
+
|
| 59 |
+
# Get all scores
|
| 60 |
+
result = pipe("audio.wav", return_all_scores=True)
|
| 61 |
+
# Output: [
|
| 62 |
+
# {'label': 'bonafide', 'score': 0.95},
|
| 63 |
+
# {'label': 'spoof', 'score': 0.05}
|
| 64 |
+
# ]
|
| 65 |
+
|
| 66 |
+
# Batch processing
|
| 67 |
+
results = pipe(["audio1.wav", "audio2.wav", "audio3.wav"])
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
## Model Details
|
| 71 |
+
|
| 72 |
+
- **Backbone:** Wav2Vec2-XLS-R-1B
|
| 73 |
+
- **Input:** 16kHz audio (mono)
|
| 74 |
+
- **Output:** Binary classification (bonafide/spoof)
|
| 75 |
+
- **Parameters:** ~1B
|
| 76 |
+
|
| 77 |
+
## Training Data
|
| 78 |
+
|
| 79 |
+
[Describe your training dataset here]
|
| 80 |
+
|
| 81 |
+
## Performance
|
| 82 |
+
|
| 83 |
+
[Add performance metrics here]
|
| 84 |
+
|
| 85 |
+
| Dataset | Accuracy | EER |
|
| 86 |
+
|---------|----------|-----|
|
| 87 |
+
| Test Set | XX% | XX% |
|
| 88 |
+
|
| 89 |
+
## Limitations
|
| 90 |
+
|
| 91 |
+
- Optimized for speech audio (may not work well on music or environmental sounds)
|
| 92 |
+
- Performance may vary with very low quality audio
|
| 93 |
+
- Requires 16kHz sampling rate for best results
|
| 94 |
+
|
| 95 |
+
## Citation
|
| 96 |
+
```bibtex
|
| 97 |
+
@misc{df-arena-1b-2024,
|
| 98 |
+
author = {Your Name},
|
| 99 |
+
title = {DF Arena 1B: Audio Antispoofing Model},
|
| 100 |
+
year = {2024},
|
| 101 |
+
publisher = {HuggingFace},
|
| 102 |
+
url = {https://huggingface.co/your-username/df-arena-1b-antispoofing}
|
| 103 |
+
}
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
## License
|
| 107 |
+
|
| 108 |
+
MIT License
|
| 109 |
+
|
| 110 |
+
## Contact
|
| 111 |
+
|
| 112 |
+
For questions or issues, please open an issue on the model repository.
|
backbone.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torch import Tensor
|
| 6 |
+
from transformers import Wav2Vec2Model, Wav2Vec2Config
|
| 7 |
+
from conformer import FinalConformer
|
| 8 |
+
|
| 9 |
+
class DF_Arena_1B(nn.Module):
    """Wav2Vec2 (XLS-R 1B config) encoder followed by a conformer classifier.

    Every hidden state returned by the encoder is attention-pooled over its
    second axis, turned into a per-layer sigmoid gate, and used to form a
    gated sum of the layers. The fused sequence is normalised and passed to
    ``FinalConformer``, whose first return value is the model output.
    """

    def __init__(self):
        super().__init__()
        # The encoder is instantiated from the config object only; this
        # constructor does not call ``from_pretrained`` on the model itself.
        self.ssl_model = Wav2Vec2Model(Wav2Vec2Config.from_pretrained("facebook/wav2vec2-xls-r-1b"))
        self.ssl_model.config.output_hidden_states = True
        self.first_bn = nn.BatchNorm2d(num_features=1)
        self.selu = nn.SELU(inplace=True)
        self.fc0 = nn.Linear(1280, 1)  # 1280 for 1b, 1920 for 2b
        self.sig = nn.Sigmoid()

        self.conformer = FinalConformer(emb_size=1280, heads=4, ffmult=4, exp_fac=2, kernel_size=31, n_encoders=4)

        # Learnable attention weights used for pooling each hidden state.
        self.attn_scores = nn.Linear(1280, 1, bias=False)

    def get_attenF1Dpooling(self, x):
        """Softmax-weighted sum of ``x`` over dim 1.

        ``attn_scores`` is applied to the last axis, so ``x`` is expected to
        be (B, T, D) with D == 1280; the result is (B, 1, D).
        """
        logits = self.attn_scores(x)
        weights = torch.softmax(logits, dim=1)  # (B, T, 1)
        pooled = torch.sum(weights * x, dim=1, keepdim=True)  # (B, 1, D)
        return pooled

    def get_attenF1D(self, layerResult):
        """Pool every layer and stack the un-pooled layers.

        Args:
            layerResult: iterable of per-layer tensors, each (B, T, D).

        Returns:
            ``(pooled, full)`` where ``pooled`` concatenates the pooled
            layers along dim 1 -> (B, L, D) and ``full`` stacks the raw
            layers along a new dim 1 -> (B, L, T, D).
        """
        layers = list(layerResult)
        pooled = torch.cat([self.get_attenF1Dpooling(layer) for layer in layers], dim=1)  # (B, L, D)
        fullfeature = torch.stack(layers, dim=1)  # (B, L, T, D)
        return pooled, fullfeature

    def forward(self, x):
        """Classify a waveform.

        Args:
            x: either a single waveform of shape (time,) or a batch of shape
                (batch, time). A missing batch axis is added; an existing one
                is left untouched (the original code unsqueezed
                unconditionally, which turned batched input into a 3-D
                tensor).

        Returns:
            The first output of ``self.conformer``.
        """
        if x.dim() == 1:
            x = x.unsqueeze(0)
        out_ssl = self.ssl_model(x)

        y0, fullfeature = self.get_attenF1D(out_ssl.hidden_states)
        y0 = self.fc0(y0)  # (B, L, 1)
        y0 = self.sig(y0)
        # Reshape the gate to (B, L, 1, 1) so it broadcasts over (T, D).
        y0 = y0.view(y0.shape[0], y0.shape[1], y0.shape[2], -1)
        fullfeature = fullfeature * y0
        fullfeature = torch.sum(fullfeature, 1)  # gated sum over layers
        # BatchNorm2d needs a channel axis: add it, normalise, drop it again.
        fullfeature = fullfeature.unsqueeze(dim=1)
        fullfeature = self.first_bn(fullfeature)
        fullfeature = self.selu(fullfeature)

        output, _ = self.conformer(fullfeature.squeeze(1))

        return output
|
config.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": ["DF-Arena-1B-V0.1"],
|
| 3 |
+
"model_type": "antispoofing",
|
| 4 |
+
|
| 5 |
+
"num_labels": 2,
|
| 6 |
+
"id2label": {
|
| 7 |
+
"1": "bonafide",
|
| 8 |
+
"0": "spoof"
|
| 9 |
+
},
|
| 10 |
+
"label2id": {
|
| 11 |
+
"bonafide": 1,
|
| 12 |
+
"spoof": 0
|
| 13 |
+
},
|
| 14 |
+
|
| 15 |
+
"auto_map": {
|
| 16 |
+
"AutoConfig": "configuration_antispoofing.DF_Arena_1B_Config",
|
| 17 |
+
"AutoModel": "modeling_antispoofing.DF_Arena_1B_Antispoofing",
|
| 18 |
+
"AutoFeatureExtractor": "feature_extraction_antispoofing.AntispoofingFeatureExtractor"
|
| 19 |
+
},
|
| 20 |
+
"custom_pipelines": {
|
| 21 |
+
"antispoofing": {
|
| 22 |
+
"impl": "pipeline_antispoofing.AntispoofingPipeline",
|
| 23 |
+
"pt": ["AutoModel"]
|
| 24 |
+
}
|
| 25 |
+
}
|
| 26 |
+
}
|
configuration_antispoofing.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import PretrainedConfig
|
| 2 |
+
|
| 3 |
+
class DF_Arena_1B_Config(PretrainedConfig):
    """Configuration for the DF Arena 1B antispoofing model.

    Args:
        num_labels: number of output classes.
        sample_rate: sampling rate, in Hz, stored on the config.
        out_dim: value stored as ``self.out_dim``. It used to be fixed at
            1024 and assigned after ``super().__init__``, which overwrote any
            value supplied through ``kwargs``; it is now an ordinary
            parameter with the same default. No code visible alongside this
            class reads it.
        **kwargs: forwarded unchanged to ``PretrainedConfig``.
    """

    model_type = "antispoofing"

    def __init__(self, num_labels=2, sample_rate=16000, out_dim=1024, **kwargs):
        super().__init__(**kwargs)
        self.num_labels = num_labels
        self.sample_rate = sample_rate
        self.out_dim = out_dim
|
conformer.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import torch
|
| 3 |
+
from torch import nn, einsum
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
from torch.nn.modules.transformer import _get_clones
|
| 8 |
+
from torch import Tensor
|
| 9 |
+
from einops import rearrange
|
| 10 |
+
from einops.layers.torch import Rearrange
|
| 11 |
+
|
| 12 |
+
# helper functions
|
| 13 |
+
|
| 14 |
+
def exists(val):
    """Return True for every value except ``None``."""
    if val is None:
        return False
    return True
|
| 16 |
+
|
| 17 |
+
def default(val, d):
    """Return ``val`` unless it is ``None``, in which case return ``d``."""
    if val is None:
        return d
    return val
|
| 19 |
+
|
| 20 |
+
def calc_same_padding(kernel_size):
    """Return ``(left, right)`` padding that keeps a stride-1 conv length-preserving.

    The left side always gets ``kernel_size // 2``; for an even kernel the
    right side gets one element less.
    """
    left = kernel_size // 2
    is_even = (kernel_size + 1) % 2
    right = left - is_even
    return (left, right)
|
| 23 |
+
|
| 24 |
+
# helper classes
|
| 25 |
+
|
| 26 |
+
class Swish(nn.Module):
    """Element-wise ``x * sigmoid(x)`` activation."""

    def forward(self, x):
        gate = x.sigmoid()
        return x * gate
|
| 29 |
+
|
| 30 |
+
class GLU(nn.Module):
    """Gated linear unit: split the input in two along ``dim`` and gate the first half.

    Args:
        dim: axis along which the input is split into value and gate halves.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        halves = x.chunk(2, dim=self.dim)
        value = halves[0]
        gate = halves[1]
        return value * gate.sigmoid()
|
| 38 |
+
|
| 39 |
+
class DepthWiseConv1d(nn.Module):
    """1-D convolution with ``groups = chan_in`` applied after explicit padding.

    Args:
        chan_in: number of input channels (also the group count).
        chan_out: number of output channels.
        kernel_size: convolution kernel width.
        padding: ``(left, right)`` padding handed to ``F.pad`` on the last axis.
    """

    def __init__(self, chan_in, chan_out, kernel_size, padding):
        super().__init__()
        self.padding = padding
        self.conv = nn.Conv1d(
            in_channels=chan_in,
            out_channels=chan_out,
            kernel_size=kernel_size,
            groups=chan_in,
        )

    def forward(self, x):
        padded = F.pad(x, self.padding)
        return self.conv(padded)
|
| 48 |
+
|
| 49 |
+
# attention, feedforward, and conv module
|
| 50 |
+
|
| 51 |
+
class Scale(nn.Module):
    """Wrap a callable and multiply its output by a constant factor.

    Args:
        scale: multiplier applied to the wrapped output.
        fn: module or callable invoked as ``fn(x, **kwargs)``.
    """

    def __init__(self, scale, fn):
        super().__init__()
        self.fn = fn
        self.scale = scale

    def forward(self, x, **kwargs):
        inner = self.fn(x, **kwargs)
        return inner * self.scale
|
| 59 |
+
|
| 60 |
+
class PreNorm(nn.Module):
    """Apply ``LayerNorm(dim)`` to the input, then call the wrapped callable.

    Args:
        dim: size of the normalised (last) axis.
        fn: module or callable invoked as ``fn(normed, **kwargs)``.
    """

    def __init__(self, dim, fn):
        super().__init__()
        self.fn = fn
        self.norm = nn.LayerNorm(dim)

    def forward(self, x, **kwargs):
        normed = self.norm(x)
        return self.fn(normed, **kwargs)
|
| 69 |
+
|
| 70 |
+
class Attention(nn.Module):
    """Multi-head self-attention with head tokens.

    Head Token attention: https://arxiv.org/pdf/2210.05958.pdf

    One extra token per head is derived from the input, appended to the
    sequence, attended over together with it, and finally folded back into
    the first (class) token.

    Args:
        dim: feature size of the input tokens.
        heads: number of attention heads.
        dim_head: per-head feature size. ``heads * dim_head`` must equal
            ``dim``: ``forward`` reshapes with ``dim // heads`` and ``proj``
            maps ``heads * dim_head`` features back to ``dim``.
        qkv_bias: whether the fused qkv projection has a bias.
        dropout: rate for ``attn_drop``. NOTE: ``attn_drop`` is constructed
            but its application in ``forward`` is commented out, so this
            value currently has no effect.
        proj_drop: dropout rate applied to the returned tokens.

    Raises:
        ValueError: if ``heads * dim_head != dim``.
    """

    def __init__(self, dim, heads=8, dim_head=64, qkv_bias=False, dropout=0., proj_drop=0.):
        super().__init__()
        inner_dim = dim_head * heads
        if inner_dim != dim:
            # Fail early with a readable message instead of an opaque
            # reshape / matmul shape error on the first forward pass.
            raise ValueError(
                f"heads * dim_head must equal dim, got {heads} * {dim_head} = {inner_dim} != {dim}"
            )
        self.num_heads = heads
        self.scale = dim_head ** -0.5

        self.qkv = nn.Linear(dim, inner_dim * 3, bias=qkv_bias)

        self.attn_drop = nn.Dropout(dropout)
        self.proj = nn.Linear(inner_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.act = nn.GELU()
        self.ht_proj = nn.Linear(dim_head, dim, bias=True)
        self.ht_norm = nn.LayerNorm(dim_head)
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_heads, dim))

    def forward(self, x, mask=None):
        """Attend over ``x`` (B, N, C) whose first token is the class token.

        Args:
            x: input tokens, shape (B, N, C).
            mask: accepted for interface compatibility; not used.

        Returns:
            ``(tokens, attn)`` with ``tokens`` of shape (B, N, C) and
            ``attn`` the softmax weights of shape (B, heads, N + heads,
            N + heads).
        """
        B, N, C = x.shape
        head_dim = C // self.num_heads

        # head token
        head_pos = self.pos_embed.expand(B, -1, -1)
        x_ = x.reshape(B, -1, self.num_heads, head_dim).permute(0, 2, 1, 3)  # (B, h, N, C//h)
        x_ = x_.mean(dim=2)  # (B, h, C//h): average over the sequence axis
        x_ = self.ht_proj(x_).reshape(B, -1, self.num_heads, head_dim)  # (B, h, h, C//h)
        x_ = self.act(self.ht_norm(x_)).flatten(2)  # (B, h, C)
        x_ = x_ + head_pos
        x = torch.cat([x, x_], dim=1)  # (B, N + h, C)

        # normal mhsa
        qkv = self.qkv(x).reshape(B, N + self.num_heads, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        # attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N + self.num_heads, C)
        x = self.proj(x)

        # merge head tokens into cls token
        cls, patch, ht = torch.split(x, [1, N - 1, self.num_heads], dim=1)
        cls = cls + torch.mean(ht, dim=1, keepdim=True) + torch.mean(patch, dim=1, keepdim=True)
        x = torch.cat([cls, patch], dim=1)

        x = self.proj_drop(x)

        return x, attn
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class FeedForward(nn.Module):
    """Position-wise feed-forward net: Linear -> Swish -> Dropout -> Linear -> Dropout.

    Args:
        dim: input and output feature size.
        mult: expansion factor of the hidden layer (``dim * mult`` units).
        dropout: dropout rate used after the activation and after the output.
    """

    def __init__(self, dim, mult=4, dropout=0.):
        super().__init__()
        hidden_dim = dim * mult
        stages = [
            nn.Linear(dim, hidden_dim),
            Swish(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        out = self.net(x)
        return out
|
| 140 |
+
|
| 141 |
+
class ConformerConvModule(nn.Module):
    """Conformer convolution module operating on (batch, seq, channels) input.

    Pipeline: LayerNorm -> pointwise conv (to ``2 * inner_dim``) -> GLU ->
    depthwise conv -> BatchNorm (skipped when causal) -> Swish -> pointwise
    conv back to ``dim`` -> Dropout.

    Args:
        dim: channel size of the input and output.
        causal: if True, pad only on the left by ``kernel_size - 1`` and
            replace the batch norm with an identity.
        expansion_factor: ``inner_dim = dim * expansion_factor``.
        kernel_size: width of the depthwise convolution.
        dropout: dropout rate applied to the output.
    """

    def __init__(self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.):
        super().__init__()

        inner_dim = dim * expansion_factor
        if causal:
            padding = (kernel_size - 1, 0)
        else:
            padding = calc_same_padding(kernel_size)

        to_channels_first = Rearrange('b n c -> b c n')
        to_channels_last = Rearrange('b c n -> b n c')

        stages = [
            nn.LayerNorm(dim),
            to_channels_first,
            nn.Conv1d(dim, inner_dim * 2, 1),
            GLU(dim=1),
            DepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding),
            nn.Identity() if causal else nn.BatchNorm1d(inner_dim),
            Swish(),
            nn.Conv1d(inner_dim, dim, 1),
            to_channels_last,
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        out = self.net(x)
        return out
|
| 170 |
+
|
| 171 |
+
# Conformer Block
|
| 172 |
+
|
| 173 |
+
class ConformerBlock(nn.Module):
    """One conformer block: half-step FF, attention, conv module, half-step FF, LayerNorm.

    Both feed-forward branches are pre-normed and scaled by 0.5; the
    attention branch is pre-normed. Every branch is added back to its input.

    Args:
        dim: feature size of the tokens.
        dim_head: per-head size passed to ``Attention``.
        heads: number of attention heads.
        ff_mult: hidden expansion factor of both feed-forward modules.
        conv_expansion_factor: expansion factor of the conv module.
        conv_kernel_size: depthwise kernel width of the conv module.
        attn_dropout: ``dropout`` argument of ``Attention``.
        ff_dropout: dropout rate of both feed-forward modules.
        conv_dropout: dropout rate of the conv module.
        conv_causal: whether the conv module is causal.
    """

    def __init__(
        self,
        *,
        dim,
        dim_head=64,
        heads=8,
        ff_mult=4,
        conv_expansion_factor=2,
        conv_kernel_size=31,
        attn_dropout=0.,
        ff_dropout=0.,
        conv_dropout=0.,
        conv_causal=False
    ):
        super().__init__()
        # Sub-modules are created in the order ff1, attn, conv, ff2.
        first_ff = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)
        attention = Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout)
        conv_module = ConformerConvModule(
            dim=dim,
            causal=conv_causal,
            expansion_factor=conv_expansion_factor,
            kernel_size=conv_kernel_size,
            dropout=conv_dropout,
        )
        second_ff = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)

        self.ff1 = Scale(0.5, PreNorm(dim, first_ff))
        self.attn = PreNorm(dim, attention)
        self.conv = conv_module
        self.ff2 = Scale(0.5, PreNorm(dim, second_ff))

        self.post_norm = nn.LayerNorm(dim)

    def forward(self, x, mask=None):
        """Return ``(tokens, attn_weight)`` for input tokens ``x``."""
        x = x + self.ff1(x)
        attended, attn_weight = self.attn(x, mask=mask)
        x = x + attended
        x = x + self.conv(x)
        x = x + self.ff2(x)
        return self.post_norm(x), attn_weight
|
| 208 |
+
|
| 209 |
+
# Conformer
|
| 210 |
+
|
| 211 |
+
class Conformer(nn.Module):
    """Stack of ``depth`` independently initialised ``ConformerBlock`` layers.

    Args:
        dim: feature size of the tokens.
        depth: number of blocks.
        dim_head, heads, ff_mult, conv_expansion_factor, conv_kernel_size,
        conv_causal: forwarded to every ``ConformerBlock``.
        attn_dropout, ff_dropout, conv_dropout: forwarded to every block
            (previously accepted here but silently dropped).
    """

    def __init__(
        self,
        dim,
        *,
        depth,
        dim_head=64,
        heads=8,
        ff_mult=4,
        conv_expansion_factor=2,
        conv_kernel_size=31,
        attn_dropout=0.,
        ff_dropout=0.,
        conv_dropout=0.,
        conv_causal=False
    ):
        super().__init__()
        self.dim = dim
        self.layers = nn.ModuleList(
            ConformerBlock(
                dim=dim,
                dim_head=dim_head,
                heads=heads,
                ff_mult=ff_mult,
                conv_expansion_factor=conv_expansion_factor,
                conv_kernel_size=conv_kernel_size,
                attn_dropout=attn_dropout,
                ff_dropout=ff_dropout,
                conv_dropout=conv_dropout,
                conv_causal=conv_causal,
            )
            for _ in range(depth)
        )

    def forward(self, x):
        """Run ``x`` through every block and return the final tokens.

        Each block returns ``(tokens, attn_weight)``; only the tokens are fed
        to the next block. The earlier ``x = block(x)`` passed the whole
        tuple on, which failed as soon as ``depth`` exceeded one.
        """
        for block in self.layers:
            x, _ = block(x)

        return x
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def sinusoidal_embedding(n_channels, dim):
    """Build a fixed sinusoidal position table of shape (1, n_channels, dim).

    Entry ``[0, p, i]`` is ``sin(p / 10000 ** (2 * (i // 2) / dim))`` for even
    ``i`` and the cosine of the same angle for odd ``i``.

    The table is computed with tensor operations instead of a nested Python
    loop over every element, which matters for the 10000 x 1280 table built
    by ``FinalConformer``. Angles are formed in float64 and then cast to
    float32 before the sin/cos, as in the loop-based version.

    Args:
        n_channels: number of positions (rows).
        dim: embedding size (columns).

    Returns:
        float32 tensor of shape (1, n_channels, dim).
    """
    positions = torch.arange(n_channels, dtype=torch.float64).unsqueeze(1)  # (n, 1)
    # [0, 0, 2, 2, 4, 4, ...] truncated to ``dim`` entries == 2 * (i // 2).
    paired_index = torch.arange(0, dim, 2).repeat_interleave(2)[:dim].to(torch.float64)
    base = torch.tensor(10000.0, dtype=torch.float64)
    denominators = torch.pow(base, paired_index / dim)  # (dim,)
    pe = (positions / denominators).to(torch.float32)  # (n, dim)
    pe[:, 0::2] = torch.sin(pe[:, 0::2])
    pe[:, 1::2] = torch.cos(pe[:, 1::2])
    return pe.unsqueeze(0)
|
| 258 |
+
|
| 259 |
+
class FinalConformer(nn.Module):
    """Conformer encoder with a learnable class token and a 2-way linear head.

    Args:
        emb_size: feature size of the input tokens.
        heads: attention heads per block; ``dim_head`` is ``emb_size / heads``
            truncated to an int.
        ffmult: feed-forward expansion factor of each block.
        exp_fac: conv-module expansion factor of each block.
        kernel_size: depthwise kernel width of each block.
        n_encoders: number of blocks. They are produced with ``_get_clones``
            from a single ``ConformerBlock`` instance.
    """

    def __init__(self, emb_size=128, heads=4, ffmult=4, exp_fac=2, kernel_size=16, n_encoders=1):
        super(FinalConformer, self).__init__()
        self.dim_head = int(emb_size / heads)
        self.dim = emb_size
        self.heads = heads
        self.kernel_size = kernel_size
        self.n_encoders = n_encoders
        # Fixed (non-trainable) table covering up to 10000 positions.
        self.positional_emb = nn.Parameter(sinusoidal_embedding(10000, emb_size), requires_grad=False)
        self.encoder_blocks = _get_clones(
            ConformerBlock(
                dim=emb_size,
                dim_head=self.dim_head,
                heads=heads,
                ff_mult=ffmult,
                conv_expansion_factor=exp_fac,
                conv_kernel_size=kernel_size,
            ),
            n_encoders,
        )
        self.class_token = nn.Parameter(torch.rand(1, emb_size))
        self.fc5 = nn.Linear(emb_size, 2)

    def forward(self, x):
        """Classify a batch of token sequences.

        Args:
            x: tensor of shape (bs, time, emb_size).

        Returns:
            ``(out, list_attn_weight)``: logits of shape (bs, 2) read from
            the class-token position, and the attention weights returned by
            each block in order.
        """
        x = x + self.positional_emb[:, :x.size(1), :]
        # Prepend the class token to every sequence with a single cat instead
        # of a per-sample vstack loop; this also tolerates an empty batch.
        cls_tokens = self.class_token.unsqueeze(0).expand(x.size(0), -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)  # [bs, 1 + time, emb_size]
        list_attn_weight = []
        for layer in self.encoder_blocks:
            x, attn_weight = layer(x)  # [bs, 1 + time, emb_size]
            list_attn_weight.append(attn_weight)
        embedding = x[:, 0, :]  # [bs, emb_size]
        out = self.fc5(embedding)  # [bs, 2]
        return out, list_attn_weight
|
| 284 |
+
|
feature_extraction_antispoofing.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import SequenceFeatureExtractor
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
class AntispoofingFeatureExtractor(SequenceFeatureExtractor):
    """Turn a mono waveform into a fixed-length float32 tensor.

    Short waveforms are repeated cyclically until they reach ``max_length``
    samples; longer ones are truncated to their first ``max_length`` samples.

    Args:
        feature_size, sampling_rate, padding_value, **kwargs: forwarded to
            ``SequenceFeatureExtractor``.
        return_attention_mask: stored on the instance; ``__call__`` does not
            produce an attention mask.
        max_length: number of samples every output has. Defaults to 64600,
            the value that used to be hard-coded in ``__call__``.
    """

    def __init__(
        self,
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        return_attention_mask=True,
        max_length=64600,
        **kwargs
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            **kwargs
        )
        self.return_attention_mask = return_attention_mask
        self.max_length = max_length

    def __call__(self, audio, sampling_rate=None, return_tensors=True, **kwargs):
        """Return ``{"input_values": tensor}`` with a 1-D float32 tensor.

        Args:
            audio: 1-D waveform (array-like). Arrays whose extra axes all
                have length one, e.g. (1, T), are squeezed to 1-D.
            sampling_rate: optional sampling rate of ``audio``. When given it
                must equal the rate this extractor was configured with.
            return_tensors: accepted for interface compatibility; a torch
                tensor is always returned.

        Raises:
            ValueError: if ``sampling_rate`` is given and differs from the
                configured rate, or if ``audio`` is empty or not mono.
        """
        # NOTE(review): relies on the base class storing ``sampling_rate`` as
        # an attribute; fall back to "no check" if it is absent.
        expected_rate = getattr(self, "sampling_rate", None)
        if sampling_rate is not None and expected_rate is not None and sampling_rate != expected_rate:
            raise ValueError(
                f"audio has sampling rate {sampling_rate} but this feature extractor "
                f"expects {expected_rate}; resample the audio first"
            )
        # ``max_length`` may be missing on instances restored without running
        # this ``__init__``; keep the historical default in that case.
        target_len = getattr(self, "max_length", 64600)
        audio = self.pad(audio, target_len)
        # torch.tensor copies, so the result never aliases the caller's array.
        return {"input_values": torch.tensor(audio, dtype=torch.float32)}

    def pad(self, x, max_len):
        """Cyclically repeat or truncate ``x`` to exactly ``max_len`` samples.

        NOTE(review): this shares its name with the base class's batch
        padding method but has a different signature.

        Args:
            x: 1-D waveform (array-like).
            max_len: required number of samples.

        Returns:
            1-D ``numpy.ndarray`` of length ``max_len``.

        Raises:
            ValueError: if ``x`` is empty or cannot be reduced to one axis.
        """
        x = np.asarray(x)
        if x.ndim != 1:
            x = np.squeeze(x)
        if x.ndim != 1:
            raise ValueError(f"expected a mono 1-D waveform, got an array with shape {x.shape}")
        if x.size == 0:
            raise ValueError("cannot pad an empty waveform")
        if x.shape[0] >= max_len:
            return x[:max_len]
        # np.resize fills the larger array with repeated copies of ``x``,
        # i.e. the same result as tiling and truncating.
        return np.resize(x, max_len)
|
modeling_antispoofing.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from transformers import PreTrainedModel
|
| 4 |
+
from .configuration_antispoofing import DF_Arena_1B_Config
|
| 5 |
+
from .backbone import DF_Arena_1B
|
| 6 |
+
from .feature_extraction_antispoofing import AntispoofingFeatureExtractor
|
| 7 |
+
|
| 8 |
+
class DF_Arena_1B_Antispoofing(PreTrainedModel):
    """``PreTrainedModel`` wrapper around the ``DF_Arena_1B`` backbone.

    The wrapper adds no layers of its own: ``forward`` hands the waveform to
    the backbone and returns its output under the ``"logits"`` key.
    """

    config_class = DF_Arena_1B_Config

    def __init__(self, config: DF_Arena_1B_Config):
        super().__init__(config)
        # Kept on the model as a convenience; ``forward`` does not use it.
        self.feature_extractor = AntispoofingFeatureExtractor()
        # Backbone that maps a raw waveform to the class logits.
        self.backbone = DF_Arena_1B()
        self.post_init()

    def forward(self, input_values, attention_mask=None):
        """Return ``{"logits": ...}`` for the given waveform.

        Args:
            input_values: float waveform tensor, passed to the backbone
                unchanged.
                NOTE(review): the backbone is responsible for adding the
                batch axis -- confirm whether batched (batch, time) input is
                supported by the backbone version in use.
            attention_mask: accepted for interface compatibility; not used.
        """
        logits = self.backbone(input_values)
        return {"logits": logits}
|
pipeline_antispoofing.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import Pipeline
|
| 2 |
+
import torch
|
| 3 |
+
from .feature_extraction_antispoofing import AntispoofingFeatureExtractor
|
| 4 |
+
class AntispoofingPipeline(Pipeline):
    """Pipeline that classifies one waveform as bonafide or spoof.

    ``preprocess`` runs ``AntispoofingFeatureExtractor``, ``_forward`` calls
    the model, and ``postprocess`` turns the logits into a label, a
    confidence score and per-class probabilities.
    """

    def __init__(self, model, **kwargs):
        super().__init__(model=model, **kwargs)
        # Always use the project's extractor, whatever the base class set up.
        self.feature_extractor = AntispoofingFeatureExtractor()

    def _sanitize_parameters(self, **kwargs):
        """Route ``sampling_rate`` to ``preprocess``; other kwargs are ignored."""
        preprocess_kwargs = {}
        postprocess_kwargs = {}

        if "sampling_rate" in kwargs:
            preprocess_kwargs["sampling_rate"] = kwargs["sampling_rate"]

        return preprocess_kwargs, {}, postprocess_kwargs

    def preprocess(self, audio, sampling_rate=16000):
        """Convert a waveform into model inputs.

        Args:
            audio: 1-D waveform array. File paths are not decoded here.
            sampling_rate: sampling rate of ``audio``; forwarded to the
                feature extractor (it used to be accepted and then dropped).

        Raises:
            TypeError: if ``audio`` is a ``str`` or ``bytes`` object, since
                the feature extractor needs sample values, not a path.
        """
        if isinstance(audio, (str, bytes)):
            raise TypeError(
                "AntispoofingPipeline expects a 1-D waveform array, not a file path or raw bytes; "
                "load the audio into an array first"
            )
        features = self.feature_extractor(audio, sampling_rate=sampling_rate)
        inputs = {"input_values": features["input_values"]}

        return inputs

    def _forward(self, model_inputs):
        """Run the model on the preprocessed inputs."""
        outputs = self.model(**model_inputs)
        return outputs

    def postprocess(self, model_outputs):
        """Build the result dict from the model's ``logits``.

        Returns:
            dict with ``label`` (name of the arg-max class), ``logits`` (as
            nested lists), ``score`` (probability of that class) and
            ``all_scores`` (label -> probability for the first row).
        """
        logits = model_outputs['logits']
        probs = torch.nn.functional.softmax(logits, dim=-1)
        # ``.item()`` requires a single prediction, i.e. one input at a time.
        predicted_class = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][predicted_class].item()
        id2label = self.model.config.id2label

        return {
            "label": id2label[predicted_class],
            "logits": logits.tolist(),
            "score": confidence,
            "all_scores": {
                id2label[i]: probs[0][i].item()
                for i in range(len(probs[0]))
            }
        }
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"feature_extractor_type": "AntispoofingFeatureExtractor",
|
| 3 |
+
"processor_class": "feature_extraction_antispoofing.AntispoofingFeatureExtractor"
|
| 4 |
+
}
|
pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:780bc14fd4c15e65d58efdef728427cf03cd29cd60be528e97badf8c89087988
|
| 3 |
+
size 4591794734
|