Speech-Arena-2025 committed
Commit 86a2cc3 · 1 Parent(s): cf6e216

initial commit

.gitignore ADDED
File without changes
Readme.md ADDED
@@ -0,0 +1,112 @@
+ ---
+ language:
+ - en
+ tags:
+ - audio
+ - audio-classification
+ - antispoofing
+ - deepfake-detection
+ - speech
+ license: mit
+ pipeline_tag: audio-classification
+ ---
+
+ # DF Arena 1B - Audio Antispoofing Model
+
+ A deep learning model for detecting spoofed (fake/synthesized) audio vs. bonafide (genuine) speech.
+
+ ## Model Description
+
+ This model uses Wav2Vec2-XLS-R-1B as a backbone for audio antispoofing detection. It classifies audio samples as either bonafide (real human speech) or spoofed (synthetic/manipulated audio).
+
+ **Key Features:**
+ - Detects AI-generated speech and voice cloning
+ - Works with various audio formats
+ - Robust against a range of spoofing attacks
+
+ ## Usage
+ ```python
+ from transformers import pipeline
+
+ # Load the custom "antispoofing" pipeline registered in config.json
+ pipe = pipeline(
+     "antispoofing",
+     model="your-username/df-arena-1b-antispoofing",
+     trust_remote_code=True
+ )
+
+ # Predict from an audio file
+ result = pipe("audio.wav")
+ print(result)
+ # Output: {'label': 'bonafide', 'score': 0.95, 'logits': [...], 'all_scores': {...}}
+ ```
+
+ ### Advanced Usage
+ ```python
+ import librosa
+ from transformers import pipeline
+
+ pipe = pipeline(
+     "antispoofing",
+     model="your-username/df-arena-1b-antispoofing",
+     trust_remote_code=True
+ )
+
+ # From a numpy array
+ audio, sr = librosa.load("audio.wav", sr=16000)
+ result = pipe(audio)
+
+ # Per-class scores are included in every result
+ result = pipe("audio.wav")
+ print(result["all_scores"])
+ # Output: {'bonafide': 0.95, 'spoof': 0.05}
+
+ # Batch processing
+ results = pipe(["audio1.wav", "audio2.wav", "audio3.wav"])
+ ```
+
+ ## Model Details
+
+ - **Backbone:** Wav2Vec2-XLS-R-1B
+ - **Input:** 16 kHz audio (mono)
+ - **Output:** Binary classification (bonafide/spoof)
+ - **Parameters:** ~1B
+
+ ## Training Data
+
+ [Describe your training dataset here]
+
+ ## Performance
+
+ [Add performance metrics here]
+
+ | Dataset | Accuracy | EER |
+ |---------|----------|-----|
+ | Test Set | XX% | XX% |
+
+ ## Limitations
+
+ - Optimized for speech audio (may not work well on music or environmental sounds)
+ - Performance may vary on very low-quality audio
+ - Requires a 16 kHz sampling rate for best results
+
+ ## Citation
+ ```bibtex
+ @misc{df-arena-1b-2024,
+   author = {Your Name},
+   title = {DF Arena 1B: Audio Antispoofing Model},
+   year = {2024},
+   publisher = {HuggingFace},
+   url = {https://huggingface.co/your-username/df-arena-1b-antispoofing}
+ }
+ ```
+
+ ## License
+
+ MIT License
+
+ ## Contact
+
+ For questions or issues, please open an issue on the model repository.
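Beyond the pipeline, the `auto_map` in `config.json` (below) also exposes the model through the generic auto classes. A minimal direct-inference sketch, assuming the placeholder repo id above and a local `audio.wav`:

```python
import torch
import librosa
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "your-username/df-arena-1b-antispoofing",  # placeholder repo id
    trust_remote_code=True,
).eval()

audio, _ = librosa.load("audio.wav", sr=16000, mono=True)  # model expects 16 kHz mono
inputs = model.feature_extractor(audio)                    # crops/tiles to 64600 samples

with torch.no_grad():
    logits = model(**inputs)["logits"]
probs = torch.softmax(logits, dim=-1)[0]
print({model.config.id2label[i]: probs[i].item() for i in range(2)})
```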
backbone.py ADDED
@@ -0,0 +1,62 @@
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers import Wav2Vec2Model, Wav2Vec2Config
+ from conformer import FinalConformer
+
+ class DF_Arena_1B(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.ssl_model = Wav2Vec2Model(Wav2Vec2Config.from_pretrained("facebook/wav2vec2-xls-r-1b"))
+         self.ssl_model.config.output_hidden_states = True
+         self.first_bn = nn.BatchNorm2d(num_features=1)
+         self.selu = nn.SELU(inplace=True)
+         self.fc0 = nn.Linear(1280, 1)  # hidden size: 1280 for XLS-R-1B, 1920 for XLS-R-2B
+         self.sig = nn.Sigmoid()
+
+         self.conformer = FinalConformer(emb_size=1280, heads=4, ffmult=4, exp_fac=2, kernel_size=31, n_encoders=4)
+
+         # Learnable attention weights for time pooling
+         self.attn_scores = nn.Linear(1280, 1, bias=False)
+
+     def get_attenF1Dpooling(self, x):
+         # Attention pooling over the time axis: (B, T, D) -> (B, 1, D)
+         logits = self.attn_scores(x)
+         weights = torch.softmax(logits, dim=1)  # (B, T, 1)
+         pooled = torch.sum(weights * x, dim=1, keepdim=True)  # (B, 1, D)
+         return pooled
+
+     def get_attenF1D(self, layerResult):
+         poollayerResult = []
+         fullf = []
+         for layer in layerResult:
+             # layer shape: (B, T, D)
+             layery = self.get_attenF1Dpooling(layer)  # (B, 1, D)
+             poollayerResult.append(layery)
+             fullf.append(layer.unsqueeze(1))  # (B, 1, T, D)
+
+         layery = torch.cat(poollayerResult, dim=1)  # (B, L, D)
+         fullfeature = torch.cat(fullf, dim=1)  # (B, L, T, D)
+         return layery, fullfeature
+
+     def forward(self, x):
+         if x.dim() == 1:  # accept a single 1-D waveform as well as a (batch, time) tensor
+             x = x.unsqueeze(0)
+         out_ssl = self.ssl_model(x)  # hidden_states: one (B, T, D) tensor per layer
+         y0, fullfeature = self.get_attenF1D(out_ssl.hidden_states)
+         y0 = self.fc0(y0)  # per-layer scalar gate: (B, L, 1)
+         y0 = self.sig(y0)
+         y0 = y0.view(y0.shape[0], y0.shape[1], y0.shape[2], -1)  # (B, L, 1, 1)
+         fullfeature = fullfeature * y0        # weight each layer's features
+         fullfeature = torch.sum(fullfeature, 1)  # (B, T, D)
+         fullfeature = fullfeature.unsqueeze(dim=1)
+         fullfeature = self.first_bn(fullfeature)
+         fullfeature = self.selu(fullfeature)
+
+         output, _ = self.conformer(fullfeature.squeeze(1))
+
+         return output
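For orientation, a quick shape check of the backbone (a sketch: only the XLS-R config is fetched here, so the SSL weights are randomly initialized):

```python
import torch
from backbone import DF_Arena_1B

model = DF_Arena_1B().eval()
wav = torch.randn(64600)  # ~4 s at 16 kHz, the fixed length the feature extractor emits
with torch.no_grad():
    logits = model(wav)   # a 1-D waveform is treated as a batch of one
print(logits.shape)       # torch.Size([1, 2]): bonafide/spoof logits
```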
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "architectures": ["DF_Arena_1B_Antispoofing"],
+   "model_type": "antispoofing",
+
+   "num_labels": 2,
+   "id2label": {
+     "1": "bonafide",
+     "0": "spoof"
+   },
+   "label2id": {
+     "bonafide": 1,
+     "spoof": 0
+   },
+
+   "auto_map": {
+     "AutoConfig": "configuration_antispoofing.DF_Arena_1B_Config",
+     "AutoModel": "modeling_antispoofing.DF_Arena_1B_Antispoofing",
+     "AutoFeatureExtractor": "feature_extraction_antispoofing.AntispoofingFeatureExtractor"
+   },
+   "custom_pipelines": {
+     "antispoofing": {
+       "impl": "pipeline_antispoofing.AntispoofingPipeline",
+       "pt": ["AutoModel"]
+     }
+   }
+ }
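With `trust_remote_code=True`, the `auto_map` above routes the generic auto classes to this repo's modules. A small sketch of the config side alone (placeholder repo id; note that `PretrainedConfig` converts `id2label` keys to integers):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "your-username/df-arena-1b-antispoofing",  # placeholder repo id
    trust_remote_code=True,
)
print(type(config).__name__)  # DF_Arena_1B_Config
print(config.id2label)        # {1: 'bonafide', 0: 'spoof'}
print(config.sample_rate)     # 16000, the default from configuration_antispoofing.py
```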
configuration_antispoofing.py ADDED
@@ -0,0 +1,9 @@
+ from transformers import PretrainedConfig
+
+ class DF_Arena_1B_Config(PretrainedConfig):
+     model_type = "antispoofing"
+
+     def __init__(self, num_labels=2, sample_rate=16000, **kwargs):
+         super().__init__(**kwargs)
+         self.num_labels = num_labels
+         self.sample_rate = sample_rate
+         self.out_dim = 1024
conformer.py ADDED
@@ -0,0 +1,284 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn.modules.transformer import _get_clones
+ from einops.layers.torch import Rearrange
+
+ # helper functions
+
+ def exists(val):
+     return val is not None
+
+ def default(val, d):
+     return val if exists(val) else d
+
+ def calc_same_padding(kernel_size):
+     pad = kernel_size // 2
+     return (pad, pad - (kernel_size + 1) % 2)
+
+ # helper classes
+
+ class Swish(nn.Module):
+     def forward(self, x):
+         return x * x.sigmoid()
+
+ class GLU(nn.Module):
+     def __init__(self, dim):
+         super().__init__()
+         self.dim = dim
+
+     def forward(self, x):
+         out, gate = x.chunk(2, dim=self.dim)
+         return out * gate.sigmoid()
+
+ class DepthWiseConv1d(nn.Module):
+     def __init__(self, chan_in, chan_out, kernel_size, padding):
+         super().__init__()
+         self.padding = padding
+         self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)
+
+     def forward(self, x):
+         x = F.pad(x, self.padding)
+         return self.conv(x)
+
+ # attention, feedforward, and conv module
+
+ class Scale(nn.Module):
+     def __init__(self, scale, fn):
+         super().__init__()
+         self.fn = fn
+         self.scale = scale
+
+     def forward(self, x, **kwargs):
+         return self.fn(x, **kwargs) * self.scale
+
+ class PreNorm(nn.Module):
+     def __init__(self, dim, fn):
+         super().__init__()
+         self.fn = fn
+         self.norm = nn.LayerNorm(dim)
+
+     def forward(self, x, **kwargs):
+         x = self.norm(x)
+         return self.fn(x, **kwargs)
+
+ class Attention(nn.Module):
+     # Head Token attention: https://arxiv.org/pdf/2210.05958.pdf
+     def __init__(self, dim, heads=8, dim_head=64, qkv_bias=False, dropout=0., proj_drop=0.):
+         super().__init__()
+         self.num_heads = heads
+         inner_dim = dim_head * heads
+         self.scale = dim_head ** -0.5
+
+         self.qkv = nn.Linear(dim, inner_dim * 3, bias=qkv_bias)
+
+         self.attn_drop = nn.Dropout(dropout)
+         self.proj = nn.Linear(inner_dim, dim)
+         self.proj_drop = nn.Dropout(proj_drop)
+
+         self.act = nn.GELU()
+         self.ht_proj = nn.Linear(dim_head, dim, bias=True)
+         self.ht_norm = nn.LayerNorm(dim_head)
+         self.pos_embed = nn.Parameter(torch.zeros(1, self.num_heads, dim))
+
+     def forward(self, x, mask=None):
+         B, N, C = x.shape
+
+         # head tokens: one pooled token per attention head
+         head_pos = self.pos_embed.expand(x.shape[0], -1, -1)
+         x_ = x.reshape(B, -1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+         x_ = x_.mean(dim=2)  # (B, h, C//h)
+         x_ = self.ht_proj(x_).reshape(B, -1, self.num_heads, C // self.num_heads)
+         x_ = self.act(self.ht_norm(x_)).flatten(2)
+         x_ = x_ + head_pos
+         x = torch.cat([x, x_], dim=1)
+
+         # standard multi-head self-attention
+         qkv = self.qkv(x).reshape(B, N + self.num_heads, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+         q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+
+         attn = (q @ k.transpose(-2, -1)) * self.scale
+         attn = attn.softmax(dim=-1)
+
+         x = (attn @ v).transpose(1, 2).reshape(B, N + self.num_heads, C)
+         x = self.proj(x)
+
+         # merge head tokens into the cls token
+         cls, patch, ht = torch.split(x, [1, N - 1, self.num_heads], dim=1)
+         cls = cls + torch.mean(ht, dim=1, keepdim=True) + torch.mean(patch, dim=1, keepdim=True)
+         x = torch.cat([cls, patch], dim=1)
+
+         x = self.proj_drop(x)
+
+         return x, attn
+
+ class FeedForward(nn.Module):
+     def __init__(self, dim, mult=4, dropout=0.):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(dim, dim * mult),
+             Swish(),
+             nn.Dropout(dropout),
+             nn.Linear(dim * mult, dim),
+             nn.Dropout(dropout)
+         )
+
+     def forward(self, x):
+         return self.net(x)
+
+ class ConformerConvModule(nn.Module):
+     def __init__(self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.):
+         super().__init__()
+
+         inner_dim = dim * expansion_factor
+         padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)
+
+         self.net = nn.Sequential(
+             nn.LayerNorm(dim),
+             Rearrange('b n c -> b c n'),
+             nn.Conv1d(dim, inner_dim * 2, 1),
+             GLU(dim=1),
+             DepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding),
+             nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(),
+             Swish(),
+             nn.Conv1d(inner_dim, dim, 1),
+             Rearrange('b c n -> b n c'),
+             nn.Dropout(dropout)
+         )
+
+     def forward(self, x):
+         return self.net(x)
+
+ # Conformer block
+
+ class ConformerBlock(nn.Module):
+     def __init__(
+         self,
+         *,
+         dim,
+         dim_head=64,
+         heads=8,
+         ff_mult=4,
+         conv_expansion_factor=2,
+         conv_kernel_size=31,
+         attn_dropout=0.,
+         ff_dropout=0.,
+         conv_dropout=0.,
+         conv_causal=False
+     ):
+         super().__init__()
+         self.ff1 = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)
+         self.attn = Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout)
+         self.conv = ConformerConvModule(dim=dim, causal=conv_causal, expansion_factor=conv_expansion_factor, kernel_size=conv_kernel_size, dropout=conv_dropout)
+         self.ff2 = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)
+
+         self.attn = PreNorm(dim, self.attn)
+         self.ff1 = Scale(0.5, PreNorm(dim, self.ff1))
+         self.ff2 = Scale(0.5, PreNorm(dim, self.ff2))
+
+         self.post_norm = nn.LayerNorm(dim)
+
+     def forward(self, x, mask=None):
+         x = self.ff1(x) + x
+         attn_x, attn_weight = self.attn(x, mask=mask)
+         x = attn_x + x
+         x = self.conv(x) + x
+         x = self.ff2(x) + x
+         x = self.post_norm(x)
+         return x, attn_weight
+
+ # Conformer
+
+ class Conformer(nn.Module):
+     def __init__(
+         self,
+         dim,
+         *,
+         depth,
+         dim_head=64,
+         heads=8,
+         ff_mult=4,
+         conv_expansion_factor=2,
+         conv_kernel_size=31,
+         attn_dropout=0.,
+         ff_dropout=0.,
+         conv_dropout=0.,
+         conv_causal=False
+     ):
+         super().__init__()
+         self.dim = dim
+         self.layers = nn.ModuleList([])
+
+         for _ in range(depth):
+             self.layers.append(ConformerBlock(
+                 dim=dim,
+                 dim_head=dim_head,
+                 heads=heads,
+                 ff_mult=ff_mult,
+                 conv_expansion_factor=conv_expansion_factor,
+                 conv_kernel_size=conv_kernel_size,
+                 conv_causal=conv_causal
+             ))
+
+     def forward(self, x):
+         for block in self.layers:
+             x, _ = block(x)  # ConformerBlock returns (x, attn_weight)
+         return x
+
+ def sinusoidal_embedding(n_channels, dim):
+     pe = torch.FloatTensor([[p / (10000 ** (2 * (i // 2) / dim)) for i in range(dim)]
+                             for p in range(n_channels)])
+     pe[:, 0::2] = torch.sin(pe[:, 0::2])
+     pe[:, 1::2] = torch.cos(pe[:, 1::2])
+     return pe.unsqueeze(0)
+
+ class FinalConformer(nn.Module):
+     def __init__(self, emb_size=128, heads=4, ffmult=4, exp_fac=2, kernel_size=16, n_encoders=1):
+         super(FinalConformer, self).__init__()
+         self.dim_head = int(emb_size / heads)
+         self.dim = emb_size
+         self.heads = heads
+         self.kernel_size = kernel_size
+         self.n_encoders = n_encoders
+         self.positional_emb = nn.Parameter(sinusoidal_embedding(10000, emb_size), requires_grad=False)
+         self.encoder_blocks = _get_clones(ConformerBlock(dim=emb_size, dim_head=self.dim_head, heads=heads,
+                                           ff_mult=ffmult, conv_expansion_factor=exp_fac, conv_kernel_size=kernel_size),
+                                           n_encoders)
+         self.class_token = nn.Parameter(torch.rand(1, emb_size))
+         self.fc5 = nn.Linear(emb_size, 2)
+
+     def forward(self, x):  # x: (bs, time, feature)
+         x = x + self.positional_emb[:, :x.size(1), :]
+         # prepend the class token to every sequence: (bs, 1 + time, emb_size)
+         x = torch.stack([torch.vstack((self.class_token, x[i])) for i in range(len(x))])
+         list_attn_weight = []
+         for layer in self.encoder_blocks:
+             x, attn_weight = layer(x)  # (bs, 1 + time, emb_size)
+             list_attn_weight.append(attn_weight)
+         embedding = x[:, 0, :]  # (bs, emb_size)
+         out = self.fc5(embedding)  # (bs, 2)
+         return out, list_attn_weight
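A standalone sanity check of `FinalConformer` with the same hyperparameters the backbone passes in; the 201-frame length is an assumption mirroring ~4 s of XLS-R features:

```python
import torch
from conformer import FinalConformer

conf = FinalConformer(emb_size=1280, heads=4, ffmult=4,
                      exp_fac=2, kernel_size=31, n_encoders=4).eval()
x = torch.randn(2, 201, 1280)  # (batch, time, feature)
with torch.no_grad():
    out, attn = conf(x)
print(out.shape)  # torch.Size([2, 2]): class-token logits from fc5
print(len(attn))  # 4: one attention map per encoder block
```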
feature_extraction_antispoofing.py ADDED
@@ -0,0 +1,36 @@
+ from transformers import SequenceFeatureExtractor
+ import numpy as np
+ import torch
+
+ class AntispoofingFeatureExtractor(SequenceFeatureExtractor):
+     def __init__(
+         self,
+         feature_size=1,
+         sampling_rate=16000,
+         padding_value=0.0,
+         return_attention_mask=True,
+         **kwargs
+     ):
+         super().__init__(
+             feature_size=feature_size,
+             sampling_rate=sampling_rate,
+             padding_value=padding_value,
+             **kwargs
+         )
+         self.return_attention_mask = return_attention_mask
+
+     def __call__(self, audio, sampling_rate=None, return_tensors=True, **kwargs):
+         # Fix every utterance to 64600 samples (~4 s at 16 kHz), then tensorize
+         audio = self.pad(audio, 64600)
+         audio = torch.Tensor(audio)
+         return {"input_values": audio}
+
+     def pad(self, x, max_len):
+         # Crop long inputs; tile short inputs until they reach max_len
+         x_len = x.shape[0]
+         if x_len >= max_len:
+             return x[:max_len]
+         num_repeats = int(max_len / x_len) + 1
+         padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
+         return padded_x
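The padding rule crops long clips and tiles short ones so every input reaches exactly 64600 samples (about 4 s at 16 kHz). A quick demonstration:

```python
import numpy as np
from feature_extraction_antispoofing import AntispoofingFeatureExtractor

fe = AntispoofingFeatureExtractor()

short = np.random.randn(16000)           # 1 s at 16 kHz
print(fe(short)["input_values"].shape)   # torch.Size([64600]): tiled ~4x, then cropped

long = np.random.randn(200000)           # 12.5 s at 16 kHz
print(fe(long)["input_values"].shape)    # torch.Size([64600]): only the first 64600 samples kept
```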
modeling_antispoofing.py ADDED
@@ -0,0 +1,21 @@
+ import torch
+ import torch.nn as nn
+ from transformers import PreTrainedModel
+ from .configuration_antispoofing import DF_Arena_1B_Config
+ from .backbone import DF_Arena_1B
+ from .feature_extraction_antispoofing import AntispoofingFeatureExtractor
+
+ class DF_Arena_1B_Antispoofing(PreTrainedModel):
+     config_class = DF_Arena_1B_Config
+
+     def __init__(self, config: DF_Arena_1B_Config):
+         super().__init__(config)
+         self.feature_extractor = AntispoofingFeatureExtractor()
+         # Wav2Vec2-XLS-R-1B front-end with a conformer classifier head
+         self.backbone = DF_Arena_1B()
+         self.post_init()
+
+     def forward(self, input_values, attention_mask=None):
+         # input_values: float32 waveform at config.sample_rate, (time,) or (batch, time)
+         logits = self.backbone(input_values)
+         return {"logits": logits}
pipeline_antispoofing.py ADDED
@@ -0,0 +1,42 @@
+ from transformers import Pipeline
+ import torch
+ from .feature_extraction_antispoofing import AntispoofingFeatureExtractor
+
+ class AntispoofingPipeline(Pipeline):
+     def __init__(self, model, **kwargs):
+         super().__init__(model=model, **kwargs)
+         self.feature_extractor = AntispoofingFeatureExtractor()
+
+     def _sanitize_parameters(self, **kwargs):
+         preprocess_kwargs = {}
+         postprocess_kwargs = {}
+
+         if "sampling_rate" in kwargs:
+             preprocess_kwargs["sampling_rate"] = kwargs["sampling_rate"]
+
+         return preprocess_kwargs, {}, postprocess_kwargs
+
+     def preprocess(self, audio, sampling_rate=16000):
+         if isinstance(audio, str):
+             # Accept file paths as in the README: load to a mono waveform first
+             import librosa
+             audio, _ = librosa.load(audio, sr=sampling_rate, mono=True)
+         audio = self.feature_extractor(audio)["input_values"]
+         return {"input_values": audio}
+
+     def _forward(self, model_inputs):
+         outputs = self.model(**model_inputs)
+         return outputs
+
+     def postprocess(self, model_outputs):
+         logits = model_outputs["logits"]
+         probs = torch.nn.functional.softmax(logits, dim=-1)
+         predicted_class = torch.argmax(probs, dim=-1).item()
+         confidence = probs[0][predicted_class].item()
+
+         return {
+             "label": self.model.config.id2label[predicted_class],
+             "logits": logits.tolist(),
+             "score": confidence,
+             "all_scores": {
+                 self.model.config.id2label[i]: probs[0][i].item()
+                 for i in range(len(probs[0]))
+             },
+         }
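This is the class that `pipeline("antispoofing", ...)` resolves to via `custom_pipelines` in `config.json`. A hedged sketch that drives it directly with random audio (placeholder repo id; exact `Pipeline` constructor behavior may vary across transformers versions):

```python
import numpy as np
from transformers import AutoModel
from pipeline_antispoofing import AntispoofingPipeline

model = AutoModel.from_pretrained(
    "your-username/df-arena-1b-antispoofing",  # placeholder repo id
    trust_remote_code=True,
)
pipe = AntispoofingPipeline(model=model)

result = pipe(np.random.randn(32000))  # 2 s of noise at 16 kHz
print(result["label"], round(result["score"], 3))
print(result["all_scores"])  # e.g. {'bonafide': 0.51, 'spoof': 0.49}
```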
preprocessor_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "feature_extractor_type": "AntispoofingFeatureExtractor",
+   "processor_class": "feature_extraction_antispoofing.AntispoofingFeatureExtractor"
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:780bc14fd4c15e65d58efdef728427cf03cd29cd60be528e97badf8c89087988
+ size 4591794734