initial commit

Browse files

Files changed (11) hide show

.gitignore +0 -0
Readme.md +112 -0
backbone.py +62 -0
config.json +26 -0
configuration_antispoofing.py +9 -0
conformer.py +284 -0
feature_extraction_antispoofing.py +36 -0
modeling_antispoofing.py +21 -0
pipeline_antispoofing.py +42 -0
preprocessor_config.json +4 -0
pytorch_model.bin +3 -0

.gitignore ADDED Viewed

File without changes

Readme.md ADDED Viewed

	@@ -0,0 +1,112 @@

+---
+language:
+- en
+tags:
+- audio
+- audio-classification
+- antispoofing
+- deepfake-detection
+- speech
+license: mit
+pipeline_tag: audio-classification
+---
+# DF Arena 1B - Audio Antispoofing Model
+A deep learning model for detecting spoofed (fake/synthesized) audio vs. bonafide (genuine) speech.
+## Model Description
+This model uses Wav2Vec2-XLS-R-1B as a backbone for audio antispoofing detection. It classifies audio samples as either bonafide (real human speech) or spoofed (synthetic/manipulated audio).
+**Key Features:**
+- Detects AI-generated speech and voice cloning
+- Works with various audio formats
+- Robust against different spoofing attacks
+## Usage
+```python
+from transformers import pipeline
+# Load the pipeline
+pipe = pipeline(
+    "audio-classification",
+    model="your-username/df-arena-1b-antispoofing",
+    trust_remote_code=True
+)
+# Predict from audio file
+result = pipe("audio.wav")
+print(result)
+# Output: {'label': 'bonafide', 'score': 0.95}
+```
+### Advanced Usage
+```python
+import librosa
+from transformers import pipeline
+pipe = pipeline(
+    "audio-classification",
+    model="your-username/df-arena-1b-antispoofing",
+    trust_remote_code=True
+)
+# From numpy array
+audio, sr = librosa.load("audio.wav", sr=16000)
+result = pipe(audio)
+# Get all scores
+result = pipe("audio.wav", return_all_scores=True)
+# Output: [
+#   {'label': 'bonafide', 'score': 0.95},
+#   {'label': 'spoof', 'score': 0.05}
+# ]
+# Batch processing
+results = pipe(["audio1.wav", "audio2.wav", "audio3.wav"])
+```
+## Model Details
+- **Backbone:** Wav2Vec2-XLS-R-1B
+- **Input:** 16kHz audio (mono); each clip is truncated or repeat-padded to 64,600 samples (about 4 seconds)
+- **Output:** Binary classification (bonafide/spoof)
+- **Parameters:** ~1B
+## Training Data
+[Describe your training dataset here]
+## Performance
+[Add performance metrics here]
+| Dataset | Accuracy | EER |
+|---------|----------|-----|
+| Test Set | XX% | XX% |
+## Limitations
+- Optimized for speech audio (may not work well on music or environmental sounds)
+- Performance may vary with very low quality audio
+- Requires 16kHz sampling rate for best results
+## Citation
+```bibtex
+@misc{df-arena-1b-2024,
+  author = {Your Name},
+  title = {DF Arena 1B: Audio Antispoofing Model},
+  year = {2024},
+  publisher = {HuggingFace},
+  url = {https://huggingface.co/your-username/df-arena-1b-antispoofing}
+}
+```
+## License
+MIT License
+## Contact
+For questions or issues, please open an issue on the model repository.

backbone.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from transformers import Wav2Vec2Model, Wav2Vec2Config
+from conformer import FinalConformer
+class DF_Arena_1B(nn.Module):
+    """Spoof-detection network: Wav2Vec2 encoder, gated layer fusion, conformer head.
+
+    ``forward`` collects every hidden state returned by the Wav2Vec2 encoder,
+    scales each layer by a sigmoid gate computed from an attention-pooled
+    summary of that layer, sums the layers into a single feature map and
+    passes it to ``FinalConformer``.
+    """
+    def __init__(self):
+        super().__init__()
+        # Only the XLS-R 1B configuration is fetched here; the encoder is built from
+        # that config, so no pretrained weights are loaded by this constructor.
+        self.ssl_model = Wav2Vec2Model(Wav2Vec2Config.from_pretrained("facebook/wav2vec2-xls-r-1b"))
+        # Needed so the forward pass exposes ``hidden_states`` for layer fusion.
+        self.ssl_model.config.output_hidden_states = True
+        self.first_bn = nn.BatchNorm2d(num_features=1)
+        self.selu = nn.SELU(inplace=True)
+        # Maps each pooled layer summary to one scalar gate.
+        self.fc0 = nn.Linear(1280, 1) #1280 for 1b, 1920 for 2b
+        self.sig = nn.Sigmoid()
+        self.conformer = FinalConformer(emb_size=1280, heads=4, ffmult=4, exp_fac=2, kernel_size=31, n_encoders=4)
+        # Learnable attention weights
+        self.attn_scores = nn.Linear(1280, 1, bias=False)
+    def get_attenF1Dpooling(self, x):
+        """Attention-pool ``x`` over axis 1 and return a tensor with that axis reduced to size 1.
+
+        ``attn_scores`` is applied to the last axis (size 1280), the scores are
+        soft-maxed over axis 1, and the weighted sum over axis 1 is returned.
+        """
+        logits = self.attn_scores(x)
+        weights = torch.softmax(logits, dim=1)  # (B, T, 1)
+        pooled = torch.sum(weights * x, dim=1, keepdim=True)  # (B, 1, D)
+        return pooled
+    def get_attenF1D(self, layerResult):
+        """Return ``(pooled, stacked)`` for an iterable of per-layer tensors.
+
+        ``pooled`` concatenates the attention-pooled summary of every layer
+        along axis 1; ``stacked`` concatenates the untouched layers along a
+        new axis 1.
+        """
+        poollayerResult = []
+        fullf = []
+        for layer in layerResult:
+            # layer shape: (B, T, D) -- the last axis must be 1280 because attn_scores is applied to it
+            layery = self.get_attenF1Dpooling(layer)  # (B, 1, D)
+            poollayerResult.append(layery)
+            fullf.append(layer.unsqueeze(1))  # (B, 1, T, D)
+        layery = torch.cat(poollayerResult, dim=1)      # (B, L, D)
+        fullfeature = torch.cat(fullf, dim=1)          # (B, L, T, D)
+        return layery, fullfeature
+    def forward(self, x):
+        """Return the first output of the conformer head for waveform ``x``.
+
+        A leading axis is inserted with ``unsqueeze(0)`` before the encoder runs.
+        NOTE(review): this implies ``x`` is a single un-batched waveform -- confirm with callers.
+        """
+        out_ssl = self.ssl_model(x.unsqueeze(0)) # hidden_states: one tensor per layer, presumably (B, T, D) -- TODO confirm
+        y0, fullfeature = self.get_attenF1D(out_ssl.hidden_states)
+        # One sigmoid gate per layer: (B, L, D) -> (B, L, 1).
+        y0 = self.fc0(y0)
+        y0 = self.sig(y0)
+        # Add a trailing axis so the gate broadcasts over the two last axes of fullfeature.
+        y0 = y0.view(y0.shape[0], y0.shape[1], y0.shape[2], -1)
+        fullfeature = fullfeature * y0
+        # Collapse the layer axis, then restore a singleton channel axis for BatchNorm2d.
+        fullfeature = torch.sum(fullfeature, 1)
+        fullfeature = fullfeature.unsqueeze(dim=1)
+        fullfeature = self.first_bn(fullfeature)
+        fullfeature = self.selu(fullfeature)
+        # The conformer also returns its attention weights; they are discarded here.
+        output, _ = self.conformer(fullfeature.squeeze(1))
+        return output

config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "architectures": ["DF-Arena-1B-V0.1"],
+  "model_type": "antispoofing",
+  "num_labels": 2,
+  "id2label": {
+    "1": "bonafide",
+    "0": "spoof"
+  },
+  "label2id": {
+    "bonafide": 1,
+    "spoof": 0
+  },
+  "auto_map": {
+    "AutoConfig": "configuration_antispoofing.DF_Arena_1B_Config",
+    "AutoModel": "modeling_antispoofing.DF_Arena_1B_Antispoofing",
+    "AutoFeatureExtractor": "feature_extraction_antispoofing.AntispoofingFeatureExtractor"
+  },
+  "custom_pipelines": {
+    "antispoofing": {
+      "impl": "pipeline_antispoofing.AntispoofingPipeline",
+      "pt": ["AutoModel"]
+    }
+  }
+}

configuration_antispoofing.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from transformers import PretrainedConfig
+class DF_Arena_1B_Config(PretrainedConfig):
+    """Configuration object registered under the ``antispoofing`` model type."""
+    model_type = "antispoofing"
+    def __init__(self, num_labels=2, sample_rate=16000, **kwargs):
+        """Store ``num_labels`` and ``sample_rate``; remaining kwargs go to ``PretrainedConfig``."""
+        super().__init__(**kwargs)
+        self.num_labels = num_labels
+        self.sample_rate = sample_rate
+        # Fixed value, not configurable through the constructor.
+        # NOTE(review): no code visible in this commit reads out_dim -- confirm it is still needed.
+        self.out_dim = 1024

conformer.py ADDED Viewed

	@@ -0,0 +1,284 @@

+import math
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+import torch
+import torch.nn as nn
+from torch.nn.modules.transformer import _get_clones
+from torch import Tensor
+from einops import rearrange
+from einops.layers.torch import Rearrange
+# helper functions
+def exists(val):
+    """Return True when ``val`` is not ``None``."""
+    return val is not None
+def default(val, d):
+    """Return ``val`` unless it is ``None``, in which case return the fallback ``d``."""
+    return val if exists(val) else d
+def calc_same_padding(kernel_size):
+    """Return ``(left, right)`` padding that keeps the length unchanged for a stride-1 convolution.
+
+    Odd kernels get equal padding on both sides; even kernels get one
+    element less on the right.
+    """
+    pad = kernel_size // 2
+    # (kernel_size + 1) % 2 is 1 for even kernels and 0 for odd ones.
+    return (pad, pad - (kernel_size + 1) % 2)
+# helper classes
+class Swish(nn.Module):
+    """Swish activation: the input multiplied by its own sigmoid."""
+    def forward(self, x):
+        return x * x.sigmoid()
+class GLU(nn.Module):
+    """Gated linear unit that splits its input in two along ``dim``."""
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+    def forward(self, x):
+        # First half carries the values, second half the gate.
+        out, gate = x.chunk(2, dim=self.dim)
+        return out * gate.sigmoid()
+class DepthWiseConv1d(nn.Module):
+    """1-D convolution with ``groups = chan_in`` and explicit padding applied in ``forward``."""
+    def __init__(self, chan_in, chan_out, kernel_size, padding):
+        super().__init__()
+        # ``padding`` is a (left, right) pair handed to F.pad; the conv itself is unpadded.
+        self.padding = padding
+        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups = chan_in)
+    def forward(self, x):
+        x = F.pad(x, self.padding)
+        return self.conv(x)
+# attention, feedforward, and conv module
+class Scale(nn.Module):
+    """Wrap ``fn`` and multiply its output by the constant ``scale``."""
+    def __init__(self, scale, fn):
+        super().__init__()
+        self.fn = fn
+        self.scale = scale
+    def forward(self, x, **kwargs):
+        return self.fn(x, **kwargs) * self.scale
+class PreNorm(nn.Module):
+    """Apply ``LayerNorm(dim)`` to the input before calling the wrapped ``fn``."""
+    def __init__(self, dim, fn):
+        super().__init__()
+        self.fn = fn
+        self.norm = nn.LayerNorm(dim)
+    def forward(self, x, **kwargs):
+        x = self.norm(x)
+        return self.fn(x, **kwargs)
+class Attention(nn.Module):
+    """Multi-head self-attention with extra per-head "head tokens".
+
+    ``forward`` builds one additional token per head from the mean of that
+    head's feature slice, appends those tokens to the sequence, runs
+    self-attention over the extended sequence, and finally folds the head
+    tokens (and the mean of the remaining tokens) into the first token.
+
+    The reshapes in ``forward`` split the channel axis into
+    ``heads`` slices of ``C // heads``, so they only line up when
+    ``dim_head * heads == dim``.
+    """
+    # Head Token attention: https://arxiv.org/pdf/2210.05958.pdf
+    def __init__(self, dim, heads=8, dim_head=64, qkv_bias=False, dropout=0., proj_drop=0.):
+        super().__init__()
+        self.num_heads = heads
+        inner_dim = dim_head * heads
+        self.scale = dim_head ** -0.5
+        self.qkv = nn.Linear(dim, inner_dim * 3, bias=qkv_bias)
+        # Created but not applied: the call in forward is commented out.
+        self.attn_drop = nn.Dropout(dropout)
+        self.proj = nn.Linear(inner_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.act = nn.GELU()
+        self.ht_proj = nn.Linear(dim_head, dim,bias=True)
+        self.ht_norm = nn.LayerNorm(dim_head)
+        # One learnable embedding per head token.
+        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_heads, dim))
+    def forward(self, x, mask=None):
+        """Return ``(output, attn)`` for ``x`` of shape (B, N, C).
+
+        ``output`` has the same shape as ``x``; ``attn`` is the soft-maxed
+        attention map over the N + heads tokens. ``mask`` is accepted but
+        not used.
+        """
+        B, N, C = x.shape
+        # head token
+        head_pos = self.pos_embed.expand(x.shape[0], -1, -1)
+        x_ = x.reshape(B, -1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+        x_ = x_.mean(dim=2)  # averaged over the sequence axis: [B, h, d//h]
+        x_ = self.ht_proj(x_).reshape(B, -1, self.num_heads, C // self.num_heads)
+        x_ = self.act(self.ht_norm(x_)).flatten(2)
+        x_ = x_ + head_pos
+        # Sequence now holds the N original tokens followed by the head tokens.
+        x = torch.cat([x, x_], dim=1)
+        # normal mhsa
+        qkv = self.qkv(x).reshape(B, N+self.num_heads, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]   # make torchscript happy (cannot use tensor as tuple)
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        # attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N+self.num_heads, C)
+        x = self.proj(x)
+        # merge head tokens into cls token
+        # The first token is treated as the class token, the next N-1 as regular tokens.
+        cls, patch, ht = torch.split(x, [1, N-1, self.num_heads], dim=1)
+        cls = cls + torch.mean(ht, dim=1, keepdim=True) + torch.mean(patch, dim=1, keepdim=True)
+        x = torch.cat([cls, patch], dim=1)
+        x = self.proj_drop(x)
+        return x, attn
+class FeedForward(nn.Module):
+    """Position-wise feed-forward net: Linear -> Swish -> Dropout -> Linear -> Dropout.
+
+    The hidden width is ``dim * mult``; the output width equals ``dim``.
+    """
+    def __init__(
+        self,
+        dim,
+        mult = 4,
+        dropout = 0.
+    ):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(dim, dim * mult),
+            Swish(),
+            nn.Dropout(dropout),
+            nn.Linear(dim * mult, dim),
+            nn.Dropout(dropout)
+        )
+    def forward(self, x):
+        return self.net(x)
+class ConformerConvModule(nn.Module):
+    """Convolution sub-module of a conformer block.
+
+    Pipeline: LayerNorm -> pointwise conv (to ``2 * inner_dim``) -> GLU ->
+    depthwise conv -> BatchNorm1d (skipped when ``causal``) -> Swish ->
+    pointwise conv back to ``dim`` -> Dropout. Input and output are laid out
+    as (batch, length, channels); the convolutions run on the transposed
+    layout.
+    """
+    def __init__(
+        self,
+        dim,
+        causal = False,
+        expansion_factor = 2,
+        kernel_size = 31,
+        dropout = 0.
+    ):
+        super().__init__()
+        inner_dim = dim * expansion_factor
+        # Causal mode pads only on the left so no future frame is seen.
+        padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)
+        self.net = nn.Sequential(
+            nn.LayerNorm(dim),
+            Rearrange('b n c -> b c n'),
+            nn.Conv1d(dim, inner_dim * 2, 1),
+            GLU(dim=1),
+            DepthWiseConv1d(inner_dim, inner_dim, kernel_size = kernel_size, padding = padding),
+            nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(),
+            Swish(),
+            nn.Conv1d(inner_dim, dim, 1),
+            Rearrange('b c n -> b n c'),
+            nn.Dropout(dropout)
+        )
+    def forward(self, x):
+        return self.net(x)
+# Conformer Block
+class ConformerBlock(nn.Module):
+    """One conformer layer: half-step feed-forward, attention, convolution, half-step feed-forward.
+
+    Every sub-module is added to its input as a residual, and a final
+    LayerNorm is applied. ``forward`` returns ``(output, attention_weights)``.
+    """
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_head = 64,
+        heads = 8,
+        ff_mult = 4,
+        conv_expansion_factor = 2,
+        conv_kernel_size = 31,
+        attn_dropout = 0.,
+        ff_dropout = 0.,
+        conv_dropout = 0.,
+        conv_causal = False
+    ):
+        super().__init__()
+        self.ff1 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout)
+        self.attn = Attention(dim = dim, dim_head = dim_head, heads = heads, dropout = attn_dropout)
+        self.conv = ConformerConvModule(dim = dim, causal = conv_causal, expansion_factor = conv_expansion_factor, kernel_size = conv_kernel_size, dropout = conv_dropout)
+        self.ff2 = FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout)
+        # Re-bind the sub-modules to their pre-normalised (and, for the
+        # feed-forwards, half-scaled) wrappers.
+        self.attn = PreNorm(dim, self.attn)
+        self.ff1 = Scale(0.5, PreNorm(dim, self.ff1))
+        self.ff2 = Scale(0.5, PreNorm(dim, self.ff2))
+        self.post_norm = nn.LayerNorm(dim)
+    def forward(self, x, mask = None):
+        """Run the block on ``x`` and return ``(x, attn_weight)``."""
+        x = self.ff1(x) + x
+        attn_x, attn_weight = self.attn(x, mask = mask)
+        x = attn_x + x
+        x = self.conv(x) + x
+        x = self.ff2(x) + x
+        x = self.post_norm(x)
+        return x, attn_weight
+# Conformer
+class Conformer(nn.Module):
+    def __init__(
+        self,
+        dim,
+        *,
+        depth,
+        dim_head = 64,
+        heads = 8,
+        ff_mult = 4,
+        conv_expansion_factor = 2,
+        conv_kernel_size = 31,
+        attn_dropout = 0.,
+        ff_dropout = 0.,
+        conv_dropout = 0.,
+        conv_causal = False
+    ):
+        super().__init__()
+        self.dim = dim
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(ConformerBlock(
+                dim = dim,
+                dim_head = dim_head,
+                heads = heads,
+                ff_mult = ff_mult,
+                conv_expansion_factor = conv_expansion_factor,
+                conv_kernel_size = conv_kernel_size,
+                conv_causal = conv_causal
+            ))
+    def forward(self, x):
+        for block in self.layers:
+            x = block(x)
+        return x
+def sinusoidal_embedding(n_channels, dim):
+    """Return a fixed sinusoidal position table of shape (1, n_channels, dim).
+
+    Row ``p`` encodes position ``p``; even columns hold sines and odd
+    columns hold cosines of ``p / 10000 ** (2 * (i // 2) / dim)``.
+    """
+    pe = torch.FloatTensor([[p / (10000 ** (2 * (i // 2) / dim)) for i in range(dim)]
+                            for p in range(n_channels)])
+    pe[:, 0::2] = torch.sin(pe[:, 0::2])
+    pe[:, 1::2] = torch.cos(pe[:, 1::2])
+    return pe.unsqueeze(0)
+class FinalConformer(nn.Module):
+  """Conformer classifier: class token + ``n_encoders`` blocks + 2-way linear head.
+
+  ``forward`` adds fixed sinusoidal positions, prepends a learnable class
+  token to every sequence, runs the conformer blocks and classifies from
+  the class-token output.
+  """
+  def __init__(self, emb_size=128, heads=4, ffmult=4, exp_fac=2, kernel_size=16, n_encoders=1):
+    super(FinalConformer, self).__init__()
+    self.dim_head=int(emb_size/heads)
+    self.dim=emb_size
+    self.heads=heads
+    self.kernel_size=kernel_size
+    self.n_encoders=n_encoders
+    # Non-trainable table covering 10000 positions; longer inputs cannot be
+    # matched by the slice taken in forward.
+    self.positional_emb = nn.Parameter(sinusoidal_embedding(10000, emb_size), requires_grad=False)
+    # One template block is built and cloned n_encoders times.
+    self.encoder_blocks=_get_clones(ConformerBlock( dim = emb_size, dim_head=self.dim_head, heads= heads,
+    ff_mult = ffmult, conv_expansion_factor = exp_fac, conv_kernel_size = kernel_size),
+    n_encoders)
+    self.class_token = nn.Parameter(torch.rand(1, emb_size))
+    self.fc5 = nn.Linear(emb_size, 2)
+  def forward(self, x): # x shape [bs, time, frequency]
+    """Return ``(logits, attention_maps)``: 2-way logits per sample and one attention map per block."""
+    x = x + self.positional_emb[:, :x.size(1), :]
+    # Prepend the shared class token to every sample in the batch.
+    x = torch.stack([torch.vstack((self.class_token, x[i])) for i in range(len(x))])#[bs,1+time,emb_size]
+    list_attn_weight = []
+    for layer in self.encoder_blocks:
+            x, attn_weight = layer(x) #[bs,1+time,emb_size]
+            list_attn_weight.append(attn_weight)
+    # Position 0 is the class token added above.
+    embedding=x[:,0,:] #[bs, emb_size]
+    out=self.fc5(embedding) #[bs,2]
+    return out, list_attn_weight

feature_extraction_antispoofing.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from transformers import SequenceFeatureExtractor
+import numpy as np
+import torch
+class AntispoofingFeatureExtractor(SequenceFeatureExtractor):
+    def __init__(
+        self,
+        feature_size=1,
+        sampling_rate=16000,
+        padding_value=0.0,
+        return_attention_mask=True,
+        **kwargs
+    ):
+        super().__init__(
+            feature_size=feature_size,
+            sampling_rate=sampling_rate,
+            padding_value=padding_value,
+            **kwargs
+        )
+        self.return_attention_mask = return_attention_mask
+    def __call__(self, audio, sampling_rate=None, return_tensors=True, **kwargs):
+        audio = self.pad(audio, 64600)
+        audio = torch.Tensor(audio)
+        return {
+            "input_values": audio
+        }
+    def pad(self, x, max_len):
+        x_len = x.shape[0]
+        if x_len >= max_len:
+            return x[:max_len]
+        num_repeats = int(max_len / x_len)+1
+        padded_x = np.tile(x, (1, num_repeats))[:, :max_len][0]
+        return padded_x

modeling_antispoofing.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel
+from .configuration_antispoofing import DF_Arena_1B_Config
+from .backbone import DF_Arena_1B
+from .feature_extraction_antispoofing import AntispoofingFeatureExtractor
+class DF_Arena_1B_Antispoofing(PreTrainedModel):
+    """``PreTrainedModel`` wrapper that exposes the ``DF_Arena_1B`` backbone and returns its logits."""
+    config_class = DF_Arena_1B_Config
+    def __init__(self, config: DF_Arena_1B_Config):
+        super().__init__(config)
+        # Kept on the model as an attribute; forward() itself does not call it.
+        self.feature_extractor = AntispoofingFeatureExtractor()
+        # Network that maps a waveform to logits.
+        self.backbone = DF_Arena_1B()
+        self.post_init()
+    def forward(self, input_values, attention_mask=None):
+        """Return ``{"logits": ...}`` for ``input_values``; ``attention_mask`` is accepted but unused."""
+        # input_values: float32 waveform @ config.sample_rate
+        # NOTE(review): DF_Arena_1B.forward calls x.unsqueeze(0) on its input, so a single
+        # un-batched (time,) waveform appears to be expected rather than (batch, time) -- confirm.
+        logits = self.backbone(input_values)
+        return {"logits": logits}

pipeline_antispoofing.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from transformers import Pipeline
+import torch
+from .feature_extraction_antispoofing import AntispoofingFeatureExtractor
+class AntispoofingPipeline(Pipeline):
+    """Pipeline that pads a waveform, runs the model and reports the predicted label with scores."""
+    def __init__(self, model, **kwargs):
+        super().__init__(model=model, **kwargs)
+        # Always uses a freshly constructed extractor with default settings.
+        self.feature_extractor = AntispoofingFeatureExtractor()
+    def _sanitize_parameters(self, **kwargs):
+        """Route call-time kwargs: only ``sampling_rate`` is forwarded, and only to ``preprocess``."""
+        preprocess_kwargs = {}
+        postprocess_kwargs = {}
+        if "sampling_rate" in kwargs:
+            preprocess_kwargs["sampling_rate"] = kwargs["sampling_rate"]
+        return preprocess_kwargs, {}, postprocess_kwargs
+    def preprocess(self, audio, sampling_rate=16000):
+        """Return ``{"input_values": ...}`` from the feature extractor.
+
+        ``sampling_rate`` is accepted but not used; no resampling happens here.
+        """
+        audio = self.feature_extractor(audio)['input_values']
+        inputs = {"input_values": audio}
+        return inputs
+    def _forward(self, model_inputs):
+        """Call the model with the preprocessed inputs and return its raw output."""
+        outputs = self.model(**model_inputs)
+        return outputs
+    def postprocess(self, model_outputs):
+        """Convert logits to a dict with ``label``, ``logits``, ``score`` and ``all_scores``.
+
+        ``.item()`` on the argmax means exactly one prediction (a single clip)
+        is supported per call.
+        """
+        logits = model_outputs['logits']
+        probs = torch.nn.functional.softmax(logits, dim=-1)
+        predicted_class = torch.argmax(probs, dim=-1).item()
+        confidence = probs[0][predicted_class].item()
+        return {
+            "label": self.model.config.id2label[predicted_class],
+            "logits": logits.tolist(),
+            "score": confidence,
+            # Probability of every class, keyed by its label name.
+            "all_scores": {
+                self.model.config.id2label[i]: probs[0][i].item()
+                for i in range(len(probs[0]))
+            }
+        }

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "feature_extractor_type": "AntispoofingFeatureExtractor",
+  "processor_class": "feature_extraction_antispoofing.AntispoofingFeatureExtractor"
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:780bc14fd4c15e65d58efdef728427cf03cd29cd60be528e97badf8c89087988
+size 4591794734