Upload 7 files
Browse files
- README.md +116 -3
- config.json +38 -0
- configuration_eat.py +66 -0
- eat_model.py +99 -0
- model.safetensors +3 -0
- model_core.py +224 -0
- modeling_eat.py +18 -0
README.md
CHANGED
@@ -1,3 +1,116 @@

The previous README was a three-line front-matter stub with an empty `license:` field; it is replaced by the full model card:

---
license: mit
tags:
- Audio
- SSL
- SSLAM
library_name: transformers
---

# SSLAM Pretrain (ViT Base, 15 epochs)

This repository provides an SSLAM checkpoint formatted for use with Hugging Face Transformers. It is intended for feature extraction in audio LLMs, sound event detection, and general-purpose audio representation learning. The implementation follows the [EAT](https://arxiv.org/abs/2401.03497) code path while swapping in SSLAM pretrained weights.

## 🔧 Usage

You can load and use the model for feature extraction directly via Hugging Face Transformers:

```python
import torchaudio
import torch
import soundfile as sf
import numpy as np
from transformers import AutoModel

model_id = "ta012/SSLAM_pretrain"
model = AutoModel.from_pretrained(model_id, trust_remote_code=True).eval().cuda()

source_file = "/path/to/input.wav"
target_length = 1024  # Recommended: 1024 for 10 s audio
norm_mean = -4.268
norm_std = 4.569

# Load and resample audio
wav, sr = sf.read(source_file)
waveform = torch.tensor(wav).float().cuda()
if sr != 16000:
    waveform = torchaudio.functional.resample(waveform, sr, 16000)

# Normalize and convert to mel-spectrogram
waveform = waveform - waveform.mean()
mel = torchaudio.compliance.kaldi.fbank(
    waveform.unsqueeze(0),
    htk_compat=True,
    sample_frequency=16000,
    use_energy=False,
    window_type='hanning',
    num_mel_bins=128,
    dither=0.0,
    frame_shift=10
).unsqueeze(0)

# Pad or truncate
n_frames = mel.shape[1]
if n_frames < target_length:
    mel = torch.nn.ZeroPad2d((0, 0, 0, target_length - n_frames))(mel)
else:
    mel = mel[:, :target_length, :]

# Normalize
mel = (mel - norm_mean) / (norm_std * 2)
mel = mel.unsqueeze(0).cuda()  # shape: [1, 1, T, F]

# Extract features
with torch.no_grad():
    feat = model.extract_features(mel)

feat = feat.squeeze(0).cpu().numpy()
print(f"Feature shape: {feat.shape}")
```

## 📌 Notes

See the [feature extraction guide](https://github.com/cwx-worst-one/EAT/tree/main/feature_extract) for more instructions.

## 🙌 Acknowledgments

This repository builds on the EAT implementation for Hugging Face models. We remap SSLAM weights to that interface.

- Paper: EAT: Self-Supervised Pre-Training with Efficient Audio Transformer
- Code: https://github.com/cwx-worst-one/EAT

We are not affiliated with the EAT authors. All credit for the original implementation belongs to them.

## 📚 Citation

If you find our work useful, please cite it as:

```bibtex
@inproceedings{alex2025sslam,
  title={{SSLAM}: Enhancing Self-Supervised Models with Audio Mixtures for Polyphonic Soundscapes},
  author={Tony Alex and Sara Atito and Armin Mustafa and Muhammad Awais and Philip J B Jackson},
  booktitle={The Thirteenth International Conference on Learning Representations},
  year={2025},
  url={https://openreview.net/forum?id=odU59TxdiB}
}
```

Please also cite EAT:

```bibtex
@article{chen2024eat,
  title={EAT: Self-supervised pre-training with efficient audio transformer},
  author={Chen, Wenxi and Liang, Yuzhe and Ma, Ziyang and Zheng, Zhisheng and Chen, Xie},
  journal={arXiv preprint arXiv:2401.03497},
  year={2024}
}
```
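As a quick sanity check on the `Feature shape` printout above, a sketch based on the patch and embedding sizes in `config.json` below (this arithmetic is not part of the uploaded README):

```python
# Token-count arithmetic for a 10 s clip; values taken from config.json.
target_length, mel_bins, patch_size, embed_dim = 1024, 128, 16, 768
num_patches = (target_length // patch_size) * (mel_bins // patch_size)  # 64 * 8 = 512
num_tokens = 1 + num_patches                                            # +1 CLS-style token
print(num_tokens, embed_dim)  # 513 768 -> feat.shape should come out as (513, 768)
```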
config.json
ADDED
@@ -0,0 +1,38 @@

```json
{
  "activation_dropout": 0.0,
  "architectures": [
    "EATModel"
  ],
  "auto_map": {
    "AutoModel": "modeling_eat.EATModel",
    "AutoConfig": "configuration_eat.EATConfig"
  },
  "attn_drop_rate": 0.0,
  "depth": 12,
  "drop_rate": 0.0,
  "embed_dim": 768,
  "end_drop_path_rate": 0.0,
  "fixed_positions": true,
  "img_size": [
    1024,
    128
  ],
  "in_chans": 1,
  "layer_norm_first": false,
  "max_length": 768,
  "mel_bins": 128,
  "mlp_ratio": 4.0,
  "model_type": "eat",
  "model_variant": "pretrain",
  "norm_affine": true,
  "norm_eps": 1e-06,
  "num_classes": 527,
  "num_heads": 12,
  "patch_size": 16,
  "post_mlp_drop": 0.0,
  "qkv_bias": true,
  "start_drop_path_rate": 0.0,
  "stride": 16,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}
```
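A minimal loading sketch (the repo id is the one used in the README; `trust_remote_code=True` is what lets the `auto_map` entry above resolve to the custom classes in this repo):

```python
from transformers import AutoConfig

# Resolves "AutoConfig": "configuration_eat.EATConfig" from the auto_map above.
cfg = AutoConfig.from_pretrained("ta012/SSLAM_pretrain", trust_remote_code=True)
print(type(cfg).__name__)                       # EATConfig
print(cfg.embed_dim, cfg.depth, cfg.num_heads)  # 768 12 12 -> ViT-Base sized encoder
```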
configuration_eat.py
ADDED
@@ -0,0 +1,66 @@

```python
# configuration_eat.py

from transformers import PretrainedConfig

class EATConfig(PretrainedConfig):
    model_type = "eat"

    def __init__(
        self,
        embed_dim=768,
        depth=12,
        num_heads=12,
        patch_size=16,
        stride=16,
        in_chans=1,
        mel_bins=128,
        max_length=768,
        num_classes=527,
        model_variant="pretrain",  # or "finetune"

        mlp_ratio=4.0,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        activation_dropout=0.0,
        post_mlp_drop=0.0,
        start_drop_path_rate=0.0,
        end_drop_path_rate=0.0,

        layer_norm_first=False,
        norm_eps=1e-6,
        norm_affine=True,
        fixed_positions=True,

        img_size=(1024, 128),  # (target_length, mel_bins)

        **kwargs,
    ):
        super().__init__(**kwargs)

        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.stride = stride
        self.in_chans = in_chans
        self.mel_bins = mel_bins
        self.max_length = max_length
        self.num_classes = num_classes
        self.model_variant = model_variant

        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.drop_rate = drop_rate
        self.attn_drop_rate = attn_drop_rate
        self.activation_dropout = activation_dropout
        self.post_mlp_drop = post_mlp_drop
        self.start_drop_path_rate = start_drop_path_rate
        self.end_drop_path_rate = end_drop_path_rate

        self.layer_norm_first = layer_norm_first
        self.norm_eps = norm_eps
        self.norm_affine = norm_affine
        self.fixed_positions = fixed_positions

        self.img_size = img_size
```
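A usage sketch, assuming the uploaded `.py` files are importable from a hypothetical local package named `eat_local`: the constructor defaults mirror the uploaded `config.json`, and `model_variant` is the switch that adds the classification head in `eat_model.py`.

```python
from eat_local.configuration_eat import EATConfig  # hypothetical local package name

cfg = EATConfig()                                        # defaults reproduce config.json above
print(cfg.model_type, cfg.model_variant, cfg.img_size)   # eat pretrain (1024, 128)

ft_cfg = EATConfig(model_variant="finetune", num_classes=527)
ft_cfg.save_pretrained("./eat_finetune")                 # writes a config.json with these fields
```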
eat_model.py
ADDED
@@ -0,0 +1,99 @@

```python
import torch
import torch.nn as nn
from timm.models.layers import trunc_normal_
from functools import partial
import numpy as np
from .model_core import (
    PatchEmbed_new,
    get_2d_sincos_pos_embed_flexible,
    FixedPositionalEncoder,
    AltBlock
)

class EAT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.mode = config.model_variant  # "pretrain" or "finetune"

        # === Embedding / Encoder ===
        self.local_encoder = PatchEmbed_new(
            img_size=config.img_size,
            patch_size=config.patch_size,
            in_chans=config.in_chans,
            embed_dim=config.embed_dim,
            stride=config.stride
        )

        self.extra_tokens = nn.Parameter(torch.zeros(1, 1, config.embed_dim))
        self.pos_drop = nn.Dropout(p=config.drop_rate, inplace=True)
        trunc_normal_(self.extra_tokens, std=.02)

        self.fixed_positional_encoder = (
            FixedPositionalEncoder(self.build_sincos_pos_embed()) if config.fixed_positions else None
        )

        norm_layer = partial(nn.LayerNorm, eps=config.norm_eps, elementwise_affine=config.norm_affine)
        dpr = np.linspace(config.start_drop_path_rate, config.end_drop_path_rate, config.depth)
        self.blocks = nn.ModuleList([
            AltBlock(config.embed_dim, config.num_heads, config.mlp_ratio,
                     qkv_bias=config.qkv_bias, drop=config.drop_rate,
                     attn_drop=config.attn_drop_rate, mlp_drop=config.activation_dropout,
                     post_mlp_drop=config.post_mlp_drop, drop_path=dpr[i],
                     norm_layer=norm_layer, layer_norm_first=config.layer_norm_first,
                     ffn_targets=True)
            for i in range(config.depth)
        ])

        self.pre_norm = norm_layer(config.embed_dim)

        # === Head (for finetune) ===
        if self.mode == "finetune":
            self.fc_norm = nn.LayerNorm(config.embed_dim)
            self.head = nn.Linear(config.embed_dim, config.num_classes, bias=True)
        else:
            self.head = nn.Identity()

        self.apply(self._init_weights)

    def build_sincos_pos_embed(self):
        W = self.config.mel_bins // self.config.patch_size
        max_length = self.config.max_length
        embed_dim = self.config.embed_dim
        pos_embed = nn.Parameter(torch.zeros(1, max_length * W, embed_dim), requires_grad=False)
        emb = get_2d_sincos_pos_embed_flexible(embed_dim, (max_length, W), cls_token=False)
        pos_embed.data.copy_(torch.from_numpy(emb).float().unsqueeze(0))
        return pos_embed

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def encode(self, x):
        B = x.shape[0]
        x = self.local_encoder(x)
        if self.fixed_positional_encoder is not None:
            x = x + self.fixed_positional_encoder(x, None)[:, :x.size(1), :]
        x = torch.cat((self.extra_tokens.expand(B, -1, -1), x), dim=1)
        x = self.pre_norm(x)
        x = self.pos_drop(x)
        for blk in self.blocks:
            x, _ = blk(x)
        return x

    def forward(self, x):
        x = self.encode(x)
        if self.mode == "finetune":
            x = x[:, 0]  # use cls token
            x = self.fc_norm(x)
            x = self.head(x)
        return x

    def extract_features(self, x):
        x = self.encode(x)
        return x
```
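A shape walkthrough sketch, again assuming the files live in the hypothetical `eat_local` package so the relative imports resolve; it contrasts what the two `model_variant` modes return (random weights, so values are meaningless but shapes are informative):

```python
import torch

from eat_local.configuration_eat import EATConfig  # hypothetical local package name
from eat_local.eat_model import EAT

mel = torch.randn(1, 1, 1024, 128)  # [B, 1, target_length, mel_bins]

pretrain = EAT(EATConfig()).eval()
finetune = EAT(EATConfig(model_variant="finetune")).eval()

with torch.no_grad():
    print(pretrain(mel).shape)   # token sequence: [1, 513, 768] (1 CLS-style token + 512 patches)
    print(finetune(mel).shape)   # logits over num_classes=527 AudioSet labels: [1, 527]
```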
model.safetensors
ADDED
@@ -0,0 +1,3 @@

```
version https://git-lfs.github.com/spec/v1
oid sha256:8ec670adb241c710422ddd894ff1bade142ef0b25cf1ee68577aa45f89432298
size 359905840
```
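This is the Git LFS pointer; the weights themselves are fetched from the Hub. A download sketch (`hf_hub_download` is part of `huggingface_hub`; the repo id comes from the README). The ~360 MB of fp32 weights is consistent with a ViT-Base sized encoder of roughly 90M parameters.

```python
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

path = hf_hub_download("ta012/SSLAM_pretrain", "model.safetensors")
state = load_file(path)
print(sum(t.numel() for t in state.values()))  # total parameter count (~90M expected)
```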
model_core.py
ADDED
@@ -0,0 +1,224 @@

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from timm.models.layers import to_2tuple

class PatchEmbed_new(nn.Module):
    """ Flexible Image to Patch Embedding
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, stride=16):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        stride = to_2tuple(stride)

        self.img_size = img_size
        self.patch_size = patch_size

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride)  # with overlapped patches

    def forward(self, x):
        x = self.proj(x)
        x = x.flatten(2).transpose(1, 2)
        return x


def get_2d_sincos_pos_embed_flexible(embed_dim, grid_size, cls_token=False):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    grid_h = np.arange(grid_size[0], dtype=np.float32)
    grid_w = np.arange(grid_size[1], dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size[0], grid_size[1]])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


class FixedPositionalEncoder(nn.Module):
    def __init__(self, pos_embed):
        super().__init__()
        self.positions = pos_embed

    def forward(self, x, padding_mask):
        return self.positions


class AltBlock(nn.Module):
    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        mlp_drop=0.0,
        post_mlp_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        layer_norm_first=True,
        ffn_targets=False,
        cosine_attention=False,
    ):
        super().__init__()

        self.layer_norm_first = layer_norm_first
        self.ffn_targets = ffn_targets

        from timm.models.vision_transformer import DropPath, Mlp

        self.norm1 = norm_layer(dim)
        self.attn = AltAttention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            cosine_attention=cosine_attention,
        )

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=mlp_drop,
        )
        self.post_mlp_dropout = nn.Dropout(post_mlp_drop, inplace=False)

    def forward(self, x, padding_mask=None, alibi_bias=None):
        if self.layer_norm_first:
            x = x + self.drop_path(self.attn(self.norm1(x), padding_mask, alibi_bias))
            r = x = self.mlp(self.norm2(x))
            t = x
            x = r + self.drop_path(self.post_mlp_dropout(x))
            if not self.ffn_targets:
                t = x
        else:
            x = x + self.drop_path(self.attn(x, padding_mask, alibi_bias))
            r = x = self.norm1(x)
            x = self.mlp(x)
            t = x
            x = self.norm2(r + self.drop_path(self.post_mlp_dropout(x)))
            if not self.ffn_targets:
                t = x

        return x, t


class AltAttention(nn.Module):
    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        cosine_attention=False,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.cosine_attention = cosine_attention

        if cosine_attention:
            self.logit_scale = nn.Parameter(
                torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True
            )

    def forward(self, x, padding_mask=None, alibi_bias=None):
        B, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)  # qkv x B x H x L x D
        )
        q, k, v = (
            qkv[0],
            qkv[1],
            qkv[2],
        )  # make torchscript happy (cannot use tensor as tuple)

        dtype = q.dtype

        if self.cosine_attention:
            # cosine attention
            attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
            logit_scale = torch.clamp(
                self.logit_scale, max=torch.log(torch.tensor(1.0 / 0.01))
            ).exp()
            attn = attn * logit_scale
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)

        if alibi_bias is not None:
            attn = attn.type_as(alibi_bias)
            attn[:, : alibi_bias.size(1)] += alibi_bias

        if padding_mask is not None and padding_mask.any():
            attn = attn.masked_fill(
                padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                float("-inf"),
            )

        attn = attn.softmax(dim=-1, dtype=torch.float32).to(dtype=dtype)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2)
        x = x.reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
```
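A small sketch of the fixed positional table these helpers produce (hypothetical `eat_local` package again): `eat_model.py` requests a grid of `max_length = 768` time steps by `128 / 16 = 8` frequency patches.

```python
from eat_local.model_core import get_2d_sincos_pos_embed_flexible  # hypothetical local package name

pos = get_2d_sincos_pos_embed_flexible(768, (768, 8), cls_token=False)
print(pos.shape)  # (6144, 768): one 768-dim sin-cos embedding per grid position
```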
modeling_eat.py
ADDED
@@ -0,0 +1,18 @@

```python
# modeling_eat.py

from transformers import PreTrainedModel
from .configuration_eat import EATConfig
from .eat_model import EAT

class EATModel(PreTrainedModel):
    config_class = EATConfig

    def __init__(self, config: EATConfig):
        super().__init__(config)
        self.model = EAT(config)

    def forward(self, *args, **kwargs):
        return self.model(*args, **kwargs)

    def extract_features(self, x):
        return self.model.extract_features(x)
```
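An offline alternative to `trust_remote_code` (a sketch, assuming the hypothetical `eat_local` package): register the custom classes with the auto factories, mirroring locally what the `auto_map` entry in `config.json` does for the hosted repo.

```python
from transformers import AutoConfig, AutoModel

from eat_local.configuration_eat import EATConfig  # hypothetical local package name
from eat_local.modeling_eat import EATModel

AutoConfig.register("eat", EATConfig)
AutoModel.register(EATConfig, EATModel)

model = AutoModel.from_config(EATConfig())  # randomly initialized EATModel, pretrain variant
```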