Commit bdc1770 (verified) · AbstractPhil committed · 1 Parent(s): c605588

Updated the model to correctly reflect the fixes.

Files changed (1):
  1. model.py +265 -171
model.py CHANGED
@@ -1,6 +1,7 @@
 """
 PentachoraViT: Vision Transformer with Pentachoron Geometric Structure
 Enhanced with Geometric Attention for improved head cohesion and generalization
+FIXED: CLS tokens now properly reference and utilize vocabulary embeddings
 """
 
 import torch
@@ -35,7 +36,7 @@ class PentachoraConfig:
     aux_loss_weight: float = 0.3
     geo_loss_weight: float = 0.1
     vocab: Optional[Any] = None
-
+
     @property
     def num_patches(self) -> int:
         return (self.img_size // self.patch_size) ** 2
@@ -74,33 +75,33 @@ class GeometricConfig:
 
 class GeometricNavigator(nn.Module):
     """Maps inputs to geometric regions in 4D space."""
-
+
     def __init__(self, input_dim: int, num_regions: int, config: GeometricConfig):
         super().__init__()
         self.input_dim = input_dim
         self.num_regions = num_regions
         self.config = config
-
+
         self.to_nav = nn.Linear(input_dim, 4, bias=False)
         self.vertex_w = nn.Parameter(torch.zeros(num_regions, 5))
-
+
         # Initialize geometry after module is created
         self.register_parameter('D', None)
         self.register_parameter('S', None)
-
+
     def _lazy_init_geometry(self, device):
         """Initialize geometry on first forward pass."""
         if self.D is not None:
             return
-
+
         base = perfect_4simplex(device)
-
+
         D = torch.zeros(self.num_regions, 5, 4, device=device)
         S = torch.zeros(self.num_regions, 5, 4, device=device)
-
+
         for r in range(self.num_regions):
             D[r] = base + self.config.jitter * torch.randn_like(base)
-
+
             theta = torch.tensor(0.27 + 0.05 * (r % self.config.rotate_cycle), device=device)
             rot = torch.eye(4, device=device)
             c, s_val = torch.cos(theta), torch.sin(theta)
@@ -108,67 +109,67 @@ class GeometricNavigator(nn.Module):
             rot[1, 0] = s_val; rot[1, 1] = c
             S[r] = (base @ rot) + self.config.shift
             S[r] += self.config.jitter * torch.randn_like(S[r])
-
+
         self.D = nn.Parameter(D)
         self.S = nn.Parameter(S)
-
+
     def navigate(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
         """Navigate inputs through geometric space."""
         self._lazy_init_geometry(x.device)
-
+
         nav_x = self.to_nav(x)
         nav_x_exp = nav_x[:, None, None, :]
         D_exp = self.D[None, :, :, :]
-
+
         d_disp = torch.norm(nav_x_exp - D_exp, dim=-1)
         s_disp = -softmin_over_last(d_disp, self.config.softmin_tau)
-
+
         w = F.softmax(self.vertex_w, dim=1)
         phase_scores = []
-
+
         for phase in self.config.phases:
             phase_tensor = torch.tensor(phase, device=x.device)
             ct = torch.cos(phase_tensor)
             st = torch.sin(phase_tensor)
-
+
             Vt = ct * self.D + st * self.S
             w_expanded = w.unsqueeze(-1)
             Vt_mean = Vt.mean(dim=1, keepdim=True)
             Vt = (1.0 - w_expanded) * Vt + w_expanded * Vt_mean
-
+
             Vt_exp = Vt[None, :, :, :]
             d_ribbon = torch.norm(nav_x_exp - Vt_exp, dim=-1)
             s_ribbon = -softmin_over_last(d_ribbon, self.config.softmin_tau)
             phase_scores.append(s_ribbon)
-
+
         s_ribbon = torch.stack(phase_scores).mean(dim=0)
         scores = self.config.fuse_alpha * s_ribbon + (1 - self.config.fuse_alpha) * s_disp
-
+
         diagnostics = {
             'dispatcher_scores': s_disp.detach(),
             'ribbon_scores': s_ribbon.detach()
         }
-
+
         return {'scores': scores, 'diagnostics': diagnostics}
 
 class GeometricAttention(nn.Module):
     """Multi-head geometric attention with Q-K alignment."""
-
+
     def __init__(self, dim: int, num_heads: int = 8, num_regions: Optional[int] = None,
                  config: Optional[GeometricConfig] = None, dropout: float = 0.0):
         super().__init__()
         self.dim = dim
         self.num_heads = num_heads
         self.head_dim = dim // num_heads
-
+
         if num_regions is None:
             num_regions = min(self.head_dim, 16)
         if config is None:
             config = GeometricConfig()
-
+
         self.config = config
         self.to_qkv = nn.Linear(dim, dim * 3, bias=False)
-
+
         self.q_navigators = nn.ModuleList([
             GeometricNavigator(self.head_dim, num_regions, config)
             for _ in range(num_heads)
@@ -177,53 +178,53 @@ class GeometricAttention(nn.Module):
             GeometricNavigator(self.head_dim, num_regions, config)
             for _ in range(num_heads)
         ])
-
+
         self.out_proj = nn.Linear(dim, dim)
         self.dropout = nn.Dropout(dropout)
-
+
     def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None,
                 return_diagnostics: bool = False) -> Tuple[torch.Tensor, Optional[Dict]]:
         B, T, D = x.shape
-
+
         qkv = self.to_qkv(x)
         q, k, v = qkv.chunk(3, dim=-1)
-
+
         q = q.reshape(B, T, self.num_heads, self.head_dim).transpose(1, 2)
         k = k.reshape(B, T, self.num_heads, self.head_dim).transpose(1, 2)
         v = v.reshape(B, T, self.num_heads, self.head_dim).transpose(1, 2)
-
+
         outputs = []
         all_diagnostics = [] if return_diagnostics else None
-
+
         for h in range(self.num_heads):
             q_h_flat = q[:, h].reshape(B * T, self.head_dim)
             k_h_flat = k[:, h].reshape(B * T, self.head_dim)
-
+
             q_nav = self.q_navigators[h].navigate(q_h_flat)
             k_nav = self.k_navigators[h].navigate(k_h_flat)
-
+
             q_scores = q_nav['scores'].reshape(B, T, -1)
             k_scores = k_nav['scores'].reshape(B, T, -1)
-
+
             attn = torch.bmm(q_scores, k_scores.transpose(1, 2))
             attn = attn / math.sqrt(q_scores.size(-1))
-
+
             if mask is not None:
                 attn = attn.masked_fill(mask.unsqueeze(1) == 0, -1e9)
-
+
             attn = F.softmax(attn, dim=-1)
             attn = self.dropout(attn)
-
+
             out = torch.bmm(attn, v[:, h])
             outputs.append(out)
-
+
             if return_diagnostics:
                 all_diagnostics.append({'q': q_nav['diagnostics'], 'k': k_nav['diagnostics']})
-
+
         output = torch.stack(outputs, dim=1).transpose(1, 2).reshape(B, T, D)
         output = self.out_proj(output)
         output = self.dropout(output)
-
+
         if return_diagnostics:
             return output, {'head_diagnostics': all_diagnostics}
         return output, None
@@ -249,103 +250,156 @@ class DropPath(nn.Module):
         return output
 
 # ============================================
-# HIERARCHICAL CLS WITH PENTACHORA
+# HIERARCHICAL CLS WITH PENTACHORA (FIXED)
 # ============================================
 
 class HierarchicalPentachoronCLS(nn.Module):
     """
     Hierarchical CLS structure with pentachoron geometry.
-    Creates global, vertex-level, and class-specific representations.
+    FIXED: Now properly uses vocabulary embeddings for CLS tokens.
     """
     def __init__(self, dim: int, vocab_dim: int, num_classes: int = 100):
         super().__init__()
         self.dim = dim  # Model's internal dimension
         self.vocab_dim = vocab_dim  # Vocabulary's dimension
         self.num_classes = num_classes
-
-        # Hierarchical CLS tokens (in model dimension)
-        self.global_cls = nn.Parameter(torch.randn(1, 1, dim) * 0.02)
-        self.vertex_cls = nn.Parameter(torch.randn(1, 5, dim) * 0.02)
-
-        # Class-specific pentachora (in vocabulary dimension)
+
+        # Class-specific pentachora from vocabulary (in vocabulary dimension)
         self.class_pentachora = nn.Parameter(torch.randn(num_classes, 5, vocab_dim) * 0.02)
 
-        # Projection layer to align vocab_dim with model dim if they differ
+        # Projection from vocabulary dimension to model dimension
        if vocab_dim != dim:
-            self.vocab_projection = nn.Linear(vocab_dim, dim)
+            self.vocab_to_model = nn.Linear(vocab_dim, dim)
        else:
-            self.vocab_projection = nn.Identity()
+            self.vocab_to_model = nn.Identity()
 
-        # Aggregation layers
-        self.vertex_to_global = nn.Linear(dim * 5, dim)
-        self.norm = nn.LayerNorm(dim)
-
-    def forward(self, batch_size: int) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Generate CLS tokens for batch."""
-        global_cls = self.global_cls.expand(batch_size, -1, -1)
-        vertex_cls = self.vertex_cls.expand(batch_size, -1, -1)
+        # Learnable aggregation weights for creating global CLS from vertices
+        self.vertex_weights = nn.Parameter(torch.ones(5) / 5)
+
+        # Optional learnable offset for global CLS
+        self.global_offset = nn.Parameter(torch.zeros(1, 1, dim))
+
+        # Layer norms
+        self.vertex_norm = nn.LayerNorm(dim)
+        self.global_norm = nn.LayerNorm(dim)
+
+    def forward(self, batch_size: int, class_indices: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Generate CLS tokens for batch.
+
+        Args:
+            batch_size: Batch size
+            class_indices: Optional class indices for class-specific initialization
+
+        Returns:
+            global_cls: [B, 1, D] - Global CLS tokens
+            vertex_cls: [B, 5, D] - Vertex CLS tokens
+        """
+        if class_indices is not None and class_indices.shape[0] == batch_size:
+            # Use class-specific pentachora when class indices are provided
+            # This would typically be used during training with labels
+            vertex_cls_vocab = self.class_pentachora[class_indices]  # [B, 5, vocab_dim]
+        else:
+            # Use mean of all class pentachora when no specific classes provided
+            # This is used during inference or when class is unknown
+            vertex_cls_vocab = self.class_pentachora.mean(dim=0, keepdim=True)  # [1, 5, vocab_dim]
+            vertex_cls_vocab = vertex_cls_vocab.expand(batch_size, -1, -1)  # [B, 5, vocab_dim]
+
+        # Project from vocabulary dimension to model dimension
+        vertex_cls = self.vocab_to_model(vertex_cls_vocab)  # [B, 5, dim]
+        vertex_cls = self.vertex_norm(vertex_cls)
+
+        # Create global CLS as weighted combination of vertices
+        weights = F.softmax(self.vertex_weights, dim=0)
+        global_cls = torch.einsum('bvd,v->bd', vertex_cls, weights).unsqueeze(1)  # [B, 1, dim]
+        global_cls = global_cls + self.global_offset
+        global_cls = self.global_norm(global_cls)
         return global_cls, vertex_cls
-
-    def aggregate_vertices(self, vertex_cls: torch.Tensor) -> torch.Tensor:
-        """Aggregate vertex representations to global."""
-        B = vertex_cls.shape[0]
-        flattened = vertex_cls.reshape(B, -1)
-        aggregated = self.vertex_to_global(flattened).unsqueeze(1)
-        return self.norm(aggregated)
+
+    def get_class_prototypes(self) -> torch.Tensor:
+        """
+        Get class prototypes in model dimension.
+
+        Returns:
+            prototypes: [num_classes, dim] - Class prototype vectors
+        """
+        # Project class pentachora to model dimension
+        pentachora_model = self.vocab_to_model(self.class_pentachora)  # [C, 5, dim]
+
+        # Aggregate vertices to get class prototypes
+        weights = F.softmax(self.vertex_weights, dim=0)
+        prototypes = torch.einsum('cvd,v->cd', pentachora_model, weights)  # [C, dim]
+
+        return prototypes
 
 # ============================================
-# GEOMETRIC PROJECTION LAYER
+# GEOMETRIC PROJECTION LAYER (ENHANCED)
 # ============================================
 
 class GeometricProjection(nn.Module):
-    """Project patches onto pentachoron geometry."""
+    """
+    Project patches onto pentachoron geometry.
+    ENHANCED: Now provides better integration with vocabulary.
+    """
     def __init__(self, dim: int, vocab_dim: int, num_classes: int = 100, dropout: float = 0.1):
         super().__init__()
         self.dim = dim  # Model dimension
         self.vocab_dim = vocab_dim  # Vocabulary dimension
         self.num_classes = num_classes
+
+        # Projection from model dim to vocab dim for alignment
+        self.to_vocab_space = nn.Linear(dim, vocab_dim)
 
-        # Separate projection for each vertex (project from model dim to vocab dim for alignment)
+        # Vertex-specific projections for fine-grained alignment
         self.vertex_projections = nn.ModuleList([
-            nn.Linear(dim, vocab_dim, bias=False) for _ in range(5)
+            nn.Linear(vocab_dim, vocab_dim, bias=False) for _ in range(5)
         ])
 
+        # Temperature for alignment scores
+        self.temperature = nn.Parameter(torch.ones(1))
+
         self.norm = nn.LayerNorm(dim)
         self.dropout = nn.Dropout(dropout)
-
+
     def forward(self, patches: torch.Tensor, pentachora: torch.Tensor) -> torch.Tensor:
         """
         Compute alignment between patches and class pentachora.
-
+
         Args:
-            patches: [B, N, D] - patch embeddings
-            pentachora: [C, 5, vocab_dim] - class pentachora
-
+            patches: [B, N, D] - patch embeddings in model dimension
+            pentachora: [C, 5, vocab_dim] - class pentachora in vocabulary dimension
+
         Returns:
             [B, N, C] - alignment scores
         """
         B, N, D = patches.shape
         C = pentachora.shape[0]
-
+
+        # Normalize patches
         patches = self.norm(patches)
 
+        # Project patches to vocabulary space
+        patches_vocab = self.to_vocab_space(patches)  # [B, N, vocab_dim]
+        patches_vocab = F.normalize(patches_vocab, dim=-1)
+
         # Compute alignment with each vertex
         alignments = []
         for v in range(5):
-            # Project patches through vertex-specific projection
-            patches_proj = self.vertex_projections[v](patches)
-            patches_proj = F.normalize(patches_proj, dim=-1)
+            # Apply vertex-specific transformation
+            patches_v = self.vertex_projections[v](patches_vocab)
+            patches_v = F.normalize(patches_v, dim=-1)
 
             # Get vertex v of all classes
-            vertex_v = F.normalize(pentachora[:, v, :], dim=-1)
+            vertex_v = F.normalize(pentachora[:, v, :], dim=-1)  # [C, vocab_dim]
 
             # Compute alignment scores
-            alignment = torch.matmul(patches_proj, vertex_v.T)
+            alignment = torch.matmul(patches_v, vertex_v.T) / self.temperature  # [B, N, C]
             alignments.append(alignment)
-
+
         # Average alignments across vertices
-        alignments = torch.stack(alignments, dim=-1).mean(dim=-1)
-
+        alignments = torch.stack(alignments, dim=-1).mean(dim=-1)  # [B, N, C]
+
         return self.dropout(alignments)
 
 # ============================================
@@ -359,13 +413,13 @@ class MLP(nn.Module):
         super().__init__()
         out_features = out_features or in_features
         hidden_features = hidden_features or in_features
-
+
         self.fc1 = nn.Linear(in_features, hidden_features)
         self.act = nn.GELU()
         self.drop1 = nn.Dropout(dropout)
         self.fc2 = nn.Linear(hidden_features, out_features)
         self.drop2 = nn.Dropout(dropout)
-
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.fc1(x)
         x = self.act(x)
@@ -385,7 +439,7 @@ class PentachoronViTBlock(nn.Module):
                  drop_path: float = 0.):
         super().__init__()
         self.norm1 = nn.LayerNorm(dim)
-
+
         # Use GeometricAttention for structured layers, standard for others
         if use_mesh:
             self.attn = GeometricAttention(
@@ -398,15 +452,15 @@
         else:
             # Standard multi-head attention for later layers
             self.attn = nn.MultiheadAttention(dim, heads, dropout=attn_dropout, batch_first=True)
-
+
         self.use_mesh = use_mesh
         self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-
+
         self.norm2 = nn.LayerNorm(dim)
         mlp_hidden = int(dim * mlp_ratio)
         self.mlp = MLP(in_features=dim, hidden_features=mlp_hidden, dropout=dropout)
         self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
-
+
     def forward(self, x: torch.Tensor, preserve_structure: bool = True) -> torch.Tensor:
         if self.use_mesh:
             # GeometricAttention
@@ -417,7 +471,7 @@
             normalized = self.norm1(x)
             attn_out, _ = self.attn(normalized, normalized, normalized)
             x = x + self.drop_path1(attn_out)
-
+
         x = x + self.drop_path2(self.mlp(self.norm2(x)))
         return x
 
@@ -433,10 +487,10 @@ class PatchEmbed(nn.Module):
         self.img_size = img_size
         self.patch_size = patch_size
         self.num_patches = (img_size // patch_size) ** 2
-
+
         self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
         self.norm = nn.LayerNorm(embed_dim)
-
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = self.proj(x)
         x = rearrange(x, 'b c h w -> b (h w) c')
@@ -444,60 +498,61 @@
         return x
 
 # ============================================
-# PENTACHORA VISION TRANSFORMER
+# PENTACHORA VISION TRANSFORMER (FIXED)
 # ============================================
 
 class PentachoraViT(nn.Module):
     """
     Vision Transformer with pentachoron-based hierarchical CLS tokens
     and geometric vocabulary integration.
+    FIXED: CLS tokens now properly reference vocabulary embeddings.
     """
     def __init__(self, config: Optional[PentachoraConfig] = None, **kwargs):
         super().__init__()
-
+
         # Use config or kwargs
         if config is not None:
             cfg = config
         else:
            cfg = PentachoraConfig(**kwargs)
-
+
        self.config = cfg
        self.num_classes = cfg.num_classes
        self.dim = cfg.dim
        self.depth = cfg.depth
        self.preserve_structure_until_layer = cfg.preserve_structure_until_layer
-
-        # Set vocabulary dimension - from config, kwargs, or default to model dim
+
+        # Set vocabulary dimension
        if cfg.vocab_dim is not None:
            self.vocab_dim = cfg.vocab_dim
        elif 'vocab_dim' in kwargs:
            self.vocab_dim = kwargs['vocab_dim']
        else:
            self.vocab_dim = cfg.dim
-
+
        # Patch embedding
        self.patch_embed = PatchEmbed(
            cfg.img_size, cfg.patch_size, 3, cfg.dim
        )
        num_patches = self.patch_embed.num_patches
-
+
        # Positional embedding
        self.pos_embed = nn.Parameter(torch.randn(1, num_patches, cfg.dim) * 0.02)
        self.pos_drop = nn.Dropout(cfg.dropout_rate)
-
+
        # CLS tokens with pentachoron structure
        self.cls_tokens = HierarchicalPentachoronCLS(cfg.dim, self.vocab_dim, cfg.num_classes)
-
-        # Geometric projection layer - CREATE BEFORE vocab init
+
+        # Geometric projection layer
        self.geometric_proj = GeometricProjection(cfg.dim, self.vocab_dim, cfg.num_classes, cfg.dropout_rate)
-
-        # Initialize from vocabulary AFTER creating all components
+
+        # Initialize from vocabulary if provided
        if cfg.vocab is not None:
            self._init_from_vocab(cfg.vocab)
-
+
        # Stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, cfg.drop_path_rate, cfg.depth)]
-
+
        # Transformer blocks with geometric attention
        self.blocks = nn.ModuleList([
            PentachoronViTBlock(
@@ -511,17 +566,26 @@ class PentachoraViT(nn.Module):
            )
            for i in range(cfg.depth)
        ])
-
+
        # Final norm
        self.norm = nn.LayerNorm(cfg.dim)
-
+
        # Classification heads
-        self.head = nn.Linear(cfg.dim, cfg.num_classes)
-        self.head_aux = nn.Linear(cfg.dim * 5, cfg.num_classes)
+        # Primary head uses prototypes for classification
+        self.use_prototype_classifier = True
+        if self.use_prototype_classifier:
+            # No learnable parameters - uses class prototypes directly
+            self.head = None
+        else:
+            # Traditional linear head
+            self.head = nn.Linear(cfg.dim, cfg.num_classes)
 
+        # Auxiliary head for vertex tokens
+        self.head_aux = nn.Linear(cfg.dim * 5, cfg.num_classes)
+
        # Initialize weights
        self.apply(self._init_weights)
-
+
    def _init_weights(self, m: nn.Module):
        """Initialize model weights."""
        if isinstance(m, nn.Linear):
@@ -535,79 +599,81 @@ class PentachoraViT(nn.Module):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
-
+
    def _init_from_vocab(self, vocab):
        """Initialize class pentachora from geometric vocabulary."""
        try:
            print("Initializing pentachora from vocabulary...")
-
+
            if not hasattr(vocab, 'encode_batch'):
                print("Vocabulary provided but encode_batch method not found, using random initialization")
                return
-
+
            # Get CIFAR-100 class names
            class_names = self._get_cifar100_classes()
-
+
            # Generate pentachora for all classes
            pentachora_list = vocab.encode_batch(class_names[:self.num_classes], generate=True)
            pentachora = np.stack(pentachora_list, axis=0)
-
+
            # Get actual dimensions from the encoded data
            actual_vocab_dim = pentachora.shape[-1]
-
+
            print(f"Encoded pentachora shape: {pentachora.shape}")
            print(f"Detected vocabulary dimension: {actual_vocab_dim}")
-
+
            # Validate basic shape requirements
            if pentachora.shape[0] != self.num_classes or pentachora.shape[1] != 5:
                print(f"Invalid shape: expected ({self.num_classes}, 5, ?), got {pentachora.shape}")
                print("Using random initialization")
                return
-
-            # Update all components to use the actual vocabulary dimension
+
+            # Update vocabulary dimension
            self.vocab_dim = actual_vocab_dim
            self.cls_tokens.vocab_dim = actual_vocab_dim
            self.geometric_proj.vocab_dim = actual_vocab_dim
-
+
            # Replace class_pentachora with the loaded vocabulary
            self.cls_tokens.class_pentachora = nn.Parameter(
                torch.tensor(pentachora, dtype=torch.float32)
            )
-
+
            # Update/create projection layer if dimensions differ
            if actual_vocab_dim != self.dim:
-                self.cls_tokens.vocab_projection = nn.Linear(actual_vocab_dim, self.dim)
+                self.cls_tokens.vocab_to_model = nn.Linear(actual_vocab_dim, self.dim)
            else:
-                self.cls_tokens.vocab_projection = nn.Identity()
-
-            # Rebuild geometric projection layers with correct dimensions
+                self.cls_tokens.vocab_to_model = nn.Identity()
+
+            # Rebuild geometric projection components
+            self.geometric_proj.to_vocab_space = nn.Linear(self.dim, actual_vocab_dim)
            self.geometric_proj.vertex_projections = nn.ModuleList([
-                nn.Linear(self.dim, actual_vocab_dim, bias=False) for _ in range(5)
+                nn.Linear(actual_vocab_dim, actual_vocab_dim, bias=False) for _ in range(5)
            ])
-
+
            # Re-initialize the new layers
+            nn.init.xavier_uniform_(self.geometric_proj.to_vocab_space.weight)
            for proj in self.geometric_proj.vertex_projections:
                nn.init.xavier_uniform_(proj.weight)
            if actual_vocab_dim != self.dim:
-                nn.init.xavier_uniform_(self.cls_tokens.vocab_projection.weight)
+                nn.init.xavier_uniform_(self.cls_tokens.vocab_to_model.weight)
-
+
            print(f"✓ Successfully initialized {self.num_classes} class pentachora from vocabulary")
            print(f"  Vocabulary dimension: {actual_vocab_dim}")
            print(f"  Model internal dimension: {self.dim}")
-            print(f"  Projection: {actual_vocab_dim} {self.dim}")
+            print(f"  CLS tokens now reference vocabulary embeddings")
-
+
        except Exception as e:
            print(f"Error initializing from vocabulary: {e}")
            print("Using random initialization")
-
+
    def _get_cifar100_classes(self):
        """Get CIFAR-100 class names."""
        return [
-            'apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle',
-            'bicycle', 'bottle', 'bowl', 'boy', 'bridge', 'bus', 'butterfly', 'camel',
-            'can', 'castle', 'caterpillar', 'cattle', 'chair', 'chimpanzee', 'clock',
-            'cloud', 'cockroach', 'couch', 'crab', 'crocodile', 'cup', 'dinosaur',
-            'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster',
+            'apple', 'aquarium_fish', 'baby', 'bear', 'beaver', 'bed', 'bee', 'beetle',
+            'bicycle', 'bottle', 'bowl', 'boy', 'bridge', 'bus', 'butterfly', 'camel',
+            'can', 'castle', 'caterpillar', 'cattle', 'chair', 'chimpanzee', 'clock',
+            'cloud', 'cockroach', 'couch', 'crab', 'crocodile', 'cup', 'dinosaur',
+            'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster',
            'house', 'kangaroo', 'keyboard', 'lamp', 'lawn_mower', 'leopard', 'lion',
            'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse',
            'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear',
@@ -618,59 +684,86 @@ class PentachoraViT(nn.Module):
            'tank', 'telephone', 'television', 'tiger', 'tractor', 'train', 'trout',
            'tulip', 'turtle', 'wardrobe', 'whale', 'willow_tree', 'wolf', 'woman', 'worm'
        ]
-
-    def forward_features(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
-        """Extract features from input."""
-        B = x.shape[0]
+
+    def forward_features(self, x: torch.Tensor, class_indices: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+        """
+        Extract features from input.
 
+        Args:
+            x: Input images [B, 3, H, W]
+            class_indices: Optional class indices for class-aware CLS tokens [B]
+        """
+        B = x.shape[0]
+
        # Patch embedding
        x = self.patch_embed(x)
        x = x + self.pos_embed
        x = self.pos_drop(x)
-
-        # Get hierarchical CLS tokens
-        global_cls, vertex_cls = self.cls_tokens(B)
-
+
+        # Get hierarchical CLS tokens (now properly using vocabulary)
+        global_cls, vertex_cls = self.cls_tokens(B, class_indices)
+
        # Concatenate CLS tokens with patches
        x = torch.cat([global_cls, vertex_cls, x], dim=1)
-
+
        # Apply transformer blocks
        for i, block in enumerate(self.blocks):
            preserve = i < self.preserve_structure_until_layer
            x = block(x, preserve_structure=preserve)
-
+
        # Apply final norm
        x = self.norm(x)
-
+
        # Split tokens
        global_cls = x[:, 0]
        vertex_cls = x[:, 1:6]
        patches = x[:, 6:]
-
+
        return {
            'global_cls': global_cls,
            'vertex_cls': vertex_cls,
            'patches': patches
        }
-
-    def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
-        """Forward pass through the model."""
-        features = self.forward_features(x)
+
+    def forward(self, x: torch.Tensor, targets: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+        """
+        Forward pass through the model.
 
-        # Primary classification using global CLS
-        logits = self.head(features['global_cls'])
+        Args:
+            x: Input images [B, 3, H, W]
+            targets: Optional target labels for class-aware processing [B]
+        """
+        # During training, use target labels for class-specific CLS initialization
+        class_indices = targets if self.training and targets is not None else None
 
+        features = self.forward_features(x, class_indices)
+
+        # Primary classification using prototype matching
+        if self.use_prototype_classifier:
+            # Get class prototypes from vocabulary
+            prototypes = self.cls_tokens.get_class_prototypes()  # [C, D]
+            prototypes = F.normalize(prototypes, dim=-1)
+
+            # Normalize global CLS tokens
+            global_cls_norm = F.normalize(features['global_cls'], dim=-1)  # [B, D]
+
+            # Compute similarity to prototypes
+            logits = torch.matmul(global_cls_norm, prototypes.T) * 20.0  # Scale for better gradients
+        else:
+            # Traditional linear classification
+            logits = self.head(features['global_cls'])
+
        # Auxiliary classification using vertex tokens
        B = features['vertex_cls'].shape[0]
        vertex_flat = features['vertex_cls'].reshape(B, -1)
        aux_logits = self.head_aux(vertex_flat)
-
+
        # Geometric alignment scores
        geometric_alignments = self.geometric_proj(
-            features['patches'],
+            features['patches'],
            self.cls_tokens.class_pentachora
        )
-
+
        return {
            'logits': logits,
            'aux_logits': aux_logits,
@@ -692,24 +785,24 @@ class PentachoraLoss(nn.Module):
        self.aux_weight = aux_weight
        self.geo_weight = geo_weight
        self.criterion = nn.CrossEntropyLoss(label_smoothing=smoothing)
-
+
    def forward(self, outputs: Dict[str, torch.Tensor], targets: torch.Tensor) -> torch.Tensor:
        """Compute combined loss."""
        # Primary classification loss
        loss = self.criterion(outputs['logits'], targets)
-
+
        # Auxiliary loss from vertex tokens
        if 'aux_logits' in outputs and self.aux_weight > 0:
            aux_loss = self.criterion(outputs['aux_logits'], targets)
            loss = loss + self.aux_weight * aux_loss
-
+
        # Geometric alignment loss
        if 'geometric_alignments' in outputs and self.geo_weight > 0:
            # Average over patches
            geo_logits = outputs['geometric_alignments'].mean(dim=1)
            geo_loss = self.criterion(geo_logits, targets)
            loss = loss + self.geo_weight * geo_loss
-
+
        return loss
 
 # ============================================
@@ -718,8 +811,8 @@ class PentachoraLoss(nn.Module):
 
 MODEL_CONFIGS = {
     'pentachora_spark': PentachoraConfig(
-        dim=64, depth=5, heads=4, mlp_ratio=4.0,
-        preserve_structure_until_layer=2,
+        dim=100, depth=5, heads=4, mlp_ratio=4.0,
+        preserve_structure_until_layer=1,
         dropout_rate=0.0, drop_path_rate=0.0
     ),
     'pentachora_tiny': PentachoraConfig(
@@ -749,31 +842,32 @@ def create_pentachora_vit(variant: str = 'pentachora_small',
                          **kwargs) -> PentachoraViT:
     """
     Create PentachoraViT model.
-
+
     Args:
         variant: Model variant name
         pretrained: Whether to load pretrained weights
         **kwargs: Override config parameters (including vocab_dim)
-
+
     Returns:
         PentachoraViT model
     """
     if variant not in MODEL_CONFIGS:
         raise ValueError(f"Unknown variant: {variant}. Choose from {list(MODEL_CONFIGS.keys())}")
-
+
     config = MODEL_CONFIGS[variant]
-
+
     # Override config with kwargs
     for key, value in kwargs.items():
         setattr(config, key, value)
-
+
     model = PentachoraViT(config)
-
+
     if pretrained:
         warnings.warn("Pretrained weights not available yet")
-
+
     return model
 
+# Convenience functions for each variant
 def pentachora_vit_spark(pretrained: bool = False, **kwargs) -> PentachoraViT:
     """Create spark variant (smallest)."""
     return create_pentachora_vit('pentachora_spark', pretrained=pretrained, **kwargs)
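
Usage sketch (not part of the commit): a minimal example of how the updated entry points compose, assuming this file is saved as model.py, that torch is installed, that inputs are 32x32 CIFAR-100-style images, and that PentachoraLoss accepts aux_weight/geo_weight keywords matching its attributes; treat it as an illustration of the new forward(x, targets) signature, not a verified script.

import torch
from model import create_pentachora_vit, PentachoraLoss  # assumes this file is importable as model.py

model = create_pentachora_vit('pentachora_spark')            # dim=100, depth=5 per MODEL_CONFIGS above
criterion = PentachoraLoss(aux_weight=0.3, geo_weight=0.1)   # assumed kwargs mirroring the config weights

images = torch.randn(8, 3, 32, 32)                           # assumed CIFAR-100-sized inputs
targets = torch.randint(0, 100, (8,))

# Training: passing targets lets forward() select class-specific pentachora for the CLS tokens
model.train()
outputs = model(images, targets)                             # dict with 'logits', 'aux_logits', 'geometric_alignments'
loss = criterion(outputs, targets)
loss.backward()

# Inference: without targets the CLS tokens fall back to the mean pentachoron
model.eval()
with torch.no_grad():
    preds = model(images)['logits'].argmax(dim=-1)

With the prototype classifier path enabled, 'logits' come from cosine similarity between the global CLS token and the vocabulary-derived class prototypes (scaled by 20.0), so no separate linear head is trained for the primary output.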