Update app.py
app.py CHANGED
@@ -171,7 +171,10 @@ class MultiScaleRetention(nn.Module):
             batch_size, seq_len, self.hidden_size
         )
 
-        # Group norm
+        # ✅ Group norm - ensure it's on the correct device
+        if not next(self.group_norm.parameters()).is_cuda and retention_states.is_cuda:
+            self.group_norm = self.group_norm.to(retention_states.device)
+
         retention_states = self.group_norm(
             retention_states.transpose(1, 2)
         ).transpose(1, 2)
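The guard added above moves self.group_norm to the input's device only when the activations are on CUDA and the layer's parameters are not, before the usual transpose round-trip (GroupNorm normalizes over the channel dimension, which sits at dim 1). A minimal, self-contained sketch of that pattern; the sizes and variable names here are illustrative and not taken from app.py:

import torch
import torch.nn as nn

# Illustrative sizes; the real hidden size and head count in app.py may differ.
hidden_size, num_heads = 64, 4
group_norm = nn.GroupNorm(num_groups=num_heads, num_channels=hidden_size)

retention_states = torch.randn(2, 16, hidden_size)  # (batch, seq_len, hidden)
if torch.cuda.is_available():
    retention_states = retention_states.cuda()

# Same lazy device fix as the hunk above: move the norm only if the input
# is on CUDA but the layer's parameters are not.
if not next(group_norm.parameters()).is_cuda and retention_states.is_cuda:
    group_norm = group_norm.to(retention_states.device)

# GroupNorm expects channels at dim 1, hence the transpose round-trip.
retention_states = group_norm(retention_states.transpose(1, 2)).transpose(1, 2)
print(retention_states.shape)  # torch.Size([2, 16, 64])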
@@ -274,6 +277,15 @@ class HierarchicalRetention(nn.Module):
         if past_key_values is not None:
             past_key_value = past_key_values
 
+        # ✅ Ensure all submodules are on correct device
+        target_device = hidden_states.device
+        if not next(self.short_proj.parameters()).is_cuda and hidden_states.is_cuda:
+            self.short_proj = self.short_proj.to(target_device)
+            self.medium_proj = self.medium_proj.to(target_device)
+            self.long_proj = self.long_proj.to(target_device)
+            self.fusion = self.fusion.to(target_device)
+            self.norm = self.norm.to(target_device)
+
         # Base Retention
         retention_output, attn_weights, past_kv = self.base_retention(
             hidden_states, attention_mask, position_ids,
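Note that the hunk above inspects only short_proj and then moves all five submodules together, which assumes they always share a device; in most setups, moving the whole parent module once with .to(device) (or constructing it on the target device) makes a per-forward check unnecessary. A small sketch of the same idea as a reusable helper; ensure_on_device and the Linear/LayerNorm stand-ins below are illustrative placeholders, not part of app.py:

import torch
import torch.nn as nn

def ensure_on_device(module: nn.Module, target_device: torch.device) -> nn.Module:
    # Hypothetical helper (not in app.py): move a module only when its
    # parameters are not already on target_device.
    param = next(module.parameters(), None)
    if param is not None and param.device != target_device:
        module = module.to(target_device)
    return module

# Stand-ins for short_proj / medium_proj / long_proj / fusion / norm.
proj = nn.Linear(64, 32)
norm = nn.LayerNorm(32)

hidden_states = torch.randn(2, 16, 64)
target_device = hidden_states.device

proj = ensure_on_device(proj, target_device)
norm = ensure_on_device(norm, target_device)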
@@ -281,9 +293,9 @@ class HierarchicalRetention(nn.Module):
         )
 
         # Hierarchical states
-        short_state = torch.zeros(batch_size, self.d_state)
-        medium_state = torch.zeros(batch_size, self.d_state)
-        long_state = torch.zeros(batch_size, self.d_state * 2)
+        short_state = torch.zeros(batch_size, self.d_state, dtype=hidden_states.dtype, device=target_device)
+        medium_state = torch.zeros(batch_size, self.d_state, dtype=hidden_states.dtype, device=target_device)
+        long_state = torch.zeros(batch_size, self.d_state * 2, dtype=hidden_states.dtype, device=target_device)
 
         hierarchical_outputs = []
 
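The rewritten state initialization allocates the recurrent buffers with the activations' dtype and device, which avoids CPU/GPU mismatches and keeps them in half precision when hidden_states is fp16/bf16. A minimal sketch of the pattern; batch_size and d_state below are made-up values, not the ones in app.py:

import torch

# Illustrative values; the real batch_size and d_state come from app.py.
batch_size, d_state = 2, 32
device = "cuda" if torch.cuda.is_available() else "cpu"
hidden_states = torch.randn(batch_size, 16, 64, dtype=torch.float16, device=device)
target_device = hidden_states.device

# States created to match the activations' dtype and device, as in the new lines.
short_state = torch.zeros(batch_size, d_state, dtype=hidden_states.dtype, device=target_device)
medium_state = torch.zeros(batch_size, d_state, dtype=hidden_states.dtype, device=target_device)
long_state = torch.zeros(batch_size, d_state * 2, dtype=hidden_states.dtype, device=target_device)

assert short_state.dtype == hidden_states.dtype
assert short_state.device == hidden_states.device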