Add 2nd patch to fix DecoderLM head not having generate()
task_heads.py: CHANGED (+17, -1)
@@ -5,6 +5,7 @@ import torch.nn.functional as F
 from typing import Optional, Union
 
 from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.generation.utils import GenerationMixin
 
 from .shared_space_config import SharedSpaceDecoderConfig
 from .shared_space_decoder import (
@@ -34,7 +35,7 @@ def create_norm_layer(hidden_size: int, config: SharedSpaceDecoderConfig) -> nn.
     raise ValueError(f"Unknown norm_type: {config.norm_type}")
 
 
-class SharedSpaceDecoderForCausalLM(SharedSpaceDecoderPreTrainedModel):
+class SharedSpaceDecoderForCausalLM(GenerationMixin, SharedSpaceDecoderPreTrainedModel):
     """
     Subspace Decoder model with a causal language modeling head.
 
@@ -207,4 +208,19 @@ class SharedSpaceDecoderForCausalLM(SharedSpaceDecoderPreTrainedModel):
             hidden_states=hidden_states if kwargs.get("output_hidden_states", False) else None,
             attentions=None,
         )
+
+    # ---- Add this minimal bridge for generation: PATCH 2 ----
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        **kwargs,
+    ):
+        # If you add a KV cache later: if past_key_values is not None, slice to input_ids[:, -1:]
+        return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+    # Optional; harmless no-op while there is no cache yet
+    def _reorder_cache(self, past_key_values, beam_idx):
+        return past_key_values
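With GenerationMixin added to the bases and the prepare_inputs_for_generation bridge in place, the stock generate() loop should be usable on this head (recent transformers releases only treat a model as generation-capable when it explicitly inherits the mixin). A minimal smoke-test sketch follows; the import paths, the default SharedSpaceDecoderConfig() construction, and the "gpt2" tokenizer are placeholder assumptions, not part of this commit:

# Hedged sketch, not part of the commit. Adjust imports, config arguments,
# and the tokenizer to whatever this repo actually uses.
import torch
from transformers import AutoTokenizer

from task_heads import SharedSpaceDecoderForCausalLM        # placeholder import path
from shared_space_config import SharedSpaceDecoderConfig    # placeholder import path

tokenizer = AutoTokenizer.from_pretrained("gpt2")                   # placeholder tokenizer
model = SharedSpaceDecoderForCausalLM(SharedSpaceDecoderConfig())   # or .from_pretrained(...)
model.eval()

inputs = tokenizer("The quick brown fox", return_tensors="pt")
with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=20,
        do_sample=False,   # greedy decoding for a deterministic smoke test
        use_cache=False,   # the model has no KV cache yet, so keep caching off
    )
print(tokenizer.decode(out[0], skip_special_tokens=True))

Since the model has no KV cache yet, keeping use_cache=False (together with the no-op _reorder_cache) makes each decoding step a plain full-sequence forward pass, which matches what prepare_inputs_for_generation returns.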