update streamer generation
generation_utils.py (CHANGED: +24, −94)
```diff
@@ -35,9 +35,23 @@ from transformers.cache_utils import (
     DynamicCache,
 )
 from transformers.generation.utils import GenerationMixin
+from transformers import TextIteratorStreamer

 logger = logging.get_logger("Dimple."+__name__)

+class FullSequenceStreamer(TextIteratorStreamer):
+    def __init__(self, tokenizer, **kwargs):
+        super().__init__(tokenizer, **kwargs)
+
+    def put(self, value, stream_end=False):
+        # Assume full token_ids are passed in every time
+        decoded = self.tokenizer.batch_decode(value, **self.decode_kwargs)
+        self.text_queue.put(decoded)
+        if stream_end:
+            self.text_queue.put(self.stop_signal, timeout=self.timeout)
+
+    def end(self):
+        self.text_queue.put(self.stop_signal, timeout=self.timeout)

 def top_p_logits(logits, top_p=None):
     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
```
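The class above streams full-sequence snapshots: every `put` re-decodes the entire tensor it is handed, because the diffusion sampler revises masked positions across the whole answer window instead of appending tokens left to right. A minimal consumption sketch, assuming a loaded `model`/`tokenizer` pair, an `input_ids` prompt, and the `diffusion_generate` entry point changed in the hunks below; the threading wiring is illustrative, not part of this commit:

```python
from threading import Thread

# Assumed to exist already: `model` (a Dimple model), `tokenizer`, `input_ids`.
streamer = FullSequenceStreamer(tokenizer, skip_special_tokens=True)

thread = Thread(
    target=model.diffusion_generate,
    kwargs=dict(
        inputs=input_ids,
        max_new_tokens=64,
        steps=64,
        streamer=streamer,  # new argument added in this commit
    ),
)
thread.start()

# Each item is one snapshot per denoising step: a list of decoded strings,
# one entry per batch row, replacing (not extending) the previous snapshot.
for snapshot in streamer:
    print(snapshot[0])

thread.join()
```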
```diff
@@ -417,98 +431,9 @@ class DimpleGenerationMixin:
         self,
         inputs: Optional[torch.Tensor] = None,
         generation_config: Optional[DimpleGenerationConfig] = None,
-
+        streamer: Optional[FullSequenceStreamer] = None,
         **kwargs,
     ) -> Union[DimpleModelOutput, torch.LongTensor]:
-
-        """
-        Generates sequences using a diffusion-based masked token denoising algorithm.
-
-        This method replaces masked tokens in `inputs` through iterative refinement, based on a denoising process
-        inspired by diffusion models. It uses intermediate confidence-based sampling to progressively fill in masked tokens.
-
-        Args:
-            inputs (torch.Tensor):
-                Input token IDs.
-            generation_config (DimpleGenerationConfig, optional):
-                An instance of `DimpleGenerationConfig` containing generation hyperparameters. If not provided,
-                the default generation config from the model is used.
-            **kwargs:
-                Additional generation parameters that override those in `generation_config`.
-
-        Returns:
-            DimpleModelOutput if `return_dict_in_generate=True`, else `torch.LongTensor` of generated token IDs.
-
-        Key Parameters (either in `generation_config` or passed via kwargs):
-
-        - `max_new_tokens` (int, default=None):
-            The number of new tokens to generate or fill in. This sets the target length of the generated sequence beyond
-            the prompt. It is added to the input length to determine the total sequence length.
-
-        - `output_history` (bool, default=False):
-            If `True`, returns the full sequence history at each denoising step. This is useful for visualization or debugging
-            purposes. Only returned if `return_dict_in_generate=True`.
-
-        - `return_dict_in_generate` (bool, default=False):
-            If `True`, returns a `DimpleModelOutput` dictionary containing the final sequences and, optionally, the stepwise history.
-            If `False`, returns a plain tensor of token IDs.
-
-        - `steps` (int, default=512):
-            The number of denoising steps to perform during generation. Each step progressively refines the sequence by replacing
-            some masked tokens based on a sampling algorithm.
-
-        - `temperature` (float, default=0.0):
-            Sampling temperature applied to logits before softmax. Lower values make outputs more deterministic,
-            while higher values allow for more randomness in token selection.
-
-        - `top_p` (float, default=None):
-            Nucleus sampling parameter. If set, only the most probable tokens whose cumulative probability exceeds `top_p`
-            are considered during sampling.
-
-        - `alg` (str, default="origin"):
-            The denoising algorithm to use for determining which tokens to replace at each step. Options include:
-            - `"origin"`: random token selection based on a probability ratio.
-            - `"origin-ratio"`: like `"origin"` but uses continuous transfer ratio.
-            - `"autoregressive"`: always fills the left-most masked token.
-            - `"maskgit_plus"`: confidence-based selection similar to Google's MaskGIT.
-            - `"topk_margin"`: token selection based on margin (top1 - top2 probability).
-            - `"entropy"`: prioritizes tokens with high negative entropy (uncertainty).
-
-        - `use_cache` (bool, default=False):
-            Enables prefilling of past key values (past KV) for efficient decoding.
-
-        - `alg_p_threshold` (float, optional, default=None):
-            A confidence threshold used to determine whether a token is confident enough to be selected. If the token's
-            confidence is above this value, it is unmasked and committed to the sequence. Helps stabilize generation.
-
-        - `use_original_confidence` (bool, default=True):
-            If `True`, confidence scores are computed using the original (pre-sampled) probability distribution.
-            If `False`, uses the current step's softmaxed logits. Enables more stable token selection in some cases.
-
-        - `decoding_pipeline` (str, default="dim"):
-            The generation decoding pipeline to use:
-            - `"dim"`: Dimple decoding pipeline.
-            - `"dream"`: Original DREAM token selection pipeline.
-
-        Example:
-        ```python
-        output = model.diffusion_generate(
-            inputs=input_ids,
-            max_new_tokens=64,
-            output_history=True,
-            return_dict_in_generate=True,
-            steps=64,
-            temperature=0.2,
-            top_p=0.95,
-            alg="origin",
-            use_cache=True,
-            alg_p_threshold=0.95,
-            use_original_confidence=True,
-            decoding_pipeline="dim"
-        )
-        ```
-        """
-
         # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
         generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         generation_tokens_hook_func = model_kwargs.pop("generation_tokens_hook_func", lambda step, x, logits: x)
```
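The call pattern from the deleted docstring still applies; the only signature change is the optional `streamer` argument. A sketch of that same Example with the new argument attached (parameter values copied from the removed docstring; `tokenizer` is assumed to be in scope):

```python
output = model.diffusion_generate(
    inputs=input_ids,
    max_new_tokens=64,
    output_history=True,
    return_dict_in_generate=True,
    steps=64,
    temperature=0.2,
    top_p=0.95,
    alg="origin",
    use_cache=True,
    alg_p_threshold=0.95,
    use_original_confidence=True,
    decoding_pipeline="dim",
    streamer=FullSequenceStreamer(tokenizer, skip_special_tokens=True),  # new in this commit
)
```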
```diff
@@ -588,7 +513,7 @@ class DimpleGenerationMixin:
             generation_config=generation_config,
             generation_tokens_hook_func=generation_tokens_hook_func,
             generation_logits_hook_func=generation_logits_hook_func,
-
+            streamer = streamer,
             **model_kwargs,
         )
         return result
```
```diff
@@ -599,7 +524,7 @@ class DimpleGenerationMixin:
         generation_config: DimpleGenerationConfig,
         generation_tokens_hook_func,
         generation_logits_hook_func,
-
+        streamer: Optional[FullSequenceStreamer] = None,
         **model_kwargs,
     ) -> Union[DimpleModelOutput, torch.LongTensor]:
         # init values
```
```diff
@@ -618,7 +543,6 @@ class DimpleGenerationMixin:
         top_p = generation_config.top_p
         top_k = generation_config.top_k
         attention_mask = model_kwargs.get("attention_mask", None)
-        attention_mask_4d = model_kwargs.get("attention_mask_4d", None)

         histories = [] if (return_dict_in_generate and output_history) else None

```
```diff
@@ -784,9 +708,15 @@ class DimpleGenerationMixin:
             if histories is not None:
                 histories.append(input_ids.clone())

+            if streamer is not None:
+                streamer.put(input_ids[:, -answer_token_length+1:])
+
             if decoding_pipeline == 'dim' and torch.all(input_ids != mask_token_id):
                 break
-
+
+        if streamer is not None:
+            streamer.end()
+
         if return_dict_in_generate:
             return DimpleModelOutput(
                 sequences=input_ids,
```
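The sampler calls `put` with the answer region of `input_ids` on every denoising step and `end()` exactly once, either after the early `break` or when the step loop finishes. The queue behaviour can be checked in isolation, without running the model; the tokenizer below is a stand-in assumption (any Hugging Face tokenizer will do), and the two `put` calls merely simulate successive denoising steps:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer, assumption only
s = FullSequenceStreamer(tok, skip_special_tokens=True)

# Simulate two denoising steps: each put receives the full (partially denoised) ids.
s.put(tok(["the cat [MASK] on the mat"], return_tensors="pt").input_ids)
s.put(tok(["the cat sat on the mat"], return_tensors="pt").input_ids)
s.end()

for snapshot in s:
    print(snapshot)  # one list of decoded strings per simulated step
```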