Update modeling_qwen.py
modeling_qwen.py  +116 -83  (CHANGED)
@@ -56,9 +56,9 @@ QWen_PRETRAINED_MODEL_ARCHIVE_LIST = ["qwen-7b"]
 
 _ERROR_BAD_CHAT_FORMAT = """\
 We detect you are probably using the pretrained model (rather than chat model) for chatting, since the chat_format in generation_config is not "chatml".
-If you are directly using the model downloaded from Huggingface, please make sure you are using our "Qwen/Qwen-
+If you are directly using the model downloaded from Huggingface, please make sure you are using our "Qwen/Qwen-7B-Chat" Huggingface model (rather than "Qwen/Qwen-7B") when you call model.chat().
 我们检测到您可能在使用预训练模型(而非chat模型)进行多轮chat,因为您当前在generation_config指定的chat_format,并未设置为我们在对话中所支持的"chatml"格式。
-如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-
+如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-7B-Chat"模型(而非"Qwen/Qwen-7B"预训练模型)。
 """
 
 _SENTINEL = object()
@@ -108,14 +108,6 @@ class QWenAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
 
-        max_positions = config.max_position_embeddings
-        self.register_buffer(
-            "bias",
-            torch.tril(
-                torch.ones((max_positions, max_positions), dtype=torch.bool)
-            ).view(1, 1, max_positions, max_positions),
-            persistent=False,
-        )
         self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
         self.seq_length = config.seq_length
 
@@ -142,20 +134,6 @@ class QWenAttention(nn.Module):
         self.is_fp32 = not (config.bf16 or config.fp16)
         self.bf16 = config.bf16
 
-        if config.rotary_pct == 1.0:
-            self.rotary_ndims = None
-        else:
-            assert config.rotary_pct < 1
-            self.rotary_ndims = int(
-                self.hidden_size_per_attention_head * config.rotary_pct
-            )
-        dim = (
-            self.rotary_ndims
-            if self.rotary_ndims is not None
-            else self.hidden_size_per_attention_head
-        )
-        self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base)
-
         self.use_dynamic_ntk = config.use_dynamic_ntk
         self.use_logn_attn = config.use_logn_attn
 
@@ -164,11 +142,10 @@ class QWenAttention(nn.Module):
             for i in range(1, 32768)
         ]
         self.logn_tensor = torch.tensor(logn_list)[None, :, None, None]
-        self._ntk_cached = 1.0
 
         self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
 
-    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
+    def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
         attn_weights = torch.matmul(query, key.transpose(-1, -2))
 
         if self.scale_attn_weights:
@@ -206,7 +183,7 @@ class QWenAttention(nn.Module):
         return attn_output, attn_weights
 
     def _upcast_and_reordered_attn(
-        self, query, key, value, attention_mask=None, head_mask=None
+        self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None
     ):
         bsz, num_heads, q_seq_len, dk = query.size()
         _, _, k_seq_len, _ = key.size()
@@ -233,7 +210,7 @@ class QWenAttention(nn.Module):
         attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
 
         query_length, key_length = query.size(-2), key.size(-2)
-        causal_mask = self.bias[
+        causal_mask = registered_causal_mask[
             :, :, key_length - query_length : key_length, :key_length
         ]
         mask_value = torch.finfo(attn_weights.dtype).min
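For context, a minimal sketch (not part of this commit) of the kind of boolean lower-triangular mask that `registered_causal_mask` is expected to hold, and of how the slice in `_upcast_and_reordered_attn` picks out the rows for the current queries; the sizes are made up for illustration:

import torch

max_positions = 8                      # assumed maximum context length
query_length, key_length = 2, 6        # current queries vs. all cached keys

# Shaped (1, 1, max_positions, max_positions), as in the (now commented-out)
# register_buffer call in QWenModel.__init__ further down in this diff.
registered_causal_mask = torch.tril(
    torch.ones((max_positions, max_positions), dtype=torch.bool)
).view(1, 1, max_positions, max_positions)

# Same slicing as above: last query_length rows against the first key_length columns.
causal_mask = registered_causal_mask[
    :, :, key_length - query_length : key_length, :key_length
]
print(causal_mask.shape)  # torch.Size([1, 1, 2, 6])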
@@ -274,6 +251,8 @@ class QWenAttention(nn.Module):
     def forward(
         self,
         hidden_states: Optional[Tuple[torch.FloatTensor]],
+        rotary_pos_emb: Optional[List[torch.Tensor]] = None,
+        registered_causal_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor]] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         head_mask: Optional[torch.FloatTensor] = None,
@@ -284,43 +263,19 @@ class QWenAttention(nn.Module):
     ):
 
         mixed_x_layer = self.c_attn(hidden_states)
+
         query, key, value = mixed_x_layer.split(self.split_size, dim=2)
 
         query = self._split_heads(query, self.num_heads, self.head_dim)
         key = self._split_heads(key, self.num_heads, self.head_dim)
         value = self._split_heads(value, self.num_heads, self.head_dim)
 
-        kv_seq_len = hidden_states.size()[1]
-        if layer_past:
-            # layer past[0] shape: bs * seq_len * head_num * dim
-            kv_seq_len += layer_past[0].shape[1]
-        if (
-            self.use_dynamic_ntk
-            and kv_seq_len == hidden_states.size()[1]
-            and not self.training
-        ):
-            context_value = math.log(kv_seq_len / self.seq_length, 2) + 1
-            ntk_alpha = 2 ** math.ceil(context_value) - 1
-            ntk_alpha = max(ntk_alpha, 1)
-            self._ntk_cached = ntk_alpha
-        else:
-            ntk_alpha = self._ntk_cached
-        rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha).to(
-            hidden_states.device
-        )
-
-        if rotary_pos_emb is not None:
-            if isinstance(rotary_pos_emb, tuple):
-                rotary_pos_emb = rotary_pos_emb
-            else:
-                rotary_pos_emb = (rotary_pos_emb,) * 2
-
         if rotary_pos_emb is not None:
+            cur_len = query.shape[1]
+            rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb]
+            rotary_pos_emb = (rotary_pos_emb,) * 2
             q_pos_emb, k_pos_emb = rotary_pos_emb
             # Slice the pos emb for current inference
-            cur_len = query.shape[1]
-            q_pos_emb = q_pos_emb[:, -cur_len:, :, :]
-            k_pos_emb = k_pos_emb[:, -cur_len:, :, :]
             query = apply_rotary_pos_emb(query, q_pos_emb)
             key = apply_rotary_pos_emb(key, k_pos_emb)
 
@@ -346,13 +301,14 @@ class QWenAttention(nn.Module):
         key = key.permute(0, 2, 1, 3)
         value = value.permute(0, 2, 1, 3)
         attn_output, attn_weight = self._attn(
-            query, key, value, attention_mask, head_mask
+            query, key, value, registered_causal_mask, attention_mask, head_mask
         )
         context_layer = self._merge_heads(
             attn_output, self.num_heads, self.head_dim
         )
 
         attn_output = self.c_proj(context_layer)
+
         outputs = (attn_output, present)
         if output_attentions:
             outputs += (attn_weight,)
@@ -379,7 +335,6 @@ class QWenMLP(nn.Module):
         output = self.c_proj(intermediate_parallel)
         return output
 
-
 class QWenBlock(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -401,6 +356,8 @@ class QWenBlock(nn.Module):
     def forward(
         self,
         hidden_states: Optional[Tuple[torch.FloatTensor]],
+        rotary_pos_emb: Optional[List[torch.Tensor]] = None,
+        registered_causal_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor]] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         head_mask: Optional[torch.FloatTensor] = None,
@@ -413,6 +370,8 @@ class QWenBlock(nn.Module):
 
         attn_outputs = self.attn(
             layernorm_output,
+            rotary_pos_emb,
+            registered_causal_mask=registered_causal_mask,
             layer_past=layer_past,
             attention_mask=attention_mask,
             head_mask=head_mask,
@@ -488,14 +447,50 @@ class QWenModel(QWenPreTrainedModel):
         self.embed_dim = config.hidden_size
 
         self.gradient_checkpointing = False
+        self.use_dynamic_ntk = config.use_dynamic_ntk
+        self.seq_length = config.seq_length
 
         self.wte = nn.Embedding(self.vocab_size, self.embed_dim)
 
         self.drop = nn.Dropout(config.emb_dropout_prob)
+
+        if config.rotary_pct == 1.0:
+            self.rotary_ndims = None
+        else:
+            assert config.rotary_pct < 1
+            self.rotary_ndims = int(
+                config.kv_channels * config.rotary_pct
+            )
+        dim = (
+            self.rotary_ndims
+            if self.rotary_ndims is not None
+            else config.kv_channels
+        )
+        self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base)
+
+        self.use_flash_attn = config.use_flash_attn
+        self.is_fp32 = not (config.bf16 or config.fp16)
+        self.registered_causal_mask = None
+        # if (
+        #     self.use_flash_attn
+        #     and flash_attn_unpadded_func is not None
+        #     and not self.is_fp32
+        # ):
+        #     self.registered_causal_mask = None
+        # else:
+        #     max_positions = config.max_position_embeddings
+        #     self.register_buffer(
+        #         "registered_causal_mask",
+        #         torch.tril(
+        #             torch.ones((max_positions, max_positions), dtype=torch.bool)
+        #         ).view(1, 1, max_positions, max_positions),
+        #         persistent=False,
+        #     )
+
         self.h = nn.ModuleList(
             [
                 QWenBlock(
-                    config
+                    config
                 )
                 for i in range(config.num_hidden_layers)
             ]
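The block above moves the rotary embedding out of each attention layer and into a single `RotaryEmbedding` owned by `QWenModel`, sized from `config.kv_channels` instead of the per-head attention dimension. A small sketch of that arithmetic with assumed config values (128 channels and a `rotary_pct` of 1.0 are typical Qwen defaults, but are only illustrative here):

kv_channels = 128        # assumed per-head dimension from the config
rotary_pct = 1.0         # assumed rotary fraction

if rotary_pct == 1.0:
    rotary_ndims = None
else:
    assert rotary_pct < 1
    rotary_ndims = int(kv_channels * rotary_pct)

# The dimension handed to the single, model-level RotaryEmbedding.
dim = rotary_ndims if rotary_ndims is not None else kv_channels
print(dim)  # 128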
@@ -637,6 +632,25 @@ class QWenModel(QWenPreTrainedModel):
 
         hidden_states = inputs_embeds
 
+        kv_seq_len = hidden_states.size()[1]
+        if past_key_values[0] is not None:
+            # past key values[0][0] shape: bs * seq_len * head_num * dim
+            kv_seq_len += past_key_values[0][0].shape[1]
+        if (
+            self.use_dynamic_ntk
+            and kv_seq_len == hidden_states.size()[1]
+            and not self.training
+        ):
+            context_value = math.log(kv_seq_len / self.seq_length, 2) + 1
+            ntk_alpha = 2 ** math.ceil(context_value) - 1
+            ntk_alpha = max(ntk_alpha, 1)
+        else:
+            ntk_alpha = self.rotary_emb._ntk_alpha_cached
+
+        rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha)
+        for idx in range(len(rotary_pos_emb)):
+            rotary_pos_emb[idx] = rotary_pos_emb[idx].to(hidden_states.device)
+
         hidden_states = self.drop(hidden_states)
         if images is not None:
             for idx, (i, a, b) in enumerate(img_pos):
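A worked example of the dynamic-NTK scaling that now happens once per forward pass in `QWenModel` rather than inside every attention layer; the sequence lengths below are illustrative, and `seq_length` stands for the training context length from the config:

import math

seq_length = 2048    # assumed training context length (config.seq_length)
kv_seq_len = 8192    # illustrative prompt length at inference time

context_value = math.log(kv_seq_len / seq_length, 2) + 1    # log2(4) + 1 = 3.0
ntk_alpha = 2 ** math.ceil(context_value) - 1               # 2**3 - 1 = 7
ntk_alpha = max(ntk_alpha, 1)
print(ntk_alpha)  # 7 -> passed to self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha)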
@@ -670,6 +684,8 @@ class QWenModel(QWenPreTrainedModel):
                 outputs = torch.utils.checkpoint.checkpoint(
                     create_custom_forward(block),
                     hidden_states,
+                    rotary_pos_emb,
+                    self.registered_causal_mask,
                     None,
                     attention_mask,
                     head_mask[i],
@@ -680,6 +696,8 @@ class QWenModel(QWenPreTrainedModel):
                 outputs = block(
                     hidden_states,
                     layer_past=layer_past,
+                    rotary_pos_emb=rotary_pos_emb,
+                    registered_causal_mask=self.registered_causal_mask,
                     attention_mask=attention_mask,
                     head_mask=head_mask[i],
                     encoder_hidden_states=encoder_hidden_states,
@@ -690,10 +708,10 @@ class QWenModel(QWenPreTrainedModel):
 
             hidden_states = outputs[0]
             if use_cache is True:
-                presents = presents + (outputs[
+                presents = presents + (outputs[1],)
 
             if output_attentions:
-                all_self_attentions = all_self_attentions + (outputs[1],)
+                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
 
         hidden_states = self.ln_f(hidden_states)
         hidden_states = hidden_states.view(output_shape)
@@ -890,10 +908,13 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         append_history: bool = True,
         stream: Optional[bool] = _SENTINEL,
         stop_words_ids: Optional[List[List[int]]] = None,
+        generation_config: Optional[GenerationConfig] = None,
         **kwargs,
     ) -> Tuple[str, HistoryType]:
+        generation_config = generation_config if generation_config is not None else self.generation_config
+
         assert stream is _SENTINEL, _ERROR_STREAM_IN_CHAT
-        assert self.generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
+        assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
         if history is None:
             history = []
         if stop_words_ids is None:
@@ -901,24 +922,25 @@ class QWenLMHeadModel(QWenPreTrainedModel):
 
         max_window_size = kwargs.get('max_window_size', None)
         if max_window_size is None:
-            max_window_size = 6144
+            max_window_size = generation_config.max_window_size
         raw_text, context_tokens = make_context(
             tokenizer,
             query,
             history=history,
             system=system,
             max_window_size=max_window_size,
-            chat_format=self.generation_config.chat_format,
+            chat_format=generation_config.chat_format,
         )
 
         stop_words_ids.extend(get_stop_words_ids(
-            self.generation_config.chat_format, tokenizer
+            generation_config.chat_format, tokenizer
         ))
         input_ids = torch.tensor([context_tokens]).to(self.device)
         outputs = self.generate(
             input_ids,
-            stop_words_ids = stop_words_ids,
-            return_dict_in_generate = False,
+            stop_words_ids=stop_words_ids,
+            return_dict_in_generate=False,
+            generation_config=generation_config,
             **kwargs,
         )
 
@@ -927,7 +949,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             tokenizer,
             raw_text_len=len(raw_text),
             context_length=len(context_tokens),
-            chat_format=self.generation_config.chat_format,
+            chat_format=generation_config.chat_format,
             verbose=False,
             errors='replace'
         )
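With these changes `chat()` reads `chat_format`, `max_window_size` and the stop words from an explicit `GenerationConfig` argument, falling back to `self.generation_config` only when none is given. A minimal usage sketch; the checkpoint name and loading flags are assumptions for illustration, not part of this diff:

from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model_id = "Qwen/Qwen-VL-Chat"  # assumed checkpoint that uses this modeling file
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).eval()

# An explicitly passed config now takes precedence over model.generation_config.
gen_cfg = GenerationConfig.from_pretrained(model_id)

response, history = model.chat(
    tokenizer, "Describe this picture.", history=None, generation_config=gen_cfg
)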
@@ -945,9 +967,11 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         system: str = "You are a helpful assistant.",
         stop_words_ids: Optional[List[List[int]]] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
+        generation_config: Optional[GenerationConfig] = None,
         **kwargs,
     ) -> Generator[str, Any, None]:
-        assert self.generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
+        generation_config = generation_config if generation_config is not None else self.generation_config
+        assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
         if history is None:
             history = []
         if stop_words_ids is None:
@@ -955,23 +979,23 @@ class QWenLMHeadModel(QWenPreTrainedModel):
 
         max_window_size = kwargs.get('max_window_size', None)
         if max_window_size is None:
-            max_window_size = 6144
+            max_window_size = generation_config.max_window_size
         raw_text, context_tokens = make_context(
             tokenizer,
             query,
             history=history,
             system=system,
             max_window_size=max_window_size,
-            chat_format=self.generation_config.chat_format,
+            chat_format=generation_config.chat_format,
         )
 
         stop_words_ids.extend(get_stop_words_ids(
-            self.generation_config.chat_format, tokenizer
+            generation_config.chat_format, tokenizer
         ))
         if stop_words_ids is not None:
             stop_words_logits_processor = StopWordsLogitsProcessor(
                 stop_words_ids=stop_words_ids,
-                eos_token_id=self.generation_config.eos_token_id,
+                eos_token_id=generation_config.eos_token_id,
             )
         if logits_processor is None:
             logits_processor = LogitsProcessorList([stop_words_logits_processor])
@@ -982,7 +1006,8 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig
         self.__class__.generate_stream = NewGenerationMixin.generate
         self.__class__.sample_stream = NewGenerationMixin.sample_stream
-        stream_config = StreamGenerationConfig(**self.generation_config.to_dict(), do_stream=True)
+        stream_config = StreamGenerationConfig(**generation_config.to_dict(), do_stream=True)
+
         def stream_generator():
             outputs = []
             for token in self.generate_stream(
@@ -1011,17 +1036,19 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         streamer: Optional["BaseStreamer"] = None,
         **kwargs,
     ) -> Union[GenerateOutput, torch.LongTensor]:
+        generation_config = generation_config if generation_config is not None else self.generation_config
+
         # Process stop_words_ids.
         stop_words_ids = kwargs.pop("stop_words_ids", None)
         if stop_words_ids is None and generation_config is not None:
             stop_words_ids = getattr(generation_config, "stop_words_ids", None)
         if stop_words_ids is None:
-            stop_words_ids = getattr(self.generation_config, "stop_words_ids", None)
+            stop_words_ids = getattr(generation_config, "stop_words_ids", None)
 
         if stop_words_ids is not None:
             stop_words_logits_processor = StopWordsLogitsProcessor(
                 stop_words_ids=stop_words_ids,
-                eos_token_id=self.generation_config.eos_token_id,
+                eos_token_id=generation_config.eos_token_id,
             )
         if logits_processor is None:
             logits_processor = LogitsProcessorList([stop_words_logits_processor])
@@ -1069,14 +1096,19 @@ class RotaryEmbedding(torch.nn.Module):
             self._ntk_alpha_cached = ntk_alpha
             seq = torch.arange(self._seq_len_cached, device=self.inv_freq.device)
             freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq)
+
             emb = torch.cat((freqs, freqs), dim=-1)
             from einops import rearrange
 
-            self._rotary_pos_emb_cache = rearrange(emb, "n d -> 1 n 1 d")
+            emb = rearrange(emb, "n d -> 1 n 1 d")
+
+            cos, sin = emb.cos(), emb.sin()
+            self._rotary_pos_emb_cache = [cos, sin]
 
     def forward(self, max_seq_len, offset=0, ntk_alpha=1.0):
         self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha)
-        return self._rotary_pos_emb_cache[:, offset : offset + max_seq_len]
+        cos, sin = self._rotary_pos_emb_cache
+        return [cos[:, offset : offset + max_seq_len], sin[:, offset : offset + max_seq_len]]
 
 
 def _rotate_half(x):
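After this hunk `RotaryEmbedding.forward` returns a `[cos, sin]` list rather than a single interleaved tensor, which is why the attention code earlier in the diff slices both entries. A sketch of the expected shapes, with made-up sizes:

import torch

# Stand-ins for the cached tensors; real values come from update_rotary_pos_emb_cache.
seq_len, dim = 16, 128
cos = torch.randn(1, seq_len, 1, dim)   # "n d -> 1 n 1 d" layout
sin = torch.randn(1, seq_len, 1, dim)
rotary_pos_emb = [cos, sin]

# QWenAttention keeps only the positions for the current query block.
cur_len = 4
rotary_pos_emb = [t[:, -cur_len:, :, :] for t in rotary_pos_emb]
print([tuple(t.shape) for t in rotary_pos_emb])  # [(1, 4, 1, 128), (1, 4, 1, 128)]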
@@ -1088,19 +1120,20 @@ def _rotate_half(x):
 
 
 def apply_rotary_pos_emb(t, freqs):
+    cos, sin = freqs
     if apply_rotary_emb_func is not None and t.is_cuda:
         t_ = t.float()
-        freqs = freqs.squeeze(0).squeeze(1)
-        cos = freqs[:, : freqs.shape[-1] // 2].cos()
-        sin = freqs[:, : freqs.shape[-1] // 2].sin()
+        cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
+        sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
         output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
         return output
     else:
-        rot_dim = freqs.shape[-1]
+        rot_dim = freqs[0].shape[-1]
+        cos, sin = freqs
         t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
         t_ = t_.float()
         t_pass_ = t_pass_.float()
-        t_ = (t_ * freqs.cos()) + (_rotate_half(t_) * freqs.sin())
+        t_ = (t_ * cos) + (_rotate_half(t_) * sin)
         return torch.cat((t_, t_pass_), dim=-1).type_as(t)