upload custom code

Browse files

Files changed (5) hide show

attention.py +771 -0
blocks.py +120 -0
configuration.py +207 -0
modeling_mpt.py +837 -0
utils.py +17 -0

attention.py ADDED Viewed

	@@ -0,0 +1,771 @@

+# Adapted from https://github.com/mosaicml/llm-foundry
+# Classes changed: MultiheadAttention
+# Functions changed: scaled_multihead_dot_product_attention, build_alibi_bias, build_attn_bias
+# SPDX-License-Identifier: Apache-2.0
+"""Attention layers."""
+import math
+import warnings
+from typing import Optional
+import torch
+import torch.nn as nn
+from einops import rearrange
+from packaging import version
+from torch import nn
+from torch.linalg import vector_norm
+from llmfoundry.models.layers.norm import LPLayerNorm
+from torch.nn import functional as F
+def _reset_is_causal(num_query_tokens: int, num_key_tokens: int,
+                     original_is_causal: bool):
+    """Return the causal flag to hand to the flash/triton kernels.
+
+    The flag is returned untouched unless causality was requested and the
+    query and key lengths differ. In that case a single query token (as
+    happens when generating with a kv_cache) turns causality off, and any
+    other length mismatch raises ``NotImplementedError``.
+    """
+    # Causality not requested, or lengths agree: nothing to adjust.
+    if not original_is_causal or num_query_tokens == num_key_tokens:
+        return original_is_causal
+    # One query token against a longer key sequence: disable the mask,
+    # necessary for flash & triton for generation with kv_cache.
+    if num_query_tokens == 1:
+        return False
+    raise NotImplementedError(
+        'MPT does not support query and key with different number of tokens, unless number of query tokens is 1.'
+    )
+def scaled_multihead_dot_product_attention(
+    query,
+    key,
+    value,
+    n_heads,
+    past_key_value=None,
+    long_range_past_key_value=None,
+    softmax_scale=None,
+    attn_bias=None,
+    attn_bias_ae=None,
+    key_padding_mask=None,
+    is_causal=False,
+    dropout_p=0.0,
+    training=False,
+    needs_weights=False,
+    multiquery=False,
+    topk=None,
+    faiss_indexes=None,
+    n_layers=None,
+    current_layer=None,
+    mask_by_sim=False,
+    sim_threshold=0.0
+):
+    """Torch attention over local keys plus optional retrieved memories.
+
+    Args:
+        query, key, value: projections laid out as ``b s (h d)``; key and
+            value carry a single head when ``multiquery`` is set.
+        n_heads: number of query heads.
+        past_key_value: optional kv cache; keys are stored as
+            ``[b, h, d_head, s]`` and values as ``[b, h, s, d_head]``. The
+            new keys/values are appended and the pair is returned.
+        long_range_past_key_value: optional ``(k_cache, v_cache)`` external
+            memories. For every query token the ``topk`` entries with the
+            highest cosine similarity are selected and attended to.
+        softmax_scale: score multiplier, ``1 / sqrt(d_head)`` when None.
+        attn_bias: additive bias for the local scores; it is sliced to its
+            trailing ``(s_q, s_k)`` window before being added.
+        attn_bias_ae: additive bias for the external-memory scores.
+        key_padding_mask: boolean mask viewed as ``(b, 1, 1, s_k)``; False
+            positions are filled with the dtype minimum.
+        is_causal: apply a causal mask when more than one query token is
+            present.
+        dropout_p, training: forwarded to ``torch.nn.functional.dropout``.
+        needs_weights: also return the attention probabilities and the
+            indices of the retrieved memories.
+        multiquery: keys/values have one head shared by all query heads.
+        topk: number of memories retrieved per query token.
+        faiss_indexes: optional ``(kn_index, kv_index)`` pair; ``kn_index``
+            is searched with the normalized queries and ``kv_index``
+            reconstructs the concatenated key/value vectors.
+        n_layers, current_layer: used to build the one-hot head/layer tag
+            appended to the faiss queries.
+        mask_by_sim: mask retrieved memories whose similarity is not above
+            ``sim_threshold`` (manual memories only).
+        sim_threshold: similarity cut-off used with ``mask_by_sim``.
+
+    Returns:
+        ``(out, attn_weight, past_key_value, reshaped_idx)`` when
+        ``needs_weights`` is set, else ``(out, None, past_key_value, None)``.
+
+    Raises:
+        RuntimeError: if a bias cannot broadcast to the score shape.
+        NotImplementedError: if ``mask_by_sim`` is combined with faiss
+            retrieval, which provides no similarity mask.
+    """
+    q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
+    kv_n_heads = 1 if multiquery else n_heads
+    k = rearrange(key, 'b s (h d) -> b h d s', h=kv_n_heads)
+    v = rearrange(value, 'b s (h d) -> b h s d', h=kv_n_heads)
+    had_kv=False
+    if past_key_value is not None:
+        # attn_impl: flash & triton use kernels which expect input shape [b, s, h, d_head].
+        # kv_cache is therefore stored using that shape.
+        # attn_impl: torch stores the kv_cache in the ordering which is most advantageous
+        # for its attn computation ie
+        # keys are stored as tensors with shape [b, h, d_head, s] and
+        # values are stored as tensors with shape [b, h, s, d_head]
+        if len(past_key_value) != 0:
+            k = torch.cat([past_key_value[0], k], dim=3)
+            v = torch.cat([past_key_value[1], v], dim=2)
+            had_kv=True
+        past_key_value = (k, v)
+    b, h, s_q, d = q.shape
+    s_k = k.size(-1)
+    if softmax_scale is None:
+        softmax_scale = 1 / math.sqrt(d)
+    attn_weight = q.matmul(k) * softmax_scale
+    if attn_bias is not None:
+        # clamp to 0 necessary for torch 2.0 compile()
+        _s_q = max(0, attn_bias.size(2) - s_q)
+        _s_k = max(0, attn_bias.size(3) - s_k)
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
+        if (attn_bias.size(-1) != 1 and
+                attn_bias.size(-1) != s_k) or (attn_bias.size(-2) != 1 and
+                                               attn_bias.size(-2) != s_q):
+            raise RuntimeError(
+                f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.'
+            )
+        attn_weight = attn_weight + attn_bias
+    # Indices of the retrieved memories; only the manual-memory path sets it.
+    reshaped_idx = None
+    if long_range_past_key_value is not None or faiss_indexes is not None:
+        if long_range_past_key_value is not None: #manual external memories
+            k_cache, v_cache = long_range_past_key_value
+            s_cache = k_cache.size(-1)
+            k_cache = k_cache.to(k.device)
+            v_cache = v_cache.to(k.device)
+            # Cosine similarity: queries are normalized over d (last dim),
+            # cached keys over d as well (dim -2 in the [b, h, d, s] layout).
+            q_n = q/vector_norm(q, ord=2, dim=-1, keepdim=True)
+            k_n = k_cache/vector_norm(k_cache, ord=2, dim=-2, keepdim=True)
+            sim = q_n.matmul(k_n)
+            if s_cache<topk:
+                topk = s_cache #number of tokens in cache < topk
+            val, idx = torch.topk(sim, k=topk, dim=-1)
+            # Flatten the per-token selections into one memory axis.
+            reshaped_idx = idx.reshape(b, h, s_q * topk)
+            selected_k = k_cache.gather(dim=-1, index=reshaped_idx.unsqueeze(-2).expand(-1, -1, d, -1))
+            selected_v = v_cache.gather(dim=-2, index=reshaped_idx.unsqueeze(-1).expand(-1, -1, -1, d))
+            # True where a retrieved memory is NOT above the threshold.
+            sim_mask = rearrange(~ (val > sim_threshold).bool(), 'b h s i -> b h (s i)').unsqueeze(-2).expand(-1, -1, s_q, -1)
+            min_val = torch.finfo(selected_k.dtype).min
+        elif faiss_indexes is not None: #faiss indexes
+            if mask_by_sim:
+                # No similarity mask is built on this path; fail clearly
+                # instead of hitting an undefined name further down.
+                raise NotImplementedError(
+                    'mask_by_sim is not supported together with faiss_indexes.')
+            kn_index, kv_index = faiss_indexes
+            q_n = q/vector_norm(q, ord=2, dim=-1, keepdim=True)
+            # One-hot tag per (layer, head), scaled by 10 and appended to
+            # each normalized query before the index search.
+            one_hot_encodings = F.one_hot(torch.arange(0, n_heads*n_layers, device=q.device))*10
+            q_n = torch.concat([rearrange(q_n, 'b h s d -> b (h s) d', h=n_heads), one_hot_encodings[n_heads*current_layer:n_heads*(current_layer+1)].unsqueeze(0).repeat_interleave(repeats=q.size(-2), dim=-2)], dim=-1).squeeze()
+            _distances, neighbor_ids = kn_index.search(q_n.to('cpu').numpy(), k=topk)
+            # Reconstruct once; the first d columns are the key, the rest the value.
+            retrieved_kv = torch.tensor(kv_index.reconstruct_batch(neighbor_ids.flatten()))
+            # Split the flat axis by the actual head count, not a fixed 32.
+            selected_k=rearrange(retrieved_kv[:,:d], '(h s) d -> 1 h d s', h=n_heads).to(q.device)
+            selected_v=rearrange(retrieved_kv[:,d:], '(h s) d -> 1 h s d', h=n_heads).to(q.device)
+        s_k_ae = selected_k.size(-1)
+        s_k += s_k_ae
+        attn_weight_cache = q.matmul(selected_k) * softmax_scale
+        if mask_by_sim:
+            attn_weight_cache = attn_weight_cache.masked_fill(sim_mask, min_val)
+        if attn_bias_ae is not None:
+            # clamp to 0 necessary for torch 2.0 compile()
+            _s_q = max(0, attn_bias_ae.size(2) - s_q)
+            _s_k = max(0, attn_bias_ae.size(3) - s_k_ae)
+            attn_bias_ae = attn_bias_ae[:, :, _s_q:, _s_k:]
+            if (attn_bias_ae.size(-1) != 1 and
+                    attn_bias_ae.size(-1) != s_k_ae) or (attn_bias_ae.size(-2) != 1 and
+                                                attn_bias_ae.size(-2) != s_q):
+                raise RuntimeError(
+                    f'attn_bias (shape: {attn_bias_ae.shape}) is expected to broadcast to shape: {attn_weight_cache.shape}.'
+                )
+            attn_weight_cache = attn_weight_cache + attn_bias_ae
+        # Retrieved memories are placed in front of the local keys/values.
+        attn_weight = torch.cat([attn_weight_cache, attn_weight], dim=-1)
+        v = torch.cat([selected_v, v], dim=-2)
+    min_val = torch.finfo(q.dtype).min
+    if key_padding_mask is not None:
+        if attn_bias is not None:
+            warnings.warn(
+                'Propogating key_padding_mask to the attention module ' +\
+                'and applying it within the attention module can cause ' +\
+                'unneccessary computation/memory usage. Consider integrating ' +\
+                'into attn_bias once and passing that to each attention ' +\
+                'module instead.'
+            )
+        # NOTE(review): s_k includes the retrieved memories at this point, so
+        # the mask has to cover them as well -- confirm against the caller.
+        attn_weight = attn_weight.masked_fill(
+            ~key_padding_mask.view((b, 1, 1, s_k)), min_val)
+    def _create_active_externalism_mask(k, s_q, device):
+        # Row i leaves only its own k retrieved memories unmasked.
+        mask = torch.zeros(s_q, s_q * k, device=device, dtype=torch.bool)
+        for i in range(s_q):
+            mask[i, i * k : (i + 1) * k] = 1
+        return ~mask
+    if is_causal and (not q.size(2) == 1):
+        s = max(s_q, s_k)
+        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
+        causal_mask = causal_mask.tril()
+        causal_mask = causal_mask.to(torch.bool)
+        causal_mask = ~causal_mask
+        causal_mask = causal_mask[-s_q:, -s_k:]
+        if long_range_past_key_value is not None:
+            mask = _create_active_externalism_mask(k=topk,s_q=s_q, device=attn_weight.device)
+            # Width of the local (non-memory) part of the key axis.
+            s=s_q
+            if had_kv:
+                s += (past_key_value[0][0].size(-1) -s_q)
+            causal_mask = torch.cat([mask, causal_mask[:,-s:]], dim=1)
+        attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k),
+                                              min_val)
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    if dropout_p:
+        attn_weight = torch.nn.functional.dropout(attn_weight,
+                                                  p=dropout_p,
+                                                  training=training,
+                                                  inplace=True)
+    out = attn_weight.to(v.dtype).matmul(v)
+    out = rearrange(out, 'b h s d -> b s (h d)')
+    if needs_weights:
+        return out, attn_weight, past_key_value, reshaped_idx
+    return out, None, past_key_value, None
+def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
+    """Validate tensors before they are handed to the flash/triton kernels.
+
+    Every tensor must have a dtype listed in ``valid_dtypes`` and must be a
+    CUDA tensor; the first violation raises ``TypeError``.
+    """
+    for tensor in tensors:
+        dtype_ok = tensor.dtype in valid_dtypes
+        if not dtype_ok:
+            raise TypeError(f'{tensor.dtype=} must be in {valid_dtypes=}.')
+        if tensor.is_cuda:
+            continue
+        raise TypeError(f'Inputs must be cuda tensors ({tensor.is_cuda=}).')
+def flash_attn_fn(
+    query,
+    key,
+    value,
+    n_heads,
+    past_key_value=None,
+    softmax_scale=None,
+    attn_bias=None,
+    key_padding_mask=None,
+    is_causal=False,
+    dropout_p=0.0,
+    training=False,
+    needs_weights=False,
+    multiquery=False,
+):
+    """Attention through the flash-attn unpadded kernel.
+
+    Args:
+        query, key, value: projections laid out as ``b s (h d)``; key and
+            value carry a single head when ``multiquery`` is set.
+        n_heads: number of query heads.
+        past_key_value: optional kv cache, concatenated with the new
+            key/value along dim 1 and returned.
+        softmax_scale: forwarded to the kernel.
+        attn_bias: not supported; any non-None value raises.
+        key_padding_mask: boolean mask used to unpad the inputs; an all-True
+            mask is used when None.
+        is_causal: requested causality, adjusted by ``_reset_is_causal``.
+        dropout_p, training: dropout is only applied while training.
+        needs_weights: forwarded as ``return_attn_probs``.
+        multiquery: expand the single key/value head to ``n_heads``.
+
+    Returns:
+        ``(output, None, past_key_value)``.
+
+    Raises:
+        RuntimeError: if flash-attn cannot be imported.
+        TypeError: if inputs are not fp16/bf16 CUDA tensors.
+        NotImplementedError: if ``attn_bias`` is given.
+    """
+    try:
+        from flash_attn import bert_padding, flash_attn_interface  # type: ignore # yapf: disable # isort: skip
+    except Exception as err:
+        # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
+        raise RuntimeError('Please install flash-attn==1.0.3.post0') from err
+    check_valid_inputs(query, key, value)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            key = torch.cat([past_key_value[0], key], dim=1)
+            value = torch.cat([past_key_value[1], value], dim=1)
+        past_key_value = (key, value)
+    if attn_bias is not None:
+        raise NotImplementedError('attn_bias not implemented for flash attn.')
+    batch_size, seqlen = query.shape[:2]
+    if key_padding_mask is None:
+        key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
+    # Queries line up with the trailing positions of the key mask.
+    query_padding_mask = key_padding_mask[:, -query.size(1):]
+    query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input(
+        query, query_padding_mask)
+    query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
+    key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input(
+        key, key_padding_mask)
+    key_unpad = rearrange(key_unpad,
+                          'nnz (h d) -> nnz h d',
+                          h=1 if multiquery else n_heads)
+    value_unpad, _, _, _ = bert_padding.unpad_input(value, key_padding_mask)
+    value_unpad = rearrange(value_unpad,
+                            'nnz (h d) -> nnz h d',
+                            h=1 if multiquery else n_heads)
+    if multiquery:
+        # Expanding a tensor does not allocate new memory, but only creates a new
+        # view on the existing tensor where a dimension of size one is expanded
+        # to a larger size by setting the stride to 0.
+        # - pytorch docs
+        #
+        # hopefully the kernels can utilize this and we're not just wasting BW here
+        key_unpad = key_unpad.expand(key_unpad.size(0), n_heads,
+                                     key_unpad.size(-1))
+        value_unpad = value_unpad.expand(value_unpad.size(0), n_heads,
+                                         value_unpad.size(-1))
+    dropout_p = dropout_p if training else 0.0
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    output_unpad = flash_attn_interface.flash_attn_unpadded_func(
+        query_unpad,
+        key_unpad,
+        value_unpad,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        dropout_p,
+        softmax_scale=softmax_scale,
+        causal=reset_is_causal,
+        return_attn_probs=needs_weights)
+    # Scatter the unpadded rows back into a (batch, seqlen, ...) tensor.
+    output = bert_padding.pad_input(
+        rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size,
+        seqlen)
+    return output, None, past_key_value
+def triton_flash_attn_fn(
+    query,
+    key,
+    value,
+    n_heads,
+    past_key_value=None,
+    softmax_scale=None,
+    attn_bias=None,
+    key_padding_mask=None,
+    is_causal=False,
+    dropout_p=0.0,
+    training=False,
+    needs_weights=False,
+    multiquery=False,
+):
+    """Attention through the triton flash-attention kernel.
+
+    Args:
+        query, key, value: projections laid out as ``b s (h d)``; key and
+            value carry a single head when ``multiquery`` is set.
+        n_heads: number of query heads.
+        past_key_value: optional kv cache, concatenated with the new
+            key/value along dim 1 and returned.
+        softmax_scale: forwarded to the kernel.
+        attn_bias: optional additive bias, sliced to its trailing
+            ``(s_q, s_k)`` window.
+        key_padding_mask: optional boolean mask; it is folded into
+            ``attn_bias`` by filling False positions with the dtype minimum.
+        is_causal: requested causality, adjusted by ``_reset_is_causal``.
+        dropout_p: must be falsy; dropout is not implemented here.
+        training: accepted for signature parity; not read in this function.
+        needs_weights: must be falsy; weights cannot be returned.
+        multiquery: expand the single key/value head to ``n_heads``.
+
+    Returns:
+        ``(output, None, past_key_value)``.
+
+    Raises:
+        RuntimeError: if no triton flash-attention implementation imports.
+        TypeError: if inputs are not fp16/bf16 CUDA tensors.
+        NotImplementedError: if dropout or attention weights are requested.
+    """
+    try:
+        from llmfoundry.models.layers.flash_attn_triton import flash_attn_func
+    except Exception:
+        # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
+        _installed = False
+        if version.parse(torch.__version__) < version.parse('2.0.0'):
+            _installed = True
+            # if torch1.13.1 revert to using triton flash attn from HazyResearch
+            # with flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202
+            try:
+                from flash_attn.flash_attn_triton import flash_attn_func
+            except Exception:
+                _installed = False
+        if not _installed:
+            # installing triton-pre-mlir works for both torch1.13.1 and torch2.0+
+            # default recommendation is to install this variant
+            raise RuntimeError(
+                'Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU '
+                'and `pip install .[gpu]` if installing from llm-foundry source or '
+                '`pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` '
+                'if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). '
+                'Note: (1) requires you have CMake and PyTorch already installed.'
+            )
+    check_valid_inputs(query, key, value)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            key = torch.cat([past_key_value[0], key], dim=1)
+            value = torch.cat([past_key_value[1], value], dim=1)
+        past_key_value = (key, value)
+    if attn_bias is not None:
+        # clamp to 0 necessary for torch 2.0 compile()
+        _s_q = max(0, attn_bias.size(2) - query.size(1))
+        _s_k = max(0, attn_bias.size(3) - key.size(1))
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
+    if dropout_p:
+        raise NotImplementedError(
+            'Dropout not implemented for attn_impl: triton.')
+    if needs_weights:
+        raise NotImplementedError(
+            'attn_impl: triton cannot return attn weights.')
+    if key_padding_mask is not None:
+        warnings.warn(
+            'Propagating key_padding_mask to the attention module ' +\
+            'and applying it within the attention module can cause ' +\
+            'unnecessary computation/memory usage. Consider integrating ' +\
+            'into attn_bias once and passing that to each attention ' +\
+            'module instead.'
+        )
+        b_size, s_k = key_padding_mask.shape[:2]
+        if attn_bias is None:
+            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
+        # Padding becomes a large negative bias on the masked key positions.
+        attn_bias = attn_bias.masked_fill(
+            ~key_padding_mask.view((b_size, 1, 1, s_k)),
+            torch.finfo(query.dtype).min)
+    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
+    key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
+    value = rearrange(value,
+                      'b s (h d) -> b s h d',
+                      h=1 if multiquery else n_heads)
+    if multiquery:
+        # Expanding a tensor does not allocate new memory, but only creates a new
+        # view on the existing tensor where a dimension of size one is expanded
+        # to a larger size by setting the stride to 0.
+        # - pytorch docs
+        #
+        # hopefully the kernels can utilize this and we're not just wasting BW here
+        key = key.expand(*key.shape[:2], n_heads, key.size(-1))
+        value = value.expand(*value.shape[:2], n_heads, value.size(-1))
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal,
+                                  softmax_scale)
+    # Merge the head and head-dim axes back into the model dimension.
+    output = attn_output.view(*attn_output.shape[:2], -1)
+    return output, None, past_key_value
+class MultiheadAttention(nn.Module):
+    """Multi-head self attention.
+
+    Using torch or triton attention implementation enables user to also use
+    additive bias. External memories (``long_range_past_key_value`` /
+    ``faiss_indexes``) are only handled by the torch implementation.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int,
+        attn_impl: str = 'triton',
+        clip_qkv: Optional[float] = None,
+        qk_ln: bool = False,
+        softmax_scale: Optional[float] = None,
+        attn_pdrop: float = 0.0,
+        low_precision_layernorm: bool = False,
+        verbose: int = 0,
+        device: Optional[str] = None,
+    ):
+        """Build the fused QKV projection, optional q/k norms and out_proj.
+
+        Raises:
+            ValueError: if ``attn_impl`` is not 'flash', 'triton' or 'torch'.
+        """
+        super().__init__()
+        self.attn_impl = attn_impl
+        self.clip_qkv = clip_qkv
+        self.qk_ln = qk_ln
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.softmax_scale = softmax_scale
+        if self.softmax_scale is None:
+            self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
+        self.attn_dropout_p = attn_pdrop
+        self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
+        # for param init fn; enables shape based init of fused layers
+        fuse_splits = (d_model, 2 * d_model)
+        self.Wqkv._fused = (0, fuse_splits)  # type: ignore
+        if self.qk_ln:
+            layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
+            self.q_ln = layernorm_class(self.d_model, device=device)
+            self.k_ln = layernorm_class(self.d_model, device=device)
+        if self.attn_impl == 'flash':
+            self.attn_fn = flash_attn_fn
+        elif self.attn_impl == 'triton':
+            self.attn_fn = triton_flash_attn_fn
+            if verbose:
+                warnings.warn(
+                    'While `attn_impl: triton` can be faster than `attn_impl: flash` ' +\
+                    'it uses more memory. When training larger models this can trigger '  +\
+                    'alloc retries which hurts performance. If encountered, we recommend ' +\
+                    'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.'
+                )
+        elif self.attn_impl == 'torch':
+            self.attn_fn = scaled_multihead_dot_product_attention
+            if torch.cuda.is_available() and verbose:
+                warnings.warn(
+                    'Using `attn_impl: torch`. If your model does not use `alibi` or ' +\
+                    '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' +\
+                    'we recommend using `attn_impl: triton`.'
+                )
+        else:
+            raise ValueError(f'{attn_impl=} is an invalid setting.')
+        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
+        self.out_proj._is_residual = True  # type: ignore
+    def forward(
+        self,
+        x,
+        past_key_value=None,
+        long_range_past_key_value=None,
+        attn_bias=None,
+        attn_bias_ae=None,
+        attention_mask=None,
+        is_causal=True,
+        needs_weights=False,
+        topk=None,
+        faiss_indexes=None,
+        n_layers=None,
+        current_layer=None,
+        mask_by_sim=None,
+        sim_threshold=None
+    ):
+        """Project ``x`` to q/k/v, run the attention kernel, project back.
+
+        Returns:
+            ``(output, attn_weights, past_key_value, reshaped_idx)``;
+            ``reshaped_idx`` is None for kernels that do not retrieve
+            external memories.
+
+        Raises:
+            NotImplementedError: if external memories are requested with an
+                ``attn_impl`` other than 'torch'.
+        """
+        qkv = self.Wqkv(x)
+        if self.clip_qkv:
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        query, key, value = qkv.chunk(3, dim=2)
+        key_padding_mask = attention_mask
+        if self.qk_ln:
+            # Applying layernorm to qk
+            dtype = query.dtype
+            query = self.q_ln(query).to(dtype)
+            key = self.k_ln(key).to(dtype)
+        # Arguments understood by every attention implementation.
+        attn_kwargs = dict(
+            past_key_value=past_key_value,
+            softmax_scale=self.softmax_scale,
+            attn_bias=attn_bias,
+            key_padding_mask=key_padding_mask,
+            is_causal=is_causal,
+            dropout_p=self.attn_dropout_p,
+            training=self.training,
+            needs_weights=needs_weights,
+        )
+        if self.attn_impl == 'torch':
+            # Only the torch kernel accepts the external-memory arguments.
+            attn_kwargs.update(
+                long_range_past_key_value=long_range_past_key_value,
+                attn_bias_ae=attn_bias_ae,
+                topk=topk,
+                faiss_indexes=faiss_indexes,
+                n_layers=n_layers,
+                current_layer=current_layer,
+                mask_by_sim=mask_by_sim,
+                # None cannot be compared against the similarities; fall
+                # back to the kernel's own default threshold.
+                sim_threshold=0.0 if sim_threshold is None else sim_threshold,
+            )
+        elif long_range_past_key_value is not None or faiss_indexes is not None:
+            raise NotImplementedError(
+                'External memories are only supported with attn_impl: torch.')
+        outputs = self.attn_fn(query, key, value, self.n_heads, **attn_kwargs)
+        # flash/triton return three values, the torch kernel returns four.
+        context, attn_weights, past_key_value = outputs[:3]
+        reshaped_idx = outputs[3] if len(outputs) > 3 else None
+        return self.out_proj(context), attn_weights, past_key_value, reshaped_idx
+class MultiQueryAttention(nn.Module):
+    """Multi-Query self attention.
+
+    Using torch or triton attention implementation enables user to also use
+    additive bias. Keys and values use a single head of size ``head_dim``
+    that is shared by all query heads.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int,
+        attn_impl: str = 'triton',
+        clip_qkv: Optional[float] = None,
+        qk_ln: bool = False,
+        softmax_scale: Optional[float] = None,
+        attn_pdrop: float = 0.0,
+        low_precision_layernorm: bool = False,
+        verbose: int = 0,
+        device: Optional[str] = None,
+    ):
+        """Build the fused QKV projection, optional q/k norms and out_proj.
+
+        Raises:
+            ValueError: if ``attn_impl`` is not 'flash', 'triton' or 'torch'.
+        """
+        super().__init__()
+        self.attn_impl = attn_impl
+        self.clip_qkv = clip_qkv
+        self.qk_ln = qk_ln
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.head_dim = d_model // n_heads
+        self.softmax_scale = softmax_scale
+        if self.softmax_scale is None:
+            self.softmax_scale = 1 / math.sqrt(self.head_dim)
+        self.attn_dropout_p = attn_pdrop
+        # NOTE: if we ever want to make attn TensorParallel, I'm pretty sure we'll
+        # want to split Wqkv into Wq and Wkv where Wq can be TensorParallel but
+        # Wkv shouldn't be TensorParallel
+        # - vchiley
+        self.Wqkv = nn.Linear(
+            d_model,
+            d_model + 2 * self.head_dim,
+            device=device,
+        )
+        # for param init fn; enables shape based init of fused layers
+        fuse_splits = (d_model, d_model + self.head_dim)
+        self.Wqkv._fused = (0, fuse_splits)  # type: ignore
+        if self.qk_ln:
+            layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
+            self.q_ln = layernorm_class(d_model, device=device)
+            self.k_ln = layernorm_class(self.head_dim, device=device)
+        if self.attn_impl == 'flash':
+            self.attn_fn = flash_attn_fn
+        elif self.attn_impl == 'triton':
+            self.attn_fn = triton_flash_attn_fn
+            if verbose:
+                warnings.warn(
+                    'While `attn_impl: triton` can be faster than `attn_impl: flash` ' +\
+                    'it uses more memory. When training larger models this can trigger '  +\
+                    'alloc retries which hurts performance. If encountered, we recommend ' +\
+                    'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.'
+                )
+        elif self.attn_impl == 'torch':
+            self.attn_fn = scaled_multihead_dot_product_attention
+            if torch.cuda.is_available() and verbose:
+                warnings.warn(
+                    'Using `attn_impl: torch`. If your model does not use `alibi` or ' +\
+                    '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' +\
+                    'we recommend using `attn_impl: triton`.'
+                )
+        else:
+            raise ValueError(f'{attn_impl=} is an invalid setting.')
+        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
+        self.out_proj._is_residual = True  # type: ignore
+    def forward(
+        self,
+        x,
+        past_key_value=None,
+        attn_bias=None,
+        attention_mask=None,
+        is_causal=True,
+        needs_weights=False,
+    ):
+        """Project ``x`` to q/k/v, run the attention kernel, project back.
+
+        Returns:
+            ``(output, attn_weights, past_key_value)``.
+        """
+        qkv = self.Wqkv(x)
+        if self.clip_qkv:
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        # Queries keep the full model width; key and value get one head each.
+        query, key, value = qkv.split(
+            [self.d_model, self.head_dim, self.head_dim], dim=2)
+        key_padding_mask = attention_mask
+        if self.qk_ln:
+            # Applying layernorm to qk
+            dtype = query.dtype
+            query = self.q_ln(query).to(dtype)
+            key = self.k_ln(key).to(dtype)
+        outputs = self.attn_fn(
+            query,
+            key,
+            value,
+            self.n_heads,
+            past_key_value=past_key_value,
+            softmax_scale=self.softmax_scale,
+            attn_bias=attn_bias,
+            key_padding_mask=key_padding_mask,
+            is_causal=is_causal,
+            dropout_p=self.attn_dropout_p,
+            training=self.training,
+            needs_weights=needs_weights,
+            multiquery=True,
+        )
+        # The torch kernel returns a fourth element (retrieved-memory
+        # indices) that this layer does not use; keep the first three.
+        context, attn_weights, past_key_value = outputs[:3]
+        return self.out_proj(context), attn_weights, past_key_value
+def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal,
+                    use_sequence_id):
+    """Return the shape of the attention bias to allocate, or None.
+
+    'flash' never uses a bias. For 'torch' and 'triton' the bias is per-head
+    when ``alibi`` is set, and square in the sequence dimensions whenever
+    ``prefix_lm`` or ``use_sequence_id`` is set (or, with alibi, when the
+    model is not causal). Any other ``attn_impl`` raises ``ValueError``.
+    """
+    if attn_impl == 'flash':
+        return None
+    if attn_impl not in ('torch', 'triton'):
+        raise ValueError(f'{attn_impl=} is an invalid setting.')
+    needs_square_bias = prefix_lm or use_sequence_id
+    if alibi:
+        # A causal alibi bias only needs one row; it broadcasts over queries.
+        rows = seq_len if (needs_square_bias or not causal) else 1
+        return (1, n_heads, rows, seq_len)
+    if needs_square_bias:
+        return (1, 1, seq_len, seq_len)
+    return None
+def build_attn_bias(
+    attn_impl,
+    n_heads,
+    seq_len,
+    attn_bias=None,
+    causal=False,
+    alibi=False,
+    alibi_bias_max=8,
+    for_ae=False,
+    topk=0,
+    device=None,
+    dtype=None
+):
+    """Build (or extend) the additive attention bias.
+
+    Args:
+        attn_impl: 'flash' returns None; 'torch' and 'triton' build a bias.
+        n_heads, seq_len, alibi_bias_max: forwarded to ``build_alibi_bias``.
+        attn_bias: optional existing bias the alibi term is added to.
+        causal: a non-causal model gets the full square alibi bias.
+        alibi: when False ``attn_bias`` is returned unchanged.
+        for_ae, topk: forwarded to ``build_alibi_bias`` to build the bias
+            for retrieved external memories.
+        device, dtype: placement of the generated alibi bias.
+
+    Returns:
+        The resulting bias tensor, or None. Any ``attn_impl`` other than the
+        three listed falls through and returns None.
+    """
+    if attn_impl == 'flash':
+        return None
+    elif attn_impl in ['torch', 'triton']:
+        if alibi:
+            # Build the alibi term once with the requested device/dtype so
+            # both branches below honour them.
+            alibi_bias = build_alibi_bias(
+                n_heads,
+                seq_len,
+                full=not causal,
+                alibi_bias_max=alibi_bias_max,
+                device=device,
+                dtype=dtype,
+                for_ae=for_ae,
+                topk=topk
+            )
+            if attn_bias is not None:
+                attn_bias = attn_bias.add(alibi_bias)
+            else:
+                attn_bias = alibi_bias
+        return attn_bias
+def gen_slopes(n_heads, alibi_bias_max=8, device=None):
+    """Return the per-head alibi slopes shaped ``(1, n_heads, 1, 1)``.
+
+    Slopes are ``2 ** -(i * alibi_bias_max / N)`` for ``i = 1..N`` where
+    ``N`` is ``n_heads`` rounded up to a power of two.
+    """
+    padded_heads = 2**math.ceil(math.log2(n_heads))
+    exponents = torch.arange(1, padded_heads + 1, dtype=torch.float32, device=device)
+    exponents = exponents.mul(alibi_bias_max / padded_heads)
+    slopes = (1. / torch.pow(2, exponents))
+    if padded_heads != n_heads:
+        # if n_heads is not a power of two,
+        # Huggingface and FasterTransformer calculate slopes normally,
+        # then return this strided concatenation of slopes
+        odd_positions = slopes[1::2]
+        even_positions = slopes[::2]
+        slopes = torch.concat([odd_positions, even_positions])[:n_heads]
+    return slopes.view(1, n_heads, 1, 1)
+def build_alibi_bias(
+    n_heads,
+    seq_len,
+    full=False,
+    alibi_bias_max=8,
+    device=None,
+    dtype=None,
+    for_ae=False,
+    topk=0
+):
+    """Build the alibi bias scaled by the per-head slopes.
+
+    Without ``for_ae`` the positions run from ``1 - seq_len`` to ``0``. With
+    ``for_ae`` every one of the ``seq_len * topk`` columns holds
+    ``-seq_len``. ``full`` expands the bias to one row per query position.
+    The result is cast to ``dtype``.
+    """
+    if for_ae:
+        n_memory_cols = seq_len * topk
+        distance = torch.tensor(-seq_len, dtype=torch.int32, device=device)
+        alibi_bias = distance.repeat(n_memory_cols).view(1, 1, 1, n_memory_cols)
+    else:
+        positions = torch.arange(1 - seq_len, 1, dtype=torch.int32,
+                                 device=device)
+        alibi_bias = positions.view(1, 1, 1, seq_len)
+    if full:
+        # generate 1 x Heads x SeqLen x SeqLen alibi bias mask
+        # otherwise the mask is 1 x Heads x 1 x SeqLen (which is broadcast to the appropriate size)
+        row_offsets = torch.arange(
+            1 - seq_len, 1, dtype=torch.int32, device=device).view(
+                1, 1, seq_len, 1)
+        alibi_bias = (alibi_bias - row_offsets).abs().mul(-1)
+    slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
+    scaled_bias = alibi_bias * slopes
+    return scaled_bias.to(dtype=dtype)
+# Maps an attention-type name to the attention layer class implementing it.
+ATTN_CLASS_REGISTRY = {
+    'multihead_attention': MultiheadAttention,
+    'multiquery_attention': MultiQueryAttention,
+}

blocks.py ADDED Viewed

	@@ -0,0 +1,120 @@

+# Adapted from https://github.com/mosaicml/llm-foundry
+# Classes changed: MPTBlock
+# SPDX-License-Identifier: Apache-2.0
+"""GPT Blocks used for the GPT Model."""
+from typing import Dict, Optional, Tuple
+import torch
+import torch.nn as nn
+from .attention import ATTN_CLASS_REGISTRY
+from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
+class MPTMLP(nn.Module):
+    """Feed-forward block: up-projection, exact GELU, down-projection."""
+    def __init__(self,
+                 d_model: int,
+                 expansion_ratio: int,
+                 device: Optional[str] = None):
+        super().__init__()
+        hidden_dim = expansion_ratio * d_model
+        self.up_proj = nn.Linear(d_model, hidden_dim, device=device)
+        self.act = nn.GELU(approximate='none')
+        self.down_proj = nn.Linear(hidden_dim, d_model, device=device)
+        # Tag the projection that feeds the residual stream.
+        self.down_proj._is_residual = True  # type: ignore
+    def forward(self, x):
+        """Apply up_proj, the activation and down_proj in sequence."""
+        hidden = self.up_proj(x)
+        hidden = self.act(hidden)
+        return self.down_proj(hidden)
+class MPTBlock(nn.Module):
+    """Transformer block: normed attention and normed MLP, each added back
+    to the input through a residual connection with dropout."""
+    def __init__(
+            self,
+            d_model: int,
+            n_heads: int,
+            expansion_ratio: int,
+            attn_config: Optional[Dict] = None,
+            resid_pdrop: float = 0.0,
+            norm_type: str = 'low_precision_layernorm',
+            verbose: int = 0,
+            device: Optional[str] = None,
+            **kwargs):
+        """Build the norms, the attention layer, the MLP and the dropouts.
+
+        ``attn_config`` selects the attention class and its settings; when
+        None the block's default configuration is used.
+        """
+        del kwargs  # unused, just to capture any extra args from the config
+        super().__init__()
+        if attn_config is None:
+            # Fresh dict per instance instead of a shared mutable default.
+            attn_config = {
+                'attn_type': 'multihead_attention',
+                'attn_pdrop': 0.0,
+                'attn_impl': 'triton',
+                'qk_ln': False,
+                'clip_qkv': None,
+                'softmax_scale': None,
+                'prefix_lm': False,
+                'attn_uses_sequence_id': False,
+                'alibi': False,
+                'alibi_bias_max': 8,
+            }
+        norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
+        attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
+        self.norm_1 = norm_class(d_model, device=device)
+        self.attn = attn_class(
+            attn_impl=attn_config['attn_impl'],
+            clip_qkv=attn_config['clip_qkv'],
+            qk_ln=attn_config['qk_ln'],
+            softmax_scale=attn_config['softmax_scale'],
+            attn_pdrop=attn_config['attn_pdrop'],
+            d_model=d_model,
+            n_heads=n_heads,
+            verbose=verbose,
+            device=device,
+        )
+        self.norm_2 = norm_class(d_model, device=device)
+        self.ffn = MPTMLP(
+            d_model=d_model,
+            expansion_ratio=expansion_ratio,
+            device=device,
+        )
+        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
+        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
+    def forward(
+        self,
+        x: torch.Tensor,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        long_range_past_key_value:Optional[Tuple[torch.Tensor]] = None,
+        attn_bias: Optional[torch.Tensor] = None,
+        attn_bias_ae: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        is_causal: bool = True,
+        topk:Optional[int]=None,
+        needs_weights:Optional[bool]=None,
+        faiss_indexes:Optional[Tuple]=None,
+        n_layers:Optional[int]=None,
+        current_layer:Optional[int]=None,
+        mask_by_sim:bool=False,
+        sim_threshold:Optional[float]=None
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
+               Optional[Tuple[torch.Tensor]], Optional[torch.Tensor]]:
+        """Run attention and the MLP with residual connections.
+
+        All keyword arguments besides ``x`` are forwarded unchanged to the
+        attention layer.
+
+        Returns:
+            ``(x, attn_weights, past_key_value, reshaped_idx)`` where the
+            last three values are whatever the attention layer returned.
+        """
+        a = self.norm_1(x)
+        b, attn_weights, past_key_value, reshaped_idx = self.attn(
+            a,
+            past_key_value=past_key_value,
+            long_range_past_key_value=long_range_past_key_value,
+            attn_bias=attn_bias,
+            attn_bias_ae=attn_bias_ae,
+            attention_mask=attention_mask,
+            is_causal=is_causal,
+            topk=topk,
+            needs_weights=needs_weights,
+            faiss_indexes=faiss_indexes,
+            n_layers=n_layers,
+            current_layer=current_layer,
+            mask_by_sim=mask_by_sim,
+            sim_threshold=sim_threshold
+        )
+        # Residual connection around attention.
+        x = x + self.resid_attn_dropout(b)
+        m = self.norm_2(x)
+        n = self.ffn(m)
+        # Residual connection around the MLP.
+        x = x + self.resid_ffn_dropout(n)
+        return x, attn_weights, past_key_value, reshaped_idx

configuration.py ADDED Viewed

	@@ -0,0 +1,207 @@

+# Adapted from https://github.com/mosaicml/llm-foundry
+# Classes changed: MPTConfig
+# SPDX-License-Identifier: Apache-2.0
+"""A HuggingFace-style model configuration."""
+from typing import Dict, List, Optional, Union
+from transformers import PretrainedConfig
+# Default values for the ``attn_config`` dictionary of ExtendedMPTConfig.
+# Keys that a caller leaves out are filled in from this mapping.
+attn_config_defaults: Dict = {
+    'attn_type': 'multihead_attention',
+    'attn_pdrop': 0.0,
+    'attn_impl': 'torch',
+    'qk_ln': False,
+    'clip_qkv': None,
+    'softmax_scale': None,
+    'prefix_lm': False,
+    'attn_uses_sequence_id': False,
+    'alibi': True,
+    'alibi_bias_max': 8,
+    # NOTE(review): the keys below appear to configure the external-memory
+    # ("active externalism") extension - confirm against the attention and model code.
+    "topk": 10,
+    'mask_by_sim':True,
+    'sim_threshold':0.25,
+    'use_active_externalism':True,
+    'memory_type':'manual'
+}
+# Default values for the ``init_config`` dictionary of ExtendedMPTConfig,
+# applied the same way as ``attn_config_defaults``.
+init_config_defaults: Dict = {
+    'name': 'kaiming_normal_',
+    'fan_mode': 'fan_in',
+    'init_nonlinearity': 'relu',
+    'init_div_is_residual': True,
+    'emb_init_std': None,
+    'emb_init_uniform_lim': None,
+    'init_std': None,
+    'init_gain': 0.0,
+}
+class ExtendedMPTConfig(PretrainedConfig):
+    model_type = 'extended-mpt'
+    def __init__(
+        self,
+        d_model: int = 4096,
+        n_heads: int = 32,
+        n_layers: int = 32,
+        expansion_ratio: int = 4,
+        max_seq_len: int = 2048,
+        vocab_size: int = 50432,
+        resid_pdrop: float = 0.0,
+        emb_pdrop: float = 0.0,
+        learned_pos_emb: bool = True,
+        attn_config: Dict = attn_config_defaults,
+        init_device: str = 'cpu',
+        logit_scale: Optional[Union[float, str]] = None,
+        no_bias: bool = True,
+        verbose: int = 0,
+        embedding_fraction: float = 1.0,
+        norm_type: str = 'low_precision_layernorm',
+        use_cache: bool = False,
+        init_config: Dict = init_config_defaults,
+        use_active_externalism_by_layer: List[bool] = [True for _ in range(32)],
+        memory_device:str = 'cpu',
+        **kwargs,
+    ):
+        """The MPT configuration class.
+        Args:
+            d_model (int): The size of the embedding dimension of the model.
+            n_heads (int): The number of attention heads.
+            n_layers (int): The number of layers in the model.
+            expansion_ratio (int): The ratio of the up/down scale in the MLP.
+            max_seq_len (int): The maximum sequence length of the model.
+            vocab_size (int): The size of the vocabulary.
+            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
+            emb_pdrop (float): The dropout probability for the embedding layer.
+            learned_pos_emb (bool): Whether to use learned positional embeddings
+            attn_config (Dict):  A dictionary used to configure the model's attention module:
+                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
+                attn_pdrop (float): The dropout probability for the attention layers.
+                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
+                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
+                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
+                    this value.
+                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
+                    use the default scale of ``1/sqrt(d_keys)``.
+                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
+                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
+                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
+                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
+                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
+                    which sub-sequence each token belongs to.
+                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
+                alibi (bool): Whether to use the alibi bias instead of position embeddings.
+                alibi_bias_max (int): The maximum value of the alibi bias.
+            init_device (str): The device to use for parameter initialization.
+            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
+            no_bias (bool): Whether to use bias in all layers.
+            verbose (int): The verbosity level. 0 is silent.
+            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
+            norm_type (str): choose type of norm to use
+            multiquery_attention (bool): Whether to use multiquery attention implementation.
+            use_cache (bool): Whether or not the model should return the last key/values attentions
+            init_config (Dict): A dictionary used to configure the model initialization:
+                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
+                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
+                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
+                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
+                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
+                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
+                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
+                init_std (float): The standard deviation of the normal distribution used to initialize the model,
+                    if using the baseline_ parameter initialization scheme.
+                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
+                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
+                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
+                ---
+                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
+        """
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.expansion_ratio = expansion_ratio
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.learned_pos_emb = learned_pos_emb
+        self.attn_config = attn_config
+        self.init_device = init_device
+        self.logit_scale = logit_scale
+        self.no_bias = no_bias
+        self.verbose = verbose
+        self.embedding_fraction = embedding_fraction
+        self.norm_type = norm_type
+        self.use_cache = use_cache
+        self.init_config = init_config
+        self.use_active_externalism_by_layer = use_active_externalism_by_layer
+        self.memory_device = memory_device
+        if 'name' in kwargs:
+            del kwargs['name']
+        if 'loss_fn' in kwargs:
+            del kwargs['loss_fn']
+        super().__init__(**kwargs)
+        self._validate_config()
+    def _set_config_defaults(self, config, config_defaults):
+        # set config defaults
+        for k, v in config_defaults.items():
+            if k not in config:
+                config[k] = v
+        return config
+    def _validate_config(self):
+        # set config defaults
+        self.attn_config = self._set_config_defaults(
+            self.attn_config,
+            attn_config_defaults,
+        )
+        self.init_config = self._set_config_defaults(
+            self.init_config,
+            init_config_defaults,
+        )
+        if self.d_model % self.n_heads != 0:
+            raise ValueError('d_model must be divisible by n_heads')
+        if any(
+                prob < 0 or prob > 1 for prob in
+            [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop]):
+            raise ValueError(
+                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1"
+            )
+        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
+            raise ValueError(
+                f"Unknown attn_impl={self.attn_config['attn_impl']}")
+        if self.attn_config['prefix_lm'] and self.attn_config[
+                'attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError(
+                'prefix_lm only implemented with torch and triton attention.')
+        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in [
+                'torch', 'triton'
+        ]:
+            raise NotImplementedError(
+                'alibi only implemented with torch and triton attention.')
+        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
+                'attn_impl'] not in ['torch', 'triton']:
+            raise NotImplementedError(
+                'attn_uses_sequence_id only implemented with torch and triton attention.'
+            )
+        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
+            raise ValueError(
+                'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!'
+            )
+        if isinstance(self.logit_scale,
+                      str) and self.logit_scale != 'inv_sqrt_d_model':
+            raise ValueError(
+                f"{self.logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
+            )
+        if self.init_config.get('name', None) is None:
+            raise ValueError(f"{self.init_config=} 'name' needs to be set.")
+        if not self.learned_pos_emb and not self.attn_config['alibi']:
+            raise ValueError(
+                f'Positional information must be provided to the model using either learned_pos_emb or alibi.'
+            )

modeling_mpt.py ADDED Viewed

	@@ -0,0 +1,837 @@

+# Adapted from https://github.com/mosaicml/llm-foundry
+# Classes changed: MPTModel, MPTForCausalLM
+# SPDX-License-Identifier: Apache-2.0
+"""A simple, flexible implementation of a GPT model.
+Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
+"""
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.linalg import vector_norm
+import faiss
+from einops import rearrange
+from composer.utils import dist
+from omegaconf import DictConfig
+from transformers import (PreTrainedModel, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
+from transformers.modeling_outputs import (BaseModelOutputWithPast,
+                                           CausalLMOutputWithPast)
+from llmfoundry.models.layers.custom_embedding import SharedEmbedding
+from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
+from llmfoundry.models.utils.param_init_fns import MODEL_INIT_REGISTRY
+from .configuration import ExtendedMPTConfig
+from .attention import attn_bias_shape, build_attn_bias
+from .blocks import MPTBlock
+from .utils import instantiate_from_config
+# Type alias covering both tokenizer base classes imported from transformers.
+Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+class MPTPreTrainedModel(PreTrainedModel):
+    """Common ``PreTrainedModel`` base that binds the MPT models to ``ExtendedMPTConfig``."""
+    # Configuration class associated with models deriving from this base.
+    config_class = ExtendedMPTConfig
+    base_model_prefix = 'model'
+    # NOTE(review): presumably tells transformers never to split an MPTBlock
+    # across devices when sharding the model - confirm against the transformers docs.
+    _no_split_modules = ['MPTBlock']
+class ExtendedMPTModel(MPTPreTrainedModel):
+    def __init__(self, config: ExtendedMPTConfig):
+        """Build the embeddings, the stack of blocks and the final norm from ``config``.
+        Raises:
+            NotImplementedError: If ``config.norm_type`` is not a key of ``NORM_CLASS_REGISTRY``.
+        """
+        # Re-run validation so the attn_config/init_config keys read below are present.
+        config._validate_config()
+        super().__init__(config)
+        self.attn_impl = config.attn_config['attn_impl']
+        self.prefix_lm = config.attn_config['prefix_lm']
+        self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
+        self.alibi = config.attn_config['alibi']
+        self.alibi_bias_max = config.attn_config['alibi_bias_max']
+        # Settings of the external-memory extension; topk and
+        # use_active_externalism act as defaults for the forward() arguments.
+        self.mask_by_sim = config.attn_config['mask_by_sim']
+        self.sim_threshold = config.attn_config['sim_threshold']
+        self.topk = config.attn_config['topk']
+        self.use_active_externalism = config.attn_config['use_active_externalism']
+        self.use_active_externalism_by_layer = config.use_active_externalism_by_layer
+        # 'mixed' resolves to 'cpu' on local rank 0 and to 'meta' on every other rank.
+        if config.init_device == 'mixed':
+            if dist.get_local_rank() == 0:
+                config.init_device = 'cpu'
+            else:
+                config.init_device = 'meta'
+        if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
+            norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
+            raise NotImplementedError(
+                f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).'
+            )
+        norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
+        # CogView (https://arxiv.org/abs/2105.13290) and GLM-130B (https://arxiv.org/abs/2210.02414)
+        # both report this helping with stabilizing training
+        self.embedding_fraction = config.embedding_fraction
+        self.wte = SharedEmbedding(config.vocab_size,
+                                   config.d_model,
+                                   device=config.init_device)
+        # A learned position embedding is only created when alibi is disabled.
+        if not self.alibi:
+            self.wpe = torch.nn.Embedding(config.max_seq_len,
+                                          config.d_model,
+                                          device=config.init_device)
+        self.emb_drop = nn.Dropout(config.emb_pdrop)
+        # Every block receives the complete config as keyword arguments.
+        self.blocks = nn.ModuleList([
+            MPTBlock(
+                device=config.init_device,
+                **config.to_dict(),
+            ) for _ in range(config.n_layers)
+        ])
+        self.norm_f = norm_class(config.d_model, device=config.init_device)
+        # Parameters are initialised here unless they live on the meta device.
+        if config.init_device != 'meta':
+            print(
+                f'You are using {config.init_device=}, but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.'
+            )
+            self.apply(self.param_init_fn)
+        self.is_causal = not self.prefix_lm
+        # define attn mask
+        # The bias tensor itself is built lazily on the first _attn_bias() call.
+        self._attn_bias_initialized = False
+        self.attn_bias = None
+        self.attn_bias_shape = attn_bias_shape(
+            self.attn_impl,
+            config.n_heads,
+            config.max_seq_len,
+            self.alibi,
+            prefix_lm=self.prefix_lm,
+            causal=self.is_causal,
+            use_sequence_id=self.attn_uses_sequence_id,
+        )
+        # Bias for the external-memory attention; rebuilt by _attn_bias() when requested.
+        self._attn_bias_ae_initialized = False
+        self.attn_bias_ae = None
+        # With no_bias set, the bias parameter of every sub-module that has one is removed.
+        if self.config.no_bias:
+            for module in self.modules():
+                if hasattr(module, 'bias') and isinstance(
+                        module.bias, nn.Parameter):
+                    if self.config.verbose:
+                        warnings.warn(
+                            f'Removing bias ({module.bias}) from {module}.')
+                    module.register_parameter('bias', None)
+        # Print verbose info
+        if config.verbose and config.verbose > 2:
+            print(self)
+        # init_config inherits the model verbosity unless it sets its own;
+        # note that this writes into the config's init_config dictionary.
+        if 'verbose' not in self.config.init_config:
+            self.config.init_config['verbose'] = self.config.verbose
+        if self.config.init_config['verbose'] > 1:
+            init_fn_name = self.config.init_config['name']
+            warnings.warn(f'Using {init_fn_name} initialization.')
+    def get_input_embeddings(self):
+        """Return the token embedding module (``self.wte``)."""
+        return self.wte
+    def set_input_embeddings(self, value: nn.Embedding):
+        """Replace the token embedding module (``self.wte``) with ``value``."""
+        self.wte = value
+    @torch.no_grad()
+    def _attn_bias(
+        self,
+        device,
+        dtype,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        prefix_mask: Optional[torch.ByteTensor] = None,
+        sequence_id: Optional[torch.LongTensor] = None,
+        seq_len: Optional[int] = None,
+        use_active_externalism:bool=None,
+        topk=None,
+    ):
+        if not self._attn_bias_initialized:
+            if self.attn_bias_shape:
+                self.attn_bias = torch.zeros(self.attn_bias_shape,
+                                             device=device,
+                                             dtype=dtype)
+                self.attn_bias = build_attn_bias(
+                    self.attn_impl,
+                    self.config.n_heads,
+                    self.config.max_seq_len,
+                    device=device,
+                    dtype=dtype,
+                    attn_bias = self.attn_bias,
+                    causal=self.is_causal,
+                    alibi=self.alibi,
+                    alibi_bias_max=self.alibi_bias_max
+                )
+            self._attn_bias_initialized = True
+        if use_active_externalism:
+            self.attn_bias_ae = build_attn_bias(
+                self.attn_impl,
+                self.config.n_heads,
+                seq_len,
+                device=device,
+                dtype=dtype,
+                causal=self.is_causal,
+                alibi=self.alibi,
+                alibi_bias_max=self.alibi_bias_max,
+                for_ae=use_active_externalism,
+                topk=topk
+            )
+            self._attn_bias_ae_initialized = True
+        # flash does not support prefix_lm and will incorporate any
+        # attention_mask inside the attention module
+        if self.attn_impl == 'flash':
+            return self.attn_bias, attention_mask
+        if self.attn_bias is not None:
+            # .to(*args, **kwargs) is a no-op if tensor is already on
+            # specified device or of specificed dtype
+            self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
+        attn_bias = self.attn_bias
+        if self.attn_bias_ae is not None:
+            self.attn_bias_ae = self.attn_bias_ae.to(dtype=dtype, device=device)
+        attn_bias_ae = self.attn_bias_ae
+        # If using torch or triton, we incorporate the prefix_mask (if appropriate)
+        if self.prefix_lm:
+            assert isinstance(attn_bias, torch.Tensor)  # pyright
+            assert isinstance(prefix_mask, torch.Tensor)  # pyright
+            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
+        # If using torch or triton, we incorporate sequence_id (if appropriate)
+        if self.attn_uses_sequence_id and sequence_id is not None:
+            assert isinstance(attn_bias, torch.Tensor)  # pyright
+            attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
+        # If using torch or triton, we incorporate attention_mask. This will output
+        # None in place of attention_mask since it will not be further needed in the
+        # attention modules.
+        if attention_mask is not None:
+            s_k = attention_mask.shape[-1]
+            if attn_bias is None:
+                attn_bias = torch.zeros((1, 1, 1, s_k),
+                                        device=device,
+                                        dtype=dtype)
+            else:
+                # clamp to 0 necessary for torch 2.0 compile()
+                _s_k = max(0, attn_bias.size(-1) - s_k)
+                attn_bias = attn_bias[:, :, :, _s_k:]
+            if prefix_mask is not None and (attention_mask.shape !=
+                                            prefix_mask.shape):
+                raise ValueError(
+                    f'attention_mask shape={attention_mask.shape} ' +
+                    f'and prefix_mask shape={prefix_mask.shape} are not equal.')
+            min_val = torch.finfo(attn_bias.dtype).min
+            attn_bias = attn_bias.masked_fill(
+                ~attention_mask.view(-1, 1, 1, s_k), min_val)
+        return attn_bias, attn_bias_ae, None
+    def _apply_prefix_mask(self, attn_bias: torch.Tensor,
+                           prefix_mask: torch.Tensor):
+        s_k, s_q = attn_bias.shape[-2:]
+        if (s_k != self.config.max_seq_len) or (s_q != self.config.max_seq_len):
+            raise ValueError(
+                'attn_bias does not match the expected shape. ' +
+                f'The last two dimensions should both be {self.config.max_length} '
+                + f'but are {s_k} and {s_q}.')
+        seq_len = prefix_mask.shape[-1]
+        if seq_len > self.config.max_seq_len:
+            raise ValueError(
+                f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}'
+            )
+        # select seq_len subset of attn mask
+        attn_bias = attn_bias[..., :seq_len, :seq_len]
+        # Mix the causal max and the bidirectional mask to get the full
+        # allowable attention (i.e. full = not accounting for padding yet)
+        causal = torch.tril(
+            torch.ones((seq_len, seq_len),
+                       dtype=torch.bool,
+                       device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
+        prefix = prefix_mask.view(-1, 1, 1, seq_len)
+        cannot_attend = ~torch.logical_or(causal, prefix.bool())
+        min_val = torch.finfo(attn_bias.dtype).min
+        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
+        return attn_bias
+    def _apply_sequence_id(self, attn_bias: torch.Tensor,
+                           sequence_id: torch.LongTensor):
+        seq_len = sequence_id.shape[-1]
+        if seq_len > self.config.max_seq_len:
+            raise ValueError(
+                f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}'
+            )
+        # select seq_len subset of attn mask
+        attn_bias = attn_bias[..., :seq_len, :seq_len]
+        # Restrict attention to tokens that share the same value
+        # in sequence_id
+        cannot_attend = torch.logical_not(
+            torch.eq(
+                sequence_id.view(-1, seq_len, 1),
+                sequence_id.view(-1, 1, seq_len),
+            )).unsqueeze(1)
+        min_val = torch.finfo(attn_bias.dtype).min
+        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
+        return attn_bias
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        prefix_mask: Optional[torch.ByteTensor] = None,
+        sequence_id: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_active_externalism:Optional[bool]=None,
+        long_range_past_key_values:Optional[List[Tuple[torch.FloatTensor]]] = None,
+        faiss_indexes:Tuple=None,
+        topk:int=None,
+    ):
+        return_dict = (return_dict
+                       if return_dict is not None else self.config.return_dict)
+        use_cache = (use_cache
+                     if use_cache is not None else self.config.use_cache)
+        use_active_externalism = (use_active_externalism
+                     if use_active_externalism is not None else self.use_active_externalism)
+        topk = (topk if topk is not None else self.topk)
+        if attention_mask is not None:
+            attention_mask = attention_mask.bool()
+        if prefix_mask is not None:
+            prefix_mask = prefix_mask.bool()
+        # These args are passed in by keyword in huggingface's generate function
+        # https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/generation/utils.py#L2201-L2206
+        # but have not yet been fully implemented in MPTModel
+        if not return_dict:
+            raise NotImplementedError(
+                'return_dict False is not implemented yet for MPT')
+        if output_attentions:
+            if self.attn_impl != 'torch':
+                raise NotImplementedError(
+                    'output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`.'
+                )
+        if (attention_mask is not None and
+                attention_mask[:, 0].sum() != attention_mask.shape[0] and
+                self.training):
+            raise NotImplementedError(
+                'MPT does not support training with left padding.')
+        if self.prefix_lm and prefix_mask is None:
+            raise ValueError(
+                'prefix_mask is a required argument when MPT is configured with prefix_lm=True.'
+            )
+        # Raise a not implemented error if input_embeds is not None (this is an arg in huggingface transformers and we need to support it for PEFT)
+        if inputs_embeds is not None:
+            raise NotImplementedError(
+                'inputs_embeds is not implemented for MPT.')
+        if self.training:
+            if self.attn_uses_sequence_id and sequence_id is None:
+                raise ValueError(
+                    'sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True '
+                    + 'and the model is in train mode.')
+            elif (self.attn_uses_sequence_id is False) and (sequence_id
+                                                            is not None):
+                warnings.warn(
+                    'MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. '
+                    +
+                    'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.'
+                )
+        S = input_ids.size(1)
+        assert (
+            S <= self.config.max_seq_len
+        ), f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
+        tok_emb = self.wte(input_ids)  # type: ignore
+        if self.alibi:
+            x = tok_emb
+        else:
+            past_position = 0
+            if past_key_values is not None:
+                if len(past_key_values) != self.config.n_layers:
+                    raise ValueError(
+                        f'past_key_values must provide a past_key_value for each attention '
+                        +
+                        f'layer in the network ({len(past_key_values)=}; {self.config.n_layers=}).'
+                    )
+                # For attn_impl: triton and flash the past key tensor spec is (batch, seq, dim).
+                # For attn_impl: torch the past key tensor spec is (batch, heads, head_dim, seq).
+                # Here we shift position embedding using the `seq` dim of the past key
+                past_position = past_key_values[0][0].size(1)
+                if self.attn_impl == 'torch':
+                    past_position = past_key_values[0][0].size(3)
+            if S + past_position > self.config.max_seq_len:
+                raise ValueError(
+                    f'Cannot forward input with past sequence length {past_position} and current sequence length '
+                    f'{S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.'
+                )
+            pos = torch.arange(
+                past_position,
+                S + past_position,
+                dtype=torch.long,
+                device=input_ids.device,
+            ).unsqueeze(0)
+            if attention_mask is not None:
+                # adjust the position indices to account for padding tokens
+                pos = torch.clamp(
+                    pos - torch.cumsum((~attention_mask).to(torch.int32),
+                                       dim=1)[:, past_position:],
+                    min=0,
+                )
+            pos_emb = self.wpe(pos)  # type: ignore
+            x = tok_emb + pos_emb
+        if self.embedding_fraction == 1:
+            x = self.emb_drop(x)  # type: ignore
+        else:
+            # this implementation is proposed on page 7 of the GLM-130B paper https://arxiv.org/abs/2210.02414
+            x_shrunk = (x * self.embedding_fraction) + (
+                x.detach() * (1 - self.embedding_fraction))
+            assert isinstance(self.emb_drop, nn.Module)  # pyright
+            x = self.emb_drop(x_shrunk)
+        # self._attn_bias_initialized = False #right now this needs to run each step
+        seq_len = S
+        if past_key_values is not None:
+            past_position = past_key_values[0][0].size(-1)
+            seq_len += past_position
+        attn_bias, attn_bias_ae, attention_mask = self._attn_bias(
+            device=x.device,
+            dtype=torch.float32,
+            attention_mask=attention_mask,
+            prefix_mask=prefix_mask,
+            sequence_id=sequence_id,
+            seq_len = seq_len,
+            use_active_externalism=use_active_externalism,
+            topk=topk
+        )
+        # initialize the past key values cache if it should be used
+        if use_cache and past_key_values is None:
+            past_key_values = [() for _ in range(self.config.n_layers)
+                              ]  # type: ignore
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_idx = () if output_attentions else None
+        for b_idx, block in enumerate(self.blocks):  # type: ignore
+            if output_hidden_states:
+                assert all_hidden_states is not None  # pyright
+                all_hidden_states = all_hidden_states + (x,)
+            past_key_value = (past_key_values[b_idx]
+                              if past_key_values is not None else None)
+            long_range_past_key_value = (long_range_past_key_values[b_idx]
+                                          if (long_range_past_key_values is not None and self.use_active_externalism_by_layer[b_idx] and use_active_externalism is True) else None)
+            if long_range_past_key_value is not None and faiss_indexes is not None:
+                raise NotImplementedError(
+                    'Using faiss and passing key value pairs manually are mutually exclusive right now.')
+            x, attn_weights, past_key_value, reshaped_idx = block(
+                x,
+                past_key_value=past_key_value,
+                long_range_past_key_value=long_range_past_key_value,
+                attn_bias=attn_bias,
+                attention_mask=attention_mask,
+                attn_bias_ae=attn_bias_ae,
+                is_causal=self.is_causal,
+                topk=topk,
+                needs_weights=output_attentions,
+                faiss_indexes=faiss_indexes,
+                n_layers=self.config.n_layers,
+                current_layer=b_idx,
+                mask_by_sim=self.mask_by_sim,
+                sim_threshold=self.sim_threshold,
+            )
+            if past_key_values is not None:
+                past_key_values[b_idx] = past_key_value
+            if output_attentions:
+                assert all_self_attns is not None  # pyright
+                all_self_attns = all_self_attns + (attn_weights,)
+                assert all_idx is not None
+                all_idx = all_idx + (reshaped_idx,)
+        x = self.norm_f(x)  # type: ignore
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            assert all_hidden_states is not None  # pyright
+            all_hidden_states = all_hidden_states + (x,)
+        return BaseModelOutputWithPast(
+            last_hidden_state=x,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=(all_self_attns, all_idx),
+        )
+    # Param Initialization, needed for device='meta' fast initialization
+    def param_init_fn(self, module):
+        init_fn_name = self.config.init_config['name']
+        MODEL_INIT_REGISTRY[init_fn_name](
+            module=module,
+            n_layers=self.config.n_layers,
+            d_model=self.config.d_model,
+            **self.config.init_config,
+        )
+    # FSDP Wrap function
+    def fsdp_wrap_fn(self, module):
+        return isinstance(module, MPTBlock)
+    # Activation Checkpointing
+    def activation_checkpointing_fn(self, module):
+        return isinstance(module, MPTBlock)
+class ExtendedMPTForCausalLM(MPTPreTrainedModel):
+    def __init__(self, config:ExtendedMPTConfig, external_memories=None):
+        if isinstance(config, DictConfig):
+            config = instantiate_from_config(config)
+        super().__init__(config)
+        if not config.tie_word_embeddings:
+            raise ValueError(
+                'MPTForCausalLM only supports tied word embeddings')
+        print(f'Instantiating an MPTForCausalLM model from {__file__}')
+        self.transformer: ExtendedMPTModel = ExtendedMPTModel(config)
+        self.use_active_externalism = config.attn_config['use_active_externalism']
+        self.memory_type = config.attn_config['memory_type']
+        self._memories = None
+        self.memory_device = config.memory_device
+        for child in self.transformer.children():
+            if isinstance(child, torch.nn.ModuleList):
+                continue
+            if isinstance(child, torch.nn.Module):
+                child._fsdp_wrap = True
+        # enables scaling output logits; similar to a softmax "temperature"
+        # PaLM paper uses scale 1/sqrt(config.d_model)
+        self.logit_scale = None
+        if config.logit_scale is not None:
+            logit_scale = config.logit_scale
+            if isinstance(logit_scale, str):
+                if logit_scale == 'inv_sqrt_d_model':
+                    logit_scale = 1 / math.sqrt(config.d_model)
+                else:
+                    raise ValueError(
+                        f"{logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
+                    )
+            self.logit_scale = logit_scale
+        if external_memories is not None:
+            self._memories = external_memories
+            self.memories = None
+    def set_memories(self, memories):
+        self.memories = memories
+    def empty_memories(self):
+        self.memories = None
+    def get_input_embeddings(self):
+        return self.transformer.wte
+    def set_input_embeddings(self, value):
+        self.transformer.wte = value
+    def get_output_embeddings(self):
+        return self.transformer.wte
+    def set_output_embeddings(self, new_embeddings):
+        self.transformer.wte = new_embeddings
+    def set_decoder(self, decoder):
+        self.transformer = decoder
+    def get_decoder(self):
+        return self.transformer
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        prefix_mask: Optional[torch.ByteTensor] = None,
+        sequence_id: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_active_externalism: Optional[bool]=None,
+        topk:int=None
+    ):
+        if self._memories is not None and self.memories is None:
+            self.memories = self.generate_cache(self._memories, cache_type=self.memory_type)
+        return_dict = (return_dict
+                       if return_dict is not None else self.config.return_dict)
+        use_cache = (use_cache
+                     if use_cache is not None else self.config.use_cache)
+        use_active_externalism = (use_active_externalism
+                     if use_active_externalism is not None else self.use_active_externalism)
+        topk = topk if topk is not None else None
+        # if input_embeds is not none, raise a not implemented error
+        if inputs_embeds is not None:
+            raise NotImplementedError(
+                'inputs_embeds has to be None (for hf/peft support).')
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        if hasattr(self, "memories") and type(self.memories)==list:
+            long_range_past_key_values = self.memories
+            faiss_indexes = None
+        elif hasattr(self, "memories"):
+            long_range_past_key_values = None
+            faiss_indexes = self.memories
+        else:
+            long_range_past_key_values = None
+            faiss_indexes = None
+        outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            long_range_past_key_values=long_range_past_key_values,
+            faiss_indexes=faiss_indexes,
+            attention_mask=attention_mask,
+            prefix_mask=prefix_mask,
+            sequence_id=sequence_id,
+            return_dict=return_dict,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            use_active_externalism=use_active_externalism,
+            topk=topk
+        )
+        # move outputs to same device as weights for token embedding
+        # needed to support HF `device_map`
+        logits = self.transformer.wte(
+            outputs.last_hidden_state.to(self.transformer.wte.weight.device),
+            True,
+        )
+        if self.logit_scale is not None:
+            if self.logit_scale == 0:
+                warnings.warn(
+                    f'Multiplying logits by {self.logit_scale=}. This will produce uniform (uninformative) outputs.'
+                )
+            logits *= self.logit_scale
+        loss = None
+        if labels is not None:
+            _labels = torch.roll(labels, shifts=-1)
+            _labels[:, -1] = -100
+            loss = F.cross_entropy(
+                logits.view(-1, logits.size(-1)),
+                _labels.to(logits.device).view(-1),
+            )
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    # Param Initialization, needed for device='meta' fast initialization
+    def param_init_fn(self, module):
+        init_fn_name = self.config.init_config['name']
+        MODEL_INIT_REGISTRY[init_fn_name](
+            module=module,
+            n_layers=self.config.n_layers,
+            d_model=self.config.d_model,
+            **self.config.init_config,
+        )
+    # FSDP Wrap function
+    def fsdp_wrap_fn(self, module):
+        return isinstance(module, MPTBlock)
+    # Activation Checkpointing
+    def activation_checkpointing_fn(self, module):
+        return isinstance(module, MPTBlock)
+    def generate_cache(self,
+                       input_ids:torch.LongTensor,
+                       stride:int=512,
+                       max_len:int=2048,
+                       cache_type:str='manual'):
+        if cache_type not in ['manual', 'faiss']:
+            raise NotImplementedError(f"Cache type {cache_type} not implemented.")
+        prev_end_loc=0
+        long_range_past_key_values = None
+        faiss_indexes= None
+        for b_idx in range(0, input_ids.size(-1), stride):
+            end_loc = min(b_idx + max_len, input_ids.size(-1))
+            trg_len = end_loc - prev_end_loc
+            subseq = input_ids[:, b_idx:end_loc].to(self.device)
+            with torch.no_grad():
+                outputs = self.transformer(subseq, use_cache=True, use_active_externalism=False)
+            to_cache = [(
+                        kv[0][:,:,:,-trg_len:],
+                        kv[1][:,:,-trg_len:])
+                        for kv in outputs.past_key_values
+                        ]
+            long_range_past_key_values, faiss_indexes = self.cache(to_cache, cache_type, long_range_past_key_values=long_range_past_key_values, faiss_indexes=faiss_indexes)
+            prev_end_loc = end_loc
+            if end_loc == input_ids.size(-1):
+                break
+        if long_range_past_key_values is not None:
+            return long_range_past_key_values
+        else:
+            return faiss_indexes
+    def cache(self,
+              to_cache:List,
+              cache_type:str='manual',
+              long_range_past_key_values:List=None,
+              faiss_indexes:faiss.IndexFlatIP=None,
+              max_length_cache=100000,
+              verbose=False):
+        """Add new key/value pairs to either a faiss index pair or a manual cache.
+
+        Args:
+            to_cache: list of ``(key, value)`` tensor pairs. The position of a
+                pair in the list selects its block of one-hot rows, so it is
+                presumably one pair per layer (TODO confirm). Keys are laid
+                out as ``b h d s`` and values as ``b h s d`` (see the
+                ``rearrange`` patterns below).
+            cache_type: ``'faiss'`` adds to faiss indexes; any other value
+                takes the manual concatenation path.
+            long_range_past_key_values: existing manual cache to extend, or
+                ``None`` to start a new one.
+            faiss_indexes: existing ``(kn_index, kv_index)`` pair, or ``None``
+                to create a new pair (faiss path only).
+            max_length_cache: maximum number of cached positions kept in the
+                manual cache; older positions are dropped.
+            verbose: print the resulting cache size.
+
+        Returns:
+            ``(long_range_past_key_values, faiss_pair)`` where ``faiss_pair``
+            is ``(kn_index, kv_index)`` for ``cache_type == 'faiss'`` and
+            ``None`` otherwise.
+
+        Raises:
+            NotImplementedError: if both ``long_range_past_key_values`` and
+                ``faiss_indexes`` are given.
+        """
+        if long_range_past_key_values is not None and faiss_indexes is not None:
+            raise NotImplementedError("Using faiss and passing key value pairs manually are mutually exclusive right now.")
+        if cache_type=='faiss':
+            # One one-hot row per (layer, head) slot, scaled by 10.
+            # NOTE(review): presumably the scaling makes inner-product search
+            # favour entries from the same layer/head -- confirm with the query side.
+            one_hot_encodings = F.one_hot(torch.arange(0, self.config.n_heads*self.config.n_layers))*10
+            if faiss_indexes is None:
+                # Key index dim = key feature dim + one-hot width;
+                # key/value index dim = twice the value feature dim.
+                faiss_indexes = (faiss.IndexFlatIP(to_cache[0][0].size(-2)+one_hot_encodings.size(-1)), faiss.IndexFlatIP(to_cache[0][1].size(-1)*2))
+            kn_index, kv_index = faiss_indexes
+            for b_idx, (k, v) in enumerate(to_cache):
+                # L2-normalise keys along the feature axis (dim=-2 of 'b h d s').
+                k_n = (k/vector_norm(k, ord=2, dim=-2, keepdim=True)).to('cpu')
+                # Flatten heads and positions, then append the one-hot rows of this
+                # entry's heads, each repeated once per position.
+                k_n = torch.concat([rearrange(k_n, 'b h d s -> b (h s) d', h=self.config.n_heads), one_hot_encodings[self.config.n_heads*b_idx:self.config.n_heads*(b_idx+1)].unsqueeze(0).repeat_interleave(repeats=k.size(-1), dim=-2)], dim=-1)
+                # assumes batch size 1 so squeeze() only drops the batch axis -- TODO confirm
+                kn_index.add(k_n.squeeze().numpy())
+                k= rearrange(k, 'b h d s -> b (h s) d', h=self.config.n_heads)
+                v= rearrange(v, 'b h s d -> b (h s) d', h=self.config.n_heads)
+                # Store value and (unnormalised) key side by side: [v, k].
+                kv_index.add(torch.concat([v.squeeze(), k.squeeze()], dim=1).to('cpu').numpy())
+        else:
+            if long_range_past_key_values is None:
+                # First call: move the new pairs to the memory device.
+                long_range_past_key_values = [(k.to(self.memory_device),v.to(self.memory_device)) for k,v in to_cache]
+            else:
+                # Append new positions: keys along dim 3, values along dim 2.
+                long_range_past_key_values = [
+                    (
+                    torch.concat([kv[0], to_cache[ind][0].to(self.memory_device)], dim=3),
+                    torch.concat([kv[1], to_cache[ind][1].to(self.memory_device)], dim=2)
+                    )
+                    for ind, kv in enumerate(long_range_past_key_values)
+                ]
+        if long_range_past_key_values is not None:
+            if long_range_past_key_values[0][0].size(-1) > max_length_cache: #set a limit on manual memory length
+                # Keep only the most recent `max_length_cache` positions.
+                long_range_past_key_values = [
+                    (
+                        kv[0][:, :, :, -max_length_cache:],
+                        kv[1][:, :, -max_length_cache:]
+                    )
+                    for kv in long_range_past_key_values]
+        if verbose:
+            if cache_type == 'faiss':
+                print(f"{kn_index.ntotal} keys in faiss index")
+            else:
+                print(f"{long_range_past_key_values[0][0].size(-1)} cached kvs")
+        # The conditional binds tighter than the comma: this is always a 2-tuple.
+        return long_range_past_key_values, (kn_index, kv_index) if cache_type == 'faiss' else None
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        **kwargs,
+    ):
+        if inputs_embeds is not None:
+            raise NotImplementedError(
+                'inputs_embeds is not implemented for MPT yet')
+        attention_mask = kwargs['attention_mask'].bool()
+        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
+            raise NotImplementedError(
+                'MPT does not support generation with right padding.')
+        if self.transformer.attn_uses_sequence_id and self.training:
+            sequence_id = torch.zeros_like(input_ids[:1])
+        else:
+            sequence_id = None
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+        if self.transformer.prefix_lm:
+            # Leverage a convenience of sequential generation!
+            prefix_mask = torch.ones_like(attention_mask)
+            # This requires that we're using the cache
+            if kwargs.get('use_cache') == False:
+                raise NotImplementedError(
+                    'MPT with prefix_lm=True does not support use_cache=False.')
+        else:
+            prefix_mask = None
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'prefix_mask': prefix_mask,
+            'sequence_id': sequence_id,
+            'past_key_values': past_key_values,
+            'use_cache': kwargs.get('use_cache', True),
+            'use_active_externalism': kwargs.get('use_active_externalism'),
+            'topk': kwargs.get('topk', None),
+        }
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        """Used by HuggingFace generate when using beam search with kv-caching.
+        See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
+        for an example in transformers.
+        """
+        reordered_past = []
+        for layer_past in past_key_values:
+            reordered_past += [
+                tuple(
+                    past_state.index_select(0, beam_idx)
+                    for past_state in layer_past)
+            ]
+        return reordered_past

utils.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from .utils import *
+import importlib
+def instantiate_from_config(config):
+    if not "target" in config:
+        raise KeyError("Expected key `target` to instantiate.")
+    return get_obj_from_str(config["target"])(**config.get("params", dict()))
+def get_obj_from_str(string, reload=False):
+    module, cls = string.rsplit(".", 1)
+    if reload:
+        module_imp = importlib.import_module(module)
+        importlib.reload(module_imp)
+    return getattr(importlib.import_module(module, package=None), cls)