variable_cache.py compatibility for v4.57.2 / python3.12
variable_cache.py  +17 -11
@@ -31,6 +31,9 @@ class VariableCache(Cache_4_44_2, Cache):
     The default implementation for the layer caches is StaticCache.
     The cache of each layer is allocated to the same gpu as the layer itself.
     """
+
+    max_batch_size = None
+    max_cache_len = None
 
     def __init__(
         self,
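The two new class-level defaults look like a small defensive shim: with max_batch_size and max_cache_len declared on the class, the names always resolve even if some code path reads them before __init__ has assigned instance values, and the instance assignments simply shadow the class attributes afterwards. A tiny, self-contained illustration of that shadowing (the Example class is made up for this sketch, not part of variable_cache.py):

# Hypothetical class, not from variable_cache.py: instance assignments
# made in __init__ shadow the class-level defaults.
class Example:
    max_cache_len = None  # class attribute, always resolvable

    def __init__(self, max_cache_len=None):
        self.max_cache_len = 4096 if max_cache_len is None else max_cache_len

print(Example.max_cache_len)    # None  -> the class-level default
print(Example().max_cache_len)  # 4096  -> the instance attribute shadows it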
@@ -50,7 +53,7 @@ class VariableCache(Cache_4_44_2, Cache):
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
         self.dtype = dtype
 
-        self.
+        self.layers: list[Cache_4_44_2 | None] = [None] * config.num_hidden_layers
         self.layer_devices: list[torch.device | None] = [None] * config.num_hidden_layers
 
     def update(
@@ -60,11 +63,11 @@ class VariableCache(Cache_4_44_2, Cache):
         layer_idx: int,
         cache_kwargs: Optional[Dict[str, Any]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        if self.
+        if self.layers[layer_idx] is None:
             self.layer_devices[layer_idx] = key_states.device
             self._init_layer_cache(layer_idx)
 
-        layer_cache = self.
+        layer_cache = self.layers[layer_idx]
         assert layer_cache is not None, f"Trying to update the cache of a cache-less layer: {layer_idx=}"
 
         k_out, v_out = layer_cache.update(key_states=key_states,
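Together with the self.layers container introduced in the previous hunk (a name that presumably matches the layers attribute newer transformers Cache objects use for per-layer state), update() now materializes each layer's cache lazily: every slot starts as None, is allocated on the first update for that layer and pinned to the device of the incoming key states, and is reused from then on. Below is a self-contained sketch of that lazy per-layer pattern; LazyPerLayerCache and _DummyLayerCache are illustrative stand-ins, not the real classes:

# Self-contained sketch of the lazy per-layer cache pattern (illustrative
# names): a slot stays None until the first update touches that layer.
class _DummyLayerCache:
    """Stands in for StaticCache / SlidingWindowCache / SinkCache."""

    def __init__(self) -> None:
        self._seq_len = 0

    def update(self, key_states, value_states):
        self._seq_len += len(key_states)
        return key_states, value_states

    def get_seq_length(self) -> int:
        return self._seq_len


class LazyPerLayerCache:
    def __init__(self, num_layers: int) -> None:
        self.layers: list[_DummyLayerCache | None] = [None] * num_layers

    def update(self, key_states, value_states, layer_idx: int):
        if self.layers[layer_idx] is None:        # first touch -> allocate
            self.layers[layer_idx] = _DummyLayerCache()
        return self.layers[layer_idx].update(key_states, value_states)

    def get_seq_length(self, layer_idx: int = 0) -> int:
        layer_cache = self.layers[layer_idx]
        return 0 if layer_cache is None else layer_cache.get_seq_length()


cache = LazyPerLayerCache(num_layers=2)
cache.update([1, 2, 3], [1, 2, 3], layer_idx=0)
print(cache.get_seq_length(0))  # 3
print(cache.get_seq_length(1))  # 0 -- layer 1 was never updated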
@@ -93,37 +96,37 @@ class VariableCache(Cache_4_44_2, Cache):
         if attention_config.window_length is not None:
             if not attention_config.is_sink:
                 config.sliding_window = attention_config.window_length
-                self.
+                self.layers[layer_idx] = SlidingWindowCache(config=config,
                                                             max_batch_size=self.max_batch_size,
                                                             max_cache_len=self.max_cache_len,
                                                             device=device,
                                                             dtype=self.dtype)
                 return
             elif not attention_config.unshifted_sink:
-                self.
+                self.layers[layer_idx] = SinkCache(window_length=attention_config.window_length,
                                                    num_sink_tokens=attention_config.num_sink_tokens)
                 return
 
-        self.
+        self.layers[layer_idx] = StaticCache(config=config,
                                              max_batch_size=self.max_batch_size,
                                              max_cache_len=self.max_cache_len,
                                              device=device,
                                              dtype=self.dtype)
 
     def _get_first_real_cache(self) -> Cache:
-        for layer_cache in self.
+        for layer_cache in self.layers:
             if layer_cache is not None:
                 return layer_cache
         raise ValueError(f"No real cache found, all layer caches are None.")
 
     def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
-        if layer_idx == 0 and self.
+        if layer_idx == 0 and self.layers[0] is None:
             try:
                 layer_cache = self._get_first_real_cache()
             except ValueError:
                 return 0
         else:
-            layer_cache = self.
+            layer_cache = self.layers[layer_idx]
         return layer_cache.get_seq_length()
 
     def get_max_length(self) -> Optional[int]:
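_init_layer_cache picks a concrete cache per layer from its attention config: a sliding-window layer gets a SlidingWindowCache, a sink layer (unless it uses unshifted sinks) gets a SinkCache, and everything else falls back to a StaticCache; get_seq_length then tolerates a cache-less layer 0 by falling back to the first layer that actually has a cache. A self-contained sketch of the dispatch order, using a made-up AttnConfig dataclass and returning class names as strings instead of constructing the real caches:

# Sketch of the per-layer dispatch above; AttnConfig and the returned
# strings are illustrative, not the real transformers types.
from dataclasses import dataclass


@dataclass
class AttnConfig:
    window_length: int | None = None
    is_sink: bool = False
    unshifted_sink: bool = False
    num_sink_tokens: int = 0


def pick_layer_cache(attn: AttnConfig) -> str:
    # Mirrors the branch order in _init_layer_cache: sliding window first,
    # then sink, otherwise fall back to a static (full-length) cache.
    if attn.window_length is not None:
        if not attn.is_sink:
            return "SlidingWindowCache"
        elif not attn.unshifted_sink:
            return "SinkCache"
    return "StaticCache"


print(pick_layer_cache(AttnConfig(window_length=512)))                # SlidingWindowCache
print(pick_layer_cache(AttnConfig(window_length=512, is_sink=True)))  # SinkCache
print(pick_layer_cache(AttnConfig(window_length=512, is_sink=True,
                                  unshifted_sink=True)))              # StaticCache
print(pick_layer_cache(AttnConfig()))                                 # StaticCache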
@@ -131,9 +134,12 @@ class VariableCache(Cache_4_44_2, Cache):
         return self.max_cache_len
 
     def reset(self):
-        for layer_idx in range(len(self.
-            layer_cache = self.
+        for layer_idx in range(len(self.layers)):
+            layer_cache = self.layers[layer_idx]
             if hasattr(layer_cache, "reset"):
                 layer_cache.reset()
             else:
                 self._init_layer_cache(layer_idx)
+
+    def is_compileable(self) -> bool:
+        return False
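The new is_compileable() hook returning False presumably tells callers that check this flag (for example, generation code deciding whether a cache is safe to use with torch.compile-style static decoding) that this heterogeneous, lazily-built cache should not be treated as compileable. For completeness, a hypothetical usage sketch follows; the VariableCache constructor arguments (config, max_batch_size, max_cache_len, dtype) are inferred from the lines shown above and should be checked against the actual __init__ signature, and the model id is a placeholder:

# Hypothetical usage sketch; constructor kwargs inferred from the diff above.
from transformers import AutoModelForCausalLM, AutoTokenizer

from variable_cache import VariableCache  # local module from this repo

model_id = "path/or/hub-id-of-the-model"  # placeholder
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

cache = VariableCache(config=model.config,
                      max_batch_size=1,
                      max_cache_len=4096,
                      dtype=model.dtype)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
out = model.generate(**inputs, past_key_values=cache, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))

In recent transformers versions, generate() accepts a pre-built Cache instance through past_key_values, which is how a custom cache like this one is typically plugged in.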