ffurfaro committed
Commit 61f4597 · verified · 1 Parent(s): 73bcbf2

Upload model + init tptt code

lora_delta_product_m0.5_constant/README.md CHANGED
@@ -75,17 +75,17 @@ print(tokenizer.decode(outputs, skip_special_tokens=True))
  - **Batch size:** 1
  - **Epochs:** 1.0
  - **Learning rate (final):** N/A
- - **Loss (final):** 1.2633397308452659
- - **Training runtime:** 11512.6247 sec
- - **Samples per second:** 0.174
- - **Steps per second:** 0.174
+ - **Loss (final):** 1.2270214224887408
+ - **Training runtime:** 11919.5421 sec
+ - **Samples per second:** 0.168
+ - **Steps per second:** 0.168
  - **Total FLOPs:** 5574366965268480.0
  - **Gradient norm (final):** N/A

  ## Evaluation

  - **Metrics:** Training loss only (no eval yet, table soon: PiQA, ARC, Hella, Wino, GSM8K, MMLU)
- - **Results:** Final training loss: 1.2633397308452659
+ - **Results:** Final training loss: 1.2270214224887408


  ## Citation & Contact
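As a quick sanity check on the updated card values, the step count below is inferred from the reported runtime and throughput; it is not stated in the card:

```python
# Quick consistency check on the reported metrics (values copied from the card above).
runtime_sec = 11919.5421     # "Training runtime"
samples_per_sec = 0.168      # "Samples per second"

# With batch size 1, samples/sec equals steps/sec, so the implied number of steps
# (and samples) in the single epoch is roughly runtime * throughput.
approx_steps = runtime_sec * samples_per_sec
print(f"~{approx_steps:.0f} steps over one epoch")  # ~2002
```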
lora_delta_product_m0.5_constant/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:aa460de2d5833515a362ba04953808e73f83cd2f909de119d2605e18b79d8ec9
+ oid sha256:2c2081ab470b794ec92ac84ee7460cf5e9717d100c98b572257d5616a5daa2ec
  size 27298792
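Only the content hash of the adapter weights changed; the size is identical. A minimal sketch, assuming the file has been downloaded to the path shown in this commit, for checking a local copy against the new LFS oid:

```python
import hashlib

# Recompute the sha256 of a local download and compare it with the oid in the
# Git LFS pointer above. The file path is an assumption based on this repo's layout.
EXPECTED = "2c2081ab470b794ec92ac84ee7460cf5e9717d100c98b572257d5616a5daa2ec"
PATH = "lora_delta_product_m0.5_constant/adapter_model.safetensors"

digest = hashlib.sha256()
with open(PATH, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)

print("hash matches pointer:", digest.hexdigest() == EXPECTED)
```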
lora_delta_product_m0.5_constant/config.json CHANGED
@@ -43,10 +43,10 @@
    "rank_pattern": {},
    "revision": null,
    "target_modules": [
-     "q_proj",
-     "k_proj",
      "o_proj",
-     "v_proj"
+     "k_proj",
+     "v_proj",
+     "q_proj"
    ],
    "task_type": "CAUSAL_LM",
    "use_dora": false,
@@ -79,7 +79,7 @@
      "attention"
    ],
    "torch_dtype": "bfloat16",
-   "transformers_version": "4.49.0",
+   "transformers_version": "4.51.3",
    "use_cache": true,
    "use_linear_checkpoint": true,
    "vocab_size": 32768
lora_delta_product_m0.5_constant/modeling_tptt.py CHANGED
@@ -312,6 +312,7 @@ class LiZAttention(nn.Module):
              self.head_dim,
              self.num_key_value_heads,
              self.num_key_value_groups,
+             self.hidden_dim,
          ) = self._get_attention_parameters(base_attn, base_config)
          self.scaling = self.head_dim**-0.5

@@ -321,7 +322,7 @@
              operator_mode=operator_mode,
              use_linear_checkpoint=use_linear_checkpoint,
              recurrent_config=recurrent_config,
-             hidden_dim=base_config.hidden_size,
+             hidden_dim=self.hidden_dim,
              num_heads=self.num_heads,
              head_dim=self.head_dim,
              num_key_value_heads=self.num_key_value_heads,
@@ -364,11 +365,13 @@
          num_key_value_groups = getattr(base_attn, "num_key_value_groups", None) or (
              num_heads // num_key_value_heads if num_heads and num_key_value_heads else 1
          )
+         hidden_dim = getattr(base_config, "hidden_size", None) or head_dim * num_heads
          return (
              num_heads,
              head_dim,
              num_key_value_heads,
              num_key_value_groups,
+             hidden_dim,
          )

      def _apply_shared_projections(
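The patch threads `hidden_dim` through `_get_attention_parameters` instead of reading `base_config.hidden_size` directly, falling back to `head_dim * num_heads` when the base config exposes no `hidden_size`. A standalone sketch of that fallback (the helper name and dummy config objects are illustrative):

```python
from types import SimpleNamespace

def resolve_hidden_dim(base_config, head_dim: int, num_heads: int) -> int:
    # Same fallback as in the patch: prefer the config's hidden_size,
    # otherwise reconstruct it from the attention geometry.
    return getattr(base_config, "hidden_size", None) or head_dim * num_heads

print(resolve_hidden_dim(SimpleNamespace(hidden_size=4096), 128, 32))  # 4096
print(resolve_hidden_dim(SimpleNamespace(), 128, 32))                  # 4096 (head_dim * num_heads)
```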
lora_delta_product_m0.5_constant/runs/Aug29_07-27-47_c47f5a3d6521/events.out.tfevents.1756452484.c47f5a3d6521.19.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc53b55271fd13e60534334a32a02a17d8dba6167a60bd2d77142740b1d7fc18
+ size 115789
modeling_tptt.py CHANGED
@@ -312,6 +312,7 @@ class LiZAttention(nn.Module):
              self.head_dim,
              self.num_key_value_heads,
              self.num_key_value_groups,
+             self.hidden_dim,
          ) = self._get_attention_parameters(base_attn, base_config)
          self.scaling = self.head_dim**-0.5

@@ -321,7 +322,7 @@
              operator_mode=operator_mode,
              use_linear_checkpoint=use_linear_checkpoint,
              recurrent_config=recurrent_config,
-             hidden_dim=base_config.hidden_size,
+             hidden_dim=self.hidden_dim,
              num_heads=self.num_heads,
              head_dim=self.head_dim,
              num_key_value_heads=self.num_key_value_heads,
@@ -364,11 +365,13 @@
          num_key_value_groups = getattr(base_attn, "num_key_value_groups", None) or (
              num_heads // num_key_value_heads if num_heads and num_key_value_heads else 1
          )
+         hidden_dim = getattr(base_config, "hidden_size", None) or head_dim * num_heads
          return (
              num_heads,
              head_dim,
              num_key_value_heads,
              num_key_value_groups,
+             hidden_dim,
          )

      def _apply_shared_projections(
train_tptt.py CHANGED
@@ -115,7 +115,7 @@ class LiZACallback(TrainerCallback):
          if mag_weight is not None and logs is not None:
              logs["mag_weight"] = float(mag_weight)
          if disable_linear_attn is not None and logs is not None:
-             logs["disable_linear_attn"] = not bool(disable_linear_attn)
+             logs["disable_linear_attn"] = bool(disable_linear_attn)


  def ensure_int(value: Union[int, tuple, list]) -> int:
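The callback previously logged the negation of `disable_linear_attn`; with the fix, a `True` in the training logs now means linear attention really is disabled. A small illustration (the helper is a paraphrase for clarity, not the actual callback code):

```python
def log_flag(logs, disable_linear_attn):
    # Hypothetical helper mirroring the corrected line in LiZACallback.
    if disable_linear_attn is not None and logs is not None:
        logs["disable_linear_attn"] = bool(disable_linear_attn)

logs = {}
log_flag(logs, disable_linear_attn=True)
print(logs)  # {'disable_linear_attn': True} (previously this would have logged False)
```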