adricl commited on
Commit
4ec02d1
·
1 Parent(s): c20cfc8

Trained Model with 14gb/2 dataset

Browse files
GPU_settings ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ training_config = TrainingArguments(
2
+ model_dir_str, False, True, True, False, "steps",
3
+ per_device_train_batch_size=64, #76% @ 24 batch size #76% @ 32 batch size try 64 batch size next time
4
+ per_device_eval_batch_size=64, #was 24 now 32
5
+ gradient_accumulation_steps=3, #change this to 4
6
+ eval_accumulation_steps=None,
7
+ eval_steps=2000,
8
+ learning_rate=1e-4,
9
+ weight_decay=0.01,
10
+ max_grad_norm=1.0,
11
+ max_steps=50000,
12
+ lr_scheduler_type="cosine",
13
+ warmup_ratio=0.08,
14
+ log_level="debug",
15
+ logging_strategy="steps",
16
+ logging_steps=20,
17
+ save_strategy="steps",
18
+ save_steps=1000,
19
+ save_total_limit=5,
20
+ no_cuda=not USE_CUDA,
21
+ seed=444,
22
+ fp16=FP16,
23
+ fp16_full_eval=FP16_EVAL,
24
+ bf16=BF16,
25
+ bf16_full_eval=BF16_EVAL,
26
+ load_best_model_at_end=True,
27
+ label_smoothing_factor=0.,
28
+ optim="adamw_torch",
29
+ report_to=["tensorboard"],
30
+ gradient_checkpointing=True,
31
+ dataloader_num_workers=8, #added to fix thrashing issue with the GPU not having enough data to process
32
+ dataloader_pin_memory=True, #we want the dataset in memory
33
+ torch_compile=True #added to speed up
34
+
35
+
36
+
37
+ Better config suggested by AI
38
+ training_config = TrainingArguments(
39
+ model_dir_str, False, True, True, False, "steps",
40
+ per_device_train_batch_size=64, #76% @ 24 batch size #76% @ 32 batch size try 64 batch size next time
41
+ per_device_eval_batch_size=64, #was 24 now 32
42
+ gradient_accumulation_steps=3, #change this to 4
43
+ eval_accumulation_steps=None,
44
+ eval_steps=3000,
45
+ eval_delay=6000,
46
+ learning_rate=1e-4,
47
+ weight_decay=0.01,
48
+ max_grad_norm=1.0,
49
+ max_steps=30000,
50
+ lr_scheduler_type="cosine",
51
+ warmup_ratio=0.08,
52
+ log_level="debug",
53
+ logging_strategy="steps",
54
+ logging_steps=100,
55
+ save_strategy="steps",
56
+ save_steps=3000,
57
+ save_total_limit=5,
58
+ no_cuda=not USE_CUDA,
59
+ seed=444,
60
+ fp16=FP16,
61
+ fp16_full_eval=FP16_EVAL,
62
+ bf16=BF16,
63
+ bf16_full_eval=BF16_EVAL,
64
+ load_best_model_at_end=True,
65
+ label_smoothing_factor=0.05,
66
+ optim="adamw_torch",
67
+ report_to=["tensorboard"],
68
+ gradient_checkpointing=False,
69
+ dataloader_num_workers=8, #added to fix thrashing issue with the GPU not having enough data to process
70
+ dataloader_pin_memory=True, #we want the dataset in memory
71
+ torch_compile=True #added to speed up
HuggingFace_Mistral_Transformer_Single_Instrument.json CHANGED
The diff for this file is too large to render. See raw diff
 
config.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "_name_or_path": "/home/wombat/Documents/projects/music/midiTok/data/HuggingFace_Mistral_Transformer_Single_Instrument/run/model.safetensors",
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 1,
 
8
  "eos_token_id": 2,
9
- "head_dim": 64,
10
  "hidden_act": "silu",
11
  "hidden_size": 512,
12
  "initializer_range": 0.02,
@@ -21,8 +21,7 @@
21
  "rope_theta": 10000.0,
22
  "sliding_window": 256,
23
  "tie_word_embeddings": false,
24
- "torch_dtype": "float32",
25
- "transformers_version": "4.46.2",
26
  "use_cache": true,
27
- "vocab_size": 32000
28
  }
 
1
  {
 
2
  "architectures": [
3
  "MistralForCausalLM"
4
  ],
5
  "attention_dropout": 0.0,
6
  "bos_token_id": 1,
7
+ "dtype": "float32",
8
  "eos_token_id": 2,
9
+ "head_dim": null,
10
  "hidden_act": "silu",
11
  "hidden_size": 512,
12
  "initializer_range": 0.02,
 
21
  "rope_theta": 10000.0,
22
  "sliding_window": 256,
23
  "tie_word_embeddings": false,
24
+ "transformers_version": "4.56.2",
 
25
  "use_cache": true,
26
+ "vocab_size": 24000
27
  }
generation_config.json CHANGED
@@ -3,5 +3,5 @@
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
  "pad_token_id": 0,
6
- "transformers_version": "4.46.2"
7
  }
 
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
  "pad_token_id": 0,
6
+ "transformers_version": "4.56.2"
7
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a36a5a2deaeaccf3c60fea79860b75fedca40b881d9a46f0133565023849741
3
- size 256944240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f49ec94974c83fd18ef12815c1351c4fef7e3aa729bfd38cf6457d55366ca999
3
+ size 224176232
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c875a81a1eef2afb2c24c78e8fd15292dd9ed57f50c8266448d36a7f65be2bd
3
+ size 448399883
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e717e3eca82ee0f6b33f1773c9e85e3d6d0912794766ebb2336f9cff2d8a049
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7356bb5c8ec3657ebcedb2b29bf1bb59a0e6d04ad9df685225b3375419862023
3
+ size 1465
test.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from pathlib import Path
3
+
4
+
5
+
6
+ dataset_test = torch.load(Path("/media/wombat/c6928dc9-ba03-411d-9483-8e28df5973b9/Music Data/HuggingFace_Mistral_Transformer_Single_Instrument/data/dataset_valid.pt"), weights_only=False)
7
+ print(f"valid dataset length: {len(dataset_test)}")
train.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+ from pathlib import Path
3
+ from random import shuffle, sample
4
+
5
+ from evaluate import load as load_metric
6
+ from miditok import REMI, TokenizerConfig, TokTrainingIterator
7
+ from miditok.pytorch_data import DatasetMIDI, DataCollator
8
+ from miditok.utils import split_files_for_training
9
+
10
+ from miditok.data_augmentation import augment_dataset
11
+ from torch import Tensor, argmax, torch
12
+ from torch.utils.data import DataLoader
13
+ from torch.cuda import is_available as cuda_available, is_bf16_supported
14
+ from torch.backends.mps import is_available as mps_available
15
+ from transformers import AutoModelForCausalLM, MistralConfig, Trainer, TrainingArguments, GenerationConfig, AutoConfig
16
+ from transformers.trainer_utils import set_seed
17
+ from tqdm import tqdm
18
+
19
+ root_data_dir = Path('/home/wombat/Documents/projects/music/midiTok/data/')
20
+ root_save = Path(root_data_dir / 'HuggingFace_Mistral_Transformer_Single_Instrument')
21
+
22
+ tokenizer_name = "HuggingFace_Mistral_Transformer_Single_Instrument_v3_single_track.json"
23
+
24
+ tokenizer = REMI(params=Path(root_save / tokenizer_name))
25
+
26
+ sequence_length = 1024 # The maximum sequence length for data samples.
27
+ kwargs_dataset = {"max_seq_len": sequence_length, "tokenizer": tokenizer, "bos_token_id": tokenizer["BOS_None"], "eos_token_id": tokenizer["EOS_None"]}
28
+
29
+
30
+ dataset_dir = root_save / "data"
31
+ dataset_dir.mkdir(parents=True, exist_ok=True)
32
+
33
+
34
+ dataset_train = torch.load(Path(dataset_dir / "dataset_train.pt"), weights_only=False)
35
+ dataset_valid = torch.load(Path(dataset_dir / "dataset_valid.pt"), weights_only=False)
36
+ dataset_test = torch.load(Path(dataset_dir / "dataset_test.pt"), weights_only=False)
37
+
38
+ # Creates model
39
+ model_config = MistralConfig(
40
+ vocab_size=len(tokenizer), #from miditok output default 32K
41
+ hidden_size=512, # default 4096
42
+ intermediate_size=2048, # default 14336
43
+ num_hidden_layers=8, # default 32
44
+ num_attention_heads=8, # default 32
45
+ num_key_value_heads=4, # default 8
46
+ sliding_window=256, # default 4096
47
+ max_position_embeddings=8192, #has no effect on the param count or training; just limits the input length # default 4096*32
48
+ pad_token_id=tokenizer['PAD_None'],
49
+ bos_token_id=tokenizer['BOS_None'],
50
+ eos_token_id=tokenizer['EOS_None'],
51
+ )
52
+ model = AutoModelForCausalLM.from_config(model_config)
53
+
54
+ model_dir = root_save / 'run'
55
+ model_dir_str = str(model_dir)
56
+ print(model_dir)
57
+
58
+ metrics = {metric: load_metric(metric) for metric in ["accuracy"]}
59
+
60
def compute_metrics(eval_pred):
    """
    Compute accuracy metrics for pretraining evaluation.

    Must be paired with a ``preprocess_logits_for_metrics`` function that has
    already reduced the logits to prediction token ids (argmax or sampling),
    so ``eval_pred.predictions`` holds ids, not full-vocab logits.

    :param eval_pred: EvalPrediction containing predictions and labels
    :return: metrics dict as produced by the ``accuracy`` metric
    """
    preds, refs = eval_pred
    # -100 is the ignore index used for padding positions; drop them so
    # accuracy is measured over real tokens only.
    keep = refs != -100
    refs, preds = refs[keep], preds[keep]
    return metrics["accuracy"].compute(predictions=preds.flatten(), references=refs.flatten())
73
+
74
def preprocess_logits(logits: Tensor, _: Tensor) -> Tensor:
    """
    Preprocess the logits before accumulating them during evaluation.

    Reducing the full-vocabulary logits to argmax token ids (long dtype)
    here significantly lowers memory usage and makes evaluation tractable.
    The second argument (labels) is unused.
    """
    return argmax(logits, dim=-1)
82
+
83
+ # Create config for the Trainer
84
+ USE_CUDA = cuda_available()
85
+ print(USE_CUDA)
86
+ if not cuda_available():
87
+ FP16 = FP16_EVAL = BF16 = BF16_EVAL = False
88
+ elif is_bf16_supported():
89
+ BF16 = BF16_EVAL = True
90
+ FP16 = FP16_EVAL = False
91
+ else:
92
+ BF16 = BF16_EVAL = False
93
+ FP16 = FP16_EVAL = True
94
+ USE_MPS = not USE_CUDA and mps_available()
95
+ training_config = TrainingArguments(
96
+ model_dir_str, False, True, True, False, "steps",
97
+ per_device_train_batch_size=24, #76% @ 24 batch size #76% @ 32 batch size try 64 batch size next time
98
+ per_device_eval_batch_size=24, #was 24 now 32
99
+ gradient_accumulation_steps=2, #change this to 4
100
+ eval_accumulation_steps=None,
101
+ eval_steps=1000,
102
+ learning_rate=1e-4,
103
+ weight_decay=0.01,
104
+ max_grad_norm=3.0,
105
+ max_steps=40000,
106
+ lr_scheduler_type="cosine_with_restarts",
107
+ warmup_ratio=0.3,
108
+ log_level="debug",
109
+ logging_strategy="steps",
110
+ logging_steps=20,
111
+ save_strategy="steps",
112
+ save_steps=1000,
113
+ save_total_limit=5,
114
+ no_cuda=not USE_CUDA,
115
+ seed=444,
116
+ fp16=FP16,
117
+ fp16_full_eval=FP16_EVAL,
118
+ bf16=BF16,
119
+ bf16_full_eval=BF16_EVAL,
120
+ load_best_model_at_end=True,
121
+ label_smoothing_factor=0.,
122
+ optim="adamw_torch",
123
+ report_to=["tensorboard"],
124
+ gradient_checkpointing=True,
125
+ dataloader_num_workers=8, #added to fix thrashing issue with the GPU not having enough data to process
126
+ dataloader_pin_memory=True, #we want the dataset in memory
127
+ torch_compile=True #added to speed up
128
+
129
+ )
130
+
131
+ collator = DataCollator(tokenizer["PAD_None"], copy_inputs_as_labels=True, pad_on_left=True) #not sure about the pad_on_left, it might get better results
132
+ trainer = Trainer(
133
+ model=model,
134
+ args=training_config,
135
+ data_collator=collator,
136
+ train_dataset=dataset_train,
137
+ eval_dataset=dataset_valid,
138
+ compute_metrics=compute_metrics,
139
+ callbacks=None,
140
+ preprocess_logits_for_metrics=preprocess_logits,
141
+
142
+ )
143
+
144
+
145
+
146
+ #%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
147
+
148
+ # Training
149
+ train_result = trainer.train()
150
+ trainer.save_model() # Saves the tokenizer too
151
+ trainer.log_metrics("train", train_result.metrics)
152
+ trainer.save_metrics("train", train_result.metrics)
153
+ trainer.save_state()
154
+
trainer_state.json ADDED
@@ -0,0 +1,1120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 15000,
3
+ "best_metric": 1.6942352056503296,
4
+ "best_model_checkpoint": "/workspace/traindata/data/HuggingFace_Mistral_Transformer_Single_Instrument/run/checkpoint-15000",
5
+ "epoch": 0.258492928782326,
6
+ "eval_steps": 3000,
7
+ "global_step": 15000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0017232861918821732,
14
+ "grad_norm": 0.6133952736854553,
15
+ "learning_rate": 4.125e-06,
16
+ "loss": 1.5965,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.0034465723837643464,
21
+ "grad_norm": 0.5784599184989929,
22
+ "learning_rate": 8.291666666666667e-06,
23
+ "loss": 1.5982,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.005169858575646519,
28
+ "grad_norm": 0.5842740535736084,
29
+ "learning_rate": 1.2458333333333334e-05,
30
+ "loss": 1.5828,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.006893144767528693,
35
+ "grad_norm": 0.5865280032157898,
36
+ "learning_rate": 1.6625e-05,
37
+ "loss": 1.5934,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.008616430959410866,
42
+ "grad_norm": 0.6083072423934937,
43
+ "learning_rate": 2.0791666666666666e-05,
44
+ "loss": 1.6052,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.010339717151293039,
49
+ "grad_norm": 0.5992451906204224,
50
+ "learning_rate": 2.4958333333333335e-05,
51
+ "loss": 1.5995,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.012063003343175211,
56
+ "grad_norm": 0.6140967011451721,
57
+ "learning_rate": 2.9125000000000003e-05,
58
+ "loss": 1.5791,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.013786289535057386,
63
+ "grad_norm": 0.6324509382247925,
64
+ "learning_rate": 3.329166666666667e-05,
65
+ "loss": 1.6014,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.015509575726939558,
70
+ "grad_norm": 0.6500518918037415,
71
+ "learning_rate": 3.7458333333333334e-05,
72
+ "loss": 1.6042,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.017232861918821733,
77
+ "grad_norm": 0.6341643929481506,
78
+ "learning_rate": 4.1625e-05,
79
+ "loss": 1.5796,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.018956148110703903,
84
+ "grad_norm": 0.6603251099586487,
85
+ "learning_rate": 4.579166666666667e-05,
86
+ "loss": 1.5855,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.020679434302586078,
91
+ "grad_norm": 0.7315922379493713,
92
+ "learning_rate": 4.995833333333333e-05,
93
+ "loss": 1.5976,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.022402720494468252,
98
+ "grad_norm": 0.6418182849884033,
99
+ "learning_rate": 5.4125e-05,
100
+ "loss": 1.5834,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.024126006686350423,
105
+ "grad_norm": 0.6903438568115234,
106
+ "learning_rate": 5.829166666666667e-05,
107
+ "loss": 1.6235,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.025849292878232597,
112
+ "grad_norm": 0.6109316945075989,
113
+ "learning_rate": 6.245833333333334e-05,
114
+ "loss": 1.6143,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 0.02757257907011477,
119
+ "grad_norm": 0.6458160281181335,
120
+ "learning_rate": 6.6625e-05,
121
+ "loss": 1.609,
122
+ "step": 1600
123
+ },
124
+ {
125
+ "epoch": 0.029295865261996946,
126
+ "grad_norm": 0.6940888166427612,
127
+ "learning_rate": 7.079166666666666e-05,
128
+ "loss": 1.6048,
129
+ "step": 1700
130
+ },
131
+ {
132
+ "epoch": 0.031019151453879117,
133
+ "grad_norm": 0.6740911602973938,
134
+ "learning_rate": 7.495833333333334e-05,
135
+ "loss": 1.6116,
136
+ "step": 1800
137
+ },
138
+ {
139
+ "epoch": 0.03274243764576129,
140
+ "grad_norm": 0.634560763835907,
141
+ "learning_rate": 7.9125e-05,
142
+ "loss": 1.5999,
143
+ "step": 1900
144
+ },
145
+ {
146
+ "epoch": 0.034465723837643465,
147
+ "grad_norm": 0.677970826625824,
148
+ "learning_rate": 8.329166666666667e-05,
149
+ "loss": 1.6104,
150
+ "step": 2000
151
+ },
152
+ {
153
+ "epoch": 0.03618901002952564,
154
+ "grad_norm": 0.6901321411132812,
155
+ "learning_rate": 8.745833333333334e-05,
156
+ "loss": 1.6018,
157
+ "step": 2100
158
+ },
159
+ {
160
+ "epoch": 0.03791229622140781,
161
+ "grad_norm": 0.6881032586097717,
162
+ "learning_rate": 9.1625e-05,
163
+ "loss": 1.6303,
164
+ "step": 2200
165
+ },
166
+ {
167
+ "epoch": 0.03963558241328998,
168
+ "grad_norm": 0.6821079254150391,
169
+ "learning_rate": 9.579166666666667e-05,
170
+ "loss": 1.6207,
171
+ "step": 2300
172
+ },
173
+ {
174
+ "epoch": 0.041358868605172155,
175
+ "grad_norm": 0.7254959940910339,
176
+ "learning_rate": 9.995833333333334e-05,
177
+ "loss": 1.6106,
178
+ "step": 2400
179
+ },
180
+ {
181
+ "epoch": 0.04308215479705433,
182
+ "grad_norm": 0.7417749166488647,
183
+ "learning_rate": 9.99968254119042e-05,
184
+ "loss": 1.6141,
185
+ "step": 2500
186
+ },
187
+ {
188
+ "epoch": 0.044805440988936504,
189
+ "grad_norm": 0.6578373312950134,
190
+ "learning_rate": 9.998717347022716e-05,
191
+ "loss": 1.6214,
192
+ "step": 2600
193
+ },
194
+ {
195
+ "epoch": 0.04652872718081868,
196
+ "grad_norm": 0.6432535648345947,
197
+ "learning_rate": 9.997104510158365e-05,
198
+ "loss": 1.6303,
199
+ "step": 2700
200
+ },
201
+ {
202
+ "epoch": 0.048252013372700846,
203
+ "grad_norm": 0.6907160878181458,
204
+ "learning_rate": 9.994844239559375e-05,
205
+ "loss": 1.6105,
206
+ "step": 2800
207
+ },
208
+ {
209
+ "epoch": 0.04997529956458302,
210
+ "grad_norm": 0.7411105036735535,
211
+ "learning_rate": 9.991936828070421e-05,
212
+ "loss": 1.629,
213
+ "step": 2900
214
+ },
215
+ {
216
+ "epoch": 0.051698585756465194,
217
+ "grad_norm": 0.6869089603424072,
218
+ "learning_rate": 9.988382652380897e-05,
219
+ "loss": 1.6249,
220
+ "step": 3000
221
+ },
222
+ {
223
+ "epoch": 0.05342187194834737,
224
+ "grad_norm": 0.662797212600708,
225
+ "learning_rate": 9.984182172976115e-05,
226
+ "loss": 1.633,
227
+ "step": 3100
228
+ },
229
+ {
230
+ "epoch": 0.05514515814022954,
231
+ "grad_norm": 0.6771135926246643,
232
+ "learning_rate": 9.979335934077652e-05,
233
+ "loss": 1.6243,
234
+ "step": 3200
235
+ },
236
+ {
237
+ "epoch": 0.05686844433211172,
238
+ "grad_norm": 0.6237235069274902,
239
+ "learning_rate": 9.97384456357282e-05,
240
+ "loss": 1.6184,
241
+ "step": 3300
242
+ },
243
+ {
244
+ "epoch": 0.05859173052399389,
245
+ "grad_norm": 0.6165594458580017,
246
+ "learning_rate": 9.967708772933339e-05,
247
+ "loss": 1.6178,
248
+ "step": 3400
249
+ },
250
+ {
251
+ "epoch": 0.06031501671587606,
252
+ "grad_norm": 0.6119577884674072,
253
+ "learning_rate": 9.960929357123137e-05,
254
+ "loss": 1.6171,
255
+ "step": 3500
256
+ },
257
+ {
258
+ "epoch": 0.06203830290775823,
259
+ "grad_norm": 0.6392346024513245,
260
+ "learning_rate": 9.953507194495366e-05,
261
+ "loss": 1.6283,
262
+ "step": 3600
263
+ },
264
+ {
265
+ "epoch": 0.06376158909964041,
266
+ "grad_norm": 0.7036736607551575,
267
+ "learning_rate": 9.945443246678599e-05,
268
+ "loss": 1.6278,
269
+ "step": 3700
270
+ },
271
+ {
272
+ "epoch": 0.06548487529152258,
273
+ "grad_norm": 0.7011469006538391,
274
+ "learning_rate": 9.936738558452233e-05,
275
+ "loss": 1.6087,
276
+ "step": 3800
277
+ },
278
+ {
279
+ "epoch": 0.06720816148340475,
280
+ "grad_norm": 0.6176936030387878,
281
+ "learning_rate": 9.927394257611137e-05,
282
+ "loss": 1.6285,
283
+ "step": 3900
284
+ },
285
+ {
286
+ "epoch": 0.06893144767528693,
287
+ "grad_norm": 0.6255926489830017,
288
+ "learning_rate": 9.91741155481952e-05,
289
+ "loss": 1.618,
290
+ "step": 4000
291
+ },
292
+ {
293
+ "epoch": 0.0706547338671691,
294
+ "grad_norm": 0.6118465065956116,
295
+ "learning_rate": 9.906791743454082e-05,
296
+ "loss": 1.629,
297
+ "step": 4100
298
+ },
299
+ {
300
+ "epoch": 0.07237802005905128,
301
+ "grad_norm": 0.6299500465393066,
302
+ "learning_rate": 9.895536199436449e-05,
303
+ "loss": 1.6328,
304
+ "step": 4200
305
+ },
306
+ {
307
+ "epoch": 0.07410130625093345,
308
+ "grad_norm": 0.7682228684425354,
309
+ "learning_rate": 9.883646381054886e-05,
310
+ "loss": 1.5985,
311
+ "step": 4300
312
+ },
313
+ {
314
+ "epoch": 0.07582459244281561,
315
+ "grad_norm": 0.5980575084686279,
316
+ "learning_rate": 9.871123828775381e-05,
317
+ "loss": 1.6114,
318
+ "step": 4400
319
+ },
320
+ {
321
+ "epoch": 0.0775478786346978,
322
+ "grad_norm": 0.5819905400276184,
323
+ "learning_rate": 9.857970165042046e-05,
324
+ "loss": 1.622,
325
+ "step": 4500
326
+ },
327
+ {
328
+ "epoch": 0.07927116482657996,
329
+ "grad_norm": 0.8043591976165771,
330
+ "learning_rate": 9.844187094066913e-05,
331
+ "loss": 1.5978,
332
+ "step": 4600
333
+ },
334
+ {
335
+ "epoch": 0.08099445101846214,
336
+ "grad_norm": 0.6172861456871033,
337
+ "learning_rate": 9.829776401609134e-05,
338
+ "loss": 1.5887,
339
+ "step": 4700
340
+ },
341
+ {
342
+ "epoch": 0.08271773721034431,
343
+ "grad_norm": 0.6270127296447754,
344
+ "learning_rate": 9.814739954743617e-05,
345
+ "loss": 1.5971,
346
+ "step": 4800
347
+ },
348
+ {
349
+ "epoch": 0.08444102340222649,
350
+ "grad_norm": 0.6117558479309082,
351
+ "learning_rate": 9.79907970161912e-05,
352
+ "loss": 1.6033,
353
+ "step": 4900
354
+ },
355
+ {
356
+ "epoch": 0.08616430959410866,
357
+ "grad_norm": 0.6499077677726746,
358
+ "learning_rate": 9.78279767120585e-05,
359
+ "loss": 1.6129,
360
+ "step": 5000
361
+ },
362
+ {
363
+ "epoch": 0.08788759578599083,
364
+ "grad_norm": 0.5596637725830078,
365
+ "learning_rate": 9.76589597303258e-05,
366
+ "loss": 1.6211,
367
+ "step": 5100
368
+ },
369
+ {
370
+ "epoch": 0.08961088197787301,
371
+ "grad_norm": 0.5757789015769958,
372
+ "learning_rate": 9.748376796913344e-05,
373
+ "loss": 1.6225,
374
+ "step": 5200
375
+ },
376
+ {
377
+ "epoch": 0.09133416816975518,
378
+ "grad_norm": 0.6331895589828491,
379
+ "learning_rate": 9.730242412663709e-05,
380
+ "loss": 1.5732,
381
+ "step": 5300
382
+ },
383
+ {
384
+ "epoch": 0.09305745436163736,
385
+ "grad_norm": 0.5809708833694458,
386
+ "learning_rate": 9.711495169806705e-05,
387
+ "loss": 1.6233,
388
+ "step": 5400
389
+ },
390
+ {
391
+ "epoch": 0.09478074055351952,
392
+ "grad_norm": 0.8100622296333313,
393
+ "learning_rate": 9.69213749726841e-05,
394
+ "loss": 1.6118,
395
+ "step": 5500
396
+ },
397
+ {
398
+ "epoch": 0.09650402674540169,
399
+ "grad_norm": 0.5590764284133911,
400
+ "learning_rate": 9.672171903063253e-05,
401
+ "loss": 1.5967,
402
+ "step": 5600
403
+ },
404
+ {
405
+ "epoch": 0.09822731293728387,
406
+ "grad_norm": 0.5601252317428589,
407
+ "learning_rate": 9.651600973969077e-05,
408
+ "loss": 1.6189,
409
+ "step": 5700
410
+ },
411
+ {
412
+ "epoch": 0.09995059912916604,
413
+ "grad_norm": 0.5881779193878174,
414
+ "learning_rate": 9.630427375191989e-05,
415
+ "loss": 1.6133,
416
+ "step": 5800
417
+ },
418
+ {
419
+ "epoch": 0.10167388532104822,
420
+ "grad_norm": 0.5713782906532288,
421
+ "learning_rate": 9.608653850021045e-05,
422
+ "loss": 1.5912,
423
+ "step": 5900
424
+ },
425
+ {
426
+ "epoch": 0.10339717151293039,
427
+ "grad_norm": 0.5922852754592896,
428
+ "learning_rate": 9.586283219472836e-05,
429
+ "loss": 1.6022,
430
+ "step": 6000
431
+ },
432
+ {
433
+ "epoch": 0.10339717151293039,
434
+ "eval_accuracy": 0.010507585123200762,
435
+ "eval_loss": 1.751858115196228,
436
+ "eval_runtime": 1766.3832,
437
+ "eval_samples_per_second": 481.157,
438
+ "eval_steps_per_second": 15.036,
439
+ "step": 6000
440
+ },
441
+ {
442
+ "epoch": 0.10512045770481257,
443
+ "grad_norm": 0.5831886529922485,
444
+ "learning_rate": 9.563318381925982e-05,
445
+ "loss": 1.5952,
446
+ "step": 6100
447
+ },
448
+ {
449
+ "epoch": 0.10684374389669474,
450
+ "grad_norm": 0.6007715463638306,
451
+ "learning_rate": 9.539762312745619e-05,
452
+ "loss": 1.6048,
453
+ "step": 6200
454
+ },
455
+ {
456
+ "epoch": 0.1085670300885769,
457
+ "grad_norm": 0.6652824282646179,
458
+ "learning_rate": 9.515618063897906e-05,
459
+ "loss": 1.6103,
460
+ "step": 6300
461
+ },
462
+ {
463
+ "epoch": 0.11029031628045909,
464
+ "grad_norm": 0.5715838670730591,
465
+ "learning_rate": 9.490888763554596e-05,
466
+ "loss": 1.5917,
467
+ "step": 6400
468
+ },
469
+ {
470
+ "epoch": 0.11201360247234125,
471
+ "grad_norm": 0.5438059568405151,
472
+ "learning_rate": 9.465577615687761e-05,
473
+ "loss": 1.6222,
474
+ "step": 6500
475
+ },
476
+ {
477
+ "epoch": 0.11373688866422343,
478
+ "grad_norm": 0.5527771711349487,
479
+ "learning_rate": 9.43968789965467e-05,
480
+ "loss": 1.588,
481
+ "step": 6600
482
+ },
483
+ {
484
+ "epoch": 0.1154601748561056,
485
+ "grad_norm": 0.6176398396492004,
486
+ "learning_rate": 9.413222969772906e-05,
487
+ "loss": 1.5747,
488
+ "step": 6700
489
+ },
490
+ {
491
+ "epoch": 0.11718346104798778,
492
+ "grad_norm": 0.589585542678833,
493
+ "learning_rate": 9.386186254885783e-05,
494
+ "loss": 1.5938,
495
+ "step": 6800
496
+ },
497
+ {
498
+ "epoch": 0.11890674723986995,
499
+ "grad_norm": 0.5356398820877075,
500
+ "learning_rate": 9.3585812579181e-05,
501
+ "loss": 1.5822,
502
+ "step": 6900
503
+ },
504
+ {
505
+ "epoch": 0.12063003343175212,
506
+ "grad_norm": 0.6897734999656677,
507
+ "learning_rate": 9.330411555422285e-05,
508
+ "loss": 1.5982,
509
+ "step": 7000
510
+ },
511
+ {
512
+ "epoch": 0.1223533196236343,
513
+ "grad_norm": 0.5214916467666626,
514
+ "learning_rate": 9.30168079711502e-05,
515
+ "loss": 1.5898,
516
+ "step": 7100
517
+ },
518
+ {
519
+ "epoch": 0.12407660581551647,
520
+ "grad_norm": 0.5990198850631714,
521
+ "learning_rate": 9.272392705404372e-05,
522
+ "loss": 1.6032,
523
+ "step": 7200
524
+ },
525
+ {
526
+ "epoch": 0.12579989200739863,
527
+ "grad_norm": 0.531247079372406,
528
+ "learning_rate": 9.242551074907519e-05,
529
+ "loss": 1.6082,
530
+ "step": 7300
531
+ },
532
+ {
533
+ "epoch": 0.12752317819928083,
534
+ "grad_norm": 0.607933521270752,
535
+ "learning_rate": 9.212159771959101e-05,
536
+ "loss": 1.5817,
537
+ "step": 7400
538
+ },
539
+ {
540
+ "epoch": 0.129246464391163,
541
+ "grad_norm": 0.5464344024658203,
542
+ "learning_rate": 9.181222734110301e-05,
543
+ "loss": 1.5759,
544
+ "step": 7500
545
+ },
546
+ {
547
+ "epoch": 0.13096975058304516,
548
+ "grad_norm": 0.6487947106361389,
549
+ "learning_rate": 9.149743969618683e-05,
550
+ "loss": 1.6067,
551
+ "step": 7600
552
+ },
553
+ {
554
+ "epoch": 0.13269303677492733,
555
+ "grad_norm": 0.556429922580719,
556
+ "learning_rate": 9.117727556928875e-05,
557
+ "loss": 1.5863,
558
+ "step": 7700
559
+ },
560
+ {
561
+ "epoch": 0.1344163229668095,
562
+ "grad_norm": 0.5772918462753296,
563
+ "learning_rate": 9.085177644144167e-05,
564
+ "loss": 1.5888,
565
+ "step": 7800
566
+ },
567
+ {
568
+ "epoch": 0.1361396091586917,
569
+ "grad_norm": 0.5730582475662231,
570
+ "learning_rate": 9.052098448489062e-05,
571
+ "loss": 1.5983,
572
+ "step": 7900
573
+ },
574
+ {
575
+ "epoch": 0.13786289535057386,
576
+ "grad_norm": 0.5221332311630249,
577
+ "learning_rate": 9.018494255762894e-05,
578
+ "loss": 1.5757,
579
+ "step": 8000
580
+ },
581
+ {
582
+ "epoch": 0.13958618154245603,
583
+ "grad_norm": 0.5817165970802307,
584
+ "learning_rate": 8.98436941978455e-05,
585
+ "loss": 1.5813,
586
+ "step": 8100
587
+ },
588
+ {
589
+ "epoch": 0.1413094677343382,
590
+ "grad_norm": 0.5672810673713684,
591
+ "learning_rate": 8.949728361828381e-05,
592
+ "loss": 1.5997,
593
+ "step": 8200
594
+ },
595
+ {
596
+ "epoch": 0.14303275392622036,
597
+ "grad_norm": 0.5768831372261047,
598
+ "learning_rate": 8.914575570051375e-05,
599
+ "loss": 1.5707,
600
+ "step": 8300
601
+ },
602
+ {
603
+ "epoch": 0.14475604011810256,
604
+ "grad_norm": 0.6032638549804688,
605
+ "learning_rate": 8.878915598911664e-05,
606
+ "loss": 1.5892,
607
+ "step": 8400
608
+ },
609
+ {
610
+ "epoch": 0.14647932630998473,
611
+ "grad_norm": 0.5976369976997375,
612
+ "learning_rate": 8.842753068578434e-05,
613
+ "loss": 1.5996,
614
+ "step": 8500
615
+ },
616
+ {
617
+ "epoch": 0.1482026125018669,
618
+ "grad_norm": 0.559442400932312,
619
+ "learning_rate": 8.806092664333333e-05,
620
+ "loss": 1.5813,
621
+ "step": 8600
622
+ },
623
+ {
624
+ "epoch": 0.14992589869374906,
625
+ "grad_norm": 0.516207218170166,
626
+ "learning_rate": 8.768939135963447e-05,
627
+ "loss": 1.5742,
628
+ "step": 8700
629
+ },
630
+ {
631
+ "epoch": 0.15164918488563123,
632
+ "grad_norm": 0.553333044052124,
633
+ "learning_rate": 8.731297297145889e-05,
634
+ "loss": 1.585,
635
+ "step": 8800
636
+ },
637
+ {
638
+ "epoch": 0.15337247107751342,
639
+ "grad_norm": 0.5709084868431091,
640
+ "learning_rate": 8.693172024824143e-05,
641
+ "loss": 1.5811,
642
+ "step": 8900
643
+ },
644
+ {
645
+ "epoch": 0.1550957572693956,
646
+ "grad_norm": 0.52576744556427,
647
+ "learning_rate": 8.654568258576197e-05,
648
+ "loss": 1.5843,
649
+ "step": 9000
650
+ },
651
+ {
652
+ "epoch": 0.1550957572693956,
653
+ "eval_accuracy": 0.010426478228323498,
654
+ "eval_loss": 1.732275128364563,
655
+ "eval_runtime": 1695.9476,
656
+ "eval_samples_per_second": 501.14,
657
+ "eval_steps_per_second": 15.661,
658
+ "step": 9000
659
+ },
660
+ {
661
+ "epoch": 0.15681904346127776,
662
+ "grad_norm": 0.5634833574295044,
663
+ "learning_rate": 8.615490999974563e-05,
664
+ "loss": 1.5927,
665
+ "step": 9100
666
+ },
667
+ {
668
+ "epoch": 0.15854232965315992,
669
+ "grad_norm": 0.5738709568977356,
670
+ "learning_rate": 8.575945311938262e-05,
671
+ "loss": 1.6131,
672
+ "step": 9200
673
+ },
674
+ {
675
+ "epoch": 0.1602656158450421,
676
+ "grad_norm": 0.5828307867050171,
677
+ "learning_rate": 8.535936318076864e-05,
678
+ "loss": 1.5766,
679
+ "step": 9300
680
+ },
681
+ {
682
+ "epoch": 0.1619889020369243,
683
+ "grad_norm": 0.580729603767395,
684
+ "learning_rate": 8.495469202026669e-05,
685
+ "loss": 1.5902,
686
+ "step": 9400
687
+ },
688
+ {
689
+ "epoch": 0.16371218822880645,
690
+ "grad_norm": 0.568894624710083,
691
+ "learning_rate": 8.454549206779092e-05,
692
+ "loss": 1.5671,
693
+ "step": 9500
694
+ },
695
+ {
696
+ "epoch": 0.16543547442068862,
697
+ "grad_norm": 0.5564482808113098,
698
+ "learning_rate": 8.413181634001391e-05,
699
+ "loss": 1.5778,
700
+ "step": 9600
701
+ },
702
+ {
703
+ "epoch": 0.1671587606125708,
704
+ "grad_norm": 0.5514076948165894,
705
+ "learning_rate": 8.371371843349755e-05,
706
+ "loss": 1.5874,
707
+ "step": 9700
708
+ },
709
+ {
710
+ "epoch": 0.16888204680445298,
711
+ "grad_norm": 0.5865207314491272,
712
+ "learning_rate": 8.329125251774916e-05,
713
+ "loss": 1.5637,
714
+ "step": 9800
715
+ },
716
+ {
717
+ "epoch": 0.17060533299633515,
718
+ "grad_norm": 0.5577490329742432,
719
+ "learning_rate": 8.286447332820298e-05,
720
+ "loss": 1.5801,
721
+ "step": 9900
722
+ },
723
+ {
724
+ "epoch": 0.17232861918821732,
725
+ "grad_norm": 0.5502321124076843,
726
+ "learning_rate": 8.243343615912877e-05,
727
+ "loss": 1.5695,
728
+ "step": 10000
729
+ },
730
+ {
731
+ "epoch": 0.1740519053800995,
732
+ "grad_norm": 0.5614681243896484,
733
+ "learning_rate": 8.199819685646759e-05,
734
+ "loss": 1.5892,
735
+ "step": 10100
736
+ },
737
+ {
738
+ "epoch": 0.17577519157198165,
739
+ "grad_norm": 0.5736984014511108,
740
+ "learning_rate": 8.155881181059644e-05,
741
+ "loss": 1.5911,
742
+ "step": 10200
743
+ },
744
+ {
745
+ "epoch": 0.17749847776386385,
746
+ "grad_norm": 0.49306830763816833,
747
+ "learning_rate": 8.111533794902217e-05,
748
+ "loss": 1.5481,
749
+ "step": 10300
750
+ },
751
+ {
752
+ "epoch": 0.17922176395574602,
753
+ "grad_norm": 0.5843108892440796,
754
+ "learning_rate": 8.066783272900586e-05,
755
+ "loss": 1.582,
756
+ "step": 10400
757
+ },
758
+ {
759
+ "epoch": 0.18094505014762818,
760
+ "grad_norm": 0.5754996538162231,
761
+ "learning_rate": 8.02163541301185e-05,
762
+ "loss": 1.5885,
763
+ "step": 10500
764
+ },
765
+ {
766
+ "epoch": 0.18266833633951035,
767
+ "grad_norm": 0.5479796528816223,
768
+ "learning_rate": 7.976096064672915e-05,
769
+ "loss": 1.5693,
770
+ "step": 10600
771
+ },
772
+ {
773
+ "epoch": 0.18439162253139252,
774
+ "grad_norm": 0.5987735390663147,
775
+ "learning_rate": 7.930171128042627e-05,
776
+ "loss": 1.5679,
777
+ "step": 10700
778
+ },
779
+ {
780
+ "epoch": 0.1861149087232747,
781
+ "grad_norm": 0.5608052611351013,
782
+ "learning_rate": 7.88386655323733e-05,
783
+ "loss": 1.5559,
784
+ "step": 10800
785
+ },
786
+ {
787
+ "epoch": 0.18783819491515688,
788
+ "grad_norm": 0.5474194288253784,
789
+ "learning_rate": 7.83718833955997e-05,
790
+ "loss": 1.5846,
791
+ "step": 10900
792
+ },
793
+ {
794
+ "epoch": 0.18956148110703905,
795
+ "grad_norm": 0.5139473676681519,
796
+ "learning_rate": 7.790142534722805e-05,
797
+ "loss": 1.5715,
798
+ "step": 11000
799
+ },
800
+ {
801
+ "epoch": 0.19128476729892122,
802
+ "grad_norm": 0.5175371170043945,
803
+ "learning_rate": 7.742735234063859e-05,
804
+ "loss": 1.5864,
805
+ "step": 11100
806
+ },
807
+ {
808
+ "epoch": 0.19300805349080338,
809
+ "grad_norm": 0.5598956942558289,
810
+ "learning_rate": 7.694972579757193e-05,
811
+ "loss": 1.5373,
812
+ "step": 11200
813
+ },
814
+ {
815
+ "epoch": 0.19473133968268558,
816
+ "grad_norm": 0.5191853642463684,
817
+ "learning_rate": 7.646860760017124e-05,
818
+ "loss": 1.5573,
819
+ "step": 11300
820
+ },
821
+ {
822
+ "epoch": 0.19645462587456775,
823
+ "grad_norm": 0.5062898993492126,
824
+ "learning_rate": 7.598406008296456e-05,
825
+ "loss": 1.5797,
826
+ "step": 11400
827
+ },
828
+ {
829
+ "epoch": 0.1981779120664499,
830
+ "grad_norm": 0.5880659222602844,
831
+ "learning_rate": 7.549614602478872e-05,
832
+ "loss": 1.558,
833
+ "step": 11500
834
+ },
835
+ {
836
+ "epoch": 0.19990119825833208,
837
+ "grad_norm": 0.5346918702125549,
838
+ "learning_rate": 7.500492864065559e-05,
839
+ "loss": 1.562,
840
+ "step": 11600
841
+ },
842
+ {
843
+ "epoch": 0.20162448445021428,
844
+ "grad_norm": 0.5520205497741699,
845
+ "learning_rate": 7.451047157356182e-05,
846
+ "loss": 1.5458,
847
+ "step": 11700
848
+ },
849
+ {
850
+ "epoch": 0.20334777064209644,
851
+ "grad_norm": 0.5452098250389099,
852
+ "learning_rate": 7.401283888624307e-05,
853
+ "loss": 1.5783,
854
+ "step": 11800
855
+ },
856
+ {
857
+ "epoch": 0.2050710568339786,
858
+ "grad_norm": 0.5486232042312622,
859
+ "learning_rate": 7.351209505287412e-05,
860
+ "loss": 1.5635,
861
+ "step": 11900
862
+ },
863
+ {
864
+ "epoch": 0.20679434302586078,
865
+ "grad_norm": 0.5769017934799194,
866
+ "learning_rate": 7.300830495071524e-05,
867
+ "loss": 1.5473,
868
+ "step": 12000
869
+ },
870
+ {
871
+ "epoch": 0.20679434302586078,
872
+ "eval_accuracy": 0.010505145107655028,
873
+ "eval_loss": 1.7127723693847656,
874
+ "eval_runtime": 1714.6588,
875
+ "eval_samples_per_second": 495.671,
876
+ "eval_steps_per_second": 15.49,
877
+ "step": 12000
878
+ },
879
+ {
880
+ "epoch": 0.20851762921774294,
881
+ "grad_norm": 0.5299004912376404,
882
+ "learning_rate": 7.250153385170675e-05,
883
+ "loss": 1.5631,
884
+ "step": 12100
885
+ },
886
+ {
887
+ "epoch": 0.21024091540962514,
888
+ "grad_norm": 0.6350430250167847,
889
+ "learning_rate": 7.199184741401222e-05,
890
+ "loss": 1.5484,
891
+ "step": 12200
892
+ },
893
+ {
894
+ "epoch": 0.2119642016015073,
895
+ "grad_norm": 0.5689346790313721,
896
+ "learning_rate": 7.147931167351162e-05,
897
+ "loss": 1.5616,
898
+ "step": 12300
899
+ },
900
+ {
901
+ "epoch": 0.21368748779338947,
902
+ "grad_norm": 0.5793879628181458,
903
+ "learning_rate": 7.096399303524577e-05,
904
+ "loss": 1.5496,
905
+ "step": 12400
906
+ },
907
+ {
908
+ "epoch": 0.21541077398527164,
909
+ "grad_norm": 0.5497804284095764,
910
+ "learning_rate": 7.044595826481253e-05,
911
+ "loss": 1.5667,
912
+ "step": 12500
913
+ },
914
+ {
915
+ "epoch": 0.2171340601771538,
916
+ "grad_norm": 0.5804843902587891,
917
+ "learning_rate": 6.992527447971677e-05,
918
+ "loss": 1.5586,
919
+ "step": 12600
920
+ },
921
+ {
922
+ "epoch": 0.218857346369036,
923
+ "grad_norm": 0.5805226564407349,
924
+ "learning_rate": 6.940200914067431e-05,
925
+ "loss": 1.5428,
926
+ "step": 12700
927
+ },
928
+ {
929
+ "epoch": 0.22058063256091817,
930
+ "grad_norm": 0.5112205743789673,
931
+ "learning_rate": 6.887623004287182e-05,
932
+ "loss": 1.5597,
933
+ "step": 12800
934
+ },
935
+ {
936
+ "epoch": 0.22230391875280034,
937
+ "grad_norm": 0.5555017590522766,
938
+ "learning_rate": 6.834800530718285e-05,
939
+ "loss": 1.5349,
940
+ "step": 12900
941
+ },
942
+ {
943
+ "epoch": 0.2240272049446825,
944
+ "grad_norm": 0.5393018126487732,
945
+ "learning_rate": 6.781740337134229e-05,
946
+ "loss": 1.5392,
947
+ "step": 13000
948
+ },
949
+ {
950
+ "epoch": 0.22575049113656467,
951
+ "grad_norm": 0.515864372253418,
952
+ "learning_rate": 6.728449298107919e-05,
953
+ "loss": 1.5617,
954
+ "step": 13100
955
+ },
956
+ {
957
+ "epoch": 0.22747377732844687,
958
+ "grad_norm": 0.5203471779823303,
959
+ "learning_rate": 6.674934318121013e-05,
960
+ "loss": 1.5492,
961
+ "step": 13200
962
+ },
963
+ {
964
+ "epoch": 0.22919706352032904,
965
+ "grad_norm": 0.5489692091941833,
966
+ "learning_rate": 6.621202330669354e-05,
967
+ "loss": 1.544,
968
+ "step": 13300
969
+ },
970
+ {
971
+ "epoch": 0.2309203497122112,
972
+ "grad_norm": 0.5596274137496948,
973
+ "learning_rate": 6.567260297364659e-05,
974
+ "loss": 1.5463,
975
+ "step": 13400
976
+ },
977
+ {
978
+ "epoch": 0.23264363590409337,
979
+ "grad_norm": 0.5610251426696777,
980
+ "learning_rate": 6.513115207032557e-05,
981
+ "loss": 1.5802,
982
+ "step": 13500
983
+ },
984
+ {
985
+ "epoch": 0.23436692209597557,
986
+ "grad_norm": 0.5264619588851929,
987
+ "learning_rate": 6.458774074807107e-05,
988
+ "loss": 1.5545,
989
+ "step": 13600
990
+ },
991
+ {
992
+ "epoch": 0.23609020828785773,
993
+ "grad_norm": 0.5814141631126404,
994
+ "learning_rate": 6.404243941221903e-05,
995
+ "loss": 1.5521,
996
+ "step": 13700
997
+ },
998
+ {
999
+ "epoch": 0.2378134944797399,
1000
+ "grad_norm": 0.5240880846977234,
1001
+ "learning_rate": 6.349531871297896e-05,
1002
+ "loss": 1.5675,
1003
+ "step": 13800
1004
+ },
1005
+ {
1006
+ "epoch": 0.23953678067162207,
1007
+ "grad_norm": 0.4984951913356781,
1008
+ "learning_rate": 6.294644953628023e-05,
1009
+ "loss": 1.5559,
1010
+ "step": 13900
1011
+ },
1012
+ {
1013
+ "epoch": 0.24126006686350424,
1014
+ "grad_norm": 0.5289067029953003,
1015
+ "learning_rate": 6.239590299458814e-05,
1016
+ "loss": 1.5285,
1017
+ "step": 14000
1018
+ },
1019
+ {
1020
+ "epoch": 0.24298335305538643,
1021
+ "grad_norm": 0.5221706032752991,
1022
+ "learning_rate": 6.184375041769032e-05,
1023
+ "loss": 1.553,
1024
+ "step": 14100
1025
+ },
1026
+ {
1027
+ "epoch": 0.2447066392472686,
1028
+ "grad_norm": 0.5475857257843018,
1029
+ "learning_rate": 6.12900633434552e-05,
1030
+ "loss": 1.5675,
1031
+ "step": 14200
1032
+ },
1033
+ {
1034
+ "epoch": 0.24642992543915077,
1035
+ "grad_norm": 0.5271047353744507,
1036
+ "learning_rate": 6.0734913508563395e-05,
1037
+ "loss": 1.5487,
1038
+ "step": 14300
1039
+ },
1040
+ {
1041
+ "epoch": 0.24815321163103293,
1042
+ "grad_norm": 0.5180040001869202,
1043
+ "learning_rate": 6.0178372839213406e-05,
1044
+ "loss": 1.5281,
1045
+ "step": 14400
1046
+ },
1047
+ {
1048
+ "epoch": 0.2498764978229151,
1049
+ "grad_norm": 0.566608726978302,
1050
+ "learning_rate": 5.9620513441802714e-05,
1051
+ "loss": 1.5602,
1052
+ "step": 14500
1053
+ },
1054
+ {
1055
+ "epoch": 0.25159978401479727,
1056
+ "grad_norm": 0.5131779909133911,
1057
+ "learning_rate": 5.906140759358555e-05,
1058
+ "loss": 1.5111,
1059
+ "step": 14600
1060
+ },
1061
+ {
1062
+ "epoch": 0.25332307020667943,
1063
+ "grad_norm": 0.5626484751701355,
1064
+ "learning_rate": 5.85011277333085e-05,
1065
+ "loss": 1.5528,
1066
+ "step": 14700
1067
+ },
1068
+ {
1069
+ "epoch": 0.25504635639856166,
1070
+ "grad_norm": 0.550121545791626,
1071
+ "learning_rate": 5.793974645182526e-05,
1072
+ "loss": 1.5401,
1073
+ "step": 14800
1074
+ },
1075
+ {
1076
+ "epoch": 0.2567696425904438,
1077
+ "grad_norm": 0.5408352017402649,
1078
+ "learning_rate": 5.737733648269162e-05,
1079
+ "loss": 1.5437,
1080
+ "step": 14900
1081
+ },
1082
+ {
1083
+ "epoch": 0.258492928782326,
1084
+ "grad_norm": 0.5391642451286316,
1085
+ "learning_rate": 5.6813970692741945e-05,
1086
+ "loss": 1.5407,
1087
+ "step": 15000
1088
+ },
1089
+ {
1090
+ "epoch": 0.258492928782326,
1091
+ "eval_accuracy": 0.010407141521982707,
1092
+ "eval_loss": 1.6942352056503296,
1093
+ "eval_runtime": 1748.5708,
1094
+ "eval_samples_per_second": 486.058,
1095
+ "eval_steps_per_second": 15.19,
1096
+ "step": 15000
1097
+ }
1098
+ ],
1099
+ "logging_steps": 100,
1100
+ "max_steps": 30000,
1101
+ "num_input_tokens_seen": 0,
1102
+ "num_train_epochs": 1,
1103
+ "save_steps": 3000,
1104
+ "stateful_callbacks": {
1105
+ "TrainerControl": {
1106
+ "args": {
1107
+ "should_epoch_stop": false,
1108
+ "should_evaluate": false,
1109
+ "should_log": false,
1110
+ "should_save": true,
1111
+ "should_training_stop": false
1112
+ },
1113
+ "attributes": {}
1114
+ }
1115
+ },
1116
+ "total_flos": 3.041448973814661e+17,
1117
+ "train_batch_size": 32,
1118
+ "trial_name": null,
1119
+ "trial_params": null
1120
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2efb46a23f61853f9d1bd735587f175002d35b4a6ab1a89a7574378204b2326
3
+ size 5905