adricl commited on
Commit
4ec02d1
·
1 Parent(s): c20cfc8

Trained Model with 14gb/2 dataset

Browse files
GPU_settings ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ training_config = TrainingArguments(
2
+ model_dir_str, False, True, True, False, "steps",
3
+ per_device_train_batch_size=64, #76% @ 24 batch size #76% @ 32 batch size try 64 batch size next time
4
+ per_device_eval_batch_size=64, #was 24 now 32
5
+ gradient_accumulation_steps=3, #change this to 4
6
+ eval_accumulation_steps=None,
7
+ eval_steps=2000,
8
+ learning_rate=1e-4,
9
+ weight_decay=0.01,
10
+ max_grad_norm=1.0,
11
+ max_steps=50000,
12
+ lr_scheduler_type="cosine",
13
+ warmup_ratio=0.08,
14
+ log_level="debug",
15
+ logging_strategy="steps",
16
+ logging_steps=20,
17
+ save_strategy="steps",
18
+ save_steps=1000,
19
+ save_total_limit=5,
20
+ no_cuda=not USE_CUDA,
21
+ seed=444,
22
+ fp16=FP16,
23
+ fp16_full_eval=FP16_EVAL,
24
+ bf16=BF16,
25
+ bf16_full_eval=BF16_EVAL,
26
+ load_best_model_at_end=True,
27
+ label_smoothing_factor=0.,
28
+ optim="adamw_torch",
29
+ report_to=["tensorboard"],
30
+ gradient_checkpointing=True,
31
+ dataloader_num_workers=8, #added to fix thrashing issue with the GPU not having enough data to process
32
+ dataloader_pin_memory=True, #we want the dataset in memory
33
+ torch_compile=True #added to speed up
34
+
35
+
36
+
37
+ Better config suggested by AI
38
+ training_config = TrainingArguments(
39
+ model_dir_str, False, True, True, False, "steps",
40
+ per_device_train_batch_size=64, #76% @ 24 batch size #76% @ 32 batch size try 64 batch size next time
41
+ per_device_eval_batch_size=64, #was 24 now 32
42
+ gradient_accumulation_steps=3, #change this to 4
43
+ eval_accumulation_steps=None,
44
+ eval_steps=3000,
45
+ eval_delay=6000,
46
+ learning_rate=1e-4,
47
+ weight_decay=0.01,
48
+ max_grad_norm=1.0,
49
+ max_steps=30000,
50
+ lr_scheduler_type="cosine",
51
+ warmup_ratio=0.08,
52
+ log_level="debug",
53
+ logging_strategy="steps",
54
+ logging_steps=100,
55
+ save_strategy="steps",
56
+ save_steps=3000,
57
+ save_total_limit=5,
58
+ no_cuda=not USE_CUDA,
59
+ seed=444,
60
+ fp16=FP16,
61
+ fp16_full_eval=FP16_EVAL,
62
+ bf16=BF16,
63
+ bf16_full_eval=BF16_EVAL,
64
+ load_best_model_at_end=True,
65
+ label_smoothing_factor=0.05,
66
+ optim="adamw_torch",
67
+ report_to=["tensorboard"],
68
+ gradient_checkpointing=False,
69
+ dataloader_num_workers=8, #added to fix thrashing issue with the GPU not having enough data to process
70
+ dataloader_pin_memory=True, #we want the dataset in memory
71
+ torch_compile=True #added to speed up
HuggingFace_Mistral_Transformer_Single_Instrument.json CHANGED
The diff for this file is too large to render. See raw diff
 
config.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "_name_or_path": "/home/wombat/Documents/projects/music/midiTok/data/HuggingFace_Mistral_Transformer_Single_Instrument/run/model.safetensors",
3
  "architectures": [
4
  "MistralForCausalLM"
5
  ],
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 1,
 
8
  "eos_token_id": 2,
9
- "head_dim": 64,
10
  "hidden_act": "silu",
11
  "hidden_size": 512,
12
  "initializer_range": 0.02,
@@ -21,8 +21,7 @@
21
  "rope_theta": 10000.0,
22
  "sliding_window": 256,
23
  "tie_word_embeddings": false,
24
- "torch_dtype": "float32",
25
- "transformers_version": "4.46.2",
26
  "use_cache": true,
27
- "vocab_size": 32000
28
  }
 
1
  {
 
2
  "architectures": [
3
  "MistralForCausalLM"
4
  ],
5
  "attention_dropout": 0.0,
6
  "bos_token_id": 1,
7
+ "dtype": "float32",
8
  "eos_token_id": 2,
9
+ "head_dim": null,
10
  "hidden_act": "silu",
11
  "hidden_size": 512,
12
  "initializer_range": 0.02,
 
21
  "rope_theta": 10000.0,
22
  "sliding_window": 256,
23
  "tie_word_embeddings": false,
24
+ "transformers_version": "4.56.2",
 
25
  "use_cache": true,
26
+ "vocab_size": 24000
27
  }
generation_config.json CHANGED
@@ -3,5 +3,5 @@
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
  "pad_token_id": 0,
6
- "transformers_version": "4.46.2"
7
  }
 
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
  "pad_token_id": 0,
6
+ "transformers_version": "4.56.2"
7
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a36a5a2deaeaccf3c60fea79860b75fedca40b881d9a46f0133565023849741
3
- size 256944240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f49ec94974c83fd18ef12815c1351c4fef7e3aa729bfd38cf6457d55366ca999
3
+ size 224176232
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c875a81a1eef2afb2c24c78e8fd15292dd9ed57f50c8266448d36a7f65be2bd
3
+ size 448399883
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e717e3eca82ee0f6b33f1773c9e85e3d6d0912794766ebb2336f9cff2d8a049
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7356bb5c8ec3657ebcedb2b29bf1bb59a0e6d04ad9df685225b3375419862023
3
+ size 1465
test.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from pathlib import Path
3
+
4
+
5
+
6
+ dataset_test = torch.load(Path("/media/wombat/c6928dc9-ba03-411d-9483-8e28df5973b9/Music Data/HuggingFace_Mistral_Transformer_Single_Instrument/data/dataset_valid.pt"), weights_only=False)
7
+ print(f"valid dataset length: {len(dataset_test)}")
train.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+ from pathlib import Path
3
+ from random import shuffle, sample
4
+
5
+ from evaluate import load as load_metric
6
+ from miditok import REMI, TokenizerConfig, TokTrainingIterator
7
+ from miditok.pytorch_data import DatasetMIDI, DataCollator
8
+ from miditok.utils import split_files_for_training
9
+
10
+ from miditok.data_augmentation import augment_dataset
11
+ from torch import Tensor, argmax, torch
12
+ from torch.utils.data import DataLoader
13
+ from torch.cuda import is_available as cuda_available, is_bf16_supported
14
+ from torch.backends.mps import is_available as mps_available
15
+ from transformers import AutoModelForCausalLM, MistralConfig, Trainer, TrainingArguments, GenerationConfig, AutoConfig
16
+ from transformers.trainer_utils import set_seed
17
+ from tqdm import tqdm
18
+
19
+ root_data_dir = Path('/home/wombat/Documents/projects/music/midiTok/data/')
20
+ root_save = Path(root_data_dir / 'HuggingFace_Mistral_Transformer_Single_Instrument')
21
+
22
+ tokenizer_name = "HuggingFace_Mistral_Transformer_Single_Instrument_v3_single_track.json"
23
+
24
+ tokenizer = REMI(params=Path(root_save / tokenizer_name))
25
+
26
+ sequence_length = 1024 # The maximum sequence length for data samples.
27
+ kwargs_dataset = {"max_seq_len": sequence_length, "tokenizer": tokenizer, "bos_token_id": tokenizer["BOS_None"], "eos_token_id": tokenizer["EOS_None"]}
28
+
29
+
30
+ dataset_dir = root_save / "data"
31
+ dataset_dir.mkdir(parents=True, exist_ok=True)
32
+
33
+
34
+ dataset_train = torch.load(Path(dataset_dir / "dataset_train.pt"), weights_only=False)
35
+ dataset_valid = torch.load(Path(dataset_dir / "dataset_valid.pt"), weights_only=False)
36
+ dataset_test = torch.load(Path(dataset_dir / "dataset_test.pt"), weights_only=False)
37
+
38
+ # Creates model
39
+ model_config = MistralConfig(
40
+ vocab_size=len(tokenizer), #from miditok output default 32K
41
+ hidden_size=512, # default 4096
42
+ intermediate_size=2048, # default 14336
43
+ num_hidden_layers=8, # default 32
44
+ num_attention_heads=8, # default 32
45
+ num_key_value_heads=4, # default 8
46
+ sliding_window=256, # default 4096
47
+ max_position_embeddings=8192, #has no effect on the param count or training; just limits the input length # default 4096*32
48
+ pad_token_id=tokenizer['PAD_None'],
49
+ bos_token_id=tokenizer['BOS_None'],
50
+ eos_token_id=tokenizer['EOS_None'],
51
+ )
52
+ model = AutoModelForCausalLM.from_config(model_config)
53
+
54
+ model_dir = root_save / 'run'
55
+ model_dir_str = str(model_dir)
56
+ print(model_dir)
57
+
58
+ metrics = {metric: load_metric(metric) for metric in ["accuracy"]}
59
+
60
def compute_metrics(eval_pred):
    """
    Compute accuracy metrics for pretraining evaluation.

    Must be paired with a ``preprocess_logits_for_metrics`` function that has
    already reduced the logits to prediction token ids (argmax or sampling),
    so ``eval_pred.predictions`` holds ids, not full-vocab logits.

    :param eval_pred: EvalPrediction containing predictions and labels
    :return: metrics dict as produced by the ``accuracy`` metric
    """
    preds, refs = eval_pred
    # -100 is the ignore index used for padding positions; drop them so
    # accuracy is measured over real tokens only.
    keep = refs != -100
    refs, preds = refs[keep], preds[keep]
    return metrics["accuracy"].compute(predictions=preds.flatten(), references=refs.flatten())
73
+
74
def preprocess_logits(logits: Tensor, _: Tensor) -> Tensor:
    """
    Preprocess the logits before accumulating them during evaluation.

    Reducing the full-vocabulary logits to argmax token ids (long dtype)
    here significantly lowers memory usage and makes evaluation tractable.
    The second argument (labels) is unused.
    """
    return argmax(logits, dim=-1)
82
+
83
+ # Create config for the Trainer
84
+ USE_CUDA = cuda_available()
85
+ print(USE_CUDA)
86
+ if not cuda_available():
87
+ FP16 = FP16_EVAL = BF16 = BF16_EVAL = False
88
+ elif is_bf16_supported():
89
+ BF16 = BF16_EVAL = True
90
+ FP16 = FP16_EVAL = False
91
+ else:
92
+ BF16 = BF16_EVAL = False
93
+ FP16 = FP16_EVAL = True
94
+ USE_MPS = not USE_CUDA and mps_available()
95
+ training_config = TrainingArguments(
96
+ model_dir_str, False, True, True, False, "steps",
97
+ per_device_train_batch_size=24, #76% @ 24 batch size #76% @ 32 batch size try 64 batch size next time
98
+ per_device_eval_batch_size=24, #was 24 now 32
99
+ gradient_accumulation_steps=2, #change this to 4
100
+ eval_accumulation_steps=None,
101
+ eval_steps=1000,
102
+ learning_rate=1e-4,
103
+ weight_decay=0.01,
104
+ max_grad_norm=3.0,
105
+ max_steps=40000,
106
+ lr_scheduler_type="cosine_with_restarts",
107
+ warmup_ratio=0.3,
108
+ log_level="debug",
109
+ logging_strategy="steps",
110
+ logging_steps=20,
111
+ save_strategy="steps",
112
+ save_steps=1000,
113
+ save_total_limit=5,
114
+ no_cuda=not USE_CUDA,
115
+ seed=444,
116
+ fp16=FP16,
117
+ fp16_full_eval=FP16_EVAL,
118
+ bf16=BF16,
119
+ bf16_full_eval=BF16_EVAL,
120
+ load_best_model_at_end=True,
121
+ label_smoothing_factor=0.,
122
+ optim="adamw_torch",
123
+ report_to=["tensorboard"],
124
+ gradient_checkpointing=True,
125
+ dataloader_num_workers=8, #added to fix thrashing issue with the GPU not having enough data to process
126
+ dataloader_pin_memory=True, #we want the dataset in memory
127
+ torch_compile=True #added to speed up
128
+
129
+ )
130
+
131
+ collator = DataCollator(tokenizer["PAD_None"], copy_inputs_as_labels=True, pad_on_left=True) #not sure about the pad_on_left, it might get better results
132
+ trainer = Trainer(
133
+ model=model,
134
+ args=training_config,
135
+ data_collator=collator,
136
+ train_dataset=dataset_train,
137
+ eval_dataset=dataset_valid,
138
+ compute_metrics=compute_metrics,
139
+ callbacks=None,
140
+ preprocess_logits_for_metrics=preprocess_logits,
141
+
142
+ )
143
+
144
+
145
+
146
+ #%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
147
+
148
+ # Training
149
+ train_result = trainer.train()
150
+ trainer.save_model() # Saves the tokenizer too
151
+ trainer.log_metrics("train", train_result.metrics)
152
+ trainer.save_metrics("train", train_result.metrics)
153
+ trainer.save_state()
154
+
trainer_state.json ADDED
@@ -0,0 +1,1120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 15000,
3
+ "best_metric": 1.6942352056503296,
4
+ "best_model_checkpoint": "/workspace/traindata/data/HuggingFace_Mistral_Transformer_Single_Instrument/run/checkpoint-15000",
5
+ "epoch": 0.258492928782326,
6
+ "eval_steps": 3000,
7
+ "global_step": 15000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0017232861918821732,
14
+ "grad_norm": 0.6133952736854553,
15
+ "learning_rate": 4.125e-06,
16
+ "loss": 1.5965,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.0034465723837643464,
21
+ "grad_norm": 0.5784599184989929,
22
+ "learning_rate": 8.291666666666667e-06,
23
+ "loss": 1.5982,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.005169858575646519,
28
+ "grad_norm": 0.5842740535736084,
29
+ "learning_rate": 1.2458333333333334e-05,
30
+ "loss": 1.5828,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.006893144767528693,
35
+ "grad_norm": 0.5865280032157898,
36
+ "learning_rate": 1.6625e-05,
37
+ "loss": 1.5934,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.008616430959410866,
42
+ "grad_norm": 0.6083072423934937,
43
+ "learning_rate": 2.0791666666666666e-05,
44
+ "loss": 1.6052,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.010339717151293039,
49
+ "grad_norm": 0.5992451906204224,
50
+ "learning_rate": 2.4958333333333335e-05,
51
+ "loss": 1.5995,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.012063003343175211,
56
+ "grad_norm": 0.6140967011451721,
57
+ "learning_rate": 2.9125000000000003e-05,
58
+ "loss": 1.5791,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.013786289535057386,
63
+ "grad_norm": 0.6324509382247925,
64
+ "learning_rate": 3.329166666666667e-05,
65
+ "loss": 1.6014,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.015509575726939558,
70
+ "grad_norm": 0.6500518918037415,
71
+ "learning_rate": 3.7458333333333334e-05,
72
+ "loss": 1.6042,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.017232861918821733,
77
+ "grad_norm": 0.6341643929481506,
78
+ "learning_rate": 4.1625e-05,
79
+ "loss": 1.5796,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.018956148110703903,
84
+ "grad_norm": 0.6603251099586487,
85
+ "learning_rate": 4.579166666666667e-05,
86
+ "loss": 1.5855,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.020679434302586078,
91
+ "grad_norm": 0.7315922379493713,
92
+ "learning_rate": 4.995833333333333e-05,
93
+ "loss": 1.5976,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.022402720494468252,
98
+ "grad_norm": 0.6418182849884033,
99
+ "learning_rate": 5.4125e-05,
100
+ "loss": 1.5834,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.024126006686350423,
105
+ "grad_norm": 0.6903438568115234,
106
+ "learning_rate": 5.829166666666667e-05,
107
+ "loss": 1.6235,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.025849292878232597,
112
+ "grad_norm": 0.6109316945075989,
113
+ "learning_rate": 6.245833333333334e-05,
114
+ "loss": 1.6143,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 0.02757257907011477,
119
+ "grad_norm": 0.6458160281181335,
120
+ "learning_rate": 6.6625e-05,
121
+ "loss": 1.609,
122
+ "step": 1600
123
+ },
124
+ {
125
+ "epoch": 0.029295865261996946,
126
+ "grad_norm": 0.6940888166427612,
127
+ "learning_rate": 7.079166666666666e-05,
128
+ "loss": 1.6048,
129
+ "step": 1700
130
+ },
131
+ {
132
+ "epoch": 0.031019151453879117,
133
+ "grad_norm": 0.6740911602973938,
134
+ "learning_rate": 7.495833333333334e-05,
135
+ "loss": 1.6116,
136
+ "step": 1800
137
+ },
138
+ {
139
+ "epoch": 0.03274243764576129,
140
+ "grad_norm": 0.634560763835907,
141
+ "learning_rate": 7.9125e-05,
142
+ "loss": 1.5999,
143
+ "step": 1900
144
+ },
145
+ {
146
+ "epoch": 0.034465723837643465,
147
+ "grad_norm": 0.677970826625824,
148
+ "learning_rate": 8.329166666666667e-05,
149
+ "loss": 1.6104,
150
+ "step": 2000
151
+ },
152
+ {
153
+ "epoch": 0.03618901002952564,
154
+ "grad_norm": 0.6901321411132812,
155
+ "learning_rate": 8.745833333333334e-05,
156
+ "loss": 1.6018,
157
+ "step": 2100
158
+ },
159
+ {
160
+ "epoch": 0.03791229622140781,
161
+ "grad_norm": 0.6881032586097717,
162
+ "learning_rate": 9.1625e-05,
163
+ "loss": 1.6303,
164
+ "step": 2200
165
+ },
166
+ {
167
+ "epoch": 0.03963558241328998,
168
+ "grad_norm": 0.6821079254150391,
169
+ "learning_rate": 9.579166666666667e-05,
170
+ "loss": 1.6207,
171
+ "step": 2300
172
+ },
173
+ {
174
+ "epoch": 0.041358868605172155,
175
+ "grad_norm": 0.7254959940910339,
176
+ "learning_rate": 9.995833333333334e-05,
177
+ "loss": 1.6106,
178
+ "step": 2400
179
+ },
180
+ {
181
+ "epoch": 0.04308215479705433,
182
+ "grad_norm": 0.7417749166488647,
183
+ "learning_rate": 9.99968254119042e-05,
184
+ "loss": 1.6141,
185
+ "step": 2500
186
+ },
187
+ {
188
+ "epoch": 0.044805440988936504,
189
+ "grad_norm": 0.6578373312950134,
190
+ "learning_rate": 9.998717347022716e-05,
191
+ "loss": 1.6214,
192
+ "step": 2600
193
+ },
194
+ {
195
+ "epoch": 0.04652872718081868,
196
+ "grad_norm": 0.6432535648345947,
197
+ "learning_rate": 9.997104510158365e-05,
198
+ "loss": 1.6303,
199
+ "step": 2700
200
+ },
201
+ {
202
+ "epoch": 0.048252013372700846,
203
+ "grad_norm": 0.6907160878181458,
204
+ "learning_rate": 9.994844239559375e-05,
205
+ "loss": 1.6105,
206
+ "step": 2800
207
+ },
208
+ {
209
+ "epoch": 0.04997529956458302,
210
+ "grad_norm": 0.7411105036735535,
211
+ "learning_rate": 9.991936828070421e-05,
212
+ "loss": 1.629,
213
+ "step": 2900
214
+ },
215
+ {
216
+ "epoch": 0.051698585756465194,
217
+ "grad_norm": 0.6869089603424072,
218
+ "learning_rate": 9.988382652380897e-05,
219
+ "loss": 1.6249,
220
+ "step": 3000
221
+ },
222
+ {
223
+ "epoch": 0.05342187194834737,
224
+ "grad_norm": 0.662797212600708,
225
+ "learning_rate": 9.984182172976115e-05,
226
+ "loss": 1.633,
227
+ "step": 3100
228
+ },
229
+ {
230
+ "epoch": 0.05514515814022954,
231
+ "grad_norm": 0.6771135926246643,
232
+ "learning_rate": 9.979335934077652e-05,
233
+ "loss": 1.6243,
234
+ "step": 3200
235
+ },
236
+ {
237
+ "epoch": 0.05686844433211172,
238
+ "grad_norm": 0.6237235069274902,
239
+ "learning_rate": 9.97384456357282e-05,
240
+ "loss": 1.6184,
241
+ "step": 3300
242
+ },
243
+ {
244
+ "epoch": 0.05859173052399389,
245
+ "grad_norm": 0.6165594458580017,
246
+ "learning_rate": 9.967708772933339e-05,
247
+ "loss": 1.6178,
248
+ "step": 3400
249
+ },
250
+ {
251
+ "epoch": 0.06031501671587606,
252
+ "grad_norm": 0.6119577884674072,
253
+ "learning_rate": 9.960929357123137e-05,
254
+ "loss": 1.6171,
255
+ "step": 3500
256
+ },
257
+ {
258
+ "epoch": 0.06203830290775823,
259
+ "grad_norm": 0.6392346024513245,
260
+ "learning_rate": 9.953507194495366e-05,
261
+ "loss": 1.6283,
262
+ "step": 3600
263
+ },
264
+ {
265
+ "epoch": 0.06376158909964041,
266
+ "grad_norm": 0.7036736607551575,
267
+ "learning_rate": 9.945443246678599e-05,
268
+ "loss": 1.6278,
269
+ "step": 3700
270
+ },
271
+ {
272
+ "epoch": 0.06548487529152258,
273
+ "grad_norm": 0.7011469006538391,
274
+ "learning_rate": 9.936738558452233e-05,
275
+ "loss": 1.6087,
276
+ "step": 3800
277
+ },
278
+ {
279
+ "epoch": 0.06720816148340475,
280
+ "grad_norm": 0.6176936030387878,
281
+ "learning_rate": 9.927394257611137e-05,
282
+ "loss": 1.6285,
283
+ "step": 3900
284
+ },
285
+ {
286
+ "epoch": 0.06893144767528693,
287
+ "grad_norm": 0.6255926489830017,
288
+ "learning_rate": 9.91741155481952e-05,
289
+ "loss": 1.618,
290
+ "step": 4000
291
+ },
292
+ {
293
+ "epoch": 0.0706547338671691,
294
+ "grad_norm": 0.6118465065956116,
295
+ "learning_rate": 9.906791743454082e-05,
296
+ "loss": 1.629,
297
+ "step": 4100
298
+ },
299
+ {
300
+ "epoch": 0.07237802005905128,
301
+ "grad_norm": 0.6299500465393066,
302
+ "learning_rate": 9.895536199436449e-05,
303
+ "loss": 1.6328,
304
+ "step": 4200
305
+ },
306
+ {
307
+ "epoch": 0.07410130625093345,
308
+ "grad_norm": 0.7682228684425354,
309
+ "learning_rate": 9.883646381054886e-05,
310
+ "loss": 1.5985,
311
+ "step": 4300
312
+ },
313
+ {
314
+ "epoch": 0.07582459244281561,
315
+ "grad_norm": 0.5980575084686279,
316
+ "learning_rate": 9.871123828775381e-05,
317
+ "loss": 1.6114,
318
+ "step": 4400
319
+ },
320
+ {
321
+ "epoch": 0.0775478786346978,
322
+ "grad_norm": 0.5819905400276184,
323
+ "learning_rate": 9.857970165042046e-05,
324
+ "loss": 1.622,
325
+ "step": 4500
326
+ },
327
+ {
328
+ "epoch": 0.07927116482657996,
329
+ "grad_norm": 0.8043591976165771,
330
+ "learning_rate": 9.844187094066913e-05,
331
+ "loss": 1.5978,
332
+ "step": 4600
333
+ },
334
+ {
335
+ "epoch": 0.08099445101846214,
336
+ "grad_norm": 0.6172861456871033,
337
+ "learning_rate": 9.829776401609134e-05,
338
+ "loss": 1.5887,
339
+ "step": 4700
340
+ },
341
+ {
342
+ "epoch": 0.08271773721034431,
343
+ "grad_norm": 0.6270127296447754,
344
+ "learning_rate": 9.814739954743617e-05,
345
+ "loss": 1.5971,
346
+ "step": 4800
347
+ },
348
+ {
349
+ "epoch": 0.08444102340222649,
350
+ "grad_norm": 0.6117558479309082,
351
+ "learning_rate": 9.79907970161912e-05,
352
+ "loss": 1.6033,
353
+ "step": 4900
354
+ },
355
+ {
356
+ "epoch": 0.08616430959410866,
357
+ "grad_norm": 0.6499077677726746,
358
+ "learning_rate": 9.78279767120585e-05,
359
+ "loss": 1.6129,
360
+ "step": 5000
361
+ },
362
+ {
363
+ "epoch": 0.08788759578599083,
364
+ "grad_norm": 0.5596637725830078,
365
+ "learning_rate": 9.76589597303258e-05,
366
+ "loss": 1.6211,
367
+ "step": 5100
368
+ },
369
+ {
370
+ "epoch": 0.08961088197787301,
371
+ "grad_norm": 0.5757789015769958,
372
+ "learning_rate": 9.748376796913344e-05,
373
+ "loss": 1.6225,
374
+ "step": 5200
375
+ },
376
+ {
377
+ "epoch": 0.09133416816975518,
378
+ "grad_norm": 0.6331895589828491,
379
+ "learning_rate": 9.730242412663709e-05,
380
+ "loss": 1.5732,
381
+ "step": 5300
382
+ },
383
+ {
384
+ "epoch": 0.09305745436163736,
385
+ "grad_norm": 0.5809708833694458,
386
+ "learning_rate": 9.711495169806705e-05,
387
+ "loss": 1.6233,
388
+ "step": 5400
389
+ },
390
+ {
391
+ "epoch": 0.09478074055351952,
392
+ "grad_norm": 0.8100622296333313,
393
+ "learning_rate": 9.69213749726841e-05,
394
+ "loss": 1.6118,
395
+ "step": 5500
396
+ },
397
+ {
398
+ "epoch": 0.09650402674540169,
399
+ "grad_norm": 0.5590764284133911,
400
+ "learning_rate": 9.672171903063253e-05,
401
+ "loss": 1.5967,
402
+ "step": 5600
403
+ },
404
+ {
405
+ "epoch": 0.09822731293728387,
406
+ "grad_norm": 0.5601252317428589,
407
+ "learning_rate": 9.651600973969077e-05,
408
+ "loss": 1.6189,
409
+ "step": 5700
410
+ },
411
+ {
412
+ "epoch": 0.09995059912916604,
413
+ "grad_norm": 0.5881779193878174,
414
+ "learning_rate": 9.630427375191989e-05,
415
+ "loss": 1.6133,
416
+ "step": 5800
417
+ },
418
+ {
419
+ "epoch": 0.10167388532104822,
420
+ "grad_norm": 0.5713782906532288,
421
+ "learning_rate": 9.608653850021045e-05,
422
+ "loss": 1.5912,
423
+ "step": 5900
424
+ },
425
+ {
426
+ "epoch": 0.10339717151293039,
427
+ "grad_norm": 0.5922852754592896,
428
+ "learning_rate": 9.586283219472836e-05,
429
+ "loss": 1.6022,
430
+ "step": 6000
431
+ },
432
+ {
433
+ "epoch": 0.10339717151293039,
434
+ "eval_accuracy": 0.010507585123200762,
435
+ "eval_loss": 1.751858115196228,
436
+ "eval_runtime": 1766.3832,
437
+ "eval_samples_per_second": 481.157,
438
+ "eval_steps_per_second": 15.036,
439
+ "step": 6000
440
+ },
441
+ {
442
+ "epoch": 0.10512045770481257,
443
+ "grad_norm": 0.5831886529922485,
444
+ "learning_rate": 9.563318381925982e-05,
445
+ "loss": 1.5952,
446
+ "step": 6100
447
+ },
448
+ {
449
+ "epoch": 0.10684374389669474,
450
+ "grad_norm": 0.6007715463638306,
451
+ "learning_rate": 9.539762312745619e-05,
452
+ "loss": 1.6048,
453
+ "step": 6200
454
+ },
455
+ {
456
+ "epoch": 0.1085670300885769,
457
+ "grad_norm": 0.6652824282646179,
458
+ "learning_rate": 9.515618063897906e-05,
459
+ "loss": 1.6103,
460
+ "step": 6300
461
+ },
462
+ {
463
+ "epoch": 0.11029031628045909,
464
+ "grad_norm": 0.5715838670730591,
465
+ "learning_rate": 9.490888763554596e-05,
466
+ "loss": 1.5917,
467
+ "step": 6400
468
+ },
469
+ {
470
+ "epoch": 0.11201360247234125,
471
+ "grad_norm": 0.5438059568405151,
472
+ "learning_rate": 9.465577615687761e-05,
473
+ "loss": 1.6222,
474
+ "step": 6500
475
+ },
476
+ {
477
+ "epoch": 0.11373688866422343,
478
+ "grad_norm": 0.5527771711349487,
479
+ "learning_rate": 9.43968789965467e-05,
480
+ "loss": 1.588,
481
+ "step": 6600
482
+ },
483
+ {
484
+ "epoch": 0.1154601748561056,
485
+ "grad_norm": 0.6176398396492004,
486
+ "learning_rate": 9.413222969772906e-05,
487
+ "loss": 1.5747,
488
+ "step": 6700
489
+ },
490
+ {
491
+ "epoch": 0.11718346104798778,
492
+ "grad_norm": 0.589585542678833,
493
+ "learning_rate": 9.386186254885783e-05,
494
+ "loss": 1.5938,
495
+ "step": 6800
496
+ },
497
+ {
498
+ "epoch": 0.11890674723986995,
499
+ "grad_norm": 0.5356398820877075,
500
+ "learning_rate": 9.3585812579181e-05,
501
+ "loss": 1.5822,
502
+ "step": 6900
503
+ },
504
+ {
505
+ "epoch": 0.12063003343175212,
506
+ "grad_norm": 0.6897734999656677,
507
+ "learning_rate": 9.330411555422285e-05,
508
+ "loss": 1.5982,
509
+ "step": 7000
510
+ },
511
+ {
512
+ "epoch": 0.1223533196236343,
513
+ "grad_norm": 0.5214916467666626,
514
+ "learning_rate": 9.30168079711502e-05,
515
+ "loss": 1.5898,
516
+ "step": 7100
517
+ },
518
+ {
519
+ "epoch": 0.12407660581551647,
520
+ "grad_norm": 0.5990198850631714,
521
+ "learning_rate": 9.272392705404372e-05,
522
+ "loss": 1.6032,
523
+ "step": 7200
524
+ },
525
+ {
526
+ "epoch": 0.12579989200739863,
527
+ "grad_norm": 0.531247079372406,
528
+ "learning_rate": 9.242551074907519e-05,
529
+ "loss": 1.6082,
530
+ "step": 7300
531
+ },
532
+ {
533
+ "epoch": 0.12752317819928083,
534
+ "grad_norm": 0.607933521270752,
535
+ "learning_rate": 9.212159771959101e-05,
536
+ "loss": 1.5817,
537
+ "step": 7400
538
+ },
539
+ {
540
+ "epoch": 0.129246464391163,
541
+ "grad_norm": 0.5464344024658203,
542
+ "learning_rate": 9.181222734110301e-05,
543
+ "loss": 1.5759,
544
+ "step": 7500
545
+ },
546
+ {
547
+ "epoch": 0.13096975058304516,
548
+ "grad_norm": 0.6487947106361389,
549
+ "learning_rate": 9.149743969618683e-05,
550
+ "loss": 1.6067,
551
+ "step": 7600
552
+ },
553
+ {
554
+ "epoch": 0.13269303677492733,
555
+ "grad_norm": 0.556429922580719,
556
+ "learning_rate": 9.117727556928875e-05,
557
+ "loss": 1.5863,
558
+ "step": 7700
559
+ },
560
+ {
561
+ "epoch": 0.1344163229668095,
562
+ "grad_norm": 0.5772918462753296,
563
+ "learning_rate": 9.085177644144167e-05,
564
+ "loss": 1.5888,
565
+ "step": 7800
566
+ },
567
+ {
568
+ "epoch": 0.1361396091586917,
569
+ "grad_norm": 0.5730582475662231,
570
+ "learning_rate": 9.052098448489062e-05,
571
+ "loss": 1.5983,
572
+ "step": 7900
573
+ },
574
+ {
575
+ "epoch": 0.13786289535057386,
576
+ "grad_norm": 0.5221332311630249,
577
+ "learning_rate": 9.018494255762894e-05,
578
+ "loss": 1.5757,
579
+ "step": 8000
580
+ },
581
+ {
582
+ "epoch": 0.13958618154245603,
583
+ "grad_norm": 0.5817165970802307,
584
+ "learning_rate": 8.98436941978455e-05,
585
+ "loss": 1.5813,
586
+ "step": 8100
587
+ },
588
+ {
589
+ "epoch": 0.1413094677343382,
590
+ "grad_norm": 0.5672810673713684,
591
+ "learning_rate": 8.949728361828381e-05,
592
+ "loss": 1.5997,
593
+ "step": 8200
594
+ },
595
+ {
596
+ "epoch": 0.14303275392622036,
597
+ "grad_norm": 0.5768831372261047,
598
+ "learning_rate": 8.914575570051375e-05,
599
+ "loss": 1.5707,
600
+ "step": 8300
601
+ },
602
+ {
603
+ "epoch": 0.14475604011810256,
604
+ "grad_norm": 0.6032638549804688,
605
+ "learning_rate": 8.878915598911664e-05,
606
+ "loss": 1.5892,
607
+ "step": 8400
608
+ },
609
+ {
610
+ "epoch": 0.14647932630998473,
611
+ "grad_norm": 0.5976369976997375,
612
+ "learning_rate": 8.842753068578434e-05,
613
+ "loss": 1.5996,
614
+ "step": 8500
615
+ },
616
+ {
617
+ "epoch": 0.1482026125018669,
618
+ "grad_norm": 0.559442400932312,
619
+ "learning_rate": 8.806092664333333e-05,
620
+ "loss": 1.5813,
621
+ "step": 8600
622
+ },
623
+ {
624
+ "epoch": 0.14992589869374906,
625
+ "grad_norm": 0.516207218170166,
626
+ "learning_rate": 8.768939135963447e-05,
627
+ "loss": 1.5742,
628
+ "step": 8700
629
+ },
630
+ {
631
+ "epoch": 0.15164918488563123,
632
+ "grad_norm": 0.553333044052124,
633
+ "learning_rate": 8.731297297145889e-05,
634
+ "loss": 1.585,
635
+ "step": 8800
636
+ },
637
+ {
638
+ "epoch": 0.15337247107751342,
639
+ "grad_norm": 0.5709084868431091,
640
+ "learning_rate": 8.693172024824143e-05,
641
+ "loss": 1.5811,
642
+ "step": 8900
643
+ },
644
+ {
645
+ "epoch": 0.1550957572693956,
646
+ "grad_norm": 0.52576744556427,
647
+ "learning_rate": 8.654568258576197e-05,
648
+ "loss": 1.5843,
649
+ "step": 9000
650
+ },
651
+ {
652
+ "epoch": 0.1550957572693956,
653
+ "eval_accuracy": 0.010426478228323498,
654
+ "eval_loss": 1.732275128364563,
655
+ "eval_runtime": 1695.9476,
656
+ "eval_samples_per_second": 501.14,
657
+ "eval_steps_per_second": 15.661,
658
+ "step": 9000
659
+ },
660
+ {
661
+ "epoch": 0.15681904346127776,
662
+ "grad_norm": 0.5634833574295044,
663
+ "learning_rate": 8.615490999974563e-05,
664
+ "loss": 1.5927,
665
+ "step": 9100
666
+ },
667
+ {
668
+ "epoch": 0.15854232965315992,
669
+ "grad_norm": 0.5738709568977356,
670
+ "learning_rate": 8.575945311938262e-05,
671
+ "loss": 1.6131,
672
+ "step": 9200
673
+ },
674
+ {
675
+ "epoch": 0.1602656158450421,
676
+ "grad_norm": 0.5828307867050171,
677
+ "learning_rate": 8.535936318076864e-05,
678
+ "loss": 1.5766,
679
+ "step": 9300
680
+ },
681
+ {
682
+ "epoch": 0.1619889020369243,
683
+ "grad_norm": 0.580729603767395,
684
+ "learning_rate": 8.495469202026669e-05,
685
+ "loss": 1.5902,
686
+ "step": 9400
687
+ },
688
+ {
689
+ "epoch": 0.16371218822880645,
690
+ "grad_norm": 0.568894624710083,
691
+ "learning_rate": 8.454549206779092e-05,
692
+ "loss": 1.5671,
693
+ "step": 9500
694
+ },
695
+ {
696
+ "epoch": 0.16543547442068862,
697
+ "grad_norm": 0.5564482808113098,
698
+ "learning_rate": 8.413181634001391e-05,
699
+ "loss": 1.5778,
700
+ "step": 9600
701
+ },
702
+ {
703
+ "epoch": 0.1671587606125708,
704
+ "grad_norm": 0.5514076948165894,
705
+ "learning_rate": 8.371371843349755e-05,
706
+ "loss": 1.5874,
707
+ "step": 9700
708
+ },
709
+ {
710
+ "epoch": 0.16888204680445298,
711
+ "grad_norm": 0.5865207314491272,
712
+ "learning_rate": 8.329125251774916e-05,
713
+ "loss": 1.5637,
714
+ "step": 9800
715
+ },
716
+ {
717
+ "epoch": 0.17060533299633515,
718
+ "grad_norm": 0.5577490329742432,
719
+ "learning_rate": 8.286447332820298e-05,
720
+ "loss": 1.5801,
721
+ "step": 9900
722
+ },
723
+ {
724
+ "epoch": 0.17232861918821732,
725
+ "grad_norm": 0.5502321124076843,
726
+ "learning_rate": 8.243343615912877e-05,
727
+ "loss": 1.5695,
728
+ "step": 10000
729
+ },
730
+ {
731
+ "epoch": 0.1740519053800995,
732
+ "grad_norm": 0.5614681243896484,
733
+ "learning_rate": 8.199819685646759e-05,
734
+ "loss": 1.5892,
735
+ "step": 10100
736
+ },
737
+ {
738
+ "epoch": 0.17577519157198165,
739
+ "grad_norm": 0.5736984014511108,
740
+ "learning_rate": 8.155881181059644e-05,
741
+ "loss": 1.5911,
742
+ "step": 10200
743
+ },
744
+ {
745
+ "epoch": 0.17749847776386385,
746
+ "grad_norm": 0.49306830763816833,
747
+ "learning_rate": 8.111533794902217e-05,
748
+ "loss": 1.5481,
749
+ "step": 10300
750
+ },
751
+ {
752
+ "epoch": 0.17922176395574602,
753
+ "grad_norm": 0.5843108892440796,
754
+ "learning_rate": 8.066783272900586e-05,
755
+ "loss": 1.582,
756
+ "step": 10400
757
+ },
758
+ {
759
+ "epoch": 0.18094505014762818,
760
+ "grad_norm": 0.5754996538162231,
761
+ "learning_rate": 8.02163541301185e-05,
762
+ "loss": 1.5885,
763
+ "step": 10500
764
+ },
765
+ {
766
+ "epoch": 0.18266833633951035,
767
+ "grad_norm": 0.5479796528816223,
768
+ "learning_rate": 7.976096064672915e-05,
769
+ "loss": 1.5693,
770
+ "step": 10600
771
+ },
772
+ {
773
+ "epoch": 0.18439162253139252,
774
+ "grad_norm": 0.5987735390663147,
775
+ "learning_rate": 7.930171128042627e-05,
776
+ "loss": 1.5679,
777
+ "step": 10700
778
+ },
779
+ {
780
+ "epoch": 0.1861149087232747,
781
+ "grad_norm": 0.5608052611351013,
782
+ "learning_rate": 7.88386655323733e-05,
783
+ "loss": 1.5559,
784
+ "step": 10800
785
+ },
786
+ {
787
+ "epoch": 0.18783819491515688,
788
+ "grad_norm": 0.5474194288253784,
789
+ "learning_rate": 7.83718833955997e-05,
790
+ "loss": 1.5846,
791
+ "step": 10900
792
+ },
793
+ {
794
+ "epoch": 0.18956148110703905,
795
+ "grad_norm": 0.5139473676681519,
796
+ "learning_rate": 7.790142534722805e-05,
797
+ "loss": 1.5715,
798
+ "step": 11000
799
+ },
800
+ {
801
+ "epoch": 0.19128476729892122,
802
+ "grad_norm": 0.5175371170043945,
803
+ "learning_rate": 7.742735234063859e-05,
804
+ "loss": 1.5864,
805
+ "step": 11100
806
+ },
807
+ {
808
+ "epoch": 0.19300805349080338,
809
+ "grad_norm": 0.5598956942558289,
810
+ "learning_rate": 7.694972579757193e-05,
811
+ "loss": 1.5373,
812
+ "step": 11200
813
+ },
814
+ {
815
+ "epoch": 0.19473133968268558,
816
+ "grad_norm": 0.5191853642463684,
817
+ "learning_rate": 7.646860760017124e-05,
818
+ "loss": 1.5573,
819
+ "step": 11300
820
+ },
821
+ {
822
+ "epoch": 0.19645462587456775,
823
+ "grad_norm": 0.5062898993492126,
824
+ "learning_rate": 7.598406008296456e-05,
825
+ "loss": 1.5797,
826
+ "step": 11400
827
+ },
828
+ {
829
+ "epoch": 0.1981779120664499,
830
+ "grad_norm": 0.5880659222602844,
831
+ "learning_rate": 7.549614602478872e-05,
832
+ "loss": 1.558,
833
+ "step": 11500
834
+ },
835
+ {
836
+ "epoch": 0.19990119825833208,
837
+ "grad_norm": 0.5346918702125549,
838
+ "learning_rate": 7.500492864065559e-05,
839
+ "loss": 1.562,
840
+ "step": 11600
841
+ },
842
+ {
843
+ "epoch": 0.20162448445021428,
844
+ "grad_norm": 0.5520205497741699,
845
+ "learning_rate": 7.451047157356182e-05,
846
+ "loss": 1.5458,
847
+ "step": 11700
848
+ },
849
+ {
850
+ "epoch": 0.20334777064209644,
851
+ "grad_norm": 0.5452098250389099,
852
+ "learning_rate": 7.401283888624307e-05,
853
+ "loss": 1.5783,
854
+ "step": 11800
855
+ },
856
+ {
857
+ "epoch": 0.2050710568339786,
858
+ "grad_norm": 0.5486232042312622,
859
+ "learning_rate": 7.351209505287412e-05,
860
+ "loss": 1.5635,
861
+ "step": 11900
862
+ },
863
+ {
864
+ "epoch": 0.20679434302586078,
865
+ "grad_norm": 0.5769017934799194,
866
+ "learning_rate": 7.300830495071524e-05,
867
+ "loss": 1.5473,
868
+ "step": 12000
869
+ },
870
+ {
871
+ "epoch": 0.20679434302586078,
872
+ "eval_accuracy": 0.010505145107655028,
873
+ "eval_loss": 1.7127723693847656,
874
+ "eval_runtime": 1714.6588,
875
+ "eval_samples_per_second": 495.671,
876
+ "eval_steps_per_second": 15.49,
877
+ "step": 12000
878
+ },
879
+ {
880
+ "epoch": 0.20851762921774294,
881
+ "grad_norm": 0.5299004912376404,
882
+ "learning_rate": 7.250153385170675e-05,
883
+ "loss": 1.5631,
884
+ "step": 12100
885
+ },
886
+ {
887
+ "epoch": 0.21024091540962514,
888
+ "grad_norm": 0.6350430250167847,
889
+ "learning_rate": 7.199184741401222e-05,
890
+ "loss": 1.5484,
891
+ "step": 12200
892
+ },
893
+ {
894
+ "epoch": 0.2119642016015073,
895
+ "grad_norm": 0.5689346790313721,
896
+ "learning_rate": 7.147931167351162e-05,
897
+ "loss": 1.5616,
898
+ "step": 12300
899
+ },
900
+ {
901
+ "epoch": 0.21368748779338947,
902
+ "grad_norm": 0.5793879628181458,
903
+ "learning_rate": 7.096399303524577e-05,
904
+ "loss": 1.5496,
905
+ "step": 12400
906
+ },
907
+ {
908
+ "epoch": 0.21541077398527164,
909
+ "grad_norm": 0.5497804284095764,
910
+ "learning_rate": 7.044595826481253e-05,
911
+ "loss": 1.5667,
912
+ "step": 12500
913
+ },
914
+ {
915
+ "epoch": 0.2171340601771538,
916
+ "grad_norm": 0.5804843902587891,
917
+ "learning_rate": 6.992527447971677e-05,
918
+ "loss": 1.5586,
919
+ "step": 12600
920
+ },
921
+ {
922
+ "epoch": 0.218857346369036,
923
+ "grad_norm": 0.5805226564407349,
924
+ "learning_rate": 6.940200914067431e-05,
925
+ "loss": 1.5428,
926
+ "step": 12700
927
+ },
928
+ {
929
+ "epoch": 0.22058063256091817,
930
+ "grad_norm": 0.5112205743789673,
931
+ "learning_rate": 6.887623004287182e-05,
932
+ "loss": 1.5597,
933
+ "step": 12800
934
+ },
935
+ {
936
+ "epoch": 0.22230391875280034,
937
+ "grad_norm": 0.5555017590522766,
938
+ "learning_rate": 6.834800530718285e-05,
939
+ "loss": 1.5349,
940
+ "step": 12900
941
+ },
942
+ {
943
+ "epoch": 0.2240272049446825,
944
+ "grad_norm": 0.5393018126487732,
945
+ "learning_rate": 6.781740337134229e-05,
946
+ "loss": 1.5392,
947
+ "step": 13000
948
+ },
949
+ {
950
+ "epoch": 0.22575049113656467,
951
+ "grad_norm": 0.515864372253418,
952
+ "learning_rate": 6.728449298107919e-05,
953
+ "loss": 1.5617,
954
+ "step": 13100
955
+ },
956
+ {
957
+ "epoch": 0.22747377732844687,
958
+ "grad_norm": 0.5203471779823303,
959
+ "learning_rate": 6.674934318121013e-05,
960
+ "loss": 1.5492,
961
+ "step": 13200
962
+ },
963
+ {
964
+ "epoch": 0.22919706352032904,
965
+ "grad_norm": 0.5489692091941833,
966
+ "learning_rate": 6.621202330669354e-05,
967
+ "loss": 1.544,
968
+ "step": 13300
969
+ },
970
+ {
971
+ "epoch": 0.2309203497122112,
972
+ "grad_norm": 0.5596274137496948,
973
+ "learning_rate": 6.567260297364659e-05,
974
+ "loss": 1.5463,
975
+ "step": 13400
976
+ },
977
+ {
978
+ "epoch": 0.23264363590409337,
979
+ "grad_norm": 0.5610251426696777,
980
+ "learning_rate": 6.513115207032557e-05,
981
+ "loss": 1.5802,
982
+ "step": 13500
983
+ },
984
+ {
985
+ "epoch": 0.23436692209597557,
986
+ "grad_norm": 0.5264619588851929,
987
+ "learning_rate": 6.458774074807107e-05,
988
+ "loss": 1.5545,
989
+ "step": 13600
990
+ },
991
+ {
992
+ "epoch": 0.23609020828785773,
993
+ "grad_norm": 0.5814141631126404,
994
+ "learning_rate": 6.404243941221903e-05,
995
+ "loss": 1.5521,
996
+ "step": 13700
997
+ },
998
+ {
999
+ "epoch": 0.2378134944797399,
1000
+ "grad_norm": 0.5240880846977234,
1001
+ "learning_rate": 6.349531871297896e-05,
1002
+ "loss": 1.5675,
1003
+ "step": 13800
1004
+ },
1005
+ {
1006
+ "epoch": 0.23953678067162207,
1007
+ "grad_norm": 0.4984951913356781,
1008
+ "learning_rate": 6.294644953628023e-05,
1009
+ "loss": 1.5559,
1010
+ "step": 13900
1011
+ },
1012
+ {
1013
+ "epoch": 0.24126006686350424,
1014
+ "grad_norm": 0.5289067029953003,
1015
+ "learning_rate": 6.239590299458814e-05,
1016
+ "loss": 1.5285,
1017
+ "step": 14000
1018
+ },
1019
+ {
1020
+ "epoch": 0.24298335305538643,
1021
+ "grad_norm": 0.5221706032752991,
1022
+ "learning_rate": 6.184375041769032e-05,
1023
+ "loss": 1.553,
1024
+ "step": 14100
1025
+ },
1026
+ {
1027
+ "epoch": 0.2447066392472686,
1028
+ "grad_norm": 0.5475857257843018,
1029
+ "learning_rate": 6.12900633434552e-05,
1030
+ "loss": 1.5675,
1031
+ "step": 14200
1032
+ },
1033
+ {
1034
+ "epoch": 0.24642992543915077,
1035
+ "grad_norm": 0.5271047353744507,
1036
+ "learning_rate": 6.0734913508563395e-05,
1037
+ "loss": 1.5487,
1038
+ "step": 14300
1039
+ },
1040
+ {
1041
+ "epoch": 0.24815321163103293,
1042
+ "grad_norm": 0.5180040001869202,
1043
+ "learning_rate": 6.0178372839213406e-05,
1044
+ "loss": 1.5281,
1045
+ "step": 14400
1046
+ },
1047
+ {
1048
+ "epoch": 0.2498764978229151,
1049
+ "grad_norm": 0.566608726978302,
1050
+ "learning_rate": 5.9620513441802714e-05,
1051
+ "loss": 1.5602,
1052
+ "step": 14500
1053
+ },
1054
+ {
1055
+ "epoch": 0.25159978401479727,
1056
+ "grad_norm": 0.5131779909133911,
1057
+ "learning_rate": 5.906140759358555e-05,
1058
+ "loss": 1.5111,
1059
+ "step": 14600
1060
+ },
1061
+ {
1062
+ "epoch": 0.25332307020667943,
1063
+ "grad_norm": 0.5626484751701355,
1064
+ "learning_rate": 5.85011277333085e-05,
1065
+ "loss": 1.5528,
1066
+ "step": 14700
1067
+ },
1068
+ {
1069
+ "epoch": 0.25504635639856166,
1070
+ "grad_norm": 0.550121545791626,
1071
+ "learning_rate": 5.793974645182526e-05,
1072
+ "loss": 1.5401,
1073
+ "step": 14800
1074
+ },
1075
+ {
1076
+ "epoch": 0.2567696425904438,
1077
+ "grad_norm": 0.5408352017402649,
1078
+ "learning_rate": 5.737733648269162e-05,
1079
+ "loss": 1.5437,
1080
+ "step": 14900
1081
+ },
1082
+ {
1083
+ "epoch": 0.258492928782326,
1084
+ "grad_norm": 0.5391642451286316,
1085
+ "learning_rate": 5.6813970692741945e-05,
1086
+ "loss": 1.5407,
1087
+ "step": 15000
1088
+ },
1089
+ {
1090
+ "epoch": 0.258492928782326,
1091
+ "eval_accuracy": 0.010407141521982707,
1092
+ "eval_loss": 1.6942352056503296,
1093
+ "eval_runtime": 1748.5708,
1094
+ "eval_samples_per_second": 486.058,
1095
+ "eval_steps_per_second": 15.19,
1096
+ "step": 15000
1097
+ }
1098
+ ],
1099
+ "logging_steps": 100,
1100
+ "max_steps": 30000,
1101
+ "num_input_tokens_seen": 0,
1102
+ "num_train_epochs": 1,
1103
+ "save_steps": 3000,
1104
+ "stateful_callbacks": {
1105
+ "TrainerControl": {
1106
+ "args": {
1107
+ "should_epoch_stop": false,
1108
+ "should_evaluate": false,
1109
+ "should_log": false,
1110
+ "should_save": true,
1111
+ "should_training_stop": false
1112
+ },
1113
+ "attributes": {}
1114
+ }
1115
+ },
1116
+ "total_flos": 3.041448973814661e+17,
1117
+ "train_batch_size": 32,
1118
+ "trial_name": null,
1119
+ "trial_params": null
1120
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2efb46a23f61853f9d1bd735587f175002d35b4a6ab1a89a7574378204b2326
3
+ size 5905