xLSTM is ready for image generation
Browse files- .gitattributes +6 -0
- config.json +81 -0
- logs/56000/_CHECKPOINT_METADATA +1 -0
- logs/56000/default/_METADATA +1 -0
- logs/56000/default/_sharding +1 -0
- logs/56000/default/array_metadatas/process_0 +1 -0
- logs/56000/default/d/d2faa21b74d6262917c3f434015260c2 +0 -0
- logs/56000/default/manifest.ocdbt +0 -0
- logs/56000/default/ocdbt.process_0/d/3e7dd4d7d62ed74ff3a3008771491be8 +0 -0
- logs/56000/default/ocdbt.process_0/d/589bbb1911ef77a68bd39d66cb5ad987 +3 -0
- logs/56000/default/ocdbt.process_0/d/73f37c4291d4f4108e7c0be33fa9ece1 +0 -0
- logs/56000/default/ocdbt.process_0/d/b7cccee99dacbfc00deb918b7696e346 +3 -0
- logs/56000/default/ocdbt.process_0/manifest.ocdbt +0 -0
- logs/58000/_CHECKPOINT_METADATA +1 -0
- logs/58000/default/_METADATA +1 -0
- logs/58000/default/_sharding +1 -0
- logs/58000/default/array_metadatas/process_0 +1 -0
- logs/58000/default/d/5c12eab1ced4990bdc0f82c4d8394dfe +0 -0
- logs/58000/default/manifest.ocdbt +0 -0
- logs/58000/default/ocdbt.process_0/d/d1ae375ddade45c9c8ea42c34a429dba +0 -0
- logs/58000/default/ocdbt.process_0/d/da222406832c4aae4cfe890d233bea85 +0 -0
- logs/58000/default/ocdbt.process_0/d/ec9e13c2502c3b87b5d5b37860c03ad2 +3 -0
- logs/58000/default/ocdbt.process_0/d/f72b9e7aa4ac1fe48d1dc7bfa695d52e +3 -0
- logs/58000/default/ocdbt.process_0/manifest.ocdbt +0 -0
- logs/60000/_CHECKPOINT_METADATA +1 -0
- logs/60000/default/_METADATA +1 -0
- logs/60000/default/_sharding +1 -0
- logs/60000/default/array_metadatas/process_0 +1 -0
- logs/60000/default/d/47276f771abe8618db20f064c855d18b +0 -0
- logs/60000/default/manifest.ocdbt +0 -0
- logs/60000/default/ocdbt.process_0/d/5422c45e937337d9c874e919f365f8bb +0 -0
- logs/60000/default/ocdbt.process_0/d/5d4ce2d31fe50a5e794e04730d5408b9 +0 -0
- logs/60000/default/ocdbt.process_0/d/a649a7f03f371810ad4a4d0072986f38 +3 -0
- logs/60000/default/ocdbt.process_0/d/bf2745114a156076b53b8b5635b0aeac +3 -0
- logs/60000/default/ocdbt.process_0/manifest.ocdbt +0 -0
- logs/xLSTM-TPU/events.out.tfevents.1761148059.5b24db4313ba.16459.0.v2 +3 -0
- timing_summary.json +7 -0
- train_history.json +11 -0
- trainer_config.json +47 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
logs/56000/default/ocdbt.process_0/d/589bbb1911ef77a68bd39d66cb5ad987 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
logs/56000/default/ocdbt.process_0/d/b7cccee99dacbfc00deb918b7696e346 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
logs/58000/default/ocdbt.process_0/d/ec9e13c2502c3b87b5d5b37860c03ad2 filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
logs/58000/default/ocdbt.process_0/d/f72b9e7aa4ac1fe48d1dc7bfa695d52e filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
logs/60000/default/ocdbt.process_0/d/a649a7f03f371810ad4a4d0072986f38 filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
logs/60000/default/ocdbt.process_0/d/bf2745114a156076b53b8b5635b0aeac filter=lfs diff=lfs merge=lfs -text
|
config.json
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"mlstm_block": {
|
| 3 |
+
"mlstm": {
|
| 4 |
+
"proj_factor": 2.0,
|
| 5 |
+
"round_proj_up_dim_up": true,
|
| 6 |
+
"round_proj_up_to_multiple_of": 64,
|
| 7 |
+
"_proj_up_dim": 1024,
|
| 8 |
+
"conv1d_kernel_size": 4,
|
| 9 |
+
"qkv_proj_blocksize": 32,
|
| 10 |
+
"num_heads": 4,
|
| 11 |
+
"embedding_dim": 512,
|
| 12 |
+
"bias": false,
|
| 13 |
+
"dropout": 0.0,
|
| 14 |
+
"context_length": 384,
|
| 15 |
+
"_num_blocks": 12,
|
| 16 |
+
"_inner_embedding_dim": 1024
|
| 17 |
+
},
|
| 18 |
+
"_num_blocks": 12,
|
| 19 |
+
"_block_idx": null
|
| 20 |
+
},
|
| 21 |
+
"slstm_block": {
|
| 22 |
+
"slstm": {
|
| 23 |
+
"hidden_size": 512,
|
| 24 |
+
"num_heads": 4,
|
| 25 |
+
"num_states": 4,
|
| 26 |
+
"backend": "vanilla",
|
| 27 |
+
"function": "slstm",
|
| 28 |
+
"bias_init": "powerlaw_blockdependent",
|
| 29 |
+
"recurrent_weight_init": "zeros",
|
| 30 |
+
"_block_idx": null,
|
| 31 |
+
"_num_blocks": 12,
|
| 32 |
+
"num_gates": 4,
|
| 33 |
+
"gradient_recurrent_clipval": null,
|
| 34 |
+
"forward_clipval": null,
|
| 35 |
+
"batch_size": 8,
|
| 36 |
+
"input_shape": "BSGNH",
|
| 37 |
+
"internal_input_shape": "SBNGH",
|
| 38 |
+
"output_shape": "BNSH",
|
| 39 |
+
"dtype": "bfloat16",
|
| 40 |
+
"dtype_b": "float32",
|
| 41 |
+
"dtype_r": "bfloat16",
|
| 42 |
+
"dtype_w": "bfloat16",
|
| 43 |
+
"dtype_g": "bfloat16",
|
| 44 |
+
"dtype_s": "bfloat16",
|
| 45 |
+
"dtype_a": "float32",
|
| 46 |
+
"initial_val": 0.0,
|
| 47 |
+
"enable_automatic_mixed_precision": true,
|
| 48 |
+
"embedding_dim": 512,
|
| 49 |
+
"conv1d_kernel_size": 4,
|
| 50 |
+
"group_norm_weight": true,
|
| 51 |
+
"dropout": 0.0
|
| 52 |
+
},
|
| 53 |
+
"feedforward": {
|
| 54 |
+
"proj_factor": 1.3,
|
| 55 |
+
"round_proj_up_dim_up": true,
|
| 56 |
+
"round_proj_up_to_multiple_of": 64,
|
| 57 |
+
"_proj_up_dim": 0,
|
| 58 |
+
"act_fn": "swish",
|
| 59 |
+
"embedding_dim": -1,
|
| 60 |
+
"dropout": 0.0,
|
| 61 |
+
"bias": false,
|
| 62 |
+
"ff_type": "ffn_gated",
|
| 63 |
+
"_num_blocks": 1
|
| 64 |
+
},
|
| 65 |
+
"_num_blocks": 12,
|
| 66 |
+
"_block_idx": null
|
| 67 |
+
},
|
| 68 |
+
"context_length": 384,
|
| 69 |
+
"num_blocks": 12,
|
| 70 |
+
"embedding_dim": 512,
|
| 71 |
+
"add_post_blocks_norm": true,
|
| 72 |
+
"bias": false,
|
| 73 |
+
"dropout": 0.0,
|
| 74 |
+
"slstm_at": [],
|
| 75 |
+
"_block_map": "0,0,0,0,0,0,0,0,0,0,0,0",
|
| 76 |
+
"vocab_size": 49152,
|
| 77 |
+
"tie_weights": false,
|
| 78 |
+
"weight_decay_on_embedding": false,
|
| 79 |
+
"add_embedding_dropout": false,
|
| 80 |
+
"pad_token_id": 0
|
| 81 |
+
}
|
logs/56000/_CHECKPOINT_METADATA
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"item_handlers": {"default": "orbax.checkpoint._src.handlers.standard_checkpoint_handler.StandardCheckpointHandler"}, "metrics": {}, "performance_metrics": {}, "init_timestamp_nsecs": 1761150675355040026, "commit_timestamp_nsecs": 1761150676191431438, "custom_metadata": {}}
|
logs/56000/default/_METADATA
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"tree_metadata": {"('lm_head', 'kernel', 'value')": {"key_metadata": [{"key": "lm_head", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64, 49152]}}, "('token_embedding', 'embedding', 'value')": {"key_metadata": [{"key": "token_embedding", "key_type": 2}, {"key": "embedding", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6144, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 1, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'k_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "k_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'learnable_skip', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "learnable_skip", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'outnorm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "outnorm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_down', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_down", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_up', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_up", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64, 2048]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'q_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "q_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'v_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "v_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64]}}, "('xlstm_block_stack', 'post_blocks_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "post_blocks_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64]}}}, "use_zarr3": false, "store_array_data_equal_to_fill_value": true, "custom_metadata": null}
|
logs/56000/default/_sharding
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"bG1faGVhZC5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","dG9rZW5fZW1iZWRkaW5nLmVtYmVkZGluZy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwub3V0bm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252LmJpYXMudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252Lmtlcm5lbC52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmtfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmxlYXJuYWJsZV9za2lwLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfZG93bi5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfdXAua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnFfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnZfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtX25vcm0uc2NhbGUudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2sucG9zdF9ibG9ja3Nfbm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}"}
|
logs/56000/default/array_metadatas/process_0
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"array_metadatas": [{"array_metadata": {"param_name": "lm_head.kernel.value", "write_shape": [64, 49152], "chunk_shape": [64, 49152], "ext_metadata": null}}, {"array_metadata": {"param_name": "token_embedding.embedding.value", "write_shape": [6144, 512], "chunk_shape": [6144, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.bias.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.kernel.value", "write_shape": [12, 4, 1, 128], "chunk_shape": [12, 4, 1, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.k_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.learnable_skip.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.outnorm.scale.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_down.kernel.value", "write_shape": [12, 128, 512], "chunk_shape": [12, 128, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_up.kernel.value", "write_shape": [12, 64, 2048], "chunk_shape": [12, 64, 2048], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.q_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.v_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm_norm.scale.value", "write_shape": [12, 64], "chunk_shape": [12, 64], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.post_blocks_norm.scale.value", "write_shape": [64], "chunk_shape": [64], "ext_metadata": null}}]}
|
logs/56000/default/d/d2faa21b74d6262917c3f434015260c2
ADDED
|
Binary file (1.69 kB). View file
|
|
|
logs/56000/default/manifest.ocdbt
ADDED
|
Binary file (117 Bytes). View file
|
|
|
logs/56000/default/ocdbt.process_0/d/3e7dd4d7d62ed74ff3a3008771491be8
ADDED
|
Binary file (199 Bytes). View file
|
|
|
logs/56000/default/ocdbt.process_0/d/589bbb1911ef77a68bd39d66cb5ad987
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eaee57fb3f8c9d549f5453721c7266aad4fc9d2efed75d515fea7e35b63461da
|
| 3 |
+
size 16465920
|
logs/56000/default/ocdbt.process_0/d/73f37c4291d4f4108e7c0be33fa9ece1
ADDED
|
Binary file (559 Bytes). View file
|
|
|
logs/56000/default/ocdbt.process_0/d/b7cccee99dacbfc00deb918b7696e346
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:820e1281fbbd8366ac393a5c30e7c0128e3616e8337ca5b604dfa678df16fd56
|
| 3 |
+
size 85221376
|
logs/56000/default/ocdbt.process_0/manifest.ocdbt
ADDED
|
Binary file (262 Bytes). View file
|
|
|
logs/58000/_CHECKPOINT_METADATA
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"item_handlers": {"default": "orbax.checkpoint._src.handlers.standard_checkpoint_handler.StandardCheckpointHandler"}, "metrics": {}, "performance_metrics": {}, "init_timestamp_nsecs": 1761150766941610609, "commit_timestamp_nsecs": 1761150767825678391, "custom_metadata": {}}
|
logs/58000/default/_METADATA
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"tree_metadata": {"('lm_head', 'kernel', 'value')": {"key_metadata": [{"key": "lm_head", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64, 49152]}}, "('token_embedding', 'embedding', 'value')": {"key_metadata": [{"key": "token_embedding", "key_type": 2}, {"key": "embedding", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6144, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 1, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'k_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "k_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'learnable_skip', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "learnable_skip", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'outnorm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "outnorm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_down', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_down", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_up', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_up", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64, 2048]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'q_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "q_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'v_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "v_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64]}}, "('xlstm_block_stack', 'post_blocks_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "post_blocks_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64]}}}, "use_zarr3": false, "store_array_data_equal_to_fill_value": true, "custom_metadata": null}
|
logs/58000/default/_sharding
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"bG1faGVhZC5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","dG9rZW5fZW1iZWRkaW5nLmVtYmVkZGluZy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwub3V0bm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252LmJpYXMudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252Lmtlcm5lbC52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmtfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmxlYXJuYWJsZV9za2lwLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfZG93bi5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfdXAua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnFfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnZfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtX25vcm0uc2NhbGUudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2sucG9zdF9ibG9ja3Nfbm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}"}
|
logs/58000/default/array_metadatas/process_0
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"array_metadatas": [{"array_metadata": {"param_name": "lm_head.kernel.value", "write_shape": [64, 49152], "chunk_shape": [64, 49152], "ext_metadata": null}}, {"array_metadata": {"param_name": "token_embedding.embedding.value", "write_shape": [6144, 512], "chunk_shape": [6144, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.bias.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.kernel.value", "write_shape": [12, 4, 1, 128], "chunk_shape": [12, 4, 1, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.k_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.learnable_skip.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.outnorm.scale.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_down.kernel.value", "write_shape": [12, 128, 512], "chunk_shape": [12, 128, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_up.kernel.value", "write_shape": [12, 64, 2048], "chunk_shape": [12, 64, 2048], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.q_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.v_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm_norm.scale.value", "write_shape": [12, 64], "chunk_shape": [12, 64], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.post_blocks_norm.scale.value", "write_shape": [64], "chunk_shape": [64], "ext_metadata": null}}]}
|
logs/58000/default/d/5c12eab1ced4990bdc0f82c4d8394dfe
ADDED
|
Binary file (1.69 kB). View file
|
|
|
logs/58000/default/manifest.ocdbt
ADDED
|
Binary file (117 Bytes). View file
|
|
|
logs/58000/default/ocdbt.process_0/d/d1ae375ddade45c9c8ea42c34a429dba
ADDED
|
Binary file (559 Bytes). View file
|
|
|
logs/58000/default/ocdbt.process_0/d/da222406832c4aae4cfe890d233bea85
ADDED
|
Binary file (199 Bytes). View file
|
|
|
logs/58000/default/ocdbt.process_0/d/ec9e13c2502c3b87b5d5b37860c03ad2
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:163595e099050baecbf46ff01560ab28e5c8702a310cd9c665c153061864ba22
|
| 3 |
+
size 85213184
|
logs/58000/default/ocdbt.process_0/d/f72b9e7aa4ac1fe48d1dc7bfa695d52e
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf69569633994461bc99856879d9c17d6fccdaafabf8679f4a56f84876f06b93
|
| 3 |
+
size 16465920
|
logs/58000/default/ocdbt.process_0/manifest.ocdbt
ADDED
|
Binary file (262 Bytes). View file
|
|
|
logs/60000/_CHECKPOINT_METADATA
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"item_handlers": {"default": "orbax.checkpoint._src.handlers.standard_checkpoint_handler.StandardCheckpointHandler"}, "metrics": {}, "performance_metrics": {}, "init_timestamp_nsecs": 1761150858253452873, "commit_timestamp_nsecs": 1761150859069589925, "custom_metadata": {}}
|
logs/60000/default/_METADATA
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"tree_metadata": {"('lm_head', 'kernel', 'value')": {"key_metadata": [{"key": "lm_head", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64, 49152]}}, "('token_embedding', 'embedding', 'value')": {"key_metadata": [{"key": "token_embedding", "key_type": 2}, {"key": "embedding", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6144, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 1, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'k_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "k_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'learnable_skip', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "learnable_skip", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'outnorm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "outnorm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_down', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_down", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_up', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_up", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64, 2048]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'q_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "q_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'v_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "v_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64]}}, "('xlstm_block_stack', 'post_blocks_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "post_blocks_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64]}}}, "use_zarr3": false, "store_array_data_equal_to_fill_value": true, "custom_metadata": null}
|
logs/60000/default/_sharding
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"bG1faGVhZC5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","dG9rZW5fZW1iZWRkaW5nLmVtYmVkZGluZy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwub3V0bm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252LmJpYXMudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252Lmtlcm5lbC52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmtfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmxlYXJuYWJsZV9za2lwLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfZG93bi5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfdXAua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnFfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnZfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtX25vcm0uc2NhbGUudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2sucG9zdF9ibG9ja3Nfbm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}"}
|
logs/60000/default/array_metadatas/process_0
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"array_metadatas": [{"array_metadata": {"param_name": "lm_head.kernel.value", "write_shape": [64, 49152], "chunk_shape": [64, 49152], "ext_metadata": null}}, {"array_metadata": {"param_name": "token_embedding.embedding.value", "write_shape": [6144, 512], "chunk_shape": [6144, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.bias.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.kernel.value", "write_shape": [12, 4, 1, 128], "chunk_shape": [12, 4, 1, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.k_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.learnable_skip.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.outnorm.scale.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_down.kernel.value", "write_shape": [12, 128, 512], "chunk_shape": [12, 128, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_up.kernel.value", "write_shape": [12, 64, 2048], "chunk_shape": [12, 64, 2048], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.q_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.v_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm_norm.scale.value", "write_shape": [12, 64], "chunk_shape": [12, 64], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.post_blocks_norm.scale.value", "write_shape": [64], "chunk_shape": [64], "ext_metadata": null}}]}
|
logs/60000/default/d/47276f771abe8618db20f064c855d18b
ADDED
|
Binary file (1.7 kB). View file
|
|
|
logs/60000/default/manifest.ocdbt
ADDED
|
Binary file (117 Bytes). View file
|
|
|
logs/60000/default/ocdbt.process_0/d/5422c45e937337d9c874e919f365f8bb
ADDED
|
Binary file (559 Bytes). View file
|
|
|
logs/60000/default/ocdbt.process_0/d/5d4ce2d31fe50a5e794e04730d5408b9
ADDED
|
Binary file (199 Bytes). View file
|
|
|
logs/60000/default/ocdbt.process_0/d/a649a7f03f371810ad4a4d0072986f38
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2fa4c45e567d3a34e091f09099fe1e455323cc28221527d176e696206607403c
|
| 3 |
+
size 16461824
|
logs/60000/default/ocdbt.process_0/d/bf2745114a156076b53b8b5635b0aeac
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f07d9f22eee85b5671cdd0e0f37204f9170e44eb3bc9f41fa628bea6cc9e90de
|
| 3 |
+
size 85209088
|
logs/60000/default/ocdbt.process_0/manifest.ocdbt
ADDED
|
Binary file (265 Bytes). View file
|
|
|
logs/xLSTM-TPU/events.out.tfevents.1761148059.5b24db4313ba.16459.0.v2
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b395e4454ae503b6908ee750a088e9a267ca0a930c9b9d2532304df6d517beb2
|
| 3 |
+
size 1566297
|
timing_summary.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"total_training_duration_seconds": 2785.887582896,
|
| 3 |
+
"total_training_duration_hours": 0.7738576619155555,
|
| 4 |
+
"average_epoch_duration_seconds": 91.71301750610003,
|
| 5 |
+
"num_epochs_completed": 30,
|
| 6 |
+
"num_evaluations_completed": 30
|
| 7 |
+
}
|
train_history.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"total_training_duration": 2785.887582896,
|
| 3 |
+
"avg_epoch_duration": 91.71301750610003,
|
| 4 |
+
"num_epochs_completed": 30,
|
| 5 |
+
"global_steps": 60000,
|
| 6 |
+
"global_optimizer_steps": 12000,
|
| 7 |
+
"params": {
|
| 8 |
+
"millions": 70.77,
|
| 9 |
+
"billions": 0.07
|
| 10 |
+
}
|
| 11 |
+
}
|
trainer_config.json
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tokenizer": "HuggingFaceTB/SmolLM2-135M",
|
| 3 |
+
"dtype": "fp32",
|
| 4 |
+
"param_dtype": "bf16",
|
| 5 |
+
"num_train_epochs": 30,
|
| 6 |
+
"per_device_train_batch_size": 32,
|
| 7 |
+
"per_device_eval_batch_size": 32,
|
| 8 |
+
"gradient_accumulation_steps": 5,
|
| 9 |
+
"seed": 42,
|
| 10 |
+
"learning_rate": 0.0003,
|
| 11 |
+
"weight_decay": 0.01,
|
| 12 |
+
"adam_beta1": 0.9,
|
| 13 |
+
"adam_beta2": 0.999,
|
| 14 |
+
"warmup_ratio": 0.2,
|
| 15 |
+
"max_grad_norm": 1.0,
|
| 16 |
+
"logging_steps": 200,
|
| 17 |
+
"output_dir": "./artifacts/",
|
| 18 |
+
"logging_dir": "./artifacts/logs/",
|
| 19 |
+
"run_name": "train",
|
| 20 |
+
"best_metric_key": "perplexity",
|
| 21 |
+
"best_n_to_keep": 3,
|
| 22 |
+
"hub_model_id": "thiomajid/xLSTM-TPU",
|
| 23 |
+
"hub_private_repo": false,
|
| 24 |
+
"upload_message": "xLSTM is ready for image generation",
|
| 25 |
+
"train_dataset_url": "roneneldan/TinyStories",
|
| 26 |
+
"train_subset": null,
|
| 27 |
+
"train_split": "train",
|
| 28 |
+
"train_samples": 64000,
|
| 29 |
+
"eval_dataset_url": "roneneldan/TinyStories",
|
| 30 |
+
"eval_subset": null,
|
| 31 |
+
"eval_split": "validation",
|
| 32 |
+
"eval_samples": 3200,
|
| 33 |
+
"dataloader_drop_last": true,
|
| 34 |
+
"dataloader_num_workers": 4,
|
| 35 |
+
"worker_buffer_size": 2,
|
| 36 |
+
"text_column": "text",
|
| 37 |
+
"use_dataset_cache": true,
|
| 38 |
+
"dataset_cache_dir": "./.hf_data_cache",
|
| 39 |
+
"mesh_shape": [
|
| 40 |
+
8,
|
| 41 |
+
1
|
| 42 |
+
],
|
| 43 |
+
"axis_names": [
|
| 44 |
+
"dp",
|
| 45 |
+
"tp"
|
| 46 |
+
]
|
| 47 |
+
}
|