thiomajid committed on
Commit
669c9b0
·
verified ·
1 Parent(s): ca4546c

xLSTM is ready for image generation

Browse files
Files changed (39) hide show
  1. .gitattributes +6 -0
  2. config.json +81 -0
  3. logs/56000/_CHECKPOINT_METADATA +1 -0
  4. logs/56000/default/_METADATA +1 -0
  5. logs/56000/default/_sharding +1 -0
  6. logs/56000/default/array_metadatas/process_0 +1 -0
  7. logs/56000/default/d/d2faa21b74d6262917c3f434015260c2 +0 -0
  8. logs/56000/default/manifest.ocdbt +0 -0
  9. logs/56000/default/ocdbt.process_0/d/3e7dd4d7d62ed74ff3a3008771491be8 +0 -0
  10. logs/56000/default/ocdbt.process_0/d/589bbb1911ef77a68bd39d66cb5ad987 +3 -0
  11. logs/56000/default/ocdbt.process_0/d/73f37c4291d4f4108e7c0be33fa9ece1 +0 -0
  12. logs/56000/default/ocdbt.process_0/d/b7cccee99dacbfc00deb918b7696e346 +3 -0
  13. logs/56000/default/ocdbt.process_0/manifest.ocdbt +0 -0
  14. logs/58000/_CHECKPOINT_METADATA +1 -0
  15. logs/58000/default/_METADATA +1 -0
  16. logs/58000/default/_sharding +1 -0
  17. logs/58000/default/array_metadatas/process_0 +1 -0
  18. logs/58000/default/d/5c12eab1ced4990bdc0f82c4d8394dfe +0 -0
  19. logs/58000/default/manifest.ocdbt +0 -0
  20. logs/58000/default/ocdbt.process_0/d/d1ae375ddade45c9c8ea42c34a429dba +0 -0
  21. logs/58000/default/ocdbt.process_0/d/da222406832c4aae4cfe890d233bea85 +0 -0
  22. logs/58000/default/ocdbt.process_0/d/ec9e13c2502c3b87b5d5b37860c03ad2 +3 -0
  23. logs/58000/default/ocdbt.process_0/d/f72b9e7aa4ac1fe48d1dc7bfa695d52e +3 -0
  24. logs/58000/default/ocdbt.process_0/manifest.ocdbt +0 -0
  25. logs/60000/_CHECKPOINT_METADATA +1 -0
  26. logs/60000/default/_METADATA +1 -0
  27. logs/60000/default/_sharding +1 -0
  28. logs/60000/default/array_metadatas/process_0 +1 -0
  29. logs/60000/default/d/47276f771abe8618db20f064c855d18b +0 -0
  30. logs/60000/default/manifest.ocdbt +0 -0
  31. logs/60000/default/ocdbt.process_0/d/5422c45e937337d9c874e919f365f8bb +0 -0
  32. logs/60000/default/ocdbt.process_0/d/5d4ce2d31fe50a5e794e04730d5408b9 +0 -0
  33. logs/60000/default/ocdbt.process_0/d/a649a7f03f371810ad4a4d0072986f38 +3 -0
  34. logs/60000/default/ocdbt.process_0/d/bf2745114a156076b53b8b5635b0aeac +3 -0
  35. logs/60000/default/ocdbt.process_0/manifest.ocdbt +0 -0
  36. logs/xLSTM-TPU/events.out.tfevents.1761148059.5b24db4313ba.16459.0.v2 +3 -0
  37. timing_summary.json +7 -0
  38. train_history.json +11 -0
  39. trainer_config.json +47 -0
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ logs/56000/default/ocdbt.process_0/d/589bbb1911ef77a68bd39d66cb5ad987 filter=lfs diff=lfs merge=lfs -text
37
+ logs/56000/default/ocdbt.process_0/d/b7cccee99dacbfc00deb918b7696e346 filter=lfs diff=lfs merge=lfs -text
38
+ logs/58000/default/ocdbt.process_0/d/ec9e13c2502c3b87b5d5b37860c03ad2 filter=lfs diff=lfs merge=lfs -text
39
+ logs/58000/default/ocdbt.process_0/d/f72b9e7aa4ac1fe48d1dc7bfa695d52e filter=lfs diff=lfs merge=lfs -text
40
+ logs/60000/default/ocdbt.process_0/d/a649a7f03f371810ad4a4d0072986f38 filter=lfs diff=lfs merge=lfs -text
41
+ logs/60000/default/ocdbt.process_0/d/bf2745114a156076b53b8b5635b0aeac filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mlstm_block": {
3
+ "mlstm": {
4
+ "proj_factor": 2.0,
5
+ "round_proj_up_dim_up": true,
6
+ "round_proj_up_to_multiple_of": 64,
7
+ "_proj_up_dim": 1024,
8
+ "conv1d_kernel_size": 4,
9
+ "qkv_proj_blocksize": 32,
10
+ "num_heads": 4,
11
+ "embedding_dim": 512,
12
+ "bias": false,
13
+ "dropout": 0.0,
14
+ "context_length": 384,
15
+ "_num_blocks": 12,
16
+ "_inner_embedding_dim": 1024
17
+ },
18
+ "_num_blocks": 12,
19
+ "_block_idx": null
20
+ },
21
+ "slstm_block": {
22
+ "slstm": {
23
+ "hidden_size": 512,
24
+ "num_heads": 4,
25
+ "num_states": 4,
26
+ "backend": "vanilla",
27
+ "function": "slstm",
28
+ "bias_init": "powerlaw_blockdependent",
29
+ "recurrent_weight_init": "zeros",
30
+ "_block_idx": null,
31
+ "_num_blocks": 12,
32
+ "num_gates": 4,
33
+ "gradient_recurrent_clipval": null,
34
+ "forward_clipval": null,
35
+ "batch_size": 8,
36
+ "input_shape": "BSGNH",
37
+ "internal_input_shape": "SBNGH",
38
+ "output_shape": "BNSH",
39
+ "dtype": "bfloat16",
40
+ "dtype_b": "float32",
41
+ "dtype_r": "bfloat16",
42
+ "dtype_w": "bfloat16",
43
+ "dtype_g": "bfloat16",
44
+ "dtype_s": "bfloat16",
45
+ "dtype_a": "float32",
46
+ "initial_val": 0.0,
47
+ "enable_automatic_mixed_precision": true,
48
+ "embedding_dim": 512,
49
+ "conv1d_kernel_size": 4,
50
+ "group_norm_weight": true,
51
+ "dropout": 0.0
52
+ },
53
+ "feedforward": {
54
+ "proj_factor": 1.3,
55
+ "round_proj_up_dim_up": true,
56
+ "round_proj_up_to_multiple_of": 64,
57
+ "_proj_up_dim": 0,
58
+ "act_fn": "swish",
59
+ "embedding_dim": -1,
60
+ "dropout": 0.0,
61
+ "bias": false,
62
+ "ff_type": "ffn_gated",
63
+ "_num_blocks": 1
64
+ },
65
+ "_num_blocks": 12,
66
+ "_block_idx": null
67
+ },
68
+ "context_length": 384,
69
+ "num_blocks": 12,
70
+ "embedding_dim": 512,
71
+ "add_post_blocks_norm": true,
72
+ "bias": false,
73
+ "dropout": 0.0,
74
+ "slstm_at": [],
75
+ "_block_map": "0,0,0,0,0,0,0,0,0,0,0,0",
76
+ "vocab_size": 49152,
77
+ "tie_weights": false,
78
+ "weight_decay_on_embedding": false,
79
+ "add_embedding_dropout": false,
80
+ "pad_token_id": 0
81
+ }
logs/56000/_CHECKPOINT_METADATA ADDED
@@ -0,0 +1 @@
 
 
1
+ {"item_handlers": {"default": "orbax.checkpoint._src.handlers.standard_checkpoint_handler.StandardCheckpointHandler"}, "metrics": {}, "performance_metrics": {}, "init_timestamp_nsecs": 1761150675355040026, "commit_timestamp_nsecs": 1761150676191431438, "custom_metadata": {}}
logs/56000/default/_METADATA ADDED
@@ -0,0 +1 @@
 
 
1
+ {"tree_metadata": {"('lm_head', 'kernel', 'value')": {"key_metadata": [{"key": "lm_head", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64, 49152]}}, "('token_embedding', 'embedding', 'value')": {"key_metadata": [{"key": "token_embedding", "key_type": 2}, {"key": "embedding", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6144, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 1, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'k_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "k_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'learnable_skip', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, 
{"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "learnable_skip", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": 
"jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'outnorm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "outnorm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_down', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_down", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_up', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_up", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64, 2048]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'q_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "q_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'v_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "v_proj", 
"key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64]}}, "('xlstm_block_stack', 'post_blocks_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "post_blocks_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64]}}}, "use_zarr3": false, "store_array_data_equal_to_fill_value": true, "custom_metadata": null}
logs/56000/default/_sharding ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bG1faGVhZC5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","dG9rZW5fZW1iZWRkaW5nLmVtYmVkZGluZy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], 
[{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwub3V0bm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252LmJpYXMudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252Lmtlcm5lbC52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmtfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmxlYXJuYWJsZV9za2lwLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfZG93bi5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": 
[\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfdXAua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnFfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnZfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtX25vcm0uc2NhbGUudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2sucG9zdF9ibG9ja3Nfbm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}"}
logs/56000/default/array_metadatas/process_0 ADDED
@@ -0,0 +1 @@
 
 
1
+ {"array_metadatas": [{"array_metadata": {"param_name": "lm_head.kernel.value", "write_shape": [64, 49152], "chunk_shape": [64, 49152], "ext_metadata": null}}, {"array_metadata": {"param_name": "token_embedding.embedding.value", "write_shape": [6144, 512], "chunk_shape": [6144, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.bias.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.kernel.value", "write_shape": [12, 4, 1, 128], "chunk_shape": [12, 4, 1, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.k_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.learnable_skip.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.outnorm.scale.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_down.kernel.value", "write_shape": [12, 128, 512], 
"chunk_shape": [12, 128, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_up.kernel.value", "write_shape": [12, 64, 2048], "chunk_shape": [12, 64, 2048], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.q_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.v_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm_norm.scale.value", "write_shape": [12, 64], "chunk_shape": [12, 64], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.post_blocks_norm.scale.value", "write_shape": [64], "chunk_shape": [64], "ext_metadata": null}}]}
logs/56000/default/d/d2faa21b74d6262917c3f434015260c2 ADDED
Binary file (1.69 kB). View file
 
logs/56000/default/manifest.ocdbt ADDED
Binary file (117 Bytes). View file
 
logs/56000/default/ocdbt.process_0/d/3e7dd4d7d62ed74ff3a3008771491be8 ADDED
Binary file (199 Bytes). View file
 
logs/56000/default/ocdbt.process_0/d/589bbb1911ef77a68bd39d66cb5ad987 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eaee57fb3f8c9d549f5453721c7266aad4fc9d2efed75d515fea7e35b63461da
3
+ size 16465920
logs/56000/default/ocdbt.process_0/d/73f37c4291d4f4108e7c0be33fa9ece1 ADDED
Binary file (559 Bytes). View file
 
logs/56000/default/ocdbt.process_0/d/b7cccee99dacbfc00deb918b7696e346 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:820e1281fbbd8366ac393a5c30e7c0128e3616e8337ca5b604dfa678df16fd56
3
+ size 85221376
logs/56000/default/ocdbt.process_0/manifest.ocdbt ADDED
Binary file (262 Bytes). View file
 
logs/58000/_CHECKPOINT_METADATA ADDED
@@ -0,0 +1 @@
 
 
1
+ {"item_handlers": {"default": "orbax.checkpoint._src.handlers.standard_checkpoint_handler.StandardCheckpointHandler"}, "metrics": {}, "performance_metrics": {}, "init_timestamp_nsecs": 1761150766941610609, "commit_timestamp_nsecs": 1761150767825678391, "custom_metadata": {}}
logs/58000/default/_METADATA ADDED
@@ -0,0 +1 @@
 
 
1
+ {"tree_metadata": {"('lm_head', 'kernel', 'value')": {"key_metadata": [{"key": "lm_head", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64, 49152]}}, "('token_embedding', 'embedding', 'value')": {"key_metadata": [{"key": "token_embedding", "key_type": 2}, {"key": "embedding", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6144, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 1, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'k_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "k_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'learnable_skip', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, 
{"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "learnable_skip", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": 
"jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'outnorm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "outnorm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_down', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_down", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_up', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_up", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64, 2048]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'q_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "q_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'v_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "v_proj", 
"key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64]}}, "('xlstm_block_stack', 'post_blocks_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "post_blocks_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64]}}}, "use_zarr3": false, "store_array_data_equal_to_fill_value": true, "custom_metadata": null}
logs/58000/default/_sharding ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bG1faGVhZC5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","dG9rZW5fZW1iZWRkaW5nLmVtYmVkZGluZy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], 
[{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwub3V0bm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252LmJpYXMudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252Lmtlcm5lbC52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmtfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmxlYXJuYWJsZV9za2lwLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfZG93bi5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": 
[\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfdXAua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnFfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnZfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtX25vcm0uc2NhbGUudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2sucG9zdF9ibG9ja3Nfbm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}"}
logs/58000/default/array_metadatas/process_0 ADDED
@@ -0,0 +1 @@
 
 
1
+ {"array_metadatas": [{"array_metadata": {"param_name": "lm_head.kernel.value", "write_shape": [64, 49152], "chunk_shape": [64, 49152], "ext_metadata": null}}, {"array_metadata": {"param_name": "token_embedding.embedding.value", "write_shape": [6144, 512], "chunk_shape": [6144, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.bias.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.kernel.value", "write_shape": [12, 4, 1, 128], "chunk_shape": [12, 4, 1, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.k_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.learnable_skip.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.outnorm.scale.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_down.kernel.value", "write_shape": [12, 128, 512], 
"chunk_shape": [12, 128, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_up.kernel.value", "write_shape": [12, 64, 2048], "chunk_shape": [12, 64, 2048], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.q_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.v_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm_norm.scale.value", "write_shape": [12, 64], "chunk_shape": [12, 64], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.post_blocks_norm.scale.value", "write_shape": [64], "chunk_shape": [64], "ext_metadata": null}}]}
logs/58000/default/d/5c12eab1ced4990bdc0f82c4d8394dfe ADDED
Binary file (1.69 kB). View file
 
logs/58000/default/manifest.ocdbt ADDED
Binary file (117 Bytes). View file
 
logs/58000/default/ocdbt.process_0/d/d1ae375ddade45c9c8ea42c34a429dba ADDED
Binary file (559 Bytes). View file
 
logs/58000/default/ocdbt.process_0/d/da222406832c4aae4cfe890d233bea85 ADDED
Binary file (199 Bytes). View file
 
logs/58000/default/ocdbt.process_0/d/ec9e13c2502c3b87b5d5b37860c03ad2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:163595e099050baecbf46ff01560ab28e5c8702a310cd9c665c153061864ba22
3
+ size 85213184
logs/58000/default/ocdbt.process_0/d/f72b9e7aa4ac1fe48d1dc7bfa695d52e ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf69569633994461bc99856879d9c17d6fccdaafabf8679f4a56f84876f06b93
3
+ size 16465920
logs/58000/default/ocdbt.process_0/manifest.ocdbt ADDED
Binary file (262 Bytes). View file
 
logs/60000/_CHECKPOINT_METADATA ADDED
@@ -0,0 +1 @@
 
 
1
+ {"item_handlers": {"default": "orbax.checkpoint._src.handlers.standard_checkpoint_handler.StandardCheckpointHandler"}, "metrics": {}, "performance_metrics": {}, "init_timestamp_nsecs": 1761150858253452873, "commit_timestamp_nsecs": 1761150859069589925, "custom_metadata": {}}
logs/60000/default/_METADATA ADDED
@@ -0,0 +1 @@
 
 
1
+ {"tree_metadata": {"('lm_head', 'kernel', 'value')": {"key_metadata": [{"key": "lm_head", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64, 49152]}}, "('token_embedding', 'embedding', 'value')": {"key_metadata": [{"key": "token_embedding", "key_type": 2}, {"key": "embedding", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [6144, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'conv1d', 'conv', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "conv1d", "key_type": 2}, {"key": "conv", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 1, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'k_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "k_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'learnable_skip', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, 
{"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "learnable_skip", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'fgate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "fgate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'bias', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "bias", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'igate', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "igate", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": 
"jax.Array", "skip_deserialize": false, "write_shape": [12, 384, 4]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'mlstm_cell', 'outnorm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "mlstm_cell", "key_type": 2}, {"key": "outnorm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_down', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_down", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 128, 512]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'proj_up', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "proj_up", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64, 2048]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'q_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "q_proj", "key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm', 'v_proj', 'kernel', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm", "key_type": 2}, {"key": "v_proj", 
"key_type": 2}, {"key": "kernel", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 4, 32, 32]}}, "('xlstm_block_stack', 'blocks', 'xlstm_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "blocks", "key_type": 2}, {"key": "xlstm_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [12, 64]}}, "('xlstm_block_stack', 'post_blocks_norm', 'scale', 'value')": {"key_metadata": [{"key": "xlstm_block_stack", "key_type": 2}, {"key": "post_blocks_norm", "key_type": 2}, {"key": "scale", "key_type": 2}, {"key": "value", "key_type": 2}], "value_metadata": {"value_type": "jax.Array", "skip_deserialize": false, "write_shape": [64]}}}, "use_zarr3": false, "store_array_data_equal_to_fill_value": true, "custom_metadata": null}
logs/60000/default/_sharding ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bG1faGVhZC5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","dG9rZW5fZW1iZWRkaW5nLmVtYmVkZGluZy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuZmdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUuYmlhcy52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwuaWdhdGUua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], 
[{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLm1sc3RtX2NlbGwub3V0bm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252LmJpYXMudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmNvbnYxZC5jb252Lmtlcm5lbC52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmtfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLmxlYXJuYWJsZV9za2lwLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfZG93bi5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": 
[\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnByb2pfdXAua2VybmVsLnZhbHVl":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnFfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtLnZfcHJvai5rZXJuZWwudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2suYmxvY2tzLnhsc3RtX25vcm0uc2NhbGUudmFsdWU=":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [null, \"tp\"], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}","eGxzdG1fYmxvY2tfc3RhY2sucG9zdF9ibG9ja3Nfbm9ybS5zY2FsZS52YWx1ZQ==":"{\"sharding_type\": \"NamedSharding\", \"shape\": [8, 1], \"axis_names\": [\"dp\", \"tp\"], \"partition_spec\": [], \"device_mesh\": {\"mesh\": [[{\"id\": 0}], [{\"id\": 1}], [{\"id\": 2}], [{\"id\": 3}], [{\"id\": 7}], [{\"id\": 6}], [{\"id\": 5}], [{\"id\": 4}]]}}"}
logs/60000/default/array_metadatas/process_0 ADDED
@@ -0,0 +1 @@
 
 
1
+ {"array_metadatas": [{"array_metadata": {"param_name": "lm_head.kernel.value", "write_shape": [64, 49152], "chunk_shape": [64, 49152], "ext_metadata": null}}, {"array_metadata": {"param_name": "token_embedding.embedding.value", "write_shape": [6144, 512], "chunk_shape": [6144, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.bias.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.conv1d.conv.kernel.value", "write_shape": [12, 4, 1, 128], "chunk_shape": [12, 4, 1, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.k_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.learnable_skip.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.fgate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.bias.value", "write_shape": [12, 4], "chunk_shape": [12, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.igate.kernel.value", "write_shape": [12, 384, 4], "chunk_shape": [12, 384, 4], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.mlstm_cell.outnorm.scale.value", "write_shape": [12, 128], "chunk_shape": [12, 128], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_down.kernel.value", "write_shape": [12, 128, 512], 
"chunk_shape": [12, 128, 512], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.proj_up.kernel.value", "write_shape": [12, 64, 2048], "chunk_shape": [12, 64, 2048], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.q_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm.v_proj.kernel.value", "write_shape": [12, 4, 32, 32], "chunk_shape": [12, 4, 32, 32], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.blocks.xlstm_norm.scale.value", "write_shape": [12, 64], "chunk_shape": [12, 64], "ext_metadata": null}}, {"array_metadata": {"param_name": "xlstm_block_stack.post_blocks_norm.scale.value", "write_shape": [64], "chunk_shape": [64], "ext_metadata": null}}]}
logs/60000/default/d/47276f771abe8618db20f064c855d18b ADDED
Binary file (1.7 kB). View file
 
logs/60000/default/manifest.ocdbt ADDED
Binary file (117 Bytes). View file
 
logs/60000/default/ocdbt.process_0/d/5422c45e937337d9c874e919f365f8bb ADDED
Binary file (559 Bytes). View file
 
logs/60000/default/ocdbt.process_0/d/5d4ce2d31fe50a5e794e04730d5408b9 ADDED
Binary file (199 Bytes). View file
 
logs/60000/default/ocdbt.process_0/d/a649a7f03f371810ad4a4d0072986f38 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fa4c45e567d3a34e091f09099fe1e455323cc28221527d176e696206607403c
3
+ size 16461824
logs/60000/default/ocdbt.process_0/d/bf2745114a156076b53b8b5635b0aeac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f07d9f22eee85b5671cdd0e0f37204f9170e44eb3bc9f41fa628bea6cc9e90de
3
+ size 85209088
logs/60000/default/ocdbt.process_0/manifest.ocdbt ADDED
Binary file (265 Bytes). View file
 
logs/xLSTM-TPU/events.out.tfevents.1761148059.5b24db4313ba.16459.0.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b395e4454ae503b6908ee750a088e9a267ca0a930c9b9d2532304df6d517beb2
3
+ size 1566297
timing_summary.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_training_duration_seconds": 2785.887582896,
3
+ "total_training_duration_hours": 0.7738576619155555,
4
+ "average_epoch_duration_seconds": 91.71301750610003,
5
+ "num_epochs_completed": 30,
6
+ "num_evaluations_completed": 30
7
+ }
train_history.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_training_duration": 2785.887582896,
3
+ "avg_epoch_duration": 91.71301750610003,
4
+ "num_epochs_completed": 30,
5
+ "global_steps": 60000,
6
+ "global_optimizer_steps": 12000,
7
+ "params": {
8
+ "millions": 70.77,
9
+ "billions": 0.07
10
+ }
11
+ }
trainer_config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer": "HuggingFaceTB/SmolLM2-135M",
3
+ "dtype": "fp32",
4
+ "param_dtype": "bf16",
5
+ "num_train_epochs": 30,
6
+ "per_device_train_batch_size": 32,
7
+ "per_device_eval_batch_size": 32,
8
+ "gradient_accumulation_steps": 5,
9
+ "seed": 42,
10
+ "learning_rate": 0.0003,
11
+ "weight_decay": 0.01,
12
+ "adam_beta1": 0.9,
13
+ "adam_beta2": 0.999,
14
+ "warmup_ratio": 0.2,
15
+ "max_grad_norm": 1.0,
16
+ "logging_steps": 200,
17
+ "output_dir": "./artifacts/",
18
+ "logging_dir": "./artifacts/logs/",
19
+ "run_name": "train",
20
+ "best_metric_key": "perplexity",
21
+ "best_n_to_keep": 3,
22
+ "hub_model_id": "thiomajid/xLSTM-TPU",
23
+ "hub_private_repo": false,
24
+ "upload_message": "xLSTM is ready for image generation",
25
+ "train_dataset_url": "roneneldan/TinyStories",
26
+ "train_subset": null,
27
+ "train_split": "train",
28
+ "train_samples": 64000,
29
+ "eval_dataset_url": "roneneldan/TinyStories",
30
+ "eval_subset": null,
31
+ "eval_split": "validation",
32
+ "eval_samples": 3200,
33
+ "dataloader_drop_last": true,
34
+ "dataloader_num_workers": 4,
35
+ "worker_buffer_size": 2,
36
+ "text_column": "text",
37
+ "use_dataset_cache": true,
38
+ "dataset_cache_dir": "./.hf_data_cache",
39
+ "mesh_shape": [
40
+ 8,
41
+ 1
42
+ ],
43
+ "axis_names": [
44
+ "dp",
45
+ "tp"
46
+ ]
47
+ }