htlou committed (verified)
Commit 8d3b74e · Parent(s): 60cce00

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.

Files changed (50)
  1. .gitattributes +4 -0
  2. arguments.yaml +51 -0
  3. config.json +0 -0
  4. environ.txt +164 -0
  5. preprocessor_config.json +28 -0
  6. processor_config.json +5 -0
  7. pytorch_model.bin +3 -0
  8. script.sh +84 -0
  9. slice_1200/arguments.yaml +51 -0
  10. slice_1200/config.json +0 -0
  11. slice_1200/environ.txt +164 -0
  12. slice_1200/preprocessor_config.json +28 -0
  13. slice_1200/processor_config.json +5 -0
  14. slice_1200/pytorch_model.bin +3 -0
  15. slice_1200/script.sh +84 -0
  16. slice_1200/special_tokens_map.json +37 -0
  17. slice_1200/tokenizer.json +0 -0
  18. slice_1200/tokenizer_config.json +0 -0
  19. slice_1200/wandb/debug-internal.log +22 -0
  20. slice_1200/wandb/debug.log +33 -0
  21. slice_1200/wandb/run-20250101_084116-coewtb43/files/config.yaml +98 -0
  22. slice_1200/wandb/run-20250101_084116-coewtb43/files/output.log +224 -0
  23. slice_1200/wandb/run-20250101_084116-coewtb43/files/requirements.txt +248 -0
  24. slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json +112 -0
  25. slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json +1 -0
  26. slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log +22 -0
  27. slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug.log +33 -0
  28. slice_1200/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb +3 -0
  29. slice_400/arguments.yaml +51 -0
  30. slice_400/config.json +0 -0
  31. slice_400/environ.txt +164 -0
  32. slice_400/preprocessor_config.json +28 -0
  33. slice_400/processor_config.json +5 -0
  34. slice_400/pytorch_model.bin +3 -0
  35. slice_400/script.sh +84 -0
  36. slice_400/special_tokens_map.json +37 -0
  37. slice_400/tokenizer.json +0 -0
  38. slice_400/tokenizer_config.json +0 -0
  39. slice_400/wandb/debug-internal.log +22 -0
  40. slice_400/wandb/debug.log +33 -0
  41. slice_400/wandb/run-20250101_084116-coewtb43/files/config.yaml +98 -0
  42. slice_400/wandb/run-20250101_084116-coewtb43/files/output.log +224 -0
  43. slice_400/wandb/run-20250101_084116-coewtb43/files/requirements.txt +248 -0
  44. slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json +112 -0
  45. slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json +1 -0
  46. slice_400/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log +22 -0
  47. slice_400/wandb/run-20250101_084116-coewtb43/logs/debug.log +33 -0
  48. slice_400/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb +3 -0
  49. slice_800/arguments.yaml +51 -0
  50. slice_800/config.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+slice_1200/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
+slice_400/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
+slice_800/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
arguments.yaml ADDED
@@ -0,0 +1,51 @@
+data_cfgs:
+  eval_data_files: null
+  eval_datasets: null
+  eval_optional_args: []
+  eval_size: null
+  eval_split: null
+  eval_subset: null
+  eval_template: null
+  train_data_files: q0_40_preference.pt
+  train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
+  train_optional_args: []
+  train_size: null
+  train_split: train
+  train_subset: null
+  train_template: Chameleon_preference
+logger_cfgs:
+  cache_dir: null
+  log_project: align-anything
+  log_run_name: dpo
+  log_type: wandb
+  output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
+  save_interval: 400.0
+model_cfgs:
+  model_max_length: 4096
+  model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
+  trust_remote_code: true
+  special_tokens: null
+train_cfgs:
+  adam_betas:
+  - 0.9
+  - 0.95
+  bf16: true
+  ds_cfgs: ds_z3_config.json
+  epochs: 3.0
+  eval_interval: 10
+  eval_strategy: epoch
+  fp16: false
+  freeze_language_model: true
+  freeze_mm_proj: true
+  freeze_vision_tower: false
+  gradient_accumulation_steps: 2.0
+  gradient_checkpointing: true
+  learning_rate: 1.0e-06
+  lr_scheduler_type: cosine
+  lr_warmup_ratio: 0.03
+  per_device_eval_batch_size: 4.0
+  per_device_train_batch_size: 4.0
+  regularization: 0.001
+  scale_coeff: 0.1
+  seed: 42
+  weight_decay: 0.01
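The configuration above records a DPO run on the q0_40_preference split with the vision tower unfrozen and the language model and multimodal projector frozen. As a reading aid, a minimal sketch (assuming PyYAML is available; the world size of 8 is taken from WORLD_SIZE in environ.txt below) of how the file can be parsed and the effective global batch size derived:

```python
# Minimal sketch: parse arguments.yaml and derive the effective global batch size.
import yaml

with open("arguments.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["train_cfgs"]
world_size = 8  # WANDB run used 8 GPUs; WORLD_SIZE=8 is recorded in environ.txt
global_batch = (int(train["per_device_train_batch_size"])    # 4
                * int(train["gradient_accumulation_steps"])  # 2
                * world_size)                                # 8
print(global_batch)  # 64 preference pairs per optimizer step
```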
config.json ADDED
The diff for this file is too large to render. See raw diff
 
environ.txt ADDED
@@ -0,0 +1,164 @@
1
+ ADDR2LINE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-addr2line
2
+ AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ar
3
+ AS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-as
4
+ BROWSER=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/bin/helpers/browser.sh
5
+ BUILD=x86_64-conda-linux-gnu
6
+ CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
7
+ CC_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cc
8
+ CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
9
+ CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/jy-a:/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot/usr
10
+ COLORTERM=truecolor
11
+ CONDA_BACKUP_ADDR2LINE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-addr2line
12
+ CONDA_BACKUP_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ar
13
+ CONDA_BACKUP_AS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-as
14
+ CONDA_BACKUP_BUILD=x86_64-conda-linux-gnu
15
+ CONDA_BACKUP_CC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
16
+ CONDA_BACKUP_CC_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
17
+ CONDA_BACKUP_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
18
+ CONDA_BACKUP_CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/hantao_proxy:/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot/usr
19
+ CONDA_BACKUP_CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot
20
+ CONDA_BACKUP_CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
21
+ CONDA_BACKUP_CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
22
+ CONDA_BACKUP_CPP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cpp
23
+ CONDA_BACKUP_CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
24
+ CONDA_BACKUP_CXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
25
+ CONDA_BACKUP_CXXFILT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++filt
26
+ CONDA_BACKUP_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
27
+ CONDA_BACKUP_CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
28
+ CONDA_BACKUP_DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
29
+ CONDA_BACKUP_DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
30
+ CONDA_BACKUP_DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
31
+ CONDA_BACKUP_DWP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-dwp
32
+ CONDA_BACKUP_ELFEDIT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-elfedit
33
+ CONDA_BACKUP_GCC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc
34
+ CONDA_BACKUP_GCC_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ar
35
+ CONDA_BACKUP_GCC_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-nm
36
+ CONDA_BACKUP_GCC_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ranlib
37
+ CONDA_BACKUP_GPROF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gprof
38
+ CONDA_BACKUP_GXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-g++
39
+ CONDA_BACKUP_HOST=x86_64-conda-linux-gnu
40
+ CONDA_BACKUP_LD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld
41
+ CONDA_BACKUP_LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib/stubs
42
+ CONDA_BACKUP_LD_GOLD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld.gold
43
+ CONDA_BACKUP_MESON_ARGS=-Dbuildtype=release
44
+ CONDA_BACKUP_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-nm
45
+ CONDA_BACKUP_OBJCOPY=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objcopy
46
+ CONDA_BACKUP_OBJDUMP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objdump
47
+ CONDA_BACKUP_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ranlib
48
+ CONDA_BACKUP_READELF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-readelf
49
+ CONDA_BACKUP_SIZE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-size
50
+ CONDA_BACKUP_STRINGS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strings
51
+ CONDA_BACKUP_STRIP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strip
52
+ CONDA_BACKUP__CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
53
+ CONDA_BACKUP_build_alias=x86_64-conda-linux-gnu
54
+ CONDA_BACKUP_host_alias=x86_64-conda-linux-gnu
55
+ CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot
56
+ CONDA_DEFAULT_ENV=hantao_stable
57
+ CONDA_EXE=/data/align-anything/miniconda3/bin/conda
58
+ CONDA_PREFIX=/data/align-anything/miniconda3/envs/hantao_stable
59
+ CONDA_PREFIX_1=/home/align-anything/miniconda3
60
+ CONDA_PREFIX_10=/data/align-anything/miniconda3/envs/hantao_proxy
61
+ CONDA_PREFIX_2=/data/align-anything/miniconda3/envs/jy-a
62
+ CONDA_PREFIX_3=/data/align-anything/miniconda3
63
+ CONDA_PREFIX_4=/data/align-anything/miniconda3/envs/hantao_stable
64
+ CONDA_PREFIX_5=/data/align-anything/miniconda3/envs/hantao_cham
65
+ CONDA_PREFIX_6=/data/align-anything/miniconda3/envs/hantao_stable
66
+ CONDA_PREFIX_7=/data/align-anything/miniconda3/envs/hantao_stream
67
+ CONDA_PREFIX_8=/data/align-anything/miniconda3/envs/hantao_proxy
68
+ CONDA_PREFIX_9=/data/align-anything/miniconda3/envs/hantao_stable
69
+ CONDA_PROMPT_MODIFIER=(hantao_stable)
70
+ CONDA_PYTHON_EXE=/data/align-anything/miniconda3/bin/python
71
+ CONDA_ROOT=/home/align-anything/miniconda3
72
+ CONDA_SHLVL=11
73
+ CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
74
+ CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
75
+ CPP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cpp
76
+ CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
77
+ CROSS_RANK=0
78
+ CROSS_SIZE=1
79
+ CUDA_MODULE_LOADING=LAZY
80
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
81
+ CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
82
+ CXXFILT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++filt
83
+ CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
84
+ CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
85
+ DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
86
+ DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
87
+ DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
88
+ DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
89
+ DWP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-dwp
90
+ ELFEDIT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-elfedit
91
+ GCC=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc
92
+ GCC_AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ar
93
+ GCC_NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-nm
94
+ GCC_RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ranlib
95
+ GIT_ASKPASS=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass.sh
96
+ GPROF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gprof
97
+ GXX=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-g++
98
+ HOME=/home/align-anything
99
+ HOST=x86_64-conda-linux-gnu
100
+ LANG=en_US.UTF-8
101
+ LD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld
102
+ LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
103
+ LD_GOLD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld.gold
104
+ LD_LIBRARY_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/../../lib64:
105
+ LESSCLOSE=/usr/bin/lesspipe %s %s
106
+ LESSOPEN=| /usr/bin/lesspipe %s
107
+ LOCAL_RANK=0
108
+ LOCAL_SIZE=8
109
+ LOGLEVEL=WARNING
110
+ LOGNAME=align-anything
111
+ LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
112
+ MASTER_ADDR=127.0.0.1
113
+ MASTER_PORT=52201
114
+ MOTD_SHOWN=pam
115
+ NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-nm
116
+ NVCC_PREPEND_FLAGS= -ccbin=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
117
+ OBJCOPY=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objcopy
118
+ OBJDUMP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objdump
119
+ OLDPWD=/data/align-anything/hantao/LLaMA-Factory
120
+ PATH=/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/bin:/data/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
121
+ PWD=/data/align-anything/hantao/align-anything/scripts
122
+ PYGAME_HIDE_SUPPORT_PROMPT=1
123
+ PYTHONHASHSEED=42
124
+ PYTHONPATH=/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything
125
+ QT_QPA_FONTDIR=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/fonts
126
+ QT_QPA_PLATFORM_PLUGIN_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/plugins
127
+ RANK=0
128
+ RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ranlib
129
+ READELF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-readelf
130
+ SHELL=/bin/bash
131
+ SHLVL=3
132
+ SIZE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-size
133
+ SSH_CLIENT=117.136.0.149 36325 30400
134
+ SSH_CONNECTION=111.205.232.251 37945 10.10.212.194 30400
135
+ SSL_CERT_DIR=/usr/lib/ssl/certs
136
+ SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
137
+ STRINGS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strings
138
+ STRIP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strip
139
+ TERM=screen
140
+ TERM_PROGRAM=vscode
141
+ TERM_PROGRAM_VERSION=0.41.3
142
+ TMUX=/tmp/tmux-2000/default,34082,51
143
+ TMUX_PANE=%59
144
+ TRITON_CACHE_DIR=/home/align-anything/cache/triton
145
+ USER=align-anything
146
+ VSCODE_GIT_ASKPASS_EXTRA_ARGS=
147
+ VSCODE_GIT_ASKPASS_MAIN=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass-main.js
148
+ VSCODE_GIT_ASKPASS_NODE=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/node
149
+ VSCODE_GIT_IPC_HANDLE=/run/user/2000/vscode-git-ef8058c264.sock
150
+ VSCODE_IPC_HOOK_CLI=/run/user/2000/vscode-ipc-db013265-9a8a-4fb7-ba94-00b66d808feb.sock
151
+ WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
152
+ WANDB_MODE=online
153
+ WANDB_SERVICE=2-675697-tcp-localhost-45541
154
+ WORLD_SIZE=8
155
+ XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
156
+ XDG_RUNTIME_DIR=/run/user/2000
157
+ XDG_SESSION_CLASS=user
158
+ XDG_SESSION_ID=11
159
+ XDG_SESSION_TYPE=tty
160
+ _=/data/align-anything/miniconda3/envs/hantao_stable/bin/deepspeed
161
+ _CE_CONDA=
162
+ _CE_M=
163
+ build_alias=x86_64-conda-linux-gnu
164
+ host_alias=x86_64-conda-linux-gnu
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+{
+  "crop_size": {
+    "height": 512,
+    "width": 512
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    1.0,
+    1.0,
+    1.0
+  ],
+  "image_processor_type": "ChameleonImageProcessor",
+  "image_std": [
+    1.0,
+    1.0,
+    1.0
+  ],
+  "processor_class": "ChameleonProcessor",
+  "resample": 1,
+  "rescale_factor": 0.0078,
+  "size": {
+    "shortest_edge": 512
+  }
+}
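This is the Chameleon image-processing recipe saved alongside the weights: resize to a 512-pixel shortest edge, center-crop to 512×512, then rescale by 0.0078 and normalize with mean and std of 1.0, which maps uint8 pixels roughly into [-1.0, 0.99]. A minimal loading sketch, assuming a transformers version with Chameleon support and a local download of this repository (paths and the input image are illustrative):

```python
# Sketch: load the processor defined by preprocessor_config.json / processor_config.json
# and run one image through it. "example.jpg" and the folder path are placeholders.
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/this/repo")  # repo root or a slice_* folder
image = Image.open("example.jpg")
inputs = processor(images=image, text="<image> Describe the picture.", return_tensors="pt")
print(inputs["pixel_values"].shape)  # expected: (1, 3, 512, 512) after resize + center crop
```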
processor_config.json ADDED
@@ -0,0 +1,5 @@
+{
+  "image_seq_length": 1024,
+  "image_token": "<image>",
+  "processor_class": "ChameleonProcessor"
+}
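Here image_seq_length is the number of token positions the processor reserves per `<image>` placeholder, so a single image appears to consume a quarter of the 4096-token model_max_length set in arguments.yaml. A plain-arithmetic budget check (no library assumptions):

```python
# Rough context-budget check for one image per training example.
image_seq_length = 1024   # processor_config.json
model_max_length = 4096   # arguments.yaml (model_cfgs.model_max_length)
print(model_max_length - image_seq_length)  # 3072 tokens left for the text part
```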
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43395241aee86be4cd8c53758c653e006b4e5ddd39103fd6e68ea3e6882d2269
+size 14086364170
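What is stored here is a Git LFS pointer, not the ~14 GB weight file itself. A minimal, standard-library-only sketch for verifying that a downloaded pytorch_model.bin matches the pointer's SHA-256 and size:

```python
# Sketch: check a downloaded pytorch_model.bin against the LFS pointer above.
import hashlib
import os

EXPECTED_OID = "43395241aee86be4cd8c53758c653e006b4e5ddd39103fd6e68ea3e6882d2269"
EXPECTED_SIZE = 14086364170
path = "pytorch_model.bin"  # local path after the LFS object has been fetched

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"
assert sha.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("pointer matches the downloaded weights")
```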
script.sh ADDED
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+#
+# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+export CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
+export CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
+
+export TRITON_CACHE_DIR="/home/align-anything/cache/triton"
+
+export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
+export WANDB_MODE=online
+
+MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/chameleon-7b"
+
+DATASET_PATH=(
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_cosi_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_l0_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_random/tokenized"
+)
+
+DATASET_NAME=(
+    "q0_10_preference"
+    "q0_20_preference"
+    "q0_30_preference"
+    "q0_40_preference"
+    "q0_50_preference"
+    "q0_60_preference"
+    "q0_70_preference"
+    "q0_80_preference"
+    "q0_90_preference"
+)
+
+OUTPUT_PATH="/data/align-anything/hantao/align-anything/outputs/mm_interp"
+mkdir -p "$OUTPUT_PATH"
+
+# Launch one DPO run per (dataset directory, dataset file) pair.
+
+for dataset_path in "${DATASET_PATH[@]}"; do
+    for dataset_name in "${DATASET_NAME[@]}"; do
+        TRAIN_DATASETS=$dataset_path
+
+        # dataset middle name: parent directory of the tokenized folder (command substitution)
+        middle_name=$(echo "$dataset_path" | awk -F'/' '{print $(NF-1)}')
+        OUTPUT_DIR=$OUTPUT_PATH/$middle_name/$dataset_name
+        mkdir -p "$OUTPUT_DIR"
+        echo "Training on $TRAIN_DATASETS, output to $OUTPUT_DIR"
+        # Source the setup script
+        source ./setup.sh
+
+        # Execute deepspeed command
+        deepspeed \
+            --master_port ${MASTER_PORT} \
+            --module align_anything.trainers.text_image_to_text_image.dpo \
+            --model_name_or_path ${MODEL_NAME_OR_PATH} \
+            --train_datasets ${TRAIN_DATASETS} \
+            --output_dir ${OUTPUT_DIR} \
+            --per_device_train_batch_size 4 \
+            --per_device_eval_batch_size 4 \
+            --gradient_accumulation_steps 2 \
+            --train_template Chameleon_preference \
+            --train_split train \
+            --train_data_files ${dataset_name}.pt \
+            --learning_rate 1e-6 \
+            --epochs 3 \
+            --lr_scheduler_type cosine \
+            --save_interval 400
+
+        bash /data/align-anything/hantao/align-anything/outputs/cut.sh "$OUTPUT_DIR"
+    done
+done
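The nested loops launch one DPO run per (dataset directory, dataset file) pair, 4 × 9 = 36 runs in total, and `--save_interval 400` appears to be what produced the slice_400, slice_800, and slice_1200 checkpoint folders in this upload. A small sketch (hypothetical local paths, assuming the folder layout shown in the file list above) for enumerating those sliced checkpoints:

```python
# Sketch: list the slice_<step> checkpoints saved every 400 steps in this repo.
from pathlib import Path

repo = Path(".")  # local clone/download of this repository
ckpts = sorted(repo.glob("slice_*/pytorch_model.bin"),
               key=lambda p: int(p.parent.name.split("_")[1]))
for ckpt in ckpts:
    step = int(ckpt.parent.name.split("_")[1])
    print(f"step {step}: {ckpt} ({ckpt.stat().st_size / 1e9:.1f} GB)")
```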
slice_1200/arguments.yaml ADDED
@@ -0,0 +1,51 @@
+data_cfgs:
+  eval_data_files: null
+  eval_datasets: null
+  eval_optional_args: []
+  eval_size: null
+  eval_split: null
+  eval_subset: null
+  eval_template: null
+  train_data_files: q0_40_preference.pt
+  train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
+  train_optional_args: []
+  train_size: null
+  train_split: train
+  train_subset: null
+  train_template: Chameleon_preference
+logger_cfgs:
+  cache_dir: null
+  log_project: align-anything
+  log_run_name: dpo
+  log_type: wandb
+  output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
+  save_interval: 400.0
+model_cfgs:
+  model_max_length: 4096
+  model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
+  trust_remote_code: true
+  special_tokens: null
+train_cfgs:
+  adam_betas:
+  - 0.9
+  - 0.95
+  bf16: true
+  ds_cfgs: ds_z3_config.json
+  epochs: 3.0
+  eval_interval: 10
+  eval_strategy: epoch
+  fp16: false
+  freeze_language_model: true
+  freeze_mm_proj: true
+  freeze_vision_tower: false
+  gradient_accumulation_steps: 2.0
+  gradient_checkpointing: true
+  learning_rate: 1.0e-06
+  lr_scheduler_type: cosine
+  lr_warmup_ratio: 0.03
+  per_device_eval_batch_size: 4.0
+  per_device_train_batch_size: 4.0
+  regularization: 0.001
+  scale_coeff: 0.1
+  seed: 42
+  weight_decay: 0.01
slice_1200/config.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_1200/environ.txt ADDED
@@ -0,0 +1,164 @@
1
+ ADDR2LINE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-addr2line
2
+ AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ar
3
+ AS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-as
4
+ BROWSER=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/bin/helpers/browser.sh
5
+ BUILD=x86_64-conda-linux-gnu
6
+ CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
7
+ CC_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cc
8
+ CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
9
+ CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/jy-a:/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot/usr
10
+ COLORTERM=truecolor
11
+ CONDA_BACKUP_ADDR2LINE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-addr2line
12
+ CONDA_BACKUP_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ar
13
+ CONDA_BACKUP_AS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-as
14
+ CONDA_BACKUP_BUILD=x86_64-conda-linux-gnu
15
+ CONDA_BACKUP_CC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
16
+ CONDA_BACKUP_CC_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
17
+ CONDA_BACKUP_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
18
+ CONDA_BACKUP_CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/hantao_proxy:/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot/usr
19
+ CONDA_BACKUP_CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot
20
+ CONDA_BACKUP_CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
21
+ CONDA_BACKUP_CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
22
+ CONDA_BACKUP_CPP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cpp
23
+ CONDA_BACKUP_CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
24
+ CONDA_BACKUP_CXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
25
+ CONDA_BACKUP_CXXFILT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++filt
26
+ CONDA_BACKUP_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
27
+ CONDA_BACKUP_CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
28
+ CONDA_BACKUP_DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
29
+ CONDA_BACKUP_DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
30
+ CONDA_BACKUP_DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
31
+ CONDA_BACKUP_DWP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-dwp
32
+ CONDA_BACKUP_ELFEDIT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-elfedit
33
+ CONDA_BACKUP_GCC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc
34
+ CONDA_BACKUP_GCC_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ar
35
+ CONDA_BACKUP_GCC_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-nm
36
+ CONDA_BACKUP_GCC_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ranlib
37
+ CONDA_BACKUP_GPROF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gprof
38
+ CONDA_BACKUP_GXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-g++
39
+ CONDA_BACKUP_HOST=x86_64-conda-linux-gnu
40
+ CONDA_BACKUP_LD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld
41
+ CONDA_BACKUP_LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib/stubs
42
+ CONDA_BACKUP_LD_GOLD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld.gold
43
+ CONDA_BACKUP_MESON_ARGS=-Dbuildtype=release
44
+ CONDA_BACKUP_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-nm
45
+ CONDA_BACKUP_OBJCOPY=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objcopy
46
+ CONDA_BACKUP_OBJDUMP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objdump
47
+ CONDA_BACKUP_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ranlib
48
+ CONDA_BACKUP_READELF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-readelf
49
+ CONDA_BACKUP_SIZE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-size
50
+ CONDA_BACKUP_STRINGS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strings
51
+ CONDA_BACKUP_STRIP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strip
52
+ CONDA_BACKUP__CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
53
+ CONDA_BACKUP_build_alias=x86_64-conda-linux-gnu
54
+ CONDA_BACKUP_host_alias=x86_64-conda-linux-gnu
55
+ CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot
56
+ CONDA_DEFAULT_ENV=hantao_stable
57
+ CONDA_EXE=/data/align-anything/miniconda3/bin/conda
58
+ CONDA_PREFIX=/data/align-anything/miniconda3/envs/hantao_stable
59
+ CONDA_PREFIX_1=/home/align-anything/miniconda3
60
+ CONDA_PREFIX_10=/data/align-anything/miniconda3/envs/hantao_proxy
61
+ CONDA_PREFIX_2=/data/align-anything/miniconda3/envs/jy-a
62
+ CONDA_PREFIX_3=/data/align-anything/miniconda3
63
+ CONDA_PREFIX_4=/data/align-anything/miniconda3/envs/hantao_stable
64
+ CONDA_PREFIX_5=/data/align-anything/miniconda3/envs/hantao_cham
65
+ CONDA_PREFIX_6=/data/align-anything/miniconda3/envs/hantao_stable
66
+ CONDA_PREFIX_7=/data/align-anything/miniconda3/envs/hantao_stream
67
+ CONDA_PREFIX_8=/data/align-anything/miniconda3/envs/hantao_proxy
68
+ CONDA_PREFIX_9=/data/align-anything/miniconda3/envs/hantao_stable
69
+ CONDA_PROMPT_MODIFIER=(hantao_stable)
70
+ CONDA_PYTHON_EXE=/data/align-anything/miniconda3/bin/python
71
+ CONDA_ROOT=/home/align-anything/miniconda3
72
+ CONDA_SHLVL=11
73
+ CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
74
+ CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
75
+ CPP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cpp
76
+ CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
77
+ CROSS_RANK=0
78
+ CROSS_SIZE=1
79
+ CUDA_MODULE_LOADING=LAZY
80
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
81
+ CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
82
+ CXXFILT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++filt
83
+ CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
84
+ CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
85
+ DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
86
+ DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
87
+ DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
88
+ DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
89
+ DWP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-dwp
90
+ ELFEDIT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-elfedit
91
+ GCC=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc
92
+ GCC_AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ar
93
+ GCC_NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-nm
94
+ GCC_RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ranlib
95
+ GIT_ASKPASS=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass.sh
96
+ GPROF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gprof
97
+ GXX=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-g++
98
+ HOME=/home/align-anything
99
+ HOST=x86_64-conda-linux-gnu
100
+ LANG=en_US.UTF-8
101
+ LD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld
102
+ LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
103
+ LD_GOLD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld.gold
104
+ LD_LIBRARY_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/../../lib64:
105
+ LESSCLOSE=/usr/bin/lesspipe %s %s
106
+ LESSOPEN=| /usr/bin/lesspipe %s
107
+ LOCAL_RANK=0
108
+ LOCAL_SIZE=8
109
+ LOGLEVEL=WARNING
110
+ LOGNAME=align-anything
111
+ LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
+ MASTER_ADDR=127.0.0.1
+ MASTER_PORT=52201
+ MOTD_SHOWN=pam
+ NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-nm
+ NVCC_PREPEND_FLAGS= -ccbin=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
+ OBJCOPY=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objcopy
+ OBJDUMP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objdump
+ OLDPWD=/data/align-anything/hantao/LLaMA-Factory
+ PATH=/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/bin:/data/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
+ PWD=/data/align-anything/hantao/align-anything/scripts
+ PYGAME_HIDE_SUPPORT_PROMPT=1
+ PYTHONHASHSEED=42
+ PYTHONPATH=/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything
+ QT_QPA_FONTDIR=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/fonts
+ QT_QPA_PLATFORM_PLUGIN_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/plugins
+ RANK=0
+ RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ranlib
+ READELF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-readelf
+ SHELL=/bin/bash
+ SHLVL=3
+ SIZE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-size
+ SSH_CLIENT=117.136.0.149 36325 30400
+ SSH_CONNECTION=111.205.232.251 37945 10.10.212.194 30400
+ SSL_CERT_DIR=/usr/lib/ssl/certs
+ SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
+ STRINGS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strings
+ STRIP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strip
+ TERM=screen
+ TERM_PROGRAM=vscode
+ TERM_PROGRAM_VERSION=0.41.3
+ TMUX=/tmp/tmux-2000/default,34082,51
+ TMUX_PANE=%59
+ TRITON_CACHE_DIR=/home/align-anything/cache/triton
+ USER=align-anything
+ VSCODE_GIT_ASKPASS_EXTRA_ARGS=
+ VSCODE_GIT_ASKPASS_MAIN=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass-main.js
+ VSCODE_GIT_ASKPASS_NODE=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/node
+ VSCODE_GIT_IPC_HANDLE=/run/user/2000/vscode-git-ef8058c264.sock
+ VSCODE_IPC_HOOK_CLI=/run/user/2000/vscode-ipc-db013265-9a8a-4fb7-ba94-00b66d808feb.sock
+ WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
+ WANDB_MODE=online
+ WANDB_SERVICE=2-675697-tcp-localhost-45541
+ WORLD_SIZE=8
+ XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
+ XDG_RUNTIME_DIR=/run/user/2000
+ XDG_SESSION_CLASS=user
+ XDG_SESSION_ID=11
+ XDG_SESSION_TYPE=tty
+ _=/data/align-anything/miniconda3/envs/hantao_stable/bin/deepspeed
+ _CE_CONDA=
+ _CE_M=
+ build_alias=x86_64-conda-linux-gnu
+ host_alias=x86_64-conda-linux-gnu
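The environment dump above records the distributed launch context for this run: MASTER_ADDR=127.0.0.1, MASTER_PORT=52201, RANK=0 and WORLD_SIZE=8, i.e. an 8-rank single-node DeepSpeed job. As a minimal sketch (not the align-anything trainer itself), this is how a worker process typically consumes exactly those variables:

```python
# Minimal sketch, assuming the env vars recorded above (MASTER_ADDR, MASTER_PORT,
# RANK, WORLD_SIZE) are set by the deepspeed launcher for each worker.
import os
import torch
import torch.distributed as dist

def init_distributed_from_env() -> int:
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    # init_process_group reads MASTER_ADDR / MASTER_PORT (127.0.0.1:52201 above)
    # from the environment to rendezvous all 8 ranks.
    if world_size > 1 and not dist.is_initialized():
        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank % max(torch.cuda.device_count(), 1))
    return rank

if __name__ == "__main__":
    print("initialized rank", init_distributed_from_env())
```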
slice_1200/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+ "crop_size": {
+ "height": 512,
+ "width": 512
+ },
+ "do_center_crop": true,
+ "do_convert_rgb": true,
+ "do_normalize": true,
+ "do_rescale": true,
+ "do_resize": true,
+ "image_mean": [
+ 1.0,
+ 1.0,
+ 1.0
+ ],
+ "image_processor_type": "ChameleonImageProcessor",
+ "image_std": [
+ 1.0,
+ 1.0,
+ 1.0
+ ],
+ "processor_class": "ChameleonProcessor",
+ "resample": 1,
+ "rescale_factor": 0.0078,
+ "size": {
+ "shortest_edge": 512
+ }
+ }
slice_1200/processor_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "image_seq_length": 1024,
+ "image_token": "<image>",
+ "processor_class": "ChameleonProcessor"
+ }
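Together, the two processor configs above pin the Chameleon image pipeline to 512x512 center crops, with image_mean/image_std of 1.0 and rescale_factor 0.0078 (so raw pixels land roughly in [-1, 1] after rescale and normalize), and make the processor expand each "<image>" placeholder into image_seq_length = 1024 image-token positions. A sketch of loading and exercising this config, assuming a transformers build with Chameleon support (the run logs report 4.45.2) and using the local folder path only as an illustration:

```python
# Sketch, assuming this checkpoint folder is available locally as ./slice_1200
# (path illustrative) and contains the processor/tokenizer files shown in this diff.
from transformers import ChameleonProcessor
from PIL import Image
import numpy as np

processor = ChameleonProcessor.from_pretrained("./slice_1200")
print(processor.image_seq_length)   # 1024 per processor_config.json

image = Image.fromarray(np.zeros((640, 480, 3), dtype=np.uint8))
batch = processor(text="<image> Describe the image.", images=image, return_tensors="pt")
# pixel_values follow preprocessor_config.json: (1, 3, 512, 512);
# input_ids contain the <image> marker expanded into ~1024 image-token ids.
print(batch["pixel_values"].shape, batch["input_ids"].shape)
```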
slice_1200/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9571b162d6b31a5769b7b07e625c1594325aaeb450e02cadcb988815ad68a79d
+ size 14086366930
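This pytorch_model.bin entry is a git-LFS pointer; the actual payload is about 14.1 GB (14086366930 bytes) of 16-bit weights written by DeepSpeed's save_16bit_model call, as seen later in output.log. A sketch of loading it, assuming the sibling config.json and tokenizer files in this folder and an illustrative local path:

```python
# Sketch (not part of the upload): load the 16-bit checkpoint saved in this slice.
# Assumes config.json sits next to pytorch_model.bin, as in this folder.
import torch
from transformers import ChameleonForConditionalGeneration

model = ChameleonForConditionalGeneration.from_pretrained(
    "./slice_1200",                # illustrative local path to this folder
    torch_dtype=torch.bfloat16,    # the run trained in bf16 per the wandb config
)
print(f"{sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters")
```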
slice_1200/script.sh ADDED
@@ -0,0 +1,84 @@
+ #!/usr/bin/env bash
+ #
+ # Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
+ export CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
+ export CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
+
+ export TRITON_CACHE_DIR="/home/align-anything/cache/triton"
+
+ export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
+ export WANDB_MODE=online
+
+ MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/chameleon-7b"
+
+ DATASET_PATH=(
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized"
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cosi_new_step10/tokenized"
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_l0_new_step10/tokenized"
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_random/tokenized"
+ )
+
+ DATASET_NAME=(
+ "q0_10_preference"
+ "q0_20_preference"
+ "q0_30_preference"
+ "q0_40_preference"
+ "q0_50_preference"
+ "q0_60_preference"
+ "q0_70_preference"
+ "q0_80_preference"
+ "q0_90_preference"
+ )
+
+ OUTPUT_PATH="/data/align-anything/hantao/align-anything/outputs/mm_interp"
+ mkdir -p $OUTPUT_PATH
+
+ # Initialize variables
+
+ for dataset_path in ${DATASET_PATH[@]}; do
+ for dataset_name in ${DATASET_NAME[@]}; do
+ TRAIN_DATASETS=$dataset_path
+
+ # dataset middle name
+ middle_name= echo "$dataset_path" | awk -F'/' '{print $(NF-1)}'
+ OUTPUT_DIR=$OUTPUT_PATH/$middle_name/$dataset_name
+ mkdir -p $OUTPUT_DIR
+ echo "Training on $TRAIN_DATASETS, output to $OUTPUT_DIR"
+ # Source the setup script
+ source ./setup.sh
+
+ # Execute deepspeed command
+ deepspeed \
+ --master_port ${MASTER_PORT} \
+ --module align_anything.trainers.text_image_to_text_image.dpo \
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
+ --train_datasets ${TRAIN_DATASETS} \
+ --output_dir ${OUTPUT_DIR} \
+ --per_device_train_batch_size 4 \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps 2 \
+ --train_template Chameleon_preference \
+ --train_split train \
+ --train_data_files ${dataset_name}.pt \
+ --learning_rate 1e-6 \
+ --epochs 3 \
+ --lr_scheduler_type cosine \
+ --save_interval 400
+
+ bash /data/align-anything/hantao/align-anything/outputs/cut.sh $OUTPUT_DIR
+ done
+ done
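script.sh sweeps the 4 tokenized dataset roots against the 9 preference slices and launches one DPO run per pair. Note that the `middle_name= echo ... | awk ...` line performs no assignment (the `$( ... )` command substitution is missing), so `middle_name` stays empty at runtime; that matches the doubled slash in the output paths recorded in the logs below (`.../outputs/mm_interp//q0_40_preference`). A small sketch of the directory layout the loop presumably intends:

```python
# Sketch of the intended output layout; the hypothetical middle_name here is the
# second-to-last path component, which is what the awk '{print $(NF-1)}' aims for.
from pathlib import PurePosixPath

DATASET_PATH = [
    "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized",
    "/data/align-anything/hantao/data/mm_interp/AA_preference_cosi_new_step10/tokenized",
    "/data/align-anything/hantao/data/mm_interp/AA_preference_l0_new_step10/tokenized",
    "/data/align-anything/hantao/data/mm_interp/AA_preference_random/tokenized",
]
DATASET_NAME = [f"q0_{i}0_preference" for i in range(1, 10)]
OUTPUT_PATH = "/data/align-anything/hantao/align-anything/outputs/mm_interp"

for dataset_path in DATASET_PATH:
    middle_name = PurePosixPath(dataset_path).parts[-2]  # e.g. AA_preference_cocour_new_step10
    for dataset_name in DATASET_NAME:
        print(f"{OUTPUT_PATH}/{middle_name}/{dataset_name}")
```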
slice_1200/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "sep_token": {
+ "content": "<reserved08706>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
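The special-token map routes bos/eos/pad/unk to Chameleon's <s>, </s>, <pad> and <unk>, and repurposes <reserved08706> as the sep token. A quick check after loading the tokenizer shipped in this folder (path illustrative):

```python
# Sketch: verify the special-token wiring recorded in special_tokens_map.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./slice_1200")
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.sep_token, tok.unk_token)
# Expected: <s> </s> <pad> <reserved08706> <unk>
```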
slice_1200/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_1200/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_1200/wandb/debug-internal.log ADDED
@@ -0,0 +1,22 @@
1
+ {"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
3
+ {"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
5
+ {"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
6
+ {"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
7
+ {"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
8
+ {"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
9
+ {"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
11
+ {"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
12
+ {"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
13
+ {"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
15
+ {"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
16
+ {"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
17
+ {"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
18
+ {"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
19
+ {"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
20
+ {"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
21
+ {"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
22
+ {"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
slice_1200/wandb/debug.log ADDED
@@ -0,0 +1,33 @@
1
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
3
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
8
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
9
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
11
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
12
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
13
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
15
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
16
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
17
+ 2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
19
+ 2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
20
+ 2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
22
+ 2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
27
+ 2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
28
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
29
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
30
+ 2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
31
+ 2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
32
+ 2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
33
+ 2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
slice_1200/wandb/run-20250101_084116-coewtb43/files/config.yaml ADDED
@@ -0,0 +1,98 @@
+ _wandb:
+ value:
+ cli_version: 0.18.3
+ m: []
+ python_version: 3.11.10
+ t:
+ "1":
+ - 1
+ - 11
+ - 41
+ - 49
+ - 51
+ - 55
+ - 71
+ - 83
+ - 98
+ - 105
+ "2":
+ - 1
+ - 11
+ - 41
+ - 49
+ - 51
+ - 55
+ - 71
+ - 83
+ - 98
+ - 105
+ "3":
+ - 2
+ - 13
+ - 16
+ - 23
+ - 55
+ - 61
+ "4": 3.11.10
+ "5": 0.18.3
+ "6": 4.45.2
+ "8":
+ - 5
+ "12": 0.18.3
+ "13": linux-x86_64
+ data_cfgs:
+ value:
+ eval_data_files: null
+ eval_datasets: null
+ eval_optional_args: []
+ eval_size: null
+ eval_split: null
+ eval_subset: null
+ eval_template: null
+ train_data_files: q0_40_preference.pt
+ train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
+ train_optional_args: []
+ train_size: null
+ train_split: train
+ train_subset: null
+ train_template: Chameleon_preference
+ logger_cfgs:
+ value:
+ cache_dir: null
+ log_project: align-anything
+ log_run_name: dpo
+ log_type: wandb
+ output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
+ save_interval: 400
+ model_cfgs:
+ value:
+ model_max_length: 4096
+ model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
+ trust_remote_code: true
+ special_tokens:
+ value: null
+ train_cfgs:
+ value:
+ adam_betas:
+ - 0.9
+ - 0.95
+ bf16: true
+ ds_cfgs: ds_z3_config.json
+ epochs: 3
+ eval_interval: 10
+ eval_strategy: epoch
+ fp16: false
+ freeze_language_model: true
+ freeze_mm_proj: true
+ freeze_vision_tower: false
+ gradient_accumulation_steps: 2
+ gradient_checkpointing: true
+ learning_rate: 1e-06
+ lr_scheduler_type: cosine
+ lr_warmup_ratio: 0.03
+ per_device_eval_batch_size: 4
+ per_device_train_batch_size: 4
+ regularization: 0.001
+ scale_coeff: 0.1
+ seed: 42
+ weight_decay: 0.01
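The run config above implies an effective global batch of per_device_train_batch_size (4) x gradient_accumulation_steps (2) x WORLD_SIZE (8) = 64 samples per optimizer step. With the 1422 micro-batch iterations shown for 3 epochs in output.log below, that is 711 optimizer steps; lr_warmup_ratio 0.03 then gives about 21 warmup steps, which is consistent with the lr of ~4.76e-07 that DeepSpeed logs at step 10 (10/21 of the 1e-06 peak). A sketch of that arithmetic, assuming the usual linear-warmup-then-cosine-decay rule:

```python
# Sketch reproducing the schedule implied by the config above and the iteration
# counts in output.log; the printed values approximately match the logged lrs.
import math

world_size = 8
per_device_bs = 4
grad_accum = 2
micro_batches = 1422                               # progress-bar total in output.log
optimizer_steps = micro_batches // grad_accum      # 711
warmup_steps = int(optimizer_steps * 0.03)         # 21
peak_lr = 1e-6

def lr_at(step: int) -> float:
    if step < warmup_steps:
        return peak_lr * step / warmup_steps       # linear warmup
    progress = (step - warmup_steps) / (optimizer_steps - warmup_steps)
    return peak_lr * 0.5 * (1.0 + math.cos(math.pi * progress))  # cosine decay

print("effective batch size:", per_device_bs * grad_accum * world_size)  # 64
for s in (10, 20, 30, 200):
    print(s, f"{lr_at(s):.6e}")   # ~4.76e-07, ~9.52e-07, ~9.996e-07, ~8.43e-07
```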
slice_1200/wandb/run-20250101_084116-coewtb43/files/output.log ADDED
@@ -0,0 +1,224 @@
1
+ ***** Running training *****
2
+ Training 1/3.0 epoch: 0%| | 0/1422.0 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
3
+ Training 1/3.0 epoch (loss 11.8749): 21%|██████████████████████████████████████████████████▋ | 299/1422.0 [43:24<2:59:40, 9.60s/it]
4
+ [2025-01-01 08:42:55,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
5
+ [2025-01-01 08:43:25,944] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
6
+ [2025-01-01 08:44:17,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[4.761904761904761e-07, 4.761904761904761e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
7
+ [2025-01-01 08:45:03,075] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
8
+ [2025-01-01 08:47:04,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.523809523809522e-07, 9.523809523809522e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
9
+ [2025-01-01 08:48:02,529] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
10
+ [2025-01-01 08:48:43,904] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
11
+ [2025-01-01 08:50:07,690] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[9.995802740501932e-07, 9.995802740501932e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
12
+ [2025-01-01 08:51:09,012] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
13
+ [2025-01-01 08:53:09,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[9.98130274211278e-07, 9.98130274211278e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
14
+ [2025-01-01 08:55:49,714] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
15
+ [2025-01-01 08:55:49,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[9.956478233113064e-07, 9.956478233113064e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
16
+ [2025-01-01 08:58:16,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[9.921380666088558e-07, 9.921380666088558e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
17
+ [2025-01-01 09:01:07,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[9.876082786106545e-07, 9.876082786106545e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
18
+ [2025-01-01 09:01:27,047] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
19
+ [2025-01-01 09:04:03,792] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
20
+ [2025-01-01 09:04:24,181] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
21
+ [2025-01-01 09:04:24,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[9.820678479940571e-07, 9.820678479940571e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
22
+ [2025-01-01 09:05:30,688] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
23
+ [2025-01-01 09:07:12,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[9.755282581475767e-07, 9.755282581475767e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
24
+ [2025-01-01 09:09:19,110] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
25
+ [2025-01-01 09:10:10,785] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
26
+ [2025-01-01 09:10:30,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[9.68003063369808e-07, 9.68003063369808e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
27
+ [2025-01-01 09:10:51,320] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
28
+ [2025-01-01 09:13:33,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[9.595078607760749e-07, 9.595078607760749e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
29
+ [2025-01-01 09:14:56,362] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
30
+ [2025-01-01 09:16:12,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[9.500602579710255e-07, 9.500602579710255e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
31
+ [2025-01-01 09:17:52,852] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
32
+ [2025-01-01 09:19:16,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[9.39679836554184e-07, 9.39679836554184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
33
+ [2025-01-01 09:21:51,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[9.283881115340955e-07, 9.283881115340955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
34
+ [2025-01-01 09:22:22,638] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
35
+ [2025-01-01 09:24:48,036] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[9.16208486735184e-07, 9.16208486735184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
36
+ [2025-01-01 09:25:39,046] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
37
+ [2025-01-01 09:27:38,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[9.03166206289754e-07, 9.03166206289754e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
38
+ [2025-01-01 09:28:42,774] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
39
+ [2025-01-01 09:29:54,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[8.8928830231567e-07, 8.8928830231567e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
40
+ [2025-01-01 09:30:34,992] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
41
+ [2025-01-01 09:31:34,787] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
42
+ [2025-01-01 09:32:15,776] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
43
+ [2025-01-01 09:32:50,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[8.746035388881654e-07, 8.746035388881654e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
44
+ [2025-01-01 09:35:09,447] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
45
+ [2025-01-01 09:35:48,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[8.591423524219029e-07, 8.591423524219029e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
46
+ [2025-01-01 09:36:31,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
47
+ [2025-01-01 09:37:59,484] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
48
+ [2025-01-01 09:38:30,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[8.429367885868581e-07, 8.429367885868581e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
49
+ Saving checkpoint at step 400 ...
50
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
51
+ Saving 16-bit model...
52
+ [2025-01-01 09:38:39,848] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
53
+ [2025-01-01 09:38:39,849] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
54
+ [2025-01-01 09:38:39,850] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
55
+ [2025-01-01 09:38:58,770] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
56
+ [2025-01-01 09:38:58,773] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
57
+ Model saved!
58
+ Saving 16-bit model...
59
+ [2025-01-01 09:39:05,477] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
60
+ [2025-01-01 09:39:05,478] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
61
+ [2025-01-01 09:39:05,479] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
62
+ [2025-01-01 09:39:27,199] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
63
+ [2025-01-01 09:39:27,202] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
64
+ Model saved!
65
+ Checkpoint saved.
66
+ [2025-01-01 09:42:08,747] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[8.260204358887753e-07, 8.260204358887753e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
67
+ [2025-01-01 09:43:59,051] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
68
+ [2025-01-01 09:44:19,070] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
69
+ [2025-01-01 09:45:02,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[8.084283560518583e-07, 8.084283560518583e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
70
+ [2025-01-01 09:47:51,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[7.901970113479955e-07, 7.901970113479955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
71
+ [2025-01-01 09:48:18,628] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
72
+ [2025-01-01 09:50:44,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[7.713641890231308e-07, 7.713641890231308e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
73
+ [2025-01-01 09:51:29,985] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
74
+ [2025-01-01 09:52:00,601] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
75
+ [2025-01-01 09:53:37,616] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
76
+ [2025-01-01 09:53:37,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[7.51968922977428e-07, 7.51968922977428e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
77
+ [2025-01-01 09:56:36,667] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
78
+ [2025-01-01 09:56:36,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[7.320514128615511e-07, 7.320514128615511e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
79
+ [2025-01-01 09:57:18,253] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
80
+ [2025-01-01 09:59:22,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[7.116529407567488e-07, 7.116529407567488e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
81
+ [2025-01-01 09:59:42,581] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
82
+ [2025-01-01 10:02:27,253] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[6.908157856114392e-07, 6.908157856114392e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
83
+ [2025-01-01 10:04:22,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
84
+ [2025-01-01 10:05:15,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[6.695831356116303e-07, 6.695831356116303e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
85
+ [2025-01-01 10:07:32,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[6.479989986668117e-07, 6.479989986668117e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
86
+ [2025-01-01 10:09:58,442] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
87
+ [2025-01-01 10:10:45,662] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[6.261081111968403e-07, 6.261081111968403e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
88
+ [2025-01-01 10:12:34,690] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
89
+ [2025-01-01 10:12:55,185] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
90
+ [2025-01-01 10:13:41,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[6.039558454088795e-07, 6.039558454088795e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
91
+ [2025-01-01 10:14:01,438] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
92
+ [2025-01-01 10:16:42,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[5.815881152565711e-07, 5.815881152565711e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
93
+ [2025-01-01 10:17:49,827] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
94
+ [2025-01-01 10:18:41,651] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
95
+ [2025-01-01 10:19:22,050] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
96
+ [2025-01-01 10:20:00,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[5.590512812763541e-07, 5.590512812763541e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
97
+ [2025-01-01 10:22:48,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[5.363920544981748e-07, 5.363920544981748e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
98
+ [2025-01-01 10:23:27,066] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
99
+ [2025-01-01 10:25:42,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[5.136573996297429e-07, 5.136573996297429e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
100
+ [2025-01-01 10:26:22,366] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
101
+ [2025-01-01 10:28:45,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[4.908944377150043e-07, 4.908944377150043e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
102
+ [2025-01-01 10:30:51,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
103
+ [2025-01-01 10:31:03,983] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[4.681503484685803e-07, 4.681503484685803e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
104
+ [2025-01-01 10:34:07,594] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
105
+ [2025-01-01 10:34:07,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[4.454722724886051e-07, 4.454722724886051e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
106
+ [2025-01-01 10:36:51,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[4.229072135506384e-07, 4.229072135506384e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
107
+ Saving checkpoint at step 800 ...
108
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
109
+ Saving 16-bit model...
110
+ [2025-01-01 10:37:00,920] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
111
+ [2025-01-01 10:37:00,921] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
112
+ [2025-01-01 10:37:00,921] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
113
+ [2025-01-01 10:37:17,303] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
114
+ [2025-01-01 10:37:17,305] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
115
+ Model saved!
116
+ Saving 16-bit model...
117
+ [2025-01-01 10:37:24,304] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
118
+ [2025-01-01 10:37:24,306] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
119
+ [2025-01-01 10:37:24,306] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
120
+ [2025-01-01 10:37:47,861] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
121
+ [2025-01-01 10:37:47,862] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
122
+ Model saved!
123
+ Checkpoint saved.
124
+ [2025-01-01 10:38:06,545] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
125
+ [2025-01-01 10:39:58,968] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
126
+ [2025-01-01 10:40:18,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[4.005019411851609e-07, 4.005019411851609e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
127
+ [2025-01-01 10:40:58,615] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
128
+ [2025-01-01 10:41:39,701] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
129
+ [2025-01-01 10:43:08,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[3.783028937405821e-07, 3.783028937405821e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
130
+ [2025-01-01 10:44:32,585] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
131
+ [2025-01-01 10:45:54,172] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
132
+ [2025-01-01 10:45:54,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[3.563560821326706e-07, 3.563560821326706e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
133
+ [2025-01-01 10:47:22,278] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
134
+ [2025-01-01 10:48:37,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[3.3470699447990527e-07, 3.3470699447990527e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
135
+ [2025-01-01 10:51:26,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[3.1340050182240436e-07, 3.1340050182240436e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
136
+ [2025-01-01 10:52:26,337] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
137
+ [2025-01-01 10:52:46,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
138
+ [2025-01-01 10:54:14,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[2.92480765119841e-07, 2.92480765119841e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
139
+ [2025-01-01 10:56:46,300] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
140
+ [2025-01-01 10:57:28,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[2.719911437211122e-07, 2.719911437211122e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
141
+ [2025-01-01 10:59:56,353] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
142
+ [2025-01-01 10:59:56,354] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[2.5197410549546595e-07, 2.5197410549546595e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
143
+ [2025-01-01 11:00:26,971] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
144
+ [2025-01-01 11:02:03,646] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
145
+ [2025-01-01 11:02:54,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[2.3247113881135781e-07, 2.3247113881135781e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
146
+ [2025-01-01 11:05:02,388] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
147
+ [2025-01-01 11:05:43,792] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
148
+ [2025-01-01 11:05:55,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[2.1352266654547125e-07, 2.1352266654547125e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
149
+ [2025-01-01 11:08:07,702] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
150
+ [2025-01-01 11:08:38,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[1.9516796230013272e-07, 1.9516796230013272e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
151
+ [2025-01-01 11:11:35,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[1.774450690027746e-07, 1.774450690027746e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
152
+ [2025-01-01 11:12:47,870] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
153
+ [2025-01-01 11:14:16,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[1.6039072005615716e-07, 1.6039072005615716e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
154
+ [2025-01-01 11:17:00,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[1.4404026320278317e-07, 1.4404026320278317e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
155
+ [2025-01-01 11:18:23,592] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
156
+ [2025-01-01 11:19:58,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[1.284275872613028e-07, 1.284275872613028e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
157
+ [2025-01-01 11:20:59,657] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
158
+ [2025-01-01 11:21:20,032] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
159
+ [2025-01-01 11:22:26,091] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
160
+ [2025-01-01 11:23:04,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[1.1358505188676288e-07, 1.1358505188676288e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
161
+ [2025-01-01 11:26:14,637] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
162
+ [2025-01-01 11:26:14,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[9.95434205002792e-08, 9.95434205002792e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
163
+ [2025-01-01 11:27:06,507] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
164
+ [2025-01-01 11:27:47,054] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
165
+ [2025-01-01 11:29:24,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[8.633179652714916e-08, 8.633179652714916e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
166
+ [2025-01-01 11:31:52,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
167
+ [2025-01-01 11:32:11,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[7.397756307555885e-08, 7.397756307555885e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
168
+ [2025-01-01 11:34:47,839] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
169
+ [2025-01-01 11:34:59,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[6.250632618090867e-08, 6.250632618090867e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
170
+ Saving checkpoint at step 1200 ...
171
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
172
+ Saving 16-bit model...
173
+ [2025-01-01 11:35:08,703] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
174
+ [2025-01-01 11:35:08,704] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
175
+ [2025-01-01 11:35:08,704] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
176
+ [2025-01-01 11:35:25,316] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
177
+ [2025-01-01 11:35:25,317] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
178
+ Model saved!
179
+ Saving 16-bit model...
180
+ [2025-01-01 11:35:32,446] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
181
+ [2025-01-01 11:35:32,447] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
182
+ [2025-01-01 11:35:32,447] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
183
+ [2025-01-01 11:35:53,847] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
184
+ [2025-01-01 11:35:53,849] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
185
+ Model saved!
186
+ Checkpoint saved.
187
+ [2025-01-01 11:38:41,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[5.194186173339599e-08, 5.194186173339599e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
188
+ [2025-01-01 11:40:10,376] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
189
+ [2025-01-01 11:41:25,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[4.230606619885108e-08, 4.230606619885108e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
190
+ [2025-01-01 11:43:26,712] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
191
+ [2025-01-01 11:44:25,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[3.3618911234968236e-08, 3.3618911234968236e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
192
+ [2025-01-01 11:46:29,769] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
193
+ [2025-01-01 11:46:53,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[2.589840229699558e-08, 2.589840229699558e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
194
+ [2025-01-01 11:48:22,179] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
195
+ [2025-01-01 11:49:22,087] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
196
+ [2025-01-01 11:49:35,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[1.9160541318679224e-08, 1.9160541318679224e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
197
+ [2025-01-01 11:50:02,959] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
198
+ [2025-01-01 11:52:16,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[1.3419293545812338e-08, 1.3419293545812338e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
199
+ [2025-01-01 11:52:56,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
200
+ [2025-01-01 11:54:18,301] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
201
+ [2025-01-01 11:55:03,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[8.686558591130156e-09, 8.686558591130156e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
202
+ [2025-01-01 11:55:46,245] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
203
+ [2025-01-01 11:57:47,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[4.972145770545999e-09, 4.972145770545999e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
204
+ [2025-01-01 12:00:50,391] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
205
+ [2025-01-01 12:00:50,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[2.283753771845587e-09, 2.283753771845587e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
206
+ [2025-01-01 12:01:10,430] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
207
+ [2025-01-01 12:03:22,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[6.269546979813523e-10, 6.269546979813523e-10], mom=[[0.9, 0.95], [0.9, 0.95]]
208
+ [2025-01-01 12:05:10,192] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
209
+ [2025-01-01 12:06:28,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[5.182518037827321e-12, 5.182518037827321e-12], mom=[[0.9, 0.95], [0.9, 0.95]]
210
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
211
+ Saving 16-bit model...
212
+ [2025-01-01 12:06:57,036] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
213
+ [2025-01-01 12:06:57,037] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
214
+ [2025-01-01 12:06:57,037] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
215
+ [2025-01-01 12:07:17,768] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
216
+ [2025-01-01 12:07:17,770] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
217
+ Model saved!
218
+ Saving 16-bit model...
219
+ [2025-01-01 12:07:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
220
+ [2025-01-01 12:07:24,459] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
221
+ [2025-01-01 12:07:24,459] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
222
+ [2025-01-01 12:07:46,734] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
223
+ [2025-01-01 12:07:46,737] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
224
+ Model saved!
slice_1200/wandb/run-20250101_084116-coewtb43/files/requirements.txt ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ align-anything==0.0.1.dev0
2
+ gitdb==4.0.11
3
+ wcwidth==0.2.13
4
+ identify==2.6.1
5
+ tomlkit==0.12.0
6
+ bitsandbytes==0.44.1
7
+ trl==0.9.6
8
+ pytest-split==0.8.0
9
+ gradio==4.44.1
10
+ pip==24.2
11
+ multidict==6.1.0
12
+ fairscale==0.4.13
13
+ mistral_common==1.4.4
14
+ python-dotenv==1.0.1
15
+ uvloop==0.20.0
16
+ absl-py==2.1.0
17
+ tiktoken==0.7.0
18
+ pydub==0.25.1
19
+ websockets==12.0
20
+ llamafactory==0.9.1.dev0
21
+ triton==3.0.0
22
+ tifffile==2024.9.20
23
+ safe-rlhf==0.0.1.dev0
24
+ pandas==2.2.3
25
+ grpcio==1.66.2
26
+ click==8.1.7
27
+ ninja==1.11.1.1
28
+ rich==13.9.2
29
+ Jinja2==3.1.4
30
+ Pygments==2.18.0
31
+ nvidia-cudnn-cu12==9.1.0.70
32
+ importlib_resources==6.4.5
33
+ GitPython==3.1.43
34
+ nvidia-cufft-cu12==11.0.2.54
35
+ tensorboard-data-server==0.7.2
36
+ align-anything==0.0.1.dev0
37
+ six==1.16.0
38
+ scipy==1.14.1
39
+ mpmath==1.3.0
40
+ jsonschema-specifications==2024.10.1
41
+ scikit-image==0.24.0
42
+ zipp==3.20.2
43
+ cycler==0.12.1
44
+ MarkupSafe==2.1.5
45
+ tzdata==2024.2
46
+ idna==3.10
47
+ pycountry==24.6.1
48
+ nvidia-nccl-cu12==2.20.5
49
+ matplotlib==3.9.2
50
+ pytz==2024.2
51
+ uvicorn==0.31.1
52
+ dill==0.3.8
53
+ pyparsing==3.1.4
54
+ pytest==7.2.0
55
+ jiter==0.6.1
56
+ safetensors==0.4.5
57
+ typing_extensions==4.12.2
58
+ decorator==4.4.2
59
+ typeguard==4.4.1
60
+ prometheus_client==0.21.0
61
+ nvidia-cuda-cupti-cu12==12.1.105
62
+ sentencepiece==0.2.0
63
+ requests==2.32.3
64
+ kiwisolver==1.4.7
65
+ gdown==5.2.0
66
+ multiprocess==0.70.16
67
+ xxhash==3.5.0
68
+ PyYAML==6.0.2
69
+ gguf==0.10.0
70
+ nvidia-nvtx-cu12==12.1.105
71
+ hpsv2==1.2.0
72
+ tensorboard==2.18.0
73
+ nodeenv==1.9.1
74
+ filelock==3.16.1
75
+ distro==1.9.0
76
+ scikit-learn==1.5.2
77
+ huggingface-hub==0.25.2
78
+ pyairports==2.1.1
79
+ importlib_metadata==8.5.0
80
+ pyarrow==17.0.0
81
+ llvmlite==0.43.0
82
+ ray==2.37.0
83
+ tokenizers==0.20.3
84
+ nvidia-nvjitlink-cu12==12.6.77
85
+ av==14.0.1
86
+ deepspeed==0.15.2
87
+ clip==0.2.0
88
+ shtab==1.7.1
89
+ certifi==2024.8.30
90
+ braceexpand==0.1.7
91
+ nvidia-ml-py==12.560.30
92
+ webdataset==0.2.100
93
+ docker-pycreds==0.4.0
94
+ einops==0.8.0
95
+ iniconfig==2.0.0
96
+ tyro==0.9.2
97
+ torchvision==0.19.0
98
+ accelerate==0.34.2
99
+ beautifulsoup4==4.12.3
100
+ pyzmq==26.2.0
101
+ pycparser==2.22
102
+ nvidia-curand-cu12==10.3.2.106
103
+ msgpack==1.1.0
104
+ soxr==0.5.0.post1
105
+ platformdirs==4.3.6
106
+ h11==0.14.0
107
+ psutil==6.0.0
108
+ pydantic==2.9.2
109
+ shellingham==1.5.4
110
+ imageio-ffmpeg==0.5.1
111
+ wandb==0.18.3
112
+ audioread==3.0.1
113
+ annotated-types==0.7.0
114
+ docstring_parser==0.16
115
+ cloudpickle==3.1.0
116
+ regex==2024.9.11
117
+ packaging==24.1
118
+ timm==0.6.13
119
+ aiosignal==1.3.1
120
+ numba==0.60.0
121
+ orjson==3.10.7
122
+ rpds-py==0.20.0
123
+ virtualenv==20.26.6
124
+ joblib==1.4.2
125
+ charset-normalizer==3.4.0
126
+ httpx==0.27.2
127
+ ffmpy==0.4.0
128
+ lm-format-enforcer==0.10.6
129
+ yt-dlp==2024.8.6
130
+ sympy==1.13.3
131
+ python-dateutil==2.9.0.post0
132
+ nvidia-cusolver-cu12==11.4.5.107
133
+ msgspec==0.18.6
134
+ mdurl==0.1.2
135
+ torch==2.4.0
136
+ fastapi==0.115.0
137
+ optree==0.13.0
138
+ PySocks==1.7.1
139
+ transformers==4.46.0.dev0
140
+ torchlibrosa==0.1.0
141
+ fsspec==2024.6.1
142
+ nvidia-cublas-cu12==12.1.3.1
143
+ gradio_client==1.3.0
144
+ args==0.1.0
145
+ cffi==1.17.1
146
+ fonttools==4.54.1
147
+ clint==0.5.1
148
+ lark==1.2.2
149
+ tqdm==4.66.5
150
+ semantic-version==2.10.0
151
+ pooch==1.8.2
152
+ markdown-it-py==3.0.0
153
+ pydantic_core==2.23.4
154
+ sniffio==1.3.1
155
+ httptools==0.6.1
156
+ nvidia-cuda-runtime-cu12==12.1.105
157
+ anyio==4.6.0
158
+ ftfy==6.3.0
159
+ Markdown==3.7
160
+ datasets==2.21.0
161
+ diffusers==0.30.3
162
+ nvidia-cuda-nvrtc-cu12==12.1.105
163
+ vllm==0.6.2
164
+ starlette==0.38.6
165
+ flash-attn==2.7.0.post2
166
+ urllib3==2.2.3
167
+ Werkzeug==3.0.4
168
+ py-cpuinfo==9.0.0
169
+ moviepy==1.0.3
170
+ librosa==0.10.2.post1
171
+ peft==0.12.0
172
+ soupsieve==2.6
173
+ lazy_loader==0.4
174
+ pluggy==1.5.0
175
+ setuptools==75.1.0
176
+ sentry-sdk==2.16.0
177
+ tabulate==0.9.0
178
+ transformers==4.45.2
179
+ pre_commit==4.0.1
180
+ termcolor==2.5.0
181
+ frechet-audio-distance==0.1.2
182
+ pytorch-fid==0.3.0
183
+ setproctitle==1.3.3
184
+ jsonschema==4.23.0
185
+ aiofiles==23.2.1
186
+ contourpy==1.3.0
187
+ distlib==0.3.9
188
+ interegular==0.3.3
189
+ fire==0.7.0
190
+ diskcache==5.6.3
191
+ proglog==0.1.10
192
+ soundfile==0.12.1
193
+ protobuf==3.20.3
194
+ smmap==5.0.1
195
+ pycryptodomex==3.21.0
196
+ Brotli==1.1.0
197
+ pillow==10.4.0
198
+ frozenlist==1.4.1
199
+ numpy==1.26.4
200
+ mutagen==1.47.0
201
+ outlines==0.0.46
202
+ attrs==24.2.0
203
+ torchaudio==2.4.0
204
+ aiohttp==3.10.10
205
+ ruff==0.6.9
206
+ watchfiles==0.24.0
207
+ threadpoolctl==3.5.0
208
+ nest-asyncio==1.6.0
209
+ partial-json-parser==0.2.1.1.post4
210
+ sse-starlette==2.1.3
211
+ shortuuid==1.0.13
212
+ typer==0.12.5
213
+ prometheus-fastapi-instrumentator==7.0.0
214
+ imageio==2.35.1
215
+ wheel==0.44.0
216
+ image-reward==1.5
217
+ networkx==3.4.1
218
+ propcache==0.2.0
219
+ aiohappyeyeballs==2.4.3
220
+ nvidia-cusparse-cu12==12.1.0.106
221
+ xformers==0.0.27.post2
222
+ cfgv==3.4.0
223
+ python-multipart==0.0.12
224
+ httpcore==1.0.6
225
+ opencv-python==4.6.0.66
226
+ resampy==0.4.3
227
+ yarl==1.15.0
228
+ referencing==0.35.1
229
+ openai==1.51.2
230
+ hjson==3.1.0
231
+ llamafactory==0.9.1.dev0
232
+ jaraco.collections==5.1.0
233
+ backports.tarfile==1.2.0
234
+ more-itertools==10.3.0
235
+ wheel==0.43.0
236
+ importlib_metadata==8.0.0
237
+ zipp==3.19.2
238
+ autocommand==2.2.2
239
+ jaraco.functools==4.0.1
240
+ platformdirs==4.2.2
241
+ tomli==2.0.1
242
+ jaraco.text==3.12.1
243
+ typing_extensions==4.12.2
244
+ jaraco.context==5.3.0
245
+ importlib_resources==6.4.0
246
+ packaging==24.1
247
+ inflect==7.3.1
248
+ typeguard==4.3.0
slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.31",
3
+ "python": "3.11.10",
4
+ "startedAt": "2025-01-01T08:41:16.157770Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/data/align-anything/hantao/models/chameleon-7b",
9
+ "--train_datasets",
10
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized",
11
+ "--output_dir",
12
+ "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
13
+ "--per_device_train_batch_size",
14
+ "4",
15
+ "--per_device_eval_batch_size",
16
+ "4",
17
+ "--gradient_accumulation_steps",
18
+ "2",
19
+ "--train_template",
20
+ "Chameleon_preference",
21
+ "--train_split",
22
+ "train",
23
+ "--train_data_files",
24
+ "q0_40_preference.pt",
25
+ "--learning_rate",
26
+ "1e-6",
27
+ "--epochs",
28
+ "3",
29
+ "--lr_scheduler_type",
30
+ "cosine",
31
+ "--save_interval",
32
+ "400"
33
+ ],
34
+ "program": "-m align_anything.trainers.text_image_to_text_image.dpo",
35
+ "git": {
36
+ "remote": "https://github.com/PKU-Alignment/align-anything.git",
37
+ "commit": "6fde660afc9985323f147930eedf188a5699adc7"
38
+ },
39
+ "email": "[email protected]",
40
+ "root": "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
41
+ "host": "lyg0194",
42
+ "username": "align-anything",
43
+ "executable": "/data/align-anything/miniconda3/envs/hantao_stable/bin/python",
44
+ "cpu_count": 64,
45
+ "cpu_count_logical": 128,
46
+ "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
47
+ "gpu_count": 8,
48
+ "disk": {
49
+ "/": {
50
+ "total": "939477946368",
51
+ "used": "596714827776"
52
+ }
53
+ },
54
+ "memory": {
55
+ "total": "1081823907840"
56
+ },
57
+ "cpu": {
58
+ "count": 64,
59
+ "countLogical": 128
60
+ },
61
+ "gpu_nvidia": [
62
+ {
63
+ "name": "NVIDIA A100-SXM4-80GB",
64
+ "memoryTotal": "85899345920",
65
+ "cudaCores": 6912,
66
+ "architecture": "Ampere"
67
+ },
68
+ {
69
+ "name": "NVIDIA A100-SXM4-80GB",
70
+ "memoryTotal": "85899345920",
71
+ "cudaCores": 6912,
72
+ "architecture": "Ampere"
73
+ },
74
+ {
75
+ "name": "NVIDIA A100-SXM4-80GB",
76
+ "memoryTotal": "85899345920",
77
+ "cudaCores": 6912,
78
+ "architecture": "Ampere"
79
+ },
80
+ {
81
+ "name": "NVIDIA A100-SXM4-80GB",
82
+ "memoryTotal": "85899345920",
83
+ "cudaCores": 6912,
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A100-SXM4-80GB",
88
+ "memoryTotal": "85899345920",
89
+ "cudaCores": 6912,
90
+ "architecture": "Ampere"
91
+ },
92
+ {
93
+ "name": "NVIDIA A100-SXM4-80GB",
94
+ "memoryTotal": "85899345920",
95
+ "cudaCores": 6912,
96
+ "architecture": "Ampere"
97
+ },
98
+ {
99
+ "name": "NVIDIA A100-SXM4-80GB",
100
+ "memoryTotal": "85899345920",
101
+ "cudaCores": 6912,
102
+ "architecture": "Ampere"
103
+ },
104
+ {
105
+ "name": "NVIDIA A100-SXM4-80GB",
106
+ "memoryTotal": "85899345920",
107
+ "cudaCores": 6912,
108
+ "architecture": "Ampere"
109
+ }
110
+ ],
111
+ "cudaVersion": "12.4"
112
+ }
slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/loss":1.2820848226547241,"_step":1422,"train/better_sample_reward":92.50687408447266,"train/reward_margin":134.08236694335938,"train/lr":0,"train/worse_sample_reward":-41.57551193237305,"_wandb":{"runtime":12390},"_timestamp":1.7357332075421584e+09,"train/step":1422,"train/epoch":3,"_runtime":12390.697992413,"train/reward_accuracy":0.90625,"train/reward":50.931365966796875}
slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
3
+ {"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
5
+ {"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
6
+ {"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
7
+ {"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
8
+ {"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
9
+ {"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
11
+ {"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
12
+ {"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
13
+ {"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
15
+ {"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
16
+ {"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
17
+ {"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
18
+ {"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
19
+ {"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
20
+ {"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
21
+ {"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
22
+ {"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug.log ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
3
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
8
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
9
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
11
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
12
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
13
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
15
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
16
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
17
+ 2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
19
+ 2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
20
+ 2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
22
+ 2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
27
+ 2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
28
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
29
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
30
+ 2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
31
+ 2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
32
+ 2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
33
+ 2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
slice_1200/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6184cd062ec8be4c9517ecc56b37ff397dd3f29795bce1cd495613256a3f6f76
3
+ size 12650956
slice_400/arguments.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_cfgs:
2
+ eval_data_files: null
3
+ eval_datasets: null
4
+ eval_optional_args: []
5
+ eval_size: null
6
+ eval_split: null
7
+ eval_subset: null
8
+ eval_template: null
9
+ train_data_files: q0_40_preference.pt
10
+ train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
11
+ train_optional_args: []
12
+ train_size: null
13
+ train_split: train
14
+ train_subset: null
15
+ train_template: Chameleon_preference
16
+ logger_cfgs:
17
+ cache_dir: null
18
+ log_project: align-anything
19
+ log_run_name: dpo
20
+ log_type: wandb
21
+ output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
22
+ save_interval: 400.0
23
+ model_cfgs:
24
+ model_max_length: 4096
25
+ model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
26
+ trust_remote_code: true
27
+ special_tokens: null
28
+ train_cfgs:
29
+ adam_betas:
30
+ - 0.9
31
+ - 0.95
32
+ bf16: true
33
+ ds_cfgs: ds_z3_config.json
34
+ epochs: 3.0
35
+ eval_interval: 10
36
+ eval_strategy: epoch
37
+ fp16: false
38
+ freeze_language_model: true
39
+ freeze_mm_proj: true
40
+ freeze_vision_tower: false
41
+ gradient_accumulation_steps: 2.0
42
+ gradient_checkpointing: true
43
+ learning_rate: 1.0e-06
44
+ lr_scheduler_type: cosine
45
+ lr_warmup_ratio: 0.03
46
+ per_device_eval_batch_size: 4.0
47
+ per_device_train_batch_size: 4.0
48
+ regularization: 0.001
49
+ scale_coeff: 0.1
50
+ seed: 42
51
+ weight_decay: 0.01
slice_400/config.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_400/environ.txt ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ADDR2LINE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-addr2line
2
+ AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ar
3
+ AS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-as
4
+ BROWSER=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/bin/helpers/browser.sh
5
+ BUILD=x86_64-conda-linux-gnu
6
+ CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
7
+ CC_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cc
8
+ CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
9
+ CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/jy-a:/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot/usr
10
+ COLORTERM=truecolor
11
+ CONDA_BACKUP_ADDR2LINE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-addr2line
12
+ CONDA_BACKUP_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ar
13
+ CONDA_BACKUP_AS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-as
14
+ CONDA_BACKUP_BUILD=x86_64-conda-linux-gnu
15
+ CONDA_BACKUP_CC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
16
+ CONDA_BACKUP_CC_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
17
+ CONDA_BACKUP_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
18
+ CONDA_BACKUP_CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/hantao_proxy:/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot/usr
19
+ CONDA_BACKUP_CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot
20
+ CONDA_BACKUP_CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
21
+ CONDA_BACKUP_CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
22
+ CONDA_BACKUP_CPP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cpp
23
+ CONDA_BACKUP_CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
24
+ CONDA_BACKUP_CXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
25
+ CONDA_BACKUP_CXXFILT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++filt
26
+ CONDA_BACKUP_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
27
+ CONDA_BACKUP_CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
28
+ CONDA_BACKUP_DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
29
+ CONDA_BACKUP_DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
30
+ CONDA_BACKUP_DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
31
+ CONDA_BACKUP_DWP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-dwp
32
+ CONDA_BACKUP_ELFEDIT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-elfedit
33
+ CONDA_BACKUP_GCC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc
34
+ CONDA_BACKUP_GCC_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ar
35
+ CONDA_BACKUP_GCC_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-nm
36
+ CONDA_BACKUP_GCC_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ranlib
37
+ CONDA_BACKUP_GPROF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gprof
38
+ CONDA_BACKUP_GXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-g++
39
+ CONDA_BACKUP_HOST=x86_64-conda-linux-gnu
40
+ CONDA_BACKUP_LD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld
41
+ CONDA_BACKUP_LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib/stubs
42
+ CONDA_BACKUP_LD_GOLD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld.gold
43
+ CONDA_BACKUP_MESON_ARGS=-Dbuildtype=release
44
+ CONDA_BACKUP_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-nm
45
+ CONDA_BACKUP_OBJCOPY=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objcopy
46
+ CONDA_BACKUP_OBJDUMP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objdump
47
+ CONDA_BACKUP_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ranlib
48
+ CONDA_BACKUP_READELF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-readelf
49
+ CONDA_BACKUP_SIZE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-size
50
+ CONDA_BACKUP_STRINGS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strings
51
+ CONDA_BACKUP_STRIP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strip
52
+ CONDA_BACKUP__CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
53
+ CONDA_BACKUP_build_alias=x86_64-conda-linux-gnu
54
+ CONDA_BACKUP_host_alias=x86_64-conda-linux-gnu
55
+ CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot
56
+ CONDA_DEFAULT_ENV=hantao_stable
57
+ CONDA_EXE=/data/align-anything/miniconda3/bin/conda
58
+ CONDA_PREFIX=/data/align-anything/miniconda3/envs/hantao_stable
59
+ CONDA_PREFIX_1=/home/align-anything/miniconda3
60
+ CONDA_PREFIX_10=/data/align-anything/miniconda3/envs/hantao_proxy
61
+ CONDA_PREFIX_2=/data/align-anything/miniconda3/envs/jy-a
62
+ CONDA_PREFIX_3=/data/align-anything/miniconda3
63
+ CONDA_PREFIX_4=/data/align-anything/miniconda3/envs/hantao_stable
64
+ CONDA_PREFIX_5=/data/align-anything/miniconda3/envs/hantao_cham
65
+ CONDA_PREFIX_6=/data/align-anything/miniconda3/envs/hantao_stable
66
+ CONDA_PREFIX_7=/data/align-anything/miniconda3/envs/hantao_stream
67
+ CONDA_PREFIX_8=/data/align-anything/miniconda3/envs/hantao_proxy
68
+ CONDA_PREFIX_9=/data/align-anything/miniconda3/envs/hantao_stable
69
+ CONDA_PROMPT_MODIFIER=(hantao_stable)
70
+ CONDA_PYTHON_EXE=/data/align-anything/miniconda3/bin/python
71
+ CONDA_ROOT=/home/align-anything/miniconda3
72
+ CONDA_SHLVL=11
73
+ CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
74
+ CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
75
+ CPP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cpp
76
+ CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
77
+ CROSS_RANK=0
78
+ CROSS_SIZE=1
79
+ CUDA_MODULE_LOADING=LAZY
80
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
81
+ CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
82
+ CXXFILT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++filt
83
+ CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
84
+ CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
85
+ DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
86
+ DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
87
+ DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
88
+ DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
89
+ DWP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-dwp
90
+ ELFEDIT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-elfedit
91
+ GCC=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc
92
+ GCC_AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ar
93
+ GCC_NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-nm
94
+ GCC_RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ranlib
95
+ GIT_ASKPASS=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass.sh
96
+ GPROF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gprof
97
+ GXX=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-g++
98
+ HOME=/home/align-anything
99
+ HOST=x86_64-conda-linux-gnu
100
+ LANG=en_US.UTF-8
101
+ LD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld
102
+ LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
103
+ LD_GOLD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld.gold
104
+ LD_LIBRARY_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/../../lib64:
105
+ LESSCLOSE=/usr/bin/lesspipe %s %s
106
+ LESSOPEN=| /usr/bin/lesspipe %s
107
+ LOCAL_RANK=0
108
+ LOCAL_SIZE=8
109
+ LOGLEVEL=WARNING
110
+ LOGNAME=align-anything
111
+ LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
112
+ MASTER_ADDR=127.0.0.1
113
+ MASTER_PORT=52201
114
+ MOTD_SHOWN=pam
115
+ NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-nm
116
+ NVCC_PREPEND_FLAGS= -ccbin=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
117
+ OBJCOPY=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objcopy
118
+ OBJDUMP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objdump
119
+ OLDPWD=/data/align-anything/hantao/LLaMA-Factory
120
+ PATH=/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/bin:/data/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
121
+ PWD=/data/align-anything/hantao/align-anything/scripts
122
+ PYGAME_HIDE_SUPPORT_PROMPT=1
123
+ PYTHONHASHSEED=42
124
+ PYTHONPATH=/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything
125
+ QT_QPA_FONTDIR=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/fonts
126
+ QT_QPA_PLATFORM_PLUGIN_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/plugins
127
+ RANK=0
128
+ RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ranlib
129
+ READELF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-readelf
130
+ SHELL=/bin/bash
131
+ SHLVL=3
132
+ SIZE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-size
133
+ SSH_CLIENT=117.136.0.149 36325 30400
134
+ SSH_CONNECTION=111.205.232.251 37945 10.10.212.194 30400
135
+ SSL_CERT_DIR=/usr/lib/ssl/certs
136
+ SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
137
+ STRINGS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strings
138
+ STRIP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strip
139
+ TERM=screen
140
+ TERM_PROGRAM=vscode
141
+ TERM_PROGRAM_VERSION=0.41.3
142
+ TMUX=/tmp/tmux-2000/default,34082,51
143
+ TMUX_PANE=%59
144
+ TRITON_CACHE_DIR=/home/align-anything/cache/triton
145
+ USER=align-anything
146
+ VSCODE_GIT_ASKPASS_EXTRA_ARGS=
147
+ VSCODE_GIT_ASKPASS_MAIN=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass-main.js
148
+ VSCODE_GIT_ASKPASS_NODE=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/node
149
+ VSCODE_GIT_IPC_HANDLE=/run/user/2000/vscode-git-ef8058c264.sock
150
+ VSCODE_IPC_HOOK_CLI=/run/user/2000/vscode-ipc-db013265-9a8a-4fb7-ba94-00b66d808feb.sock
151
+ WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
152
+ WANDB_MODE=online
153
+ WANDB_SERVICE=2-675697-tcp-localhost-45541
154
+ WORLD_SIZE=8
155
+ XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
156
+ XDG_RUNTIME_DIR=/run/user/2000
157
+ XDG_SESSION_CLASS=user
158
+ XDG_SESSION_ID=11
159
+ XDG_SESSION_TYPE=tty
160
+ _=/data/align-anything/miniconda3/envs/hantao_stable/bin/deepspeed
161
+ _CE_CONDA=
162
+ _CE_M=
163
+ build_alias=x86_64-conda-linux-gnu
164
+ host_alias=x86_64-conda-linux-gnu
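The launcher variables recorded above (MASTER_ADDR=127.0.0.1, MASTER_PORT=52201, RANK=0, LOCAL_RANK=0, WORLD_SIZE=8, CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7) are what each distributed worker uses to join the process group. A minimal sketch of how a rank typically consumes them is below; this is illustrative only, not the align-anything trainer's actual code.

```python
# Illustrative only: how a worker typically consumes the launcher-provided
# environment recorded in environ.txt (not the align-anything trainer itself).
import os

import torch
import torch.distributed as dist

rank = int(os.environ["RANK"])              # 0 in this dump
local_rank = int(os.environ["LOCAL_RANK"])  # 0
world_size = int(os.environ["WORLD_SIZE"])  # 8 -> one process per visible GPU

torch.cuda.set_device(local_rank)
# init_method="env://" reads MASTER_ADDR=127.0.0.1 and MASTER_PORT=52201 from the env.
dist.init_process_group(backend="nccl", init_method="env://",
                        rank=rank, world_size=world_size)
```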
slice_400/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 512,
4
+ "width": 512
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 1.0,
13
+ 1.0,
14
+ 1.0
15
+ ],
16
+ "image_processor_type": "ChameleonImageProcessor",
17
+ "image_std": [
18
+ 1.0,
19
+ 1.0,
20
+ 1.0
21
+ ],
22
+ "processor_class": "ChameleonProcessor",
23
+ "resample": 1,
24
+ "rescale_factor": 0.0078,
25
+ "size": {
26
+ "shortest_edge": 512
27
+ }
28
+ }
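For reference, the values above imply this pixel pipeline: resize the shortest edge to 512, center-crop to 512x512, multiply by rescale_factor 0.0078, then normalize with mean/std of 1.0, which lands 8-bit pixels roughly in [-1, 0.99]. A small sketch of that arithmetic (not the ChameleonImageProcessor implementation itself):

```python
# Arithmetic implied by preprocessor_config.json (illustrative; the real work is
# done by transformers' ChameleonImageProcessor).
import numpy as np

rescale_factor = 0.0078                  # ~1/128 instead of the usual 1/255
image_mean = np.array([1.0, 1.0, 1.0])
image_std = np.array([1.0, 1.0, 1.0])

pixels = np.array([0.0, 128.0, 255.0])   # sample 8-bit values
normalized = (pixels * rescale_factor - image_mean) / image_std
print(normalized)                        # approx [-1.0, -0.0016, 0.989]
```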
slice_400/processor_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 1024,
3
+ "image_token": "<image>",
4
+ "processor_class": "ChameleonProcessor"
5
+ }
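image_seq_length: 1024 means each "<image>" placeholder is expanded to 1024 image-token positions before the sequence reaches the model, so with model_max_length 4096 (see the wandb config.yaml below) a single image leaves about 3072 positions for text. A hedged illustration of that budgeting (in spirit only; the actual expansion happens inside transformers' ChameleonProcessor):

```python
# Illustration of what image_seq_length / image_token imply for sequence budgeting;
# the real expansion is handled by transformers' ChameleonProcessor.
image_seq_length = 1024
image_token = "<image>"
model_max_length = 4096   # from model_cfgs in the wandb config.yaml below

prompt = f"{image_token} Which caption better matches this picture?"
expanded = prompt.replace(image_token, image_token * image_seq_length, 1)

print(expanded.count(image_token))          # 1024 image-token slots
print(model_max_length - image_seq_length)  # 3072 positions left for text
```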
slice_400/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d45286d89bc63b921ceef6df439a1bda7c4537d46f14ecab8a5b77fe81bdcde0
3
+ size 14086366378
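This is a Git LFS pointer: the oid is the SHA-256 of the real ~14.1 GB checkpoint and size is its byte count. A small sketch for verifying a downloaded copy against the pointer (the local path is just an example):

```python
# Verify a downloaded slice_400/pytorch_model.bin against the LFS pointer above.
import hashlib
from pathlib import Path

EXPECTED_SHA256 = "d45286d89bc63b921ceef6df439a1bda7c4537d46f14ecab8a5b77fe81bdcde0"
EXPECTED_SIZE = 14086366378  # bytes (~14.1 GB)

path = Path("slice_400/pytorch_model.bin")   # example local path
assert path.stat().st_size == EXPECTED_SIZE, "size mismatch"

sha = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == EXPECTED_SHA256, "checksum mismatch"
print("LFS object verified")
```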
slice_400/script.sh ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright 2024 PKU-Alignment Team. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # ==============================================================================
17
+
18
+ export CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
19
+ export CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
20
+
21
+ export TRITON_CACHE_DIR="/home/align-anything/cache/triton"
22
+
23
+ export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
24
+ export WANDB_MODE=online
25
+
26
+ MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/chameleon-7b"
27
+
28
+ DATASET_PATH=(
29
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized"
30
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cosi_new_step10/tokenized"
31
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_l0_new_step10/tokenized"
32
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_random/tokenized"
33
+ )
34
+
35
+ DATASET_NAME=(
36
+ "q0_10_preference"
37
+ "q0_20_preference"
38
+ "q0_30_preference"
39
+ "q0_40_preference"
40
+ "q0_50_preference"
41
+ "q0_60_preference"
42
+ "q0_70_preference"
43
+ "q0_80_preference"
44
+ "q0_90_preference"
45
+ )
46
+
47
+ OUTPUT_PATH="/data/align-anything/hantao/align-anything/outputs/mm_interp"
48
+ mkdir -p $OUTPUT_PATH
49
+
50
+ # Initialize variables
51
+
52
+ for dataset_path in ${DATASET_PATH[@]}; do
53
+ for dataset_name in ${DATASET_NAME[@]}; do
54
+ TRAIN_DATASETS=$dataset_path
55
+
56
+ # dataset middle name
57
+ middle_name=$(echo "$dataset_path" | awk -F'/' '{print $(NF-1)}')
58
+ OUTPUT_DIR=$OUTPUT_PATH/$middle_name/$dataset_name
59
+ mkdir -p $OUTPUT_DIR
60
+ echo "Training on $TRAIN_DATASETS, output to $OUTPUT_DIR"
61
+ # Source the setup script
62
+ source ./setup.sh
63
+
64
+ # Execute deepspeed command
65
+ deepspeed \
66
+ --master_port ${MASTER_PORT} \
67
+ --module align_anything.trainers.text_image_to_text_image.dpo \
68
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
69
+ --train_datasets ${TRAIN_DATASETS} \
70
+ --output_dir ${OUTPUT_DIR} \
71
+ --per_device_train_batch_size 4 \
72
+ --per_device_eval_batch_size 4 \
73
+ --gradient_accumulation_steps 2 \
74
+ --train_template Chameleon_preference \
75
+ --train_split train \
76
+ --train_data_files ${dataset_name}.pt \
77
+ --learning_rate 1e-6 \
78
+ --epochs 3 \
79
+ --lr_scheduler_type cosine \
80
+ --save_interval 400
81
+
82
+ bash /data/align-anything/hantao/align-anything/outputs/cut.sh $OUTPUT_DIR
83
+ done
84
+ done
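For context, the launch above runs on 8 GPUs (WORLD_SIZE=8 in environ.txt) with per-device batch size 4 and gradient accumulation 2, i.e. 64 samples per optimizer update. --save_interval 400 appears to be counted in the trainer's micro-batch steps, which is why the checkpoints in the output.log below are tagged global_step200 and global_step400. A quick arithmetic sketch:

```python
# Back-of-the-envelope numbers for this launch (values from script.sh and environ.txt).
per_device_train_batch_size = 4
world_size = 8                      # WORLD_SIZE in environ.txt
gradient_accumulation_steps = 2

samples_per_update = per_device_train_batch_size * world_size * gradient_accumulation_steps
print(samples_per_update)           # 64

save_interval = 400                 # counted in micro-batch steps (inferred from the logs)
print(save_interval // gradient_accumulation_steps)  # 200 -> DeepSpeed tag global_step200
```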
slice_400/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "<reserved08706>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "<unk>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
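A minimal check (assuming the slice directory has been downloaded locally) that the saved tokenizer exposes exactly these special tokens, including the repurposed <reserved08706> separator:

```python
# Load the tokenizer shipped with this slice and inspect its special tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("slice_400")  # example local path
print(tokenizer.bos_token, tokenizer.eos_token)         # <s> </s>
print(tokenizer.pad_token)                              # <pad>
print(tokenizer.sep_token)                              # <reserved08706>
print(tokenizer.unk_token)                              # <unk>
```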
slice_400/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_400/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_400/wandb/debug-internal.log ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
3
+ {"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
5
+ {"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
6
+ {"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
7
+ {"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
8
+ {"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
9
+ {"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
11
+ {"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
12
+ {"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
13
+ {"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
15
+ {"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
16
+ {"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
17
+ {"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
18
+ {"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
19
+ {"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
20
+ {"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
21
+ {"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
22
+ {"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
slice_400/wandb/debug.log ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
3
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
8
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
9
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
11
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
12
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
13
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
15
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
16
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
17
+ 2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
19
+ 2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
20
+ 2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
22
+ 2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
27
+ 2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
28
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
29
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
30
+ 2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
31
+ 2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
32
+ 2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
33
+ 2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
slice_400/wandb/run-20250101_084116-coewtb43/files/config.yaml ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.3
4
+ m: []
5
+ python_version: 3.11.10
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 11
10
+ - 41
11
+ - 49
12
+ - 51
13
+ - 55
14
+ - 71
15
+ - 83
16
+ - 98
17
+ - 105
18
+ "2":
19
+ - 1
20
+ - 11
21
+ - 41
22
+ - 49
23
+ - 51
24
+ - 55
25
+ - 71
26
+ - 83
27
+ - 98
28
+ - 105
29
+ "3":
30
+ - 2
31
+ - 13
32
+ - 16
33
+ - 23
34
+ - 55
35
+ - 61
36
+ "4": 3.11.10
37
+ "5": 0.18.3
38
+ "6": 4.45.2
39
+ "8":
40
+ - 5
41
+ "12": 0.18.3
42
+ "13": linux-x86_64
43
+ data_cfgs:
44
+ value:
45
+ eval_data_files: null
46
+ eval_datasets: null
47
+ eval_optional_args: []
48
+ eval_size: null
49
+ eval_split: null
50
+ eval_subset: null
51
+ eval_template: null
52
+ train_data_files: q0_40_preference.pt
53
+ train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
54
+ train_optional_args: []
55
+ train_size: null
56
+ train_split: train
57
+ train_subset: null
58
+ train_template: Chameleon_preference
59
+ logger_cfgs:
60
+ value:
61
+ cache_dir: null
62
+ log_project: align-anything
63
+ log_run_name: dpo
64
+ log_type: wandb
65
+ output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
66
+ save_interval: 400
67
+ model_cfgs:
68
+ value:
69
+ model_max_length: 4096
70
+ model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
71
+ trust_remote_code: true
72
+ special_tokens:
73
+ value: null
74
+ train_cfgs:
75
+ value:
76
+ adam_betas:
77
+ - 0.9
78
+ - 0.95
79
+ bf16: true
80
+ ds_cfgs: ds_z3_config.json
81
+ epochs: 3
82
+ eval_interval: 10
83
+ eval_strategy: epoch
84
+ fp16: false
85
+ freeze_language_model: true
86
+ freeze_mm_proj: true
87
+ freeze_vision_tower: false
88
+ gradient_accumulation_steps: 2
89
+ gradient_checkpointing: true
90
+ learning_rate: 1e-06
91
+ lr_scheduler_type: cosine
92
+ lr_warmup_ratio: 0.03
93
+ per_device_eval_batch_size: 4
94
+ per_device_train_batch_size: 4
95
+ regularization: 0.001
96
+ scale_coeff: 0.1
97
+ seed: 42
98
+ weight_decay: 0.01
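The hyperparameters above, together with the progress bar in the output.log below (1422 micro steps for 3 epochs), pin down the learning-rate schedule: 1422 / 2 accumulation steps = 711 optimizer updates, lr_warmup_ratio 0.03 gives about 21 warmup steps, and cosine decay follows. A rough reconstruction (hedged; exact rounding is up to the trainer) that reproduces the lr values logged by DeepSpeed:

```python
# Rough reconstruction of the warmup + cosine schedule implied by train_cfgs.
import math

total_micro_steps = 1422            # from the output.log progress bar
grad_accum = 2
total_updates = total_micro_steps // grad_accum   # 711
warmup_steps = int(0.03 * total_updates)          # 21
peak_lr = 1e-6

def lr_at(step: int) -> float:
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    progress = (step - warmup_steps) / max(1, total_updates - warmup_steps)
    return 0.5 * peak_lr * (1.0 + math.cos(math.pi * progress))

print(lr_at(10))    # ~4.76e-07, matching step=10 in output.log
print(lr_at(30))    # ~9.996e-07, matching step=30
```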
slice_400/wandb/run-20250101_084116-coewtb43/files/output.log ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ***** Running training *****
2
+ Training 1/3.0 epoch: 0%| | 0/1422.0 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
3
+ Training 1/3.0 epoch (loss 11.8749): 21%|██████████████████████████████████████████████████▋ | 299/1422.0 [43:24<2:59:40, 9.60s/it]
4
+ [2025-01-01 08:42:55,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
5
+ [2025-01-01 08:43:25,944] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
6
+ [2025-01-01 08:44:17,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[4.761904761904761e-07, 4.761904761904761e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
7
+ [2025-01-01 08:45:03,075] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
8
+ [2025-01-01 08:47:04,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.523809523809522e-07, 9.523809523809522e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
9
+ [2025-01-01 08:48:02,529] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
10
+ [2025-01-01 08:48:43,904] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
11
+ [2025-01-01 08:50:07,690] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[9.995802740501932e-07, 9.995802740501932e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
12
+ [2025-01-01 08:51:09,012] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
13
+ [2025-01-01 08:53:09,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[9.98130274211278e-07, 9.98130274211278e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
14
+ [2025-01-01 08:55:49,714] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
15
+ [2025-01-01 08:55:49,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[9.956478233113064e-07, 9.956478233113064e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
16
+ [2025-01-01 08:58:16,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[9.921380666088558e-07, 9.921380666088558e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
17
+ [2025-01-01 09:01:07,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[9.876082786106545e-07, 9.876082786106545e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
18
+ [2025-01-01 09:01:27,047] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
19
+ [2025-01-01 09:04:03,792] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
20
+ [2025-01-01 09:04:24,181] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
21
+ [2025-01-01 09:04:24,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[9.820678479940571e-07, 9.820678479940571e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
22
+ [2025-01-01 09:05:30,688] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
23
+ [2025-01-01 09:07:12,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[9.755282581475767e-07, 9.755282581475767e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
24
+ [2025-01-01 09:09:19,110] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
25
+ [2025-01-01 09:10:10,785] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
26
+ [2025-01-01 09:10:30,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[9.68003063369808e-07, 9.68003063369808e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
27
+ [2025-01-01 09:10:51,320] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
28
+ [2025-01-01 09:13:33,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[9.595078607760749e-07, 9.595078607760749e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
29
+ [2025-01-01 09:14:56,362] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
30
+ [2025-01-01 09:16:12,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[9.500602579710255e-07, 9.500602579710255e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
31
+ [2025-01-01 09:17:52,852] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
32
+ [2025-01-01 09:19:16,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[9.39679836554184e-07, 9.39679836554184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
33
+ [2025-01-01 09:21:51,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[9.283881115340955e-07, 9.283881115340955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
34
+ [2025-01-01 09:22:22,638] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
35
+ [2025-01-01 09:24:48,036] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[9.16208486735184e-07, 9.16208486735184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
36
+ [2025-01-01 09:25:39,046] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
37
+ [2025-01-01 09:27:38,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[9.03166206289754e-07, 9.03166206289754e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
38
+ [2025-01-01 09:28:42,774] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
39
+ [2025-01-01 09:29:54,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[8.8928830231567e-07, 8.8928830231567e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
40
+ [2025-01-01 09:30:34,992] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
41
+ [2025-01-01 09:31:34,787] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
42
+ [2025-01-01 09:32:15,776] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
43
+ [2025-01-01 09:32:50,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[8.746035388881654e-07, 8.746035388881654e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
44
+ [2025-01-01 09:35:09,447] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
45
+ [2025-01-01 09:35:48,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[8.591423524219029e-07, 8.591423524219029e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
46
+ [2025-01-01 09:36:31,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
47
+ [2025-01-01 09:37:59,484] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
48
+ [2025-01-01 09:38:30,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[8.429367885868581e-07, 8.429367885868581e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
49
+ Saving checkpoint at step 400 ...
50
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
51
+ Saving 16-bit model...
52
+ [2025-01-01 09:38:39,848] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
53
+ [2025-01-01 09:38:39,849] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
54
+ [2025-01-01 09:38:39,850] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
55
+ [2025-01-01 09:38:58,770] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
56
+ [2025-01-01 09:38:58,773] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
57
+ Model saved!
58
+ Saving 16-bit model...
59
+ [2025-01-01 09:39:05,477] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
60
+ [2025-01-01 09:39:05,478] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
61
+ [2025-01-01 09:39:05,479] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
62
+ [2025-01-01 09:39:27,199] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
63
+ [2025-01-01 09:39:27,202] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
64
+ Model saved!
65
+ Checkpoint saved.
66
+ [2025-01-01 09:42:08,747] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[8.260204358887753e-07, 8.260204358887753e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
67
+ [2025-01-01 09:43:59,051] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
68
+ [2025-01-01 09:44:19,070] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
69
+ [2025-01-01 09:45:02,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[8.084283560518583e-07, 8.084283560518583e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
70
+ [2025-01-01 09:47:51,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[7.901970113479955e-07, 7.901970113479955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
71
+ [2025-01-01 09:48:18,628] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
72
+ [2025-01-01 09:50:44,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[7.713641890231308e-07, 7.713641890231308e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
73
+ [2025-01-01 09:51:29,985] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
74
+ [2025-01-01 09:52:00,601] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
75
+ [2025-01-01 09:53:37,616] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
76
+ [2025-01-01 09:53:37,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[7.51968922977428e-07, 7.51968922977428e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
77
+ [2025-01-01 09:56:36,667] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
78
+ [2025-01-01 09:56:36,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[7.320514128615511e-07, 7.320514128615511e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
79
+ [2025-01-01 09:57:18,253] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
80
+ [2025-01-01 09:59:22,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[7.116529407567488e-07, 7.116529407567488e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
81
+ [2025-01-01 09:59:42,581] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
82
+ [2025-01-01 10:02:27,253] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[6.908157856114392e-07, 6.908157856114392e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
83
+ [2025-01-01 10:04:22,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
84
+ [2025-01-01 10:05:15,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[6.695831356116303e-07, 6.695831356116303e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
85
+ [2025-01-01 10:07:32,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[6.479989986668117e-07, 6.479989986668117e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
86
+ [2025-01-01 10:09:58,442] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
87
+ [2025-01-01 10:10:45,662] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[6.261081111968403e-07, 6.261081111968403e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
88
+ [2025-01-01 10:12:34,690] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
89
+ [2025-01-01 10:12:55,185] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
90
+ [2025-01-01 10:13:41,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[6.039558454088795e-07, 6.039558454088795e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
91
+ [2025-01-01 10:14:01,438] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
92
+ [2025-01-01 10:16:42,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[5.815881152565711e-07, 5.815881152565711e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
93
+ [2025-01-01 10:17:49,827] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
94
+ [2025-01-01 10:18:41,651] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
95
+ [2025-01-01 10:19:22,050] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
96
+ [2025-01-01 10:20:00,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[5.590512812763541e-07, 5.590512812763541e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
97
+ [2025-01-01 10:22:48,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[5.363920544981748e-07, 5.363920544981748e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
98
+ [2025-01-01 10:23:27,066] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
99
+ [2025-01-01 10:25:42,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[5.136573996297429e-07, 5.136573996297429e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
100
+ [2025-01-01 10:26:22,366] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
101
+ [2025-01-01 10:28:45,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[4.908944377150043e-07, 4.908944377150043e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
102
+ [2025-01-01 10:30:51,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
103
+ [2025-01-01 10:31:03,983] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[4.681503484685803e-07, 4.681503484685803e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
104
+ [2025-01-01 10:34:07,594] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
105
+ [2025-01-01 10:34:07,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[4.454722724886051e-07, 4.454722724886051e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
106
+ [2025-01-01 10:36:51,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[4.229072135506384e-07, 4.229072135506384e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
107
+ Saving checkpoint at step 800 ...
108
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
109
+ Saving 16-bit model...
110
+ [2025-01-01 10:37:00,920] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
111
+ [2025-01-01 10:37:00,921] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
112
+ [2025-01-01 10:37:00,921] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
113
+ [2025-01-01 10:37:17,303] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
114
+ [2025-01-01 10:37:17,305] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
115
+ Model saved!
116
+ Saving 16-bit model...
117
+ [2025-01-01 10:37:24,304] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
118
+ [2025-01-01 10:37:24,306] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
119
+ [2025-01-01 10:37:24,306] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
120
+ [2025-01-01 10:37:47,861] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
121
+ [2025-01-01 10:37:47,862] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
122
+ Model saved!
123
+ Checkpoint saved.
124
+ [2025-01-01 10:38:06,545] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
125
+ [2025-01-01 10:39:58,968] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
126
+ [2025-01-01 10:40:18,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[4.005019411851609e-07, 4.005019411851609e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
127
+ [2025-01-01 10:40:58,615] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
128
+ [2025-01-01 10:41:39,701] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
129
+ [2025-01-01 10:43:08,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[3.783028937405821e-07, 3.783028937405821e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
130
+ [2025-01-01 10:44:32,585] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
131
+ [2025-01-01 10:45:54,172] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
132
+ [2025-01-01 10:45:54,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[3.563560821326706e-07, 3.563560821326706e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
133
+ [2025-01-01 10:47:22,278] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
134
+ [2025-01-01 10:48:37,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[3.3470699447990527e-07, 3.3470699447990527e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
135
+ [2025-01-01 10:51:26,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[3.1340050182240436e-07, 3.1340050182240436e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
136
+ [2025-01-01 10:52:26,337] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
137
+ [2025-01-01 10:52:46,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
138
+ [2025-01-01 10:54:14,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[2.92480765119841e-07, 2.92480765119841e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
139
+ [2025-01-01 10:56:46,300] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
140
+ [2025-01-01 10:57:28,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[2.719911437211122e-07, 2.719911437211122e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
141
+ [2025-01-01 10:59:56,353] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
142
+ [2025-01-01 10:59:56,354] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[2.5197410549546595e-07, 2.5197410549546595e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
143
+ [2025-01-01 11:00:26,971] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
144
+ [2025-01-01 11:02:03,646] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
145
+ [2025-01-01 11:02:54,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[2.3247113881135781e-07, 2.3247113881135781e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
146
+ [2025-01-01 11:05:02,388] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
147
+ [2025-01-01 11:05:43,792] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
148
+ [2025-01-01 11:05:55,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[2.1352266654547125e-07, 2.1352266654547125e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
149
+ [2025-01-01 11:08:07,702] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
150
+ [2025-01-01 11:08:38,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[1.9516796230013272e-07, 1.9516796230013272e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
151
+ [2025-01-01 11:11:35,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[1.774450690027746e-07, 1.774450690027746e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
152
+ [2025-01-01 11:12:47,870] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
153
+ [2025-01-01 11:14:16,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[1.6039072005615716e-07, 1.6039072005615716e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
154
+ [2025-01-01 11:17:00,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[1.4404026320278317e-07, 1.4404026320278317e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
155
+ [2025-01-01 11:18:23,592] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
156
+ [2025-01-01 11:19:58,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[1.284275872613028e-07, 1.284275872613028e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
157
+ [2025-01-01 11:20:59,657] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
158
+ [2025-01-01 11:21:20,032] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
159
+ [2025-01-01 11:22:26,091] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
160
+ [2025-01-01 11:23:04,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[1.1358505188676288e-07, 1.1358505188676288e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
161
+ [2025-01-01 11:26:14,637] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
162
+ [2025-01-01 11:26:14,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[9.95434205002792e-08, 9.95434205002792e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
163
+ [2025-01-01 11:27:06,507] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
164
+ [2025-01-01 11:27:47,054] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
165
+ [2025-01-01 11:29:24,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[8.633179652714916e-08, 8.633179652714916e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
166
+ [2025-01-01 11:31:52,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
167
+ [2025-01-01 11:32:11,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[7.397756307555885e-08, 7.397756307555885e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
168
+ [2025-01-01 11:34:47,839] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
169
+ [2025-01-01 11:34:59,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[6.250632618090867e-08, 6.250632618090867e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
170
+ Saving checkpoint at step 1200 ...
171
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
172
+ Saving 16-bit model...
173
+ [2025-01-01 11:35:08,703] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
174
+ [2025-01-01 11:35:08,704] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
175
+ [2025-01-01 11:35:08,704] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
176
+ [2025-01-01 11:35:25,316] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
177
+ [2025-01-01 11:35:25,317] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
178
+ Model saved!
179
+ Saving 16-bit model...
180
+ [2025-01-01 11:35:32,446] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
181
+ [2025-01-01 11:35:32,447] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
182
+ [2025-01-01 11:35:32,447] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
183
+ [2025-01-01 11:35:53,847] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
184
+ [2025-01-01 11:35:53,849] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
185
+ Model saved!
186
+ Checkpoint saved.
187
+ [2025-01-01 11:38:41,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[5.194186173339599e-08, 5.194186173339599e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
188
+ [2025-01-01 11:40:10,376] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
189
+ [2025-01-01 11:41:25,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[4.230606619885108e-08, 4.230606619885108e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
190
+ [2025-01-01 11:43:26,712] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
191
+ [2025-01-01 11:44:25,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[3.3618911234968236e-08, 3.3618911234968236e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
192
+ [2025-01-01 11:46:29,769] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
193
+ [2025-01-01 11:46:53,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[2.589840229699558e-08, 2.589840229699558e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
194
+ [2025-01-01 11:48:22,179] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
195
+ [2025-01-01 11:49:22,087] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
196
+ [2025-01-01 11:49:35,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[1.9160541318679224e-08, 1.9160541318679224e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
197
+ [2025-01-01 11:50:02,959] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
198
+ [2025-01-01 11:52:16,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[1.3419293545812338e-08, 1.3419293545812338e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
199
+ [2025-01-01 11:52:56,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
200
+ [2025-01-01 11:54:18,301] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
201
+ [2025-01-01 11:55:03,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[8.686558591130156e-09, 8.686558591130156e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
202
+ [2025-01-01 11:55:46,245] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
203
+ [2025-01-01 11:57:47,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[4.972145770545999e-09, 4.972145770545999e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
204
+ [2025-01-01 12:00:50,391] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
205
+ [2025-01-01 12:00:50,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[2.283753771845587e-09, 2.283753771845587e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
206
+ [2025-01-01 12:01:10,430] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
207
+ [2025-01-01 12:03:22,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[6.269546979813523e-10, 6.269546979813523e-10], mom=[[0.9, 0.95], [0.9, 0.95]]
208
+ [2025-01-01 12:05:10,192] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
209
+ [2025-01-01 12:06:28,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[5.182518037827321e-12, 5.182518037827321e-12], mom=[[0.9, 0.95], [0.9, 0.95]]
210
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
211
+ Saving 16-bit model...
212
+ [2025-01-01 12:06:57,036] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
213
+ [2025-01-01 12:06:57,037] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
214
+ [2025-01-01 12:06:57,037] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
215
+ [2025-01-01 12:07:17,768] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
216
+ [2025-01-01 12:07:17,770] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
217
+ Model saved!
218
+ Saving 16-bit model...
219
+ [2025-01-01 12:07:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
220
+ [2025-01-01 12:07:24,459] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
221
+ [2025-01-01 12:07:24,459] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
222
+ [2025-01-01 12:07:46,734] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
223
+ [2025-01-01 12:07:46,737] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
224
+ Model saved!
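
The warnings repeated throughout this log come from DeepSpeed ZeRO-3 under memory pressure, and they themselves suggest the mitigation: call `get_accelerator().empty_cache()` inside the training loop so all ranks flush the PyTorch allocator cache at the same time. Below is a minimal sketch of that suggestion, assuming a DeepSpeed engine named `engine`, a dataloader named `train_dataloader`, and a flush interval `EMPTY_CACHE_EVERY` (all hypothetical names, not taken from this repository):

```python
# Sketch of the mitigation named in the stage3.py warning above: periodically flush
# the PyTorch allocator cache on every rank at the same point in the loop.
from deepspeed.accelerator import get_accelerator

EMPTY_CACHE_EVERY = 50  # assumed interval; tune to the observed memory pressure


def train_epoch(engine, train_dataloader):
    for step, batch in enumerate(train_dataloader):
        loss = engine(**batch).loss   # forward pass on the DeepSpeed engine
        engine.backward(loss)         # DeepSpeed-managed backward
        engine.step()                 # ZeRO-3 optimizer step

        # Synchronized cache flush across ranks, as the warning recommends.
        if step % EMPTY_CACHE_EVERY == 0:
            get_accelerator().empty_cache()
```

Flushing too often trades throughput for headroom, so the interval is a judgment call; the warning only matters if the flushes happen on most steps, as they do in stretches of this log.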
slice_400/wandb/run-20250101_084116-coewtb43/files/requirements.txt ADDED
@@ -0,0 +1,248 @@
1
+ align-anything==0.0.1.dev0
2
+ gitdb==4.0.11
3
+ wcwidth==0.2.13
4
+ identify==2.6.1
5
+ tomlkit==0.12.0
6
+ bitsandbytes==0.44.1
7
+ trl==0.9.6
8
+ pytest-split==0.8.0
9
+ gradio==4.44.1
10
+ pip==24.2
11
+ multidict==6.1.0
12
+ fairscale==0.4.13
13
+ mistral_common==1.4.4
14
+ python-dotenv==1.0.1
15
+ uvloop==0.20.0
16
+ absl-py==2.1.0
17
+ tiktoken==0.7.0
18
+ pydub==0.25.1
19
+ websockets==12.0
20
+ llamafactory==0.9.1.dev0
21
+ triton==3.0.0
22
+ tifffile==2024.9.20
23
+ safe-rlhf==0.0.1.dev0
24
+ pandas==2.2.3
25
+ grpcio==1.66.2
26
+ click==8.1.7
27
+ ninja==1.11.1.1
28
+ rich==13.9.2
29
+ Jinja2==3.1.4
30
+ Pygments==2.18.0
31
+ nvidia-cudnn-cu12==9.1.0.70
32
+ importlib_resources==6.4.5
33
+ GitPython==3.1.43
34
+ nvidia-cufft-cu12==11.0.2.54
35
+ tensorboard-data-server==0.7.2
36
+ align-anything==0.0.1.dev0
37
+ six==1.16.0
38
+ scipy==1.14.1
39
+ mpmath==1.3.0
40
+ jsonschema-specifications==2024.10.1
41
+ scikit-image==0.24.0
42
+ zipp==3.20.2
43
+ cycler==0.12.1
44
+ MarkupSafe==2.1.5
45
+ tzdata==2024.2
46
+ idna==3.10
47
+ pycountry==24.6.1
48
+ nvidia-nccl-cu12==2.20.5
49
+ matplotlib==3.9.2
50
+ pytz==2024.2
51
+ uvicorn==0.31.1
52
+ dill==0.3.8
53
+ pyparsing==3.1.4
54
+ pytest==7.2.0
55
+ jiter==0.6.1
56
+ safetensors==0.4.5
57
+ typing_extensions==4.12.2
58
+ decorator==4.4.2
59
+ typeguard==4.4.1
60
+ prometheus_client==0.21.0
61
+ nvidia-cuda-cupti-cu12==12.1.105
62
+ sentencepiece==0.2.0
63
+ requests==2.32.3
64
+ kiwisolver==1.4.7
65
+ gdown==5.2.0
66
+ multiprocess==0.70.16
67
+ xxhash==3.5.0
68
+ PyYAML==6.0.2
69
+ gguf==0.10.0
70
+ nvidia-nvtx-cu12==12.1.105
71
+ hpsv2==1.2.0
72
+ tensorboard==2.18.0
73
+ nodeenv==1.9.1
74
+ filelock==3.16.1
75
+ distro==1.9.0
76
+ scikit-learn==1.5.2
77
+ huggingface-hub==0.25.2
78
+ pyairports==2.1.1
79
+ importlib_metadata==8.5.0
80
+ pyarrow==17.0.0
81
+ llvmlite==0.43.0
82
+ ray==2.37.0
83
+ tokenizers==0.20.3
84
+ nvidia-nvjitlink-cu12==12.6.77
85
+ av==14.0.1
86
+ deepspeed==0.15.2
87
+ clip==0.2.0
88
+ shtab==1.7.1
89
+ certifi==2024.8.30
90
+ braceexpand==0.1.7
91
+ nvidia-ml-py==12.560.30
92
+ webdataset==0.2.100
93
+ docker-pycreds==0.4.0
94
+ einops==0.8.0
95
+ iniconfig==2.0.0
96
+ tyro==0.9.2
97
+ torchvision==0.19.0
98
+ accelerate==0.34.2
99
+ beautifulsoup4==4.12.3
100
+ pyzmq==26.2.0
101
+ pycparser==2.22
102
+ nvidia-curand-cu12==10.3.2.106
103
+ msgpack==1.1.0
104
+ soxr==0.5.0.post1
105
+ platformdirs==4.3.6
106
+ h11==0.14.0
107
+ psutil==6.0.0
108
+ pydantic==2.9.2
109
+ shellingham==1.5.4
110
+ imageio-ffmpeg==0.5.1
111
+ wandb==0.18.3
112
+ audioread==3.0.1
113
+ annotated-types==0.7.0
114
+ docstring_parser==0.16
115
+ cloudpickle==3.1.0
116
+ regex==2024.9.11
117
+ packaging==24.1
118
+ timm==0.6.13
119
+ aiosignal==1.3.1
120
+ numba==0.60.0
121
+ orjson==3.10.7
122
+ rpds-py==0.20.0
123
+ virtualenv==20.26.6
124
+ joblib==1.4.2
125
+ charset-normalizer==3.4.0
126
+ httpx==0.27.2
127
+ ffmpy==0.4.0
128
+ lm-format-enforcer==0.10.6
129
+ yt-dlp==2024.8.6
130
+ sympy==1.13.3
131
+ python-dateutil==2.9.0.post0
132
+ nvidia-cusolver-cu12==11.4.5.107
133
+ msgspec==0.18.6
134
+ mdurl==0.1.2
135
+ torch==2.4.0
136
+ fastapi==0.115.0
137
+ optree==0.13.0
138
+ PySocks==1.7.1
139
+ transformers==4.46.0.dev0
140
+ torchlibrosa==0.1.0
141
+ fsspec==2024.6.1
142
+ nvidia-cublas-cu12==12.1.3.1
143
+ gradio_client==1.3.0
144
+ args==0.1.0
145
+ cffi==1.17.1
146
+ fonttools==4.54.1
147
+ clint==0.5.1
148
+ lark==1.2.2
149
+ tqdm==4.66.5
150
+ semantic-version==2.10.0
151
+ pooch==1.8.2
152
+ markdown-it-py==3.0.0
153
+ pydantic_core==2.23.4
154
+ sniffio==1.3.1
155
+ httptools==0.6.1
156
+ nvidia-cuda-runtime-cu12==12.1.105
157
+ anyio==4.6.0
158
+ ftfy==6.3.0
159
+ Markdown==3.7
160
+ datasets==2.21.0
161
+ diffusers==0.30.3
162
+ nvidia-cuda-nvrtc-cu12==12.1.105
163
+ vllm==0.6.2
164
+ starlette==0.38.6
165
+ flash-attn==2.7.0.post2
166
+ urllib3==2.2.3
167
+ Werkzeug==3.0.4
168
+ py-cpuinfo==9.0.0
169
+ moviepy==1.0.3
170
+ librosa==0.10.2.post1
171
+ peft==0.12.0
172
+ soupsieve==2.6
173
+ lazy_loader==0.4
174
+ pluggy==1.5.0
175
+ setuptools==75.1.0
176
+ sentry-sdk==2.16.0
177
+ tabulate==0.9.0
178
+ transformers==4.45.2
179
+ pre_commit==4.0.1
180
+ termcolor==2.5.0
181
+ frechet-audio-distance==0.1.2
182
+ pytorch-fid==0.3.0
183
+ setproctitle==1.3.3
184
+ jsonschema==4.23.0
185
+ aiofiles==23.2.1
186
+ contourpy==1.3.0
187
+ distlib==0.3.9
188
+ interegular==0.3.3
189
+ fire==0.7.0
190
+ diskcache==5.6.3
191
+ proglog==0.1.10
192
+ soundfile==0.12.1
193
+ protobuf==3.20.3
194
+ smmap==5.0.1
195
+ pycryptodomex==3.21.0
196
+ Brotli==1.1.0
197
+ pillow==10.4.0
198
+ frozenlist==1.4.1
199
+ numpy==1.26.4
200
+ mutagen==1.47.0
201
+ outlines==0.0.46
202
+ attrs==24.2.0
203
+ torchaudio==2.4.0
204
+ aiohttp==3.10.10
205
+ ruff==0.6.9
206
+ watchfiles==0.24.0
207
+ threadpoolctl==3.5.0
208
+ nest-asyncio==1.6.0
209
+ partial-json-parser==0.2.1.1.post4
210
+ sse-starlette==2.1.3
211
+ shortuuid==1.0.13
212
+ typer==0.12.5
213
+ prometheus-fastapi-instrumentator==7.0.0
214
+ imageio==2.35.1
215
+ wheel==0.44.0
216
+ image-reward==1.5
217
+ networkx==3.4.1
218
+ propcache==0.2.0
219
+ aiohappyeyeballs==2.4.3
220
+ nvidia-cusparse-cu12==12.1.0.106
221
+ xformers==0.0.27.post2
222
+ cfgv==3.4.0
223
+ python-multipart==0.0.12
224
+ httpcore==1.0.6
225
+ opencv-python==4.6.0.66
226
+ resampy==0.4.3
227
+ yarl==1.15.0
228
+ referencing==0.35.1
229
+ openai==1.51.2
230
+ hjson==3.1.0
231
+ llamafactory==0.9.1.dev0
232
+ jaraco.collections==5.1.0
233
+ backports.tarfile==1.2.0
234
+ more-itertools==10.3.0
235
+ wheel==0.43.0
236
+ importlib_metadata==8.0.0
237
+ zipp==3.19.2
238
+ autocommand==2.2.2
239
+ jaraco.functools==4.0.1
240
+ platformdirs==4.2.2
241
+ tomli==2.0.1
242
+ jaraco.text==3.12.1
243
+ typing_extensions==4.12.2
244
+ jaraco.context==5.3.0
245
+ importlib_resources==6.4.0
246
+ packaging==24.1
247
+ inflect==7.3.1
248
+ typeguard==4.3.0
slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json ADDED
@@ -0,0 +1,112 @@
1
+ {
2
+ "os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.31",
3
+ "python": "3.11.10",
4
+ "startedAt": "2025-01-01T08:41:16.157770Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/data/align-anything/hantao/models/chameleon-7b",
9
+ "--train_datasets",
10
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized",
11
+ "--output_dir",
12
+ "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
13
+ "--per_device_train_batch_size",
14
+ "4",
15
+ "--per_device_eval_batch_size",
16
+ "4",
17
+ "--gradient_accumulation_steps",
18
+ "2",
19
+ "--train_template",
20
+ "Chameleon_preference",
21
+ "--train_split",
22
+ "train",
23
+ "--train_data_files",
24
+ "q0_40_preference.pt",
25
+ "--learning_rate",
26
+ "1e-6",
27
+ "--epochs",
28
+ "3",
29
+ "--lr_scheduler_type",
30
+ "cosine",
31
+ "--save_interval",
32
+ "400"
33
+ ],
34
+ "program": "-m align_anything.trainers.text_image_to_text_image.dpo",
35
+ "git": {
36
+ "remote": "https://github.com/PKU-Alignment/align-anything.git",
37
+ "commit": "6fde660afc9985323f147930eedf188a5699adc7"
38
+ },
39
+ "email": "[email protected]",
40
+ "root": "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
41
+ "host": "lyg0194",
42
+ "username": "align-anything",
43
+ "executable": "/data/align-anything/miniconda3/envs/hantao_stable/bin/python",
44
+ "cpu_count": 64,
45
+ "cpu_count_logical": 128,
46
+ "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
47
+ "gpu_count": 8,
48
+ "disk": {
49
+ "/": {
50
+ "total": "939477946368",
51
+ "used": "596714827776"
52
+ }
53
+ },
54
+ "memory": {
55
+ "total": "1081823907840"
56
+ },
57
+ "cpu": {
58
+ "count": 64,
59
+ "countLogical": 128
60
+ },
61
+ "gpu_nvidia": [
62
+ {
63
+ "name": "NVIDIA A100-SXM4-80GB",
64
+ "memoryTotal": "85899345920",
65
+ "cudaCores": 6912,
66
+ "architecture": "Ampere"
67
+ },
68
+ {
69
+ "name": "NVIDIA A100-SXM4-80GB",
70
+ "memoryTotal": "85899345920",
71
+ "cudaCores": 6912,
72
+ "architecture": "Ampere"
73
+ },
74
+ {
75
+ "name": "NVIDIA A100-SXM4-80GB",
76
+ "memoryTotal": "85899345920",
77
+ "cudaCores": 6912,
78
+ "architecture": "Ampere"
79
+ },
80
+ {
81
+ "name": "NVIDIA A100-SXM4-80GB",
82
+ "memoryTotal": "85899345920",
83
+ "cudaCores": 6912,
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A100-SXM4-80GB",
88
+ "memoryTotal": "85899345920",
89
+ "cudaCores": 6912,
90
+ "architecture": "Ampere"
91
+ },
92
+ {
93
+ "name": "NVIDIA A100-SXM4-80GB",
94
+ "memoryTotal": "85899345920",
95
+ "cudaCores": 6912,
96
+ "architecture": "Ampere"
97
+ },
98
+ {
99
+ "name": "NVIDIA A100-SXM4-80GB",
100
+ "memoryTotal": "85899345920",
101
+ "cudaCores": 6912,
102
+ "architecture": "Ampere"
103
+ },
104
+ {
105
+ "name": "NVIDIA A100-SXM4-80GB",
106
+ "memoryTotal": "85899345920",
107
+ "cudaCores": 6912,
108
+ "architecture": "Ampere"
109
+ }
110
+ ],
111
+ "cudaVersion": "12.4"
112
+ }
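
The metadata above records the module and argument list wandb captured for this run. A minimal sketch for reconstructing an approximately equivalent launch command from it (the `--local_rank=0` flag was injected by the distributed launcher, so the rebuilt command is indicative rather than exact; the file path is simply this file's location within the upload):

```python
# Sketch: rebuild the recorded launch command from wandb-metadata.json.
import json
import shlex

with open("slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json") as f:
    meta = json.load(f)

# "program" is "-m align_anything.trainers.text_image_to_text_image.dpo",
# so the run was started as `python -m ...` with the recorded args appended.
cmd = ["python", *meta["program"].split(), *meta["args"]]
print(shlex.join(cmd))
```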
slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"train/loss":1.2820848226547241,"_step":1422,"train/better_sample_reward":92.50687408447266,"train/reward_margin":134.08236694335938,"train/lr":0,"train/worse_sample_reward":-41.57551193237305,"_wandb":{"runtime":12390},"_timestamp":1.7357332075421584e+09,"train/step":1422,"train/epoch":3,"_runtime":12390.697992413,"train/reward_accuracy":0.90625,"train/reward":50.931365966796875}
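
The summary above holds the final metrics of the run. As a quick consistency check (a sketch; the path is this file's location within the upload), the logged reward margin matches `better_sample_reward - worse_sample_reward` to within floating-point rounding:

```python
# Sketch: sanity-check the final DPO summary metrics.
import json

with open("slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json") as f:
    summary = json.load(f)

margin = summary["train/better_sample_reward"] - summary["train/worse_sample_reward"]
print(margin)                            # ~134.0824 (recomputed)
print(summary["train/reward_margin"])    # 134.0824 (logged)
assert abs(margin - summary["train/reward_margin"]) < 1e-3
```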
slice_400/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log ADDED
@@ -0,0 +1,22 @@
1
+ {"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
3
+ {"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
5
+ {"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
6
+ {"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
7
+ {"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
8
+ {"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
9
+ {"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
11
+ {"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
12
+ {"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
13
+ {"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
15
+ {"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
16
+ {"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
17
+ {"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
18
+ {"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
19
+ {"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
20
+ {"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
21
+ {"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
22
+ {"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
slice_400/wandb/run-20250101_084116-coewtb43/logs/debug.log ADDED
@@ -0,0 +1,33 @@
1
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
3
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
8
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
9
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
11
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
12
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
13
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
15
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
16
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
17
+ 2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
19
+ 2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
20
+ 2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
22
+ 2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
27
+ 2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
28
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
29
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
30
+ 2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
31
+ 2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
32
+ 2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
33
+ 2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
slice_400/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6184cd062ec8be4c9517ecc56b37ff397dd3f29795bce1cd495613256a3f6f76
3
+ size 12650956
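
The `.wandb` file itself is stored via Git LFS, so the diff above shows only the pointer (its SHA-256 oid and byte size). A minimal sketch for verifying an already-downloaded copy against that pointer; the local filename is an assumption:

```python
# Sketch: check a downloaded LFS object against the pointer's oid and size.
import hashlib
import os

EXPECTED_OID = "6184cd062ec8be4c9517ecc56b37ff397dd3f29795bce1cd495613256a3f6f76"
EXPECTED_SIZE = 12650956

path = "run-coewtb43.wandb"  # wherever the resolved file was downloaded

assert os.path.getsize(path) == EXPECTED_SIZE

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert h.hexdigest() == EXPECTED_OID
```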
slice_800/arguments.yaml ADDED
@@ -0,0 +1,51 @@
1
+ data_cfgs:
2
+ eval_data_files: null
3
+ eval_datasets: null
4
+ eval_optional_args: []
5
+ eval_size: null
6
+ eval_split: null
7
+ eval_subset: null
8
+ eval_template: null
9
+ train_data_files: q0_40_preference.pt
10
+ train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
11
+ train_optional_args: []
12
+ train_size: null
13
+ train_split: train
14
+ train_subset: null
15
+ train_template: Chameleon_preference
16
+ logger_cfgs:
17
+ cache_dir: null
18
+ log_project: align-anything
19
+ log_run_name: dpo
20
+ log_type: wandb
21
+ output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
22
+ save_interval: 400.0
23
+ model_cfgs:
24
+ model_max_length: 4096
25
+ model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
26
+ trust_remote_code: true
27
+ special_tokens: null
28
+ train_cfgs:
29
+ adam_betas:
30
+ - 0.9
31
+ - 0.95
32
+ bf16: true
33
+ ds_cfgs: ds_z3_config.json
34
+ epochs: 3.0
35
+ eval_interval: 10
36
+ eval_strategy: epoch
37
+ fp16: false
38
+ freeze_language_model: true
39
+ freeze_mm_proj: true
40
+ freeze_vision_tower: false
41
+ gradient_accumulation_steps: 2.0
42
+ gradient_checkpointing: true
43
+ learning_rate: 1.0e-06
44
+ lr_scheduler_type: cosine
45
+ lr_warmup_ratio: 0.03
46
+ per_device_eval_batch_size: 4.0
47
+ per_device_train_batch_size: 4.0
48
+ regularization: 0.001
49
+ scale_coeff: 0.1
50
+ seed: 42
51
+ weight_decay: 0.01
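
This `arguments.yaml` mirrors the config logged in `debug.log` above. One derived figure worth noting: with `per_device_train_batch_size` 4, `gradient_accumulation_steps` 2, and the 8 GPUs recorded in `wandb-metadata.json`, the effective global batch size works out to 64. A minimal sketch of that computation (the path is this file's location within the upload; PyYAML is listed in `requirements.txt`):

```python
# Sketch: load the training config and compute the effective global batch size.
import yaml

with open("slice_800/arguments.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["train_cfgs"]
num_gpus = 8  # from "gpu_count": 8 in wandb-metadata.json

global_batch = (
    int(train["per_device_train_batch_size"])
    * int(train["gradient_accumulation_steps"])
    * num_gpus
)
print(global_batch)  # 4 * 2 * 8 = 64
```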
slice_800/config.json ADDED
The diff for this file is too large to render. See raw diff