htlou committed (verified)
Commit 8d3b74e · Parent(s): 60cce00

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.

Files changed (50)
  1. .gitattributes +4 -0
  2. arguments.yaml +51 -0
  3. config.json +0 -0
  4. environ.txt +164 -0
  5. preprocessor_config.json +28 -0
  6. processor_config.json +5 -0
  7. pytorch_model.bin +3 -0
  8. script.sh +84 -0
  9. slice_1200/arguments.yaml +51 -0
  10. slice_1200/config.json +0 -0
  11. slice_1200/environ.txt +164 -0
  12. slice_1200/preprocessor_config.json +28 -0
  13. slice_1200/processor_config.json +5 -0
  14. slice_1200/pytorch_model.bin +3 -0
  15. slice_1200/script.sh +84 -0
  16. slice_1200/special_tokens_map.json +37 -0
  17. slice_1200/tokenizer.json +0 -0
  18. slice_1200/tokenizer_config.json +0 -0
  19. slice_1200/wandb/debug-internal.log +22 -0
  20. slice_1200/wandb/debug.log +33 -0
  21. slice_1200/wandb/run-20250101_084116-coewtb43/files/config.yaml +98 -0
  22. slice_1200/wandb/run-20250101_084116-coewtb43/files/output.log +224 -0
  23. slice_1200/wandb/run-20250101_084116-coewtb43/files/requirements.txt +248 -0
  24. slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json +112 -0
  25. slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json +1 -0
  26. slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log +22 -0
  27. slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug.log +33 -0
  28. slice_1200/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb +3 -0
  29. slice_400/arguments.yaml +51 -0
  30. slice_400/config.json +0 -0
  31. slice_400/environ.txt +164 -0
  32. slice_400/preprocessor_config.json +28 -0
  33. slice_400/processor_config.json +5 -0
  34. slice_400/pytorch_model.bin +3 -0
  35. slice_400/script.sh +84 -0
  36. slice_400/special_tokens_map.json +37 -0
  37. slice_400/tokenizer.json +0 -0
  38. slice_400/tokenizer_config.json +0 -0
  39. slice_400/wandb/debug-internal.log +22 -0
  40. slice_400/wandb/debug.log +33 -0
  41. slice_400/wandb/run-20250101_084116-coewtb43/files/config.yaml +98 -0
  42. slice_400/wandb/run-20250101_084116-coewtb43/files/output.log +224 -0
  43. slice_400/wandb/run-20250101_084116-coewtb43/files/requirements.txt +248 -0
  44. slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json +112 -0
  45. slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json +1 -0
  46. slice_400/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log +22 -0
  47. slice_400/wandb/run-20250101_084116-coewtb43/logs/debug.log +33 -0
  48. slice_400/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb +3 -0
  49. slice_800/arguments.yaml +51 -0
  50. slice_800/config.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+slice_1200/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
+slice_400/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
+slice_800/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
arguments.yaml ADDED
@@ -0,0 +1,51 @@
+data_cfgs:
+  eval_data_files: null
+  eval_datasets: null
+  eval_optional_args: []
+  eval_size: null
+  eval_split: null
+  eval_subset: null
+  eval_template: null
+  train_data_files: q0_40_preference.pt
+  train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
+  train_optional_args: []
+  train_size: null
+  train_split: train
+  train_subset: null
+  train_template: Chameleon_preference
+logger_cfgs:
+  cache_dir: null
+  log_project: align-anything
+  log_run_name: dpo
+  log_type: wandb
+  output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
+  save_interval: 400.0
+model_cfgs:
+  model_max_length: 4096
+  model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
+  trust_remote_code: true
+  special_tokens: null
+train_cfgs:
+  adam_betas:
+  - 0.9
+  - 0.95
+  bf16: true
+  ds_cfgs: ds_z3_config.json
+  epochs: 3.0
+  eval_interval: 10
+  eval_strategy: epoch
+  fp16: false
+  freeze_language_model: true
+  freeze_mm_proj: true
+  freeze_vision_tower: false
+  gradient_accumulation_steps: 2.0
+  gradient_checkpointing: true
+  learning_rate: 1.0e-06
+  lr_scheduler_type: cosine
+  lr_warmup_ratio: 0.03
+  per_device_eval_batch_size: 4.0
+  per_device_train_batch_size: 4.0
+  regularization: 0.001
+  scale_coeff: 0.1
+  seed: 42
+  weight_decay: 0.01
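The configuration above records a DPO run on the q0_40_preference split with the vision tower unfrozen and the language model and multimodal projector frozen. As a reading aid, a minimal sketch (assuming PyYAML is available; the world size of 8 is taken from WORLD_SIZE in environ.txt below) of how the file can be parsed and the effective global batch size derived:

```python
# Minimal sketch: parse arguments.yaml and derive the effective global batch size.
import yaml

with open("arguments.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["train_cfgs"]
world_size = 8  # WANDB run used 8 GPUs; WORLD_SIZE=8 is recorded in environ.txt
global_batch = (int(train["per_device_train_batch_size"])    # 4
                * int(train["gradient_accumulation_steps"])  # 2
                * world_size)                                # 8
print(global_batch)  # 64 preference pairs per optimizer step
```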
config.json ADDED
The diff for this file is too large to render. See raw diff
 
environ.txt ADDED
@@ -0,0 +1,164 @@
1
+ ADDR2LINE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-addr2line
2
+ AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ar
3
+ AS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-as
4
+ BROWSER=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/bin/helpers/browser.sh
5
+ BUILD=x86_64-conda-linux-gnu
6
+ CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
7
+ CC_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cc
8
+ CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
9
+ CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/jy-a:/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot/usr
10
+ COLORTERM=truecolor
11
+ CONDA_BACKUP_ADDR2LINE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-addr2line
12
+ CONDA_BACKUP_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ar
13
+ CONDA_BACKUP_AS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-as
14
+ CONDA_BACKUP_BUILD=x86_64-conda-linux-gnu
15
+ CONDA_BACKUP_CC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
16
+ CONDA_BACKUP_CC_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
17
+ CONDA_BACKUP_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
18
+ CONDA_BACKUP_CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/hantao_proxy:/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot/usr
19
+ CONDA_BACKUP_CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot
20
+ CONDA_BACKUP_CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
21
+ CONDA_BACKUP_CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
22
+ CONDA_BACKUP_CPP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cpp
23
+ CONDA_BACKUP_CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
24
+ CONDA_BACKUP_CXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
25
+ CONDA_BACKUP_CXXFILT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++filt
26
+ CONDA_BACKUP_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
27
+ CONDA_BACKUP_CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
28
+ CONDA_BACKUP_DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
29
+ CONDA_BACKUP_DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
30
+ CONDA_BACKUP_DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
31
+ CONDA_BACKUP_DWP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-dwp
32
+ CONDA_BACKUP_ELFEDIT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-elfedit
33
+ CONDA_BACKUP_GCC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc
34
+ CONDA_BACKUP_GCC_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ar
35
+ CONDA_BACKUP_GCC_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-nm
36
+ CONDA_BACKUP_GCC_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ranlib
37
+ CONDA_BACKUP_GPROF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gprof
38
+ CONDA_BACKUP_GXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-g++
39
+ CONDA_BACKUP_HOST=x86_64-conda-linux-gnu
40
+ CONDA_BACKUP_LD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld
41
+ CONDA_BACKUP_LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib/stubs
42
+ CONDA_BACKUP_LD_GOLD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld.gold
43
+ CONDA_BACKUP_MESON_ARGS=-Dbuildtype=release
44
+ CONDA_BACKUP_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-nm
45
+ CONDA_BACKUP_OBJCOPY=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objcopy
46
+ CONDA_BACKUP_OBJDUMP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objdump
47
+ CONDA_BACKUP_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ranlib
48
+ CONDA_BACKUP_READELF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-readelf
49
+ CONDA_BACKUP_SIZE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-size
50
+ CONDA_BACKUP_STRINGS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strings
51
+ CONDA_BACKUP_STRIP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strip
52
+ CONDA_BACKUP__CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
53
+ CONDA_BACKUP_build_alias=x86_64-conda-linux-gnu
54
+ CONDA_BACKUP_host_alias=x86_64-conda-linux-gnu
55
+ CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot
56
+ CONDA_DEFAULT_ENV=hantao_stable
57
+ CONDA_EXE=/data/align-anything/miniconda3/bin/conda
58
+ CONDA_PREFIX=/data/align-anything/miniconda3/envs/hantao_stable
59
+ CONDA_PREFIX_1=/home/align-anything/miniconda3
60
+ CONDA_PREFIX_10=/data/align-anything/miniconda3/envs/hantao_proxy
61
+ CONDA_PREFIX_2=/data/align-anything/miniconda3/envs/jy-a
62
+ CONDA_PREFIX_3=/data/align-anything/miniconda3
63
+ CONDA_PREFIX_4=/data/align-anything/miniconda3/envs/hantao_stable
64
+ CONDA_PREFIX_5=/data/align-anything/miniconda3/envs/hantao_cham
65
+ CONDA_PREFIX_6=/data/align-anything/miniconda3/envs/hantao_stable
66
+ CONDA_PREFIX_7=/data/align-anything/miniconda3/envs/hantao_stream
67
+ CONDA_PREFIX_8=/data/align-anything/miniconda3/envs/hantao_proxy
68
+ CONDA_PREFIX_9=/data/align-anything/miniconda3/envs/hantao_stable
69
+ CONDA_PROMPT_MODIFIER=(hantao_stable)
70
+ CONDA_PYTHON_EXE=/data/align-anything/miniconda3/bin/python
71
+ CONDA_ROOT=/home/align-anything/miniconda3
72
+ CONDA_SHLVL=11
73
+ CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
74
+ CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
75
+ CPP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cpp
76
+ CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
77
+ CROSS_RANK=0
78
+ CROSS_SIZE=1
79
+ CUDA_MODULE_LOADING=LAZY
80
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
81
+ CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
82
+ CXXFILT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++filt
83
+ CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
84
+ CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
85
+ DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
86
+ DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
87
+ DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
88
+ DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
89
+ DWP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-dwp
90
+ ELFEDIT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-elfedit
91
+ GCC=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc
92
+ GCC_AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ar
93
+ GCC_NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-nm
94
+ GCC_RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ranlib
95
+ GIT_ASKPASS=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass.sh
96
+ GPROF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gprof
97
+ GXX=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-g++
98
+ HOME=/home/align-anything
99
+ HOST=x86_64-conda-linux-gnu
100
+ LANG=en_US.UTF-8
101
+ LD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld
102
+ LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
103
+ LD_GOLD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld.gold
104
+ LD_LIBRARY_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/../../lib64:
105
+ LESSCLOSE=/usr/bin/lesspipe %s %s
106
+ LESSOPEN=| /usr/bin/lesspipe %s
107
+ LOCAL_RANK=0
108
+ LOCAL_SIZE=8
109
+ LOGLEVEL=WARNING
110
+ LOGNAME=align-anything
111
+ LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
112
+ MASTER_ADDR=127.0.0.1
113
+ MASTER_PORT=52201
114
+ MOTD_SHOWN=pam
115
+ NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-nm
116
+ NVCC_PREPEND_FLAGS= -ccbin=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
117
+ OBJCOPY=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objcopy
118
+ OBJDUMP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objdump
119
+ OLDPWD=/data/align-anything/hantao/LLaMA-Factory
120
+ PATH=/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/bin:/data/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
121
+ PWD=/data/align-anything/hantao/align-anything/scripts
122
+ PYGAME_HIDE_SUPPORT_PROMPT=1
123
+ PYTHONHASHSEED=42
124
+ PYTHONPATH=/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything
125
+ QT_QPA_FONTDIR=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/fonts
126
+ QT_QPA_PLATFORM_PLUGIN_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/plugins
127
+ RANK=0
128
+ RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ranlib
129
+ READELF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-readelf
130
+ SHELL=/bin/bash
131
+ SHLVL=3
132
+ SIZE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-size
133
+ SSH_CLIENT=117.136.0.149 36325 30400
134
+ SSH_CONNECTION=111.205.232.251 37945 10.10.212.194 30400
135
+ SSL_CERT_DIR=/usr/lib/ssl/certs
136
+ SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
137
+ STRINGS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strings
138
+ STRIP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strip
139
+ TERM=screen
140
+ TERM_PROGRAM=vscode
141
+ TERM_PROGRAM_VERSION=0.41.3
142
+ TMUX=/tmp/tmux-2000/default,34082,51
143
+ TMUX_PANE=%59
144
+ TRITON_CACHE_DIR=/home/align-anything/cache/triton
145
+ USER=align-anything
146
+ VSCODE_GIT_ASKPASS_EXTRA_ARGS=
147
+ VSCODE_GIT_ASKPASS_MAIN=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass-main.js
148
+ VSCODE_GIT_ASKPASS_NODE=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/node
149
+ VSCODE_GIT_IPC_HANDLE=/run/user/2000/vscode-git-ef8058c264.sock
150
+ VSCODE_IPC_HOOK_CLI=/run/user/2000/vscode-ipc-db013265-9a8a-4fb7-ba94-00b66d808feb.sock
151
+ WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
152
+ WANDB_MODE=online
153
+ WANDB_SERVICE=2-675697-tcp-localhost-45541
154
+ WORLD_SIZE=8
155
+ XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
156
+ XDG_RUNTIME_DIR=/run/user/2000
157
+ XDG_SESSION_CLASS=user
158
+ XDG_SESSION_ID=11
159
+ XDG_SESSION_TYPE=tty
160
+ _=/data/align-anything/miniconda3/envs/hantao_stable/bin/deepspeed
161
+ _CE_CONDA=
162
+ _CE_M=
163
+ build_alias=x86_64-conda-linux-gnu
164
+ host_alias=x86_64-conda-linux-gnu
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+{
+  "crop_size": {
+    "height": 512,
+    "width": 512
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    1.0,
+    1.0,
+    1.0
+  ],
+  "image_processor_type": "ChameleonImageProcessor",
+  "image_std": [
+    1.0,
+    1.0,
+    1.0
+  ],
+  "processor_class": "ChameleonProcessor",
+  "resample": 1,
+  "rescale_factor": 0.0078,
+  "size": {
+    "shortest_edge": 512
+  }
+}
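This is the Chameleon image-processing recipe saved alongside the weights: resize to a 512-pixel shortest edge, center-crop to 512×512, then rescale by 0.0078 and normalize with mean and std of 1.0, which maps uint8 pixels roughly into [-1.0, 0.99]. A minimal loading sketch, assuming a transformers version with Chameleon support and a local download of this repository (paths and the input image are illustrative):

```python
# Sketch: load the processor defined by preprocessor_config.json / processor_config.json
# and run one image through it. "example.jpg" and the folder path are placeholders.
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/this/repo")  # repo root or a slice_* folder
image = Image.open("example.jpg")
inputs = processor(images=image, text="<image> Describe the picture.", return_tensors="pt")
print(inputs["pixel_values"].shape)  # expected: (1, 3, 512, 512) after resize + center crop
```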
processor_config.json ADDED
@@ -0,0 +1,5 @@
+{
+  "image_seq_length": 1024,
+  "image_token": "<image>",
+  "processor_class": "ChameleonProcessor"
+}
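Here image_seq_length is the number of token positions the processor reserves per `<image>` placeholder, so a single image appears to consume a quarter of the 4096-token model_max_length set in arguments.yaml. A plain-arithmetic budget check (no library assumptions):

```python
# Rough context-budget check for one image per training example.
image_seq_length = 1024   # processor_config.json
model_max_length = 4096   # arguments.yaml (model_cfgs.model_max_length)
print(model_max_length - image_seq_length)  # 3072 tokens left for the text part
```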
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43395241aee86be4cd8c53758c653e006b4e5ddd39103fd6e68ea3e6882d2269
+size 14086364170
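What is stored here is a Git LFS pointer, not the ~14 GB weight file itself. A minimal, standard-library-only sketch for verifying that a downloaded pytorch_model.bin matches the pointer's SHA-256 and size:

```python
# Sketch: check a downloaded pytorch_model.bin against the LFS pointer above.
import hashlib
import os

EXPECTED_OID = "43395241aee86be4cd8c53758c653e006b4e5ddd39103fd6e68ea3e6882d2269"
EXPECTED_SIZE = 14086364170
path = "pytorch_model.bin"  # local path after the LFS object has been fetched

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"
assert sha.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("pointer matches the downloaded weights")
```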
script.sh ADDED
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+#
+# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+export CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
+export CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
+
+export TRITON_CACHE_DIR="/home/align-anything/cache/triton"
+
+export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
+export WANDB_MODE=online
+
+MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/chameleon-7b"
+
+DATASET_PATH=(
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_cosi_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_l0_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_random/tokenized"
+)
+
+DATASET_NAME=(
+    "q0_10_preference"
+    "q0_20_preference"
+    "q0_30_preference"
+    "q0_40_preference"
+    "q0_50_preference"
+    "q0_60_preference"
+    "q0_70_preference"
+    "q0_80_preference"
+    "q0_90_preference"
+)
+
+OUTPUT_PATH="/data/align-anything/hantao/align-anything/outputs/mm_interp"
+mkdir -p "$OUTPUT_PATH"
+
+# Launch one DPO run per (dataset directory, dataset file) pair.
+
+for dataset_path in "${DATASET_PATH[@]}"; do
+    for dataset_name in "${DATASET_NAME[@]}"; do
+        TRAIN_DATASETS=$dataset_path
+
+        # dataset middle name: parent directory of the tokenized folder (command substitution)
+        middle_name=$(echo "$dataset_path" | awk -F'/' '{print $(NF-1)}')
+        OUTPUT_DIR=$OUTPUT_PATH/$middle_name/$dataset_name
+        mkdir -p "$OUTPUT_DIR"
+        echo "Training on $TRAIN_DATASETS, output to $OUTPUT_DIR"
+        # Source the setup script
+        source ./setup.sh
+
+        # Execute deepspeed command
+        deepspeed \
+            --master_port ${MASTER_PORT} \
+            --module align_anything.trainers.text_image_to_text_image.dpo \
+            --model_name_or_path ${MODEL_NAME_OR_PATH} \
+            --train_datasets ${TRAIN_DATASETS} \
+            --output_dir ${OUTPUT_DIR} \
+            --per_device_train_batch_size 4 \
+            --per_device_eval_batch_size 4 \
+            --gradient_accumulation_steps 2 \
+            --train_template Chameleon_preference \
+            --train_split train \
+            --train_data_files ${dataset_name}.pt \
+            --learning_rate 1e-6 \
+            --epochs 3 \
+            --lr_scheduler_type cosine \
+            --save_interval 400
+
+        bash /data/align-anything/hantao/align-anything/outputs/cut.sh "$OUTPUT_DIR"
+    done
+done
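The nested loops launch one DPO run per (dataset directory, dataset file) pair, 4 × 9 = 36 runs in total, and `--save_interval 400` appears to be what produced the slice_400, slice_800, and slice_1200 checkpoint folders in this upload. A small sketch (hypothetical local paths, assuming the folder layout shown in the file list above) for enumerating those sliced checkpoints:

```python
# Sketch: list the slice_<step> checkpoints saved every 400 steps in this repo.
from pathlib import Path

repo = Path(".")  # local clone/download of this repository
ckpts = sorted(repo.glob("slice_*/pytorch_model.bin"),
               key=lambda p: int(p.parent.name.split("_")[1]))
for ckpt in ckpts:
    step = int(ckpt.parent.name.split("_")[1])
    print(f"step {step}: {ckpt} ({ckpt.stat().st_size / 1e9:.1f} GB)")
```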
slice_1200/arguments.yaml ADDED
@@ -0,0 +1,51 @@
+data_cfgs:
+  eval_data_files: null
+  eval_datasets: null
+  eval_optional_args: []
+  eval_size: null
+  eval_split: null
+  eval_subset: null
+  eval_template: null
+  train_data_files: q0_40_preference.pt
+  train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
+  train_optional_args: []
+  train_size: null
+  train_split: train
+  train_subset: null
+  train_template: Chameleon_preference
+logger_cfgs:
+  cache_dir: null
+  log_project: align-anything
+  log_run_name: dpo
+  log_type: wandb
+  output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
+  save_interval: 400.0
+model_cfgs:
+  model_max_length: 4096
+  model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
+  trust_remote_code: true
+  special_tokens: null
+train_cfgs:
+  adam_betas:
+  - 0.9
+  - 0.95
+  bf16: true
+  ds_cfgs: ds_z3_config.json
+  epochs: 3.0
+  eval_interval: 10
+  eval_strategy: epoch
+  fp16: false
+  freeze_language_model: true
+  freeze_mm_proj: true
+  freeze_vision_tower: false
+  gradient_accumulation_steps: 2.0
+  gradient_checkpointing: true
+  learning_rate: 1.0e-06
+  lr_scheduler_type: cosine
+  lr_warmup_ratio: 0.03
+  per_device_eval_batch_size: 4.0
+  per_device_train_batch_size: 4.0
+  regularization: 0.001
+  scale_coeff: 0.1
+  seed: 42
+  weight_decay: 0.01
slice_1200/config.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_1200/environ.txt ADDED
@@ -0,0 +1,164 @@
1
+ ADDR2LINE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-addr2line
2
+ AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ar
3
+ AS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-as
4
+ BROWSER=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/bin/helpers/browser.sh
5
+ BUILD=x86_64-conda-linux-gnu
6
+ CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
7
+ CC_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cc
8
+ CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
9
+ CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/jy-a:/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot/usr
10
+ COLORTERM=truecolor
11
+ CONDA_BACKUP_ADDR2LINE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-addr2line
12
+ CONDA_BACKUP_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ar
13
+ CONDA_BACKUP_AS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-as
14
+ CONDA_BACKUP_BUILD=x86_64-conda-linux-gnu
15
+ CONDA_BACKUP_CC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
16
+ CONDA_BACKUP_CC_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
17
+ CONDA_BACKUP_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
18
+ CONDA_BACKUP_CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/hantao_proxy:/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot/usr
19
+ CONDA_BACKUP_CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot
20
+ CONDA_BACKUP_CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
21
+ CONDA_BACKUP_CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
22
+ CONDA_BACKUP_CPP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cpp
23
+ CONDA_BACKUP_CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
24
+ CONDA_BACKUP_CXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
25
+ CONDA_BACKUP_CXXFILT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++filt
26
+ CONDA_BACKUP_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
27
+ CONDA_BACKUP_CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
28
+ CONDA_BACKUP_DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
29
+ CONDA_BACKUP_DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
30
+ CONDA_BACKUP_DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
31
+ CONDA_BACKUP_DWP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-dwp
32
+ CONDA_BACKUP_ELFEDIT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-elfedit
33
+ CONDA_BACKUP_GCC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc
34
+ CONDA_BACKUP_GCC_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ar
35
+ CONDA_BACKUP_GCC_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-nm
36
+ CONDA_BACKUP_GCC_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ranlib
37
+ CONDA_BACKUP_GPROF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gprof
38
+ CONDA_BACKUP_GXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-g++
39
+ CONDA_BACKUP_HOST=x86_64-conda-linux-gnu
40
+ CONDA_BACKUP_LD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld
41
+ CONDA_BACKUP_LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib/stubs
42
+ CONDA_BACKUP_LD_GOLD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld.gold
43
+ CONDA_BACKUP_MESON_ARGS=-Dbuildtype=release
44
+ CONDA_BACKUP_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-nm
45
+ CONDA_BACKUP_OBJCOPY=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objcopy
46
+ CONDA_BACKUP_OBJDUMP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objdump
47
+ CONDA_BACKUP_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ranlib
48
+ CONDA_BACKUP_READELF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-readelf
49
+ CONDA_BACKUP_SIZE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-size
50
+ CONDA_BACKUP_STRINGS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strings
51
+ CONDA_BACKUP_STRIP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strip
52
+ CONDA_BACKUP__CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
53
+ CONDA_BACKUP_build_alias=x86_64-conda-linux-gnu
54
+ CONDA_BACKUP_host_alias=x86_64-conda-linux-gnu
55
+ CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot
56
+ CONDA_DEFAULT_ENV=hantao_stable
57
+ CONDA_EXE=/data/align-anything/miniconda3/bin/conda
58
+ CONDA_PREFIX=/data/align-anything/miniconda3/envs/hantao_stable
59
+ CONDA_PREFIX_1=/home/align-anything/miniconda3
60
+ CONDA_PREFIX_10=/data/align-anything/miniconda3/envs/hantao_proxy
61
+ CONDA_PREFIX_2=/data/align-anything/miniconda3/envs/jy-a
62
+ CONDA_PREFIX_3=/data/align-anything/miniconda3
63
+ CONDA_PREFIX_4=/data/align-anything/miniconda3/envs/hantao_stable
64
+ CONDA_PREFIX_5=/data/align-anything/miniconda3/envs/hantao_cham
65
+ CONDA_PREFIX_6=/data/align-anything/miniconda3/envs/hantao_stable
66
+ CONDA_PREFIX_7=/data/align-anything/miniconda3/envs/hantao_stream
67
+ CONDA_PREFIX_8=/data/align-anything/miniconda3/envs/hantao_proxy
68
+ CONDA_PREFIX_9=/data/align-anything/miniconda3/envs/hantao_stable
69
+ CONDA_PROMPT_MODIFIER=(hantao_stable)
70
+ CONDA_PYTHON_EXE=/data/align-anything/miniconda3/bin/python
71
+ CONDA_ROOT=/home/align-anything/miniconda3
72
+ CONDA_SHLVL=11
73
+ CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
74
+ CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
75
+ CPP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cpp
76
+ CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
77
+ CROSS_RANK=0
78
+ CROSS_SIZE=1
79
+ CUDA_MODULE_LOADING=LAZY
80
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
81
+ CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
82
+ CXXFILT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++filt
83
+ CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
84
+ CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
85
+ DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
86
+ DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
87
+ DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
88
+ DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
89
+ DWP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-dwp
90
+ ELFEDIT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-elfedit
91
+ GCC=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc
92
+ GCC_AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ar
93
+ GCC_NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-nm
94
+ GCC_RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ranlib
95
+ GIT_ASKPASS=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass.sh
96
+ GPROF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gprof
97
+ GXX=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-g++
98
+ HOME=/home/align-anything
99
+ HOST=x86_64-conda-linux-gnu
100
+ LANG=en_US.UTF-8
101
+ LD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld
102
+ LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
103
+ LD_GOLD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld.gold
104
+ LD_LIBRARY_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/../../lib64:
105
+ LESSCLOSE=/usr/bin/lesspipe %s %s
106
+ LESSOPEN=| /usr/bin/lesspipe %s
107
+ LOCAL_RANK=0
108
+ LOCAL_SIZE=8
109
+ LOGLEVEL=WARNING
110
+ LOGNAME=align-anything
111
+ LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
+ MASTER_ADDR=127.0.0.1
+ MASTER_PORT=52201
+ MOTD_SHOWN=pam
+ NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-nm
+ NVCC_PREPEND_FLAGS= -ccbin=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
+ OBJCOPY=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objcopy
+ OBJDUMP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objdump
+ OLDPWD=/data/align-anything/hantao/LLaMA-Factory
+ PATH=/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/bin:/data/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
+ PWD=/data/align-anything/hantao/align-anything/scripts
+ PYGAME_HIDE_SUPPORT_PROMPT=1
+ PYTHONHASHSEED=42
+ PYTHONPATH=/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything
+ QT_QPA_FONTDIR=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/fonts
+ QT_QPA_PLATFORM_PLUGIN_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/plugins
+ RANK=0
+ RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ranlib
+ READELF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-readelf
+ SHELL=/bin/bash
+ SHLVL=3
+ SIZE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-size
+ SSH_CLIENT=117.136.0.149 36325 30400
+ SSH_CONNECTION=111.205.232.251 37945 10.10.212.194 30400
+ SSL_CERT_DIR=/usr/lib/ssl/certs
+ SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
+ STRINGS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strings
+ STRIP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strip
+ TERM=screen
+ TERM_PROGRAM=vscode
+ TERM_PROGRAM_VERSION=0.41.3
+ TMUX=/tmp/tmux-2000/default,34082,51
+ TMUX_PANE=%59
+ TRITON_CACHE_DIR=/home/align-anything/cache/triton
+ USER=align-anything
+ VSCODE_GIT_ASKPASS_EXTRA_ARGS=
+ VSCODE_GIT_ASKPASS_MAIN=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass-main.js
+ VSCODE_GIT_ASKPASS_NODE=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/node
+ VSCODE_GIT_IPC_HANDLE=/run/user/2000/vscode-git-ef8058c264.sock
+ VSCODE_IPC_HOOK_CLI=/run/user/2000/vscode-ipc-db013265-9a8a-4fb7-ba94-00b66d808feb.sock
+ WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
+ WANDB_MODE=online
+ WANDB_SERVICE=2-675697-tcp-localhost-45541
+ WORLD_SIZE=8
+ XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
+ XDG_RUNTIME_DIR=/run/user/2000
+ XDG_SESSION_CLASS=user
+ XDG_SESSION_ID=11
+ XDG_SESSION_TYPE=tty
+ _=/data/align-anything/miniconda3/envs/hantao_stable/bin/deepspeed
+ _CE_CONDA=
+ _CE_M=
+ build_alias=x86_64-conda-linux-gnu
+ host_alias=x86_64-conda-linux-gnu
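The environment dump above records the distributed launch context for this run: MASTER_ADDR=127.0.0.1, MASTER_PORT=52201, RANK=0 and WORLD_SIZE=8, i.e. an 8-rank single-node DeepSpeed job. As a minimal sketch (not the align-anything trainer itself), this is how a worker process typically consumes exactly those variables:

```python
# Minimal sketch, assuming the env vars recorded above (MASTER_ADDR, MASTER_PORT,
# RANK, WORLD_SIZE) are set by the deepspeed launcher for each worker.
import os
import torch
import torch.distributed as dist

def init_distributed_from_env() -> int:
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    # init_process_group reads MASTER_ADDR / MASTER_PORT (127.0.0.1:52201 above)
    # from the environment to rendezvous all 8 ranks.
    if world_size > 1 and not dist.is_initialized():
        dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank % max(torch.cuda.device_count(), 1))
    return rank

if __name__ == "__main__":
    print("initialized rank", init_distributed_from_env())
```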
slice_1200/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+ "crop_size": {
+ "height": 512,
+ "width": 512
+ },
+ "do_center_crop": true,
+ "do_convert_rgb": true,
+ "do_normalize": true,
+ "do_rescale": true,
+ "do_resize": true,
+ "image_mean": [
+ 1.0,
+ 1.0,
+ 1.0
+ ],
+ "image_processor_type": "ChameleonImageProcessor",
+ "image_std": [
+ 1.0,
+ 1.0,
+ 1.0
+ ],
+ "processor_class": "ChameleonProcessor",
+ "resample": 1,
+ "rescale_factor": 0.0078,
+ "size": {
+ "shortest_edge": 512
+ }
+ }
slice_1200/processor_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "image_seq_length": 1024,
+ "image_token": "<image>",
+ "processor_class": "ChameleonProcessor"
+ }
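Together, the two processor configs above pin the Chameleon image pipeline to 512x512 center crops, with image_mean/image_std of 1.0 and rescale_factor 0.0078 (so raw pixels land roughly in [-1, 1] after rescale and normalize), and make the processor expand each "<image>" placeholder into image_seq_length = 1024 image-token positions. A sketch of loading and exercising this config, assuming a transformers build with Chameleon support (the run logs report 4.45.2) and using the local folder path only as an illustration:

```python
# Sketch, assuming this checkpoint folder is available locally as ./slice_1200
# (path illustrative) and contains the processor/tokenizer files shown in this diff.
from transformers import ChameleonProcessor
from PIL import Image
import numpy as np

processor = ChameleonProcessor.from_pretrained("./slice_1200")
print(processor.image_seq_length)   # 1024 per processor_config.json

image = Image.fromarray(np.zeros((640, 480, 3), dtype=np.uint8))
batch = processor(text="<image> Describe the image.", images=image, return_tensors="pt")
# pixel_values follow preprocessor_config.json: (1, 3, 512, 512);
# input_ids contain the <image> marker expanded into ~1024 image-token ids.
print(batch["pixel_values"].shape, batch["input_ids"].shape)
```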
slice_1200/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9571b162d6b31a5769b7b07e625c1594325aaeb450e02cadcb988815ad68a79d
+ size 14086366930
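This pytorch_model.bin entry is a git-LFS pointer; the actual payload is about 14.1 GB (14086366930 bytes) of 16-bit weights written by DeepSpeed's save_16bit_model call, as seen later in output.log. A sketch of loading it, assuming the sibling config.json and tokenizer files in this folder and an illustrative local path:

```python
# Sketch (not part of the upload): load the 16-bit checkpoint saved in this slice.
# Assumes config.json sits next to pytorch_model.bin, as in this folder.
import torch
from transformers import ChameleonForConditionalGeneration

model = ChameleonForConditionalGeneration.from_pretrained(
    "./slice_1200",                # illustrative local path to this folder
    torch_dtype=torch.bfloat16,    # the run trained in bf16 per the wandb config
)
print(f"{sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters")
```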
slice_1200/script.sh ADDED
@@ -0,0 +1,84 @@
+ #!/usr/bin/env bash
+ #
+ # Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
+
+ export CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
+ export CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
+
+ export TRITON_CACHE_DIR="/home/align-anything/cache/triton"
+
+ export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
+ export WANDB_MODE=online
+
+ MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/chameleon-7b"
+
+ DATASET_PATH=(
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized"
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cosi_new_step10/tokenized"
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_l0_new_step10/tokenized"
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_random/tokenized"
+ )
+
+ DATASET_NAME=(
+ "q0_10_preference"
+ "q0_20_preference"
+ "q0_30_preference"
+ "q0_40_preference"
+ "q0_50_preference"
+ "q0_60_preference"
+ "q0_70_preference"
+ "q0_80_preference"
+ "q0_90_preference"
+ )
+
+ OUTPUT_PATH="/data/align-anything/hantao/align-anything/outputs/mm_interp"
+ mkdir -p $OUTPUT_PATH
+
+ # Initialize variables
+
+ for dataset_path in ${DATASET_PATH[@]}; do
+ for dataset_name in ${DATASET_NAME[@]}; do
+ TRAIN_DATASETS=$dataset_path
+
+ # dataset middle name
+ middle_name= echo "$dataset_path" | awk -F'/' '{print $(NF-1)}'
+ OUTPUT_DIR=$OUTPUT_PATH/$middle_name/$dataset_name
+ mkdir -p $OUTPUT_DIR
+ echo "Training on $TRAIN_DATASETS, output to $OUTPUT_DIR"
+ # Source the setup script
+ source ./setup.sh
+
+ # Execute deepspeed command
+ deepspeed \
+ --master_port ${MASTER_PORT} \
+ --module align_anything.trainers.text_image_to_text_image.dpo \
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
+ --train_datasets ${TRAIN_DATASETS} \
+ --output_dir ${OUTPUT_DIR} \
+ --per_device_train_batch_size 4 \
+ --per_device_eval_batch_size 4 \
+ --gradient_accumulation_steps 2 \
+ --train_template Chameleon_preference \
+ --train_split train \
+ --train_data_files ${dataset_name}.pt \
+ --learning_rate 1e-6 \
+ --epochs 3 \
+ --lr_scheduler_type cosine \
+ --save_interval 400
+
+ bash /data/align-anything/hantao/align-anything/outputs/cut.sh $OUTPUT_DIR
+ done
+ done
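script.sh sweeps the 4 tokenized dataset roots against the 9 preference slices and launches one DPO run per pair. Note that the `middle_name= echo ... | awk ...` line performs no assignment (the `$( ... )` command substitution is missing), so `middle_name` stays empty at runtime; that matches the doubled slash in the output paths recorded in the logs below (`.../outputs/mm_interp//q0_40_preference`). A small sketch of the directory layout the loop presumably intends:

```python
# Sketch of the intended output layout; the hypothetical middle_name here is the
# second-to-last path component, which is what the awk '{print $(NF-1)}' aims for.
from pathlib import PurePosixPath

DATASET_PATH = [
    "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized",
    "/data/align-anything/hantao/data/mm_interp/AA_preference_cosi_new_step10/tokenized",
    "/data/align-anything/hantao/data/mm_interp/AA_preference_l0_new_step10/tokenized",
    "/data/align-anything/hantao/data/mm_interp/AA_preference_random/tokenized",
]
DATASET_NAME = [f"q0_{i}0_preference" for i in range(1, 10)]
OUTPUT_PATH = "/data/align-anything/hantao/align-anything/outputs/mm_interp"

for dataset_path in DATASET_PATH:
    middle_name = PurePosixPath(dataset_path).parts[-2]  # e.g. AA_preference_cocour_new_step10
    for dataset_name in DATASET_NAME:
        print(f"{OUTPUT_PATH}/{middle_name}/{dataset_name}")
```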
slice_1200/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "sep_token": {
+ "content": "<reserved08706>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
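The special-token map routes bos/eos/pad/unk to Chameleon's <s>, </s>, <pad> and <unk>, and repurposes <reserved08706> as the sep token. A quick check after loading the tokenizer shipped in this folder (path illustrative):

```python
# Sketch: verify the special-token wiring recorded in special_tokens_map.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./slice_1200")
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.sep_token, tok.unk_token)
# Expected: <s> </s> <pad> <reserved08706> <unk>
```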
slice_1200/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_1200/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_1200/wandb/debug-internal.log ADDED
@@ -0,0 +1,22 @@
1
+ {"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
3
+ {"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
5
+ {"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
6
+ {"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
7
+ {"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
8
+ {"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
9
+ {"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
11
+ {"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
12
+ {"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
13
+ {"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
15
+ {"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
16
+ {"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
17
+ {"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
18
+ {"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
19
+ {"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
20
+ {"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
21
+ {"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
22
+ {"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
slice_1200/wandb/debug.log ADDED
@@ -0,0 +1,33 @@
1
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
3
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
8
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
9
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
11
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
12
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
13
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
15
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
16
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
17
+ 2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
19
+ 2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
20
+ 2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
22
+ 2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
27
+ 2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
28
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
29
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
30
+ 2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
31
+ 2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
32
+ 2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
33
+ 2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
slice_1200/wandb/run-20250101_084116-coewtb43/files/config.yaml ADDED
@@ -0,0 +1,98 @@
+ _wandb:
+ value:
+ cli_version: 0.18.3
+ m: []
+ python_version: 3.11.10
+ t:
+ "1":
+ - 1
+ - 11
+ - 41
+ - 49
+ - 51
+ - 55
+ - 71
+ - 83
+ - 98
+ - 105
+ "2":
+ - 1
+ - 11
+ - 41
+ - 49
+ - 51
+ - 55
+ - 71
+ - 83
+ - 98
+ - 105
+ "3":
+ - 2
+ - 13
+ - 16
+ - 23
+ - 55
+ - 61
+ "4": 3.11.10
+ "5": 0.18.3
+ "6": 4.45.2
+ "8":
+ - 5
+ "12": 0.18.3
+ "13": linux-x86_64
+ data_cfgs:
+ value:
+ eval_data_files: null
+ eval_datasets: null
+ eval_optional_args: []
+ eval_size: null
+ eval_split: null
+ eval_subset: null
+ eval_template: null
+ train_data_files: q0_40_preference.pt
+ train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
+ train_optional_args: []
+ train_size: null
+ train_split: train
+ train_subset: null
+ train_template: Chameleon_preference
+ logger_cfgs:
+ value:
+ cache_dir: null
+ log_project: align-anything
+ log_run_name: dpo
+ log_type: wandb
+ output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
+ save_interval: 400
+ model_cfgs:
+ value:
+ model_max_length: 4096
+ model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
+ trust_remote_code: true
+ special_tokens:
+ value: null
+ train_cfgs:
+ value:
+ adam_betas:
+ - 0.9
+ - 0.95
+ bf16: true
+ ds_cfgs: ds_z3_config.json
+ epochs: 3
+ eval_interval: 10
+ eval_strategy: epoch
+ fp16: false
+ freeze_language_model: true
+ freeze_mm_proj: true
+ freeze_vision_tower: false
+ gradient_accumulation_steps: 2
+ gradient_checkpointing: true
+ learning_rate: 1e-06
+ lr_scheduler_type: cosine
+ lr_warmup_ratio: 0.03
+ per_device_eval_batch_size: 4
+ per_device_train_batch_size: 4
+ regularization: 0.001
+ scale_coeff: 0.1
+ seed: 42
+ weight_decay: 0.01
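The run config above implies an effective global batch of per_device_train_batch_size (4) x gradient_accumulation_steps (2) x WORLD_SIZE (8) = 64 samples per optimizer step. With the 1422 micro-batch iterations shown for 3 epochs in output.log below, that is 711 optimizer steps; lr_warmup_ratio 0.03 then gives about 21 warmup steps, which is consistent with the lr of ~4.76e-07 that DeepSpeed logs at step 10 (10/21 of the 1e-06 peak). A sketch of that arithmetic, assuming the usual linear-warmup-then-cosine-decay rule:

```python
# Sketch reproducing the schedule implied by the config above and the iteration
# counts in output.log; the printed values approximately match the logged lrs.
import math

world_size = 8
per_device_bs = 4
grad_accum = 2
micro_batches = 1422                               # progress-bar total in output.log
optimizer_steps = micro_batches // grad_accum      # 711
warmup_steps = int(optimizer_steps * 0.03)         # 21
peak_lr = 1e-6

def lr_at(step: int) -> float:
    if step < warmup_steps:
        return peak_lr * step / warmup_steps       # linear warmup
    progress = (step - warmup_steps) / (optimizer_steps - warmup_steps)
    return peak_lr * 0.5 * (1.0 + math.cos(math.pi * progress))  # cosine decay

print("effective batch size:", per_device_bs * grad_accum * world_size)  # 64
for s in (10, 20, 30, 200):
    print(s, f"{lr_at(s):.6e}")   # ~4.76e-07, ~9.52e-07, ~9.996e-07, ~8.43e-07
```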
slice_1200/wandb/run-20250101_084116-coewtb43/files/output.log ADDED
@@ -0,0 +1,224 @@
1
+ ***** Running training *****
2
+ Training 1/3.0 epoch: 0%| | 0/1422.0 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
3
+ Training 1/3.0 epoch (loss 11.8749): 21%|██████████████████████████████████████████████████▋ | 299/1422.0 [43:24<2:59:40, 9.60s/it]
4
+ [2025-01-01 08:42:55,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
5
+ [2025-01-01 08:43:25,944] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
6
+ [2025-01-01 08:44:17,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[4.761904761904761e-07, 4.761904761904761e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
7
+ [2025-01-01 08:45:03,075] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
8
+ [2025-01-01 08:47:04,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.523809523809522e-07, 9.523809523809522e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
9
+ [2025-01-01 08:48:02,529] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
10
+ [2025-01-01 08:48:43,904] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
11
+ [2025-01-01 08:50:07,690] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[9.995802740501932e-07, 9.995802740501932e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
12
+ [2025-01-01 08:51:09,012] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
13
+ [2025-01-01 08:53:09,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[9.98130274211278e-07, 9.98130274211278e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
14
+ [2025-01-01 08:55:49,714] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
15
+ [2025-01-01 08:55:49,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[9.956478233113064e-07, 9.956478233113064e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
16
+ [2025-01-01 08:58:16,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[9.921380666088558e-07, 9.921380666088558e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
17
+ [2025-01-01 09:01:07,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[9.876082786106545e-07, 9.876082786106545e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
18
+ [2025-01-01 09:01:27,047] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
19
+ [2025-01-01 09:04:03,792] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
20
+ [2025-01-01 09:04:24,181] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
21
+ [2025-01-01 09:04:24,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[9.820678479940571e-07, 9.820678479940571e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
22
+ [2025-01-01 09:05:30,688] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
23
+ [2025-01-01 09:07:12,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[9.755282581475767e-07, 9.755282581475767e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
24
+ [2025-01-01 09:09:19,110] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
25
+ [2025-01-01 09:10:10,785] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
26
+ [2025-01-01 09:10:30,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[9.68003063369808e-07, 9.68003063369808e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
27
+ [2025-01-01 09:10:51,320] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
28
+ [2025-01-01 09:13:33,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[9.595078607760749e-07, 9.595078607760749e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
29
+ [2025-01-01 09:14:56,362] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
30
+ [2025-01-01 09:16:12,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[9.500602579710255e-07, 9.500602579710255e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
31
+ [2025-01-01 09:17:52,852] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
32
+ [2025-01-01 09:19:16,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[9.39679836554184e-07, 9.39679836554184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
33
+ [2025-01-01 09:21:51,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[9.283881115340955e-07, 9.283881115340955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
34
+ [2025-01-01 09:22:22,638] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
35
+ [2025-01-01 09:24:48,036] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[9.16208486735184e-07, 9.16208486735184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
36
+ [2025-01-01 09:25:39,046] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
37
+ [2025-01-01 09:27:38,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[9.03166206289754e-07, 9.03166206289754e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
38
+ [2025-01-01 09:28:42,774] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
39
+ [2025-01-01 09:29:54,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[8.8928830231567e-07, 8.8928830231567e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
40
+ [2025-01-01 09:30:34,992] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
41
+ [2025-01-01 09:31:34,787] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
42
+ [2025-01-01 09:32:15,776] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
43
+ [2025-01-01 09:32:50,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[8.746035388881654e-07, 8.746035388881654e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
44
+ [2025-01-01 09:35:09,447] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
45
+ [2025-01-01 09:35:48,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[8.591423524219029e-07, 8.591423524219029e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
46
+ [2025-01-01 09:36:31,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
47
+ [2025-01-01 09:37:59,484] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
48
+ [2025-01-01 09:38:30,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[8.429367885868581e-07, 8.429367885868581e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
49
+ Saving checkpoint at step 400 ...
50
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
51
+ Saving 16-bit model...
52
+ [2025-01-01 09:38:39,848] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
53
+ [2025-01-01 09:38:39,849] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
54
+ [2025-01-01 09:38:39,850] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
55
+ [2025-01-01 09:38:58,770] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
56
+ [2025-01-01 09:38:58,773] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
57
+ Model saved!
58
+ Saving 16-bit model...
59
+ [2025-01-01 09:39:05,477] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
60
+ [2025-01-01 09:39:05,478] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
61
+ [2025-01-01 09:39:05,479] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
62
+ [2025-01-01 09:39:27,199] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
63
+ [2025-01-01 09:39:27,202] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
64
+ Model saved!
65
+ Checkpoint saved.
66
+ [2025-01-01 09:42:08,747] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[8.260204358887753e-07, 8.260204358887753e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
67
+ [2025-01-01 09:43:59,051] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
68
+ [2025-01-01 09:44:19,070] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
69
+ [2025-01-01 09:45:02,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[8.084283560518583e-07, 8.084283560518583e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
70
+ [2025-01-01 09:47:51,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[7.901970113479955e-07, 7.901970113479955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
71
+ [2025-01-01 09:48:18,628] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
72
+ [2025-01-01 09:50:44,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[7.713641890231308e-07, 7.713641890231308e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
73
+ [2025-01-01 09:51:29,985] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
74
+ [2025-01-01 09:52:00,601] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
75
+ [2025-01-01 09:53:37,616] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
76
+ [2025-01-01 09:53:37,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[7.51968922977428e-07, 7.51968922977428e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
77
+ [2025-01-01 09:56:36,667] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
78
+ [2025-01-01 09:56:36,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[7.320514128615511e-07, 7.320514128615511e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
79
+ [2025-01-01 09:57:18,253] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
80
+ [2025-01-01 09:59:22,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[7.116529407567488e-07, 7.116529407567488e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
81
+ [2025-01-01 09:59:42,581] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
82
+ [2025-01-01 10:02:27,253] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[6.908157856114392e-07, 6.908157856114392e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
83
+ [2025-01-01 10:04:22,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
84
+ [2025-01-01 10:05:15,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[6.695831356116303e-07, 6.695831356116303e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
85
+ [2025-01-01 10:07:32,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[6.479989986668117e-07, 6.479989986668117e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
86
+ [2025-01-01 10:09:58,442] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
87
+ [2025-01-01 10:10:45,662] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[6.261081111968403e-07, 6.261081111968403e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
88
+ [2025-01-01 10:12:34,690] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
89
+ [2025-01-01 10:12:55,185] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
90
+ [2025-01-01 10:13:41,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[6.039558454088795e-07, 6.039558454088795e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
91
+ [2025-01-01 10:14:01,438] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
92
+ [2025-01-01 10:16:42,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[5.815881152565711e-07, 5.815881152565711e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
93
+ [2025-01-01 10:17:49,827] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
94
+ [2025-01-01 10:18:41,651] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
95
+ [2025-01-01 10:19:22,050] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
96
+ [2025-01-01 10:20:00,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[5.590512812763541e-07, 5.590512812763541e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
97
+ [2025-01-01 10:22:48,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[5.363920544981748e-07, 5.363920544981748e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
98
+ [2025-01-01 10:23:27,066] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
99
+ [2025-01-01 10:25:42,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[5.136573996297429e-07, 5.136573996297429e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
100
+ [2025-01-01 10:26:22,366] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
101
+ [2025-01-01 10:28:45,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[4.908944377150043e-07, 4.908944377150043e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
102
+ [2025-01-01 10:30:51,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
103
+ [2025-01-01 10:31:03,983] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[4.681503484685803e-07, 4.681503484685803e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
104
+ [2025-01-01 10:34:07,594] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
105
+ [2025-01-01 10:34:07,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[4.454722724886051e-07, 4.454722724886051e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
106
+ [2025-01-01 10:36:51,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[4.229072135506384e-07, 4.229072135506384e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
107
+ Saving checkpoint at step 800 ...
108
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
109
+ Saving 16-bit model...
110
+ [2025-01-01 10:37:00,920] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
111
+ [2025-01-01 10:37:00,921] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
112
+ [2025-01-01 10:37:00,921] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
113
+ [2025-01-01 10:37:17,303] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
114
+ [2025-01-01 10:37:17,305] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
115
+ Model saved!
116
+ Saving 16-bit model...
117
+ [2025-01-01 10:37:24,304] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
118
+ [2025-01-01 10:37:24,306] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
119
+ [2025-01-01 10:37:24,306] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
120
+ [2025-01-01 10:37:47,861] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
121
+ [2025-01-01 10:37:47,862] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
122
+ Model saved!
123
+ Checkpoint saved.
124
+ [2025-01-01 10:38:06,545] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
125
+ [2025-01-01 10:39:58,968] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
126
+ [2025-01-01 10:40:18,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[4.005019411851609e-07, 4.005019411851609e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
127
+ [2025-01-01 10:40:58,615] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
128
+ [2025-01-01 10:41:39,701] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
129
+ [2025-01-01 10:43:08,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[3.783028937405821e-07, 3.783028937405821e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
130
+ [2025-01-01 10:44:32,585] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
131
+ [2025-01-01 10:45:54,172] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
132
+ [2025-01-01 10:45:54,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[3.563560821326706e-07, 3.563560821326706e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
133
+ [2025-01-01 10:47:22,278] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
134
+ [2025-01-01 10:48:37,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[3.3470699447990527e-07, 3.3470699447990527e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
135
+ [2025-01-01 10:51:26,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[3.1340050182240436e-07, 3.1340050182240436e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
136
+ [2025-01-01 10:52:26,337] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
137
+ [2025-01-01 10:52:46,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
138
+ [2025-01-01 10:54:14,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[2.92480765119841e-07, 2.92480765119841e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
139
+ [2025-01-01 10:56:46,300] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
140
+ [2025-01-01 10:57:28,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[2.719911437211122e-07, 2.719911437211122e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
141
+ [2025-01-01 10:59:56,353] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
142
+ [2025-01-01 10:59:56,354] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[2.5197410549546595e-07, 2.5197410549546595e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
143
+ [2025-01-01 11:00:26,971] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
144
+ [2025-01-01 11:02:03,646] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
145
+ [2025-01-01 11:02:54,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[2.3247113881135781e-07, 2.3247113881135781e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
146
+ [2025-01-01 11:05:02,388] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
147
+ [2025-01-01 11:05:43,792] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
148
+ [2025-01-01 11:05:55,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[2.1352266654547125e-07, 2.1352266654547125e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
149
+ [2025-01-01 11:08:07,702] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
150
+ [2025-01-01 11:08:38,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[1.9516796230013272e-07, 1.9516796230013272e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
151
+ [2025-01-01 11:11:35,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[1.774450690027746e-07, 1.774450690027746e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
152
+ [2025-01-01 11:12:47,870] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
153
+ [2025-01-01 11:14:16,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[1.6039072005615716e-07, 1.6039072005615716e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
154
+ [2025-01-01 11:17:00,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[1.4404026320278317e-07, 1.4404026320278317e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
155
+ [2025-01-01 11:18:23,592] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
156
+ [2025-01-01 11:19:58,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[1.284275872613028e-07, 1.284275872613028e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
157
+ [2025-01-01 11:20:59,657] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
158
+ [2025-01-01 11:21:20,032] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
159
+ [2025-01-01 11:22:26,091] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
160
+ [2025-01-01 11:23:04,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[1.1358505188676288e-07, 1.1358505188676288e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
161
+ [2025-01-01 11:26:14,637] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
162
+ [2025-01-01 11:26:14,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[9.95434205002792e-08, 9.95434205002792e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
163
+ [2025-01-01 11:27:06,507] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
164
+ [2025-01-01 11:27:47,054] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
165
+ [2025-01-01 11:29:24,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[8.633179652714916e-08, 8.633179652714916e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
166
+ [2025-01-01 11:31:52,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
167
+ [2025-01-01 11:32:11,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[7.397756307555885e-08, 7.397756307555885e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
168
+ [2025-01-01 11:34:47,839] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
169
+ [2025-01-01 11:34:59,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[6.250632618090867e-08, 6.250632618090867e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
170
+ Saving checkpoint at step 1200 ...
171
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
172
+ Saving 16-bit model...
173
+ [2025-01-01 11:35:08,703] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
174
+ [2025-01-01 11:35:08,704] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
175
+ [2025-01-01 11:35:08,704] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
176
+ [2025-01-01 11:35:25,316] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
177
+ [2025-01-01 11:35:25,317] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
178
+ Model saved!
179
+ Saving 16-bit model...
180
+ [2025-01-01 11:35:32,446] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
181
+ [2025-01-01 11:35:32,447] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
182
+ [2025-01-01 11:35:32,447] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
183
+ [2025-01-01 11:35:53,847] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
184
+ [2025-01-01 11:35:53,849] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
185
+ Model saved!
186
+ Checkpoint saved.
187
+ [2025-01-01 11:38:41,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[5.194186173339599e-08, 5.194186173339599e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
188
+ [2025-01-01 11:40:10,376] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
189
+ [2025-01-01 11:41:25,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[4.230606619885108e-08, 4.230606619885108e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
190
+ [2025-01-01 11:43:26,712] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
191
+ [2025-01-01 11:44:25,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[3.3618911234968236e-08, 3.3618911234968236e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
192
+ [2025-01-01 11:46:29,769] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
193
+ [2025-01-01 11:46:53,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[2.589840229699558e-08, 2.589840229699558e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
194
+ [2025-01-01 11:48:22,179] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
195
+ [2025-01-01 11:49:22,087] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
196
+ [2025-01-01 11:49:35,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[1.9160541318679224e-08, 1.9160541318679224e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
197
+ [2025-01-01 11:50:02,959] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
198
+ [2025-01-01 11:52:16,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[1.3419293545812338e-08, 1.3419293545812338e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
199
+ [2025-01-01 11:52:56,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
200
+ [2025-01-01 11:54:18,301] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
201
+ [2025-01-01 11:55:03,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[8.686558591130156e-09, 8.686558591130156e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
202
+ [2025-01-01 11:55:46,245] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
203
+ [2025-01-01 11:57:47,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[4.972145770545999e-09, 4.972145770545999e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
204
+ [2025-01-01 12:00:50,391] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
205
+ [2025-01-01 12:00:50,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[2.283753771845587e-09, 2.283753771845587e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
206
+ [2025-01-01 12:01:10,430] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
207
+ [2025-01-01 12:03:22,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[6.269546979813523e-10, 6.269546979813523e-10], mom=[[0.9, 0.95], [0.9, 0.95]]
208
+ [2025-01-01 12:05:10,192] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
209
+ [2025-01-01 12:06:28,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[5.182518037827321e-12, 5.182518037827321e-12], mom=[[0.9, 0.95], [0.9, 0.95]]
210
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
211
+ Saving 16-bit model...
212
+ [2025-01-01 12:06:57,036] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
213
+ [2025-01-01 12:06:57,037] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
214
+ [2025-01-01 12:06:57,037] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
215
+ [2025-01-01 12:07:17,768] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
216
+ [2025-01-01 12:07:17,770] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
217
+ Model saved!
218
+ Saving 16-bit model...
219
+ [2025-01-01 12:07:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
220
+ [2025-01-01 12:07:24,459] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
221
+ [2025-01-01 12:07:24,459] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
222
+ [2025-01-01 12:07:46,734] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
223
+ [2025-01-01 12:07:46,737] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
224
+ Model saved!
slice_1200/wandb/run-20250101_084116-coewtb43/files/requirements.txt ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ align-anything==0.0.1.dev0
2
+ gitdb==4.0.11
3
+ wcwidth==0.2.13
4
+ identify==2.6.1
5
+ tomlkit==0.12.0
6
+ bitsandbytes==0.44.1
7
+ trl==0.9.6
8
+ pytest-split==0.8.0
9
+ gradio==4.44.1
10
+ pip==24.2
11
+ multidict==6.1.0
12
+ fairscale==0.4.13
13
+ mistral_common==1.4.4
14
+ python-dotenv==1.0.1
15
+ uvloop==0.20.0
16
+ absl-py==2.1.0
17
+ tiktoken==0.7.0
18
+ pydub==0.25.1
19
+ websockets==12.0
20
+ llamafactory==0.9.1.dev0
21
+ triton==3.0.0
22
+ tifffile==2024.9.20
23
+ safe-rlhf==0.0.1.dev0
24
+ pandas==2.2.3
25
+ grpcio==1.66.2
26
+ click==8.1.7
27
+ ninja==1.11.1.1
28
+ rich==13.9.2
29
+ Jinja2==3.1.4
30
+ Pygments==2.18.0
31
+ nvidia-cudnn-cu12==9.1.0.70
32
+ importlib_resources==6.4.5
33
+ GitPython==3.1.43
34
+ nvidia-cufft-cu12==11.0.2.54
35
+ tensorboard-data-server==0.7.2
36
+ align-anything==0.0.1.dev0
37
+ six==1.16.0
38
+ scipy==1.14.1
39
+ mpmath==1.3.0
40
+ jsonschema-specifications==2024.10.1
41
+ scikit-image==0.24.0
42
+ zipp==3.20.2
43
+ cycler==0.12.1
44
+ MarkupSafe==2.1.5
45
+ tzdata==2024.2
46
+ idna==3.10
47
+ pycountry==24.6.1
48
+ nvidia-nccl-cu12==2.20.5
49
+ matplotlib==3.9.2
50
+ pytz==2024.2
51
+ uvicorn==0.31.1
52
+ dill==0.3.8
53
+ pyparsing==3.1.4
54
+ pytest==7.2.0
55
+ jiter==0.6.1
56
+ safetensors==0.4.5
57
+ typing_extensions==4.12.2
58
+ decorator==4.4.2
59
+ typeguard==4.4.1
60
+ prometheus_client==0.21.0
61
+ nvidia-cuda-cupti-cu12==12.1.105
62
+ sentencepiece==0.2.0
63
+ requests==2.32.3
64
+ kiwisolver==1.4.7
65
+ gdown==5.2.0
66
+ multiprocess==0.70.16
67
+ xxhash==3.5.0
68
+ PyYAML==6.0.2
69
+ gguf==0.10.0
70
+ nvidia-nvtx-cu12==12.1.105
71
+ hpsv2==1.2.0
72
+ tensorboard==2.18.0
73
+ nodeenv==1.9.1
74
+ filelock==3.16.1
75
+ distro==1.9.0
76
+ scikit-learn==1.5.2
77
+ huggingface-hub==0.25.2
78
+ pyairports==2.1.1
79
+ importlib_metadata==8.5.0
80
+ pyarrow==17.0.0
81
+ llvmlite==0.43.0
82
+ ray==2.37.0
83
+ tokenizers==0.20.3
84
+ nvidia-nvjitlink-cu12==12.6.77
85
+ av==14.0.1
86
+ deepspeed==0.15.2
87
+ clip==0.2.0
88
+ shtab==1.7.1
89
+ certifi==2024.8.30
90
+ braceexpand==0.1.7
91
+ nvidia-ml-py==12.560.30
92
+ webdataset==0.2.100
93
+ docker-pycreds==0.4.0
94
+ einops==0.8.0
95
+ iniconfig==2.0.0
96
+ tyro==0.9.2
97
+ torchvision==0.19.0
98
+ accelerate==0.34.2
99
+ beautifulsoup4==4.12.3
100
+ pyzmq==26.2.0
101
+ pycparser==2.22
102
+ nvidia-curand-cu12==10.3.2.106
103
+ msgpack==1.1.0
104
+ soxr==0.5.0.post1
105
+ platformdirs==4.3.6
106
+ h11==0.14.0
107
+ psutil==6.0.0
108
+ pydantic==2.9.2
109
+ shellingham==1.5.4
110
+ imageio-ffmpeg==0.5.1
111
+ wandb==0.18.3
112
+ audioread==3.0.1
113
+ annotated-types==0.7.0
114
+ docstring_parser==0.16
115
+ cloudpickle==3.1.0
116
+ regex==2024.9.11
117
+ packaging==24.1
118
+ timm==0.6.13
119
+ aiosignal==1.3.1
120
+ numba==0.60.0
121
+ orjson==3.10.7
122
+ rpds-py==0.20.0
123
+ virtualenv==20.26.6
124
+ joblib==1.4.2
125
+ charset-normalizer==3.4.0
126
+ httpx==0.27.2
127
+ ffmpy==0.4.0
128
+ lm-format-enforcer==0.10.6
129
+ yt-dlp==2024.8.6
130
+ sympy==1.13.3
131
+ python-dateutil==2.9.0.post0
132
+ nvidia-cusolver-cu12==11.4.5.107
133
+ msgspec==0.18.6
134
+ mdurl==0.1.2
135
+ torch==2.4.0
136
+ fastapi==0.115.0
137
+ optree==0.13.0
138
+ PySocks==1.7.1
139
+ transformers==4.46.0.dev0
140
+ torchlibrosa==0.1.0
141
+ fsspec==2024.6.1
142
+ nvidia-cublas-cu12==12.1.3.1
143
+ gradio_client==1.3.0
144
+ args==0.1.0
145
+ cffi==1.17.1
146
+ fonttools==4.54.1
147
+ clint==0.5.1
148
+ lark==1.2.2
149
+ tqdm==4.66.5
150
+ semantic-version==2.10.0
151
+ pooch==1.8.2
152
+ markdown-it-py==3.0.0
153
+ pydantic_core==2.23.4
154
+ sniffio==1.3.1
155
+ httptools==0.6.1
156
+ nvidia-cuda-runtime-cu12==12.1.105
157
+ anyio==4.6.0
158
+ ftfy==6.3.0
159
+ Markdown==3.7
160
+ datasets==2.21.0
161
+ diffusers==0.30.3
162
+ nvidia-cuda-nvrtc-cu12==12.1.105
163
+ vllm==0.6.2
164
+ starlette==0.38.6
165
+ flash-attn==2.7.0.post2
166
+ urllib3==2.2.3
167
+ Werkzeug==3.0.4
168
+ py-cpuinfo==9.0.0
169
+ moviepy==1.0.3
170
+ librosa==0.10.2.post1
171
+ peft==0.12.0
172
+ soupsieve==2.6
173
+ lazy_loader==0.4
174
+ pluggy==1.5.0
175
+ setuptools==75.1.0
176
+ sentry-sdk==2.16.0
177
+ tabulate==0.9.0
178
+ transformers==4.45.2
179
+ pre_commit==4.0.1
180
+ termcolor==2.5.0
181
+ frechet-audio-distance==0.1.2
182
+ pytorch-fid==0.3.0
183
+ setproctitle==1.3.3
184
+ jsonschema==4.23.0
185
+ aiofiles==23.2.1
186
+ contourpy==1.3.0
187
+ distlib==0.3.9
188
+ interegular==0.3.3
189
+ fire==0.7.0
190
+ diskcache==5.6.3
191
+ proglog==0.1.10
192
+ soundfile==0.12.1
193
+ protobuf==3.20.3
194
+ smmap==5.0.1
195
+ pycryptodomex==3.21.0
196
+ Brotli==1.1.0
197
+ pillow==10.4.0
198
+ frozenlist==1.4.1
199
+ numpy==1.26.4
200
+ mutagen==1.47.0
201
+ outlines==0.0.46
202
+ attrs==24.2.0
203
+ torchaudio==2.4.0
204
+ aiohttp==3.10.10
205
+ ruff==0.6.9
206
+ watchfiles==0.24.0
207
+ threadpoolctl==3.5.0
208
+ nest-asyncio==1.6.0
209
+ partial-json-parser==0.2.1.1.post4
210
+ sse-starlette==2.1.3
211
+ shortuuid==1.0.13
212
+ typer==0.12.5
213
+ prometheus-fastapi-instrumentator==7.0.0
214
+ imageio==2.35.1
215
+ wheel==0.44.0
216
+ image-reward==1.5
217
+ networkx==3.4.1
218
+ propcache==0.2.0
219
+ aiohappyeyeballs==2.4.3
220
+ nvidia-cusparse-cu12==12.1.0.106
221
+ xformers==0.0.27.post2
222
+ cfgv==3.4.0
223
+ python-multipart==0.0.12
224
+ httpcore==1.0.6
225
+ opencv-python==4.6.0.66
226
+ resampy==0.4.3
227
+ yarl==1.15.0
228
+ referencing==0.35.1
229
+ openai==1.51.2
230
+ hjson==3.1.0
231
+ llamafactory==0.9.1.dev0
232
+ jaraco.collections==5.1.0
233
+ backports.tarfile==1.2.0
234
+ more-itertools==10.3.0
235
+ wheel==0.43.0
236
+ importlib_metadata==8.0.0
237
+ zipp==3.19.2
238
+ autocommand==2.2.2
239
+ jaraco.functools==4.0.1
240
+ platformdirs==4.2.2
241
+ tomli==2.0.1
242
+ jaraco.text==3.12.1
243
+ typing_extensions==4.12.2
244
+ jaraco.context==5.3.0
245
+ importlib_resources==6.4.0
246
+ packaging==24.1
247
+ inflect==7.3.1
248
+ typeguard==4.3.0
slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.31",
3
+ "python": "3.11.10",
4
+ "startedAt": "2025-01-01T08:41:16.157770Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/data/align-anything/hantao/models/chameleon-7b",
9
+ "--train_datasets",
10
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized",
11
+ "--output_dir",
12
+ "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
13
+ "--per_device_train_batch_size",
14
+ "4",
15
+ "--per_device_eval_batch_size",
16
+ "4",
17
+ "--gradient_accumulation_steps",
18
+ "2",
19
+ "--train_template",
20
+ "Chameleon_preference",
21
+ "--train_split",
22
+ "train",
23
+ "--train_data_files",
24
+ "q0_40_preference.pt",
25
+ "--learning_rate",
26
+ "1e-6",
27
+ "--epochs",
28
+ "3",
29
+ "--lr_scheduler_type",
30
+ "cosine",
31
+ "--save_interval",
32
+ "400"
33
+ ],
34
+ "program": "-m align_anything.trainers.text_image_to_text_image.dpo",
35
+ "git": {
36
+ "remote": "https://github.com/PKU-Alignment/align-anything.git",
37
+ "commit": "6fde660afc9985323f147930eedf188a5699adc7"
38
+ },
39
+ "email": "[email protected]",
40
+ "root": "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
41
+ "host": "lyg0194",
42
+ "username": "align-anything",
43
+ "executable": "/data/align-anything/miniconda3/envs/hantao_stable/bin/python",
44
+ "cpu_count": 64,
45
+ "cpu_count_logical": 128,
46
+ "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
47
+ "gpu_count": 8,
48
+ "disk": {
49
+ "/": {
50
+ "total": "939477946368",
51
+ "used": "596714827776"
52
+ }
53
+ },
54
+ "memory": {
55
+ "total": "1081823907840"
56
+ },
57
+ "cpu": {
58
+ "count": 64,
59
+ "countLogical": 128
60
+ },
61
+ "gpu_nvidia": [
62
+ {
63
+ "name": "NVIDIA A100-SXM4-80GB",
64
+ "memoryTotal": "85899345920",
65
+ "cudaCores": 6912,
66
+ "architecture": "Ampere"
67
+ },
68
+ {
69
+ "name": "NVIDIA A100-SXM4-80GB",
70
+ "memoryTotal": "85899345920",
71
+ "cudaCores": 6912,
72
+ "architecture": "Ampere"
73
+ },
74
+ {
75
+ "name": "NVIDIA A100-SXM4-80GB",
76
+ "memoryTotal": "85899345920",
77
+ "cudaCores": 6912,
78
+ "architecture": "Ampere"
79
+ },
80
+ {
81
+ "name": "NVIDIA A100-SXM4-80GB",
82
+ "memoryTotal": "85899345920",
83
+ "cudaCores": 6912,
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A100-SXM4-80GB",
88
+ "memoryTotal": "85899345920",
89
+ "cudaCores": 6912,
90
+ "architecture": "Ampere"
91
+ },
92
+ {
93
+ "name": "NVIDIA A100-SXM4-80GB",
94
+ "memoryTotal": "85899345920",
95
+ "cudaCores": 6912,
96
+ "architecture": "Ampere"
97
+ },
98
+ {
99
+ "name": "NVIDIA A100-SXM4-80GB",
100
+ "memoryTotal": "85899345920",
101
+ "cudaCores": 6912,
102
+ "architecture": "Ampere"
103
+ },
104
+ {
105
+ "name": "NVIDIA A100-SXM4-80GB",
106
+ "memoryTotal": "85899345920",
107
+ "cudaCores": 6912,
108
+ "architecture": "Ampere"
109
+ }
110
+ ],
111
+ "cudaVersion": "12.4"
112
+ }
slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/loss":1.2820848226547241,"_step":1422,"train/better_sample_reward":92.50687408447266,"train/reward_margin":134.08236694335938,"train/lr":0,"train/worse_sample_reward":-41.57551193237305,"_wandb":{"runtime":12390},"_timestamp":1.7357332075421584e+09,"train/step":1422,"train/epoch":3,"_runtime":12390.697992413,"train/reward_accuracy":0.90625,"train/reward":50.931365966796875}
slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
3
+ {"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
5
+ {"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
6
+ {"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
7
+ {"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
8
+ {"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
9
+ {"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
11
+ {"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
12
+ {"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
13
+ {"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
15
+ {"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
16
+ {"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
17
+ {"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
18
+ {"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
19
+ {"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
20
+ {"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
21
+ {"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
22
+ {"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug.log ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
3
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
8
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
9
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
11
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
12
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
13
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
15
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
16
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
17
+ 2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
19
+ 2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
20
+ 2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
22
+ 2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
27
+ 2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
28
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
29
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
30
+ 2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
31
+ 2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
32
+ 2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
33
+ 2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
slice_1200/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6184cd062ec8be4c9517ecc56b37ff397dd3f29795bce1cd495613256a3f6f76
3
+ size 12650956
slice_400/arguments.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_cfgs:
2
+ eval_data_files: null
3
+ eval_datasets: null
4
+ eval_optional_args: []
5
+ eval_size: null
6
+ eval_split: null
7
+ eval_subset: null
8
+ eval_template: null
9
+ train_data_files: q0_40_preference.pt
10
+ train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
11
+ train_optional_args: []
12
+ train_size: null
13
+ train_split: train
14
+ train_subset: null
15
+ train_template: Chameleon_preference
16
+ logger_cfgs:
17
+ cache_dir: null
18
+ log_project: align-anything
19
+ log_run_name: dpo
20
+ log_type: wandb
21
+ output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
22
+ save_interval: 400.0
23
+ model_cfgs:
24
+ model_max_length: 4096
25
+ model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
26
+ trust_remote_code: true
27
+ special_tokens: null
28
+ train_cfgs:
29
+ adam_betas:
30
+ - 0.9
31
+ - 0.95
32
+ bf16: true
33
+ ds_cfgs: ds_z3_config.json
34
+ epochs: 3.0
35
+ eval_interval: 10
36
+ eval_strategy: epoch
37
+ fp16: false
38
+ freeze_language_model: true
39
+ freeze_mm_proj: true
40
+ freeze_vision_tower: false
41
+ gradient_accumulation_steps: 2.0
42
+ gradient_checkpointing: true
43
+ learning_rate: 1.0e-06
44
+ lr_scheduler_type: cosine
45
+ lr_warmup_ratio: 0.03
46
+ per_device_eval_batch_size: 4.0
47
+ per_device_train_batch_size: 4.0
48
+ regularization: 0.001
49
+ scale_coeff: 0.1
50
+ seed: 42
51
+ weight_decay: 0.01
slice_400/config.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_400/environ.txt ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ADDR2LINE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-addr2line
2
+ AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ar
3
+ AS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-as
4
+ BROWSER=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/bin/helpers/browser.sh
5
+ BUILD=x86_64-conda-linux-gnu
6
+ CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
7
+ CC_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cc
8
+ CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
9
+ CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/jy-a:/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot/usr
10
+ COLORTERM=truecolor
11
+ CONDA_BACKUP_ADDR2LINE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-addr2line
12
+ CONDA_BACKUP_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ar
13
+ CONDA_BACKUP_AS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-as
14
+ CONDA_BACKUP_BUILD=x86_64-conda-linux-gnu
15
+ CONDA_BACKUP_CC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
16
+ CONDA_BACKUP_CC_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
17
+ CONDA_BACKUP_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
18
+ CONDA_BACKUP_CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/hantao_proxy:/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot/usr
19
+ CONDA_BACKUP_CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot
20
+ CONDA_BACKUP_CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
21
+ CONDA_BACKUP_CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
22
+ CONDA_BACKUP_CPP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cpp
23
+ CONDA_BACKUP_CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
24
+ CONDA_BACKUP_CXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
25
+ CONDA_BACKUP_CXXFILT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++filt
26
+ CONDA_BACKUP_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
27
+ CONDA_BACKUP_CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
28
+ CONDA_BACKUP_DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
29
+ CONDA_BACKUP_DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
30
+ CONDA_BACKUP_DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
31
+ CONDA_BACKUP_DWP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-dwp
32
+ CONDA_BACKUP_ELFEDIT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-elfedit
33
+ CONDA_BACKUP_GCC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc
34
+ CONDA_BACKUP_GCC_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ar
35
+ CONDA_BACKUP_GCC_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-nm
36
+ CONDA_BACKUP_GCC_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ranlib
37
+ CONDA_BACKUP_GPROF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gprof
38
+ CONDA_BACKUP_GXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-g++
39
+ CONDA_BACKUP_HOST=x86_64-conda-linux-gnu
40
+ CONDA_BACKUP_LD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld
41
+ CONDA_BACKUP_LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib/stubs
42
+ CONDA_BACKUP_LD_GOLD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld.gold
43
+ CONDA_BACKUP_MESON_ARGS=-Dbuildtype=release
44
+ CONDA_BACKUP_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-nm
45
+ CONDA_BACKUP_OBJCOPY=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objcopy
46
+ CONDA_BACKUP_OBJDUMP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objdump
47
+ CONDA_BACKUP_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ranlib
48
+ CONDA_BACKUP_READELF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-readelf
49
+ CONDA_BACKUP_SIZE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-size
50
+ CONDA_BACKUP_STRINGS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strings
51
+ CONDA_BACKUP_STRIP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strip
52
+ CONDA_BACKUP__CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
53
+ CONDA_BACKUP_build_alias=x86_64-conda-linux-gnu
54
+ CONDA_BACKUP_host_alias=x86_64-conda-linux-gnu
55
+ CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot
56
+ CONDA_DEFAULT_ENV=hantao_stable
57
+ CONDA_EXE=/data/align-anything/miniconda3/bin/conda
58
+ CONDA_PREFIX=/data/align-anything/miniconda3/envs/hantao_stable
59
+ CONDA_PREFIX_1=/home/align-anything/miniconda3
60
+ CONDA_PREFIX_10=/data/align-anything/miniconda3/envs/hantao_proxy
61
+ CONDA_PREFIX_2=/data/align-anything/miniconda3/envs/jy-a
62
+ CONDA_PREFIX_3=/data/align-anything/miniconda3
63
+ CONDA_PREFIX_4=/data/align-anything/miniconda3/envs/hantao_stable
64
+ CONDA_PREFIX_5=/data/align-anything/miniconda3/envs/hantao_cham
65
+ CONDA_PREFIX_6=/data/align-anything/miniconda3/envs/hantao_stable
66
+ CONDA_PREFIX_7=/data/align-anything/miniconda3/envs/hantao_stream
67
+ CONDA_PREFIX_8=/data/align-anything/miniconda3/envs/hantao_proxy
68
+ CONDA_PREFIX_9=/data/align-anything/miniconda3/envs/hantao_stable
69
+ CONDA_PROMPT_MODIFIER=(hantao_stable)
70
+ CONDA_PYTHON_EXE=/data/align-anything/miniconda3/bin/python
71
+ CONDA_ROOT=/home/align-anything/miniconda3
72
+ CONDA_SHLVL=11
73
+ CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
74
+ CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
75
+ CPP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cpp
76
+ CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
77
+ CROSS_RANK=0
78
+ CROSS_SIZE=1
79
+ CUDA_MODULE_LOADING=LAZY
80
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
81
+ CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
82
+ CXXFILT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++filt
83
+ CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
84
+ CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
85
+ DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
86
+ DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
87
+ DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
88
+ DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
89
+ DWP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-dwp
90
+ ELFEDIT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-elfedit
91
+ GCC=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc
92
+ GCC_AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ar
93
+ GCC_NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-nm
94
+ GCC_RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ranlib
95
+ GIT_ASKPASS=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass.sh
96
+ GPROF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gprof
97
+ GXX=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-g++
98
+ HOME=/home/align-anything
99
+ HOST=x86_64-conda-linux-gnu
100
+ LANG=en_US.UTF-8
101
+ LD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld
102
+ LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
103
+ LD_GOLD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld.gold
104
+ LD_LIBRARY_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/../../lib64:
105
+ LESSCLOSE=/usr/bin/lesspipe %s %s
106
+ LESSOPEN=| /usr/bin/lesspipe %s
107
+ LOCAL_RANK=0
108
+ LOCAL_SIZE=8
109
+ LOGLEVEL=WARNING
110
+ LOGNAME=align-anything
111
+ LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
112
+ MASTER_ADDR=127.0.0.1
113
+ MASTER_PORT=52201
114
+ MOTD_SHOWN=pam
115
+ NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-nm
116
+ NVCC_PREPEND_FLAGS= -ccbin=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
117
+ OBJCOPY=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objcopy
118
+ OBJDUMP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objdump
119
+ OLDPWD=/data/align-anything/hantao/LLaMA-Factory
120
+ PATH=/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/bin:/data/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
121
+ PWD=/data/align-anything/hantao/align-anything/scripts
122
+ PYGAME_HIDE_SUPPORT_PROMPT=1
123
+ PYTHONHASHSEED=42
124
+ PYTHONPATH=/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything
125
+ QT_QPA_FONTDIR=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/fonts
126
+ QT_QPA_PLATFORM_PLUGIN_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/plugins
127
+ RANK=0
128
+ RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ranlib
129
+ READELF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-readelf
130
+ SHELL=/bin/bash
131
+ SHLVL=3
132
+ SIZE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-size
133
+ SSH_CLIENT=117.136.0.149 36325 30400
134
+ SSH_CONNECTION=111.205.232.251 37945 10.10.212.194 30400
135
+ SSL_CERT_DIR=/usr/lib/ssl/certs
136
+ SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
137
+ STRINGS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strings
138
+ STRIP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strip
139
+ TERM=screen
140
+ TERM_PROGRAM=vscode
141
+ TERM_PROGRAM_VERSION=0.41.3
142
+ TMUX=/tmp/tmux-2000/default,34082,51
143
+ TMUX_PANE=%59
144
+ TRITON_CACHE_DIR=/home/align-anything/cache/triton
145
+ USER=align-anything
146
+ VSCODE_GIT_ASKPASS_EXTRA_ARGS=
147
+ VSCODE_GIT_ASKPASS_MAIN=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass-main.js
148
+ VSCODE_GIT_ASKPASS_NODE=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/node
149
+ VSCODE_GIT_IPC_HANDLE=/run/user/2000/vscode-git-ef8058c264.sock
150
+ VSCODE_IPC_HOOK_CLI=/run/user/2000/vscode-ipc-db013265-9a8a-4fb7-ba94-00b66d808feb.sock
151
+ WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
152
+ WANDB_MODE=online
153
+ WANDB_SERVICE=2-675697-tcp-localhost-45541
154
+ WORLD_SIZE=8
155
+ XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
156
+ XDG_RUNTIME_DIR=/run/user/2000
157
+ XDG_SESSION_CLASS=user
158
+ XDG_SESSION_ID=11
159
+ XDG_SESSION_TYPE=tty
160
+ _=/data/align-anything/miniconda3/envs/hantao_stable/bin/deepspeed
161
+ _CE_CONDA=
162
+ _CE_M=
163
+ build_alias=x86_64-conda-linux-gnu
164
+ host_alias=x86_64-conda-linux-gnu
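The launcher variables recorded above (MASTER_ADDR=127.0.0.1, MASTER_PORT=52201, RANK=0, LOCAL_RANK=0, WORLD_SIZE=8, CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7) are what each distributed worker uses to join the process group. A minimal sketch of how a rank typically consumes them is below; this is illustrative only, not the align-anything trainer's actual code.

```python
# Illustrative only: how a worker typically consumes the launcher-provided
# environment recorded in environ.txt (not the align-anything trainer itself).
import os

import torch
import torch.distributed as dist

rank = int(os.environ["RANK"])              # 0 in this dump
local_rank = int(os.environ["LOCAL_RANK"])  # 0
world_size = int(os.environ["WORLD_SIZE"])  # 8 -> one process per visible GPU

torch.cuda.set_device(local_rank)
# init_method="env://" reads MASTER_ADDR=127.0.0.1 and MASTER_PORT=52201 from the env.
dist.init_process_group(backend="nccl", init_method="env://",
                        rank=rank, world_size=world_size)
```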
slice_400/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 512,
4
+ "width": 512
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 1.0,
13
+ 1.0,
14
+ 1.0
15
+ ],
16
+ "image_processor_type": "ChameleonImageProcessor",
17
+ "image_std": [
18
+ 1.0,
19
+ 1.0,
20
+ 1.0
21
+ ],
22
+ "processor_class": "ChameleonProcessor",
23
+ "resample": 1,
24
+ "rescale_factor": 0.0078,
25
+ "size": {
26
+ "shortest_edge": 512
27
+ }
28
+ }
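For reference, the values above imply this pixel pipeline: resize the shortest edge to 512, center-crop to 512x512, multiply by rescale_factor 0.0078, then normalize with mean/std of 1.0, which lands 8-bit pixels roughly in [-1, 0.99]. A small sketch of that arithmetic (not the ChameleonImageProcessor implementation itself):

```python
# Arithmetic implied by preprocessor_config.json (illustrative; the real work is
# done by transformers' ChameleonImageProcessor).
import numpy as np

rescale_factor = 0.0078                  # ~1/128 instead of the usual 1/255
image_mean = np.array([1.0, 1.0, 1.0])
image_std = np.array([1.0, 1.0, 1.0])

pixels = np.array([0.0, 128.0, 255.0])   # sample 8-bit values
normalized = (pixels * rescale_factor - image_mean) / image_std
print(normalized)                        # approx [-1.0, -0.0016, 0.989]
```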
slice_400/processor_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 1024,
3
+ "image_token": "<image>",
4
+ "processor_class": "ChameleonProcessor"
5
+ }
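image_seq_length: 1024 means each "<image>" placeholder is expanded to 1024 image-token positions before the sequence reaches the model, so with model_max_length 4096 (see the wandb config.yaml below) a single image leaves about 3072 positions for text. A hedged illustration of that budgeting (in spirit only; the actual expansion happens inside transformers' ChameleonProcessor):

```python
# Illustration of what image_seq_length / image_token imply for sequence budgeting;
# the real expansion is handled by transformers' ChameleonProcessor.
image_seq_length = 1024
image_token = "<image>"
model_max_length = 4096   # from model_cfgs in the wandb config.yaml below

prompt = f"{image_token} Which caption better matches this picture?"
expanded = prompt.replace(image_token, image_token * image_seq_length, 1)

print(expanded.count(image_token))          # 1024 image-token slots
print(model_max_length - image_seq_length)  # 3072 positions left for text
```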
slice_400/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d45286d89bc63b921ceef6df439a1bda7c4537d46f14ecab8a5b77fe81bdcde0
3
+ size 14086366378
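This is a Git LFS pointer: the oid is the SHA-256 of the real ~14.1 GB checkpoint and size is its byte count. A small sketch for verifying a downloaded copy against the pointer (the local path is just an example):

```python
# Verify a downloaded slice_400/pytorch_model.bin against the LFS pointer above.
import hashlib
from pathlib import Path

EXPECTED_SHA256 = "d45286d89bc63b921ceef6df439a1bda7c4537d46f14ecab8a5b77fe81bdcde0"
EXPECTED_SIZE = 14086366378  # bytes (~14.1 GB)

path = Path("slice_400/pytorch_model.bin")   # example local path
assert path.stat().st_size == EXPECTED_SIZE, "size mismatch"

sha = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)
assert sha.hexdigest() == EXPECTED_SHA256, "checksum mismatch"
print("LFS object verified")
```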
slice_400/script.sh ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # Copyright 2024 PKU-Alignment Team. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # ==============================================================================
17
+
18
+ export CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
19
+ export CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
20
+
21
+ export TRITON_CACHE_DIR="/home/align-anything/cache/triton"
22
+
23
+ export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
24
+ export WANDB_MODE=online
25
+
26
+ MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/chameleon-7b"
27
+
28
+ DATASET_PATH=(
29
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized"
30
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cosi_new_step10/tokenized"
31
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_l0_new_step10/tokenized"
32
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_random/tokenized"
33
+ )
34
+
35
+ DATASET_NAME=(
36
+ "q0_10_preference"
37
+ "q0_20_preference"
38
+ "q0_30_preference"
39
+ "q0_40_preference"
40
+ "q0_50_preference"
41
+ "q0_60_preference"
42
+ "q0_70_preference"
43
+ "q0_80_preference"
44
+ "q0_90_preference"
45
+ )
46
+
47
+ OUTPUT_PATH="/data/align-anything/hantao/align-anything/outputs/mm_interp"
48
+ mkdir -p $OUTPUT_PATH
49
+
50
+ # Initialize variables
51
+
52
+ for dataset_path in ${DATASET_PATH[@]}; do
53
+ for dataset_name in ${DATASET_NAME[@]}; do
54
+ TRAIN_DATASETS=$dataset_path
55
+
56
+ # dataset middle name
57
+ middle_name=$(echo "$dataset_path" | awk -F'/' '{print $(NF-1)}')
58
+ OUTPUT_DIR=$OUTPUT_PATH/$middle_name/$dataset_name
59
+ mkdir -p $OUTPUT_DIR
60
+ echo "Training on $TRAIN_DATASETS, output to $OUTPUT_DIR"
61
+ # Source the setup script
62
+ source ./setup.sh
63
+
64
+ # Execute deepspeed command
65
+ deepspeed \
66
+ --master_port ${MASTER_PORT} \
67
+ --module align_anything.trainers.text_image_to_text_image.dpo \
68
+ --model_name_or_path ${MODEL_NAME_OR_PATH} \
69
+ --train_datasets ${TRAIN_DATASETS} \
70
+ --output_dir ${OUTPUT_DIR} \
71
+ --per_device_train_batch_size 4 \
72
+ --per_device_eval_batch_size 4 \
73
+ --gradient_accumulation_steps 2 \
74
+ --train_template Chameleon_preference \
75
+ --train_split train \
76
+ --train_data_files ${dataset_name}.pt \
77
+ --learning_rate 1e-6 \
78
+ --epochs 3 \
79
+ --lr_scheduler_type cosine \
80
+ --save_interval 400
81
+
82
+ bash /data/align-anything/hantao/align-anything/outputs/cut.sh $OUTPUT_DIR
83
+ done
84
+ done
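For context, the launch above runs on 8 GPUs (WORLD_SIZE=8 in environ.txt) with per-device batch size 4 and gradient accumulation 2, i.e. 64 samples per optimizer update. --save_interval 400 appears to be counted in the trainer's micro-batch steps, which is why the checkpoints in the output.log below are tagged global_step200 and global_step400. A quick arithmetic sketch:

```python
# Back-of-the-envelope numbers for this launch (values from script.sh and environ.txt).
per_device_train_batch_size = 4
world_size = 8                      # WORLD_SIZE in environ.txt
gradient_accumulation_steps = 2

samples_per_update = per_device_train_batch_size * world_size * gradient_accumulation_steps
print(samples_per_update)           # 64

save_interval = 400                 # counted in micro-batch steps (inferred from the logs)
print(save_interval // gradient_accumulation_steps)  # 200 -> DeepSpeed tag global_step200
```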
slice_400/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "<reserved08706>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "<unk>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
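A minimal check (assuming the slice directory has been downloaded locally) that the saved tokenizer exposes exactly these special tokens, including the repurposed <reserved08706> separator:

```python
# Load the tokenizer shipped with this slice and inspect its special tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("slice_400")  # example local path
print(tokenizer.bos_token, tokenizer.eos_token)         # <s> </s>
print(tokenizer.pad_token)                              # <pad>
print(tokenizer.sep_token)                              # <reserved08706>
print(tokenizer.unk_token)                              # <unk>
```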
slice_400/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_400/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
slice_400/wandb/debug-internal.log ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
3
+ {"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
5
+ {"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
6
+ {"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
7
+ {"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
8
+ {"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
9
+ {"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
11
+ {"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
12
+ {"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
13
+ {"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
15
+ {"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
16
+ {"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
17
+ {"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
18
+ {"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
19
+ {"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
20
+ {"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
21
+ {"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
22
+ {"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
slice_400/wandb/debug.log ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
3
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
8
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
9
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
11
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
12
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
13
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
15
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
16
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
17
+ 2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
19
+ 2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
20
+ 2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
22
+ 2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
27
+ 2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
28
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
29
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
30
+ 2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
31
+ 2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
32
+ 2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
33
+ 2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
slice_400/wandb/run-20250101_084116-coewtb43/files/config.yaml ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.18.3
4
+ m: []
5
+ python_version: 3.11.10
6
+ t:
7
+ "1":
8
+ - 1
9
+ - 11
10
+ - 41
11
+ - 49
12
+ - 51
13
+ - 55
14
+ - 71
15
+ - 83
16
+ - 98
17
+ - 105
18
+ "2":
19
+ - 1
20
+ - 11
21
+ - 41
22
+ - 49
23
+ - 51
24
+ - 55
25
+ - 71
26
+ - 83
27
+ - 98
28
+ - 105
29
+ "3":
30
+ - 2
31
+ - 13
32
+ - 16
33
+ - 23
34
+ - 55
35
+ - 61
36
+ "4": 3.11.10
37
+ "5": 0.18.3
38
+ "6": 4.45.2
39
+ "8":
40
+ - 5
41
+ "12": 0.18.3
42
+ "13": linux-x86_64
43
+ data_cfgs:
44
+ value:
45
+ eval_data_files: null
46
+ eval_datasets: null
47
+ eval_optional_args: []
48
+ eval_size: null
49
+ eval_split: null
50
+ eval_subset: null
51
+ eval_template: null
52
+ train_data_files: q0_40_preference.pt
53
+ train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
54
+ train_optional_args: []
55
+ train_size: null
56
+ train_split: train
57
+ train_subset: null
58
+ train_template: Chameleon_preference
59
+ logger_cfgs:
60
+ value:
61
+ cache_dir: null
62
+ log_project: align-anything
63
+ log_run_name: dpo
64
+ log_type: wandb
65
+ output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
66
+ save_interval: 400
67
+ model_cfgs:
68
+ value:
69
+ model_max_length: 4096
70
+ model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
71
+ trust_remote_code: true
72
+ special_tokens:
73
+ value: null
74
+ train_cfgs:
75
+ value:
76
+ adam_betas:
77
+ - 0.9
78
+ - 0.95
79
+ bf16: true
80
+ ds_cfgs: ds_z3_config.json
81
+ epochs: 3
82
+ eval_interval: 10
83
+ eval_strategy: epoch
84
+ fp16: false
85
+ freeze_language_model: true
86
+ freeze_mm_proj: true
87
+ freeze_vision_tower: false
88
+ gradient_accumulation_steps: 2
89
+ gradient_checkpointing: true
90
+ learning_rate: 1e-06
91
+ lr_scheduler_type: cosine
92
+ lr_warmup_ratio: 0.03
93
+ per_device_eval_batch_size: 4
94
+ per_device_train_batch_size: 4
95
+ regularization: 0.001
96
+ scale_coeff: 0.1
97
+ seed: 42
98
+ weight_decay: 0.01
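The hyperparameters above, together with the progress bar in the output.log below (1422 micro steps for 3 epochs), pin down the learning-rate schedule: 1422 / 2 accumulation steps = 711 optimizer updates, lr_warmup_ratio 0.03 gives about 21 warmup steps, and cosine decay follows. A rough reconstruction (hedged; exact rounding is up to the trainer) that reproduces the lr values logged by DeepSpeed:

```python
# Rough reconstruction of the warmup + cosine schedule implied by train_cfgs.
import math

total_micro_steps = 1422            # from the output.log progress bar
grad_accum = 2
total_updates = total_micro_steps // grad_accum   # 711
warmup_steps = int(0.03 * total_updates)          # 21
peak_lr = 1e-6

def lr_at(step: int) -> float:
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    progress = (step - warmup_steps) / max(1, total_updates - warmup_steps)
    return 0.5 * peak_lr * (1.0 + math.cos(math.pi * progress))

print(lr_at(10))    # ~4.76e-07, matching step=10 in output.log
print(lr_at(30))    # ~9.996e-07, matching step=30
```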
slice_400/wandb/run-20250101_084116-coewtb43/files/output.log ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ***** Running training *****
2
+ Training 1/3.0 epoch: 0%| | 0/1422.0 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
3
+ Training 1/3.0 epoch (loss 11.8749): 21%|██████████████████████████████████████████████████▋ | 299/1422.0 [43:24<2:59:40, 9.60s/it]
4
+ [2025-01-01 08:42:55,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
5
+ [2025-01-01 08:43:25,944] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
6
+ [2025-01-01 08:44:17,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[4.761904761904761e-07, 4.761904761904761e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
7
+ [2025-01-01 08:45:03,075] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
8
+ [2025-01-01 08:47:04,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.523809523809522e-07, 9.523809523809522e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
9
+ [2025-01-01 08:48:02,529] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
10
+ [2025-01-01 08:48:43,904] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
11
+ [2025-01-01 08:50:07,690] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[9.995802740501932e-07, 9.995802740501932e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
12
+ [2025-01-01 08:51:09,012] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
13
+ [2025-01-01 08:53:09,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[9.98130274211278e-07, 9.98130274211278e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
14
+ [2025-01-01 08:55:49,714] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
15
+ [2025-01-01 08:55:49,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[9.956478233113064e-07, 9.956478233113064e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
16
+ [2025-01-01 08:58:16,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[9.921380666088558e-07, 9.921380666088558e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
17
+ [2025-01-01 09:01:07,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[9.876082786106545e-07, 9.876082786106545e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
18
+ [2025-01-01 09:01:27,047] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
19
+ [2025-01-01 09:04:03,792] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
20
+ [2025-01-01 09:04:24,181] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
21
+ [2025-01-01 09:04:24,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[9.820678479940571e-07, 9.820678479940571e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
22
+ [2025-01-01 09:05:30,688] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
23
+ [2025-01-01 09:07:12,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[9.755282581475767e-07, 9.755282581475767e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
24
+ [2025-01-01 09:09:19,110] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
25
+ [2025-01-01 09:10:10,785] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
26
+ [2025-01-01 09:10:30,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[9.68003063369808e-07, 9.68003063369808e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
27
+ [2025-01-01 09:10:51,320] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
28
+ [2025-01-01 09:13:33,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[9.595078607760749e-07, 9.595078607760749e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
29
+ [2025-01-01 09:14:56,362] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
30
+ [2025-01-01 09:16:12,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[9.500602579710255e-07, 9.500602579710255e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
31
+ [2025-01-01 09:17:52,852] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
32
+ [2025-01-01 09:19:16,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[9.39679836554184e-07, 9.39679836554184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
33
+ [2025-01-01 09:21:51,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[9.283881115340955e-07, 9.283881115340955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
34
+ [2025-01-01 09:22:22,638] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
35
+ [2025-01-01 09:24:48,036] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[9.16208486735184e-07, 9.16208486735184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
36
+ [2025-01-01 09:25:39,046] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
37
+ [2025-01-01 09:27:38,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[9.03166206289754e-07, 9.03166206289754e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
38
+ [2025-01-01 09:28:42,774] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
39
+ [2025-01-01 09:29:54,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[8.8928830231567e-07, 8.8928830231567e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
40
+ [2025-01-01 09:30:34,992] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
41
+ [2025-01-01 09:31:34,787] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
42
+ [2025-01-01 09:32:15,776] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
43
+ [2025-01-01 09:32:50,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[8.746035388881654e-07, 8.746035388881654e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
44
+ [2025-01-01 09:35:09,447] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
45
+ [2025-01-01 09:35:48,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[8.591423524219029e-07, 8.591423524219029e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
46
+ [2025-01-01 09:36:31,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
47
+ [2025-01-01 09:37:59,484] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
48
+ [2025-01-01 09:38:30,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[8.429367885868581e-07, 8.429367885868581e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
49
+ Saving checkpoint at step 400 ...
50
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
51
+ Saving 16-bit model...
52
+ [2025-01-01 09:38:39,848] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
53
+ [2025-01-01 09:38:39,849] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
54
+ [2025-01-01 09:38:39,850] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
55
+ [2025-01-01 09:38:58,770] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
56
+ [2025-01-01 09:38:58,773] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
57
+ Model saved!
58
+ Saving 16-bit model...
59
+ [2025-01-01 09:39:05,477] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
60
+ [2025-01-01 09:39:05,478] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
61
+ [2025-01-01 09:39:05,479] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
62
+ [2025-01-01 09:39:27,199] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
63
+ [2025-01-01 09:39:27,202] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
64
+ Model saved!
65
+ Checkpoint saved.
66
+ [2025-01-01 09:42:08,747] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[8.260204358887753e-07, 8.260204358887753e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
67
+ [2025-01-01 09:43:59,051] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
68
+ [2025-01-01 09:44:19,070] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
69
+ [2025-01-01 09:45:02,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[8.084283560518583e-07, 8.084283560518583e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
70
+ [2025-01-01 09:47:51,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[7.901970113479955e-07, 7.901970113479955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
71
+ [2025-01-01 09:48:18,628] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
72
+ [2025-01-01 09:50:44,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[7.713641890231308e-07, 7.713641890231308e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
73
+ [2025-01-01 09:51:29,985] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
74
+ [2025-01-01 09:52:00,601] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
75
+ [2025-01-01 09:53:37,616] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
76
+ [2025-01-01 09:53:37,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[7.51968922977428e-07, 7.51968922977428e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
77
+ [2025-01-01 09:56:36,667] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
78
+ [2025-01-01 09:56:36,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[7.320514128615511e-07, 7.320514128615511e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
79
+ [2025-01-01 09:57:18,253] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
80
+ [2025-01-01 09:59:22,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[7.116529407567488e-07, 7.116529407567488e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
81
+ [2025-01-01 09:59:42,581] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
82
+ [2025-01-01 10:02:27,253] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[6.908157856114392e-07, 6.908157856114392e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
83
+ [2025-01-01 10:04:22,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
84
+ [2025-01-01 10:05:15,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[6.695831356116303e-07, 6.695831356116303e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
85
+ [2025-01-01 10:07:32,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[6.479989986668117e-07, 6.479989986668117e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
86
+ [2025-01-01 10:09:58,442] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
87
+ [2025-01-01 10:10:45,662] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[6.261081111968403e-07, 6.261081111968403e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
88
+ [2025-01-01 10:12:34,690] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
89
+ [2025-01-01 10:12:55,185] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
90
+ [2025-01-01 10:13:41,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[6.039558454088795e-07, 6.039558454088795e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
91
+ [2025-01-01 10:14:01,438] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
92
+ [2025-01-01 10:16:42,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[5.815881152565711e-07, 5.815881152565711e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
93
+ [2025-01-01 10:17:49,827] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
94
+ [2025-01-01 10:18:41,651] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
95
+ [2025-01-01 10:19:22,050] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
96
+ [2025-01-01 10:20:00,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[5.590512812763541e-07, 5.590512812763541e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
97
+ [2025-01-01 10:22:48,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[5.363920544981748e-07, 5.363920544981748e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
98
+ [2025-01-01 10:23:27,066] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
99
+ [2025-01-01 10:25:42,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[5.136573996297429e-07, 5.136573996297429e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
100
+ [2025-01-01 10:26:22,366] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
101
+ [2025-01-01 10:28:45,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[4.908944377150043e-07, 4.908944377150043e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
102
+ [2025-01-01 10:30:51,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
103
+ [2025-01-01 10:31:03,983] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[4.681503484685803e-07, 4.681503484685803e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
104
+ [2025-01-01 10:34:07,594] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
105
+ [2025-01-01 10:34:07,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[4.454722724886051e-07, 4.454722724886051e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
106
+ [2025-01-01 10:36:51,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[4.229072135506384e-07, 4.229072135506384e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
107
+ Saving checkpoint at step 800 ...
108
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
109
+ Saving 16-bit model...
110
+ [2025-01-01 10:37:00,920] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
111
+ [2025-01-01 10:37:00,921] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
112
+ [2025-01-01 10:37:00,921] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
113
+ [2025-01-01 10:37:17,303] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
114
+ [2025-01-01 10:37:17,305] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
115
+ Model saved!
116
+ Saving 16-bit model...
117
+ [2025-01-01 10:37:24,304] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
118
+ [2025-01-01 10:37:24,306] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
119
+ [2025-01-01 10:37:24,306] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
120
+ [2025-01-01 10:37:47,861] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
121
+ [2025-01-01 10:37:47,862] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
122
+ Model saved!
123
+ Checkpoint saved.
124
+ [2025-01-01 10:38:06,545] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
125
+ [2025-01-01 10:39:58,968] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
126
+ [2025-01-01 10:40:18,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[4.005019411851609e-07, 4.005019411851609e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
127
+ [2025-01-01 10:40:58,615] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
128
+ [2025-01-01 10:41:39,701] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
129
+ [2025-01-01 10:43:08,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[3.783028937405821e-07, 3.783028937405821e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
130
+ [2025-01-01 10:44:32,585] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
131
+ [2025-01-01 10:45:54,172] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
132
+ [2025-01-01 10:45:54,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[3.563560821326706e-07, 3.563560821326706e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
133
+ [2025-01-01 10:47:22,278] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
134
+ [2025-01-01 10:48:37,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[3.3470699447990527e-07, 3.3470699447990527e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
135
+ [2025-01-01 10:51:26,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[3.1340050182240436e-07, 3.1340050182240436e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
136
+ [2025-01-01 10:52:26,337] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
137
+ [2025-01-01 10:52:46,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
138
+ [2025-01-01 10:54:14,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[2.92480765119841e-07, 2.92480765119841e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
139
+ [2025-01-01 10:56:46,300] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
140
+ [2025-01-01 10:57:28,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[2.719911437211122e-07, 2.719911437211122e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
141
+ [2025-01-01 10:59:56,353] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
142
+ [2025-01-01 10:59:56,354] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[2.5197410549546595e-07, 2.5197410549546595e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
143
+ [2025-01-01 11:00:26,971] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
144
+ [2025-01-01 11:02:03,646] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
145
+ [2025-01-01 11:02:54,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[2.3247113881135781e-07, 2.3247113881135781e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
146
+ [2025-01-01 11:05:02,388] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
147
+ [2025-01-01 11:05:43,792] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
148
+ [2025-01-01 11:05:55,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[2.1352266654547125e-07, 2.1352266654547125e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
149
+ [2025-01-01 11:08:07,702] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
150
+ [2025-01-01 11:08:38,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[1.9516796230013272e-07, 1.9516796230013272e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
151
+ [2025-01-01 11:11:35,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[1.774450690027746e-07, 1.774450690027746e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
152
+ [2025-01-01 11:12:47,870] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
153
+ [2025-01-01 11:14:16,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[1.6039072005615716e-07, 1.6039072005615716e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
154
+ [2025-01-01 11:17:00,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[1.4404026320278317e-07, 1.4404026320278317e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
155
+ [2025-01-01 11:18:23,592] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
156
+ [2025-01-01 11:19:58,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[1.284275872613028e-07, 1.284275872613028e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
157
+ [2025-01-01 11:20:59,657] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
158
+ [2025-01-01 11:21:20,032] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
159
+ [2025-01-01 11:22:26,091] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
160
+ [2025-01-01 11:23:04,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[1.1358505188676288e-07, 1.1358505188676288e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
161
+ [2025-01-01 11:26:14,637] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
162
+ [2025-01-01 11:26:14,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[9.95434205002792e-08, 9.95434205002792e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
163
+ [2025-01-01 11:27:06,507] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
164
+ [2025-01-01 11:27:47,054] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
165
+ [2025-01-01 11:29:24,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[8.633179652714916e-08, 8.633179652714916e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
166
+ [2025-01-01 11:31:52,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
167
+ [2025-01-01 11:32:11,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[7.397756307555885e-08, 7.397756307555885e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
168
+ [2025-01-01 11:34:47,839] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
169
+ [2025-01-01 11:34:59,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[6.250632618090867e-08, 6.250632618090867e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
170
+ Saving checkpoint at step 1200 ...
171
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
172
+ Saving 16-bit model...
173
+ [2025-01-01 11:35:08,703] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
174
+ [2025-01-01 11:35:08,704] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
175
+ [2025-01-01 11:35:08,704] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
176
+ [2025-01-01 11:35:25,316] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
177
+ [2025-01-01 11:35:25,317] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
178
+ Model saved!
179
+ Saving 16-bit model...
180
+ [2025-01-01 11:35:32,446] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
181
+ [2025-01-01 11:35:32,447] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
182
+ [2025-01-01 11:35:32,447] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
183
+ [2025-01-01 11:35:53,847] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
184
+ [2025-01-01 11:35:53,849] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
185
+ Model saved!
186
+ Checkpoint saved.
187
+ [2025-01-01 11:38:41,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[5.194186173339599e-08, 5.194186173339599e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
188
+ [2025-01-01 11:40:10,376] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
189
+ [2025-01-01 11:41:25,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[4.230606619885108e-08, 4.230606619885108e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
190
+ [2025-01-01 11:43:26,712] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
191
+ [2025-01-01 11:44:25,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[3.3618911234968236e-08, 3.3618911234968236e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
192
+ [2025-01-01 11:46:29,769] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
193
+ [2025-01-01 11:46:53,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[2.589840229699558e-08, 2.589840229699558e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
194
+ [2025-01-01 11:48:22,179] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
195
+ [2025-01-01 11:49:22,087] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
196
+ [2025-01-01 11:49:35,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[1.9160541318679224e-08, 1.9160541318679224e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
197
+ [2025-01-01 11:50:02,959] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
198
+ [2025-01-01 11:52:16,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[1.3419293545812338e-08, 1.3419293545812338e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
199
+ [2025-01-01 11:52:56,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
200
+ [2025-01-01 11:54:18,301] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
201
+ [2025-01-01 11:55:03,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[8.686558591130156e-09, 8.686558591130156e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
202
+ [2025-01-01 11:55:46,245] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
203
+ [2025-01-01 11:57:47,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[4.972145770545999e-09, 4.972145770545999e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
204
+ [2025-01-01 12:00:50,391] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
205
+ [2025-01-01 12:00:50,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[2.283753771845587e-09, 2.283753771845587e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
206
+ [2025-01-01 12:01:10,430] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
207
+ [2025-01-01 12:03:22,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[6.269546979813523e-10, 6.269546979813523e-10], mom=[[0.9, 0.95], [0.9, 0.95]]
208
+ [2025-01-01 12:05:10,192] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
209
+ [2025-01-01 12:06:28,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[5.182518037827321e-12, 5.182518037827321e-12], mom=[[0.9, 0.95], [0.9, 0.95]]
210
+ Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
211
+ Saving 16-bit model...
212
+ [2025-01-01 12:06:57,036] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
213
+ [2025-01-01 12:06:57,037] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
214
+ [2025-01-01 12:06:57,037] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
215
+ [2025-01-01 12:07:17,768] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
216
+ [2025-01-01 12:07:17,770] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
217
+ Model saved!
218
+ Saving 16-bit model...
219
+ [2025-01-01 12:07:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
220
+ [2025-01-01 12:07:24,459] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
221
+ [2025-01-01 12:07:24,459] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
222
+ [2025-01-01 12:07:46,734] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
223
+ [2025-01-01 12:07:46,737] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
224
+ Model saved!
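
The warnings repeated throughout this log come from DeepSpeed ZeRO-3 under memory pressure, and they themselves suggest the mitigation: call `get_accelerator().empty_cache()` inside the training loop so all ranks flush the PyTorch allocator cache at the same time. Below is a minimal sketch of that suggestion, assuming a DeepSpeed engine named `engine`, a dataloader named `train_dataloader`, and a flush interval `EMPTY_CACHE_EVERY` (all hypothetical names, not taken from this repository):

```python
# Sketch of the mitigation named in the stage3.py warning above: periodically flush
# the PyTorch allocator cache on every rank at the same point in the loop.
from deepspeed.accelerator import get_accelerator

EMPTY_CACHE_EVERY = 50  # assumed interval; tune to the observed memory pressure


def train_epoch(engine, train_dataloader):
    for step, batch in enumerate(train_dataloader):
        loss = engine(**batch).loss   # forward pass on the DeepSpeed engine
        engine.backward(loss)         # DeepSpeed-managed backward
        engine.step()                 # ZeRO-3 optimizer step

        # Synchronized cache flush across ranks, as the warning recommends.
        if step % EMPTY_CACHE_EVERY == 0:
            get_accelerator().empty_cache()
```

Flushing too often trades throughput for headroom, so the interval is a judgment call; the warning only matters if the flushes happen on most steps, as they do in stretches of this log.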
slice_400/wandb/run-20250101_084116-coewtb43/files/requirements.txt ADDED
@@ -0,0 +1,248 @@
1
+ align-anything==0.0.1.dev0
2
+ gitdb==4.0.11
3
+ wcwidth==0.2.13
4
+ identify==2.6.1
5
+ tomlkit==0.12.0
6
+ bitsandbytes==0.44.1
7
+ trl==0.9.6
8
+ pytest-split==0.8.0
9
+ gradio==4.44.1
10
+ pip==24.2
11
+ multidict==6.1.0
12
+ fairscale==0.4.13
13
+ mistral_common==1.4.4
14
+ python-dotenv==1.0.1
15
+ uvloop==0.20.0
16
+ absl-py==2.1.0
17
+ tiktoken==0.7.0
18
+ pydub==0.25.1
19
+ websockets==12.0
20
+ llamafactory==0.9.1.dev0
21
+ triton==3.0.0
22
+ tifffile==2024.9.20
23
+ safe-rlhf==0.0.1.dev0
24
+ pandas==2.2.3
25
+ grpcio==1.66.2
26
+ click==8.1.7
27
+ ninja==1.11.1.1
28
+ rich==13.9.2
29
+ Jinja2==3.1.4
30
+ Pygments==2.18.0
31
+ nvidia-cudnn-cu12==9.1.0.70
32
+ importlib_resources==6.4.5
33
+ GitPython==3.1.43
34
+ nvidia-cufft-cu12==11.0.2.54
35
+ tensorboard-data-server==0.7.2
36
+ align-anything==0.0.1.dev0
37
+ six==1.16.0
38
+ scipy==1.14.1
39
+ mpmath==1.3.0
40
+ jsonschema-specifications==2024.10.1
41
+ scikit-image==0.24.0
42
+ zipp==3.20.2
43
+ cycler==0.12.1
44
+ MarkupSafe==2.1.5
45
+ tzdata==2024.2
46
+ idna==3.10
47
+ pycountry==24.6.1
48
+ nvidia-nccl-cu12==2.20.5
49
+ matplotlib==3.9.2
50
+ pytz==2024.2
51
+ uvicorn==0.31.1
52
+ dill==0.3.8
53
+ pyparsing==3.1.4
54
+ pytest==7.2.0
55
+ jiter==0.6.1
56
+ safetensors==0.4.5
57
+ typing_extensions==4.12.2
58
+ decorator==4.4.2
59
+ typeguard==4.4.1
60
+ prometheus_client==0.21.0
61
+ nvidia-cuda-cupti-cu12==12.1.105
62
+ sentencepiece==0.2.0
63
+ requests==2.32.3
64
+ kiwisolver==1.4.7
65
+ gdown==5.2.0
66
+ multiprocess==0.70.16
67
+ xxhash==3.5.0
68
+ PyYAML==6.0.2
69
+ gguf==0.10.0
70
+ nvidia-nvtx-cu12==12.1.105
71
+ hpsv2==1.2.0
72
+ tensorboard==2.18.0
73
+ nodeenv==1.9.1
74
+ filelock==3.16.1
75
+ distro==1.9.0
76
+ scikit-learn==1.5.2
77
+ huggingface-hub==0.25.2
78
+ pyairports==2.1.1
79
+ importlib_metadata==8.5.0
80
+ pyarrow==17.0.0
81
+ llvmlite==0.43.0
82
+ ray==2.37.0
83
+ tokenizers==0.20.3
84
+ nvidia-nvjitlink-cu12==12.6.77
85
+ av==14.0.1
86
+ deepspeed==0.15.2
87
+ clip==0.2.0
88
+ shtab==1.7.1
89
+ certifi==2024.8.30
90
+ braceexpand==0.1.7
91
+ nvidia-ml-py==12.560.30
92
+ webdataset==0.2.100
93
+ docker-pycreds==0.4.0
94
+ einops==0.8.0
95
+ iniconfig==2.0.0
96
+ tyro==0.9.2
97
+ torchvision==0.19.0
98
+ accelerate==0.34.2
99
+ beautifulsoup4==4.12.3
100
+ pyzmq==26.2.0
101
+ pycparser==2.22
102
+ nvidia-curand-cu12==10.3.2.106
103
+ msgpack==1.1.0
104
+ soxr==0.5.0.post1
105
+ platformdirs==4.3.6
106
+ h11==0.14.0
107
+ psutil==6.0.0
108
+ pydantic==2.9.2
109
+ shellingham==1.5.4
110
+ imageio-ffmpeg==0.5.1
111
+ wandb==0.18.3
112
+ audioread==3.0.1
113
+ annotated-types==0.7.0
114
+ docstring_parser==0.16
115
+ cloudpickle==3.1.0
116
+ regex==2024.9.11
117
+ packaging==24.1
118
+ timm==0.6.13
119
+ aiosignal==1.3.1
120
+ numba==0.60.0
121
+ orjson==3.10.7
122
+ rpds-py==0.20.0
123
+ virtualenv==20.26.6
124
+ joblib==1.4.2
125
+ charset-normalizer==3.4.0
126
+ httpx==0.27.2
127
+ ffmpy==0.4.0
128
+ lm-format-enforcer==0.10.6
129
+ yt-dlp==2024.8.6
130
+ sympy==1.13.3
131
+ python-dateutil==2.9.0.post0
132
+ nvidia-cusolver-cu12==11.4.5.107
133
+ msgspec==0.18.6
134
+ mdurl==0.1.2
135
+ torch==2.4.0
136
+ fastapi==0.115.0
137
+ optree==0.13.0
138
+ PySocks==1.7.1
139
+ transformers==4.46.0.dev0
140
+ torchlibrosa==0.1.0
141
+ fsspec==2024.6.1
142
+ nvidia-cublas-cu12==12.1.3.1
143
+ gradio_client==1.3.0
144
+ args==0.1.0
145
+ cffi==1.17.1
146
+ fonttools==4.54.1
147
+ clint==0.5.1
148
+ lark==1.2.2
149
+ tqdm==4.66.5
150
+ semantic-version==2.10.0
151
+ pooch==1.8.2
152
+ markdown-it-py==3.0.0
153
+ pydantic_core==2.23.4
154
+ sniffio==1.3.1
155
+ httptools==0.6.1
156
+ nvidia-cuda-runtime-cu12==12.1.105
157
+ anyio==4.6.0
158
+ ftfy==6.3.0
159
+ Markdown==3.7
160
+ datasets==2.21.0
161
+ diffusers==0.30.3
162
+ nvidia-cuda-nvrtc-cu12==12.1.105
163
+ vllm==0.6.2
164
+ starlette==0.38.6
165
+ flash-attn==2.7.0.post2
166
+ urllib3==2.2.3
167
+ Werkzeug==3.0.4
168
+ py-cpuinfo==9.0.0
169
+ moviepy==1.0.3
170
+ librosa==0.10.2.post1
171
+ peft==0.12.0
172
+ soupsieve==2.6
173
+ lazy_loader==0.4
174
+ pluggy==1.5.0
175
+ setuptools==75.1.0
176
+ sentry-sdk==2.16.0
177
+ tabulate==0.9.0
178
+ transformers==4.45.2
179
+ pre_commit==4.0.1
180
+ termcolor==2.5.0
181
+ frechet-audio-distance==0.1.2
182
+ pytorch-fid==0.3.0
183
+ setproctitle==1.3.3
184
+ jsonschema==4.23.0
185
+ aiofiles==23.2.1
186
+ contourpy==1.3.0
187
+ distlib==0.3.9
188
+ interegular==0.3.3
189
+ fire==0.7.0
190
+ diskcache==5.6.3
191
+ proglog==0.1.10
192
+ soundfile==0.12.1
193
+ protobuf==3.20.3
194
+ smmap==5.0.1
195
+ pycryptodomex==3.21.0
196
+ Brotli==1.1.0
197
+ pillow==10.4.0
198
+ frozenlist==1.4.1
199
+ numpy==1.26.4
200
+ mutagen==1.47.0
201
+ outlines==0.0.46
202
+ attrs==24.2.0
203
+ torchaudio==2.4.0
204
+ aiohttp==3.10.10
205
+ ruff==0.6.9
206
+ watchfiles==0.24.0
207
+ threadpoolctl==3.5.0
208
+ nest-asyncio==1.6.0
209
+ partial-json-parser==0.2.1.1.post4
210
+ sse-starlette==2.1.3
211
+ shortuuid==1.0.13
212
+ typer==0.12.5
213
+ prometheus-fastapi-instrumentator==7.0.0
214
+ imageio==2.35.1
215
+ wheel==0.44.0
216
+ image-reward==1.5
217
+ networkx==3.4.1
218
+ propcache==0.2.0
219
+ aiohappyeyeballs==2.4.3
220
+ nvidia-cusparse-cu12==12.1.0.106
221
+ xformers==0.0.27.post2
222
+ cfgv==3.4.0
223
+ python-multipart==0.0.12
224
+ httpcore==1.0.6
225
+ opencv-python==4.6.0.66
226
+ resampy==0.4.3
227
+ yarl==1.15.0
228
+ referencing==0.35.1
229
+ openai==1.51.2
230
+ hjson==3.1.0
231
+ llamafactory==0.9.1.dev0
232
+ jaraco.collections==5.1.0
233
+ backports.tarfile==1.2.0
234
+ more-itertools==10.3.0
235
+ wheel==0.43.0
236
+ importlib_metadata==8.0.0
237
+ zipp==3.19.2
238
+ autocommand==2.2.2
239
+ jaraco.functools==4.0.1
240
+ platformdirs==4.2.2
241
+ tomli==2.0.1
242
+ jaraco.text==3.12.1
243
+ typing_extensions==4.12.2
244
+ jaraco.context==5.3.0
245
+ importlib_resources==6.4.0
246
+ packaging==24.1
247
+ inflect==7.3.1
248
+ typeguard==4.3.0
slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json ADDED
@@ -0,0 +1,112 @@
1
+ {
2
+ "os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.31",
3
+ "python": "3.11.10",
4
+ "startedAt": "2025-01-01T08:41:16.157770Z",
5
+ "args": [
6
+ "--local_rank=0",
7
+ "--model_name_or_path",
8
+ "/data/align-anything/hantao/models/chameleon-7b",
9
+ "--train_datasets",
10
+ "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized",
11
+ "--output_dir",
12
+ "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
13
+ "--per_device_train_batch_size",
14
+ "4",
15
+ "--per_device_eval_batch_size",
16
+ "4",
17
+ "--gradient_accumulation_steps",
18
+ "2",
19
+ "--train_template",
20
+ "Chameleon_preference",
21
+ "--train_split",
22
+ "train",
23
+ "--train_data_files",
24
+ "q0_40_preference.pt",
25
+ "--learning_rate",
26
+ "1e-6",
27
+ "--epochs",
28
+ "3",
29
+ "--lr_scheduler_type",
30
+ "cosine",
31
+ "--save_interval",
32
+ "400"
33
+ ],
34
+ "program": "-m align_anything.trainers.text_image_to_text_image.dpo",
35
+ "git": {
36
+ "remote": "https://github.com/PKU-Alignment/align-anything.git",
37
+ "commit": "6fde660afc9985323f147930eedf188a5699adc7"
38
+ },
39
+ "email": "[email protected]",
40
+ "root": "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
41
+ "host": "lyg0194",
42
+ "username": "align-anything",
43
+ "executable": "/data/align-anything/miniconda3/envs/hantao_stable/bin/python",
44
+ "cpu_count": 64,
45
+ "cpu_count_logical": 128,
46
+ "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
47
+ "gpu_count": 8,
48
+ "disk": {
49
+ "/": {
50
+ "total": "939477946368",
51
+ "used": "596714827776"
52
+ }
53
+ },
54
+ "memory": {
55
+ "total": "1081823907840"
56
+ },
57
+ "cpu": {
58
+ "count": 64,
59
+ "countLogical": 128
60
+ },
61
+ "gpu_nvidia": [
62
+ {
63
+ "name": "NVIDIA A100-SXM4-80GB",
64
+ "memoryTotal": "85899345920",
65
+ "cudaCores": 6912,
66
+ "architecture": "Ampere"
67
+ },
68
+ {
69
+ "name": "NVIDIA A100-SXM4-80GB",
70
+ "memoryTotal": "85899345920",
71
+ "cudaCores": 6912,
72
+ "architecture": "Ampere"
73
+ },
74
+ {
75
+ "name": "NVIDIA A100-SXM4-80GB",
76
+ "memoryTotal": "85899345920",
77
+ "cudaCores": 6912,
78
+ "architecture": "Ampere"
79
+ },
80
+ {
81
+ "name": "NVIDIA A100-SXM4-80GB",
82
+ "memoryTotal": "85899345920",
83
+ "cudaCores": 6912,
84
+ "architecture": "Ampere"
85
+ },
86
+ {
87
+ "name": "NVIDIA A100-SXM4-80GB",
88
+ "memoryTotal": "85899345920",
89
+ "cudaCores": 6912,
90
+ "architecture": "Ampere"
91
+ },
92
+ {
93
+ "name": "NVIDIA A100-SXM4-80GB",
94
+ "memoryTotal": "85899345920",
95
+ "cudaCores": 6912,
96
+ "architecture": "Ampere"
97
+ },
98
+ {
99
+ "name": "NVIDIA A100-SXM4-80GB",
100
+ "memoryTotal": "85899345920",
101
+ "cudaCores": 6912,
102
+ "architecture": "Ampere"
103
+ },
104
+ {
105
+ "name": "NVIDIA A100-SXM4-80GB",
106
+ "memoryTotal": "85899345920",
107
+ "cudaCores": 6912,
108
+ "architecture": "Ampere"
109
+ }
110
+ ],
111
+ "cudaVersion": "12.4"
112
+ }
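
The metadata above records the module and argument list wandb captured for this run. A minimal sketch for reconstructing an approximately equivalent launch command from it (the `--local_rank=0` flag was injected by the distributed launcher, so the rebuilt command is indicative rather than exact; the file path is simply this file's location within the upload):

```python
# Sketch: rebuild the recorded launch command from wandb-metadata.json.
import json
import shlex

with open("slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json") as f:
    meta = json.load(f)

# "program" is "-m align_anything.trainers.text_image_to_text_image.dpo",
# so the run was started as `python -m ...` with the recorded args appended.
cmd = ["python", *meta["program"].split(), *meta["args"]]
print(shlex.join(cmd))
```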
slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"train/loss":1.2820848226547241,"_step":1422,"train/better_sample_reward":92.50687408447266,"train/reward_margin":134.08236694335938,"train/lr":0,"train/worse_sample_reward":-41.57551193237305,"_wandb":{"runtime":12390},"_timestamp":1.7357332075421584e+09,"train/step":1422,"train/epoch":3,"_runtime":12390.697992413,"train/reward_accuracy":0.90625,"train/reward":50.931365966796875}
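
The summary above holds the final metrics of the run. As a quick consistency check (a sketch; the path is this file's location within the upload), the logged reward margin matches `better_sample_reward - worse_sample_reward` to within floating-point rounding:

```python
# Sketch: sanity-check the final DPO summary metrics.
import json

with open("slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json") as f:
    summary = json.load(f)

margin = summary["train/better_sample_reward"] - summary["train/worse_sample_reward"]
print(margin)                            # ~134.0824 (recomputed)
print(summary["train/reward_margin"])    # 134.0824 (logged)
assert abs(margin - summary["train/reward_margin"]) < 1e-3
```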
slice_400/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log ADDED
@@ -0,0 +1,22 @@
1
+ {"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
2
+ {"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
3
+ {"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
4
+ {"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
5
+ {"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
6
+ {"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
7
+ {"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
8
+ {"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
9
+ {"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
10
+ {"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
11
+ {"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
12
+ {"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
13
+ {"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
14
+ {"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
15
+ {"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
16
+ {"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
17
+ {"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
18
+ {"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
19
+ {"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
20
+ {"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
21
+ {"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
22
+ {"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
slice_400/wandb/run-20250101_084116-coewtb43/logs/debug.log ADDED
@@ -0,0 +1,33 @@
1
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
3
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
4
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
5
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
6
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
7
+ 2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
8
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
9
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
10
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
11
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
12
+ 2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
13
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
14
+ config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
15
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
16
+ 2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
17
+ 2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
18
+ 2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
19
+ 2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
20
+ 2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
21
+ 2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
22
+ 2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
23
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
24
+ 2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
25
+ 2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
26
+ 2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
27
+ 2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
28
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
29
+ 2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
30
+ 2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
31
+ 2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
32
+ 2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
33
+ 2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
slice_400/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6184cd062ec8be4c9517ecc56b37ff397dd3f29795bce1cd495613256a3f6f76
3
+ size 12650956
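
The `.wandb` file itself is stored via Git LFS, so the diff above shows only the pointer (its SHA-256 oid and byte size). A minimal sketch for verifying an already-downloaded copy against that pointer; the local filename is an assumption:

```python
# Sketch: check a downloaded LFS object against the pointer's oid and size.
import hashlib
import os

EXPECTED_OID = "6184cd062ec8be4c9517ecc56b37ff397dd3f29795bce1cd495613256a3f6f76"
EXPECTED_SIZE = 12650956

path = "run-coewtb43.wandb"  # wherever the resolved file was downloaded

assert os.path.getsize(path) == EXPECTED_SIZE

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert h.hexdigest() == EXPECTED_OID
```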
slice_800/arguments.yaml ADDED
@@ -0,0 +1,51 @@
1
+ data_cfgs:
2
+ eval_data_files: null
3
+ eval_datasets: null
4
+ eval_optional_args: []
5
+ eval_size: null
6
+ eval_split: null
7
+ eval_subset: null
8
+ eval_template: null
9
+ train_data_files: q0_40_preference.pt
10
+ train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
11
+ train_optional_args: []
12
+ train_size: null
13
+ train_split: train
14
+ train_subset: null
15
+ train_template: Chameleon_preference
16
+ logger_cfgs:
17
+ cache_dir: null
18
+ log_project: align-anything
19
+ log_run_name: dpo
20
+ log_type: wandb
21
+ output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
22
+ save_interval: 400.0
23
+ model_cfgs:
24
+ model_max_length: 4096
25
+ model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
26
+ trust_remote_code: true
27
+ special_tokens: null
28
+ train_cfgs:
29
+ adam_betas:
30
+ - 0.9
31
+ - 0.95
32
+ bf16: true
33
+ ds_cfgs: ds_z3_config.json
34
+ epochs: 3.0
35
+ eval_interval: 10
36
+ eval_strategy: epoch
37
+ fp16: false
38
+ freeze_language_model: true
39
+ freeze_mm_proj: true
40
+ freeze_vision_tower: false
41
+ gradient_accumulation_steps: 2.0
42
+ gradient_checkpointing: true
43
+ learning_rate: 1.0e-06
44
+ lr_scheduler_type: cosine
45
+ lr_warmup_ratio: 0.03
46
+ per_device_eval_batch_size: 4.0
47
+ per_device_train_batch_size: 4.0
48
+ regularization: 0.001
49
+ scale_coeff: 0.1
50
+ seed: 42
51
+ weight_decay: 0.01
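
This `arguments.yaml` mirrors the config logged in `debug.log` above. One derived figure worth noting: with `per_device_train_batch_size` 4, `gradient_accumulation_steps` 2, and the 8 GPUs recorded in `wandb-metadata.json`, the effective global batch size works out to 64. A minimal sketch of that computation (the path is this file's location within the upload; PyYAML is listed in `requirements.txt`):

```python
# Sketch: load the training config and compute the effective global batch size.
import yaml

with open("slice_800/arguments.yaml") as f:
    cfg = yaml.safe_load(f)

train = cfg["train_cfgs"]
num_gpus = 8  # from "gpu_count": 8 in wandb-metadata.json

global_batch = (
    int(train["per_device_train_batch_size"])
    * int(train["gradient_accumulation_steps"])
    * num_gpus
)
print(global_batch)  # 4 * 2 * 8 = 64
```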
slice_800/config.json ADDED
The diff for this file is too large to render. See raw diff