Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes; see the raw diff for the rest.
- .gitattributes +4 -0
- arguments.yaml +51 -0
- config.json +0 -0
- environ.txt +164 -0
- preprocessor_config.json +28 -0
- processor_config.json +5 -0
- pytorch_model.bin +3 -0
- script.sh +84 -0
- slice_1200/arguments.yaml +51 -0
- slice_1200/config.json +0 -0
- slice_1200/environ.txt +164 -0
- slice_1200/preprocessor_config.json +28 -0
- slice_1200/processor_config.json +5 -0
- slice_1200/pytorch_model.bin +3 -0
- slice_1200/script.sh +84 -0
- slice_1200/special_tokens_map.json +37 -0
- slice_1200/tokenizer.json +0 -0
- slice_1200/tokenizer_config.json +0 -0
- slice_1200/wandb/debug-internal.log +22 -0
- slice_1200/wandb/debug.log +33 -0
- slice_1200/wandb/run-20250101_084116-coewtb43/files/config.yaml +98 -0
- slice_1200/wandb/run-20250101_084116-coewtb43/files/output.log +224 -0
- slice_1200/wandb/run-20250101_084116-coewtb43/files/requirements.txt +248 -0
- slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json +112 -0
- slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json +1 -0
- slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log +22 -0
- slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug.log +33 -0
- slice_1200/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb +3 -0
- slice_400/arguments.yaml +51 -0
- slice_400/config.json +0 -0
- slice_400/environ.txt +164 -0
- slice_400/preprocessor_config.json +28 -0
- slice_400/processor_config.json +5 -0
- slice_400/pytorch_model.bin +3 -0
- slice_400/script.sh +84 -0
- slice_400/special_tokens_map.json +37 -0
- slice_400/tokenizer.json +0 -0
- slice_400/tokenizer_config.json +0 -0
- slice_400/wandb/debug-internal.log +22 -0
- slice_400/wandb/debug.log +33 -0
- slice_400/wandb/run-20250101_084116-coewtb43/files/config.yaml +98 -0
- slice_400/wandb/run-20250101_084116-coewtb43/files/output.log +224 -0
- slice_400/wandb/run-20250101_084116-coewtb43/files/requirements.txt +248 -0
- slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json +112 -0
- slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json +1 -0
- slice_400/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log +22 -0
- slice_400/wandb/run-20250101_084116-coewtb43/logs/debug.log +33 -0
- slice_400/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb +3 -0
- slice_800/arguments.yaml +51 -0
- slice_800/config.json +0 -0
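For context, a commit titled like this one is normally produced with the huggingface_hub Python API. A minimal sketch, assuming a placeholder repo id and local folder path (the real values are not recorded in this view):

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="outputs/mm_interp",  # placeholder local path
    repo_id="user/model-repo",        # placeholder repo id
    commit_message="Upload folder using huggingface_hub",
)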
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+slice_1200/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
+slice_400/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
+slice_800/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
+wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb filter=lfs diff=lfs merge=lfs -text
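The four added rules route the large wandb run files through Git LFS, alongside the existing archive and tfevents patterns. As a rough illustration of the matching (Python's fnmatch only approximates gitattributes glob semantics):

from fnmatch import fnmatch

# Patterns taken from the diff above; fnmatch is an approximation, used for illustration only.
patterns = [
    "slice_400/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb",
    "*tfevents*",
]
path = "slice_400/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb"
print(any(fnmatch(path, p) for p in patterns))  # True -> stored as an LFS pointer, not a raw blob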
arguments.yaml
ADDED
@@ -0,0 +1,51 @@
+data_cfgs:
+  eval_data_files: null
+  eval_datasets: null
+  eval_optional_args: []
+  eval_size: null
+  eval_split: null
+  eval_subset: null
+  eval_template: null
+  train_data_files: q0_40_preference.pt
+  train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
+  train_optional_args: []
+  train_size: null
+  train_split: train
+  train_subset: null
+  train_template: Chameleon_preference
+logger_cfgs:
+  cache_dir: null
+  log_project: align-anything
+  log_run_name: dpo
+  log_type: wandb
+  output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
+  save_interval: 400.0
+model_cfgs:
+  model_max_length: 4096
+  model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
+  trust_remote_code: true
+  special_tokens: null
+train_cfgs:
+  adam_betas:
+  - 0.9
+  - 0.95
+  bf16: true
+  ds_cfgs: ds_z3_config.json
+  epochs: 3.0
+  eval_interval: 10
+  eval_strategy: epoch
+  fp16: false
+  freeze_language_model: true
+  freeze_mm_proj: true
+  freeze_vision_tower: false
+  gradient_accumulation_steps: 2.0
+  gradient_checkpointing: true
+  learning_rate: 1.0e-06
+  lr_scheduler_type: cosine
+  lr_warmup_ratio: 0.03
+  per_device_eval_batch_size: 4.0
+  per_device_train_batch_size: 4.0
+  regularization: 0.001
+  scale_coeff: 0.1
+  seed: 42
+  weight_decay: 0.01
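A minimal sketch, assuming PyYAML and a local copy of this file, of reading the recorded DPO hyperparameters back:

import yaml

with open("arguments.yaml") as f:
    args = yaml.safe_load(f)

# Values as recorded in the file above.
print(args["train_cfgs"]["learning_rate"])    # 1e-06
print(args["train_cfgs"]["scale_coeff"])      # 0.1
print(args["data_cfgs"]["train_data_files"])  # q0_40_preference.pt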
config.json
ADDED
The diff for this file is too large to render; see the raw diff.
environ.txt
ADDED
@@ -0,0 +1,164 @@
+ADDR2LINE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-addr2line
+AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ar
+AS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-as
+BROWSER=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/bin/helpers/browser.sh
+BUILD=x86_64-conda-linux-gnu
+CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
+CC_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cc
+CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/jy-a:/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot/usr
+COLORTERM=truecolor
+CONDA_BACKUP_ADDR2LINE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-addr2line
+CONDA_BACKUP_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ar
+CONDA_BACKUP_AS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-as
+CONDA_BACKUP_BUILD=x86_64-conda-linux-gnu
+CONDA_BACKUP_CC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
+CONDA_BACKUP_CC_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
+CONDA_BACKUP_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
+CONDA_BACKUP_CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/hantao_proxy:/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot/usr
+CONDA_BACKUP_CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot
+CONDA_BACKUP_CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
+CONDA_BACKUP_CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
+CONDA_BACKUP_CPP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cpp
+CONDA_BACKUP_CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
+CONDA_BACKUP_CXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
+CONDA_BACKUP_CXXFILT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++filt
+CONDA_BACKUP_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
+CONDA_BACKUP_CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
+CONDA_BACKUP_DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+CONDA_BACKUP_DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
+CONDA_BACKUP_DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+CONDA_BACKUP_DWP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-dwp
+CONDA_BACKUP_ELFEDIT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-elfedit
+CONDA_BACKUP_GCC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc
+CONDA_BACKUP_GCC_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ar
+CONDA_BACKUP_GCC_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-nm
+CONDA_BACKUP_GCC_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ranlib
+CONDA_BACKUP_GPROF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gprof
+CONDA_BACKUP_GXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-g++
+CONDA_BACKUP_HOST=x86_64-conda-linux-gnu
+CONDA_BACKUP_LD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld
+CONDA_BACKUP_LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib/stubs
+CONDA_BACKUP_LD_GOLD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld.gold
+CONDA_BACKUP_MESON_ARGS=-Dbuildtype=release
+CONDA_BACKUP_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-nm
+CONDA_BACKUP_OBJCOPY=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objcopy
+CONDA_BACKUP_OBJDUMP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objdump
+CONDA_BACKUP_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ranlib
+CONDA_BACKUP_READELF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-readelf
+CONDA_BACKUP_SIZE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-size
+CONDA_BACKUP_STRINGS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strings
+CONDA_BACKUP_STRIP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strip
+CONDA_BACKUP__CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
+CONDA_BACKUP_build_alias=x86_64-conda-linux-gnu
+CONDA_BACKUP_host_alias=x86_64-conda-linux-gnu
+CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot
+CONDA_DEFAULT_ENV=hantao_stable
+CONDA_EXE=/data/align-anything/miniconda3/bin/conda
+CONDA_PREFIX=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PREFIX_1=/home/align-anything/miniconda3
+CONDA_PREFIX_10=/data/align-anything/miniconda3/envs/hantao_proxy
+CONDA_PREFIX_2=/data/align-anything/miniconda3/envs/jy-a
+CONDA_PREFIX_3=/data/align-anything/miniconda3
+CONDA_PREFIX_4=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PREFIX_5=/data/align-anything/miniconda3/envs/hantao_cham
+CONDA_PREFIX_6=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PREFIX_7=/data/align-anything/miniconda3/envs/hantao_stream
+CONDA_PREFIX_8=/data/align-anything/miniconda3/envs/hantao_proxy
+CONDA_PREFIX_9=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PROMPT_MODIFIER=(hantao_stable)
+CONDA_PYTHON_EXE=/data/align-anything/miniconda3/bin/python
+CONDA_ROOT=/home/align-anything/miniconda3
+CONDA_SHLVL=11
+CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
+CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
+CPP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cpp
+CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+CROSS_RANK=0
+CROSS_SIZE=1
+CUDA_MODULE_LOADING=LAZY
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
+CXXFILT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++filt
+CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
+DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
+DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
+DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+DWP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-dwp
+ELFEDIT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-elfedit
+GCC=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc
+GCC_AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ar
+GCC_NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-nm
+GCC_RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ranlib
+GIT_ASKPASS=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass.sh
+GPROF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gprof
+GXX=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-g++
+HOME=/home/align-anything
+HOST=x86_64-conda-linux-gnu
+LANG=en_US.UTF-8
+LD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld
+LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+LD_GOLD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld.gold
+LD_LIBRARY_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/../../lib64:
+LESSCLOSE=/usr/bin/lesspipe %s %s
+LESSOPEN=| /usr/bin/lesspipe %s
+LOCAL_RANK=0
+LOCAL_SIZE=8
+LOGLEVEL=WARNING
+LOGNAME=align-anything
+LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=52201
+MOTD_SHOWN=pam
+NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-nm
+NVCC_PREPEND_FLAGS= -ccbin=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
+OBJCOPY=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objcopy
+OBJDUMP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objdump
+OLDPWD=/data/align-anything/hantao/LLaMA-Factory
+PATH=/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/bin:/data/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
+PWD=/data/align-anything/hantao/align-anything/scripts
+PYGAME_HIDE_SUPPORT_PROMPT=1
+PYTHONHASHSEED=42
+PYTHONPATH=/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything
+QT_QPA_FONTDIR=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/fonts
+QT_QPA_PLATFORM_PLUGIN_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/plugins
+RANK=0
+RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ranlib
+READELF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-readelf
+SHELL=/bin/bash
+SHLVL=3
+SIZE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-size
+SSH_CLIENT=117.136.0.149 36325 30400
+SSH_CONNECTION=111.205.232.251 37945 10.10.212.194 30400
+SSL_CERT_DIR=/usr/lib/ssl/certs
+SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
+STRINGS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strings
+STRIP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strip
+TERM=screen
+TERM_PROGRAM=vscode
+TERM_PROGRAM_VERSION=0.41.3
+TMUX=/tmp/tmux-2000/default,34082,51
+TMUX_PANE=%59
+TRITON_CACHE_DIR=/home/align-anything/cache/triton
+USER=align-anything
+VSCODE_GIT_ASKPASS_EXTRA_ARGS=
+VSCODE_GIT_ASKPASS_MAIN=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass-main.js
+VSCODE_GIT_ASKPASS_NODE=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/node
+VSCODE_GIT_IPC_HANDLE=/run/user/2000/vscode-git-ef8058c264.sock
+VSCODE_IPC_HOOK_CLI=/run/user/2000/vscode-ipc-db013265-9a8a-4fb7-ba94-00b66d808feb.sock
+WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
+WANDB_MODE=online
+WANDB_SERVICE=2-675697-tcp-localhost-45541
+WORLD_SIZE=8
+XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
+XDG_RUNTIME_DIR=/run/user/2000
+XDG_SESSION_CLASS=user
+XDG_SESSION_ID=11
+XDG_SESSION_TYPE=tty
+_=/data/align-anything/miniconda3/envs/hantao_stable/bin/deepspeed
+_CE_CONDA=
+_CE_M=
+build_alias=x86_64-conda-linux-gnu
+host_alias=x86_64-conda-linux-gnu
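environ.txt appears to be an alphabetically sorted dump of the training process's environment variables. A sketch of how such a file can be produced:

import os

# Write the current environment, sorted by variable name, one KEY=value pair per line.
with open("environ.txt", "w") as f:
    for key, value in sorted(os.environ.items()):
        f.write(f"{key}={value}\n")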
preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
+{
+  "crop_size": {
+    "height": 512,
+    "width": 512
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    1.0,
+    1.0,
+    1.0
+  ],
+  "image_processor_type": "ChameleonImageProcessor",
+  "image_std": [
+    1.0,
+    1.0,
+    1.0
+  ],
+  "processor_class": "ChameleonProcessor",
+  "resample": 1,
+  "rescale_factor": 0.0078,
+  "size": {
+    "shortest_edge": 512
+  }
+}
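Worked through, this config rescales raw pixel values by 0.0078 (roughly 1/127.5) and then normalizes with mean and std of 1.0, mapping 0..255 approximately onto [-1, 1]:

# Effective per-pixel transform implied by preprocessor_config.json.
rescale_factor, mean, std = 0.0078, 1.0, 1.0
for pixel in (0, 127, 255):
    print(pixel, round((pixel * rescale_factor - mean) / std, 4))
# 0 -> -1.0, 127 -> -0.0094, 255 -> 0.989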
processor_config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "image_seq_length": 1024,
+  "image_token": "<image>",
+  "processor_class": "ChameleonProcessor"
+}
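A minimal sketch of loading this processor, assuming a local checkout of the repository and a transformers release that ships Chameleon support; the path is a placeholder:

from transformers import ChameleonProcessor

# Placeholder path to a local clone of this repository.
processor = ChameleonProcessor.from_pretrained("/path/to/this/repo")
print(processor.image_seq_length)  # 1024, per processor_config.json above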
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43395241aee86be4cd8c53758c653e006b4e5ddd39103fd6e68ea3e6882d2269
+size 14086364170
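pytorch_model.bin is stored as a Git LFS pointer; the roughly 14 GB of weights live in LFS storage. A sketch for verifying a fully downloaded copy against the pointer's oid and size:

import hashlib
import os

path = "pytorch_model.bin"  # the resolved LFS file, not the 3-line pointer
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)
print(digest.hexdigest() == "43395241aee86be4cd8c53758c653e006b4e5ddd39103fd6e68ea3e6882d2269")
print(os.path.getsize(path) == 14086364170)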
script.sh
ADDED
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+#
+# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+export CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
+export CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
+
+export TRITON_CACHE_DIR="/home/align-anything/cache/triton"
+
+export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
+export WANDB_MODE=online
+
+MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/chameleon-7b"
+
+DATASET_PATH=(
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_cosi_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_l0_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_random/tokenized"
+)
+
+DATASET_NAME=(
+    "q0_10_preference"
+    "q0_20_preference"
+    "q0_30_preference"
+    "q0_40_preference"
+    "q0_50_preference"
+    "q0_60_preference"
+    "q0_70_preference"
+    "q0_80_preference"
+    "q0_90_preference"
+)
+
+OUTPUT_PATH="/data/align-anything/hantao/align-anything/outputs/mm_interp"
+mkdir -p "$OUTPUT_PATH"
+
+# Initialize variables
+
+for dataset_path in "${DATASET_PATH[@]}"; do
+    for dataset_name in "${DATASET_NAME[@]}"; do
+        TRAIN_DATASETS=$dataset_path
+
+        # Dataset middle name: the second-to-last path component.
+        middle_name=$(echo "$dataset_path" | awk -F'/' '{print $(NF-1)}')
+        OUTPUT_DIR=$OUTPUT_PATH/$middle_name/$dataset_name
+        mkdir -p "$OUTPUT_DIR"
+        echo "Training on $TRAIN_DATASETS, output to $OUTPUT_DIR"
+        # Source the setup script
+        source ./setup.sh
+
+        # Execute deepspeed command
+        deepspeed \
+            --master_port ${MASTER_PORT} \
+            --module align_anything.trainers.text_image_to_text_image.dpo \
+            --model_name_or_path ${MODEL_NAME_OR_PATH} \
+            --train_datasets ${TRAIN_DATASETS} \
+            --output_dir ${OUTPUT_DIR} \
+            --per_device_train_batch_size 4 \
+            --per_device_eval_batch_size 4 \
+            --gradient_accumulation_steps 2 \
+            --train_template Chameleon_preference \
+            --train_split train \
+            --train_data_files ${dataset_name}.pt \
+            --learning_rate 1e-6 \
+            --epochs 3 \
+            --lr_scheduler_type cosine \
+            --save_interval 400
+
+        bash /data/align-anything/hantao/align-anything/outputs/cut.sh $OUTPUT_DIR
+    done
+done
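The middle_name assignment in the loop captures awk's output with command substitution and extracts the second-to-last component of each dataset path (the dataset's parent directory). The same computation in Python, for one of the paths from DATASET_PATH:

# Equivalent of: middle_name=$(echo "$dataset_path" | awk -F'/' '{print $(NF-1)}')
dataset_path = "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized"
middle_name = dataset_path.split("/")[-2]
print(middle_name)  # AA_preference_cocour_new_step10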
slice_1200/arguments.yaml
ADDED
@@ -0,0 +1,51 @@
+data_cfgs:
+  eval_data_files: null
+  eval_datasets: null
+  eval_optional_args: []
+  eval_size: null
+  eval_split: null
+  eval_subset: null
+  eval_template: null
+  train_data_files: q0_40_preference.pt
+  train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
+  train_optional_args: []
+  train_size: null
+  train_split: train
+  train_subset: null
+  train_template: Chameleon_preference
+logger_cfgs:
+  cache_dir: null
+  log_project: align-anything
+  log_run_name: dpo
+  log_type: wandb
+  output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
+  save_interval: 400.0
+model_cfgs:
+  model_max_length: 4096
+  model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
+  trust_remote_code: true
+  special_tokens: null
+train_cfgs:
+  adam_betas:
+  - 0.9
+  - 0.95
+  bf16: true
+  ds_cfgs: ds_z3_config.json
+  epochs: 3.0
+  eval_interval: 10
+  eval_strategy: epoch
+  fp16: false
+  freeze_language_model: true
+  freeze_mm_proj: true
+  freeze_vision_tower: false
+  gradient_accumulation_steps: 2.0
+  gradient_checkpointing: true
+  learning_rate: 1.0e-06
+  lr_scheduler_type: cosine
+  lr_warmup_ratio: 0.03
+  per_device_eval_batch_size: 4.0
+  per_device_train_batch_size: 4.0
+  regularization: 0.001
+  scale_coeff: 0.1
+  seed: 42
+  weight_decay: 0.01
slice_1200/config.json
ADDED
The diff for this file is too large to render; see the raw diff.
slice_1200/environ.txt
ADDED
@@ -0,0 +1,164 @@
+ADDR2LINE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-addr2line
+AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ar
+AS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-as
+BROWSER=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/bin/helpers/browser.sh
+BUILD=x86_64-conda-linux-gnu
+CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
+CC_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cc
+CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/jy-a:/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot/usr
+COLORTERM=truecolor
+CONDA_BACKUP_ADDR2LINE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-addr2line
+CONDA_BACKUP_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ar
+CONDA_BACKUP_AS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-as
+CONDA_BACKUP_BUILD=x86_64-conda-linux-gnu
+CONDA_BACKUP_CC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
+CONDA_BACKUP_CC_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
+CONDA_BACKUP_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
+CONDA_BACKUP_CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/hantao_proxy:/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot/usr
+CONDA_BACKUP_CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot
+CONDA_BACKUP_CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
+CONDA_BACKUP_CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
+CONDA_BACKUP_CPP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cpp
+CONDA_BACKUP_CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
+CONDA_BACKUP_CXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
+CONDA_BACKUP_CXXFILT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++filt
+CONDA_BACKUP_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
+CONDA_BACKUP_CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
+CONDA_BACKUP_DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+CONDA_BACKUP_DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
+CONDA_BACKUP_DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+CONDA_BACKUP_DWP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-dwp
+CONDA_BACKUP_ELFEDIT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-elfedit
+CONDA_BACKUP_GCC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc
+CONDA_BACKUP_GCC_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ar
+CONDA_BACKUP_GCC_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-nm
+CONDA_BACKUP_GCC_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ranlib
+CONDA_BACKUP_GPROF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gprof
+CONDA_BACKUP_GXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-g++
+CONDA_BACKUP_HOST=x86_64-conda-linux-gnu
+CONDA_BACKUP_LD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld
+CONDA_BACKUP_LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib/stubs
+CONDA_BACKUP_LD_GOLD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld.gold
+CONDA_BACKUP_MESON_ARGS=-Dbuildtype=release
+CONDA_BACKUP_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-nm
+CONDA_BACKUP_OBJCOPY=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objcopy
+CONDA_BACKUP_OBJDUMP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objdump
+CONDA_BACKUP_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ranlib
+CONDA_BACKUP_READELF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-readelf
+CONDA_BACKUP_SIZE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-size
+CONDA_BACKUP_STRINGS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strings
+CONDA_BACKUP_STRIP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strip
+CONDA_BACKUP__CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
+CONDA_BACKUP_build_alias=x86_64-conda-linux-gnu
+CONDA_BACKUP_host_alias=x86_64-conda-linux-gnu
+CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot
+CONDA_DEFAULT_ENV=hantao_stable
+CONDA_EXE=/data/align-anything/miniconda3/bin/conda
+CONDA_PREFIX=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PREFIX_1=/home/align-anything/miniconda3
+CONDA_PREFIX_10=/data/align-anything/miniconda3/envs/hantao_proxy
+CONDA_PREFIX_2=/data/align-anything/miniconda3/envs/jy-a
+CONDA_PREFIX_3=/data/align-anything/miniconda3
+CONDA_PREFIX_4=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PREFIX_5=/data/align-anything/miniconda3/envs/hantao_cham
+CONDA_PREFIX_6=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PREFIX_7=/data/align-anything/miniconda3/envs/hantao_stream
+CONDA_PREFIX_8=/data/align-anything/miniconda3/envs/hantao_proxy
+CONDA_PREFIX_9=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PROMPT_MODIFIER=(hantao_stable)
+CONDA_PYTHON_EXE=/data/align-anything/miniconda3/bin/python
+CONDA_ROOT=/home/align-anything/miniconda3
+CONDA_SHLVL=11
+CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
+CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
+CPP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cpp
+CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+CROSS_RANK=0
+CROSS_SIZE=1
+CUDA_MODULE_LOADING=LAZY
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
+CXXFILT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++filt
+CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
+DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
+DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
+DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+DWP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-dwp
+ELFEDIT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-elfedit
+GCC=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc
+GCC_AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ar
+GCC_NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-nm
+GCC_RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ranlib
+GIT_ASKPASS=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass.sh
+GPROF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gprof
+GXX=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-g++
+HOME=/home/align-anything
+HOST=x86_64-conda-linux-gnu
+LANG=en_US.UTF-8
+LD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld
+LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+LD_GOLD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld.gold
+LD_LIBRARY_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/../../lib64:
+LESSCLOSE=/usr/bin/lesspipe %s %s
+LESSOPEN=| /usr/bin/lesspipe %s
+LOCAL_RANK=0
+LOCAL_SIZE=8
+LOGLEVEL=WARNING
+LOGNAME=align-anything
+LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=52201
+MOTD_SHOWN=pam
+NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-nm
+NVCC_PREPEND_FLAGS= -ccbin=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
+OBJCOPY=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objcopy
+OBJDUMP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objdump
+OLDPWD=/data/align-anything/hantao/LLaMA-Factory
+PATH=/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/bin:/data/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
|
| 121 |
+
PWD=/data/align-anything/hantao/align-anything/scripts
|
| 122 |
+
PYGAME_HIDE_SUPPORT_PROMPT=1
|
| 123 |
+
PYTHONHASHSEED=42
|
| 124 |
+
PYTHONPATH=/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything
|
| 125 |
+
QT_QPA_FONTDIR=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/fonts
|
| 126 |
+
QT_QPA_PLATFORM_PLUGIN_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/plugins
|
| 127 |
+
RANK=0
|
| 128 |
+
RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ranlib
|
| 129 |
+
READELF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-readelf
|
| 130 |
+
SHELL=/bin/bash
|
| 131 |
+
SHLVL=3
|
| 132 |
+
SIZE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-size
|
| 133 |
+
SSH_CLIENT=117.136.0.149 36325 30400
|
| 134 |
+
SSH_CONNECTION=111.205.232.251 37945 10.10.212.194 30400
|
| 135 |
+
SSL_CERT_DIR=/usr/lib/ssl/certs
|
| 136 |
+
SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
|
| 137 |
+
STRINGS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strings
|
| 138 |
+
STRIP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strip
|
| 139 |
+
TERM=screen
|
| 140 |
+
TERM_PROGRAM=vscode
|
| 141 |
+
TERM_PROGRAM_VERSION=0.41.3
|
| 142 |
+
TMUX=/tmp/tmux-2000/default,34082,51
|
| 143 |
+
TMUX_PANE=%59
|
| 144 |
+
TRITON_CACHE_DIR=/home/align-anything/cache/triton
|
| 145 |
+
USER=align-anything
|
| 146 |
+
VSCODE_GIT_ASKPASS_EXTRA_ARGS=
|
| 147 |
+
VSCODE_GIT_ASKPASS_MAIN=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass-main.js
|
| 148 |
+
VSCODE_GIT_ASKPASS_NODE=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/node
|
| 149 |
+
VSCODE_GIT_IPC_HANDLE=/run/user/2000/vscode-git-ef8058c264.sock
|
| 150 |
+
VSCODE_IPC_HOOK_CLI=/run/user/2000/vscode-ipc-db013265-9a8a-4fb7-ba94-00b66d808feb.sock
|
| 151 |
+
WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
|
| 152 |
+
WANDB_MODE=online
|
| 153 |
+
WANDB_SERVICE=2-675697-tcp-localhost-45541
|
| 154 |
+
WORLD_SIZE=8
|
| 155 |
+
XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
|
| 156 |
+
XDG_RUNTIME_DIR=/run/user/2000
|
| 157 |
+
XDG_SESSION_CLASS=user
|
| 158 |
+
XDG_SESSION_ID=11
|
| 159 |
+
XDG_SESSION_TYPE=tty
|
| 160 |
+
_=/data/align-anything/miniconda3/envs/hantao_stable/bin/deepspeed
|
| 161 |
+
_CE_CONDA=
|
| 162 |
+
_CE_M=
|
| 163 |
+
build_alias=x86_64-conda-linux-gnu
|
| 164 |
+
host_alias=x86_64-conda-linux-gnu
|
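Note on the variables above: MASTER_ADDR, MASTER_PORT, RANK, LOCAL_RANK, and WORLD_SIZE are the standard rendezvous settings that the deepspeed launcher exports for each worker. A minimal sketch of how such an environment is consumed (an illustration, not code from this upload):

# Minimal sketch: torch.distributed's "env://" init method reads MASTER_ADDR,
# MASTER_PORT, RANK, and WORLD_SIZE straight from the environment captured above.
import os

import torch
import torch.distributed as dist

def init_from_env() -> None:
    dist.init_process_group(backend="nccl", init_method="env://")
    # One process per visible GPU: LOCAL_RANK selects the device
    # (here WORLD_SIZE=8 matches CUDA_VISIBLE_DEVICES=0,...,7).
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)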
slice_1200/preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 512,
    "width": 512
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    1.0,
    1.0,
    1.0
  ],
  "image_processor_type": "ChameleonImageProcessor",
  "image_std": [
    1.0,
    1.0,
    1.0
  ],
  "processor_class": "ChameleonProcessor",
  "resample": 1,
  "rescale_factor": 0.0078,
  "size": {
    "shortest_edge": 512
  }
}
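The normalization here is symmetric: rescale_factor 0.0078 (roughly 1/128) followed by mean 1.0 and std 1.0 maps 8-bit pixels into approximately [-1, 1]. A short sketch of loading this processor, assuming a transformers version with Chameleon support and this folder downloaded locally:

# Sketch, assuming transformers >= 4.44 (Chameleon support) and a local
# copy of this slice_1200 folder.
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("slice_1200")
white = Image.new("RGB", (640, 480), (255, 255, 255))
pixel_values = processor(white, return_tensors="np")["pixel_values"]

print(pixel_values.shape)  # (1, 3, 512, 512): resize to shortest_edge, then center crop
print(pixel_values.max())  # ~0.989, since 255 * 0.0078 - 1.0 = 0.989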
slice_1200/processor_config.json
ADDED
@@ -0,0 +1,5 @@
{
  "image_seq_length": 1024,
  "image_token": "<image>",
  "processor_class": "ChameleonProcessor"
}
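An image_seq_length of 1024 is consistent with a 32 x 32 grid of discrete image tokens for the 512 x 512 crop, assuming the 16x spatial downsampling of Chameleon's VQ image tokenizer (an assumption about the upstream model, not stated in this file):

# Back-of-envelope check; the 16x downsampling factor is an assumption.
crop_size = 512
downsample = 16
tokens_per_side = crop_size // downsample  # 32
assert tokens_per_side ** 2 == 1024        # matches image_seq_length above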
slice_1200/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9571b162d6b31a5769b7b07e625c1594325aaeb450e02cadcb988815ad68a79d
size 14086366930
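This is a Git LFS pointer, not the weights themselves: oid is the SHA-256 of the actual blob and size is its byte count (about 14.1 GB, plausible for a 7B model in 16-bit precision). A sketch for verifying a downloaded blob against the pointer:

# Sketch: check a downloaded pytorch_model.bin against the LFS pointer above.
import hashlib
from pathlib import Path

def verify_lfs_blob(path: str, oid: str, size: int) -> bool:
    p = Path(path)
    if p.stat().st_size != size:
        return False
    digest = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == oid

# Example (hypothetical local path):
# verify_lfs_blob("pytorch_model.bin",
#                 "9571b162d6b31a5769b7b07e625c1594325aaeb450e02cadcb988815ad68a79d",
#                 14086366930)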
slice_1200/script.sh
ADDED
@@ -0,0 +1,84 @@
#!/usr/bin/env bash
#
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

export CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
export CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++

export TRITON_CACHE_DIR="/home/align-anything/cache/triton"

export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
export WANDB_MODE=online

MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/chameleon-7b"

DATASET_PATH=(
    "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized"
    "/data/align-anything/hantao/data/mm_interp/AA_preference_cosi_new_step10/tokenized"
    "/data/align-anything/hantao/data/mm_interp/AA_preference_l0_new_step10/tokenized"
    "/data/align-anything/hantao/data/mm_interp/AA_preference_random/tokenized"
)

DATASET_NAME=(
    "q0_10_preference"
    "q0_20_preference"
    "q0_30_preference"
    "q0_40_preference"
    "q0_50_preference"
    "q0_60_preference"
    "q0_70_preference"
    "q0_80_preference"
    "q0_90_preference"
)

OUTPUT_PATH="/data/align-anything/hantao/align-anything/outputs/mm_interp"
mkdir -p "$OUTPUT_PATH"

for dataset_path in "${DATASET_PATH[@]}"; do
    for dataset_name in "${DATASET_NAME[@]}"; do
        TRAIN_DATASETS=$dataset_path

        # Dataset middle name: the second-to-last path component,
        # e.g. AA_preference_cocour_new_step10.
        middle_name=$(echo "$dataset_path" | awk -F'/' '{print $(NF-1)}')
        OUTPUT_DIR=$OUTPUT_PATH/$middle_name/$dataset_name
        mkdir -p "$OUTPUT_DIR"
        echo "Training on $TRAIN_DATASETS, output to $OUTPUT_DIR"
        # Source the setup script
        source ./setup.sh

        # Execute deepspeed command
        deepspeed \
            --master_port ${MASTER_PORT} \
            --module align_anything.trainers.text_image_to_text_image.dpo \
            --model_name_or_path ${MODEL_NAME_OR_PATH} \
            --train_datasets ${TRAIN_DATASETS} \
            --output_dir ${OUTPUT_DIR} \
            --per_device_train_batch_size 4 \
            --per_device_eval_batch_size 4 \
            --gradient_accumulation_steps 2 \
            --train_template Chameleon_preference \
            --train_split train \
            --train_data_files ${dataset_name}.pt \
            --learning_rate 1e-6 \
            --epochs 3 \
            --lr_scheduler_type cosine \
            --save_interval 400

        bash /data/align-anything/hantao/align-anything/outputs/cut.sh "$OUTPUT_DIR"
    done
done
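Note: the original upload had `middle_name= echo "$dataset_path" | awk ...`, which assigns an empty string and echoes the path instead of capturing it; the `$(...)` command substitution above is the intended behavior. The nested loop launches 4 dataset paths x 9 data files = 36 sequential DPO runs, and combined with the 8 GPUs visible in environ.txt the flags imply 64 samples per optimizer step. A quick sanity check (illustration only):

# Effective batch size implied by the launch flags and WORLD_SIZE=8.
per_device_train_batch_size = 4
world_size = 8
gradient_accumulation_steps = 2
print(per_device_train_batch_size * world_size * gradient_accumulation_steps)  # 64

print(4 * 9)  # 36 sequential training runs in total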
slice_1200/special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "<reserved08706>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
slice_1200/tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
slice_1200/tokenizer_config.json
ADDED
The diff for this file is too large to render. See raw diff.
slice_1200/wandb/debug-internal.log
ADDED
@@ -0,0 +1,22 @@
{"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
{"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
{"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
{"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
{"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
{"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
{"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
{"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
{"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
{"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
{"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
{"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
{"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
{"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
{"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
{"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
{"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
{"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
{"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
{"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
{"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
{"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
slice_1200/wandb/debug.log
ADDED
@@ -0,0 +1,33 @@
2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
slice_1200/wandb/run-20250101_084116-coewtb43/files/config.yaml
ADDED
@@ -0,0 +1,98 @@
_wandb:
  value:
    cli_version: 0.18.3
    m: []
    python_version: 3.11.10
    t:
      "1":
        - 1
        - 11
        - 41
        - 49
        - 51
        - 55
        - 71
        - 83
        - 98
        - 105
      "2":
        - 1
        - 11
        - 41
        - 49
        - 51
        - 55
        - 71
        - 83
        - 98
        - 105
      "3":
        - 2
        - 13
        - 16
        - 23
        - 55
        - 61
      "4": 3.11.10
      "5": 0.18.3
      "6": 4.45.2
      "8":
        - 5
      "12": 0.18.3
      "13": linux-x86_64
data_cfgs:
  value:
    eval_data_files: null
    eval_datasets: null
    eval_optional_args: []
    eval_size: null
    eval_split: null
    eval_subset: null
    eval_template: null
    train_data_files: q0_40_preference.pt
    train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
    train_optional_args: []
    train_size: null
    train_split: train
    train_subset: null
    train_template: Chameleon_preference
logger_cfgs:
  value:
    cache_dir: null
    log_project: align-anything
    log_run_name: dpo
    log_type: wandb
    output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
    save_interval: 400
model_cfgs:
  value:
    model_max_length: 4096
    model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
    trust_remote_code: true
special_tokens:
  value: null
train_cfgs:
  value:
    adam_betas:
      - 0.9
      - 0.95
    bf16: true
    ds_cfgs: ds_z3_config.json
    epochs: 3
    eval_interval: 10
    eval_strategy: epoch
    fp16: false
    freeze_language_model: true
    freeze_mm_proj: true
    freeze_vision_tower: false
    gradient_accumulation_steps: 2
    gradient_checkpointing: true
    learning_rate: 1e-06
    lr_scheduler_type: cosine
    lr_warmup_ratio: 0.03
    per_device_eval_batch_size: 4
    per_device_train_batch_size: 4
    regularization: 0.001
    scale_coeff: 0.1
    seed: 42
    weight_decay: 0.01
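One detail worth keeping in mind when reading the training log that follows: save_interval (400) appears to be counted in trainer micro-steps, while DeepSpeed tags checkpoints by optimizer steps, so with gradient_accumulation_steps = 2 the save at trainer step 400 is written under tag global_step200 as pytorch_model_400.bin, and the slice_400/slice_800/slice_1200 folders in this upload line up with those saves. A one-line check under that reading:

# Assumed relationship between trainer steps and DeepSpeed tags in the log below.
save_interval = 400
gradient_accumulation_steps = 2
print(save_interval // gradient_accumulation_steps)  # 200 -> tag "global_step200"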
slice_1200/wandb/run-20250101_084116-coewtb43/files/output.log
ADDED
@@ -0,0 +1,224 @@
***** Running training *****
Training 1/3.0 epoch: 0%| | 0/1422.0 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
Training 1/3.0 epoch (loss 11.8749): 21%|██████████████████████████████████████████████████▋ | 299/1422.0 [43:24<2:59:40, 9.60s/it]
[2025-01-01 08:42:55,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 08:43:25,944] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 08:44:17,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[4.761904761904761e-07, 4.761904761904761e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 08:45:03,075] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 08:47:04,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.523809523809522e-07, 9.523809523809522e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 08:48:02,529] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 08:48:43,904] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 08:50:07,690] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[9.995802740501932e-07, 9.995802740501932e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 08:51:09,012] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 08:53:09,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[9.98130274211278e-07, 9.98130274211278e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 08:55:49,714] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 08:55:49,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[9.956478233113064e-07, 9.956478233113064e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 08:58:16,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[9.921380666088558e-07, 9.921380666088558e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:01:07,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[9.876082786106545e-07, 9.876082786106545e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:01:27,047] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:04:03,792] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:04:24,181] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:04:24,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[9.820678479940571e-07, 9.820678479940571e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:05:30,688] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:07:12,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[9.755282581475767e-07, 9.755282581475767e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:09:19,110] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:10:10,785] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:10:30,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[9.68003063369808e-07, 9.68003063369808e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:10:51,320] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:13:33,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[9.595078607760749e-07, 9.595078607760749e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:14:56,362] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:16:12,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[9.500602579710255e-07, 9.500602579710255e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:17:52,852] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:19:16,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[9.39679836554184e-07, 9.39679836554184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:21:51,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[9.283881115340955e-07, 9.283881115340955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:22:22,638] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:24:48,036] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[9.16208486735184e-07, 9.16208486735184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:25:39,046] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:27:38,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[9.03166206289754e-07, 9.03166206289754e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:28:42,774] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:29:54,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[8.8928830231567e-07, 8.8928830231567e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:30:34,992] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:31:34,787] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:32:15,776] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:32:50,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[8.746035388881654e-07, 8.746035388881654e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:35:09,447] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:35:48,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[8.591423524219029e-07, 8.591423524219029e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:36:31,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:37:59,484] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:38:30,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[8.429367885868581e-07, 8.429367885868581e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
Saving checkpoint at step 400 ...
Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
Saving 16-bit model...
[2025-01-01 09:38:39,848] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
[2025-01-01 09:38:39,849] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
[2025-01-01 09:38:39,850] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
[2025-01-01 09:38:58,770] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
[2025-01-01 09:38:58,773] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
Model saved!
Saving 16-bit model...
[2025-01-01 09:39:05,477] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
[2025-01-01 09:39:05,478] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
[2025-01-01 09:39:05,479] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
[2025-01-01 09:39:27,199] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
[2025-01-01 09:39:27,202] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
Model saved!
Checkpoint saved.
[2025-01-01 09:42:08,747] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[8.260204358887753e-07, 8.260204358887753e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:43:59,051] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:44:19,070] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:45:02,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[8.084283560518583e-07, 8.084283560518583e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:47:51,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[7.901970113479955e-07, 7.901970113479955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:48:18,628] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:50:44,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[7.713641890231308e-07, 7.713641890231308e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:51:29,985] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:52:00,601] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:53:37,616] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:53:37,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[7.51968922977428e-07, 7.51968922977428e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:56:36,667] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:56:36,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[7.320514128615511e-07, 7.320514128615511e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:57:18,253] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:59:22,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[7.116529407567488e-07, 7.116529407567488e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:59:42,581] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:02:27,253] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[6.908157856114392e-07, 6.908157856114392e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:04:22,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:05:15,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[6.695831356116303e-07, 6.695831356116303e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:07:32,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[6.479989986668117e-07, 6.479989986668117e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:09:58,442] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:10:45,662] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[6.261081111968403e-07, 6.261081111968403e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:12:34,690] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:12:55,185] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:13:41,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[6.039558454088795e-07, 6.039558454088795e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:14:01,438] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:16:42,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[5.815881152565711e-07, 5.815881152565711e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:17:49,827] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:18:41,651] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:19:22,050] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:20:00,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[5.590512812763541e-07, 5.590512812763541e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:22:48,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[5.363920544981748e-07, 5.363920544981748e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:23:27,066] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:25:42,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[5.136573996297429e-07, 5.136573996297429e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:26:22,366] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:28:45,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[4.908944377150043e-07, 4.908944377150043e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:30:51,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:31:03,983] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[4.681503484685803e-07, 4.681503484685803e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:34:07,594] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:34:07,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[4.454722724886051e-07, 4.454722724886051e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:36:51,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[4.229072135506384e-07, 4.229072135506384e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
Saving checkpoint at step 800 ...
Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
Saving 16-bit model...
[2025-01-01 10:37:00,920] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
[2025-01-01 10:37:00,921] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
[2025-01-01 10:37:00,921] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
[2025-01-01 10:37:17,303] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
[2025-01-01 10:37:17,305] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
Model saved!
Saving 16-bit model...
[2025-01-01 10:37:24,304] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
|
| 118 |
+
[2025-01-01 10:37:24,306] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
|
| 119 |
+
[2025-01-01 10:37:24,306] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
|
| 120 |
+
[2025-01-01 10:37:47,861] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
|
| 121 |
+
[2025-01-01 10:37:47,862] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
|
| 122 |
+
Model saved!
|
| 123 |
+
Checkpoint saved.
|
| 124 |
+
[2025-01-01 10:38:06,545] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 125 |
+
[2025-01-01 10:39:58,968] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 126 |
+
[2025-01-01 10:40:18,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[4.005019411851609e-07, 4.005019411851609e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 127 |
+
[2025-01-01 10:40:58,615] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 128 |
+
[2025-01-01 10:41:39,701] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 129 |
+
[2025-01-01 10:43:08,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[3.783028937405821e-07, 3.783028937405821e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 130 |
+
[2025-01-01 10:44:32,585] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 131 |
+
[2025-01-01 10:45:54,172] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 132 |
+
[2025-01-01 10:45:54,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[3.563560821326706e-07, 3.563560821326706e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 133 |
+
[2025-01-01 10:47:22,278] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 134 |
+
[2025-01-01 10:48:37,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[3.3470699447990527e-07, 3.3470699447990527e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 135 |
+
[2025-01-01 10:51:26,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[3.1340050182240436e-07, 3.1340050182240436e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 136 |
+
[2025-01-01 10:52:26,337] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 137 |
+
[2025-01-01 10:52:46,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 138 |
+
[2025-01-01 10:54:14,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[2.92480765119841e-07, 2.92480765119841e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 139 |
+
[2025-01-01 10:56:46,300] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 140 |
+
[2025-01-01 10:57:28,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[2.719911437211122e-07, 2.719911437211122e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 141 |
+
[2025-01-01 10:59:56,353] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 142 |
+
[2025-01-01 10:59:56,354] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[2.5197410549546595e-07, 2.5197410549546595e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 143 |
+
[2025-01-01 11:00:26,971] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 144 |
+
[2025-01-01 11:02:03,646] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 145 |
+
[2025-01-01 11:02:54,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[2.3247113881135781e-07, 2.3247113881135781e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 146 |
+
[2025-01-01 11:05:02,388] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 147 |
+
[2025-01-01 11:05:43,792] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 148 |
+
[2025-01-01 11:05:55,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[2.1352266654547125e-07, 2.1352266654547125e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 149 |
+
[2025-01-01 11:08:07,702] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 150 |
+
[2025-01-01 11:08:38,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[1.9516796230013272e-07, 1.9516796230013272e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 151 |
+
[2025-01-01 11:11:35,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[1.774450690027746e-07, 1.774450690027746e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 152 |
+
[2025-01-01 11:12:47,870] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 153 |
+
[2025-01-01 11:14:16,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[1.6039072005615716e-07, 1.6039072005615716e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 154 |
+
[2025-01-01 11:17:00,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[1.4404026320278317e-07, 1.4404026320278317e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 155 |
+
[2025-01-01 11:18:23,592] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 156 |
+
[2025-01-01 11:19:58,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[1.284275872613028e-07, 1.284275872613028e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 157 |
+
[2025-01-01 11:20:59,657] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 158 |
+
[2025-01-01 11:21:20,032] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 159 |
+
[2025-01-01 11:22:26,091] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 160 |
+
[2025-01-01 11:23:04,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[1.1358505188676288e-07, 1.1358505188676288e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 161 |
+
[2025-01-01 11:26:14,637] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 162 |
+
[2025-01-01 11:26:14,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[9.95434205002792e-08, 9.95434205002792e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 163 |
+
[2025-01-01 11:27:06,507] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 164 |
+
[2025-01-01 11:27:47,054] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 165 |
+
[2025-01-01 11:29:24,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[8.633179652714916e-08, 8.633179652714916e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 166 |
+
[2025-01-01 11:31:52,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 167 |
+
[2025-01-01 11:32:11,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[7.397756307555885e-08, 7.397756307555885e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 168 |
+
[2025-01-01 11:34:47,839] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 169 |
+
[2025-01-01 11:34:59,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[6.250632618090867e-08, 6.250632618090867e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 170 |
+
Saving checkpoint at step 1200 ...
|
| 171 |
+
Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
|
| 172 |
+
Saving 16-bit model...
|
| 173 |
+
[2025-01-01 11:35:08,703] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
|
| 174 |
+
[2025-01-01 11:35:08,704] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
|
| 175 |
+
[2025-01-01 11:35:08,704] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
|
| 176 |
+
[2025-01-01 11:35:25,316] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
|
| 177 |
+
[2025-01-01 11:35:25,317] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
|
| 178 |
+
Model saved!
|
| 179 |
+
Saving 16-bit model...
|
| 180 |
+
[2025-01-01 11:35:32,446] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
|
| 181 |
+
[2025-01-01 11:35:32,447] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
|
| 182 |
+
[2025-01-01 11:35:32,447] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
|
| 183 |
+
[2025-01-01 11:35:53,847] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
|
| 184 |
+
[2025-01-01 11:35:53,849] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
|
| 185 |
+
Model saved!
|
| 186 |
+
Checkpoint saved.
|
| 187 |
+
[2025-01-01 11:38:41,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[5.194186173339599e-08, 5.194186173339599e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 188 |
+
[2025-01-01 11:40:10,376] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 189 |
+
[2025-01-01 11:41:25,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[4.230606619885108e-08, 4.230606619885108e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 190 |
+
[2025-01-01 11:43:26,712] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 191 |
+
[2025-01-01 11:44:25,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[3.3618911234968236e-08, 3.3618911234968236e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 192 |
+
[2025-01-01 11:46:29,769] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 193 |
+
[2025-01-01 11:46:53,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[2.589840229699558e-08, 2.589840229699558e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 194 |
+
[2025-01-01 11:48:22,179] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 195 |
+
[2025-01-01 11:49:22,087] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 196 |
+
[2025-01-01 11:49:35,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[1.9160541318679224e-08, 1.9160541318679224e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 197 |
+
[2025-01-01 11:50:02,959] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 198 |
+
[2025-01-01 11:52:16,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[1.3419293545812338e-08, 1.3419293545812338e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 199 |
+
[2025-01-01 11:52:56,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 200 |
+
[2025-01-01 11:54:18,301] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 201 |
+
[2025-01-01 11:55:03,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[8.686558591130156e-09, 8.686558591130156e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 202 |
+
[2025-01-01 11:55:46,245] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 203 |
+
[2025-01-01 11:57:47,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[4.972145770545999e-09, 4.972145770545999e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 204 |
+
[2025-01-01 12:00:50,391] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 205 |
+
[2025-01-01 12:00:50,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[2.283753771845587e-09, 2.283753771845587e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 206 |
+
[2025-01-01 12:01:10,430] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 207 |
+
[2025-01-01 12:03:22,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[6.269546979813523e-10, 6.269546979813523e-10], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 208 |
+
[2025-01-01 12:05:10,192] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 209 |
+
[2025-01-01 12:06:28,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[5.182518037827321e-12, 5.182518037827321e-12], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 210 |
+
Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
|
| 211 |
+
Saving 16-bit model...
|
| 212 |
+
[2025-01-01 12:06:57,036] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
|
| 213 |
+
[2025-01-01 12:06:57,037] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
|
| 214 |
+
[2025-01-01 12:06:57,037] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
|
| 215 |
+
[2025-01-01 12:07:17,768] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
|
| 216 |
+
[2025-01-01 12:07:17,770] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
|
| 217 |
+
Model saved!
|
| 218 |
+
Saving 16-bit model...
|
| 219 |
+
[2025-01-01 12:07:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
|
| 220 |
+
[2025-01-01 12:07:24,459] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
|
| 221 |
+
[2025-01-01 12:07:24,459] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
|
| 222 |
+
[2025-01-01 12:07:46,734] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
|
| 223 |
+
[2025-01-01 12:07:46,737] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
|
| 224 |
+
Model saved!
|
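Note: the recurring stage3.py warning above is DeepSpeed's hint that the CUDA caching allocator is being flushed under memory pressure, and its suggested mitigation is a synchronized get_accelerator().empty_cache() call in the training loop. Below is a minimal sketch of that mitigation only; `engine` (a DeepSpeedEngine), `dataloader`, and the flush interval are illustrative assumptions, not taken from this run. Also worth noting: "Saving checkpoint at step 800" pairs with DeepSpeed tag global_step400, which is consistent with gradient_accumulation_steps set to 2 in arguments.yaml (800 micro-batch steps equal 400 optimizer steps).

# Sketch only: synchronized allocator flush, as the DeepSpeed warning suggests.
from deepspeed.accelerator import get_accelerator

for step, batch in enumerate(dataloader):   # `dataloader` is assumed
    loss = engine(**batch)                  # `engine` is an assumed DeepSpeedEngine; forward returns the loss here
    engine.backward(loss)                   # ZeRO-3 backward
    engine.step()                           # optimizer step
    if step % 50 == 0:                      # illustrative interval
        # Flush on every rank at the same point so ranks do not stall
        # on mismatched implicit cache flushes.
        get_accelerator().empty_cache()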
slice_1200/wandb/run-20250101_084116-coewtb43/files/requirements.txt
ADDED
@@ -0,0 +1,248 @@
+align-anything==0.0.1.dev0
+gitdb==4.0.11
+wcwidth==0.2.13
+identify==2.6.1
+tomlkit==0.12.0
+bitsandbytes==0.44.1
+trl==0.9.6
+pytest-split==0.8.0
+gradio==4.44.1
+pip==24.2
+multidict==6.1.0
+fairscale==0.4.13
+mistral_common==1.4.4
+python-dotenv==1.0.1
+uvloop==0.20.0
+absl-py==2.1.0
+tiktoken==0.7.0
+pydub==0.25.1
+websockets==12.0
+llamafactory==0.9.1.dev0
+triton==3.0.0
+tifffile==2024.9.20
+safe-rlhf==0.0.1.dev0
+pandas==2.2.3
+grpcio==1.66.2
+click==8.1.7
+ninja==1.11.1.1
+rich==13.9.2
+Jinja2==3.1.4
+Pygments==2.18.0
+nvidia-cudnn-cu12==9.1.0.70
+importlib_resources==6.4.5
+GitPython==3.1.43
+nvidia-cufft-cu12==11.0.2.54
+tensorboard-data-server==0.7.2
+align-anything==0.0.1.dev0
+six==1.16.0
+scipy==1.14.1
+mpmath==1.3.0
+jsonschema-specifications==2024.10.1
+scikit-image==0.24.0
+zipp==3.20.2
+cycler==0.12.1
+MarkupSafe==2.1.5
+tzdata==2024.2
+idna==3.10
+pycountry==24.6.1
+nvidia-nccl-cu12==2.20.5
+matplotlib==3.9.2
+pytz==2024.2
+uvicorn==0.31.1
+dill==0.3.8
+pyparsing==3.1.4
+pytest==7.2.0
+jiter==0.6.1
+safetensors==0.4.5
+typing_extensions==4.12.2
+decorator==4.4.2
+typeguard==4.4.1
+prometheus_client==0.21.0
+nvidia-cuda-cupti-cu12==12.1.105
+sentencepiece==0.2.0
+requests==2.32.3
+kiwisolver==1.4.7
+gdown==5.2.0
+multiprocess==0.70.16
+xxhash==3.5.0
+PyYAML==6.0.2
+gguf==0.10.0
+nvidia-nvtx-cu12==12.1.105
+hpsv2==1.2.0
+tensorboard==2.18.0
+nodeenv==1.9.1
+filelock==3.16.1
+distro==1.9.0
+scikit-learn==1.5.2
+huggingface-hub==0.25.2
+pyairports==2.1.1
+importlib_metadata==8.5.0
+pyarrow==17.0.0
+llvmlite==0.43.0
+ray==2.37.0
+tokenizers==0.20.3
+nvidia-nvjitlink-cu12==12.6.77
+av==14.0.1
+deepspeed==0.15.2
+clip==0.2.0
+shtab==1.7.1
+certifi==2024.8.30
+braceexpand==0.1.7
+nvidia-ml-py==12.560.30
+webdataset==0.2.100
+docker-pycreds==0.4.0
+einops==0.8.0
+iniconfig==2.0.0
+tyro==0.9.2
+torchvision==0.19.0
+accelerate==0.34.2
+beautifulsoup4==4.12.3
+pyzmq==26.2.0
+pycparser==2.22
+nvidia-curand-cu12==10.3.2.106
+msgpack==1.1.0
+soxr==0.5.0.post1
+platformdirs==4.3.6
+h11==0.14.0
+psutil==6.0.0
+pydantic==2.9.2
+shellingham==1.5.4
+imageio-ffmpeg==0.5.1
+wandb==0.18.3
+audioread==3.0.1
+annotated-types==0.7.0
+docstring_parser==0.16
+cloudpickle==3.1.0
+regex==2024.9.11
+packaging==24.1
+timm==0.6.13
+aiosignal==1.3.1
+numba==0.60.0
+orjson==3.10.7
+rpds-py==0.20.0
+virtualenv==20.26.6
+joblib==1.4.2
+charset-normalizer==3.4.0
+httpx==0.27.2
+ffmpy==0.4.0
+lm-format-enforcer==0.10.6
+yt-dlp==2024.8.6
+sympy==1.13.3
+python-dateutil==2.9.0.post0
+nvidia-cusolver-cu12==11.4.5.107
+msgspec==0.18.6
+mdurl==0.1.2
+torch==2.4.0
+fastapi==0.115.0
+optree==0.13.0
+PySocks==1.7.1
+transformers==4.46.0.dev0
+torchlibrosa==0.1.0
+fsspec==2024.6.1
+nvidia-cublas-cu12==12.1.3.1
+gradio_client==1.3.0
+args==0.1.0
+cffi==1.17.1
+fonttools==4.54.1
+clint==0.5.1
+lark==1.2.2
+tqdm==4.66.5
+semantic-version==2.10.0
+pooch==1.8.2
+markdown-it-py==3.0.0
+pydantic_core==2.23.4
+sniffio==1.3.1
+httptools==0.6.1
+nvidia-cuda-runtime-cu12==12.1.105
+anyio==4.6.0
+ftfy==6.3.0
+Markdown==3.7
+datasets==2.21.0
+diffusers==0.30.3
+nvidia-cuda-nvrtc-cu12==12.1.105
+vllm==0.6.2
+starlette==0.38.6
+flash-attn==2.7.0.post2
+urllib3==2.2.3
+Werkzeug==3.0.4
+py-cpuinfo==9.0.0
+moviepy==1.0.3
+librosa==0.10.2.post1
+peft==0.12.0
+soupsieve==2.6
+lazy_loader==0.4
+pluggy==1.5.0
+setuptools==75.1.0
+sentry-sdk==2.16.0
+tabulate==0.9.0
+transformers==4.45.2
+pre_commit==4.0.1
+termcolor==2.5.0
+frechet-audio-distance==0.1.2
+pytorch-fid==0.3.0
+setproctitle==1.3.3
+jsonschema==4.23.0
+aiofiles==23.2.1
+contourpy==1.3.0
+distlib==0.3.9
+interegular==0.3.3
+fire==0.7.0
+diskcache==5.6.3
+proglog==0.1.10
+soundfile==0.12.1
+protobuf==3.20.3
+smmap==5.0.1
+pycryptodomex==3.21.0
+Brotli==1.1.0
+pillow==10.4.0
+frozenlist==1.4.1
+numpy==1.26.4
+mutagen==1.47.0
+outlines==0.0.46
+attrs==24.2.0
+torchaudio==2.4.0
+aiohttp==3.10.10
+ruff==0.6.9
+watchfiles==0.24.0
+threadpoolctl==3.5.0
+nest-asyncio==1.6.0
+partial-json-parser==0.2.1.1.post4
+sse-starlette==2.1.3
+shortuuid==1.0.13
+typer==0.12.5
+prometheus-fastapi-instrumentator==7.0.0
+imageio==2.35.1
+wheel==0.44.0
+image-reward==1.5
+networkx==3.4.1
+propcache==0.2.0
+aiohappyeyeballs==2.4.3
+nvidia-cusparse-cu12==12.1.0.106
+xformers==0.0.27.post2
+cfgv==3.4.0
+python-multipart==0.0.12
+httpcore==1.0.6
+opencv-python==4.6.0.66
+resampy==0.4.3
+yarl==1.15.0
+referencing==0.35.1
+openai==1.51.2
+hjson==3.1.0
+llamafactory==0.9.1.dev0
+jaraco.collections==5.1.0
+backports.tarfile==1.2.0
+more-itertools==10.3.0
+wheel==0.43.0
+importlib_metadata==8.0.0
+zipp==3.19.2
+autocommand==2.2.2
+jaraco.functools==4.0.1
+platformdirs==4.2.2
+tomli==2.0.1
+jaraco.text==3.12.1
+typing_extensions==4.12.2
+jaraco.context==5.3.0
+importlib_resources==6.4.0
+inflect==7.3.1
+packaging==24.1
+typeguard==4.3.0
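Note: this pip freeze pins several distributions twice at different versions (transformers 4.46.0.dev0 and 4.45.2, wheel 0.44.0 and 0.43.0, zipp, platformdirs, importlib_metadata, typeguard), which usually means two site-packages trees overlap on the path. A small sketch that flags such conflicting pins; the file name is an assumption and any pip-freeze-style file works:

# Sketch: flag packages pinned more than once in a requirements dump.
from collections import defaultdict

pins = defaultdict(set)
with open("requirements.txt") as f:          # assumed file name
    for line in f:
        line = line.strip()
        if "==" in line and not line.startswith("#"):
            name, version = line.split("==", 1)
            pins[name.lower()].add(version)

for name, versions in sorted(pins.items()):
    if len(versions) > 1:
        print(f"{name}: conflicting pins {sorted(versions)}")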
slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json
ADDED
@@ -0,0 +1,112 @@
+{
+  "os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.31",
+  "python": "3.11.10",
+  "startedAt": "2025-01-01T08:41:16.157770Z",
+  "args": [
+    "--local_rank=0",
+    "--model_name_or_path",
+    "/data/align-anything/hantao/models/chameleon-7b",
+    "--train_datasets",
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized",
+    "--output_dir",
+    "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
+    "--per_device_train_batch_size",
+    "4",
+    "--per_device_eval_batch_size",
+    "4",
+    "--gradient_accumulation_steps",
+    "2",
+    "--train_template",
+    "Chameleon_preference",
+    "--train_split",
+    "train",
+    "--train_data_files",
+    "q0_40_preference.pt",
+    "--learning_rate",
+    "1e-6",
+    "--epochs",
+    "3",
+    "--lr_scheduler_type",
+    "cosine",
+    "--save_interval",
+    "400"
+  ],
+  "program": "-m align_anything.trainers.text_image_to_text_image.dpo",
+  "git": {
+    "remote": "https://github.com/PKU-Alignment/align-anything.git",
+    "commit": "6fde660afc9985323f147930eedf188a5699adc7"
+  },
+  "email": "[email protected]",
+  "root": "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
+  "host": "lyg0194",
+  "username": "align-anything",
+  "executable": "/data/align-anything/miniconda3/envs/hantao_stable/bin/python",
+  "cpu_count": 64,
+  "cpu_count_logical": 128,
+  "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "939477946368",
+      "used": "596714827776"
+    }
+  },
+  "memory": {
+    "total": "1081823907840"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 128
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    }
+  ],
+  "cudaVersion": "12.4"
+}
slice_1200/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
+{"train/loss":1.2820848226547241,"_step":1422,"train/better_sample_reward":92.50687408447266,"train/reward_margin":134.08236694335938,"train/lr":0,"train/worse_sample_reward":-41.57551193237305,"_wandb":{"runtime":12390},"_timestamp":1.7357332075421584e+09,"train/step":1422,"train/epoch":3,"_runtime":12390.697992413,"train/reward_accuracy":0.90625,"train/reward":50.931365966796875}
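Note: the final DPO reward fields in this summary are internally consistent: train/better_sample_reward minus train/worse_sample_reward (92.5069 - (-41.5755)) reproduces train/reward_margin of about 134.0824 to float precision. A sketch that checks this from the file (the local path is an assumption):

# Sketch: sanity-check the final DPO reward fields in wandb-summary.json.
import json

with open("wandb-summary.json") as f:        # assumed local path
    s = json.load(f)

margin = s["train/better_sample_reward"] - s["train/worse_sample_reward"]
assert abs(margin - s["train/reward_margin"]) < 1e-3   # ~134.0824 both ways
print(f"step={s['train/step']}, reward_accuracy={s['train/reward_accuracy']:.5f}")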
slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
ADDED
@@ -0,0 +1,22 @@
+{"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
+{"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
+{"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
+{"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
+{"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
+{"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
+{"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
+{"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
+{"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
+{"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
+{"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
+{"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
slice_1200/wandb/run-20250101_084116-coewtb43/logs/debug.log
ADDED
@@ -0,0 +1,33 @@
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
+2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
+2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
+config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
+2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
+2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
+2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
+2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
+2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
+2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
+2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
+2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
+2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
+2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
+2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
+2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
+2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
+2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
+2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
+2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
+2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
+2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
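Note: debug.log records the exact config dict this run passed to wandb.init. A trimmed sketch of an equivalent call follows; the config shown is abridged from the dict above, not a complete reproduction.

# Sketch of an equivalent wandb.init call, abridged from debug.log.
import wandb

run = wandb.init(
    project="align-anything",
    name="dpo",
    mode="online",
    config={
        "train_cfgs": {"epochs": 3.0, "learning_rate": 1e-06,
                       "lr_scheduler_type": "cosine", "scale_coeff": 0.1},
        "data_cfgs": {"train_data_files": "q0_40_preference.pt"},
    },
)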
slice_1200/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6184cd062ec8be4c9517ecc56b37ff397dd3f29795bce1cd495613256a3f6f76
+size 12650956
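Note: run-coewtb43.wandb is checked in as a Git LFS pointer; the three lines above (version, oid, size) stand in for a 12,650,956-byte binary whose SHA-256 is recorded in the oid field. A sketch that parses such a pointer (the local path is an assumption):

# Sketch: parse a Git LFS pointer file like the one above.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_lfs_pointer("run-coewtb43.wandb")   # assumed local path
print(ptr["oid"], int(ptr["size"]))             # sha256:6184..., 12650956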
slice_400/arguments.yaml
ADDED
@@ -0,0 +1,51 @@
+data_cfgs:
+  eval_data_files: null
+  eval_datasets: null
+  eval_optional_args: []
+  eval_size: null
+  eval_split: null
+  eval_subset: null
+  eval_template: null
+  train_data_files: q0_40_preference.pt
+  train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
+  train_optional_args: []
+  train_size: null
+  train_split: train
+  train_subset: null
+  train_template: Chameleon_preference
+logger_cfgs:
+  cache_dir: null
+  log_project: align-anything
+  log_run_name: dpo
+  log_type: wandb
+  output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
+  save_interval: 400.0
+model_cfgs:
+  model_max_length: 4096
+  model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
+  trust_remote_code: true
+special_tokens: null
+train_cfgs:
+  adam_betas:
+  - 0.9
+  - 0.95
+  bf16: true
+  ds_cfgs: ds_z3_config.json
+  epochs: 3.0
+  eval_interval: 10
+  eval_strategy: epoch
+  fp16: false
+  freeze_language_model: true
+  freeze_mm_proj: true
+  freeze_vision_tower: false
+  gradient_accumulation_steps: 2.0
+  gradient_checkpointing: true
+  learning_rate: 1.0e-06
+  lr_scheduler_type: cosine
+  lr_warmup_ratio: 0.03
+  per_device_eval_batch_size: 4.0
+  per_device_train_batch_size: 4.0
+  regularization: 0.001
+  scale_coeff: 0.1
+  seed: 42
+  weight_decay: 0.01
slice_400/config.json
ADDED
The diff for this file is too large to render.
See raw diff
slice_400/environ.txt
ADDED
@@ -0,0 +1,164 @@
+ADDR2LINE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-addr2line
+AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ar
+AS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-as
+BROWSER=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/bin/helpers/browser.sh
+BUILD=x86_64-conda-linux-gnu
+CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
+CC_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cc
+CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/jy-a:/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot/usr
+COLORTERM=truecolor
+CONDA_BACKUP_ADDR2LINE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-addr2line
+CONDA_BACKUP_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ar
+CONDA_BACKUP_AS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-as
+CONDA_BACKUP_BUILD=x86_64-conda-linux-gnu
+CONDA_BACKUP_CC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
+CONDA_BACKUP_CC_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cc
+CONDA_BACKUP_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
+CONDA_BACKUP_CMAKE_PREFIX_PATH=/data/align-anything/miniconda3/envs/hantao_proxy:/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot/usr
+CONDA_BACKUP_CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/hantao_proxy/x86_64-conda-linux-gnu/sysroot
+CONDA_BACKUP_CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
+CONDA_BACKUP_CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
+CONDA_BACKUP_CPP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-cpp
+CONDA_BACKUP_CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
+CONDA_BACKUP_CXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
+CONDA_BACKUP_CXXFILT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++filt
+CONDA_BACKUP_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -I/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/include
+CONDA_BACKUP_CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-c++
+CONDA_BACKUP_DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+CONDA_BACKUP_DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
+CONDA_BACKUP_DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/hantao_proxy/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+CONDA_BACKUP_DWP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-dwp
+CONDA_BACKUP_ELFEDIT=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-elfedit
+CONDA_BACKUP_GCC=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc
+CONDA_BACKUP_GCC_AR=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ar
+CONDA_BACKUP_GCC_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-nm
+CONDA_BACKUP_GCC_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gcc-ranlib
+CONDA_BACKUP_GPROF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-gprof
+CONDA_BACKUP_GXX=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-g++
+CONDA_BACKUP_HOST=x86_64-conda-linux-gnu
+CONDA_BACKUP_LD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld
+CONDA_BACKUP_LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/hantao_proxy/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/hantao_proxy/targets/x86_64-linux/lib/stubs
+CONDA_BACKUP_LD_GOLD=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ld.gold
+CONDA_BACKUP_MESON_ARGS=-Dbuildtype=release
+CONDA_BACKUP_NM=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-nm
+CONDA_BACKUP_OBJCOPY=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objcopy
+CONDA_BACKUP_OBJDUMP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-objdump
+CONDA_BACKUP_RANLIB=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-ranlib
+CONDA_BACKUP_READELF=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-readelf
+CONDA_BACKUP_SIZE=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-size
+CONDA_BACKUP_STRINGS=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strings
+CONDA_BACKUP_STRIP=/data/align-anything/miniconda3/envs/hantao_proxy/bin/x86_64-conda-linux-gnu-strip
+CONDA_BACKUP__CONDA_PYTHON_SYSCONFIGDATA_NAME=_sysconfigdata_x86_64_conda_cos6_linux_gnu
+CONDA_BACKUP_build_alias=x86_64-conda-linux-gnu
+CONDA_BACKUP_host_alias=x86_64-conda-linux-gnu
+CONDA_BUILD_SYSROOT=/data/align-anything/miniconda3/envs/jy-a/x86_64-conda-linux-gnu/sysroot
+CONDA_DEFAULT_ENV=hantao_stable
+CONDA_EXE=/data/align-anything/miniconda3/bin/conda
+CONDA_PREFIX=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PREFIX_1=/home/align-anything/miniconda3
+CONDA_PREFIX_10=/data/align-anything/miniconda3/envs/hantao_proxy
+CONDA_PREFIX_2=/data/align-anything/miniconda3/envs/jy-a
+CONDA_PREFIX_3=/data/align-anything/miniconda3
+CONDA_PREFIX_4=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PREFIX_5=/data/align-anything/miniconda3/envs/hantao_cham
+CONDA_PREFIX_6=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PREFIX_7=/data/align-anything/miniconda3/envs/hantao_stream
+CONDA_PREFIX_8=/data/align-anything/miniconda3/envs/hantao_proxy
+CONDA_PREFIX_9=/data/align-anything/miniconda3/envs/hantao_stable
+CONDA_PROMPT_MODIFIER=(hantao_stable)
+CONDA_PYTHON_EXE=/data/align-anything/miniconda3/bin/python
+CONDA_ROOT=/home/align-anything/miniconda3
+CONDA_SHLVL=11
+CONDA_TOOLCHAIN_BUILD=x86_64-conda-linux-gnu
+CONDA_TOOLCHAIN_HOST=x86_64-conda-linux-gnu
+CPP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-cpp
+CPPFLAGS=-DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+CROSS_RANK=0
+CROSS_SIZE=1
+CUDA_MODULE_LOADING=LAZY
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
+CXXFILT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++filt
+CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -I/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/include -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+CXX_FOR_BUILD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
+DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/2000/bus
+DEBUG_CFLAGS=-march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+DEBUG_CPPFLAGS=-D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include -D_DEBUG -D_FORTIFY_SOURCE=2 -Og -isystem /data/align-anything/miniconda3/envs/jy-a/include
+DEBUG_CXXFLAGS=-fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-all -fno-plt -Og -g -Wall -Wextra -fvar-tracking-assignments -ffunction-sections -pipe -isystem /data/align-anything/miniconda3/envs/jy-a/include
+DWP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-dwp
+ELFEDIT=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-elfedit
+GCC=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc
+GCC_AR=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ar
+GCC_NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-nm
+GCC_RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gcc-ranlib
+GIT_ASKPASS=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass.sh
+GPROF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-gprof
+GXX=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-g++
+HOME=/home/align-anything
+HOST=x86_64-conda-linux-gnu
+LANG=en_US.UTF-8
+LD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld
+LDFLAGS=-Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-O2 -Wl,--sort-common -Wl,--as-needed -Wl,-z,relro -Wl,-z,now -Wl,--disable-new-dtags -Wl,--gc-sections -Wl,--allow-shlib-undefined -Wl,-rpath,/data/align-anything/miniconda3/envs/jy-a/lib -Wl,-rpath-link,/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib -L/data/align-anything/miniconda3/envs/jy-a/targets/x86_64-linux/lib/stubs
+LD_GOLD=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ld.gold
+LD_LIBRARY_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/../../lib64:
+LESSCLOSE=/usr/bin/lesspipe %s %s
+LESSOPEN=| /usr/bin/lesspipe %s
+LOCAL_RANK=0
+LOCAL_SIZE=8
+LOGLEVEL=WARNING
+LOGNAME=align-anything
+LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=52201
+MOTD_SHOWN=pam
+NM=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-nm
+NVCC_PREPEND_FLAGS= -ccbin=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-c++
+OBJCOPY=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objcopy
+OBJDUMP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-objdump
+OLDPWD=/data/align-anything/hantao/LLaMA-Factory
+PATH=/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/envs/hantao_stable/bin:/data/align-anything/miniconda3/bin:/data/align-anything/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
+PWD=/data/align-anything/hantao/align-anything/scripts
+PYGAME_HIDE_SUPPORT_PROMPT=1
+PYTHONHASHSEED=42
+PYTHONPATH=/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything:/data/align-anything/hantao/align-anything
+QT_QPA_FONTDIR=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/fonts
+QT_QPA_PLATFORM_PLUGIN_PATH=/data/align-anything/miniconda3/envs/hantao_stable/lib/python3.11/site-packages/cv2/qt/plugins
+RANK=0
+RANLIB=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-ranlib
+READELF=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-readelf
+SHELL=/bin/bash
+SHLVL=3
+SIZE=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-size
+SSH_CLIENT=117.136.0.149 36325 30400
+SSH_CONNECTION=111.205.232.251 37945 10.10.212.194 30400
+SSL_CERT_DIR=/usr/lib/ssl/certs
+SSL_CERT_FILE=/usr/lib/ssl/certs/ca-certificates.crt
+STRINGS=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strings
+STRIP=/data/align-anything/miniconda3/envs/jy-a/bin/x86_64-conda-linux-gnu-strip
+TERM=screen
+TERM_PROGRAM=vscode
+TERM_PROGRAM_VERSION=0.41.3
+TMUX=/tmp/tmux-2000/default,34082,51
+TMUX_PANE=%59
+TRITON_CACHE_DIR=/home/align-anything/cache/triton
+USER=align-anything
+VSCODE_GIT_ASKPASS_EXTRA_ARGS=
+VSCODE_GIT_ASKPASS_MAIN=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/extensions/git/dist/askpass-main.js
+VSCODE_GIT_ASKPASS_NODE=/home/align-anything/.cursor-server/cli/servers/Stable-51c8aff7cb5a89f4a0e462fbacab938bdbfaf140/server/node
+VSCODE_GIT_IPC_HANDLE=/run/user/2000/vscode-git-ef8058c264.sock
+VSCODE_IPC_HOOK_CLI=/run/user/2000/vscode-ipc-db013265-9a8a-4fb7-ba94-00b66d808feb.sock
+WANDB_API_KEY=7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33
+WANDB_MODE=online
+WANDB_SERVICE=2-675697-tcp-localhost-45541
+WORLD_SIZE=8
+XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
+XDG_RUNTIME_DIR=/run/user/2000
+XDG_SESSION_CLASS=user
+XDG_SESSION_ID=11
+XDG_SESSION_TYPE=tty
+_=/data/align-anything/miniconda3/envs/hantao_stable/bin/deepspeed
+_CE_CONDA=
+_CE_M=
+build_alias=x86_64-conda-linux-gnu
+host_alias=x86_64-conda-linux-gnu
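This file is a flat KEY=VALUE dump of the launcher's environment at run time. A minimal sketch of producing an equivalent snapshot (a hypothetical helper, not necessarily how align-anything writes it):

```python
import os

def dump_environ(path: str) -> None:
    # Dump the current environment sorted by key, one KEY=VALUE per line,
    # mirroring the environ.txt layout above.
    with open(path, "w") as f:
        for key in sorted(os.environ):
            f.write(f"{key}={os.environ[key]}\n")

dump_environ("environ.txt")
```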
slice_400/preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
+{
+  "crop_size": {
+    "height": 512,
+    "width": 512
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    1.0,
+    1.0,
+    1.0
+  ],
+  "image_processor_type": "ChameleonImageProcessor",
+  "image_std": [
+    1.0,
+    1.0,
+    1.0
+  ],
+  "processor_class": "ChameleonProcessor",
+  "resample": 1,
+  "rescale_factor": 0.0078,
+  "size": {
+    "shortest_edge": 512
+  }
+}
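With `image_mean` and `image_std` pinned to 1.0 and `rescale_factor` at 0.0078 (roughly 1/128), these settings map 8-bit pixel values into approximately [-1, 1]. A quick sketch of the implied arithmetic:

```python
import numpy as np

rescale_factor = 0.0078  # ~1/128, from preprocessor_config.json
mean = std = 1.0         # image_mean / image_std above

pixels = np.array([0, 128, 255], dtype=np.float32)
normalized = (pixels * rescale_factor - mean) / std
print(normalized)  # [-1.0, -0.0016, 0.989] -> roughly the [-1, 1] range
```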
slice_400/processor_config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "image_seq_length": 1024,
+  "image_token": "<image>",
+  "processor_class": "ChameleonProcessor"
+}
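`image_seq_length: 1024` is the number of discrete image tokens each `<image>` placeholder expands to. For the 512x512 inputs configured above, this is consistent with a 32x32 token grid under an assumed 16x VQ downsampling stride (the stride itself is not recorded in this repo):

```python
image_size = 512   # shortest_edge / crop_size from preprocessor_config.json
downsample = 16    # assumed VQ tokenizer stride, not stored here

tokens_per_side = image_size // downsample  # 32
print(tokens_per_side ** 2)                 # 1024 == image_seq_length
```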
slice_400/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d45286d89bc63b921ceef6df439a1bda7c4537d46f14ecab8a5b77fe81bdcde0
+size 14086366378
slice_400/script.sh
ADDED
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+#
+# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+export CC=/data/align-anything/miniconda3/envs/hantao_stable/bin/gcc
+export CXX=/data/align-anything/miniconda3/envs/hantao_stable/bin/g++
+
+export TRITON_CACHE_DIR="/home/align-anything/cache/triton"
+
+export WANDB_API_KEY="7e2dcc0c310ebcb7cdcafd5e9320d6be55cf1a33"
+export WANDB_MODE=online
+
+MODEL_NAME_OR_PATH="/data/align-anything/hantao/models/chameleon-7b"
+
+DATASET_PATH=(
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_cosi_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_l0_new_step10/tokenized"
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_random/tokenized"
+)
+
+DATASET_NAME=(
+    "q0_10_preference"
+    "q0_20_preference"
+    "q0_30_preference"
+    "q0_40_preference"
+    "q0_50_preference"
+    "q0_60_preference"
+    "q0_70_preference"
+    "q0_80_preference"
+    "q0_90_preference"
+)
+
+OUTPUT_PATH="/data/align-anything/hantao/align-anything/outputs/mm_interp"
+mkdir -p "$OUTPUT_PATH"
+
+# Initialize variables
+
+for dataset_path in "${DATASET_PATH[@]}"; do
+    for dataset_name in "${DATASET_NAME[@]}"; do
+        TRAIN_DATASETS=$dataset_path
+
+        # dataset middle name: second-to-last path component (command substitution was missing)
+        middle_name=$(echo "$dataset_path" | awk -F'/' '{print $(NF-1)}')
+        OUTPUT_DIR=$OUTPUT_PATH/$middle_name/$dataset_name
+        mkdir -p "$OUTPUT_DIR"
+        echo "Training on $TRAIN_DATASETS, output to $OUTPUT_DIR"
+        # Source the setup script
+        source ./setup.sh
+
+        # Execute deepspeed command
+        deepspeed \
+            --master_port ${MASTER_PORT} \
+            --module align_anything.trainers.text_image_to_text_image.dpo \
+            --model_name_or_path ${MODEL_NAME_OR_PATH} \
+            --train_datasets ${TRAIN_DATASETS} \
+            --output_dir ${OUTPUT_DIR} \
+            --per_device_train_batch_size 4 \
+            --per_device_eval_batch_size 4 \
+            --gradient_accumulation_steps 2 \
+            --train_template Chameleon_preference \
+            --train_split train \
+            --train_data_files ${dataset_name}.pt \
+            --learning_rate 1e-6 \
+            --epochs 3 \
+            --lr_scheduler_type cosine \
+            --save_interval 400
+
+        bash /data/align-anything/hantao/align-anything/outputs/cut.sh "$OUTPUT_DIR"
+    done
+done
slice_400/special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "<reserved08706>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
slice_400/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
slice_400/tokenizer_config.json
ADDED
The diff for this file is too large to render.
See raw diff
slice_400/wandb/debug-internal.log
ADDED
@@ -0,0 +1,22 @@
+{"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
+{"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
+{"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
+{"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
+{"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
+{"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
+{"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
+{"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
+{"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
+{"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
+{"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
+{"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
slice_400/wandb/debug.log
ADDED
@@ -0,0 +1,33 @@
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
+2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
+2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
+config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
+2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
+2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
+2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
+2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
+2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
+2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
+2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
+2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
+2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
+2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
+2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
+2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
+2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
+2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
+2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
+2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
+2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
+2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
slice_400/wandb/run-20250101_084116-coewtb43/files/config.yaml
ADDED
@@ -0,0 +1,98 @@
+_wandb:
+  value:
+    cli_version: 0.18.3
+    m: []
+    python_version: 3.11.10
+    t:
+      "1":
+      - 1
+      - 11
+      - 41
+      - 49
+      - 51
+      - 55
+      - 71
+      - 83
+      - 98
+      - 105
+      "2":
+      - 1
+      - 11
+      - 41
+      - 49
+      - 51
+      - 55
+      - 71
+      - 83
+      - 98
+      - 105
+      "3":
+      - 2
+      - 13
+      - 16
+      - 23
+      - 55
+      - 61
+      "4": 3.11.10
+      "5": 0.18.3
+      "6": 4.45.2
+      "8":
+      - 5
+      "12": 0.18.3
+      "13": linux-x86_64
+data_cfgs:
+  value:
+    eval_data_files: null
+    eval_datasets: null
+    eval_optional_args: []
+    eval_size: null
+    eval_split: null
+    eval_subset: null
+    eval_template: null
+    train_data_files: q0_40_preference.pt
+    train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
+    train_optional_args: []
+    train_size: null
+    train_split: train
+    train_subset: null
+    train_template: Chameleon_preference
+logger_cfgs:
+  value:
+    cache_dir: null
+    log_project: align-anything
+    log_run_name: dpo
+    log_type: wandb
+    output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
+    save_interval: 400
+model_cfgs:
+  value:
+    model_max_length: 4096
+    model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
+    trust_remote_code: true
+special_tokens:
+  value: null
+train_cfgs:
+  value:
+    adam_betas:
+    - 0.9
+    - 0.95
+    bf16: true
+    ds_cfgs: ds_z3_config.json
+    epochs: 3
+    eval_interval: 10
+    eval_strategy: epoch
+    fp16: false
+    freeze_language_model: true
+    freeze_mm_proj: true
+    freeze_vision_tower: false
+    gradient_accumulation_steps: 2
+    gradient_checkpointing: true
+    learning_rate: 1e-06
+    lr_scheduler_type: cosine
+    lr_warmup_ratio: 0.03
+    per_device_eval_batch_size: 4
+    per_device_train_batch_size: 4
+    regularization: 0.001
+    scale_coeff: 0.1
+    seed: 42
+    weight_decay: 0.01
slice_400/wandb/run-20250101_084116-coewtb43/files/output.log
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
***** Running training *****
|
| 2 |
+
Training 1/3.0 epoch: 0%| | 0/1422.0 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
|
| 3 |
+
Training 1/3.0 epoch (loss 11.8749): 21%|██████████████████████████████████████████████████▋ | 299/1422.0 [43:24<2:59:40, 9.60s/it]
|
| 4 |
+
[2025-01-01 08:42:55,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 5 |
+
[2025-01-01 08:43:25,944] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 6 |
+
[2025-01-01 08:44:17,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[4.761904761904761e-07, 4.761904761904761e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 7 |
+
[2025-01-01 08:45:03,075] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 8 |
+
[2025-01-01 08:47:04,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.523809523809522e-07, 9.523809523809522e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 9 |
+
[2025-01-01 08:48:02,529] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 10 |
+
[2025-01-01 08:48:43,904] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 11 |
+
[2025-01-01 08:50:07,690] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[9.995802740501932e-07, 9.995802740501932e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 12 |
+
[2025-01-01 08:51:09,012] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 13 |
+
[2025-01-01 08:53:09,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[9.98130274211278e-07, 9.98130274211278e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 14 |
+
[2025-01-01 08:55:49,714] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 15 |
+
[2025-01-01 08:55:49,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[9.956478233113064e-07, 9.956478233113064e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 16 |
+
[2025-01-01 08:58:16,431] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[9.921380666088558e-07, 9.921380666088558e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 17 |
+
[2025-01-01 09:01:07,798] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[9.876082786106545e-07, 9.876082786106545e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 18 |
+
[2025-01-01 09:01:27,047] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 19 |
+
[2025-01-01 09:04:03,792] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 20 |
+
[2025-01-01 09:04:24,181] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 21 |
+
[2025-01-01 09:04:24,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[9.820678479940571e-07, 9.820678479940571e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 22 |
+
[2025-01-01 09:05:30,688] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 23 |
+
[2025-01-01 09:07:12,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[9.755282581475767e-07, 9.755282581475767e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 24 |
+
[2025-01-01 09:09:19,110] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 25 |
+
[2025-01-01 09:10:10,785] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 26 |
+
[2025-01-01 09:10:30,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[9.68003063369808e-07, 9.68003063369808e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 27 |
+
[2025-01-01 09:10:51,320] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
|
| 28 |
+
[2025-01-01 09:13:33,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[9.595078607760749e-07, 9.595078607760749e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
|
| 29 |
+
[2025-01-01 09:14:56,362] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:16:12,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[9.500602579710255e-07, 9.500602579710255e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:17:52,852] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:19:16,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[9.39679836554184e-07, 9.39679836554184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:21:51,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[9.283881115340955e-07, 9.283881115340955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:22:22,638] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:24:48,036] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[9.16208486735184e-07, 9.16208486735184e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:25:39,046] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:27:38,935] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[9.03166206289754e-07, 9.03166206289754e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:28:42,774] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:29:54,622] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[8.8928830231567e-07, 8.8928830231567e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:30:34,992] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:31:34,787] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:32:15,776] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:32:50,765] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[8.746035388881654e-07, 8.746035388881654e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:35:09,447] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:35:48,046] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[8.591423524219029e-07, 8.591423524219029e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:36:31,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:37:59,484] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:38:30,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[8.429367885868581e-07, 8.429367885868581e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
Saving checkpoint at step 400 ...
Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
Saving 16-bit model...
[2025-01-01 09:38:39,848] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
[2025-01-01 09:38:39,849] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
[2025-01-01 09:38:39,850] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
[2025-01-01 09:38:58,770] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
[2025-01-01 09:38:58,773] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
Model saved!
Saving 16-bit model...
[2025-01-01 09:39:05,477] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved!
[2025-01-01 09:39:05,478] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin, tag: global_step200
[2025-01-01 09:39:05,479] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin...
[2025-01-01 09:39:27,199] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_400.bin.
[2025-01-01 09:39:27,202] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now!
Model saved!
Checkpoint saved.
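Each "Saving 16-bit model..." pass above is DeepSpeed's ZeRO-3 engine consolidating the partitioned parameters into one 16-bit state dict; the engine.py:save_16bit_model and torch_checkpoint_engine.py lines are emitted by that call. A minimal sketch of the invocation, assuming a ZeRO-3 model_engine (the names and path are placeholders, not this run's actual trainer code):

# Gathers the ZeRO-3 parameter shards and writes one consolidated file.
model_engine.save_16bit_model(
    save_dir="outputs/mm_interp/q0_40_preference",  # placeholder path
    save_filename="pytorch_model_400.bin",
)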
[2025-01-01 09:42:08,747] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[8.260204358887753e-07, 8.260204358887753e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:43:59,051] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:44:19,070] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:45:02,019] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[8.084283560518583e-07, 8.084283560518583e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:47:51,338] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[7.901970113479955e-07, 7.901970113479955e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:48:18,628] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:50:44,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[7.713641890231308e-07, 7.713641890231308e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:51:29,985] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:52:00,601] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:53:37,616] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:53:37,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[7.51968922977428e-07, 7.51968922977428e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:56:36,667] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:56:36,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[7.320514128615511e-07, 7.320514128615511e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:57:18,253] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 09:59:22,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[7.116529407567488e-07, 7.116529407567488e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 09:59:42,581] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:02:27,253] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[6.908157856114392e-07, 6.908157856114392e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:04:22,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:05:15,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[6.695831356116303e-07, 6.695831356116303e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:07:32,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[6.479989986668117e-07, 6.479989986668117e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:09:58,442] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:10:45,662] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[6.261081111968403e-07, 6.261081111968403e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:12:34,690] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:12:55,185] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:13:41,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[6.039558454088795e-07, 6.039558454088795e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:14:01,438] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:16:42,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[5.815881152565711e-07, 5.815881152565711e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:17:49,827] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:18:41,651] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:19:22,050] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:20:00,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[5.590512812763541e-07, 5.590512812763541e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:22:48,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[5.363920544981748e-07, 5.363920544981748e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:23:27,066] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:25:42,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[5.136573996297429e-07, 5.136573996297429e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:26:22,366] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:28:45,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[4.908944377150043e-07, 4.908944377150043e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:30:51,802] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:31:03,983] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[4.681503484685803e-07, 4.681503484685803e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:34:07,594] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:34:07,595] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[4.454722724886051e-07, 4.454722724886051e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:36:51,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[4.229072135506384e-07, 4.229072135506384e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
Saving checkpoint at step 800 ...
Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
Saving 16-bit model...
[2025-01-01 10:37:00,920] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
[2025-01-01 10:37:00,921] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
[2025-01-01 10:37:00,921] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
[2025-01-01 10:37:17,303] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
[2025-01-01 10:37:17,305] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
Model saved!
Saving 16-bit model...
[2025-01-01 10:37:24,304] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved!
[2025-01-01 10:37:24,306] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin, tag: global_step400
[2025-01-01 10:37:24,306] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin...
[2025-01-01 10:37:47,861] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_800.bin.
[2025-01-01 10:37:47,862] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now!
Model saved!
Checkpoint saved.
[2025-01-01 10:38:06,545] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:39:58,968] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:40:18,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[4.005019411851609e-07, 4.005019411851609e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:40:58,615] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:41:39,701] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:43:08,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[3.783028937405821e-07, 3.783028937405821e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:44:32,585] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:45:54,172] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:45:54,173] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[3.563560821326706e-07, 3.563560821326706e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:47:22,278] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:48:37,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[3.3470699447990527e-07, 3.3470699447990527e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:51:26,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[3.1340050182240436e-07, 3.1340050182240436e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:52:26,337] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:52:46,441] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:54:14,010] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[2.92480765119841e-07, 2.92480765119841e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:56:46,300] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:57:28,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[2.719911437211122e-07, 2.719911437211122e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 10:59:56,353] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 10:59:56,354] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[2.5197410549546595e-07, 2.5197410549546595e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:00:26,971] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:02:03,646] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:02:54,314] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[2.3247113881135781e-07, 2.3247113881135781e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:05:02,388] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:05:43,792] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:05:55,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[2.1352266654547125e-07, 2.1352266654547125e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:08:07,702] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:08:38,466] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[1.9516796230013272e-07, 1.9516796230013272e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:11:35,871] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[1.774450690027746e-07, 1.774450690027746e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:12:47,870] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:14:16,051] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[1.6039072005615716e-07, 1.6039072005615716e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:17:00,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[1.4404026320278317e-07, 1.4404026320278317e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:18:23,592] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:19:58,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[1.284275872613028e-07, 1.284275872613028e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:20:59,657] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:21:20,032] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:22:26,091] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:23:04,553] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[1.1358505188676288e-07, 1.1358505188676288e-07], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:26:14,637] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:26:14,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[9.95434205002792e-08, 9.95434205002792e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:27:06,507] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:27:47,054] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:29:24,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[8.633179652714916e-08, 8.633179652714916e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:31:52,055] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:32:11,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[7.397756307555885e-08, 7.397756307555885e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:34:47,839] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:34:59,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[6.250632618090867e-08, 6.250632618090867e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
Saving checkpoint at step 1200 ...
Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
Saving 16-bit model...
[2025-01-01 11:35:08,703] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
[2025-01-01 11:35:08,704] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
[2025-01-01 11:35:08,704] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
[2025-01-01 11:35:25,316] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
[2025-01-01 11:35:25,317] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
Model saved!
Saving 16-bit model...
[2025-01-01 11:35:32,446] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved!
[2025-01-01 11:35:32,447] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin, tag: global_step600
[2025-01-01 11:35:32,447] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin...
[2025-01-01 11:35:53,847] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model_1200.bin.
[2025-01-01 11:35:53,849] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now!
Model saved!
Checkpoint saved.
[2025-01-01 11:38:41,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[5.194186173339599e-08, 5.194186173339599e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:40:10,376] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:41:25,192] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[4.230606619885108e-08, 4.230606619885108e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:43:26,712] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:44:25,848] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[3.3618911234968236e-08, 3.3618911234968236e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:46:29,769] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:46:53,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[2.589840229699558e-08, 2.589840229699558e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:48:22,179] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:49:22,087] [WARNING] [stage3.py:2104:step] 2 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:49:35,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[1.9160541318679224e-08, 1.9160541318679224e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:50:02,959] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:52:16,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[1.3419293545812338e-08, 1.3419293545812338e-08], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:52:56,411] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:54:18,301] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:55:03,737] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[8.686558591130156e-09, 8.686558591130156e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 11:55:46,245] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 11:57:47,016] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[4.972145770545999e-09, 4.972145770545999e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 12:00:50,391] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 12:00:50,392] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[2.283753771845587e-09, 2.283753771845587e-09], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 12:01:10,430] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 12:03:22,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[6.269546979813523e-10, 6.269546979813523e-10], mom=[[0.9, 0.95], [0.9, 0.95]]
[2025-01-01 12:05:10,192] [WARNING] [stage3.py:2104:step] 1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance. if this is happening frequently consider adjusting settings to reduce memory consumption. If you are unable to make the cache flushes go away consider adding get_accelerator().empty_cache() calls in your training loop to ensure that all ranks flush their caches at the same time
[2025-01-01 12:06:28,152] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[5.182518037827321e-12, 5.182518037827321e-12], mom=[[0.9, 0.95], [0.9, 0.95]]
Saving model to "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference" ...
Saving 16-bit model...
[2025-01-01 12:06:57,036] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
[2025-01-01 12:06:57,037] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
[2025-01-01 12:06:57,037] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
[2025-01-01 12:07:17,768] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
[2025-01-01 12:07:17,770] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
Model saved!
Saving 16-bit model...
[2025-01-01 12:07:24,458] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step711 is about to be saved!
[2025-01-01 12:07:24,459] [INFO] [engine.py:3649:save_16bit_model] Saving model weights to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin, tag: global_step711
[2025-01-01 12:07:24,459] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin...
[2025-01-01 12:07:46,734] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/pytorch_model.bin.
[2025-01-01 12:07:46,737] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step711 is ready now!
Model saved!
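The run ends at step 711 with a consolidated pytorch_model.bin rather than a sharded ZeRO checkpoint, so the weights load without DeepSpeed. A brief sketch, assuming the architecture is rebuilt from this repo's config.json (build_model is a placeholder, not a function in this repository):

import torch

model = build_model()  # placeholder: construct the architecture from config.json
state_dict = torch.load("pytorch_model.bin", map_location="cpu")
model.load_state_dict(state_dict)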
slice_400/wandb/run-20250101_084116-coewtb43/files/requirements.txt
ADDED
@@ -0,0 +1,248 @@
+align-anything==0.0.1.dev0
+gitdb==4.0.11
+wcwidth==0.2.13
+identify==2.6.1
+tomlkit==0.12.0
+bitsandbytes==0.44.1
+trl==0.9.6
+pytest-split==0.8.0
+gradio==4.44.1
+pip==24.2
+multidict==6.1.0
+fairscale==0.4.13
+mistral_common==1.4.4
+python-dotenv==1.0.1
+uvloop==0.20.0
+absl-py==2.1.0
+tiktoken==0.7.0
+pydub==0.25.1
+websockets==12.0
+llamafactory==0.9.1.dev0
+triton==3.0.0
+tifffile==2024.9.20
+safe-rlhf==0.0.1.dev0
+pandas==2.2.3
+grpcio==1.66.2
+click==8.1.7
+ninja==1.11.1.1
+rich==13.9.2
+Jinja2==3.1.4
+Pygments==2.18.0
+nvidia-cudnn-cu12==9.1.0.70
+importlib_resources==6.4.5
+GitPython==3.1.43
+nvidia-cufft-cu12==11.0.2.54
+tensorboard-data-server==0.7.2
+align-anything==0.0.1.dev0
+six==1.16.0
+scipy==1.14.1
+mpmath==1.3.0
+jsonschema-specifications==2024.10.1
+scikit-image==0.24.0
+zipp==3.20.2
+cycler==0.12.1
+MarkupSafe==2.1.5
+tzdata==2024.2
+idna==3.10
+pycountry==24.6.1
+nvidia-nccl-cu12==2.20.5
+matplotlib==3.9.2
+pytz==2024.2
+uvicorn==0.31.1
+dill==0.3.8
+pyparsing==3.1.4
+pytest==7.2.0
+jiter==0.6.1
+safetensors==0.4.5
+typing_extensions==4.12.2
+decorator==4.4.2
+typeguard==4.4.1
+prometheus_client==0.21.0
+nvidia-cuda-cupti-cu12==12.1.105
+sentencepiece==0.2.0
+requests==2.32.3
+kiwisolver==1.4.7
+gdown==5.2.0
+multiprocess==0.70.16
+xxhash==3.5.0
+PyYAML==6.0.2
+gguf==0.10.0
+nvidia-nvtx-cu12==12.1.105
+hpsv2==1.2.0
+tensorboard==2.18.0
+nodeenv==1.9.1
+filelock==3.16.1
+distro==1.9.0
+scikit-learn==1.5.2
+huggingface-hub==0.25.2
+pyairports==2.1.1
+importlib_metadata==8.5.0
+pyarrow==17.0.0
+llvmlite==0.43.0
+ray==2.37.0
+tokenizers==0.20.3
+nvidia-nvjitlink-cu12==12.6.77
+av==14.0.1
+deepspeed==0.15.2
+clip==0.2.0
+shtab==1.7.1
+certifi==2024.8.30
+braceexpand==0.1.7
+nvidia-ml-py==12.560.30
+webdataset==0.2.100
+docker-pycreds==0.4.0
+einops==0.8.0
+iniconfig==2.0.0
+tyro==0.9.2
+torchvision==0.19.0
+accelerate==0.34.2
+beautifulsoup4==4.12.3
+pyzmq==26.2.0
+pycparser==2.22
+nvidia-curand-cu12==10.3.2.106
+msgpack==1.1.0
+soxr==0.5.0.post1
+platformdirs==4.3.6
+h11==0.14.0
+psutil==6.0.0
+pydantic==2.9.2
+shellingham==1.5.4
+imageio-ffmpeg==0.5.1
+wandb==0.18.3
+audioread==3.0.1
+annotated-types==0.7.0
+docstring_parser==0.16
+cloudpickle==3.1.0
+regex==2024.9.11
+packaging==24.1
+timm==0.6.13
+aiosignal==1.3.1
+numba==0.60.0
+orjson==3.10.7
+rpds-py==0.20.0
+virtualenv==20.26.6
+joblib==1.4.2
+charset-normalizer==3.4.0
+httpx==0.27.2
+ffmpy==0.4.0
+lm-format-enforcer==0.10.6
+yt-dlp==2024.8.6
+sympy==1.13.3
+python-dateutil==2.9.0.post0
+nvidia-cusolver-cu12==11.4.5.107
+msgspec==0.18.6
+mdurl==0.1.2
+torch==2.4.0
+fastapi==0.115.0
+optree==0.13.0
+PySocks==1.7.1
+transformers==4.46.0.dev0
+torchlibrosa==0.1.0
+fsspec==2024.6.1
+nvidia-cublas-cu12==12.1.3.1
+gradio_client==1.3.0
+args==0.1.0
+cffi==1.17.1
+fonttools==4.54.1
+clint==0.5.1
+lark==1.2.2
+tqdm==4.66.5
+semantic-version==2.10.0
+pooch==1.8.2
+markdown-it-py==3.0.0
+pydantic_core==2.23.4
+sniffio==1.3.1
+httptools==0.6.1
+nvidia-cuda-runtime-cu12==12.1.105
+anyio==4.6.0
+ftfy==6.3.0
+Markdown==3.7
+datasets==2.21.0
+diffusers==0.30.3
+nvidia-cuda-nvrtc-cu12==12.1.105
+vllm==0.6.2
+starlette==0.38.6
+flash-attn==2.7.0.post2
+urllib3==2.2.3
+Werkzeug==3.0.4
+py-cpuinfo==9.0.0
+moviepy==1.0.3
+librosa==0.10.2.post1
+peft==0.12.0
+soupsieve==2.6
+lazy_loader==0.4
+pluggy==1.5.0
+setuptools==75.1.0
+sentry-sdk==2.16.0
+tabulate==0.9.0
+transformers==4.45.2
+pre_commit==4.0.1
+termcolor==2.5.0
+frechet-audio-distance==0.1.2
+pytorch-fid==0.3.0
+setproctitle==1.3.3
+jsonschema==4.23.0
+aiofiles==23.2.1
+contourpy==1.3.0
+distlib==0.3.9
+interegular==0.3.3
+fire==0.7.0
+diskcache==5.6.3
+proglog==0.1.10
+soundfile==0.12.1
+protobuf==3.20.3
+smmap==5.0.1
+pycryptodomex==3.21.0
+Brotli==1.1.0
+pillow==10.4.0
+frozenlist==1.4.1
+numpy==1.26.4
+mutagen==1.47.0
+outlines==0.0.46
+attrs==24.2.0
+torchaudio==2.4.0
+aiohttp==3.10.10
+ruff==0.6.9
+watchfiles==0.24.0
+threadpoolctl==3.5.0
+nest-asyncio==1.6.0
+partial-json-parser==0.2.1.1.post4
+sse-starlette==2.1.3
+shortuuid==1.0.13
+typer==0.12.5
+prometheus-fastapi-instrumentator==7.0.0
+imageio==2.35.1
+wheel==0.44.0
+image-reward==1.5
+networkx==3.4.1
+propcache==0.2.0
+aiohappyeyeballs==2.4.3
+nvidia-cusparse-cu12==12.1.0.106
+xformers==0.0.27.post2
+cfgv==3.4.0
+python-multipart==0.0.12
+httpcore==1.0.6
+opencv-python==4.6.0.66
+resampy==0.4.3
+yarl==1.15.0
+referencing==0.35.1
+openai==1.51.2
+hjson==3.1.0
+llamafactory==0.9.1.dev0
+jaraco.collections==5.1.0
+backports.tarfile==1.2.0
+more-itertools==10.3.0
+wheel==0.43.0
+importlib_metadata==8.0.0
+zipp==3.19.2
+autocommand==2.2.2
+jaraco.functools==4.0.1
+platformdirs==4.2.2
+tomli==2.0.1
+jaraco.text==3.12.1
+typing_extensions==4.12.2
+jaraco.context==5.3.0
+importlib_resources==6.4.0
+packaging==24.1
+inflect==7.3.1
+typeguard==4.3.0
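This requirements dump is wandb's raw snapshot of the training environment, so the same distribution can appear more than once with different pins (for example transformers==4.46.0.dev0 and transformers==4.45.2, or the two wheel entries). A minimal sketch, assuming the list is saved locally as requirements.txt, for surfacing such conflicting pins before trying to reproduce the environment:

# Sketch: find distributions pinned to more than one version in the
# requirements snapshot above (the path "requirements.txt" is an assumption).
from collections import defaultdict

def conflicting_pins(path="requirements.txt"):
    versions = defaultdict(set)
    with open(path) as f:
        for line in f:
            line = line.strip()
            if "==" in line:
                name, _, version = line.partition("==")
                versions[name.lower()].add(version)
    return {name: sorted(vs) for name, vs in versions.items() if len(vs) > 1}

for name, pins in sorted(conflicting_pins().items()):
    print(f"{name}: {', '.join(pins)}")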
slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-metadata.json
ADDED
@@ -0,0 +1,112 @@
+{
+  "os": "Linux-5.4.0-196-generic-x86_64-with-glibc2.31",
+  "python": "3.11.10",
+  "startedAt": "2025-01-01T08:41:16.157770Z",
+  "args": [
+    "--local_rank=0",
+    "--model_name_or_path",
+    "/data/align-anything/hantao/models/chameleon-7b",
+    "--train_datasets",
+    "/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized",
+    "--output_dir",
+    "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
+    "--per_device_train_batch_size",
+    "4",
+    "--per_device_eval_batch_size",
+    "4",
+    "--gradient_accumulation_steps",
+    "2",
+    "--train_template",
+    "Chameleon_preference",
+    "--train_split",
+    "train",
+    "--train_data_files",
+    "q0_40_preference.pt",
+    "--learning_rate",
+    "1e-6",
+    "--epochs",
+    "3",
+    "--lr_scheduler_type",
+    "cosine",
+    "--save_interval",
+    "400"
+  ],
+  "program": "-m align_anything.trainers.text_image_to_text_image.dpo",
+  "git": {
+    "remote": "https://github.com/PKU-Alignment/align-anything.git",
+    "commit": "6fde660afc9985323f147930eedf188a5699adc7"
+  },
+  "email": "[email protected]",
+  "root": "/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference",
+  "host": "lyg0194",
+  "username": "align-anything",
+  "executable": "/data/align-anything/miniconda3/envs/hantao_stable/bin/python",
+  "cpu_count": 64,
+  "cpu_count_logical": 128,
+  "gpu": "[NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB, NVIDIA A100-SXM4-80GB]",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "939477946368",
+      "used": "596714827776"
+    }
+  },
+  "memory": {
+    "total": "1081823907840"
+  },
+  "cpu": {
+    "count": 64,
+    "countLogical": 128
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    },
+    {
+      "name": "NVIDIA A100-SXM4-80GB",
+      "memoryTotal": "85899345920",
+      "cudaCores": 6912,
+      "architecture": "Ampere"
+    }
+  ],
+  "cudaVersion": "12.4"
+}
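wandb-metadata.json records the host that produced this slice. A small sketch, assuming the file is read from the current directory, that condenses the hardware fields into one line:

# Sketch: summarize the recorded hardware (the file path is an assumption).
import json

with open("wandb-metadata.json") as f:
    meta = json.load(f)

total_vram = sum(int(g["memoryTotal"]) for g in meta["gpu_nvidia"])
print(f"{meta['gpu_count']}x {meta['gpu_nvidia'][0]['name']}, "
      f"{total_vram / 2**30:.0f} GiB VRAM, CUDA {meta['cudaVersion']}")
# Expected from the values above: 8x NVIDIA A100-SXM4-80GB, 640 GiB VRAM, CUDA 12.4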
slice_400/wandb/run-20250101_084116-coewtb43/files/wandb-summary.json
ADDED
@@ -0,0 +1 @@
+{"train/loss":1.2820848226547241,"_step":1422,"train/better_sample_reward":92.50687408447266,"train/reward_margin":134.08236694335938,"train/lr":0,"train/worse_sample_reward":-41.57551193237305,"_wandb":{"runtime":12390},"_timestamp":1.7357332075421584e+09,"train/step":1422,"train/epoch":3,"_runtime":12390.697992413,"train/reward_accuracy":0.90625,"train/reward":50.931365966796875}
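The summary is the final logged step (step 1422, epoch 3). The reward fields look internally consistent: train/reward_margin matches better_sample_reward minus worse_sample_reward (92.5069 - (-41.5755) ≈ 134.0824), and train/reward matches their sum (≈ 50.9314), up to float32 rounding. A quick check, with the file path assumed:

# Sketch: verify the relations among the logged DPO reward fields.
import json
import math

s = json.load(open("wandb-summary.json"))  # path is an assumption
better = s["train/better_sample_reward"]
worse = s["train/worse_sample_reward"]
assert math.isclose(s["train/reward_margin"], better - worse, rel_tol=1e-5)
assert math.isclose(s["train/reward"], better + worse, rel_tol=1e-5)
print("margin:", better - worse, "reward:", better + worse)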
slice_400/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
ADDED
@@ -0,0 +1,22 @@
+{"time":"2025-01-01T08:41:16.162688269Z","level":"INFO","msg":"using version","core version":"0.18.3"}
+{"time":"2025-01-01T08:41:16.162723485Z","level":"INFO","msg":"created symlink","path":"/data/align-anything/hantao/align-anything/outputs/mm_interp/q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-core.log"}
+{"time":"2025-01-01T08:41:16.164755723Z","level":"ERROR","msg":"dialing: google: could not find default credentials. See https://cloud.google.com/docs/authentication/external/set-up-adc for more information"}
+{"time":"2025-01-01T08:41:16.193696068Z","level":"INFO","msg":"created new stream","id":"coewtb43"}
+{"time":"2025-01-01T08:41:16.193725018Z","level":"INFO","msg":"stream: started","id":"coewtb43"}
+{"time":"2025-01-01T08:41:16.193745568Z","level":"INFO","msg":"sender: started","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T08:41:16.19376544Z","level":"INFO","msg":"handler: started","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T08:41:16.193764055Z","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T08:41:16.85353523Z","level":"INFO","msg":"wandb-core","!BADKEY":null}
+{"time":"2025-01-01T08:41:16.857782107Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-01-01T09:18:13.461106519Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
+{"time":"2025-01-01T09:59:56.117005921Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/files/htlou/align-anything/coewtb43/file_stream\": dial tcp 35.186.228.49:443: connect: connection timed out"}
+{"time":"2025-01-01T12:07:46.855783288Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-01-01T12:07:46.874087131Z","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-01-01T12:07:47.46933058Z","level":"WARN","msg":"No program path found, not creating job artifact. See https://docs.wandb.ai/guides/launch/create-job"}
+{"time":"2025-01-01T12:07:47.469354945Z","level":"INFO","msg":"sender: sendDefer: no job artifact to save"}
+{"time":"2025-01-01T12:07:48.770864759Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
+{"time":"2025-01-01T12:07:50.527748121Z","level":"INFO","msg":"stream: closing","id":"coewtb43"}
+{"time":"2025-01-01T12:07:50.527778689Z","level":"INFO","msg":"handler: closed","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T12:07:50.527803216Z","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T12:07:50.52781292Z","level":"INFO","msg":"sender: closed","stream_id":{"value":"coewtb43"}}
+{"time":"2025-01-01T12:07:50.530364592Z","level":"INFO","msg":"stream: closed","id":"coewtb43"}
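The internal log is JSON Lines, one event per line, which makes it easy to filter: the only non-routine entries above are the Google-credentials ERROR at startup, a WARN about the missing job artifact, and two retry events for timed-out file_stream posts. A sketch for pulling those out, with the path assumed:

# Sketch: filter noteworthy events from a wandb JSONL debug log.
import json

def noteworthy(path="debug-internal.log"):  # path is an assumption
    with open(path) as f:
        for line in f:
            event = json.loads(line)
            # retry events are logged at INFO but carry an "error" field
            if event["level"] != "INFO" or "error" in event:
                yield event

for event in noteworthy():
    print(event["time"], event["level"], event["msg"])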
slice_400/wandb/run-20250101_084116-coewtb43/logs/debug.log
ADDED
@@ -0,0 +1,33 @@
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Current SDK version is 0.18.3
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Configure stats pid to 675697
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /home/align-anything/.config/wandb/settings
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from /data/align-anything/hantao/align-anything/scripts/wandb/settings
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Loading settings from environment variables: {'api_key': '***REDACTED***', 'mode': 'online'}
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None}
+2025-01-01 08:41:16,150 WARNING MainThread:675697 [wandb_setup.py:_flush():79] Could not find program at -m align_anything.trainers.text_image_to_text_image.dpo
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': None, 'program': '-m align_anything.trainers.text_image_to_text_image.dpo'}
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_setup.py:_flush():79] Applying login settings: {}
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():532] Logging user logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug.log
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:_log_setup():533] Logging internal logs to /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference/wandb/run-20250101_084116-coewtb43/logs/debug-internal.log
+2025-01-01 08:41:16,150 INFO MainThread:675697 [wandb_init.py:init():617] calling init triggers
+2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():624] wandb.init called with sweep_config: {}
+config: {'train_cfgs': {'ds_cfgs': 'ds_z3_config.json', 'epochs': 3.0, 'seed': 42, 'per_device_train_batch_size': 4.0, 'per_device_eval_batch_size': 4.0, 'gradient_accumulation_steps': 2.0, 'gradient_checkpointing': True, 'learning_rate': 1e-06, 'lr_scheduler_type': 'cosine', 'lr_warmup_ratio': 0.03, 'weight_decay': 0.01, 'adam_betas': [0.9, 0.95], 'bf16': True, 'fp16': False, 'eval_strategy': 'epoch', 'eval_interval': 10, 'regularization': 0.001, 'scale_coeff': 0.1, 'freeze_mm_proj': True, 'freeze_vision_tower': False, 'freeze_language_model': True}, 'data_cfgs': {'train_datasets': '/data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized', 'train_template': 'Chameleon_preference', 'train_size': None, 'train_split': 'train', 'train_subset': None, 'train_data_files': 'q0_40_preference.pt', 'train_optional_args': [], 'eval_datasets': None, 'eval_template': None, 'eval_size': None, 'eval_split': None, 'eval_subset': None, 'eval_data_files': None, 'eval_optional_args': []}, 'logger_cfgs': {'log_type': 'wandb', 'log_project': 'align-anything', 'log_run_name': 'dpo', 'output_dir': '/data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference', 'cache_dir': None, 'save_interval': 400.0}, 'model_cfgs': {'model_name_or_path': '/data/align-anything/hantao/models/chameleon-7b', 'trust_remote_code': True, 'model_max_length': 4096}, 'special_tokens': None}
+2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():667] starting backend
+2025-01-01 08:41:16,151 INFO MainThread:675697 [wandb_init.py:init():671] sending inform_init request
+2025-01-01 08:41:16,156 INFO MainThread:675697 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-01-01 08:41:16,157 INFO MainThread:675697 [wandb_init.py:init():684] backend started and connected
+2025-01-01 08:41:16,160 INFO MainThread:675697 [wandb_init.py:init():779] updated telemetry
+2025-01-01 08:41:16,221 INFO MainThread:675697 [wandb_init.py:init():812] communicating run to backend with 90.0 second timeout
+2025-01-01 08:41:16,848 INFO MainThread:675697 [wandb_init.py:init():863] starting run threads in backend
+2025-01-01 08:41:17,411 INFO MainThread:675697 [wandb_run.py:_console_start():2465] atexit reg
+2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2313] redirect: wrap_raw
+2025-01-01 08:41:17,412 INFO MainThread:675697 [wandb_run.py:_redirect():2378] Wrapping output streams.
+2025-01-01 08:41:17,413 INFO MainThread:675697 [wandb_run.py:_redirect():2403] Redirects installed.
+2025-01-01 08:41:17,424 INFO MainThread:675697 [wandb_init.py:init():907] run started, returning control to user process
+2025-01-01 12:07:46,853 INFO MainThread:675697 [wandb_run.py:_finish():2164] finishing run htlou/align-anything/coewtb43
+2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_atexit_cleanup():2428] got exitcode: 0
+2025-01-01 12:07:46,854 INFO MainThread:675697 [wandb_run.py:_restore():2410] restore
+2025-01-01 12:07:46,855 INFO MainThread:675697 [wandb_run.py:_restore():2416] restore done
+2025-01-01 12:07:50,513 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4049] rendering history
+2025-01-01 12:07:50,515 INFO MainThread:675697 [wandb_run.py:_footer_history_summary_info():4081] rendering summary
+2025-01-01 12:07:50,525 INFO MainThread:675697 [wandb_run.py:_footer_sync_info():4008] logging synced files
slice_400/wandb/run-20250101_084116-coewtb43/run-coewtb43.wandb
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6184cd062ec8be4c9517ecc56b37ff397dd3f29795bce1cd495613256a3f6f76
+size 12650956
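The run file itself is tracked with Git LFS, so the repository stores only this three-line pointer: the spec version, the SHA-256 of the blob, and its size in bytes (about 12.7 MB here); the payload lives in LFS storage. A minimal parser sketch, with the path assumed:

# Sketch: parse a Git LFS pointer file into a dict.
def read_lfs_pointer(path="run-coewtb43.wandb"):  # path is an assumption
    fields = dict(line.split(" ", 1) for line in open(path).read().splitlines())
    return {
        "spec": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }

ptr = read_lfs_pointer()
print(f"{ptr['size_bytes'] / 1e6:.1f} MB blob, sha256 {ptr['sha256'][:12]}...")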
slice_800/arguments.yaml
ADDED
@@ -0,0 +1,51 @@
+data_cfgs:
+  eval_data_files: null
+  eval_datasets: null
+  eval_optional_args: []
+  eval_size: null
+  eval_split: null
+  eval_subset: null
+  eval_template: null
+  train_data_files: q0_40_preference.pt
+  train_datasets: /data/align-anything/hantao/data/mm_interp/AA_preference_cocour_new_step10/tokenized
+  train_optional_args: []
+  train_size: null
+  train_split: train
+  train_subset: null
+  train_template: Chameleon_preference
+logger_cfgs:
+  cache_dir: null
+  log_project: align-anything
+  log_run_name: dpo
+  log_type: wandb
+  output_dir: /data/align-anything/hantao/align-anything/outputs/mm_interp//q0_40_preference
+  save_interval: 400.0
+model_cfgs:
+  model_max_length: 4096
+  model_name_or_path: /data/align-anything/hantao/models/chameleon-7b
+  trust_remote_code: true
+special_tokens: null
+train_cfgs:
+  adam_betas:
+  - 0.9
+  - 0.95
+  bf16: true
+  ds_cfgs: ds_z3_config.json
+  epochs: 3.0
+  eval_interval: 10
+  eval_strategy: epoch
+  fp16: false
+  freeze_language_model: true
+  freeze_mm_proj: true
+  freeze_vision_tower: false
+  gradient_accumulation_steps: 2.0
+  gradient_checkpointing: true
+  learning_rate: 1.0e-06
+  lr_scheduler_type: cosine
+  lr_warmup_ratio: 0.03
+  per_device_eval_batch_size: 4.0
+  per_device_train_batch_size: 4.0
+  regularization: 0.001
+  scale_coeff: 0.1
+  seed: 42
+  weight_decay: 0.01
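These arguments match the run config echoed in debug.log. Combined with the 8 GPUs recorded in wandb-metadata.json, they imply an effective global batch size of 4 (per device) x 2 (gradient accumulation) x 8 (GPUs) = 64 samples per optimizer step. A sketch, with the world size taken from the metadata rather than this file:

# Sketch: effective global batch size from arguments.yaml (PyYAML assumed).
import yaml

cfg = yaml.safe_load(open("arguments.yaml"))["train_cfgs"]
world_size = 8  # gpu_count from wandb-metadata.json; not stored in the YAML
per_step = int(cfg["per_device_train_batch_size"]
               * cfg["gradient_accumulation_steps"]
               * world_size)
print(per_step)  # 4 * 2 * 8 = 64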
slice_800/config.json
ADDED
The diff for this file is too large to render.
See raw diff