#!/bin/bash
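# tr2d2-dna/train.sh: SLURM job script for the TR2-D2 DNA fine-tuning run (finetune.py)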
#SBATCH --job-name=dna_mdns
#SBATCH --partition=coe-gpu
#SBATCH --gres=gpu:H100:1
#SBATCH --time=16:00:00
# Max 16 GPU-hours total, i.e., set time <= 16h / number of GPUs
#SBATCH --mem-per-gpu=60G
# Maximum memory per GPU (--mem-per-gpu): 141G for H200, 94G for H100
# In the current setting, 40G is enough for num_replicates=2 and 80G is enough for num_replicates=4
#SBATCH --cpus-per-task=2
#SBATCH --wait-all-nodes=1
#SBATCH --output=../outputs/%j.%x.log
HOME_LOC=""  # Fill in the directory of the repo
SAVE_PATH=""  # Fill in the directory to save the checkpoints
BASE_PATH=""  # Fill in the directory of the pretrained checkpoints, e.g., "...../data_and_model/"
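# Optional sanity check (not in the original script): fail fast if the
# required paths above were left empty before submitting the job.
for var in HOME_LOC SAVE_PATH BASE_PATH; do
    if [ -z "${!var}" ]; then
        echo "Error: $var is empty; edit train.sh before running sbatch." >&2
        exit 1
    fi
done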
SCRIPT_LOC="$HOME_LOC/tr2d2/dna"
LOG_LOC="$HOME_LOC/tr2d2/dna/logs"
DATE=$(date +%m_%d)
mkdir -p "$LOG_LOC"
# set 3 has a skip connection
# ===================================================================
python "$SCRIPT_LOC/finetune.py" \
--base_path "$BASE_PATH" \
--device "cuda:0" \
--noise_removal \
--save_path_dir "$SAVE_PATH" \
--wdce_num_replicates 16 \
--buffer_size 160 \
--batch_size 160 \
--seq_length 200 \
--num_children 32 \
--total_num_steps 128 \
--num_iter 5 \
--resample_every_n_step 5 \
--eval_every_n_epochs 10 \
--num_epochs 60000 \
--exploration 0.1 \
--save_every_n_epoch 2000 \
--alpha 0.1 \
--centering \
--grad_clip \
--reward_clip \
--reward_clip_value 15.0 \
--reset_tree
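
# Usage (assuming a standard SLURM setup): submit from this directory with
#   sbatch train.sh
# stdout/stderr go to the --output path above; make sure the ../outputs
# directory exists before submitting, since SLURM does not create it.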