esunAI committed on
Commit
ad86d54
·
verified ·
1 Parent(s): eb85c1a

Add launch_full_data_training.sh

Browse files
training_logs/launch_full_data_training.sh ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Optimized Single GPU AMP Flow Matching Training Launch Script with FULL DATA
# This script launches optimized training on GPU 0 using ALL available data
# Features: Mixed precision (BF16), increased batch size, H100 optimizations
#
# Usage: ./launch_full_data_training.sh
# Requires: the "flow" Python environment, trained compressor/decompressor
# checkpoints in the CWD, and the peptide-embedding / FASTA CFG data paths
# checked below.

# Fail fast: abort on command failure, unset variables, or pipeline errors.
set -euo pipefail

echo "=== Launching Optimized Single GPU AMP Flow Matching Training with FULL DATA ==="
echo "Using GPU 0 for training"
echo "Using ALL available peptide embeddings and new FASTA CFG data"
# NOTE(review): this banner says 15000 iterations, but the configuration
# summary below says 6,000 — confirm which number is current.
echo "OVERNIGHT TRAINING: 15000 iterations with CFG support and H100 optimizations"
echo ""

# Activate flow virtual environment.
# Test the source command directly instead of inspecting $? afterwards
# (the $? idiom is fragile and breaks under `set -e`).
# NOTE(review): sourcing <env>/bin/activate is the venv/virtualenv mechanism;
# for a conda environment, `conda activate flow` is the supported path — verify.
echo "Activating flow virtual environment..."
if source /home/edwardsun/miniconda3/envs/flow/bin/activate; then
    echo "✓ Flow environment activated"
    echo "Python: $(command -v python)"
    echo "Python version: $(python --version)"
else
    echo "❌ Failed to activate flow environment"
    echo "Please check if the environment exists: conda env list"
    exit 1
fi
echo ""

# Check if required files exist
echo "Checking required files..."
if [ ! -f "final_compressor_model.pth" ]; then
    echo "❌ Missing final_compressor_model.pth"
    echo "Please run compressor_with_embeddings.py first"
    exit 1
fi

if [ ! -f "final_decompressor_model.pth" ]; then
    echo "❌ Missing final_decompressor_model.pth"
    echo "Please run compressor_with_embeddings.py first"
    exit 1
fi

if [ ! -d "/data2/edwardsun/flow_project/peptide_embeddings/" ]; then
    echo "❌ Missing /data2/edwardsun/flow_project/peptide_embeddings/ directory"
    echo "Please run final_sequence_encoder.py first"
    exit 1
fi

# Check for the combined embeddings file (optional: training falls back to
# individual per-peptide embedding files when it is absent).
if [ ! -f "/data2/edwardsun/flow_project/peptide_embeddings/all_peptide_embeddings.pt" ]; then
    echo "⚠️ Warning: all_peptide_embeddings.pt not found"
    echo "Will use individual embedding files instead"
else
    echo "✓ Found all_peptide_embeddings.pt (4.3GB - ALL peptide data)"
fi

# Check for new FASTA CFG data (mandatory — carries the AMP/Non-AMP labels).
if [ ! -f "/home/edwardsun/flow/combined_final.fasta" ]; then
    echo "❌ Missing /home/edwardsun/flow/combined_final.fasta"
    echo "This contains the new CFG training data with >AP (AMP) and >sp (Non-AMP) labels"
    exit 1
else
    echo "✓ Found combined_final.fasta - New CFG data with automatic labeling"
    echo " >AP headers = AMP sequences"
    echo " >sp headers = Non-AMP sequences"
fi

echo "✓ All required files found!"
echo ""

# Set CUDA device to GPU 0
export CUDA_VISIBLE_DEVICES=0

# Enable H100 optimizations.
# NOTE(review): setting both *_ENABLED=1 and *_DISABLED=0 is redundant — one
# variable should suffice; confirm which one the installed PyTorch reads.
export TORCH_CUDNN_V8_API_ENABLED=1
export TORCH_CUDNN_V8_API_DISABLED=0

echo "=== Optimized Training Configuration ==="
echo " - GPU: 0 (CUDA_VISIBLE_DEVICES=0)"
# NOTE(review): batch size is echoed as 96 here but as 128 in the summary at
# the bottom of this script — confirm the actual training configuration.
echo " - Batch size: 96 (optimized based on profiling)"
echo " - Total iterations: 6,000"
echo " - Mixed precision: BF16 (H100 optimized)"
echo " - Learning rate: 4e-4 -> 2e-4 (cosine annealing)"
echo " - Warmup steps: 5,000"
echo " - Gradient clipping: 1.0"
echo " - Weight decay: 0.01"
echo " - Data workers: 16"
echo " - CFG dropout: 15%"
echo " - Validation: Every 10,000 steps"
echo " - Checkpoints: Every 1,000 epochs"
echo " - Estimated time: ~8-10 hours (overnight training)"
echo ""

# Check GPU memory and capabilities. Guarded so a missing nvidia-smi does not
# abort the launch under `set -o pipefail`. The while loop runs in a pipeline
# subshell, which is fine here — it only prints, nothing is read back.
echo "Checking GPU capabilities..."
if command -v nvidia-smi >/dev/null 2>&1; then
    nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader,nounits | while IFS=, read -r name total free; do
        echo " GPU: $name"
        echo " Total memory: ${total}MB"
        echo " Free memory: ${free}MB"
        echo " Available: $((free * 100 / total))%"
    done
else
    echo "⚠️ nvidia-smi not found; skipping GPU capability check" >&2
fi

echo ""

# Launch optimized training
echo "Starting optimized single GPU training on GPU 0 with FULL DATA..."
echo "Using new FASTA CFG data: combined_final.fasta"
echo ""

# Launch training with new FASTA CFG data. Under `set -e` a non-zero exit
# here aborts the script, so the completion summary below only prints when
# training actually succeeded.
python amp_flow_training_single_gpu_full_data.py --cfg_data /home/edwardsun/flow/combined_final.fasta

echo ""
echo "=== Optimized Overnight Training Complete with FULL DATA ==="
echo "Check for output files:"
echo " - amp_flow_model_best_optimized.pth (best validation model)"
echo " - amp_flow_model_final_optimized.pth (final model)"
echo " - amp_flow_checkpoint_optimized_step_*.pth (checkpoints every 1000 epochs)"
echo ""
echo "Training optimizations applied:"
echo " ✓ Mixed precision (BF16) for ~30-50% speedup"
# NOTE(review): says 128 here but 96 in the configuration block above.
echo " ✓ Increased batch size (128) for better H100 utilization"
echo " ✓ Optimized learning rate schedule with proper warmup"
echo " ✓ Gradient clipping for training stability"
echo " ✓ CFG dropout for better guidance"
echo " ✓ Validation monitoring and early stopping"
echo " ✓ PyTorch 2.x compilation for speedup"
echo ""
echo "Next steps:"
echo "1. Test the optimized model: python generate_amps.py"
echo "2. Compare performance with previous model"
echo "3. Implement reflow for 1-step generation"
echo "4. Add conditioning for toxicity"
echo "5. Fine-tune on specific AMP properties"