esunAI
/

FlowFinal

#!/bin/bash
# Optimized Single GPU AMP Flow Matching Training Launch Script with FULL DATA
#
# Activates the "flow" environment, verifies that the required model/data
# files exist, pins the run to one GPU and launches
# amp_flow_training_single_gpu_full_data.py with the FASTA CFG data.
#
# Usage:
#   Run from the directory that contains final_compressor_model.pth,
#   final_decompressor_model.pth and the training script; those three paths
#   are resolved relative to the current working directory.
#
# Exit status:
#   0  training finished successfully
#   1  a precondition failed (environment activation or a missing input)
#   *  any other value is the exit status of the training process
#
# `set -e` / `set -u` are intentionally not enabled: the script sources an
# externally provided activate script that may not be written for strict
# mode. Every step whose failure matters is checked explicitly instead.

readonly FLOW_ENV_ACTIVATE="/home/edwardsun/miniconda3/envs/flow/bin/activate"
readonly EMBEDDINGS_DIR="/data2/edwardsun/flow_project/peptide_embeddings/"
readonly ALL_EMBEDDINGS_FILE="${EMBEDDINGS_DIR}all_peptide_embeddings.pt"
readonly CFG_FASTA="/home/edwardsun/flow/combined_final.fasta"
readonly TRAIN_SCRIPT="amp_flow_training_single_gpu_full_data.py"
readonly GPU_ID=0

# Print every argument on its own line to stderr, then abort with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# NOTE(review): the banner announces 15000 iterations while the configuration
# summary further down reports 6,000 -- confirm against the training script.
cat <<EOF
=== Launching Optimized Single GPU AMP Flow Matching Training with FULL DATA ===
Using GPU ${GPU_ID} for training
Using ALL available peptide embeddings and new FASTA CFG data
OVERNIGHT TRAINING: 15000 iterations with CFG support and H100 optimizations

EOF

# Activate flow virtual environment.
# The activate script is sourced at top level (not inside a function) so it
# sees the same positional parameters and variable scope as it always has.
echo "Activating flow virtual environment..."
# shellcheck source=/dev/null
if ! source "${FLOW_ENV_ACTIVATE}"; then
  die "❌ Failed to activate flow environment" \
      "Please check if the environment exists: conda env list"
fi
echo "✓ Flow environment activated"
echo "Python: $(command -v python)"
echo "Python version: $(python --version)"
echo ""

# Check if required files exist
echo "Checking required files..."
if [[ ! -f "final_compressor_model.pth" ]]; then
  die "❌ Missing final_compressor_model.pth" \
      "Please run compressor_with_embeddings.py first"
fi
if [[ ! -f "final_decompressor_model.pth" ]]; then
  die "❌ Missing final_decompressor_model.pth" \
      "Please run compressor_with_embeddings.py first"
fi
if [[ ! -d "${EMBEDDINGS_DIR}" ]]; then
  die "❌ Missing ${EMBEDDINGS_DIR} directory" \
      "Please run final_sequence_encoder.py first"
fi
# Fail early with a clear message instead of letting python report it after
# all the other checks have passed.
if [[ ! -f "${TRAIN_SCRIPT}" ]]; then
  die "❌ Missing ${TRAIN_SCRIPT}" \
      "Run this launcher from the directory that contains the training script"
fi

# Check for full data files (optional: only a warning when absent)
if [[ ! -f "${ALL_EMBEDDINGS_FILE}" ]]; then
  echo "⚠️  Warning: all_peptide_embeddings.pt not found"
  echo "Will use individual embedding files instead"
else
  # NOTE(review): the size in this message is hard-coded, not measured.
  echo "✓ Found all_peptide_embeddings.pt (4.3GB - ALL peptide data)"
fi

# Check for new FASTA CFG data (mandatory)
if [[ ! -f "${CFG_FASTA}" ]]; then
  die "❌ Missing ${CFG_FASTA}" \
      "This contains the new CFG training data with >AP (AMP) and >sp (Non-AMP) labels"
fi
echo "✓ Found combined_final.fasta - New CFG data with automatic labeling"
echo "  >AP headers = AMP sequences"
echo "  >sp headers = Non-AMP sequences"
echo "✓ All required files found!"
echo ""

# Restrict the training process to the selected GPU
export CUDA_VISIBLE_DEVICES="${GPU_ID}"
# Enable H100 optimizations
# NOTE(review): whether both cuDNN v8 switches are honoured depends on the
# installed PyTorch version -- confirm.
export TORCH_CUDNN_V8_API_ENABLED=1
export TORCH_CUDNN_V8_API_DISABLED=0

# NOTE(review): this summary is static text, not read from the training
# script, and it is internally inconsistent: warmup (5,000) and validation
# interval (10,000) versus 6,000 total iterations, and "epochs" versus the
# step-based checkpoint file names. Confirm the real values.
cat <<EOF
=== Optimized Training Configuration ===
  - GPU: ${GPU_ID} (CUDA_VISIBLE_DEVICES=${GPU_ID})
  - Batch size: 96 (optimized based on profiling)
  - Total iterations: 6,000
  - Mixed precision: BF16 (H100 optimized)
  - Learning rate: 4e-4 -> 2e-4 (cosine annealing)
  - Warmup steps: 5,000
  - Gradient clipping: 1.0
  - Weight decay: 0.01
  - Data workers: 16
  - CFG dropout: 15%
  - Validation: Every 10,000 steps
  - Checkpoints: Every 1,000 epochs
  - Estimated time: ~8-10 hours (overnight training)

EOF

# Check GPU memory and capabilities.
# nvidia-smi ignores CUDA_VISIBLE_DEVICES, so the GPU is selected with -i;
# otherwise every GPU in the machine would be listed.
# NOTE(review): nvidia-smi's index is assumed to match CUDA's device index;
# confirm on hosts with mixed GPU models (see CUDA_DEVICE_ORDER).
echo "Checking GPU capabilities..."
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "⚠️  Warning: nvidia-smi not found, skipping GPU check" >&2
elif gpu_report=$(nvidia-smi -i "${GPU_ID}" \
    --query-gpu=name,memory.total,memory.free \
    --format=csv,noheader,nounits); then
  while IFS=, read -r gpu_name mem_total mem_free; do
    [[ -n "${gpu_name}" ]] || continue
    # CSV fields are separated by ", "; drop the padding around the numbers.
    mem_total="${mem_total//[[:space:]]/}"
    mem_free="${mem_free//[[:space:]]/}"
    echo "  GPU: ${gpu_name}"
    echo "  Total memory: ${mem_total}MB"
    echo "  Free memory: ${mem_free}MB"
    # Only do the arithmetic on plain positive integers: a non-numeric
    # report or a zero total would otherwise abort the expansion.
    if [[ "${mem_total}" =~ ^[0-9]+$ && "${mem_free}" =~ ^[0-9]+$ ]] \
        && (( mem_total > 0 )); then
      echo "  Available: $(( mem_free * 100 / mem_total ))%"
    fi
  done <<< "${gpu_report}"
else
  echo "⚠️  Warning: nvidia-smi query for GPU ${GPU_ID} failed" >&2
fi
echo ""

# Launch optimized training
echo "Starting optimized single GPU training on GPU ${GPU_ID} with FULL DATA..."
echo "Using new FASTA CFG data: combined_final.fasta"
echo ""

# Launch training with new FASTA CFG data and keep its exit status so a
# failed run is not reported as a completed one.
python "${TRAIN_SCRIPT}" --cfg_data "${CFG_FASTA}"
train_status=$?
echo ""
if (( train_status != 0 )); then
  echo "❌ Training exited with status ${train_status}" >&2
  exit "${train_status}"
fi

# NOTE(review): the batch size below (128) disagrees with the configuration
# summary above (96) -- confirm which one the training script uses.
cat <<'EOF'
=== Optimized Overnight Training Complete with FULL DATA ===
Check for output files:
  - amp_flow_model_best_optimized.pth (best validation model)
  - amp_flow_model_final_optimized.pth (final model)
  - amp_flow_checkpoint_optimized_step_*.pth (checkpoints every 1000 epochs)

Training optimizations applied:
  ✓ Mixed precision (BF16) for ~30-50% speedup
  ✓ Increased batch size (128) for better H100 utilization
  ✓ Optimized learning rate schedule with proper warmup
  ✓ Gradient clipping for training stability
  ✓ CFG dropout for better guidance
  ✓ Validation monitoring and early stopping
  ✓ PyTorch 2.x compilation for speedup

Next steps:
1. Test the optimized model: python generate_amps.py
2. Compare performance with previous model
3. Implement reflow for 1-step generation
4. Add conditioning for toxicity
5. Fine-tune on specific AMP properties
EOF