Spaces:

HassounLab
/

FLARE

Sleeping

App Files Files Community

yzhouchen001 committed 27 days ago

Commit

aedfaa8

1 Parent(s): bb73124

changes

Browse files

Files changed (8) hide show

.gitignore +3 -0
README.md +2 -3
flare/params_binnedSpec.yaml +0 -122
flare/params_filipGlobal.yaml +0 -95
flare/params_formSpec.yaml +0 -121
flare/params_jestr.yaml +0 -122
flare/params_tmp.yaml +0 -125
pretrained_models/.gitkeep +0 -0

.gitignore CHANGED Viewed

@@ -207,3 +207,6 @@ marimo/_lsp/
 __marimo__/
 experiments/20250913_optimized_filip-model/*.ckpt
 experiments/20250913_optimized_filip-model/*.ckpt

 __marimo__/
 experiments/20250913_optimized_filip-model/*.ckpt
 experiments/20250913_optimized_filip-model/*.ckpt
+data/
+pretrained_models/*.ckpt

README.md CHANGED Viewed

@@ -7,7 +7,6 @@ sdk: streamlit
 pinned: false
 python_version: 3.11.7
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 # 🔥 FLARE
@@ -15,12 +14,12 @@ Fine-grained Learning for Alignment of spectra-molecule REpresentation
 ###  Yan Zhou Chen, Soha Hassoun
 #### Department of Computer Science, Tufts University
-This repository provides the implementation of FALRE. Given a mass spectrum, FLARE can be used to rank a set of molecular candidates. Further, FLARE enables visualization of peak-to-node attribution.
 ## 🛠 Set up
 ### Clone repository
 ```
-git clone https://github.com/hassounlab/flare
 cd flare
 ```
 ### Set up environment and install dependencies

 pinned: false
 python_version: 3.11.7
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 # 🔥 FLARE
 ###  Yan Zhou Chen, Soha Hassoun
 #### Department of Computer Science, Tufts University
+This repository provides the implementation of FLARE. Given a mass spectrum, FLARE can be used to rank a set of molecular candidates. Further, FLARE enables visualization of peak-to-node attribution.
 ## 🛠 Set up
 ### Clone repository
 ```
+git clone https://huggingface.co/spaces/HassounLab/FLARE
 cd FLARE
 ```
 ### Set up environment and install dependencies

flare/params_binnedSpec.yaml DELETED Viewed

@@ -1,122 +0,0 @@
-# Experiment setup
-job_key: ''
-run_name: 'binnedSpec_experiment'
-run_details: ""
-project_name: ''
-wandb_entity_name: 'mass-spec-ml'
-no_wandb: True
-seed: 0
-debug: False
-checkpoint_pth: ""
-# Training setup
-max_epochs: 1000
-accelerator: 'gpu'
-devices: [1]
-log_every_n_steps: 250
-val_check_interval: 1.0
-# Data paths
-candidates_pth: ../data/sample/candidates_mass.json
-dataset_pth: "../data/sample/data.tsv"
-subformula_dir_pth: ""
-split_pth:
-fp_dir_pth: '../data/sample/morganfp_r5_1024.pickle'
-cons_spec_dir_pth: "../data/sample/consensus_binnedSpec.pkl"
-NL_spec_dir_pth: ""
-partial_checkpoint: ""
-# General hyperparameters
-batch_size: 64
-lr: 5.0e-4
-weight_decay: 0
-contr_temp: 0.05
-early_stopping_patience: 300
-loss_strategy: 'static' # static, linear, manual
-num_workers: 50
-############################## Data transforms ##############################
-# - Spectra
-spectra_view:  SpecBinnerLog
-max_mz: 1000
-bin_width: 1
-mask_peak_ratio: 0.00
-# 2. SpecFormula
-element_list: ['H', 'C',  'O', 'N', 'P', 'S', 'Cl', 'F', 'Br', 'I', 'B', 'As', 'Si', 'Se']
-add_intensities: True
-mask_precursor: False
-# - Molecule
-molecule_view: "MolGraph"
-atom_feature: 'full'
-bond_feature: 'full'
-############################## Views ##############################
-# contrastive
-use_contr: True
-contr_wt: 1
-contr_wt_update: {}
-# consensus spectra
-use_cons_spec: False
-cons_spec_wt: 3
-cons_spec_wt_update: {}
-cons_loss_type: 'l2' # cosine, l2
-# fp prediction/usage
-pred_fp: False
-use_fp: False
-fp_loss_type: 'cosine' #cosine, bce
-fp_wt: 3
-fp_wt_update: {}
-fp_size: 1024
-fp_radius: 5
-fp_dropout: 0.4
-# candidates
-aug_cands: False
-aug_cands_wt: 0.1
-aug_cands_update: {}
-aug_cands_size: 3
-# neutral loss
-use_NL: False
-############################## Task and model ##############################
-task: 'retrieval'
-spec_enc: MLP_BIN
-mol_enc: "GNN"
-model:  "MultiviewContrastive"
-contr_views: [['spec_enc', 'mol_enc']]
-log_only_loss_at_stages: []
-df_test_path: ""
-# - Spectra encoder
-final_embedding_dim: 512
-fc_dropout: 0.4
-# - Spectra Token encoder
-hidden_dims: [64, 128]
-peak_dropout: 0.2
-# - Formula-based spec encoders
-formula_dropout: 0.2
-formula_dims: [64, 128, 256]
-cross_attn_heads: 2
-use_cls: True
-# -- GAT params
-attn_heads: [12,12,12]
-# - Molecule encoder (GNN)
-gnn_channels: [64,128,256]
-gnn_type: "gcn"
-num_gnn_layers: 3
-gnn_hidden_dim: 512
-gnn_dropout: 0.3

flare/params_filipGlobal.yaml DELETED Viewed

@@ -1,95 +0,0 @@
-# Experiment setup
-job_key: ''
-run_name: 'filip-global'
-run_details: ""
-project_name: ''
-wandb_entity_name: 'mass-spec-ml'
-no_wandb: True
-seed: 42
-debug: False
-checkpoint_pth:
-# Training setup
-max_epochs: 2000
-accelerator: 'gpu'
-devices: [1]
-log_every_n_steps: 250
-val_check_interval: 1.0
-# Data paths
-candidates_pth: /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_mass.json # "../data/MassSpecGym/data/molecules/MassSpecGym_retrieval_candidates_formula.json"
-dataset_pth: /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # /data/yzhouc01/MVP/data/sample/data.tsv #/r/hassounlab/spectra_data/msgym/MassSpecGym.tsv #/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv # /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # "../data/MassSpecGym/data/sample_data.tsv"
-subformula_dir_pth: /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default # /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default # /data/yzhouc01/FILIP-MS/data/magma # /r/hassounlab/msgym_sirius # /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default #/data/yzhouc01/spectra_data/subformulae #"../data/MassSpecGym/data/subformulae_default"
-split_pth:
-fp_dir_pth:
-partial_checkpoint: ""
-# General hyperparameters
-batch_size: 64 #64
-lr: 2.881339661302105e-05 # 5.0e-05
-weight_decay: 1.8376229667330708e-05
-contr_temp: 0.022772534845886608 # 0.022772534845886608 # 0.05
-num_workers: 50
-# FILIP_GLOBAL model parameters
-loss_mode: "pcgrad"
-agg_fn: "mean"
-loss_weight: 1.1
-############################## Data transforms ##############################
-# - Spectra
-spectra_view: SpecFormula #SpecMzIntTokens #SpecFormula
-formula_source: 'default' # magma_1, magma_all, sirius, default
-#  1. Binner
-max_mz: 1000
-bin_width: 1
-mask_peak_ratio: 0.00
-# 2. SpecFormula
-element_list: ['H', 'C',  'O', 'N', 'P', 'S', 'Cl', 'F', 'Br', 'I', 'B', 'As', 'Si', 'Se']
-add_intensities: True
-# - Molecule
-molecule_view: "MolGraph"
-atom_feature: 'full'
-bond_feature: 'full'
-############################## Task and model ##############################
-task: 'retrieval'
-spec_enc: Transformer_Formula # Transformer_MzInt #Transformer_Formula
-mol_enc: "GNN"
-model:  filipGlobalContrastive #filipContrastive # "MultiviewContrastive"
-contr_views: [['spec_enc', 'mol_enc']]
-log_only_loss_at_stages: []
-df_test_path: ""
-# - Formula-based spec encoders
-formula_dropout:  0.2
-formula_dims: [512,256,512] #[512, 256, 512] #[64, 128, 256]
-cross_attn_heads: 2
-use_cls: False
-peak_dropout: 0.2
-formula_attn_heads: 4 # 2
-formula_transformer_layers: 2 #2
-# -- GAT params
-attn_heads: [12,12,12]
-# - Molecule encoder (GNN)
-gnn_channels: [128, 256, 512] #[64,128,512]
-gnn_type: "gcn"
-# num_gnn_layers: 3
-# gnn_hidden_dim: 512
-gnn_dropout: 0.23234950970370824 #0.3
-# - Spectra encoder (cross attention model)
-# final_embedding_dim: 512
-# fc_dropout: 0.4
-# - Spectra Token encoder (mz-int token model)
-# hidden_dims: [64, 256]

flare/params_formSpec.yaml DELETED Viewed

@@ -1,121 +0,0 @@
-# Experiment setup
-job_key: ''
-run_name: 'flare_sirius_labels_42'
-run_details: ""
-project_name: ''
-wandb_entity_name: 'mass-spec-ml'
-no_wandb: True
-seed: 42
-debug: False
-checkpoint_pth:
-# Training setup
-max_epochs: 2000
-accelerator: 'gpu'
-devices: [1]
-log_every_n_steps: 250
-val_check_interval: 1.0
-# Data paths
-candidates_pth: /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_mass.json # "../data/MassSpecGym/data/molecules/MassSpecGym_retrieval_candidates_formula.json"
-dataset_pth: /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # /data/yzhouc01/MVP/data/sample/data.tsv #/r/hassounlab/spectra_data/msgym/MassSpecGym.tsv #/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv # /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # "../data/MassSpecGym/data/sample_data.tsv"
-subformula_dir_pth: /r/hassounlab/msgym_sirius # /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default # /data/yzhouc01/FILIP-MS/data/magma # /r/hassounlab/msgym_sirius # /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default #/data/yzhouc01/spectra_data/subformulae #"../data/MassSpecGym/data/subformulae_default"
-split_pth:
-fp_dir_pth:
-partial_checkpoint: ""
-# General hyperparameters
-batch_size: 64 #64
-lr: 2.881339661302105e-05 # 5.0e-05
-weight_decay: 1.8376229667330708e-05
-contr_temp: 0.022772534845886608 # 0.022772534845886608 # 0.05
-num_workers: 50
-############################## Data transforms ##############################
-# - Spectra
-spectra_view: SpecFormula #SpecMzIntTokens #SpecFormula
-formula_source: 'sirius' # magma_1, magma_all, sirius, default
-#  1. Binner
-max_mz: 1000
-bin_width: 1
-mask_peak_ratio: 0.00
-# 2. SpecFormula
-element_list: ['H', 'C',  'O', 'N', 'P', 'S', 'Cl', 'F', 'Br', 'I', 'B', 'As', 'Si', 'Se']
-add_intensities: True
-# - Molecule
-molecule_view: "MolGraph"
-atom_feature: 'full'
-bond_feature: 'full'
-############################## Views ##############################
-# contrastive
-# use_contr: False
-# contr_wt: 1
-# contr_wt_update: {}
-# consensus spectra
-# use_cons_spec: False
-# cons_spec_wt: 3
-# cons_spec_wt_update: {}
-# cons_loss_type: 'l2' # cosine, l2
-# fp prediction/usage
-# pred_fp: False
-# use_fp: False
-# fp_loss_type: 'cosine' #cosine, bce
-# fp_wt: 3
-# fp_wt_update: {}
-# fp_size: 1024
-# fp_radius: 5
-# fp_dropout: 0.4
-# candidates
-# aug_cands: False
-# aug_cands_wt: 0.1
-# aug_cands_update: {}
-# aug_cands_size: 3
-# neutral loss
-# use_NL: False
-############################## Task and model ##############################
-task: 'retrieval'
-spec_enc: Transformer_Formula # Transformer_MzInt #Transformer_Formula
-mol_enc: "GNN"
-model:  filipContrastive # "MultiviewContrastive"
-contr_views: [['spec_enc', 'mol_enc']]
-log_only_loss_at_stages: []
-df_test_path: ""
-# - Formula-based spec encoders
-formula_dropout:  0.2
-formula_dims: [512,256,512] #[512, 256, 512] #[64, 128, 256]
-cross_attn_heads: 2
-use_cls: False
-peak_dropout: 0.2
-formula_attn_heads: 4 # 2
-formula_transformer_layers: 2 #2
-# -- GAT params
-attn_heads: [12,12,12]
-# - Molecule encoder (GNN)
-gnn_channels: [128, 256, 512] #[64,128,512]
-gnn_type: "gcn"
-# num_gnn_layers: 3
-# gnn_hidden_dim: 512
-gnn_dropout: 0.23234950970370824 #0.3
-# - Spectra encoder (cross attention model)
-# final_embedding_dim: 512
-# fc_dropout: 0.4
-# - Spectra Token encoder (mz-int token model)
-# hidden_dims: [64, 256]

flare/params_jestr.yaml DELETED Viewed

@@ -1,122 +0,0 @@
-# Experiment setup
-job_key: ''
-run_name: 'combined_d_1024dim_100bs'
-run_details: ""
-project_name: ''
-wandb_entity_name: 'mass-spec-ml'
-no_wandb: True
-seed: 3
-debug: False
-checkpoint_pth:
-# Training setup
-max_epochs: 2000
-accelerator: 'gpu'
-devices: [1]
-log_every_n_steps: 250
-val_check_interval: 1.0
-# Data paths
-candidates_pth: "/r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_mass.json"
-dataset_pth: '/r/hassounlab/spectra_data/msgym/MassSpecGym.tsv' # '/r/hassounlab/spectra_data/msgym/MassSpecGym.tsv' #"/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv"
-subformula_dir_pth: ""
-split_pth:
-fp_dir_pth: ''
-cons_spec_dir_pth:
-NL_spec_dir_pth: ""
-partial_checkpoint: ""
-# General hyperparameters
-batch_size: 100
-lr: 5.0e-4
-weight_decay: 0
-contr_temp: 0.05
-early_stopping_patience: 300
-loss_strategy: 'static' # static, linear, manual
-num_workers: 50
-############################## Data transforms ##############################
-# - Spectra
-spectra_view:  SpecBinnerLog
-max_mz: 1000
-bin_width: 1
-mask_peak_ratio: 0.00
-# 2. SpecFormula
-element_list: ['H', 'C',  'O', 'N', 'P', 'S', 'Cl', 'F', 'Br', 'I', 'B', 'As', 'Si', 'Se']
-add_intensities: True
-mask_precursor: False
-# - Molecule
-molecule_view: "MolGraph"
-atom_feature: 'full'
-bond_feature: 'full'
-############################## Views ##############################
-# contrastive
-use_contr: True
-contr_wt: 1
-contr_wt_update: {}
-# consensus spectra
-use_cons_spec: False
-cons_spec_wt: 3
-cons_spec_wt_update: {}
-cons_loss_type: 'l2' # cosine, l2
-# fp prediction/usage
-pred_fp: False
-use_fp: False
-fp_loss_type: 'cosine' #cosine, bce
-fp_wt: 3
-fp_wt_update: {}
-fp_size: 1024
-fp_radius: 5
-fp_dropout: 0.4
-# candidates
-aug_cands: False
-aug_cands_wt: 0.1
-aug_cands_update: {}
-aug_cands_size: 3
-# neutral loss
-use_NL: False
-############################## Task and model ##############################
-task: 'retrieval'
-spec_enc: MLP_BIN
-mol_enc: "GNN"
-model:  "MultiviewContrastive"
-contr_views: [['spec_enc', 'mol_enc']]
-log_only_loss_at_stages: []
-df_test_path: ""
-# - Spectra encoder
-final_embedding_dim: 1024
-fc_dropout: 0.4
-# - Spectra Token encoder
-hidden_dims: [64, 128]
-peak_dropout: 0.2
-# - Formula-based spec encoders
-formula_dropout: 0.2
-formula_dims: [64, 128, 256]
-cross_attn_heads: 2
-use_cls: True
-# -- GAT params
-attn_heads: [12,12,12]
-# - Molecule encoder (GNN)
-gnn_channels: [64,128,256]
-gnn_type: "gcn"
-num_gnn_layers: 3
-gnn_hidden_dim: 1024
-gnn_dropout: 0.3

flare/params_tmp.yaml DELETED Viewed

@@ -1,125 +0,0 @@
-# Experiment setup
-job_key: ''
-run_name: 'crossAttnModel'
-run_details: ""
-project_name: ''
-wandb_entity_name: 'mass-spec-ml'
-no_wandb: True
-seed: 0
-debug: False
-checkpoint_pth:
-# Training setup
-max_epochs: 1000
-accelerator: 'gpu'
-devices: [1]
-log_every_n_steps: 250
-val_check_interval: 1.0
-# Data paths
-candidates_pth: /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_mass.json # "../data/MassSpecGym/data/molecules/MassSpecGym_retrieval_candidates_formula.json"
-dataset_pth: /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # /data/yzhouc01/MVP/data/sample/data.tsv #/r/hassounlab/spectra_data/msgym/MassSpecGym.tsv #/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv # /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # "../data/MassSpecGym/data/sample_data.tsv"
-subformula_dir_pth: /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default # /data/yzhouc01/FILIP-MS/data/magma # /r/hassounlab/msgym_sirius # /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default #/data/yzhouc01/spectra_data/subformulae #"../data/MassSpecGym/data/subformulae_default"
-split_pth:
-fp_dir_pth:
-cons_spec_dir_pth:
-NL_spec_dir_pth: ""
-partial_checkpoint: ""
-# General hyperparameters
-batch_size: 64
-lr: 2.881339661302105e-05 # 5.0e-05
-weight_decay: 1.1586679936312845e-05
-contr_temp: 0.022772534845886608 # 0.05
-early_stopping_patience: 300
-loss_strategy: 'static'
-num_workers: 50
-############################## Data transforms ##############################
-# - Spectra
-spectra_view: SpecFormula #SpecMzIntTokens #SpecFormula
-formula_source: 'default' # magma_1, magma_all, sirius, default
-#  1. Binner
-max_mz: 1000
-bin_width: 1
-mask_peak_ratio: 0.00
-# 2. SpecFormula
-element_list: ['H', 'C',  'O', 'N', 'P', 'S', 'Cl', 'F', 'Br', 'I', 'B', 'As', 'Si', 'Se']
-add_intensities: True
-mask_precursor: False
-# - Molecule
-molecule_view: "MolGraph"
-atom_feature: 'full'
-bond_feature: 'full'
-############################## Views ##############################
-# contrastive
-use_contr: False
-contr_wt: 1
-contr_wt_update: {}
-# consensus spectra
-use_cons_spec: False
-cons_spec_wt: 3
-cons_spec_wt_update: {}
-cons_loss_type: 'l2' # cosine, l2
-# fp prediction/usage
-pred_fp: False
-use_fp: False
-fp_loss_type: 'cosine' #cosine, bce
-fp_wt: 3
-fp_wt_update: {}
-fp_size: 1024
-fp_radius: 5
-fp_dropout: 0.4
-# candidates
-aug_cands: False
-aug_cands_wt: 0.1
-aug_cands_update: {}
-aug_cands_size: 3
-# neutral loss
-use_NL: False
-############################## Task and model ##############################
-task: 'retrieval'
-spec_enc: Transformer_Formula # Transformer_MzInt #Transformer_Formula
-mol_enc: "GNN"
-model:  crossAttenContrastive # "MultiviewContrastive"
-contr_views: [['spec_enc', 'mol_enc']] #[['spec_enc', 'mol_enc'], ['spec_enc', 'NL_spec_enc'], ['mol_enc', 'NL_spec_enc']] #[['spec_enc', 'mol_enc'], ['mol_enc', 'cons_spec_enc'], ['cons_spec_enc', 'spec_enc'], ['fp_enc', 'mol_enc'], ['fp_enc', 'spec_enc'], ['fp_enc', 'cons_spec_enc']]
-log_only_loss_at_stages: []
-df_test_path: ""
-# - Spectra encoder
-final_embedding_dim: 512
-fc_dropout: 0.4
-# - Spectra Token encoder
-hidden_dims: [64, 128]
-# - Formula-based spec encoders
-formula_dropout:  0.2
-formula_dims: [128, 256, 512] #[64, 128, 256]
-cross_attn_heads: 4 # 2
-use_cls: False
-peak_dropout: 0.414425691950033 # 0.2
-formula_attn_heads: 4 # 2
-formula_transformer_layers: 2
-# -- GAT params
-attn_heads: [12,12,12]
-# - Molecule encoder (GNN)
-gnn_channels: [64,128,512]
-gnn_type: "gcn"
-num_gnn_layers: 3
-gnn_hidden_dim: 512
-gnn_dropout: 0.23234950970370824 #0.3

pretrained_models/.gitkeep ADDED Viewed

File without changes