Spaces:

HassounLab
/

FLARE

Sleeping

App Files Community

yzhouchen001 committed on 28 days ago

Commit

19a4dfc

1 Parent(s): 60219be

update

Browse files

Files changed (23) hide show

app_utils/model_utils.py +4 -4
app_utils/viz_utils.py +3 -12
flare/data/datasets.py +19 -10
flare/models/contrastive.py +184 -403
flare/models/mol_encoder.py +2 -1
flare/models/spec_encoder.py +2 -3
flare/params_filipGlobal.yaml +95 -0
flare/run.sh +3 -3
flare/subformula_assign/run.sh +8 -3
flare/subformula_assign/utils/chem_utils.py +4 -0
flare/test.py +9 -1
flare/tune.py +1 -1
flare/utils/case_study_utils.py +193 -0
flare/utils/general.py +94 -58
flare/utils/loss.py +95 -4
flare/utils/models.py +3 -1
flare/utils/mol_search.py +367 -0
notebooks/UMAP_spectra_embeddings.ipynb +0 -0
notebooks/fine-grained_vs_global.ipynb +6 -2
notebooks/good_vs_bad_instances.ipynb +0 -0
notebooks/mol-spec_visualization.ipynb +0 -0
notebooks/results.ipynb +233 -0
notebooks/spectra_sim.ipynb +0 -0

app_utils/model_utils.py CHANGED Viewed

@@ -3,8 +3,8 @@ import sys
 # sys.path.insert(0, "/data/yzhouc01/FILIP-MS")
 from rdkit import RDLogger
-from mvp.utils.data import get_spec_featurizer, get_mol_featurizer, get_ms_dataset
-from mvp.utils.models import get_model
 import yaml
@@ -15,7 +15,7 @@ lg.setLevel(RDLogger.CRITICAL)
 # Load model and data
 def load_model_components():
-    param_pth = 'hparams.yaml'
     with open(param_pth) as f:
         params = yaml.load(f, Loader=yaml.FullLoader)
@@ -24,7 +24,7 @@ def load_model_components():
     # load model
-    checkpoint_pth = "epoch=1993-train_loss=0.10.ckpt"
     params['checkpoint_pth'] = checkpoint_pth
     model = get_model(params['model'], params)

 # sys.path.insert(0, "/data/yzhouc01/FILIP-MS")
 from rdkit import RDLogger
+from flare.utils.data import get_spec_featurizer, get_mol_featurizer, get_ms_dataset
+from flare.utils.models import get_model
 import yaml
 # Load model and data
 def load_model_components():
+    param_pth = '/data/yzhouc01/FILIP-MS/experiments/20250913_optimized_filip-model/lightning_logs/version_0/hparams.yaml'
     with open(param_pth) as f:
         params = yaml.load(f, Loader=yaml.FullLoader)
     # load model
+    checkpoint_pth = "/data/yzhouc01/FILIP-MS/experiments/20250913_optimized_filip-model/epoch=1993-train_loss=0.10.ckpt"
     params['checkpoint_pth'] = checkpoint_pth
     model = get_model(params['model'], params)

app_utils/viz_utils.py CHANGED Viewed

@@ -6,7 +6,9 @@ import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 from rdkit import Chem
 from rdkit.Chem import rdDepictor
-import pandas as pd
 def mol_to_graph_coords(mol):
     """Return atom coordinates and bond list for a molecule."""
@@ -16,12 +18,6 @@ def mol_to_graph_coords(mol):
     bonds = [(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in mol.GetBonds()]
     return coords, bonds
-import torch
-import torch.nn.functional as F
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
 def interactive_attention_visualization(
     spectral_embeds,
     graph_embeds,
@@ -68,7 +64,6 @@ def interactive_attention_visualization(
         hoverinfo='text',
         customdata=list(range(num_peaks)),  # actual peak indices
     )
     # --- Graph nodes ---
     graph_nodes = go.Scatter(
         x=atom_x,
@@ -127,10 +122,6 @@ def interactive_attention_visualization(
 # ------------------------
 # Model set up
 # ------------------------
-from mvp.subformula_assign.utils.spectra_utils import assign_subforms
-import matchms
 def run(ms, smiles, formula, precursor_mz, adduct, spec_featurizer, mol_featurizer,model, mass_diff_thresh=20, precursor_intensity=1.1):
     # step 1 - label peaks with formula, setup matchms spectrum

 from plotly.subplots import make_subplots
 from rdkit import Chem
 from rdkit.Chem import rdDepictor
+from flare.subformula_assign.utils.spectra_utils import assign_subforms
+import matchms
 def mol_to_graph_coords(mol):
     """Return atom coordinates and bond list for a molecule."""
     bonds = [(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in mol.GetBonds()]
     return coords, bonds
 def interactive_attention_visualization(
     spectral_embeds,
     graph_embeds,
         hoverinfo='text',
         customdata=list(range(num_peaks)),  # actual peak indices
     )
     # --- Graph nodes ---
     graph_nodes = go.Scatter(
         x=atom_x,
 # ------------------------
 # Model set up
 # ------------------------
 def run(ms, smiles, formula, precursor_mz, adduct, spec_featurizer, mol_featurizer,model, mass_diff_thresh=20, precursor_intensity=1.1):
     # step 1 - label peaks with formula, setup matchms spectrum

flare/data/datasets.py CHANGED Viewed

@@ -83,7 +83,7 @@ class JESTR1_MassSpecDataset(MassSpecDataset):
         spec = self.spectra[i]
         metadata = self.metadata.iloc[i]
-        mol = metadata["smiles"]
         # Apply all transformations to the spectrum
         item = {}
@@ -254,7 +254,7 @@ class ContrastiveDataset(Dataset):
         return item
     @staticmethod
-    def collate_fn(batch: T.Iterable[dict], spec_enc: str, spectra_view: str, stage=None) -> dict:
         mol_key = 'cand' if stage == Stage.TEST else 'mol'
         non_standard_collate = ['mol', 'cand', 'aug_cands', 'cons_spec', 'aug_cands_fp', 'NL_spec']
         require_pad = False
@@ -277,15 +277,16 @@ class ContrastiveDataset(Dataset):
                     raise
         # batch graphs
-        batch_mol = []
-        batch_mol_nodes= []
-        for item in batch:
-            batch_mol.append(item[mol_key])
-            batch_mol_nodes.append(item[mol_key].num_nodes())
-        collated_batch[mol_key] = dgl.batch(batch_mol)
-        collated_batch['mol_n_nodes'] = batch_mol_nodes
         # pad peaks/formulas
         if require_pad:
@@ -347,7 +348,15 @@ class ExpandedRetrievalDataset:
         self.candidates = {}
         for s, cand in candidates.items():
-            self.candidates[s] = [c for c in cand if '.' not in c]
         self.spec_cand = [] #(spec index, cand_smiles, true_label)

         spec = self.spectra[i]
         metadata = self.metadata.iloc[i]
+        mol = metadata["smiles"] if 'smiles' in metadata else metadata["identifier"]
         # Apply all transformations to the spectrum
         item = {}
         return item
     @staticmethod
+    def collate_fn(batch: T.Iterable[dict], spec_enc: str, spectra_view: str, stage=None, batch_mol: bool = True) -> dict:
         mol_key = 'cand' if stage == Stage.TEST else 'mol'
         non_standard_collate = ['mol', 'cand', 'aug_cands', 'cons_spec', 'aug_cands_fp', 'NL_spec']
         require_pad = False
                     raise
         # batch graphs
+        if batch_mol:
+            batch_mol = []
+            batch_mol_nodes= []
+            for item in batch:
+                batch_mol.append(item[mol_key])
+                batch_mol_nodes.append(item[mol_key].num_nodes())
+            collated_batch[mol_key] = dgl.batch(batch_mol)
+            collated_batch['mol_n_nodes'] = batch_mol_nodes
         # pad peaks/formulas
         if require_pad:
         self.candidates = {}
         for s, cand in candidates.items():
+            clean_cands = []
+            for c in cand:
+                try:
+                    if '.' not in c:
+                        clean_cands.append(c)
+                except:
+                    print(f"Error in processing candidate {c} for smiles {s}")
+                    pass
+            self.candidates[s] = clean_cands
         self.spec_cand = [] #(spec index, cand_smiles, true_label)

flare/models/contrastive.py CHANGED Viewed

@@ -10,7 +10,7 @@ from massspecgym.models.base import Stage
 from massspecgym import utils
 from torch.nn.utils.rnn import pad_sequence
-from flare.utils.loss import contrastive_loss, cand_spec_sim_loss, fp_loss, cons_spec_loss, filip_loss_with_mask
 import flare.utils.models as model_utils
 from flare.utils.general import pad_graph_nodes, filip_similarity_batch
@@ -18,14 +18,17 @@ from flare.models.encoders import CrossAttention
 import torch.nn.functional as F
 from torch_geometric.nn import global_mean_pool
 class ContrastiveModel(RetrievalMassSpecGymModel):
     def __init__(
         self,
         **kwargs
     ):
         super().__init__(**kwargs)
         self.save_hyperparameters()
         if 'use_fp' not in self.hparams:
             self.hparams.use_fp = False
@@ -42,13 +45,26 @@ class ContrastiveModel(RetrievalMassSpecGymModel):
         self.result_dct = defaultdict(lambda: defaultdict(list))
     def forward(self, batch, stage):
-        g = batch['cand'] if stage == Stage.TEST else batch['mol']
         spec = batch[self.spec_view]
         n_peaks = batch['n_peaks'] if 'n_peaks' in batch else None
         spec_enc = self.spec_enc_model(spec, n_peaks)
         fp = batch['fp'] if self.hparams.use_fp else None
         mol_enc = self.mol_enc_model(g, fp=fp)
         return spec_enc, mol_enc
@@ -61,20 +77,6 @@ class ContrastiveModel(RetrievalMassSpecGymModel):
         losses['contr_loss'] = contr_loss.detach().item()
         loss+=contr_loss
-        # if self.hparams.pred_fp:
-        #     fp_loss_val = self.loss_wts['fp_wt'] *self.fp_loss(output['fp'], batch['fp'])
-        #     loss+= fp_loss_val
-        #     losses['fp_loss'] = fp_loss_val.detach().item()
-        # if 'aug_cand_enc' in output:
-        #     aug_cand_loss = self.loss_wts['aug_cand_wt'] * cand_spec_sim_loss(spec_enc, output['aug_cand_enc'])
-        #     loss+= aug_cand_loss
-        #     losses['aug_cand_loss'] = aug_cand_loss.detach().item()
-        # if 'ind_spec' in output:
-        #     spec_loss = self.loss_wts['cons_spec_wt'] * self.cons_loss(spec_enc, output['ind_spec'])
-        #     loss+=spec_loss
-        #     losses['cons_spec_loss'] = spec_loss.detach().item()
         losses['loss'] = loss
@@ -108,7 +110,7 @@ class ContrastiveModel(RetrievalMassSpecGymModel):
         # total loss
         self.log(
             f'{stage.to_pref()}loss',
-            outputs['loss'],
             batch_size=len(batch['identifier']),
             sync_dist=True,
             prog_bar=True,
@@ -146,11 +148,6 @@ class ContrastiveModel(RetrievalMassSpecGymModel):
             self.result_dct[i]['candidates'].extend(cands)
             self.result_dct[i]['scores'].extend(scores.cpu().tolist())
             self.result_dct[i]['labels'].extend([x.cpu().item() for x in l])
-        # # external test case only
-        # for i, cands, scores in zip(outputs['identifiers'], outputs['cand_smiles'], outputs['scores']):
-        #     self.result_dct[i.cpu().item()]['candidates'].extend(cands)
-        #     self.result_dct[i.cpu().item()]['scores'].extend(scores.cpu().tolist())
     def _compute_rank(self, scores, labels):
         if not any(labels):
@@ -160,12 +157,21 @@ class ContrastiveModel(RetrievalMassSpecGymModel):
         rank = np.count_nonzero(scores >=target_score)
         return rank
     def on_test_epoch_end(self) -> None:
         self.df_test = pd.DataFrame.from_dict(self.result_dct, orient='index').reset_index().rename(columns={'index': 'identifier'})
         # Compute rank
-        self.df_test['rank'] = self.df_test.apply(lambda row: self._compute_rank(row['scores'], row['labels']), axis=1)
         if not self.df_test_path:
             self.df_test_path = os.path.join(self.hparams['experiment_dir'], 'result.pkl')
         self.df_test.to_pickle(self.df_test_path)
@@ -176,160 +182,6 @@ class ContrastiveModel(RetrievalMassSpecGymModel):
             {"monitor": f"{Stage.VAL.to_pref()}loss", "mode": "min", "early_stopping": False}, # monitor val loss
         ]
         return monitors
-# class MultiViewContrastive(ContrastiveModel):
-#     def __init__(self,
-#                  **kwargs):
-#         super().__init__(**kwargs)
-#         # build fingerprint encoder model
-#         if self.hparams.use_fp:
-#             self.fp_enc_model = model_utils.get_fp_enc_model(self.hparams)
-#         # build NL encoder model
-#         if self.hparams.use_NL_spec:
-#             self.NL_enc_model = model_utils.get_spec_encoder(self.hparams.spec_enc, self.hparams)
-#     def forward(self, batch, stage):
-#         g = batch['cand'] if stage == Stage.TEST else batch['mol']
-#         spec = batch[self.spec_view]
-#         n_peaks = batch['n_peaks'] if 'n_peaks' in batch else None
-#         spec_enc = self.spec_enc_model(spec, n_peaks)
-#         mol_enc = self.mol_enc_model(g)
-#         views = {'spec_enc': spec_enc, 'mol_enc': mol_enc}
-#         if self.hparams.use_fp:
-#             fp_enc = self.fp_enc_model(batch['fp'])
-#             views['fp_enc'] = fp_enc
-#         if self.hparams.use_cons_spec:
-#             spec = batch['cons_spec']
-#             n_peaks = batch['cons_n_peaks'] if 'cons_n_peaks' in batch else None
-#             spec_enc = self.cons_spec_enc_model(spec, n_peaks)
-#             views['cons_spec_enc'] = spec_enc
-#         if self.hparams.use_NL_spec:
-#             spec = batch['NL_spec']
-#             n_peaks = batch['NL_n_peaks'] if 'NL_n_peaks' in batch else None
-#             spec_enc = self.NL_enc_model(spec, n_peaks)
-#             views['NL_spec_enc'] = spec_enc
-#         return views
-#     def step(
-#         self, batch: dict, stage= Stage.NONE):
-#         # Compute spectra and mol encoding
-#         views = self.forward(batch, stage)
-#         if stage == Stage.TEST:
-#             return views
-#         # Calculate loss
-#         losses = self.compute_loss(batch, views)
-#         return losses
-#     def compute_loss(self, batch: dict, views: dict):
-#         loss = 0
-#         losses = {}
-#         for v1, v2 in self.hparams.contr_views:
-#             contr_loss, cong_loss, noncong_loss = contrastive_loss(views[v1], views[v2], self.hparams.contr_temp)
-#             loss+=contr_loss
-#             losses[f'{v1[:-4]}-{v2[:-4]}_contr_loss'] = contr_loss.detach().item()
-#             losses[f'{v1[:-4]}-{v2[:-4]}_cong_loss'] = cong_loss.detach().item()
-#             losses[f'{v1[:-4]}-{v2[:-4]}_noncong_loss'] = noncong_loss.detach().item()
-#         losses['loss'] = loss
-#         return losses
-#     def on_batch_end(self, outputs, batch: dict, batch_idx: int, stage: Stage) -> None:
-#         # total loss
-#         self.log(
-#             f'{stage.to_pref()}loss',
-#             outputs['loss'],
-#             batch_size=len(batch['identifier']),
-#             sync_dist=True,
-#             prog_bar=True,
-#             on_epoch=True,
-#             # on_step=True
-#         )
-#         for v1, v2 in self.hparams.contr_views:
-#             self.log(
-#             f'{stage.to_pref()}{v1[:-4]}-{v2[:-4]}_contr_loss',
-#             outputs[f'{v1[:-4]}-{v2[:-4]}_contr_loss'],
-#             batch_size=len(batch['identifier']),
-#             sync_dist=True,
-#             on_epoch=True,
-#         )
-#             self.log(
-#             f'{stage.to_pref()}{v1[:-4]}-{v2[:-4]}_cong_loss',
-#             outputs[f'{v1[:-4]}-{v2[:-4]}_cong_loss'],
-#             batch_size=len(batch['identifier']),
-#             sync_dist=True,
-#             on_epoch=True,
-#         )
-#             self.log(
-#             f'{stage.to_pref()}{v1[:-4]}-{v2[:-4]}_noncong_loss',
-#             outputs[f'{v1[:-4]}-{v2[:-4]}_noncong_loss'],
-#             batch_size=len(batch['identifier']),
-#             sync_dist=True,
-#             on_epoch=True,
-#         )
-#     def test_step(self, batch):
-#         # Unpack inputs
-#         identifiers = batch['identifier']
-#         cand_smiles = batch['cand_smiles']
-#         id_to_ct = defaultdict(int)
-#         for i in identifiers: id_to_ct[i]+=1
-#         batch_ptr = torch.tensor(list(id_to_ct.values()))
-#         outputs = self.step(batch, stage=Stage.TEST)
-#         scores = {}
-#         for v1, v2 in self.hparams.contr_views:
-#             # if 'cons_spec_enc' in (v1, v2):
-#             #     continue
-#             v1_enc = outputs[v1]
-#             v2_enc = outputs[v2]
-#             s = nn.functional.cosine_similarity(v1_enc, v2_enc)
-#             scores[f'{v1[:-4]}-{v2[:-4]}_scores'] = torch.split(s, list(id_to_ct.values()))
-#         indexes = utils.batch_ptr_to_batch_idx(batch_ptr)
-#         cand_smiles = utils.unbatch_list(batch['cand_smiles'], indexes)
-#         labels = utils.unbatch_list(batch['label'], indexes)
-#         return dict(identifiers=list(id_to_ct.keys()), scores=scores, cand_smiles=cand_smiles, labels=labels)
-#     def on_test_batch_end(self, outputs, batch: dict, batch_idx: int, stage: Stage = Stage.TEST) -> None:
-#         # save scores
-#         for i, cands, l in zip(outputs['identifiers'], outputs['cand_smiles'], outputs['labels']):
-#             self.result_dct[i]['candidates'].extend(cands)
-#             self.result_dct[i]['labels'].extend([x.cpu().item() for x in l])
-#         for v1, v2 in self.hparams.contr_views:
-#             for i, scores in zip(outputs['identifiers'], outputs['scores'][f'{v1[:-4]}-{v2[:-4]}_scores']):
-#                 self.result_dct[i][f'{v1[:-4]}-{v2[:-4]}_scores'].extend(scores.cpu().tolist())
-#     def on_test_epoch_end(self) -> None:
-#         self.df_test = pd.DataFrame.from_dict(self.result_dct, orient='index').reset_index().rename(columns={'index': 'identifier'})
-#         # Compute rank
-#         for v1, v2 in self.hparams.contr_views:
-#             self.df_test[f'{v1[:-4]}-{v2[:-4]}_rank'] = self.df_test.apply(lambda row: self._compute_rank(row[f'{v1[:-4]}-{v2[:-4]}_scores'], row['labels']), axis=1)
-#         self.df_test.to_pickle(self.df_test_path)
 class FilipContrastive(ContrastiveModel):
     def __init__(self,
@@ -381,7 +233,7 @@ class FilipContrastive(ContrastiveModel):
         # Calculate scores
         indexes = utils.batch_ptr_to_batch_idx(batch_ptr)
-        scores = filip_similarity_batch(spec_enc, mol_enc, spec_mask, mol_masks)
         scores = torch.split(scores, list(id_to_ct.values()))
         cand_smiles = utils.unbatch_list(batch['cand_smiles'], indexes)
@@ -389,248 +241,177 @@ class FilipContrastive(ContrastiveModel):
         return dict(identifiers=list(id_to_ct.keys()), scores=scores, cand_smiles=cand_smiles, labels=labels)
-# class MultiViewFineTuning(MultiViewContrastive):
-#     def __init__(self,
-#                  **kwargs):
-#         super().__init__(**kwargs)
-#         # load preptrained spec, mol, fp encoders
-#         checkpoint = torch.load(self.hparams.partial_checkpoint)
-#         state_dict = state_dict = {k[len("spec_enc_model."):]: v for k, v in checkpoint['state_dict'].items() if k.startswith("spec_enc_model")}
-#         self.spec_enc_model.load_state_dict(state_dict) # trained on consensus spectra
-#         state_dict = state_dict = {k[len("mol_enc_model."):]: v for k, v in checkpoint['state_dict'].items() if k.startswith("mol_enc_model")}
-#         self.mol_enc_model.load_state_dict(state_dict)
-#         state_dict = state_dict = {k[len("fp_enc_model."):]: v for k, v in checkpoint['state_dict'].items() if k.startswith("fp_enc_model")}
-#         self.fp_enc_model.load_state_dict(state_dict)
-#         self.encoding_views = ['spec_enc', 'mol_enc', 'fp_enc']
-#         self.loss_fn = nn.BCELoss()
-#         # freeze encoders
-#         for param in self.mol_enc_model.parameters():
-#             param.requires_grad = False
-#         for param in self.spec_enc_model.parameters():
-#             param.requires_grad = False
-#         for param in self.fp_enc_model.parameters():
-#             param.requires_grad = False
-#         for param in self.cons_spec_enc_model.parameters():
-#             param.requires_grad = False
-#         # n_views = 2
-#         # if self.hparams.use_fp:
-#         #     n_views+=1
-#         # in_dim = self.hparams.final_embedding_dim*n_views
-#         in_dim = self.hparams.final_embedding_dim *2 + 2
-#         self.classifier_model = nn.Sequential(
-#             nn.Linear(in_dim, 512),
-#             nn.ReLU(),
-#             nn.BatchNorm1d(512),
-#             nn.Dropout(0.3),
-#             nn.Linear(512, 256),
-#             nn.ReLU(),
-#             nn.BatchNorm1d(256),
-#             nn.Dropout(0.3),
-#             nn.Linear(256, 1),
-#             nn.Sigmoid()
-#         )
-#         self.noise_std = 0.01
-#     def _add_noise(self, x):
-#         noise = torch.randn_like(x) * self.noise_std
-#         return x + noise
-#     def forward(self, batch, stage):
-#         matching_views = super().forward(batch, stage)
-#         # matching_enc = torch.concat((matching_views['spec_enc'], matching_views['mol_enc'], matching_views['fp_enc']), dim=-1)
-#         # enc1 = matching_views['spec_enc'] - matching_views['mol_enc']
-#         # enc2 = matching_views['spec_enc'] - matching_views['fp_enc']
-#         # matching_enc = torch.concat((enc1, enc2), dim=-1)
-#         view1 = matching_views['spec_enc']
-#         view2 = matching_views['mol_enc']
-#         view3 = matching_views['fp_enc']
-#         if stage == Stage.TRAIN:
-#             view1, view2, view3 = map(self._add_noise, (view1, view2, view3))
-#         pairwise_diffs = torch.cat([
-#             torch.abs(view1 - view2),
-#             torch.abs(view1 - view3),
-#         ], dim=-1)
-#         pairwise_sims = torch.cat([
-#             (view1 * view2).sum(dim=-1, keepdim=True),
-#             (view1 * view3).sum(dim=-1, keepdim=True),
-#         ], dim=-1)
-#         matching_enc = torch.cat([pairwise_diffs, pairwise_sims], dim=-1)
-#         matching_scores = self.classifier_model(matching_enc)
-#         if stage == Stage.TEST:
-#             return dict(matching_scores = matching_scores)
-#         view1 = view1.repeat_interleave(self.hparams.aug_cands_size, dim=0)
-#         view2 = self.mol_enc_model(batch['aug_cands'])
-#         view3= self.fp_enc_model(batch['aug_cands_fp'])
-#         if stage == Stage.TRAIN:
-#             view1, view2, view3 = map(self._add_noise, (view1, view2, view3))
-#         pairwise_diffs = torch.cat([
-#             torch.abs(view1 - view2),
-#             torch.abs(view1 - view3),
-#         ], dim=-1)
-#         pairwise_sims = torch.cat([
-#             (view1 * view2).sum(dim=-1, keepdim=True),
-#             (view1 * view3).sum(dim=-1, keepdim=True),
-#         ], dim=-1)
-#         nonmatching_enc = torch.cat([pairwise_diffs, pairwise_sims], dim=-1)
-#         nonmatching_scores = self.classifier_model(nonmatching_enc)
-#         return dict(matching_scores=matching_scores, nonmatching_scores=nonmatching_scores)
-#     def compute_loss(self, matching_scores, nonmatching_scores):
-#         matching_loss = self.loss_fn(matching_scores, torch.ones_like(matching_scores).to(matching_scores.device))
-#         nonmatching_loss = self.loss_fn(nonmatching_scores, torch.zeros_like(nonmatching_scores).to(nonmatching_scores.device))
-#         loss = matching_loss + (1/self.hparams.aug_cands_size)*nonmatching_loss
-#         return dict(loss=loss)
-#     def step(
-#         self, batch: dict, stage= Stage.NONE):
-#         output = self.forward(batch, stage)
-#         if stage == Stage.TEST:
-#             return output
-#         # Calculate loss
-#         losses = self.compute_loss(output['matching_scores'], output['nonmatching_scores'])
-#         return losses
-#     def test_step(self, batch):
-#         # Unpack inputs
-#         identifiers = batch['identifier']
-#         cand_smiles = batch['cand_smiles']
-#         id_to_ct = defaultdict(int)
-#         for i in identifiers: id_to_ct[i]+=1
-#         batch_ptr = torch.tensor(list(id_to_ct.values()))
-#         outputs = self.step(batch, stage=Stage.TEST)
-#         scores = outputs['matching_scores']
-#         indexes = utils.batch_ptr_to_batch_idx(batch_ptr)
-#         cand_smiles = utils.unbatch_list(batch['cand_smiles'], indexes)
-#         labels = utils.unbatch_list(batch['label'], indexes)
-#         return dict(identifiers=list(id_to_ct.keys()), scores=scores, cand_smiles=cand_smiles, labels=labels)
-#     def on_batch_end(self, outputs, batch: dict, batch_idx: int, stage: Stage) -> None:
-#         # total loss
-#         self.log(
-#             f'{stage.to_pref()}loss',
-#             outputs['loss'],
-#             batch_size=len(batch['identifier']),
-#             sync_dist=True,
-#             prog_bar=True,
-#             on_epoch=True,
-#             # on_step=True
-#         )
-#     def on_test_batch_end(self, outputs, batch: dict, batch_idx: int, stage: Stage = Stage.TEST) -> None:
-#         ContrastiveModel.on_test_batch_end(self, outputs, batch, batch_idx, stage)
-#     def on_test_epoch_end(self):
-#         self.df_test = pd.DataFrame.from_dict(self.result_dct, orient='index').reset_index().rename(columns={'index': 'identifier'})
-#         # self.df_test.to_csv(self.hparams.resutl)
-#         print(self.df_test_path)
-#         self.df_test.to_pickle(self.df_test_path)
-#         # ContrastiveModel.on_test_epoch_end(self)
-#     def get_checkpoint_monitors(self) -> T.List[dict]:
-#         monitors = [
-#             {"monitor": f"{Stage.VAL.to_pref()}loss", "mode": "min", "early_stopping": True}
-#         ]
-#         return monitors
-#     def configure_optimizers(self):
-#         return torch.optim.Adam(
-#             self.classifier_model.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay
-#         )
-# class IndSpecEncoder(ContrastiveModel):
-#     """ Trains a spectra encoder that maps to a pretrained spec encoder"""
-#     def __init__(
-#             self,
-#             **kwargs
-#     ):
-#         super().__init__(**kwargs)
-#         # initialize ind_spec_encoder and loss
-#         self.ind_spec_enc_model = model_utils.get_spec_encoder(self.hparams.spec_enc, self.hparams)
-#         self.cons_loss = cons_spec_loss(self.hparams.cons_loss_type)
-#         # load preptrained spec and mol encoders
-#         checkpoint = torch.load(self.hparams.partial_checkpoint)
-#         state_dict = state_dict = {k[len("spec_enc_model."):]: v for k, v in checkpoint['state_dict'].items() if k.startswith("spec_enc_model")}
-#         self.spec_enc_model.load_state_dict(state_dict) # trained on consensus spectra
-#         state_dict = state_dict = {k[len("mol_enc_model."):]: v for k, v in checkpoint['state_dict'].items() if k.startswith("mol_enc_model")}
-#         self.mol_enc_model.load_state_dict(state_dict)
-#         # freeze cons spec and mol encoders
-#         for param in self.mol_enc_model.parameters():
-#             param.requires_grad = False
-#         for param in self.spec_enc_model.parameters():
-#             param.requires_grad = False
-#     def forward(self, batch, stage):
-#         spec = batch[self.spec_view]
-#         n_peaks = batch['n_peaks']
-#         spec_enc = self.ind_spec_enc_model(spec, n_peaks)
-#         return spec_enc
-#     def compute_loss(self, spec_enc, cons_spec_enc):
-#         loss = self.cons_loss(spec_enc, cons_spec_enc)
-#         return dict(loss=loss)
-#     def step(self, batch: dict, stage=Stage.NONE):
-#         self.spec_enc_model.eval()
-#         self.mol_enc_model.eval()
-#         spec_enc = self.forward(batch, stage)
-#         if stage == Stage.TEST:
-#             mol_enc = self.mol_enc_model(batch['cand'])
-#             return dict(spec_enc=spec_enc, mol_enc=mol_enc)
-#         cons_spec_enc = self.spec_enc_model(batch['cons_spec'], batch['cons_n_peaks'])
-#         losses = self.compute_loss(spec_enc, cons_spec_enc)
-#         return losses
-#     def configure_optimizers(self):
-#         return torch.optim.Adam(
-#             self.ind_spec_enc_model.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay
-#         )
-#     def get_checkpoint_monitors(self) -> T.List[dict]:
-#         monitors = [
-#             {"monitor": f"{Stage.VAL.to_pref()}loss", "mode": "min", "early_stopping": True}
-#         ]
-#         return monitors
 class CrossAttenContrastive(ContrastiveModel):
     def __init__(

 from massspecgym import utils
 from torch.nn.utils.rnn import pad_sequence
+from flare.utils.loss import contrastive_loss, filip_loss_with_mask, global_infonce_loss, pcgrad_combine
 import flare.utils.models as model_utils
 from flare.utils.general import pad_graph_nodes, filip_similarity_batch
 import torch.nn.functional as F
 from torch_geometric.nn import global_mean_pool
+import torch, dgllife
 class ContrastiveModel(RetrievalMassSpecGymModel):
     def __init__(
         self,
+        external_test: bool = False,
         **kwargs
     ):
         super().__init__(**kwargs)
         self.save_hyperparameters()
+        self.external_test = external_test
         if 'use_fp' not in self.hparams:
             self.hparams.use_fp = False
         self.result_dct = defaultdict(lambda: defaultdict(list))
     def forward(self, batch, stage):
+        if 'cand' in batch:
+            g = batch['cand']
+        elif 'mol' in batch:
+            g = batch['mol']
+        else:
+            g = None
         spec = batch[self.spec_view]
         n_peaks = batch['n_peaks'] if 'n_peaks' in batch else None
         spec_enc = self.spec_enc_model(spec, n_peaks)
+        if g is None:
+            mol_enc = None
+            return spec_enc, mol_enc
         fp = batch['fp'] if self.hparams.use_fp else None
+        f = self.mol_enc_model.GNN(g, g.ndata['h'])
         mol_enc = self.mol_enc_model(g, fp=fp)
         return spec_enc, mol_enc
         losses['contr_loss'] = contr_loss.detach().item()
         loss+=contr_loss
         losses['loss'] = loss
         # total loss
         self.log(
             f'{stage.to_pref()}loss',
+           outputs['loss'],
             batch_size=len(batch['identifier']),
             sync_dist=True,
             prog_bar=True,
             self.result_dct[i]['candidates'].extend(cands)
             self.result_dct[i]['scores'].extend(scores.cpu().tolist())
             self.result_dct[i]['labels'].extend([x.cpu().item() for x in l])
     def _compute_rank(self, scores, labels):
         if not any(labels):
         rank = np.count_nonzero(scores >=target_score)
         return rank
+    def _get_top_cand(self, scores, candidates):
+        return candidates[np.argmax(np.array(scores))]
     def on_test_epoch_end(self) -> None:
         self.df_test = pd.DataFrame.from_dict(self.result_dct, orient='index').reset_index().rename(columns={'index': 'identifier'})
         # Compute rank
+        if not self.external_test:
+            self.df_test['rank'] = self.df_test.apply(lambda row: self._compute_rank(row['scores'], row['labels']), axis=1)
+        if self.external_test:
+            self.df_test.drop('labels', axis=1, inplace=True)
+            self.df_test['top_cand'] = self.df_test.apply(lambda row: self._get_top_cand(row['scores'], row['candidates']), axis=1)
         if not self.df_test_path:
             self.df_test_path = os.path.join(self.hparams['experiment_dir'], 'result.pkl')
         self.df_test.to_pickle(self.df_test_path)
             {"monitor": f"{Stage.VAL.to_pref()}loss", "mode": "min", "early_stopping": False}, # monitor val loss
         ]
         return monitors
 class FilipContrastive(ContrastiveModel):
     def __init__(self,
         # Calculate scores
         indexes = utils.batch_ptr_to_batch_idx(batch_ptr)
+        scores = filip_similarity_batch(spec_enc, mol_enc, spec_mask, mol_mask)
         scores = torch.split(scores, list(id_to_ct.values()))
         cand_smiles = utils.unbatch_list(batch['cand_smiles'], indexes)
         return dict(identifiers=list(id_to_ct.keys()), scores=scores, cand_smiles=cand_smiles, labels=labels)
+# ============================================================
+#        Combined FILIP + Global InfoNCE
+# ============================================================
class FilipGlobalContrastive(ContrastiveModel):
    """Contrastive model combining a token-level FILIP loss with a pooled InfoNCE loss.

    At test time every candidate is scored with several variants (fine-grained,
    global, and three combinations of the two) so they can be compared from a
    single test run.
    """

    # Prefix of the per-variant score keys stored in ``self.result_dct``.
    _SCORE_PREFIX = "scores_"

    def __init__(self, loss_mode="sum", loss_weight=1.0, agg_fn="mean", **kwargs):
        """
        Args:
            loss_mode: str, one of ["sum", "weighted", "pcgrad"]
            loss_weight: weight for global loss if using weighted sum
            agg_fn: aggregation function for global InfoNCE ("mean", "max", "cls")
            **kwargs: forwarded unchanged to ``ContrastiveModel.__init__``.
        """
        super().__init__(**kwargs)
        self.loss_mode = loss_mode
        self.loss_weight = loss_weight
        self.agg_fn = agg_fn

    # -------------- loss computation --------------
    def compute_loss(self, batch: dict, spec_enc, mol_enc, spec_mask, mol_mask, stage=Stage.NONE):
        """Compute the fine-grained and global losses and combine them.

        Args:
            batch: the input batch (not read here; kept for interface parity).
            spec_enc, mol_enc: token embeddings of the two modalities.
            spec_mask, mol_mask: boolean masks, True for valid tokens.
            stage: current stage; only consulted in ``"pcgrad"`` mode.

        Returns:
            dict with ``"loss"`` (the combined loss) plus detached
            ``"loss_fine"`` and ``"loss_global"`` for logging.

        Raises:
            ValueError: if ``self.loss_mode`` is not a supported mode.
        """
        temperature = self.hparams.contr_temp

        # fine-grained FILIP loss
        loss_fine = filip_loss_with_mask(spec_enc, mol_enc, spec_mask, mol_mask, temperature)

        # global InfoNCE loss
        loss_global = global_infonce_loss(spec_enc, mol_enc, spec_mask, mol_mask,
                                          temperature=temperature, agg_fn=self.agg_fn)

        # choose combination mode
        if self.loss_mode == "sum":
            loss = loss_fine + loss_global
        elif self.loss_mode == "weighted":
            loss = loss_fine + self.loss_weight * loss_global
        elif self.loss_mode == "pcgrad":
            if stage == Stage.TRAIN:
                # PCGrad over both losses (training only)
                shared_params = list(self.spec_enc_model.parameters()) + list(self.mol_enc_model.parameters())
                self.zero_grad(set_to_none=True)
                loss = pcgrad_combine([loss_fine, loss_global], shared_params)
            else:
                # Outside training only the value is needed, so no graph is kept.
                loss = (loss_fine + loss_global).detach()
        else:
            raise ValueError(f"Unsupported loss_mode: {self.loss_mode}")

        return {
            "loss": loss,
            "loss_fine": loss_fine.detach(),
            "loss_global": loss_global.detach(),
        }

    def step(self, batch: dict, stage=Stage.NONE):
        """Encode a batch and return either the losses or, for TEST, the raw encodings.

        Molecule node embeddings are padded to a dense per-graph layout using
        ``batch["mol_n_nodes"]``. Spectrum positions whose features are all
        equal to -5 are masked out as padding.
        """
        spec_enc, mol_enc = self.forward(batch, stage)
        mol_enc, mol_mask = pad_graph_nodes(mol_enc, batch["mol_n_nodes"])
        spec_mask = ~torch.all((spec_enc == -5), dim=-1)

        if stage == Stage.TEST:
            return dict(spec_enc=spec_enc, mol_enc=mol_enc, spec_mask=spec_mask, mol_mask=mol_mask)

        return self.compute_loss(batch, spec_enc, mol_enc, spec_mask, mol_mask, stage=stage)

    @staticmethod
    def _pooled_unit_mean(tokens, mask):
        """Masked mean of L2-normalised tokens, mirroring the "mean" pooling of the global loss."""
        unit_tokens = F.normalize(tokens, dim=-1)
        valid_count = mask.sum(1, keepdim=True).clamp(min=1)  # avoid division by zero
        return (unit_tokens * mask.unsqueeze(-1)).sum(1) / valid_count

    # -------------- TEST step with different score variants --------------
    def test_step(self, batch, batch_idx):
        """Score every (spectrum, candidate) row of the batch with all score variants.

        Returns:
            dict with ``identifiers`` (unique, in first-seen order), ``scores``
            (variant name -> tuple of per-identifier score tensors),
            ``cand_smiles`` and ``labels`` (both split per identifier).
        """
        # Number of candidate rows per identifier, in first-seen order.
        # NOTE(review): splitting by these counts assumes rows of one identifier
        # are contiguous in the batch -- confirm against the test collate function.
        id_to_ct = defaultdict(int)
        for identifier in batch["identifier"]:
            id_to_ct[identifier] += 1
        counts = list(id_to_ct.values())
        batch_ptr = torch.tensor(counts, device=self.device)

        outputs = self.step(batch, stage=Stage.TEST)
        spec_enc = outputs["spec_enc"]
        mol_enc = outputs["mol_enc"]
        spec_mask = outputs["spec_mask"]
        mol_mask = outputs["mol_mask"]
        indexes = utils.batch_ptr_to_batch_idx(batch_ptr)

        # --- fine-grained score ---
        fine_scores = filip_similarity_batch(spec_enc, mol_enc, spec_mask, mol_mask)

        # --- global cosine score ---
        # Tokens are L2-normalised before pooling so that the test-time global
        # embedding matches the one optimised by the global InfoNCE loss.
        # NOTE(review): only "mean" pooling is mirrored here; other ``agg_fn``
        # choices are still scored with mean pooling.
        spec_global = self._pooled_unit_mean(spec_enc, spec_mask)
        mol_global = self._pooled_unit_mean(mol_enc, mol_mask)
        global_scores = F.cosine_similarity(spec_global, mol_global, dim=-1)

        # --- combined scores (for evaluation) ---
        scores_dict = {
            "fine": fine_scores,
            "global": global_scores,
            "sum": fine_scores + global_scores,
            "weighted": fine_scores + self.loss_weight * global_scores,
            "avg": 0.5 * (fine_scores + global_scores),  # simple average baseline
        }

        # split back per identifier
        scores_dict = {name: torch.split(values, counts) for name, values in scores_dict.items()}

        return dict(
            identifiers=list(id_to_ct.keys()),
            scores=scores_dict,
            cand_smiles=utils.unbatch_list(batch["cand_smiles"], indexes),
            labels=utils.unbatch_list(batch["label"], indexes),
        )

    def on_test_batch_end(self, outputs, batch: dict, batch_idx: int, stage: Stage = Stage.TEST) -> None:
        """
        Collects test batch outputs and stores them in self.result_dct.

        ``outputs["scores"]`` must be the dict of score variants produced by
        ``test_step`` (variant name -> per-identifier score tensors); each
        variant is accumulated under the key ``scores_<variant>``.
        """
        scores_out = outputs["scores"]
        per_identifier = zip(outputs["identifiers"], outputs["cand_smiles"], outputs["labels"])
        for k, (identifier, cands, lbls) in enumerate(per_identifier):
            # presumably result_dct is a nested defaultdict(list) created by the
            # base class -- verify against ContrastiveModel
            entry = self.result_dct[identifier]
            entry['candidates'].extend(cands)
            entry['labels'].extend([x.cpu().item() for x in lbls])
            for variant_name, score_list in scores_out.items():
                entry[f"{self._SCORE_PREFIX}{variant_name}"].extend(score_list[k].cpu().tolist())

    def on_test_epoch_end(self) -> None:
        """
        Combine results into one DataFrame with one row per identifier.
        Adds rank/top_cand columns for each score variant.

        With labels available (``external_test`` false) each variant gets a
        ``rank_<variant>`` column; otherwise a ``top_cand_<variant>`` column and
        no ``labels`` column. The frame is pickled to ``self.df_test_path``,
        defaulting to ``result_combined.pkl`` in the experiment directory.
        """
        records = []
        for identifier, val in self.result_dct.items():
            row = {"identifier": identifier, "candidates": val["candidates"]}
            if not self.external_test:
                row["labels"] = val["labels"]

            # For every scores_* key, compute rank or top candidate
            for key, scores in val.items():
                if not key.startswith(self._SCORE_PREFIX):
                    continue
                variant = key[len(self._SCORE_PREFIX):]
                if self.external_test:
                    row[f"top_cand_{variant}"] = self._get_top_cand(scores, val["candidates"])
                else:
                    row[f"rank_{variant}"] = self._compute_rank(scores, val["labels"])
                row[key] = scores

            records.append(row)

        self.df_test = pd.DataFrame(records)

        # Save once
        if not getattr(self, "df_test_path", None):
            self.df_test_path = os.path.join(self.hparams["experiment_dir"], "result_combined.pkl")

        self.df_test.to_pickle(self.df_test_path)
 class CrossAttenContrastive(ContrastiveModel):
     def __init__(

flare/models/mol_encoder.py CHANGED Viewed

@@ -12,7 +12,7 @@ class MolEnc(nn.Module):
         self.return_emb = False
-        if args.model in ('filipContrastive', 'crossAttenContrastive'):
             self.return_emb = True
         dropout = [args.gnn_dropout for _ in range(len(args.gnn_channels))]
@@ -46,4 +46,5 @@ class MolEnc(nn.Module):
         h1 = self.dropout(h1)
         return h1

         self.return_emb = False
+        if args.model in ('filipContrastive', 'crossAttenContrastive', 'filipGlobalContrastive'):
             self.return_emb = True
         dropout = [args.gnn_dropout for _ in range(len(args.gnn_channels))]
         h1 = self.dropout(h1)
         return h1

flare/models/spec_encoder.py CHANGED Viewed

@@ -111,7 +111,7 @@ class SpecFormulaTransformer(nn.Module):
             in_dim+=1
         self.returnEmb = False
-        if args.model in ('crossAttenContrastive', 'filipContrastive'):
             self.returnEmb = True
             assert(args.use_cls == False)
@@ -128,7 +128,7 @@ class SpecFormulaTransformer(nn.Module):
                 out_dim = args.final_embedding_dim
             self.fc = nn.Linear(args.formula_dims[-1], out_dim)
-    def forward(self, spec, n_peaks):
         h = self.formulaEnc(spec)
         pad = (spec == -5)
         pad = torch.all(pad, -1)
@@ -154,7 +154,6 @@ class SpecFormulaTransformer(nn.Module):
         h = self.fc(h)
         return h
 class SpecFormula_mz_Encoder(nn.Module):
     '''
     Encodes formula and mz_int

             in_dim+=1
         self.returnEmb = False
+        if args.model in ('crossAttenContrastive', 'filipContrastive', 'filipGlobalContrastive'):
             self.returnEmb = True
             assert(args.use_cls == False)
                 out_dim = args.final_embedding_dim
             self.fc = nn.Linear(args.formula_dims[-1], out_dim)
+    def forward(self, spec, n_peaks=None):
         h = self.formulaEnc(spec)
         pad = (spec == -5)
         pad = torch.all(pad, -1)
         h = self.fc(h)
         return h
 class SpecFormula_mz_Encoder(nn.Module):
     '''
     Encodes formula and mz_int

flare/params_filipGlobal.yaml ADDED Viewed

	@@ -0,0 +1,95 @@

+# Experiment setup
+job_key: ''
+run_name: 'filip-global'
+run_details: ""
+project_name: ''
+wandb_entity_name: 'mass-spec-ml'
+no_wandb: True
+seed: 42
+debug: False
+checkpoint_pth:
+# Training setup
+max_epochs: 2000
+accelerator: 'gpu'
+devices: [1]
+log_every_n_steps: 250
+val_check_interval: 1.0
+# Data paths
+candidates_pth: /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_mass.json # "../data/MassSpecGym/data/molecules/MassSpecGym_retrieval_candidates_formula.json"
+dataset_pth: /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # /data/yzhouc01/MVP/data/sample/data.tsv #/r/hassounlab/spectra_data/msgym/MassSpecGym.tsv #/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv # /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv # "../data/MassSpecGym/data/sample_data.tsv"
+subformula_dir_pth: /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default # /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default # /data/yzhouc01/FILIP-MS/data/magma # /r/hassounlab/msgym_sirius # /data/yzhouc01/MVP/data/MassSpecGym/data/subformulae_default #/data/yzhouc01/spectra_data/subformulae #"../data/MassSpecGym/data/subformulae_default"
+split_pth:
+fp_dir_pth:
+partial_checkpoint: ""
+# General hyperparameters
+batch_size: 64 #64
+lr: 2.881339661302105e-05 # 5.0e-05
+weight_decay: 1.8376229667330708e-05
+contr_temp: 0.022772534845886608 # 0.022772534845886608 # 0.05
+num_workers: 50
+# FILIP_GLOBAL model parameters
+loss_mode: "pcgrad"
+agg_fn: "mean"
+loss_weight: 1.1
+############################## Data transforms ##############################
+# - Spectra
+spectra_view: SpecFormula #SpecMzIntTokens #SpecFormula
+formula_source: 'default' # magma_1, magma_all, sirius, default
+#  1. Binner
+max_mz: 1000
+bin_width: 1
+mask_peak_ratio: 0.00
+# 2. SpecFormula
+element_list: ['H', 'C',  'O', 'N', 'P', 'S', 'Cl', 'F', 'Br', 'I', 'B', 'As', 'Si', 'Se']
+add_intensities: True
+# - Molecule
+molecule_view: "MolGraph"
+atom_feature: 'full'
+bond_feature: 'full'
+############################## Task and model ##############################
+task: 'retrieval'
+spec_enc: Transformer_Formula # Transformer_MzInt #Transformer_Formula
+mol_enc: "GNN"
+model:  filipGlobalContrastive #filipContrastive # "MultiviewContrastive"
+contr_views: [['spec_enc', 'mol_enc']]
+log_only_loss_at_stages: []
+df_test_path: ""
+# - Formula-based spec encoders
+formula_dropout:  0.2
+formula_dims: [512,256,512] #[512, 256, 512] #[64, 128, 256]
+cross_attn_heads: 2
+use_cls: False
+peak_dropout: 0.2
+formula_attn_heads: 4 # 2
+formula_transformer_layers: 2 #2
+# -- GAT params
+attn_heads: [12,12,12]
+# - Molecule encoder (GNN)
+gnn_channels: [128, 256, 512] #[64,128,512]
+gnn_type: "gcn"
+# num_gnn_layers: 3
+# gnn_hidden_dim: 512
+gnn_dropout: 0.23234950970370824 #0.3
+# - Spectra encoder (cross attention model)
+# final_embedding_dim: 512
+# fc_dropout: 0.4
+# - Spectra Token encoder (mz-int token model)
+# hidden_dims: [64, 256]

flare/run.sh CHANGED Viewed

@@ -1,3 +1,3 @@
-# python train.py
-python test.py --param_pth ../hparams.yaml
-# python test.py --candidates_pth /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_formula.json

+# python train.py --param_pth params_filipGlobal.yaml
+# python test.py --param_pth params_filipGlobal.yaml
+python test.py --param_pth params_filipGlobal.yaml --candidates_pth /r/hassounlab/spectra_data/msgym/molecules/MassSpecGym_retrieval_candidates_formula.json

flare/subformula_assign/run.sh CHANGED Viewed

@@ -1,6 +1,11 @@
-SPEC_FILES="/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv"
-OUTPUT_DIR="/data/yzhouc01/spectra_data/subformulae"
 MAX_FORMULAE=60
-LABELS_FILE="/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv"
 python assign_subformulae.py --spec-files $SPEC_FILES --output-dir $OUTPUT_DIR --max-formulae $MAX_FORMULAE --labels-file $LABELS_FILE

+# SPEC_FILES="/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv"
+# OUTPUT_DIR="/data/yzhouc01/spectra_data/subformulae"
+# MAX_FORMULAE=60
+# LABELS_FILE="/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex.tsv"
+SPEC_FILES="/data/yzhouc01/cancer/breast_cancer_data.tsv"
+OUTPUT_DIR="/data/yzhouc01/cancer/subformulae"
 MAX_FORMULAE=60
+LABELS_FILE="/data/yzhouc01/cancer/breast_cancer_data.tsv"
 python assign_subformulae.py --spec-files $SPEC_FILES --output-dir $OUTPUT_DIR --max-formulae $MAX_FORMULAE --labels-file $LABELS_FILE

flare/subformula_assign/utils/chem_utils.py CHANGED Viewed

@@ -181,6 +181,8 @@ def formula_to_dense(chem_formula: str) -> np.ndarray:
     """
     total_onehot = []
     for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
         # Convert num to int
         num = 1 if num == "" else int(num)
         one_hot = element_to_position[chem_symbol].reshape(1, -1)
@@ -257,6 +259,8 @@ def formula_to_dense(chem_formula: str) -> np.ndarray:
     """
     total_onehot = []
     for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
         # Convert num to int
         num = 1 if num == "" else int(num)
         one_hot = element_to_position[chem_symbol].reshape(1, -1)

     """
     total_onehot = []
     for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
+        if chem_symbol not in VALID_ELEMENTS: # yzc
+            continue
         # Convert num to int
         num = 1 if num == "" else int(num)
         one_hot = element_to_position[chem_symbol].reshape(1, -1)
     """
     total_onehot = []
     for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
+        if chem_symbol not in VALID_ELEMENTS: # yzc
+            continue
         # Convert num to int
         num = 1 if num == "" else int(num)
         one_hot = element_to_position[chem_symbol].reshape(1, -1)

flare/test.py CHANGED Viewed

@@ -29,6 +29,8 @@ parser.add_argument('--checkpoint_choice', type=str, default='train', choices=['
 parser.add_argument('--df_test_pth', type=str, help='result file name')
 parser.add_argument('--exp_dir', type=str)
 parser.add_argument('--candidates_pth', type=str)
 def main(params):
     # Seed everything
     pl.seed_everything(params['seed'])
@@ -58,6 +60,7 @@ def main(params):
     model = get_model(params['model'], params)
     model.df_test_path = params['df_test_path']
     # Init trainer
     trainer = Trainer(
@@ -109,7 +112,12 @@ if __name__ == "__main__":
                 params['checkpoint_pth'] = checkpoint_path
                 break
     assert(params['checkpoint_pth'] != '')
     if args.candidates_pth:
         params['candidates_pth'] = args.candidates_pth
     if args.df_test_pth:

 parser.add_argument('--df_test_pth', type=str, help='result file name')
 parser.add_argument('--exp_dir', type=str)
 parser.add_argument('--candidates_pth', type=str)
+parser.add_argument('--external_test', action='store_true', help='whether the test set is external data without labels')
 def main(params):
     # Seed everything
     pl.seed_everything(params['seed'])
     model = get_model(params['model'], params)
     model.df_test_path = params['df_test_path']
+    model.external_test = params['external_test']
     # Init trainer
     trainer = Trainer(
                 params['checkpoint_pth'] = checkpoint_path
                 break
     assert(params['checkpoint_pth'] != '')
+    if args.external_test:
+        params['external_test'] = True
+    else:
+        params['external_test'] = False
     if args.candidates_pth:
         params['candidates_pth'] = args.candidates_pth
     if args.df_test_pth:

flare/tune.py CHANGED Viewed

@@ -231,7 +231,7 @@ def main(args):
     # now = datetime.datetime.now().strftime("%Y%m%d")
     # base_dir = str(TEST_RESULTS_DIR / f"{now}_{params['run_name']}_optuna")
-    base_dir = "/data/yzhouc01/FILIP-MS/experiments/20250916_simple_model_optuna"
     os.makedirs(base_dir, exist_ok=True)
     params["experiment_dir"] = base_dir

     # now = datetime.datetime.now().strftime("%Y%m%d")
     # base_dir = str(TEST_RESULTS_DIR / f"{now}_{params['run_name']}_optuna")
+    base_dir = "../experiments/20250916_simple_model_optuna"
     os.makedirs(base_dir, exist_ok=True)
     params["experiment_dir"] = base_dir

flare/utils/case_study_utils.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import pandas as pd
+from tqdm import tqdm
+from rdkit import Chem
+import multiprocessing as mp
+from tqdm import tqdm
+import numpy as np
+import sys
+import os
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
+if parent_dir not in sys.path:
+    sys.path.insert(0, parent_dir)
# Location of the source table for each supported molecule database.
database_to_path = {
    "fdb": "/data/yzhouc01/molecule_data/foodb_2020_04_07_csv/Compound.csv",
    "hmdb": "/data/yzhouc01/molecule_data/metabolites-2025-09-18.csv",
    "spectra_db": "/data/yzhouc01/spectra_data/combined_msgym_nist23_multiplex_processed.tsv",
    "bio_db": "/data/yzhouc01/molecule_data/bio_2023_07_11_smiles.csv",
    "coconut": "/data/yzhouc01/molecule_data/coconut_csv-05-2025.csv",
}

# Column of each table that is used as the molecule mass for window filtering.
db_to_mass_col = {
    "fdb": "exact_molecular_weight",
    "hmdb": "MONO_MASS",
    "spectra_db": "exact_molecular_weight",
    "bio_db": "exact_molecular_weight",
    "coconut": "exact_molecular_weight",
}

# Column of each table that holds the SMILES string returned as a candidate.
db_to_smiles_col = {
    "fdb": "CANONICAL_SMILES",
    "hmdb": "CANONICAL_SMILES",
    "spectra_db": "CANONICAL_SMILES",
    "bio_db": "canonical_smiles",
    "coconut": "rdkit_canonical_smiles",
}
# Per-process CandidateAssignment, created by `_init_worker` in each pool worker.
_worker_instance = None


def _init_worker(databases, threshold):
    """Initialize global CandidateAssignment in each worker (silent).

    Intended as a ``multiprocessing.Pool`` initializer: it runs once per worker
    process and stores a non-verbose ``CandidateAssignment`` in the module-level
    ``_worker_instance``.

    Args:
        databases: database names passed to ``CandidateAssignment``.
        threshold: mass tolerance passed to ``CandidateAssignment``.
    """
    global _worker_instance
    _worker_instance = CandidateAssignment(databases, threshold, verbose=False)


def _worker_retrieve_candidates(parent_mass):
    """Retrieve candidates using the worker's global CandidateAssignment.

    Args:
        parent_mass: query mass forwarded to ``retrieve_candidates``.

    Returns:
        Whatever ``_worker_instance.retrieve_candidates(parent_mass)`` returns.
    """
    return _worker_instance.retrieve_candidates(parent_mass)
class CandidateAssignment:
    """Retrieve candidate SMILES whose mass lies within a tolerance of a query mass.

    Each requested database is read into a pandas DataFrame at construction
    time; lookups then filter those frames by a mass window.
    """

    def __init__(self, databases=None, threshold=0.01, verbose=True):
        """
        Args:
            databases: iterable of database names (keys of ``database_to_path``);
                ``None`` is treated as "no databases".
            threshold: default half-width of the mass window used by lookups.
            verbose: print progress messages (main process only).

        Raises:
            ValueError: if a name is unknown or its file does not exist.
        """
        self.threshold = threshold
        self.databases = []
        self.verbose = verbose

        # `databases or []` keeps the documented None default from crashing on iteration.
        for db in databases or []:
            if db not in database_to_path:
                raise ValueError(
                    f"Database {db} not recognized. Available: {list(database_to_path.keys())}"
                )
            if not os.path.exists(database_to_path[db]):
                raise ValueError(f"Database file for {db} not found at {database_to_path[db]}")
            self.databases.append(db)

        self._log(f"[{os.getpid()}] Loading databases: {self.databases}")

        self.db_dfs = {}
        self._load_databases()

    def _log(self, message):
        """Print ``message`` only when verbose and running in the main process."""
        if self.verbose and mp.current_process().name == "MainProcess":
            print(message)

    def _load_databases(self):
        """Read every selected database file into ``self.db_dfs``.

        Files ending in "tsv" are read tab-separated, files ending in "csv"
        comma-separated; any other extension is reported and skipped. The mass
        column is coerced to numeric (unparseable values become NaN).

        Raises:
            ValueError: if the mass or SMILES column is missing from a file.
        """
        for db in self.databases:
            path = database_to_path[db]
            if path.endswith("tsv"):
                df = pd.read_csv(path, sep="\t", low_memory=False)
            elif path.endswith("csv"):
                df = pd.read_csv(path, low_memory=False)
            else:
                self._log(f"Unable to load database: {db}")
                continue

            # make sure required columns exist
            mass_col = db_to_mass_col[db]
            for col in (mass_col, db_to_smiles_col[db]):
                if col not in df.columns:
                    raise ValueError(f"Column {col} not found in database {db}. {db} columns: {df.columns.tolist()}")

            # convert to proper types
            df[mass_col] = pd.to_numeric(df[mass_col], errors='coerce')
            self.db_dfs[db] = df

            self._log(f"[{os.getpid()}] Loaded {db} with {len(df)} entries.")

    def retrieve_candidates(self, parent_mass, threshold=None):
        """Retrieve SMILES candidates for a single parent mass.

        Args:
            parent_mass: query mass.
            threshold: optional half-width of the mass window for this call;
                defaults to the instance's ``threshold``.

        Returns:
            ``(parent_mass, smiles_list)`` where ``smiles_list`` holds the unique
            SMILES from all loaded databases whose mass lies in
            ``[parent_mass - threshold, parent_mass + threshold]`` (inclusive),
            in first-seen order.
        """
        tolerance = self.threshold if threshold is None else threshold
        ub = parent_mass + tolerance
        lb = parent_mass - tolerance

        smiles_list = []
        for db_name, df in self.db_dfs.items():
            in_window = df[db_to_mass_col[db_name]].between(lb, ub)
            smiles_list.extend(df.loc[in_window, db_to_smiles_col[db_name]].tolist())

        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return parent_mass, list(dict.fromkeys(smiles_list))

    def retrieve_candidates_batch(self, parent_masses, n_workers=25, chunksize=10):
        """Parallel batch retrieval with silent workers.

        Every worker builds its own ``CandidateAssignment`` from this instance's
        database names and threshold, so the database files are re-read once per
        worker process.

        Args:
            parent_masses: sized sequence of query masses.
            n_workers: number of worker processes.
            chunksize: chunk size passed to ``Pool.imap``.

        Returns:
            dict mapping each parent mass to its list of candidate SMILES.
        """
        with mp.Pool(
            processes=n_workers,
            initializer=_init_worker,
            initargs=(self.databases, self.threshold),
        ) as pool:
            results = list(
                tqdm(
                    pool.imap(_worker_retrieve_candidates, parent_masses, chunksize=chunksize),
                    total=len(parent_masses),
                    desc="Retrieving candidates",
                )
            )

        return {mass: smiles for mass, smiles in results}
+# P_TBL = Chem.GetPeriodicTable()
+# ELECTRON_MASS = 0.00054858
+# VALID_ELEMENTS = [
+#     "C",
+#     "H",
+#     "As",
+#     "B",
+#     "Br",
+#     "Cl",
+#     "Co",
+#     "F",
+#     "Fe",
+#     "I",
+#     "K",
+#     "N",
+#     "Na",
+#     "O",
+#     "P",
+#     "S",
+#     "Se",
+#     "Si",
+# ]
+# VALID_MONO_MASSES = np.array(
+#     [P_TBL.GetMostCommonIsotopeMass(i) for i in VALID_ELEMENTS]
+# )
+# CHEM_MASSES = VALID_MONO_MASSES[:, None]
+# ELEMENT_TO_MASS = dict(zip(VALID_ELEMENTS, CHEM_MASSES.squeeze()))
+# adduct_to_mass = {
+#     "[M+H]+": ELEMENT_TO_MASS["H"] - ELECTRON_MASS,
+#     "[M+Na]+": ELEMENT_TO_MASS["Na"] - ELECTRON_MASS,
+#     "[M+K]+": ELEMENT_TO_MASS["K"] - ELECTRON_MASS,
+#     "[M-H2O+H]+": -ELEMENT_TO_MASS["O"] - ELEMENT_TO_MASS["H"] - ELECTRON_MASS,
+#     "[M+H3N+H]+": ELEMENT_TO_MASS["N"] + ELEMENT_TO_MASS["H"] * 4 - ELECTRON_MASS,
+#     "[M]+": 0 - ELECTRON_MASS,
+#     "[M-H4O2+H]+": -ELEMENT_TO_MASS["O"] * 2 - ELEMENT_TO_MASS["H"] * 3 - ELECTRON_MASS,
+#     "[M-H]-": ELEMENT_TO_MASS["H"] + ELECTRON_MASS,
+#     "[M+H2O+H]+":ELEMENT_TO_MASS["O"] * 2 + ELEMENT_TO_MASS["H"] * 2 - ELECTRON_MASS,
+# }
+# def calculate_parent_mass(precursor_mz, adduct):
+#     if adduct not in adduct_to_mass:
+#         print(f'{adduct} not supported, returning original precursor_mz')
+#     return precursor_mz + adduct_to_mass[adduct]
if __name__ == "__main__":
    # Smoke test: list HMDB candidates within 0.01 of the query mass.
    # The tolerance is set on the instance, so the lookup call needs no
    # ``threshold`` keyword.
    ca = CandidateAssignment(databases=['hmdb'], threshold=0.01)
    candidates = ca.retrieve_candidates(parent_mass=180.0634)
    print(candidates)

flare/utils/general.py CHANGED Viewed

@@ -2,37 +2,69 @@ import torch
 from torch import nn
 import torch.nn.functional as F
 def pad_graph_nodes(mol_enc, g_n_nodes):
     """
     Args:
-        mol_enc: 2D tensor of shape (sum_nodes, D)
-                 Node embeddings for each molecule.
-        g_n_nodes: list[int]  Number of nodes per graph (len = B)
     Returns:
-        padded: (B, max_nodes, D) tensor
         mask:   (B, max_nodes) bool tensor, True for valid nodes
     """
-    # Already concatenated: shape (sum_nodes, D)
     B = len(g_n_nodes)
     D = mol_enc.shape[1]
     max_nodes = max(g_n_nodes)
-    padded = mol_enc.new_zeros((B, max_nodes, D))
-    mask = torch.zeros((B, max_nodes), dtype=torch.bool, device=mol_enc.device)
     idx = 0
     for i, n in enumerate(g_n_nodes):
         padded[i, :n] = mol_enc[idx:idx+n]
         mask[i, :n] = True
         idx += n
     return padded, mask
-import torch
-import torch.nn.functional as F
-import torch
-import torch.nn.functional as F
 def filip_similarity_batch(
     image_tokens,
@@ -127,60 +159,64 @@ def filip_similarity_batch(
     return similarity
-# def filip_similarity_batch(image_tokens, text_tokens, mask_image, mask_text):
-#     """
-#     Compute FILIP similarity for batches of image and text token embeddings.
-#     Args:
-#         image_tokens: (B, N_img, D) float tensor
-#         text_tokens:  (B, N_text, D) float tensor
-#         mask_image:   (B, N_img) bool tensor
-#         mask_text:    (B, N_text) bool tensor
-#     Returns:
-#         similarities: (B,) float tensor of similarity scores
-#     """
-#     B, N_img, D = image_tokens.shape
-#     N_text = text_tokens.shape[1]
-#     # Normalize tokens
-#     image_norm = F.normalize(image_tokens, p=2, dim=-1)  # (B, N_img, D)
-#     text_norm = F.normalize(text_tokens, p=2, dim=-1)    # (B, N_text, D)
-#     # Compute batched cosine similarity matrices
-#     # Result shape: (B, N_img, N_text)
-#     sim_matrix = torch.bmm(image_norm, text_norm.transpose(1, 2))
-#     # Expand masks for broadcasting
-#     mask_image_exp = mask_image.unsqueeze(2)  # (B, N_img, 1)
-#     mask_text_exp = mask_text.unsqueeze(1)    # (B, 1, N_text)
-#     valid_mask = mask_image_exp & mask_text_exp  # (B, N_img, N_text)
-#     # Mask invalid positions by setting them to -inf
-#     sim_matrix_masked = sim_matrix.masked_fill(~valid_mask, float('-inf'))
-#     # Max over text tokens per image token: (B, N_img)
-#     max_sim_img, _ = sim_matrix_masked.max(dim=2)
-#     # Max over image tokens per text token: (B, N_text)
-#     max_sim_text, _ = sim_matrix_masked.max(dim=1)
-#     # Replace -inf (no valid tokens) with zeros to avoid NaNs
-#     max_sim_img[max_sim_img == float('-inf')] = 0
-#     max_sim_text[max_sim_text == float('-inf')] = 0
-#     # Sum over valid tokens and divide by number of valid tokens (avoid division by zero)
-#     sum_img = (max_sim_img * mask_image).sum(dim=1)
-#     count_img = mask_image.sum(dim=1).clamp(min=1).float()
-#     sum_text = (max_sim_text * mask_text).sum(dim=1)
-#     count_text = mask_text.sum(dim=1).clamp(min=1).float()
-#     avg_img = sum_img / count_img
-#     avg_text = sum_text / count_text
-#     # Final similarity per batch element
-#     similarity = (avg_img + avg_text) / 2
-#     return similarity

 from torch import nn
 import torch.nn.functional as F
 def pad_graph_nodes(mol_enc, g_n_nodes):
     """
     Args:
+        mol_enc: (sum_nodes, D) tensor, node embeddings concatenated for all graphs
+        g_n_nodes: list[int], number of nodes per graph
     Returns:
+        padded: (B, max_nodes, D) tensor with requires_grad=True for original nodes
         mask:   (B, max_nodes) bool tensor, True for valid nodes
     """
     B = len(g_n_nodes)
     D = mol_enc.shape[1]
     max_nodes = max(g_n_nodes)
+    # Create output with same requires_grad as input
+    padded = torch.zeros(B, max_nodes, D, dtype=mol_enc.dtype, device=mol_enc.device)
+    # Force gradient tracking by making this a non-leaf tensor
+    padded = padded + mol_enc.new_zeros(1).requires_grad_(True)
+    mask = torch.zeros(B, max_nodes, dtype=torch.bool, device=mol_enc.device)
     idx = 0
     for i, n in enumerate(g_n_nodes):
         padded[i, :n] = mol_enc[idx:idx+n]
         mask[i, :n] = True
         idx += n
     return padded, mask
+# def pad_graph_nodes(mol_enc, g_n_nodes):
+#     """
+#     Args:
+#         mol_enc: 2D tensor of shape (sum_nodes, D)
+#                  Node embeddings for each molecule.
+#         g_n_nodes: list[int]  Number of nodes per graph (len = B)
+#     Returns:
+#         padded: (B, max_nodes, D) tensor
+#         mask:   (B, max_nodes) bool tensor, True for valid nodes
+#     """
+#     # Already concatenated: shape (sum_nodes, D)
+#     B = len(g_n_nodes)
+#     D = mol_enc.shape[1]
+#     max_nodes = max(g_n_nodes)
+#     padded = mol_enc.new_zeros((B, max_nodes, D))
+#     mask = torch.zeros((B, max_nodes), dtype=torch.bool, device=mol_enc.device)
+#     idx = 0
+#     for i, n in enumerate(g_n_nodes):
+#         padded[i, :n] = mol_enc[idx:idx+n]
+#         mask[i, :n] = True
+#         idx += n
+#     return padded, mask
 def filip_similarity_batch(
     image_tokens,
     return similarity
def filip_similarity_single(
    image_tokens,
    text_tokens,
    reduction="mean",  # "mean", "topk", "softmax", or "geom"
    k=5,
    temperature=0.05,
    eps=1e-6
):
    """
    FILIP-style similarity between one image token set and one text token set (no masks).

    Tokens are L2-normalised, each token is matched to its most similar token
    of the other modality, and the per-token maxima are aggregated per
    direction; the two directional values are averaged.

    Args:
        image_tokens: (N_img, D) float tensor
        text_tokens:  (N_text, D) float tensor
        reduction:    str, how the per-token maxima are aggregated:
                      "mean"    - plain average
                      "topk"    - average of the k largest values
                      "softmax" - softmax(max_sim / temperature)-weighted sum
                      "geom"    - geometric mean of the values clamped to >= eps
        k:            int, used if reduction == "topk" (capped at the token count)
        temperature:  float, used if reduction == "softmax"
        eps:          float, lower clamp used if reduction == "geom"

    Returns:
        similarity: float scalar tensor

    Raises:
        ValueError: if ``reduction`` is not one of the supported names.
    """
    def _reduce(per_token_max):
        # Collapse a 1-D vector of best-match similarities into a scalar.
        if reduction == "mean":
            return per_token_max.mean()
        if reduction == "topk":
            n_keep = min(k, per_token_max.numel())
            return torch.topk(per_token_max, n_keep).values.mean()
        if reduction == "softmax":
            attn = torch.softmax(per_token_max / temperature, dim=0)
            return (attn * per_token_max).sum()
        if reduction == "geom":
            return per_token_max.clamp(min=eps).log().mean().exp()
        raise ValueError(f"Unknown reduction type: {reduction}")

    # Unit-length tokens make the dot product a cosine similarity.
    unit_img = F.normalize(image_tokens, p=2, dim=-1)
    unit_txt = F.normalize(text_tokens, p=2, dim=-1)

    # (N_img, N_text) cosine similarity matrix
    cos = torch.matmul(unit_img, unit_txt.t())

    # Best match for every token, in both directions.
    img_to_txt = cos.max(dim=1).values  # (N_img,)
    txt_to_img = cos.max(dim=0).values  # (N_text,)

    score_img = _reduce(img_to_txt)
    score_txt = _reduce(txt_to_img)
    return (score_img + score_txt) / 2

flare/utils/loss.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 def contrastive_loss(v1, v2, tau=1.0) -> torch.Tensor:
         v1_norm = torch.norm(v1, dim=1, keepdim=True)
@@ -76,10 +77,6 @@ class fp_loss:
                 return 1 - torch.mean(sim)
-import torch
-import torch.nn.functional as F
-import torch.distributed as dist
 # ---------- Utility ----------
 def _safe_divide(num, denom, eps=1e-8):
     return num / (denom + eps)
@@ -154,3 +151,97 @@ def filip_loss_with_mask(a_tokens, b_tokens, mask_a, mask_b, temperature=0.07):
     return 0.5 * (loss_a2b + loss_b2a)

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import torch.distributed as dist
 def contrastive_loss(v1, v2, tau=1.0) -> torch.Tensor:
         v1_norm = torch.norm(v1, dim=1, keepdim=True)
                 return 1 - torch.mean(sim)
 # ---------- Utility ----------
 def _safe_divide(num, denom, eps=1e-8):
     return num / (denom + eps)
     return 0.5 * (loss_a2b + loss_b2a)
def global_infonce_loss(a_tokens, b_tokens, mask_a, mask_b, temperature=0.07, agg_fn="mean"):
    """
    Global InfoNCE loss (CLIP-style) for modalities A and B.

    Token embeddings are L2-normalised, pooled into one vector per sample,
    normalised again, and compared with a symmetric cross-entropy over the
    (B, B) cosine-similarity matrix. Matching pairs sit on the diagonal.

    Args:
        a_tokens: (B, N_a, D)
        b_tokens: (B, N_b, D)
        mask_a:   (B, N_a)  True / non-zero = valid token
        mask_b:   (B, N_b)  True / non-zero = valid token
        temperature: scalar divisor applied to the cosine similarities
        agg_fn: "mean" | "max" | "cls" | callable -> how to aggregate tokens
            into one vector. A callable receives ``(normalised_tokens, mask)``
            with the mask exactly as passed in, and must return (B, D).
    Returns:
        scalar loss
    Raises:
        ValueError: if ``agg_fn`` is neither callable nor a known name.
    """

    def _masked_mean(tokens, mask):
        # Average over valid tokens only; clamp avoids 0/0 for empty samples.
        valid = mask.bool()
        total = (tokens * valid.unsqueeze(-1)).sum(dim=1)
        return total / valid.sum(dim=1, keepdim=True).clamp(min=1)

    def _masked_max(tokens, mask):
        valid = mask.bool()
        pooled = tokens.masked_fill(~valid.unsqueeze(-1), float('-inf')).max(dim=1).values
        # A sample without any valid token would be all -inf and turn into NaN
        # after normalisation; fall back to a zero vector (same as "mean").
        has_valid = valid.any(dim=1, keepdim=True)
        return torch.where(has_valid, pooled, torch.zeros_like(pooled))

    def _first_valid(tokens, mask):
        # argmax returns the first maximal position, i.e. the first valid
        # token; samples with no valid token fall back to position 0.
        first = mask.bool().to(torch.int64).argmax(dim=1)
        rows = torch.arange(tokens.shape[0], device=tokens.device)
        return tokens[rows, first]

    builtin_aggregators = {
        "mean": _masked_mean,
        "max": _masked_max,
        "cls": _first_valid,
    }

    # ---- Normalize token embeddings ----
    a = F.normalize(a_tokens, dim=-1)
    b = F.normalize(b_tokens, dim=-1)

    # ---- Aggregate per sample ----
    if callable(agg_fn):
        pool = agg_fn  # custom aggregation
    elif agg_fn in builtin_aggregators:
        pool = builtin_aggregators[agg_fn]
    else:
        raise ValueError(f"Unknown agg_fn: {agg_fn}")
    a_global = pool(a, mask_a)
    b_global = pool(b, mask_b)

    # ---- Compute cosine similarity matrix ----
    a_global = F.normalize(a_global, dim=-1)
    b_global = F.normalize(b_global, dim=-1)
    logits = (a_global @ b_global.T) / temperature  # (B, B)

    # ---- InfoNCE loss ----
    labels = torch.arange(a_tokens.shape[0], device=a_tokens.device)
    loss_a2b = F.cross_entropy(logits, labels)
    loss_b2a = F.cross_entropy(logits.T, labels)
    return 0.5 * (loss_a2b + loss_b2a)
+# ---------- PCGrad utility ----------
def pcgrad_combine(losses, shared_params):
    """
    Compute the PCGrad-combined gradient for a list of scalar losses and write
    it into ``.grad`` of the shared parameters.

    For every task gradient g_i, each other task's *original* gradient g_j
    that conflicts with it (negative dot product) has its component removed
    from g_i. The projected gradients are then summed.

    losses: list of scalar loss tensors (must be non-empty)
    shared_params: iterable of parameters to project/aggregate gradients for;
        parameters with ``requires_grad=False`` are skipped
    returns: scalar combined loss for logging (mean)
    raises: ValueError if ``losses`` is empty
    """
    losses = list(losses)
    if not losses:
        raise ValueError("pcgrad_combine requires at least one loss.")
    # Materialise once: the parameters are traversed once per loss and again
    # for the write-back, which would exhaust a generator such as
    # ``model.parameters()``.
    params = [p for p in shared_params if p.requires_grad]
    mean_loss = sum(losses) / len(losses)
    if not params:
        return mean_loss

    # One flat gradient vector per loss. A parameter unused by a loss
    # contributes zeros so every vector has the same layout as ``params``.
    task_grads = []
    for loss in losses:
        grads = torch.autograd.grad(loss, params, retain_graph=True, allow_unused=True)
        task_grads.append(torch.cat([
            (torch.zeros_like(p) if g is None else g).reshape(-1)
            for p, g in zip(params, grads)
        ]))

    # project conflicting grads (always against the unmodified g_j)
    projected = [g.clone() for g in task_grads]
    for i, g_i in enumerate(projected):
        for j, g_j in enumerate(task_grads):
            if i == j:
                continue
            dot = torch.dot(g_i, g_j)
            if dot < 0:
                g_i -= dot / (g_j.norm() ** 2 + 1e-12) * g_j

    # sum projected grads
    final_grad = torch.stack(projected).sum(dim=0)

    # assign to params
    pointer = 0
    for p in params:
        numel = p.numel()
        p.grad = final_grad[pointer:pointer + numel].view_as(p).clone()
        pointer += numel

    # return average loss for logging only
    return mean_loss

flare/utils/models.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from flare.models.spec_encoder import SpecEncMLP_BIN, SpecFormulaEncMLP, SpecFormulaTransformer,SpecFormula_mz_Encoder, SpecMzIntTokenTransformer
 from flare.models.mol_encoder import MolEnc
 from flare.models.encoders import MLP
-from flare.models.contrastive import ContrastiveModel, CrossAttenContrastive, FilipContrastive
 def get_spec_encoder(spec_enc:str, args):
     return {"MLP_BIN": SpecEncMLP_BIN,
@@ -28,6 +28,8 @@ def get_model(model:str,
         model = CrossAttenContrastive(**params)
     elif model == "filipContrastive":
         model = FilipContrastive(**params)
     else:
         raise Exception(f"Model {model} not implemented.")

 from flare.models.spec_encoder import SpecEncMLP_BIN, SpecFormulaEncMLP, SpecFormulaTransformer,SpecFormula_mz_Encoder, SpecMzIntTokenTransformer
 from flare.models.mol_encoder import MolEnc
 from flare.models.encoders import MLP
+from flare.models.contrastive import ContrastiveModel, CrossAttenContrastive, FilipContrastive, FilipGlobalContrastive
 def get_spec_encoder(spec_enc:str, args):
     return {"MLP_BIN": SpecEncMLP_BIN,
         model = CrossAttenContrastive(**params)
     elif model == "filipContrastive":
         model = FilipContrastive(**params)
+    elif model == "filipGlobalContrastive":
+        model = FilipGlobalContrastive(**params)
     else:
         raise Exception(f"Model {model} not implemented.")

flare/utils/mol_search.py ADDED Viewed

	@@ -0,0 +1,367 @@

+import os
+import numpy as np
+import pickle
+from typing import Callable, List, Dict, Any, Optional
+from rdkit import Chem
+import faiss
+import torch
+from torch.utils.data import Dataset, DataLoader
+from tqdm import tqdm
+import dgl
class MoleculeDataset(Dataset):
    """Map-style dataset that featurizes one SMILES string per item.

    Each item is a ``(mol_id, graph, error)`` triple: ``graph`` is whatever
    ``smiles_preprocess`` returns and ``error`` is ``None`` on success; on
    failure ``graph`` is ``None`` and ``error`` holds the exception message.
    """

    def __init__(self, smiles_dict, smiles_preprocess):
        self.smiles_preprocess = smiles_preprocess
        # Freeze the (mol_id, smiles) pairs so they can be indexed by position.
        self.items = [(mol_id, smi) for mol_id, smi in smiles_dict.items()]

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        mol_id, smi = self.items[idx]
        graph, error = None, None
        try:
            graph = self.smiles_preprocess(smi)
        except Exception as exc:
            # Best effort: report the failure to the caller instead of
            # aborting the whole loading run.
            error = str(exc)
        return mol_id, graph, error
def collate_graphs(batch):
    """Collate ``(mol_id, graph, error)`` triples, dropping failed items.

    Returns ``([], None)`` when no item in the batch has a graph; otherwise a
    tuple of the surviving ids and the result of ``dgl.batch`` over their
    graphs, in the same order.
    """
    kept_ids, kept_graphs = [], []
    for mol_id, graph, _error in batch:
        if graph is None:
            continue
        kept_ids.append(mol_id)
        kept_graphs.append(graph)
    if not kept_graphs:
        return [], None
    return tuple(kept_ids), dgl.batch(tuple(kept_graphs))
class SpectraMoleculeRetriever:
    """
    Two-stage spectra–molecule retrieval system with hierarchical metadata filtering:
    1. Coarse retrieval via FAISS on global embeddings.
    2. Fine-grained reranking via custom similarity (e.g., FILIP alignment).
    3. Supports fast subset search by class, superclass, or pathway.
    """

    def __init__(
        self,
        molecule_encoder,
        spectra_encoder,
        fine_similarity_fn: Callable[[Any, Any], float],
        smiles_preprocess: Callable[[str], Any],
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ):
        """
        Args:
            molecule_encoder: module called as ``encoder(graph, graph.ndata['h'])``
                that returns per-node embeddings.
            spectra_encoder: module called as ``encoder(spectrum)`` that returns
                per-token embeddings.
            fine_similarity_fn: function ``(spectrum_tokens, molecule_tokens)``
                returning a scalar similarity (float or 0-dim tensor).
            smiles_preprocess: preprocessing function for SMILES → molecule graph.
            device: where to run encoders.
        """
        self.molecule_encoder = molecule_encoder
        self.spectra_encoder = spectra_encoder
        self.fine_similarity_fn = fine_similarity_fn
        self.smiles_preprocess = smiles_preprocess
        self.device = device

        # Storage
        self.molecule_db: Dict[str, Any] = {}        # mol_id → per-molecule graph
        self.node_cache: Dict[str, Any] = {}         # mol_id → node embeddings
        self.metadata: Dict[str, Dict[str, List[str]]] = {}  # e.g. {"class": {"lipid": [mol1, mol2], ...}}
        self.molecule_ids: Optional[np.ndarray] = None
        self.global_embeddings: Optional[np.ndarray] = None
        self.index: Optional[faiss.Index] = None
        self.smiles_dict: Optional[Dict[str, str]] = None # mol_id → smiles
        self.failed_mols = []

        # set model to eval mode and move to device
        self.molecule_encoder.eval()
        self.spectra_encoder.eval()
        self.molecule_encoder.to(self.device)
        self.spectra_encoder.to(self.device)

    # -------------------------------
    # Internal helpers
    # -------------------------------
    def _require_database(self):
        """Raise a clear error when searching before build/load."""
        if self.molecule_ids is None or self.global_embeddings is None or self.index is None:
            raise RuntimeError(
                "Molecule database is empty; call build_database() or load_database() first."
            )

    def _encode_nodes(self, graph):
        """Return node embeddings for one graph, using the same call
        convention as build_database (graph plus its 'h' node features)."""
        graph = graph.to(self.device)
        return self.molecule_encoder(graph, graph.ndata['h'])

    # -------------------------------
    # Database building & saving
    # -------------------------------
    def build_database(
        self,
        smiles_dict: dict,
        metadata=None,
        cache_nodes: bool = False,
        batch_size: int = 64,
        num_workers: int = 25,
        pooling: str = "max",  # or "sum", "mean"
    ):
        """
        Parallelized database construction using PyTorch DataLoader for
        SMILES → DGLGraph conversion and batched GPU encoding.

        Args:
            smiles_dict: dict {mol_id: smiles}
            metadata: hierarchical dict for class/superclass/pathway
            cache_nodes: if True, store the encoder's node embeddings for
                fine-grained search
            batch_size: number of molecules per GPU batch
            num_workers: parallel CPU workers for SMILES parsing
            pooling: global pooling type ("max" | "sum" | "mean")
        Raises:
            ValueError: for an unknown pooling name.
            RuntimeError: if not a single molecule could be encoded.
        """
        # Imported here so the submodule is loaded explicitly; DGL's average
        # readout is named AvgPooling.
        from dgl.nn.pytorch.glob import AvgPooling, MaxPooling, SumPooling

        print("Building molecule database with PyTorch DataLoader parallelization...")

        # set up pooling
        pooling_layers = {"max": MaxPooling, "sum": SumPooling, "mean": AvgPooling}
        if pooling not in pooling_layers:
            raise ValueError(f"Unsupported pooling: {pooling}")
        self.pooling = pooling_layers[pooling]()

        dataset = MoleculeDataset(smiles_dict, self.smiles_preprocess)
        loader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers,
            collate_fn=collate_graphs,
            pin_memory=True,
        )

        mol_ids_all, mol_objs, mol_embs = [], [], []
        node_cache = {}

        with torch.no_grad():
            for mol_ids, batched_graph in tqdm(loader, desc="Encoding molecules"):
                if batched_graph is None:
                    # All failed in this batch
                    continue
                try:
                    # Keep one graph per molecule (before the device move) so
                    # molecule_db maps each id to its own graph.
                    graphs = dgl.unbatch(batched_graph)
                    batched_graph = batched_graph.to(self.device)
                    node_repr = self.molecule_encoder(batched_graph, batched_graph.ndata['h'])
                    global_emb = self.pooling(batched_graph, node_repr)

                    # Normalize embeddings (guarding against zero vectors)
                    emb_np = global_emb.detach().cpu().numpy().astype(np.float32, copy=False)
                    norms = np.linalg.norm(emb_np, axis=1, keepdims=True)
                    emb_np = emb_np / np.maximum(norms, 1e-12)

                    # Optionally store node embeddings for fine-grained search
                    batch_cache = {}
                    if cache_nodes:
                        # Split the encoder output into per-graph chunks;
                        # clone so each entry owns its storage.
                        sizes = batched_graph.batch_num_nodes().tolist()
                        chunks = torch.split(node_repr.detach().cpu(), sizes)
                        batch_cache = {
                            mol_id: chunk.clone() for mol_id, chunk in zip(mol_ids, chunks)
                        }
                except Exception as e:
                    print(f"[Warning] Failed to encode batch with molecules {mol_ids}: {e}")
                    continue

                # Commit only after the whole batch succeeded, so ids,
                # graphs and embeddings always stay aligned.
                mol_ids_all.extend(mol_ids)
                mol_objs.extend(graphs)
                mol_embs.append(emb_np)
                node_cache.update(batch_cache)

        if not mol_embs:
            raise RuntimeError("No valid molecules were successfully encoded.")

        # Everything requested but not encoded failed, whether during SMILES
        # parsing (dropped by the collate function) or during encoding.
        encoded = set(mol_ids_all)
        self.failed_mols = [mol_id for mol_id in smiles_dict if mol_id not in encoded]
        self.smiles_dict = smiles_dict
        self.molecule_db = dict(zip(mol_ids_all, mol_objs))
        self.molecule_ids = np.array(mol_ids_all)
        self.global_embeddings = np.concatenate(mol_embs, axis=0)
        self.metadata = metadata or {}
        self.node_cache.update(node_cache)
        self._build_faiss_index()
        print(f"Database built with {len(self.molecule_ids)} molecules "
            f"({len(self.failed_mols)} failed).")

    def _build_faiss_index(self):
        """(Re)build the flat inner-product index over global_embeddings."""
        # FAISS expects contiguous float32 input.
        self.global_embeddings = np.ascontiguousarray(self.global_embeddings, dtype=np.float32)
        d = self.global_embeddings.shape[1]
        self.index = faiss.IndexFlatIP(d)
        self.index.add(self.global_embeddings)
        print(f"FAISS index built with {len(self.molecule_ids)} embeddings.")

    def save_database(self, path: str):
        """Save molecule database and embeddings."""
        data = {
            "molecule_ids": self.molecule_ids,
            "global_embeddings": self.global_embeddings,
            "metadata": self.metadata,
            "node_cache": self.node_cache,
            "smiles_dict": self.smiles_dict,
        }
        with open(path, "wb") as f:
            pickle.dump(data, f)
        print(f"Database saved to {path}")

    def load_database(self, path: str):
        """Load molecule database and rebuild FAISS index.

        NOTE: this unpickles ``path``; only load files from a trusted source.
        """
        with open(path, "rb") as f:
            data = pickle.load(f)
        self.molecule_ids = data["molecule_ids"]
        self.global_embeddings = data["global_embeddings"]
        self.metadata = data.get("metadata", {})
        self.node_cache = data.get("node_cache", {})
        self.smiles_dict = data.get("smiles_dict", {})
        self._build_faiss_index()
        print(f"Database loaded from {path}")

    # -------------------------------
    # Filtering utilities
    # -------------------------------
    def _get_filtered_indices(self, subset: Optional[Dict[str, str]] = None) -> np.ndarray:
        """
        Retrieve indices for molecules matching a given metadata subset.
        Example subset: {"class": "lipid"} or {"pathway": "glycolysis"}

        When several keys are given, a molecule must match all of them. The
        result keeps the order of the first key's member list. An unknown
        key or value prints a warning and yields an empty array.
        """
        if not subset:
            return np.arange(len(self.molecule_ids))

        ordered = None
        for key, value in subset.items():
            if key not in self.metadata or value not in self.metadata[key]:
                print(f"[Warning] No molecules found for {key}={value}")
                return np.array([], dtype=int)
            members = self.metadata[key][value]
            if ordered is None:
                ordered = list(members)
            else:
                keep = set(members)
                ordered = [m for m in ordered if m in keep]

        id_to_idx = {m: i for i, m in enumerate(self.molecule_ids)}
        selected = [id_to_idx[m] for m in ordered if m in id_to_idx]
        return np.array(selected, dtype=int)

    # -------------------------------
    # Retrieval
    # -------------------------------
    def coarse_search(self, spectrum, top_k: int = 256, subset: Optional[Dict[str, str]] = None):
        """
        Retrieve top-k candidates using FAISS, optionally restricted to subset metadata.

        Returns ``(candidate_ids, similarities)``; both are empty lists when
        nothing matches.
        """
        self._require_database()
        with torch.no_grad():
            spectrum = spectrum.to(self.device)
            # Spectrum tokens are sum-pooled into one query vector.
            z_spec = self.spectra_encoder(spectrum).sum(axis=0)
        z_spec = z_spec.detach().cpu().numpy() if hasattr(z_spec, "detach") else np.asarray(z_spec)
        z_spec = np.ascontiguousarray(z_spec, dtype=np.float32)
        z_spec = z_spec / max(np.linalg.norm(z_spec), 1e-12)  # avoid 0/0

        if not subset:
            # No filter: reuse the index built over the full database.
            index, subset_idx = self.index, None
            n_candidates = len(self.molecule_ids)
        else:
            subset_idx = self._get_filtered_indices(subset)
            n_candidates = subset_idx.size
            if n_candidates == 0:
                return [], []
            # subset FAISS index
            emb_subset = np.ascontiguousarray(self.global_embeddings[subset_idx])
            index = faiss.IndexFlatIP(emb_subset.shape[1])
            index.add(emb_subset)

        k = min(top_k, n_candidates)
        if k <= 0:
            return [], []
        sims, idxs = index.search(z_spec[None, :], k)
        hits = idxs[0] if subset_idx is None else subset_idx[idxs[0]]
        candidate_ids = self.molecule_ids[hits]
        return candidate_ids, sims[0]

    def fine_rerank(self, spectrum, candidate_ids: List[str], top_k: int = 50):
        """
        Compute fine-grained similarity for the candidates and rerank.

        Molecule tokens come from the node cache when available, otherwise
        from the stored graph, otherwise from re-featurizing the SMILES.
        Returns up to ``top_k`` ``(mol_id, score)`` pairs, best first.
        """
        spectrum = spectrum.to(self.device)
        scores = []
        with torch.no_grad():
            z_spec_tokens = self.spectra_encoder(spectrum)
            for mol_id in candidate_ids:
                if mol_id in self.node_cache:
                    # Cached embeddings live on CPU; move them next to the
                    # spectrum tokens before comparing.
                    mol_tokens = self.node_cache[mol_id].to(self.device)
                elif mol_id in self.molecule_db:
                    mol_tokens = self._encode_nodes(self.molecule_db[mol_id])
                else:
                    mol = self.smiles_preprocess(self.smiles_dict[mol_id])
                    mol_tokens = self._encode_nodes(mol)
                # float() accepts both 0-dim tensors and plain Python floats.
                s = float(self.fine_similarity_fn(z_spec_tokens, mol_tokens))
                scores.append((mol_id, s))
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:top_k]

    def search(
        self,
        spectrum,
        coarse_k: int = 256,
        fine_k: int = 50,
        subset: Optional[Dict[str, str]] = None,
    ):
        """
        Full two-stage search pipeline with optional subset filtering.
        """
        candidate_ids, _ = self.coarse_search(spectrum, top_k=coarse_k, subset=subset)
        if len(candidate_ids) == 0:
            return []
        ranked = self.fine_rerank(spectrum, candidate_ids, top_k=fine_k)
        return ranked
if __name__ == "__main__":
    # Small end-to-end demo: load a trained model, then build a three-molecule
    # database with cached node embeddings.
    import sys
    # Machine-specific checkout location so the ``flare`` package resolves.
    sys.path.insert(0, "/data/yzhouc01/FILIP-MS")
    from flare.utils.data import get_spec_featurizer, get_mol_featurizer
    from flare.utils.models import get_model
    from flare.utils.general import filip_similarity_single
    import yaml
    # SpectraMoleculeRetriever is defined above in this module; re-importing
    # it from flare.utils.mol_search would only load a second copy of it.

    metadata = {
        "class": {
            "lipid": ["mol1", "mol2"],
            "peptide": ["mol3"]
        },
        "pathway": {
            "beta-oxidation": ["mol1"],
            "glycolysis": ["mol2", "mol3"]
        }
    }
    smiles_dict = {
        "mol1": "CCO",
        "mol2": "CCN",
        "mol3": "CCC"
    }

    # Load model and data
    param_pth = '/data/yzhouc01/cancer/flare.yaml'
    with open(param_pth) as f:
        params = yaml.load(f, Loader=yaml.FullLoader)
    spec_featurizer = get_spec_featurizer(params['spectra_view'], params)
    mol_featurizer = get_mol_featurizer(params['molecule_view'], params)

    # load model
    checkpoint_pth = "/data/yzhouc01/FILIP-MS/experiments/20250930_optimized_flare_42/epoch=1959-train_loss=0.08.ckpt"
    params['checkpoint_pth'] = checkpoint_pth
    model = get_model(params['model'], params)

    specMolRetriever = SpectraMoleculeRetriever(
        molecule_encoder=model.mol_enc_model,
        spectra_encoder=model.spec_enc_model,
        fine_similarity_fn=filip_similarity_single,
        smiles_preprocess=mol_featurizer
    )
    specMolRetriever.build_database(smiles_dict, metadata=metadata, cache_nodes=True)

    # Filter search to molecules in a specific pathway
    # results = specMolRetriever.search(spectrum, subset={"pathway": "beta-oxidation"})
    # for mol_id, score in results[:10]:
    #     print(f"{mol_id}: {score:.3f}")

notebooks/UMAP_spectra_embeddings.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/fine-grained_vs_global.ipynb CHANGED Viewed

@@ -29819,9 +29819,13 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python (spec)",
    "language": "python",
-   "name": "spec"
   }
  },
  "nbformat": 4,

  ],
  "metadata": {
   "kernelspec": {
+   "display_name": "spec",
    "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.7"
   }
  },
  "nbformat": 4,

notebooks/good_vs_bad_instances.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/mol-spec_visualization.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

notebooks/results.ipynb ADDED Viewed

	@@ -0,0 +1,233 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "2cd3303a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "8ccc0bc1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"/data/yzhouc01/FILIP-MS/experiments/20251110_filip-global/result_MassSpecGym_retrieval_candidates_formula.pkl\", \"rb\") as f:\n",
+    "    result = pickle.load(f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8e517777",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>rank_fine</th>\n",
+       "      <th>rank_global</th>\n",
+       "      <th>rank_sum</th>\n",
+       "      <th>rank_weighted</th>\n",
+       "      <th>rank_avg</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>R@1</th>\n",
+       "      <td>0.214571</td>\n",
+       "      <td>0.163306</td>\n",
+       "      <td>0.192869</td>\n",
+       "      <td>0.191274</td>\n",
+       "      <td>0.192869</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>R@5</th>\n",
+       "      <td>0.483140</td>\n",
+       "      <td>0.403566</td>\n",
+       "      <td>0.447425</td>\n",
+       "      <td>0.444862</td>\n",
+       "      <td>0.447425</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>R@20</th>\n",
+       "      <td>0.747095</td>\n",
+       "      <td>0.694350</td>\n",
+       "      <td>0.728355</td>\n",
+       "      <td>0.726361</td>\n",
+       "      <td>0.728355</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      rank_fine  rank_global  rank_sum  rank_weighted  rank_avg\n",
+       "R@1    0.214571     0.163306  0.192869       0.191274  0.192869\n",
+       "R@5    0.483140     0.403566  0.447425       0.444862  0.447425\n",
+       "R@20   0.747095     0.694350  0.728355       0.726361  0.728355"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data = []\n",
+    "for i in [1, 5, 20]:\n",
+    "    curr_d = {}\n",
+    "    for c in result.columns.tolist():\n",
+    "        if c.startswith('rank'):\n",
+    "            curr_d[c] = result[result[c] <= i].shape[0] / result.shape[0]\n",
+    "    data.append(curr_d)\n",
+    "\n",
+    "data_df = pd.DataFrame(data, index=['R@1', 'R@5', 'R@20'])\n",
+    "data_df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "10493857",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>rank_fine</th>\n",
+       "      <th>rank_global</th>\n",
+       "      <th>rank_sum</th>\n",
+       "      <th>rank_weighted</th>\n",
+       "      <th>rank_avg</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>R@1</th>\n",
+       "      <td>0.420882</td>\n",
+       "      <td>0.369731</td>\n",
+       "      <td>0.412907</td>\n",
+       "      <td>0.411939</td>\n",
+       "      <td>0.412907</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>R@5</th>\n",
+       "      <td>0.744475</td>\n",
+       "      <td>0.707052</td>\n",
+       "      <td>0.738893</td>\n",
+       "      <td>0.737412</td>\n",
+       "      <td>0.738893</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>R@20</th>\n",
+       "      <td>0.927660</td>\n",
+       "      <td>0.916325</td>\n",
+       "      <td>0.926407</td>\n",
+       "      <td>0.926122</td>\n",
+       "      <td>0.926407</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      rank_fine  rank_global  rank_sum  rank_weighted  rank_avg\n",
+       "R@1    0.420882     0.369731  0.412907       0.411939  0.412907\n",
+       "R@5    0.744475     0.707052  0.738893       0.737412  0.738893\n",
+       "R@20   0.927660     0.916325  0.926407       0.926122  0.926407"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data = []\n",
+    "for i in [1, 5, 20]:\n",
+    "    curr_d = {}\n",
+    "    for c in result.columns.tolist():\n",
+    "        if c.startswith('rank'):\n",
+    "            curr_d[c] = result[result[c] <= i].shape[0] / result.shape[0]\n",
+    "    data.append(curr_d)\n",
+    "\n",
+    "data_df = pd.DataFrame(data, index=['R@1', 'R@5', 'R@20'])\n",
+    "data_df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e4201db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "spec",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

notebooks/spectra_sim.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff