{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a0f1c2d3",
   "metadata": {},
   "source": [
    "# Formula-annotated spectra: dataset inspection\n",
    "\n",
    "Builds the test dataset from `params_formSpec.yaml`, plots how many formulas are attached to each spectrum, shows one metadata row, and loads the subformula JSON for one example spectrum."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "07d00685",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import numpy as np\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "from rdkit import Chem\n",
    "from rdkit.Chem import rdDepictor\n",
    "from rdkit.Chem.Draw import rdMolDraw2D\n",
    "import matplotlib.pyplot as plt\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cd9e10c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.insert(0, \"/data/yzhouc01/MassSpecGym\")\n",
    "sys.path.insert(0, \"/data/yzhouc01/FILIP-MS\")\n",
    "\n",
    "from rdkit import RDLogger\n",
    "import pytorch_lightning as pl\n",
    "from pytorch_lightning import Trainer\n",
    "from massspecgym.models.base import Stage\n",
    "import os\n",
    "\n",
    "from mvp.utils.data import get_spec_featurizer, get_mol_featurizer, get_ms_dataset,get_test_ms_dataset\n",
    "from mvp.utils.models import get_model\n",
    "\n",
    "from mvp.definitions import TEST_RESULTS_DIR\n",
    "import yaml\n",
    "from functools import partial\n",
    "# Suppress RDKit warnings and errors\n",
    "lg = RDLogger.logger()\n",
    "lg.setLevel(RDLogger.CRITICAL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9ba93f86",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data path: /r/hassounlab/spectra_data/msgym/MassSpecGym.tsv\n",
      "Processing formula spectra\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 231104/231104 [50:06<00:00, 76.87it/s] \n"
     ]
    }
   ],
   "source": [
    "# Load the run parameters and build the featurizers and the test dataset\n",
    "# param_pth = '/data/yzhouc01/FILIP-MS/experiments/20250824_filipContrastive/lightning_logs/version_0/hparams.yaml'\n",
    "param_pth = \"/data/yzhouc01/FILIP-MS/mvp/params_formSpec.yaml\"\n",
    "with open(param_pth) as f:\n",
    "    # NOTE(review): yaml.FullLoader accepts more than plain data; prefer\n",
    "    # yaml.safe_load if this file only holds plain mappings/lists/scalars -\n",
    "    # confirm before switching.\n",
    "    params = yaml.load(f, Loader=yaml.FullLoader)\n",
    "# params['dataset_pth'] = \"/data/yzhouc01/MVP/data/sample/data.tsv\"\n",
    "\n",
    "# The recorded run of this cell spent ~50 min in 'Processing formula spectra'\n",
    "# (231104 items), so avoid re-running it casually.\n",
    "spec_featurizer = get_spec_featurizer(params['spectra_view'], params)\n",
    "mol_featurizer = get_mol_featurizer(params['molecule_view'], params)\n",
    "dataset = get_test_ms_dataset(params['spectra_view'], params['molecule_view'], spec_featurizer, mol_featurizer, params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcb28630",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect number of formulas per spectrum\n",
    "n_formulas = [len(d.metadata['formulas']) for d in dataset.spectra]\n",
    "\n",
    "# Calculate mean and median\n",
    "mean_n_formulas = np.mean(n_formulas)\n",
    "median_n_formulas = np.median(n_formulas)\n",
    "\n",
    "# Plot the histogram on an explicit Axes, with mean and median marked\n",
    "fig, ax = plt.subplots()\n",
    "ax.hist(n_formulas, bins=30, alpha=0.7, color='skyblue')\n",
    "ax.axvline(mean_n_formulas, color='red', linestyle='dashed', linewidth=2, label=f'Mean: {mean_n_formulas:.2f}')\n",
    "ax.axvline(median_n_formulas, color='green', linestyle='dashed', linewidth=2, label=f'Median: {median_n_formulas:.2f}')\n",
    "ax.set_xlabel('Number of formulas per spectrum')\n",
    "ax.set_ylabel('Count')\n",
    "ax.set_title('Distribution of Number of Formulas per Spectrum (MIST labels)')\n",
    "ax.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b35bb5a4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "identifier MassSpecGymID0000001\n",
       "mzs 91.0542,125.0233,154.0499,155.0577,185.0961,20...\n",
       "intensities 0.24524524524524524,1.0,0.08008008008008008,0....\n",
       "smiles CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC\n",
       "inchikey VFMQMACUYWGDOJ\n",
       "formula C16H17NO4\n",
       "precursor_formula C16H18NO4\n",
       "parent_mass 287.115224\n",
       "precursor_mz 288.1225\n",
       "adduct [M+H]+\n",
       "instrument_type Orbitrap\n",
       "collision_energy 30.0\n",
       "fold train\n",
       "simulation_challenge True\n",
       "formulas [C16H17NO4]\n",
       "formula_mzs [288.1225]\n",
       "formula_intensities [1.0]\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset.metadata.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da07f08a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the subformula annotation JSON for one example spectrum.\n",
    "EXAMPLE_ROW = 123  # positional row in dataset.metadata used as the example\n",
    "spec_id = dataset.metadata.iloc[EXAMPLE_ROW]['identifier']\n",
    "file = os.path.join(params['subformula_dir_pth'], spec_id + \".json\")\n",
    "try:\n",
    "    with open(file) as f:\n",
    "        data = json.load(f)\n",
    "except OSError as err:\n",
    "    # The recorded run stopped here with PermissionError (Errno 13). Report an\n",
    "    # unreadable or missing file instead of aborting a full Run All; malformed\n",
    "    # JSON still raises.\n",
    "    data = None\n",
    "    print(f\"Could not read subformula file {file}: {err}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "spec",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}