{ "cells": [ { "cell_type": "markdown", "id": "8d4475ae", "metadata": {}, "source": [ "Visualize peak embeddings\n", "\n", "Peaks of the same formula and the same substructure should be clustered closer together." ] }, { "cell_type": "code", "execution_count": null, "id": "75410b71", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "id": "b906b7f7", "metadata": {}, "outputs": [], "source": [ "data = pd.read_csv(\"/r/hassounlab/spectra_data/msgym/MassSpecGym.tsv\", sep=\"\\t\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "d83156c1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | identifier | \n", "mzs | \n", "intensities | \n", "smiles | \n", "inchikey | \n", "formula | \n", "precursor_formula | \n", "parent_mass | \n", "precursor_mz | \n", "adduct | \n", "instrument_type | \n", "collision_energy | \n", "fold | \n", "simulation_challenge | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "MassSpecGymID0000001 | \n", "91.0542,125.0233,154.0499,155.0577,185.0961,20... | \n", "0.24524524524524524,1.0,0.08008008008008008,0.... | \n", "CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC | \n", "VFMQMACUYWGDOJ | \n", "C16H17NO4 | \n", "C16H18NO4 | \n", "287.115224 | \n", "288.1225 | \n", "[M+H]+ | \n", "Orbitrap | \n", "30.0 | \n", "train | \n", "True | \n", "
| 1 | \n", "MassSpecGymID0000002 | \n", "91.0542,125.0233,155.0577,185.0961,229.0859,24... | \n", "0.0990990990990991,0.28128128128128127,0.04004... | \n", "CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC | \n", "VFMQMACUYWGDOJ | \n", "C16H17NO4 | \n", "C16H18NO4 | \n", "287.115224 | \n", "288.1225 | \n", "[M+H]+ | \n", "Orbitrap | \n", "20.0 | \n", "train | \n", "True | \n", "
| 2 | \n", "MassSpecGymID0000003 | \n", "69.0343,91.0542,125.0233,127.039,153.0699,154.... | \n", "0.03403403403403404,0.31431431431431434,1.0,0.... | \n", "CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC | \n", "VFMQMACUYWGDOJ | \n", "C16H17NO4 | \n", "C16H18NO4 | \n", "287.115224 | \n", "288.1225 | \n", "[M+H]+ | \n", "Orbitrap | \n", "40.0 | \n", "train | \n", "True | \n", "
| 3 | \n", "MassSpecGymID0000004 | \n", "69.0343,91.0542,110.06,111.0441,112.0393,120.0... | \n", "0.17917917917917917,0.47347347347347346,0.0380... | \n", "CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC | \n", "VFMQMACUYWGDOJ | \n", "C16H17NO4 | \n", "C16H18NO4 | \n", "287.115224 | \n", "288.1225 | \n", "[M+H]+ | \n", "Orbitrap | \n", "55.0 | \n", "train | \n", "True | \n", "
| 4 | \n", "MassSpecGymID0000005 | \n", "91.0542,125.0233,185.0961,229.0859,246.1125,28... | \n", "0.07807807807807808,0.1841841841841842,0.03503... | \n", "CC(=O)N[C@@H](CC1=CC=CC=C1)C2=CC(=CC(=O)O2)OC | \n", "VFMQMACUYWGDOJ | \n", "C16H17NO4 | \n", "C16H18NO4 | \n", "287.115224 | \n", "288.1225 | \n", "[M+H]+ | \n", "Orbitrap | \n", "10.0 | \n", "train | \n", "True | \n", "
| \n", " | Subformula | \n", "Unique_SMILES | \n", "Spectra_Indices | \n", "n_smiles | \n", "has_nitrogen | \n", "has_oxygen | \n", "
|---|---|---|---|---|---|---|
| 1046 | \n", "C9H7NO | \n", "[COC1=CC(=CC(=C1)NC(=O)CN2C3=CC=CC=C3C(=CC2=O)... | \n", "[392, 396, 671, 674, 675, 677, 901, 905, 910, ... | \n", "2345 | \n", "True | \n", "True | \n", "
| 1052 | \n", "C10H9NO | \n", "[C1OC2=CC3=CC(=C(N=C3C=C2O1)N)C(=O)NC4=CC=CC=C... | \n", "[392, 393, 394, 396, 397, 398, 901, 902, 904, ... | \n", "1788 | \n", "True | \n", "True | \n", "
| 2587 | \n", "C8H7NO | \n", "[C1CCOC(C1)CN2C=C(C=N2)NC(=O)C3=CC4=C(C=C3)NC=... | \n", "[1696, 1698, 1699, 1704, 1705, 1706, 1707, 170... | \n", "1706 | \n", "True | \n", "True | \n", "
| 1159 | \n", "C5H5NO | \n", "[C1OC2=CC3=CC(=C(N=C3C=C2O1)N)C(=O)NC4=CC=CC=C... | \n", "[421, 422, 424, 425, 1047, 1721, 1725, 1727, 1... | \n", "1450 | \n", "True | \n", "True | \n", "
| 2787 | \n", "C4H5NO | \n", "[C1=CC=C(C=C1)C[C@@H](C(=O)N[C@@H](CCC(=O)N)C(... | \n", "[1779, 1780, 1787, 1789, 1790, 1791, 1793, 179... | \n", "1380 | \n", "True | \n", "True | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 2174 | \n", "C10H18N2O | \n", "[CC(C)[C@H](C(=O)NCC1=CC=C(C=C1)OC)NC(=O)C2CCN... | \n", "[1171, 1179, 51943, 51954, 62242, 82752, 82754... | \n", "100 | \n", "True | \n", "True | \n", "
| 6157 | \n", "C6H8NO2 | \n", "[CC1=CC(=O)OC2=C1C=CC(=C2)NC(=O)[C@@H]3CCCN3C(... | \n", "[4680, 6057, 7397, 8570, 9357, 21802, 23017, 2... | \n", "100 | \n", "True | \n", "True | \n", "
| 4045 | \n", "C3H7N2O2 | \n", "[CC(C)CS(=O)(=O)NCC(C1=COC=C1)N2CCSCC2, CCC1CN... | \n", "[2989, 7004, 9280, 9295, 9298, 20159, 35354, 3... | \n", "100 | \n", "True | \n", "True | \n", "
| 11009 | \n", "C15H10N2O2 | \n", "[CC(C)N1C=C(C=N1)NC(=O)CC2=CC=C(C=C2)OC3=NC=NC... | \n", "[10980, 10981, 10982, 10987, 36081, 36087, 360... | \n", "100 | \n", "True | \n", "True | \n", "
| 10951 | \n", "C9H5N2O | \n", "[CCOC(=O)C1=CN=C2C=CC(=NC2=C1Br)OC, CC(C)C1=C2... | \n", "[10908, 10914, 10915, 10956, 10957, 10958, 115... | \n", "100 | \n", "True | \n", "True | \n", "
547 rows × 6 columns
\n", "