Spaces:
Running
Running
File size: 6,130 Bytes
d9df210 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import pandas as pd
import pickle
import numpy as np
import mvp.utils.data as data_utils
import collections
import os
import requests
import tqdm
from multiprocessing import Pool
from urllib.parse import quote
from tqdm import tqdm
class NPClassProcess:
    """Namespace for the NPClassifier web-API worker used with multiprocessing pools."""

    @staticmethod
    def process_smiles(smiles):
        """Query the NPClassifier service for one SMILES string.

        Parameters
        ----------
        smiles : str
            SMILES string to classify.

        Returns
        -------
        tuple
            ``(smiles, result_dict)`` on success, ``(smiles, None)`` on any
            failure (network error, timeout, non-JSON response).
        """
        try:
            encoded_smiles = quote(smiles)
            url = f"https://npclassifier.gnps2.org/classify?smiles={encoded_smiles}"
            # timeout keeps a stalled request from hanging a pool worker forever
            r = requests.get(url, timeout=30)
            return (smiles, r.json())
        except Exception:
            # best-effort lookup: callers map None to the 'NA' class
            return (smiles, None)
def NPclass_from_smiles(pth, output_dir, n_processes=20):
    """Classify every unique SMILES in a TSV file via the NPClassifier API.

    Parameters
    ----------
    pth : str
        Path to a tab-separated file with a ``smiles`` column.
    output_dir : str
        Directory where the result pickle is written.
    n_processes : int, optional
        Number of worker processes issuing requests (default 20).

    Returns
    -------
    str
        Path of the pickle mapping each SMILES to its classification dict,
        or to the string ``'NA'`` when the lookup failed.
    """
    table = pd.read_csv(pth, sep='\t')
    smiles_list = table['smiles'].unique().tolist()
    with Pool(processes=n_processes) as workers:
        fetched = list(tqdm(workers.imap(NPClassProcess.process_smiles, smiles_list),
                            total=len(smiles_list)))
    smiles_to_class = {}
    n_failed = 0
    for smi, payload in fetched:
        if payload is None:
            smiles_to_class[smi] = 'NA'
            n_failed += 1
        else:
            smiles_to_class[smi] = payload
    result_path = os.path.join(output_dir, 'SMILES_TO_CLASS.pkl')
    with open(result_path, 'wb') as handle:
        pickle.dump(smiles_to_class, handle)
    print(f'Failed to process {n_failed} SMILES')
    print(f'result file saved to {result_path}')
    return result_path
def construct_NL_spec(pth, output_dir):
    """Convert fragment spectra to neutral-loss spectra and pickle the table.

    For each row the fragment m/z values are subtracted from the precursor
    m/z, losses <= 1.0 Da are dropped, the remaining peaks are sorted by the
    loss value, and the precursor m/z itself is appended with intensity 1.0.

    Parameters
    ----------
    pth : str
        Path to a tab-separated file with ``precursor_mz``, ``mzs`` and
        ``intensities`` columns (peak lists are comma-separated strings).
    output_dir : str
        Directory where the pickled DataFrame is written.

    Returns
    -------
    str
        Path of the written pickle file (``NL_spec.pkl``).
    """
    def _neutral_loss(row):
        # parse comma-separated peak lists into float32 arrays
        precursor = float(row['precursor_mz'])
        frag_mz = np.asarray(row['mzs'].split(','), dtype=np.float32)
        frag_int = np.asarray(row['intensities'].split(','), dtype=np.float32)
        losses = precursor - frag_mz
        keep = np.where(losses > 1.0)        # drop (near-)precursor losses
        losses = losses[keep]
        frag_int = frag_int[keep]
        order = np.argsort(losses)
        # append the precursor itself as a unit-intensity anchor peak
        out_mz = np.concatenate((losses[order], [precursor]))
        out_int = np.concatenate((frag_int[order], [1.0]))
        return out_mz, out_int

    spec_data = pd.read_csv(pth, sep='\t')
    spec_data[['mzs', 'intensities']] = spec_data.apply(_neutral_loss, axis=1, result_type='expand')
    file_pth = os.path.join(output_dir, 'NL_spec.pkl')
    with open(file_pth, 'wb') as f:
        pickle.dump(spec_data, f)
    return file_pth
def generate_cons_spec(pth, output_dir):
    """Merge all spectra of each SMILES into one consensus peak list.

    The peaks of every spectrum belonging to a compound are pooled and
    sorted by m/z; a sentinel precursor m/z of 10000.0 is attached so the
    merged record can be handled like a regular spectrum downstream.

    Parameters
    ----------
    pth : str
        Path to a tab-separated file with ``identifier``, ``smiles``,
        ``mzs``, ``intensities`` and ``fold`` columns.
    output_dir : str
        Unused; kept for signature parity with the other spectrum builders.

    Returns
    -------
    pandas.DataFrame
        One row per SMILES with pooled ``mzs``/``intensities`` arrays, the
        sentinel ``precursor_mz`` and the compound's cross-validation fold.
    """
    spec_data = pd.read_csv(pth, sep='\t')
    grouped = spec_data[['identifier', 'smiles', 'mzs', 'intensities', 'fold']].groupby('smiles').agg(
        {'identifier': list,
         'mzs': lambda col: ','.join(col),
         'intensities': lambda col: ','.join(col),
         'fold': list})
    fold_lookup = dict(zip(grouped.index.tolist(), grouped['fold'].tolist()))
    merged = {}
    for _, row in tqdm(grouped.iterrows(), total=len(grouped)):
        peak_mz = np.asarray(row['mzs'].split(','), dtype=np.float32)
        peak_int = np.asarray(row['intensities'].split(','), dtype=np.float32)
        order = np.argsort(peak_mz)
        merged[row.name] = {'mzs': peak_mz[order],
                            'intensities': peak_int[order],
                            # sentinel precursor marking a consensus spectrum
                            'precursor_mz': 10000.0,
                            'fold': fold_lookup[row.name][0]}
    df = pd.DataFrame.from_dict(merged, orient='index')
    return df.rename_axis('smiles').reset_index()
def generate_cons_spec_formulas(pth, subformula_dir, output_dir=''):
    """Build a consensus subformula spectrum for every SMILES.

    The subformula-annotated peaks of all spectra of a compound are pooled;
    for each formula the maximum observed m/z and maximum intensity are
    kept. Compounds whose spectra carry no subformula annotations fall back
    to a single precursor peak built from the precursor formula and m/z.

    Parameters
    ----------
    pth : str
        Path to a tab-separated file with ``identifier``, ``smiles``,
        ``fold``, ``precursor_mz``, ``formula`` and ``adduct`` columns.
    subformula_dir : str
        Directory read by ``data_utils.Subformula_Loader`` for the
        per-spectrum subformula annotations.
    output_dir : str, optional
        Unused; kept for signature parity with the other spectrum builders.

    Returns
    -------
    pandas.DataFrame
        One row per SMILES with formula / m/z / intensity lists, the
        compound's precursor m/z, fold and precursor formula.
    """
    # load tsv file
    spec_data = pd.read_csv(pth, sep='\t')
    # group spectra by SMILES; precursor_mz keeps the per-compound maximum
    data_by_smiles = spec_data[['identifier', 'smiles', 'fold', 'precursor_mz', 'formula', 'adduct']].groupby('smiles').agg(
        {'identifier': list, 'fold': list, 'formula': list, 'precursor_mz': "max", 'adduct': list})
    smiles_idx = data_by_smiles.index.tolist()
    smiles_to_id = dict(zip(smiles_idx, data_by_smiles['identifier'].tolist()))
    smiles_to_fold = dict(zip(smiles_idx, data_by_smiles['fold'].tolist()))
    smiles_to_precursorMz = dict(zip(smiles_idx, data_by_smiles['precursor_mz'].tolist()))
    smiles_to_precursorFormula = dict(zip(smiles_idx, data_by_smiles['formula'].tolist()))
    # load subformulas
    subformulaLoader = data_utils.Subformula_Loader(spectra_view='SpecFormula', dir_path=subformula_dir)
    id_to_spec = subformulaLoader(spec_data['identifier'].tolist())
    # combine spectra
    consensus_spectra = {}
    for smiles, ids in tqdm(smiles_to_id.items(), total=len(data_by_smiles)):
        pooled = collections.defaultdict(list)
        for spec_id in ids:  # 'spec_id' avoids shadowing the builtin 'id'
            if spec_id in id_to_spec:
                for key, values in id_to_spec[spec_id].items():
                    pooled[key].extend(values)
        cons_spec = pd.DataFrame(pooled)
        # all spectra of one compound must sit in the same CV fold
        assert len(set(smiles_to_fold[smiles])) == 1
        if 'formulas' in cons_spec.columns:
            # keep maxed mz and maxed intensity per formula
            cons_spec = cons_spec.groupby('formulas').agg({'formula_mzs': "max", 'formula_intensities': "max"})
            cons_spec.reset_index(inplace=True)
        else:
            # no subformula annotations at all: single precursor-peak fallback
            cons_spec = pd.DataFrame({
                'formulas': [smiles_to_precursorFormula[smiles][0]],
                'formula_mzs': [smiles_to_precursorMz[smiles]],
                'formula_intensities': [1.0],
            })
        cons_spec = cons_spec.sort_values(by='formula_mzs').reset_index(drop=True)
        consensus_spectra[smiles] = {'formulas': cons_spec['formulas'].tolist(),
                                     'formula_mzs': cons_spec['formula_mzs'].tolist(),
                                     'formula_intensities': cons_spec['formula_intensities'].tolist(),
                                     'precursor_mz': smiles_to_precursorMz[smiles],
                                     'fold': smiles_to_fold[smiles][0],
                                     'precursor_formula': smiles_to_precursorFormula[smiles][0]}  # formula without adduct...
    # save consensus spectra
    df = pd.DataFrame.from_dict(consensus_spectra, orient='index')
    df = df.rename_axis('smiles').reset_index()
    return df