MVP / mvp /utils /preprocessing.py
yzhouchen001's picture
model code
d9df210
raw
history blame
6.13 kB
import pandas as pd
import pickle
import numpy as np
import mvp.utils.data as data_utils
import collections
import os
import requests
import tqdm
from multiprocessing import Pool
from urllib.parse import quote
from tqdm import tqdm
class NPClassProcess:
    """Helpers for annotating SMILES strings with natural-product classes
    via the GNPS NPClassifier web service."""

    @staticmethod
    def process_smiles(smiles):
        """Query NPClassifier for a single SMILES string.

        Parameters
        ----------
        smiles : str
            SMILES string to classify (URL-quoted before the request).

        Returns
        -------
        tuple
            ``(smiles, parsed_json)`` on success, ``(smiles, None)`` on
            any failure (network error, non-JSON response, ...).
        """
        try:
            encoded_smiles = quote(smiles)
            url = f"https://npclassifier.gnps2.org/classify?smiles={encoded_smiles}"
            # timeout keeps a pool worker from hanging forever on a dead connection
            r = requests.get(url, timeout=30)
            return (smiles, r.json())
        except Exception:
            # best-effort lookup: the caller records failures as None.
            # `except Exception` (not a bare `except:`) so Ctrl-C /
            # SystemExit still propagate out of worker processes.
            return (smiles, None)
def NPclass_from_smiles(pth, output_dir, n_processes=20):
    """Classify every unique SMILES in a TSV file with NPClassifier.

    Reads the tab-separated file at ``pth``, queries the NPClassifier web
    service once per unique value of the 'smiles' column (in parallel)
    and pickles the resulting ``{smiles: classification}`` mapping.

    Parameters
    ----------
    pth : str
        Path to a TSV file containing a 'smiles' column.
    output_dir : str
        Directory where 'SMILES_TO_CLASS.pkl' is written.
    n_processes : int, optional
        Number of worker processes for the request pool.

    Returns
    -------
    str
        Path of the pickle file that was written.
    """
    table = pd.read_csv(pth, sep='\t')
    items = table['smiles'].unique().tolist()
    with Pool(processes=n_processes) as pool:
        results = list(tqdm(pool.imap(NPClassProcess.process_smiles, items), total=len(items)))
    # Failed lookups (payload is None) are recorded under the sentinel 'NA'.
    smiles_to_class = {}
    failed_ct = 0
    for smi, payload in results:
        if payload is None:
            failed_ct += 1
            smiles_to_class[smi] = 'NA'
        else:
            smiles_to_class[smi] = payload
    file_pth = os.path.join(output_dir, 'SMILES_TO_CLASS.pkl')
    with open(file_pth, 'wb') as f:
        pickle.dump(smiles_to_class, f)
    print(f'Failed to process {failed_ct} SMILES')
    print(f'result file saved to {file_pth}')
    return file_pth
def construct_NL_spec(pth, output_dir):
    """Convert the spectra in a TSV file to neutral-loss (NL) spectra.

    For each row, fragment m/z values are subtracted from the precursor
    m/z; losses <= 1.0 Da are discarded, the remaining peaks are sorted
    by loss mass, and the precursor m/z itself is appended with
    intensity 1.0.  The transformed table is pickled to
    ``<output_dir>/NL_spec.pkl``.

    Parameters
    ----------
    pth : str
        TSV file with 'precursor_mz', 'mzs' and 'intensities' columns
        ('mzs'/'intensities' are comma-separated number strings).
    output_dir : str
        Directory where the pickle is written.

    Returns
    -------
    str
        Path of the pickle file that was written.
    """
    def _to_neutral_loss(row):
        # parse the comma-separated peak lists into float32 arrays
        frag_mz = np.array([float(v) for v in row["mzs"].split(",")], dtype=np.float32)
        frag_int = np.array([float(v) for v in row["intensities"].split(",")], dtype=np.float32)
        precursor = float(row['precursor_mz'])
        losses = precursor - frag_mz
        keep = np.where(losses > 1.0)  # drop tiny / negative losses
        losses = losses[keep]
        frag_int = frag_int[keep]
        order = np.argsort(losses)
        # append the precursor peak with unit intensity at the end
        nl_mzs = np.concatenate((losses[order], [precursor]))
        nl_ints = np.concatenate((frag_int[order], [1.0]))
        return nl_mzs, nl_ints

    spec_data = pd.read_csv(pth, sep='\t')
    spec_data[['mzs', 'intensities']] = spec_data.apply(_to_neutral_loss, axis=1, result_type='expand')
    file_pth = os.path.join(output_dir, 'NL_spec.pkl')
    with open(file_pth, 'wb') as f:
        pickle.dump(spec_data, f)
    return file_pth
def generate_cons_spec(pth, output_dir):
    """Build one consensus spectrum per SMILES by pooling all its spectra.

    The peaks of every spectrum sharing a SMILES are merged (the
    comma-separated peak strings are joined), sorted by m/z, and stored
    with a fixed sentinel precursor m/z of 10000.0 and the fold of the
    first member spectrum.

    Parameters
    ----------
    pth : str
        TSV file with 'identifier', 'smiles', 'mzs', 'intensities' and
        'fold' columns.
    output_dir : str
        Unused here; kept for interface symmetry with the other helpers.

    Returns
    -------
    pandas.DataFrame
        One row per SMILES with columns 'smiles', 'mzs', 'intensities',
        'precursor_mz' and 'fold'.
    """
    spec_data = pd.read_csv(pth, sep='\t')
    grouped = spec_data[['identifier', 'smiles', 'mzs', 'intensities', 'fold']].groupby('smiles').agg(
        {'identifier': list, 'mzs': lambda x: ','.join(x), 'intensities': lambda x: ','.join(x), 'fold': list})
    fold_by_smiles = dict(zip(grouped.index.tolist(), grouped['fold'].tolist()))
    consensus_spectra = {}
    for _, row in tqdm(grouped.iterrows(), total=len(grouped)):
        peak_mz = np.array([float(v) for v in row["mzs"].split(",")], dtype=np.float32)
        peak_int = np.array([float(v) for v in row["intensities"].split(",")], dtype=np.float32)
        order = np.argsort(peak_mz)
        smiles = row.name
        consensus_spectra[smiles] = {
            'mzs': peak_mz[order],
            'intensities': peak_int[order],
            'precursor_mz': 10000.0,  # sentinel precursor for consensus spectra
            'fold': fold_by_smiles[smiles][0],
        }
    df = pd.DataFrame.from_dict(consensus_spectra, orient='index')
    return df.rename_axis('smiles').reset_index()
def generate_cons_spec_formulas(pth, subformula_dir, output_dir=''):
    """Build one consensus subformula spectrum per SMILES.

    Spectra are grouped by SMILES; the per-peak subformula annotations of
    all member spectra (loaded via ``data_utils.Subformula_Loader``) are
    pooled, deduplicated by formula (keeping the max m/z and max
    intensity observed per formula) and sorted by m/z.  A SMILES with no
    subformula annotation falls back to a single peak at its precursor
    m/z carrying the precursor formula.

    Parameters
    ----------
    pth : str
        TSV file with 'identifier', 'smiles', 'fold', 'precursor_mz',
        'formula' and 'adduct' columns.
    subformula_dir : str
        Directory handed to ``data_utils.Subformula_Loader``.
    output_dir : str, optional
        Unused; kept for interface symmetry with the other helpers.

    Returns
    -------
    pandas.DataFrame
        One row per SMILES with columns 'smiles', 'formulas',
        'formula_mzs', 'formula_intensities', 'precursor_mz', 'fold'
        and 'precursor_formula'.
    """
    # load tsv file
    spec_data = pd.read_csv(pth, sep='\t')
    # group spectra by SMILES
    data_by_smiles = spec_data[['identifier', 'smiles', 'fold', 'precursor_mz', 'formula', 'adduct']] \
        .groupby('smiles').agg({'identifier': list, 'fold': list, 'formula': list,
                                'precursor_mz': "max", 'adduct': list})
    smiles_list = data_by_smiles.index.tolist()
    smiles_to_id = dict(zip(smiles_list, data_by_smiles['identifier'].tolist()))
    smiles_to_fold = dict(zip(smiles_list, data_by_smiles['fold'].tolist()))
    smiles_to_precursorMz = dict(zip(smiles_list, data_by_smiles['precursor_mz'].tolist()))
    smiles_to_precursorFormula = dict(zip(smiles_list, data_by_smiles['formula'].tolist()))
    # load per-spectrum subformula annotations
    subformulaLoader = data_utils.Subformula_Loader(spectra_view='SpecFormula', dir_path=subformula_dir)
    id_to_spec = subformulaLoader(spec_data['identifier'].tolist())
    # combine spectra
    consensus_spectra = {}
    for smiles, ids in tqdm(smiles_to_id.items(), total=len(data_by_smiles)):
        pooled = collections.defaultdict(list)
        for spec_id in ids:  # renamed from `id` to avoid shadowing the builtin
            if spec_id in id_to_spec:
                for k, v in id_to_spec[spec_id].items():
                    pooled[k].extend(v)
        cons_spec = pd.DataFrame(pooled)
        # all spectra of one SMILES must belong to the same CV fold
        assert(len(set(smiles_to_fold[smiles]))==1)
        if 'formulas' in cons_spec.columns:
            # keep maxed mz and maxed intensity per formula
            cons_spec = cons_spec.groupby('formulas').agg({'formula_mzs': "max", 'formula_intensities': "max"})
            cons_spec.reset_index(inplace=True)
        else:
            # no member spectrum had subformula annotations: fall back to a
            # single precursor peak (was hidden behind a bare `except:` before,
            # which also masked unrelated errors)
            cons_spec = pd.DataFrame({
                'formulas': [smiles_to_precursorFormula[smiles][0]],
                'formula_mzs': [smiles_to_precursorMz[smiles]],
                'formula_intensities': [1.0],
            })
        cons_spec = cons_spec.sort_values(by='formula_mzs').reset_index(drop=True)
        consensus_spectra[smiles] = {'formulas': cons_spec['formulas'].tolist(),
                                     'formula_mzs': cons_spec['formula_mzs'].tolist(),
                                     'formula_intensities': cons_spec['formula_intensities'].tolist(),
                                     'precursor_mz': smiles_to_precursorMz[smiles],
                                     'fold': smiles_to_fold[smiles][0],
                                     'precursor_formula': smiles_to_precursorFormula[smiles][0]}  # formula without adduct...
    # save consensus spectra
    df = pd.DataFrame.from_dict(consensus_spectra, orient='index')
    df = df.rename_axis('smiles').reset_index()
    return df