# Page-header residue from the web file viewer (not part of the script):
#   Spaces: Sleeping
#   File size: 2,561 Bytes
#   d9df210
import argparse
from mvp.utils.preprocessing import generate_cons_spec_formulas, generate_cons_spec
import os
import pickle
import pandas as pd
from rdkit.Chem import AllChem
from rdkit import Chem
from tqdm import tqdm
def _build_parser():
    """Create the command-line parser used by this script."""
    cli = argparse.ArgumentParser()

    # which kind of consensus spectra to build
    cli.add_argument(
        "--spec_type",
        choices=('formSpec', 'binnedSpec'),
        required=True,
    )

    # mandatory input / output locations
    cli.add_argument(
        "--dataset_pth",
        required=True,
        help="path to spectra data",
    )
    cli.add_argument(
        "--candidates_pth",
        required=True,
        help="path to candidates data",
    )
    cli.add_argument(
        "--output_dir",
        required=True,
        help="path to output directory",
    )

    # optional; check_args() requires it to be a directory for formSpec
    cli.add_argument(
        "--subformula_dir_pth",
        default='',
        help="path to subformula directory if using formSpec",
    )
    return cli


parser = _build_parser()
def check_args():
    """Prepare the output directory and validate the input paths.

    Reads the module-level ``args`` namespace. ``args.output_dir`` is created
    if it does not exist yet, then every input path is checked.

    Raises:
        NotADirectoryError: ``args.spec_type`` is ``'formSpec'`` but
            ``args.subformula_dir_pth`` is not an existing directory.
        FileNotFoundError: ``args.dataset_pth`` or ``args.candidates_pth``
            does not exist.
    """
    # create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # check files -- explicit raises instead of ``assert`` so the checks are
    # not stripped when Python runs with -O, and so the user sees which path
    # is wrong
    if args.spec_type == 'formSpec' and not os.path.isdir(args.subformula_dir_pth):
        raise NotADirectoryError(
            "--subformula_dir_pth must be an existing directory when "
            f"--spec_type is 'formSpec': {args.subformula_dir_pth!r}"
        )
    if not os.path.exists(args.dataset_pth):
        raise FileNotFoundError(f"--dataset_pth does not exist: {args.dataset_pth!r}")
    if not os.path.exists(args.candidates_pth):
        raise FileNotFoundError(f"--candidates_pth does not exist: {args.candidates_pth!r}")
def construct_smiles_to_fp(smiles_list, r=5, fp_size=1024):
    """Compute a Morgan fingerprint per SMILES string and pickle the mapping.

    Entries that cannot be parsed or fingerprinted are skipped and counted;
    the count is printed once the loop is done. The resulting
    ``{smiles: fingerprint}`` dict is written to
    ``morganfp_r{r}_{fp_size}.pickle`` inside the module-level
    ``args.output_dir``.

    Args:
        smiles_list: SMILES strings to process (must support ``len``).
        r: Radius handed to the Morgan fingerprint generator.
        fp_size: ``fpSize`` handed to the Morgan fingerprint generator.
    """
    fpgen = AllChem.GetMorganGenerator(radius=r, fpSize=fp_size)
    smiles_to_fp = {}
    failed_ct = 0
    for s in tqdm(smiles_list, total=len(smiles_list)):
        try:
            # MolFromSmiles signals an unparsable SMILES by returning None
            # rather than raising, so check for it explicitly.
            mol = Chem.MolFromSmiles(s)
            fp = None if mol is None else fpgen.GetFingerprint(mol)
        except Exception:
            # Best effort: e.g. a non-string entry makes RDKit raise. Unlike a
            # bare ``except:`` this lets KeyboardInterrupt/SystemExit through.
            fp = None
        if fp is None:
            failed_ct += 1
            continue
        smiles_to_fp[s] = fp
    print(f'Failed to generate fingerprints for {failed_ct} smiles')

    # save smiles_to_fp
    with open(os.path.join(args.output_dir, f'morganfp_r{r}_{fp_size}.pickle'), 'wb') as f:
        pickle.dump(smiles_to_fp, f)
def construct_consensus_spectra():
    """Build the consensus spectra for the selected spectrum type and pickle them.

    Reads the module-level ``args`` namespace. Depending on ``args.spec_type``
    the work is delegated to ``generate_cons_spec_formulas`` (``'formSpec'``)
    or ``generate_cons_spec`` (``'binnedSpec'``); whatever that helper returns
    is pickled to ``consensus_{spec_type}.pkl`` inside ``args.output_dir``.

    Raises:
        ValueError: ``args.spec_type`` is neither ``'formSpec'`` nor
            ``'binnedSpec'``.
    """
    if args.spec_type == 'formSpec':
        df = generate_cons_spec_formulas(args.dataset_pth, args.subformula_dir_pth, args.output_dir)
    elif args.spec_type == 'binnedSpec':
        df = generate_cons_spec(args.dataset_pth, args.output_dir)
    else:
        # Without this branch an unexpected value would only surface below as
        # an UnboundLocalError on ``df``.
        raise ValueError(f"Unsupported spec_type: {args.spec_type!r}")

    # save consensus spectra df
    with open(os.path.join(args.output_dir, f'consensus_{args.spec_type}.pkl'), 'wb') as f:
        pickle.dump(df, f)
def main(data):
    """Run both preprocessing steps for the loaded dataset.

    Args:
        data: Table with a ``'smiles'`` column; the script's entry point
            passes the result of ``pd.read_csv``.
    """
    # generate fingerprints (one per distinct SMILES string)
    print("Processing fingerprints...")
    unique_smiles = data['smiles'].unique().tolist()
    construct_smiles_to_fp(unique_smiles)

    # generate consensus spectra
    print("Processing consensus spectra...")
    construct_consensus_spectra()
if __name__ == '__main__':
    # When ``__file__`` is not defined (e.g. an interactive session) parse an
    # empty argument list instead of ``sys.argv``. NOTE: several options are
    # declared as required, so argparse will exit with a usage error then.
    args = parser.parse_args([] if "__file__" not in globals() else None)
    check_args()

    # load data (tab-separated file)
    data = pd.read_csv(args.dataset_pth, sep='\t')
    main(data)