File size: 2,561 Bytes
d9df210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import argparse
from mvp.utils.preprocessing import generate_cons_spec_formulas, generate_cons_spec
import os
import pickle
import pandas as pd
from rdkit.Chem import AllChem
from rdkit import Chem
from tqdm import tqdm

# Command-line interface. All paths are mandatory except the subformula
# directory, which check_args() only validates when --spec_type is 'formSpec'.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--spec_type",
    required=True,
    choices=('formSpec', 'binnedSpec'),
)
parser.add_argument(
    "--dataset_pth",
    help="path to spectra data",
    required=True,
)
parser.add_argument(
    "--candidates_pth",
    help="path to candidates data",
    required=True,
)
parser.add_argument(
    "--output_dir",
    help="path to output directory",
    required=True,
)
parser.add_argument(
    "--subformula_dir_pth",
    help="path to subformula directory if using formSpec",
    default='',
)


def check_args():
    """Validate the parsed CLI arguments and create the output directory.

    Reads the module-level ``args`` namespace. The input paths are checked
    first and the output directory is created last, so a run that fails
    validation does not leave an empty output directory behind.

    Raises:
        NotADirectoryError: if ``args.spec_type`` is ``'formSpec'`` and
            ``args.subformula_dir_pth`` is not an existing directory.
        FileNotFoundError: if ``args.dataset_pth`` or
            ``args.candidates_pth`` does not exist.
    """
    # Explicit exceptions rather than ``assert``: assert statements are
    # removed when Python runs with -O, which would silently skip the checks.
    if args.spec_type == 'formSpec' and not os.path.isdir(args.subformula_dir_pth):
        raise NotADirectoryError(
            "--subformula_dir_pth must be an existing directory when "
            f"--spec_type is 'formSpec': {args.subformula_dir_pth!r}"
        )

    required_paths = (
        ("dataset_pth", args.dataset_pth),
        ("candidates_pth", args.candidates_pth),
    )
    for flag, path in required_paths:
        if not os.path.exists(path):
            raise FileNotFoundError(f"--{flag} does not exist: {path!r}")

    # create output directory
    os.makedirs(args.output_dir, exist_ok=True)

def construct_smiles_to_fp(smiles_list, r=5, fp_size=1024):
    """Compute a Morgan fingerprint for each SMILES and pickle the mapping.

    Args:
        smiles_list: sized collection of SMILES strings; its length is used
            as the progress-bar total.
        r: value passed as ``radius`` to the Morgan fingerprint generator.
        fp_size: value passed as ``fpSize`` to the Morgan fingerprint
            generator.

    Entries for which no fingerprint can be produced are skipped and
    counted; the count is printed once the loop finishes. The resulting
    ``{smiles: fingerprint}`` dict is written to
    ``<args.output_dir>/morganfp_r{r}_{fp_size}.pickle`` (``args`` is the
    module-level parsed-arguments namespace).
    """
    fpgen = AllChem.GetMorganGenerator(radius=r, fpSize=fp_size)
    smiles_to_fp = {}
    failed_ct = 0

    for s in tqdm(smiles_list, total=len(smiles_list)):
        try:
            mol = Chem.MolFromSmiles(s)
            # RDKit reports an unparsable SMILES by returning None rather
            # than raising, so handle that case explicitly.
            if mol is None:
                failed_ct += 1
                continue
            fp = fpgen.GetFingerprint(mol)
        except Exception:
            # Best-effort: any per-entry failure (e.g. a non-string value)
            # is counted and skipped. ``Exception`` instead of a bare
            # ``except`` so Ctrl-C / SystemExit still stop the run.
            failed_ct += 1
            continue
        smiles_to_fp[s] = fp
    print(f'Failed to generate fingerprints for {failed_ct} smiles')

    # save smiles_to_fp
    with open(os.path.join(args.output_dir, f'morganfp_r{r}_{fp_size}.pickle'), 'wb') as f:
        pickle.dump(smiles_to_fp, f)

def construct_consensus_spectra():
    """Build consensus spectra for the selected spectrum type and pickle them.

    Reads the module-level ``args`` namespace. For ``'formSpec'`` the work is
    delegated to ``generate_cons_spec_formulas`` (dataset path, subformula
    directory, output directory); for ``'binnedSpec'`` to
    ``generate_cons_spec`` (dataset path, output directory). Whatever the
    helper returns is pickled to
    ``<args.output_dir>/consensus_<spec_type>.pkl``.

    Raises:
        ValueError: if ``args.spec_type`` is neither ``'formSpec'`` nor
            ``'binnedSpec'``.
    """
    if args.spec_type == 'formSpec':
        df = generate_cons_spec_formulas(args.dataset_pth, args.subformula_dir_pth, args.output_dir)
    elif args.spec_type == 'binnedSpec':
        df = generate_cons_spec(args.dataset_pth, args.output_dir)
    else:
        # Fail before the output file is opened; otherwise an unknown type
        # would create an empty file and then die on an unbound ``df``.
        raise ValueError(
            f"unsupported spec_type {args.spec_type!r}; "
            "expected 'formSpec' or 'binnedSpec'"
        )

    # save consensus spectra df
    with open(os.path.join(args.output_dir, f'consensus_{args.spec_type}.pkl'), 'wb') as f:
        pickle.dump(df, f)

def main(data):
    """Run both preprocessing steps: fingerprints, then consensus spectra.

    Args:
        data: table with a ``'smiles'`` column; the column's unique values
            are the molecules that get fingerprinted. The script entry
            point passes the DataFrame read from ``args.dataset_pth``.
    """
    # generate fingerprints
    print("Processing fingerprints...")
    unique_smiles = data['smiles'].unique().tolist()
    construct_smiles_to_fp(unique_smiles)

    # generate consensus spectra
    print("Processing consensus spectra...")
    construct_consensus_spectra()


if __name__ == '__main__':
    # With no ``__file__`` global (e.g. an interactive session) an empty
    # argument list is parsed; otherwise argparse falls back to sys.argv.
    args = parser.parse_args(None if "__file__" in globals() else [])

    # validate the inputs and prepare the output directory
    check_args()

    # the dataset is read as a tab-separated table
    data = pd.read_csv(args.dataset_pth, sep='\t')

    main(data)