File size: 2,389 Bytes
7b7a7b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
""" magma_utils.py

Additional utility file to assist with fingerprinting.

"""

import os
from ast import literal_eval

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import re

def extract_adduct_ion(adduct, default='+H'):
    """Pull the ion part out of a bracketed adduct string.

    Searches ``adduct`` for the first ``[M<sign><ion>]`` group, where
    ``<sign>`` is ``+`` or ``-``, and returns everything between ``M`` and
    the closing bracket (sign included), e.g. ``"[M+Na]+"`` -> ``"+Na"``.

    Args:
        adduct: Adduct string to search.
        default: Value returned when no such group is present.

    Returns:
        The signed ion text, or ``default`` if the pattern does not match.
    """
    found = re.search(r"\[M([+-][^\]]+)\]", adduct)
    return default if found is None else found.group(1)


def get_magma_fingerprint(smile):
    """Compute the Morgan bit-vector fingerprint of a SMILES string.

    The molecule is parsed without RDKit's automatic sanitization and then
    sanitized with every operation except kekulization. A Morgan fingerprint
    (radius 2, 2048 bits) is computed and copied into a numpy array.

    Args:
        smile: SMILES string of the molecule.

    Returns:
        A ``numpy.uint8`` array holding the fingerprint bits.
    """
    molecule = Chem.MolFromSmiles(smile, sanitize=False)

    # Run all sanitization steps except kekulization (XOR clears that flag).
    sanitize_flags = (
        Chem.SanitizeFlags.SANITIZE_ALL ^ Chem.SanitizeFlags.SANITIZE_KEKULIZE
    )
    Chem.SanitizeMol(molecule, sanitizeOps=sanitize_flags)

    bit_vector = AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048)

    # Start from an empty buffer; ConvertToNumpyArray is relied on to size
    # and fill it in place from the bit vector.
    fp_array = np.zeros((0,), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(bit_vector, fp_array)
    return fp_array


def get_magma_fingerprint_bits(smile):
    """Return the indices of the set bits in a molecule's fingerprint.

    Args:
        smile: SMILES string of the molecule.

    Returns:
        A list of NumPy integer indices at which the fingerprint produced
        by ``get_magma_fingerprint`` equals 1, in ascending order.
    """
    fingerprint = get_magma_fingerprint(smile)
    # Compare the array directly: converting it to a Python list and back
    # to an array first only boxes every bit without changing the result.
    return list(np.where(np.asarray(fingerprint) == 1)[0])


def read_magma_file(magma_frag_file):
    """Load a magma fragment table from a tab-separated file.

    Args:
        magma_frag_file: Path to the fragment file, or ``None``.

    Returns:
        The parsed DataFrame (first file column used as the index) with the
        ``smiles`` and ``chemical_formula`` columns converted from strings
        to lists, and a ``mass_to_charge`` column copied from ``mz`` when
        the file does not already provide one.

        An empty DataFrame with the columns ``mass_to_charge``,
        ``intensity``, ``chemical_formula``, ``smiles`` and
        ``molecule_peak`` is returned instead when the path is ``None``,
        does not exist, points to a zero-byte file, or the file lacks a
        ``smiles`` or ``chemical_formula`` column.
    """
    file_is_usable = (
        magma_frag_file is not None
        and os.path.exists(magma_frag_file)
        and os.path.getsize(magma_frag_file) > 0
    )

    if file_is_usable:
        frag_table = pd.read_csv(magma_frag_file, index_col=0, sep="\t")

        list_columns = ("smiles", "chemical_formula")
        if all(name in frag_table.columns for name in list_columns):
            for name in list_columns:
                frag_table = _convert_str_to_list(frag_table, name)

            # Files are not consistent about the m/z column name: when
            # "mass_to_charge" is absent, mirror it from "mz".
            if "mass_to_charge" not in frag_table.columns:
                frag_table["mass_to_charge"] = frag_table["mz"]

            return frag_table

    # Nothing usable was read: hand back an empty table with the expected
    # column layout.
    fallback_columns = [
        "mass_to_charge",
        "intensity",
        "chemical_formula",
        "smiles",
        "molecule_peak",
    ]
    return pd.DataFrame(columns=fallback_columns)


def _convert_str_to_list(df, column):
    """Parse a column of stringified Python literals into real objects.

    Each cell of ``df[column]`` is converted as follows:

    * a value that is already a ``list`` is kept unchanged, so calling this
      helper twice on the same column is harmless;
    * a missing value (``None``/NaN/``pd.NA``) or the literal string
      ``"NAN"`` becomes an empty list;
    * anything else is passed to ``ast.literal_eval``.

    Args:
        df: DataFrame to modify; the column is replaced on this object.
        column: Name of the column to convert.

    Returns:
        The same ``df`` object, with ``column`` replaced.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when
            a cell is not a valid Python literal.
    """

    def _parse_cell(cell):
        if isinstance(cell, list):
            return cell
        # Test for missing values first: comparing pd.NA with a string
        # yields pd.NA, which cannot be used in a boolean context.
        if pd.isna(cell) or cell == "NAN":
            return []
        return literal_eval(cell)

    # Replace the whole column instead of writing through ``.loc``: the
    # parsed values are Python lists, which cannot be stored in the existing
    # array when the column was read as float (all NaN) or as a string dtype.
    df[column] = df[column].apply(_parse_cell)
    return df