Spaces:

HassounLab
/

MVP

Sleeping

App Files Files Community

yzhouchen001 committed on Oct 24

Commit

c65d76d

1 Parent(s): b6aab14

main scripts

Browse files

Files changed (35) hide show

mvp/__pycache__/__init__.cpython-311.pyc +0 -0
mvp/__pycache__/definitions.cpython-311.pyc +0 -0
mvp/data/__pycache__/__init__.cpython-311.pyc +0 -0
mvp/data/__pycache__/data_module.cpython-311.pyc +0 -0
mvp/data/__pycache__/datasets.cpython-311.pyc +0 -0
mvp/data/__pycache__/transforms.cpython-311.pyc +0 -0
mvp/models/__pycache__/__init__.cpython-311.pyc +0 -0
mvp/models/__pycache__/base.cpython-311.pyc +0 -0
mvp/models/__pycache__/contrastive.cpython-311.pyc +0 -0
mvp/models/__pycache__/contrastive_interaction.cpython-311.pyc +0 -0
mvp/models/__pycache__/encoders.cpython-311.pyc +0 -0
mvp/models/__pycache__/mol_encoder.cpython-311.pyc +0 -0
mvp/models/__pycache__/spec_encoder.cpython-311.pyc +0 -0
mvp/params_tmp.yaml +0 -0
mvp/subformula_assign/__pycache__/assign_subformulae.cpython-311.pyc +0 -0
mvp/subformula_assign/assign_subformulae.py +216 -0
mvp/subformula_assign/run.sh +14 -0
mvp/subformula_assign/utils/__init__.py +5 -0
mvp/subformula_assign/utils/__pycache__/__init__.cpython-311.pyc +0 -0
mvp/subformula_assign/utils/__pycache__/chem_utils.cpython-311.pyc +0 -0
mvp/subformula_assign/utils/__pycache__/parallel_utils.cpython-311.pyc +0 -0
mvp/subformula_assign/utils/__pycache__/parse_utils.cpython-311.pyc +0 -0
mvp/subformula_assign/utils/__pycache__/spectra_utils.cpython-311.pyc +0 -0
mvp/subformula_assign/utils/chem_utils.py +616 -0
mvp/subformula_assign/utils/parallel_utils.py +84 -0
mvp/subformula_assign/utils/parse_utils.py +295 -0
mvp/subformula_assign/utils/spectra_utils.py +326 -0
mvp/utils/__pycache__/__init__.cpython-311.pyc +0 -0
mvp/utils/__pycache__/data.cpython-311.pyc +0 -0
mvp/utils/__pycache__/debug.cpython-311.pyc +0 -0
mvp/utils/__pycache__/eval.cpython-311.pyc +0 -0
mvp/utils/__pycache__/general.cpython-311.pyc +0 -0
mvp/utils/__pycache__/loss.cpython-311.pyc +0 -0
mvp/utils/__pycache__/models.cpython-311.pyc +0 -0
mvp/utils/__pycache__/preprocessing.cpython-311.pyc +0 -0

mvp/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (155 Bytes). View file

mvp/__pycache__/definitions.cpython-311.pyc ADDED Viewed

Binary file (956 Bytes). View file

mvp/data/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (211 Bytes). View file

mvp/data/__pycache__/data_module.cpython-311.pyc ADDED Viewed

Binary file (4.34 kB). View file

mvp/data/__pycache__/datasets.cpython-311.pyc ADDED Viewed

Binary file (24 kB). View file

mvp/data/__pycache__/transforms.cpython-311.pyc ADDED Viewed

Binary file (12 kB). View file

mvp/models/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (347 Bytes). View file

mvp/models/__pycache__/base.cpython-311.pyc ADDED Viewed

Binary file (9.6 kB). View file

mvp/models/__pycache__/contrastive.cpython-311.pyc ADDED Viewed

Binary file (25.1 kB). View file

mvp/models/__pycache__/contrastive_interaction.cpython-311.pyc ADDED Viewed

Binary file (1.46 kB). View file

mvp/models/__pycache__/encoders.cpython-311.pyc ADDED Viewed

Binary file (2.32 kB). View file

mvp/models/__pycache__/mol_encoder.cpython-311.pyc ADDED Viewed

Binary file (3.57 kB). View file

mvp/models/__pycache__/spec_encoder.cpython-311.pyc ADDED Viewed

Binary file (6.02 kB). View file

mvp/params_tmp.yaml ADDED Viewed

File without changes

mvp/subformula_assign/__pycache__/assign_subformulae.cpython-311.pyc ADDED Viewed

Binary file (10.3 kB). View file

mvp/subformula_assign/assign_subformulae.py ADDED Viewed

	@@ -0,0 +1,216 @@

+""" assign_subformulae.py
+Copied from https://github.com/samgoldman97/mist/blob/main_v2/src/mist/subformulae/assign_subformulae.py
+Given a set of spectra and candidates from a labels file, assign subformulae and save to JSON files.
+"""
+from pathlib import Path
+import argparse
+from functools import partial
+import numpy as np
+import pandas as pd
+import json
+import os
+from tqdm import tqdm
+import utils
+def get_args():
+    """get args"""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--feature-id",
+        default="ID",
+        help="ID key in mgf input"
+    )
+    parser.add_argument(
+        "--spec-files",
+        default="data/paired_spectra/canopus_train/spec_files/",
+        help="Spec files; either MGF or directory.",
+    )
+    parser.add_argument("--output-dir", default=None,
+                        help="Name of output dir.")
+    parser.add_argument(
+        "--labels-file",
+        default="data/paired_spectra/canopus_train/labels.tsv",
+        help="Labels file",
+    )
+    parser.add_argument(
+        "--debug", action="store_true", default=False, help="Debug flag."
+    )
+    parser.add_argument(
+        "--mass-diff-type",
+        default="ppm",
+        type=str,
+        help="Type of mass difference - absolute differece (abs) or relative difference (ppm).",
+    )
+    parser.add_argument(
+        "--mass-diff-thresh",
+        action="store",
+        default=20,
+        type=float,
+        help="Threshold of mass difference.",
+    )
+    parser.add_argument(
+        "--inten-thresh",
+        action="store",
+        default=0.001,
+        type=float,
+        help="Threshold of MS2 subpeak intensity (normalized to 1).",
+    )
+    parser.add_argument(
+        "--max-formulae",
+        action="store",
+        default=50,
+        type=int,
+        help="Max number of peaks to keep",
+    )
+    parser.add_argument(
+        "--num-workers", action="store", default=32, type=int, help="num workers"
+    )
+    return parser.parse_args()
+def process_spec_file(spec_name: str, spec_files: str, max_inten=0.001, max_peaks=60):
+    """_summary_
+    Args:
+        spec_name (str): _description_
+        spec_files (str): _description_
+        max_inten (float, optional): _description_. Defaults to 0.001.
+        max_peaks (int, optional): _description_. Defaults to 60.
+    Returns:
+        _type_: _description_
+    """
+    spec_file = Path(spec_files) / f"{spec_name}.ms"
+    meta, tuples = utils.parse_spectra(spec_file)
+    spec = utils.process_spec_file(meta, tuples)
+    return spec_name, spec
+def assign_subforms(spec_files, labels_file,
+                    mass_diff_thresh: int = 20,
+                    mass_diff_type: str = "ppm",
+                    inten_thresh: float = 0.001,
+                    output_dir=None,
+                    num_workers: int = 32,
+                    feature_id="ID",
+                    max_formulae: int = 50,
+                    debug=False):
+    """_summary_
+    Args:
+        spec_files (_type_): _description_
+        labels_file (_type_): _description_
+        mass_diff_thresh (int, optional): _description_. Defaults to 20.
+        mass_diff_type (str, optional): _description_. Defaults to "ppm".
+        inten_thresh (float, optional): _description_. Defaults to 0.001.
+        output_dir (_type_, optional): _description_. Defaults to None.
+        num_workers (int, optional): _description_. Defaults to 32.
+        feature_id (str, optional): _description_. Defaults to "ID".
+        max_formulae (int, optional): _description_. Defaults to 50.
+        debug (bool, optional): _description_. Defaults to False.
+    Raises:
+        ValueError: _description_
+    """
+    spec_files = Path(spec_files)
+    label_path = Path(labels_file)
+    # Read in labels
+    labels_df = pd.read_csv(label_path, sep="\t").astype(str)
+    if spec_files.suffix == ".tsv": # YZC msgym-like data
+        labels_df.rename(columns={'identifier': 'spec',
+            'adduct': 'ionization'}, inplace=True)
+    if debug:
+        labels_df = labels_df[:50]
+    # Define output directory name
+    output_dir = Path(output_dir)
+    if output_dir is None:
+        subform_dir = label_path.parent / "subformulae"
+        output_dir_name = f"subform_{max_formulae}"
+        output_dir = subform_dir / output_dir_name
+    output_dir.mkdir(exist_ok=True, parents=True)
+    if spec_files.suffix == ".mgf":
+        # Input specs
+        parsed_specs = utils.parse_spectra_mgf(spec_files)
+        input_specs = [utils.process_spec_file(*i) for i in parsed_specs]
+        spec_names = [i[0][feature_id] for i in parsed_specs]
+        input_specs = list(zip(spec_names, input_specs))
+    elif spec_files.is_dir():
+        spec_fn_lst = labels_df["spec"].to_list()
+        proc_spec_full = partial(
+            process_spec_file,
+            spec_files=spec_files,
+            max_inten=inten_thresh,
+            max_peaks=max_formulae,
+        )
+        # input_specs = [proc_spec_full(i) for i in tqdm(spec_fn_lst)]
+        input_specs = utils.chunked_parallel(
+            spec_fn_lst, proc_spec_full, chunks=100, max_cpu=max(num_workers, 1)
+        )
+    elif spec_files.suffix == '.tsv':
+        parsed_specs = utils.parse_spectra_msgym(labels_df)
+        input_specs = [utils.process_spec_file(*i) for i in parsed_specs]
+        spec_names = [i[0][feature_id] for i in parsed_specs]
+        input_specs = list(zip(spec_names, input_specs))
+    else:
+        raise ValueError(f"Spec files arg {spec_files} is not a dir or mgf")
+    # input_specs contains a list of tuples (spec, subpeak tuple array)
+    input_specs_dict = {tup[0]: tup[1] for tup in input_specs}
+    export_dicts, spec_names = [], []
+    for _, row in labels_df.iterrows():
+        spec = str(row["spec"])
+        new_entry = {
+            "spec": input_specs_dict[spec],
+            "form": row["formula"],
+            "mass_diff_type": mass_diff_type,
+            "spec_name": spec,
+            "mass_diff_thresh": mass_diff_thresh,
+            "ion_type": row["ionization"],
+        }
+        spec_names.append(spec)
+        export_dicts.append(new_entry)
+    # Build dicts
+    print(f"There are {len(export_dicts)} spec-cand pairs this spec files")
+    def export_wrapper(x): return utils.get_output_dict(**x)
+    if debug:
+        output_dict_lst = [export_wrapper(i) for i in export_dicts[:10]]
+    else:
+        output_dict_lst = utils.chunked_parallel(
+            export_dicts, export_wrapper, chunks=100, max_cpu=max(num_workers, 1)
+        )
+    assert len(export_dicts) == len(output_dict_lst)
+    # Write all output jsons to files
+    os.makedirs(output_dir, exist_ok=True)
+    print(f"Writing output to {output_dir}")
+    for output_dict, spec_name in tqdm(zip(output_dict_lst, spec_names)):
+        with open(output_dir / f"{spec_name}.json", "w") as f:
+            json.dump(output_dict, f, indent=4)
+            f.close()
+if __name__ == "__main__":
+    args = get_args()
+    assign_subforms(spec_files=args.spec_files,
+                    labels_file=args.labels_file,
+                    mass_diff_thresh=args.mass_diff_thresh,
+                    mass_diff_type=args.mass_diff_type,
+                    inten_thresh=args.inten_thresh,
+                    output_dir=args.output_dir,
+                    num_workers=args.num_workers,
+                    feature_id=args.feature_id,
+                    max_formulae=args.max_formulae,
+                    debug=args.debug)

mvp/subformula_assign/run.sh ADDED Viewed

	@@ -0,0 +1,14 @@

+#!/usr/bin/env bash
+# Run subformula assignment for one dataset.
+# Stop on the first error, on unset variables, and on failures inside pipes.
+set -euo pipefail
+# Example configuration for the sample data:
+# SPEC_FILES="../data/sample/data.tsv"
+# OUTPUT_DIR="/data/sample/subformulae"
+# MAX_FORMULAE=60
+# LABELS_FILE="../data/sample/data.tsv"
+# python assign_subformulae.py --spec-files "$SPEC_FILES" --output-dir "$OUTPUT_DIR" --max-formulae "$MAX_FORMULAE" --labels-file "$LABELS_FILE"
+SPEC_FILES="/data/yzhouc01/cancer/data.tsv"
+OUTPUT_DIR="/data/yzhouc01/cancer/subformulae"
+MAX_FORMULAE=60
+LABELS_FILE="/data/yzhouc01/cancer/data.tsv"
+# Quote every expansion so paths containing spaces stay a single argument.
+python assign_subformulae.py --spec-files "$SPEC_FILES" --output-dir "$OUTPUT_DIR" --max-formulae "$MAX_FORMULAE" --labels-file "$LABELS_FILE"

mvp/subformula_assign/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .parse_utils import *
+from .chem_utils import *
+from .parallel_utils import *
+from .spectra_utils import *

mvp/subformula_assign/utils/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (302 Bytes). View file

mvp/subformula_assign/utils/__pycache__/chem_utils.cpython-311.pyc ADDED Viewed

Binary file (24.9 kB). View file

mvp/subformula_assign/utils/__pycache__/parallel_utils.cpython-311.pyc ADDED Viewed

Binary file (3.7 kB). View file

mvp/subformula_assign/utils/__pycache__/parse_utils.cpython-311.pyc ADDED Viewed

Binary file (16.4 kB). View file

mvp/subformula_assign/utils/__pycache__/spectra_utils.cpython-311.pyc ADDED Viewed

Binary file (12 kB). View file

mvp/subformula_assign/utils/chem_utils.py ADDED Viewed

	@@ -0,0 +1,616 @@

+"""chem_utils.py"""
+import re
+import numpy as np
+import pandas as pd
+import json
+from functools import reduce
+from collections import defaultdict
+import torch
+from rdkit import Chem
+from rdkit.Chem import Atom
+from rdkit.Chem.rdMolDescriptors import CalcMolFormula
+from rdkit.Chem.Descriptors import ExactMolWt
+from rdkit.Chem.MolStandardize import rdMolStandardize
+# RDKit periodic table, used below to look up isotope masses.
+P_TBL = Chem.GetPeriodicTable()
+# NOTE(review): not referenced in the visible part of this module;
+# presumably a rounding precision — confirm against callers.
+ROUND_FACTOR = 4
+# Electron rest mass, subtracted/added when computing ion mass offsets.
+ELECTRON_MASS = 0.00054858
+# Regex with two groups: an element symbol (capital letter plus optional
+# lowercase letters) followed by an optional integer count.
+CHEM_FORMULA_SIZE = "([A-Z][a-z]*)([0-9]*)"
+# Elements supported by the dense formula encoding. The order fixes the
+# index of each element in every vector built below.
+VALID_ELEMENTS = [
+    "C",
+    "H",
+    "As",
+    "B",
+    "Br",
+    "Cl",
+    "Co",
+    "F",
+    "Fe",
+    "I",
+    "K",
+    "N",
+    "Na",
+    "O",
+    "P",
+    "S",
+    "Se",
+    "Si",
+]
+# Atomic number of each valid element, in VALID_ELEMENTS order.
+VALID_ATOM_NUM = [Atom(i).GetAtomicNum() for i in VALID_ELEMENTS]
+CHEM_ELEMENT_NUM = len(VALID_ELEMENTS)
+# Lookup tensor: row = atomic number, column = index in VALID_ELEMENTS.
+ATOM_NUM_TO_ONEHOT = torch.zeros((max(VALID_ATOM_NUM) + 1, CHEM_ELEMENT_NUM))
+# Convert to onehot
+ATOM_NUM_TO_ONEHOT[VALID_ATOM_NUM, torch.arange(CHEM_ELEMENT_NUM)] = 1
+# Mass of the most common isotope of each valid element.
+VALID_MONO_MASSES = np.array(
+    [P_TBL.GetMostCommonIsotopeMass(i) for i in VALID_ELEMENTS]
+)
+# Same masses as a column vector, so they can be stacked next to the one-hots.
+CHEM_MASSES = VALID_MONO_MASSES[:, None]
+# One-hot row per element.
+ELEMENT_VECTORS = np.eye(len(VALID_ELEMENTS))
+# One-hot row per element with that element's mass appended as the last column.
+ELEMENT_VECTORS_MASS = np.hstack([ELEMENT_VECTORS, CHEM_MASSES])
+ELEMENT_TO_MASS = dict(zip(VALID_ELEMENTS, CHEM_MASSES.squeeze()))
+ELEMENT_DIM_MASS = len(ELEMENT_VECTORS_MASS[0])
+ELEMENT_DIM = len(ELEMENT_VECTORS[0])
+# Reasonable normalization vector for elements
+# Estimated by max counts (+ 1 when zero)
+NORM_VEC = np.array([81, 158, 2, 1, 3, 10, 1, 17, 1, 6, 1, 19, 2, 34, 6, 6, 2, 6])
+# NORM_VEC with one extra entry (1471) that scales the trailing mass column.
+NORM_VEC_MASS = np.array(NORM_VEC.tolist() + [1471])
+# Assume 64 is the highest repeat of any 1 atom
+MAX_ELEMENT_NUM = 64
+# Element symbol -> index / one-hot vector / one-hot-plus-mass vector.
+element_to_ind = dict(zip(VALID_ELEMENTS, np.arange(len(VALID_ELEMENTS))))
+element_to_position = dict(zip(VALID_ELEMENTS, ELEMENT_VECTORS))
+element_to_position_mass = dict(zip(VALID_ELEMENTS, ELEMENT_VECTORS_MASS))
+ION_LST = [
+    "[M+H]+",
+    "[M+Na]+",
+    "[M+K]+",
+    "[M-H2O+H]+",
+    "[M+H3N+H]+",
+    "[M]+",
+    "[M-H4O2+H]+",
+    "[M-H]-",
+    "[M+H2O+H]+"
+]
+ion_remap = dict(zip(ION_LST, ION_LST))
+ion_remap.update(
+    {
+        "[M+NH4]+": "[M+H3N+H]+",
+        "M+H": "[M+H]+",
+        "M+Na": "[M+Na]+",
+        "M+H-H2O": "[M-H2O+H]+",
+        "M-H2O+H": "[M-H2O+H]+",
+        "M+NH4": "[M+H3N+H]+",
+        "M-2H2O+H": "[M-H4O2+H]+",
+        "[M-2H2O+H]+": "[M-H4O2+H]+",
+        "[M-H]-": "[M-H]-",
+    }
+)
+ion_to_idx = dict(zip(ION_LST, np.arange(len(ION_LST))))
+ion_to_mass = {
+    "[M+H]+": ELEMENT_TO_MASS["H"] - ELECTRON_MASS,
+    "[M+Na]+": ELEMENT_TO_MASS["Na"] - ELECTRON_MASS,
+    "[M+K]+": ELEMENT_TO_MASS["K"] - ELECTRON_MASS,
+    "[M-H2O+H]+": -ELEMENT_TO_MASS["O"] - ELEMENT_TO_MASS["H"] - ELECTRON_MASS,
+    "[M+H3N+H]+": ELEMENT_TO_MASS["N"] + ELEMENT_TO_MASS["H"] * 4 - ELECTRON_MASS,
+    "[M]+": 0 - ELECTRON_MASS,
+    "[M-H4O2+H]+": -ELEMENT_TO_MASS["O"] * 2 - ELEMENT_TO_MASS["H"] * 3 - ELECTRON_MASS,
+    "[M-H]-": ELEMENT_TO_MASS["H"] + ELECTRON_MASS,
+    "[M+H2O+H]+":ELEMENT_TO_MASS["O"] * 2 + ELEMENT_TO_MASS["H"] * 2 - ELECTRON_MASS,
+}
+ion_to_add_vec = {
+    "[M+H]+": element_to_position["H"],
+    "[M+Na]+": element_to_position["Na"],
+    "[M+K]+": element_to_position["K"],
+    "[M-H2O+H]+": -element_to_position["O"] - element_to_position["H"],
+    "[M+H3N+H]+": element_to_position["N"] + element_to_position["H"] * 4,
+    "[M]+": np.zeros_like(element_to_position["H"]),
+    "[M-H4O2+H]+": -element_to_position["O"] * 2 - element_to_position["H"] * 3,
+    "[M+H2O+H]+": element_to_position["O"] + element_to_position["H"] * 2,
+}
+# Instrument name -> coarse instrument type. Being a defaultdict, indexing an
+# unlisted name yields "unknown" (and inserts it into the mapping).
+instrument_to_type = defaultdict(lambda : "unknown")
+instrument_to_type.update({
+    "Thermo Finnigan Velos Orbitrap": "orbitrap",
+    "Thermo Finnigan Elite Orbitrap": "orbitrap",
+    "Orbitrap Fusion Lumos": "orbitrap",
+    "Q-ToF (LCMS)": "qtof",
+    "Unknown (LCMS)": "unknown",
+    "ion trap": "iontrap",
+    "FTICR (LCMS)": "fticr",
+    "Bruker Q-ToF (LCMS)": "qtof",
+    "Orbitrap (LCMS)": "orbitrap",
+})
+# Distinct instrument types in sorted order; the position is the type's index.
+instruments = sorted(list(set(instrument_to_type.values())))
+# NOTE(review): one more than the number of types — presumably the size of
+# an encoding with a spare slot; confirm against callers.
+max_instr_idx = len(instruments) + 1
+instrument_to_idx = dict(zip(instruments, np.arange(len(instruments))))
+# Define rdbe mult
+# Per-element weights consumed by rdbe_filter, which computes
+# 1 + 0.5 * (formula vector . rdbe_mult). Elements not listed keep weight 0.
+rdbe_mult = np.zeros_like(ELEMENT_VECTORS[0])
+els = ["C", "N", "P", "H", "Cl", "Br", "I", "F"]
+weights = [2, 1, 1, -1, -1, -1, -1, -1]
+for k, v in zip(els, weights):
+    rdbe_mult[element_to_ind[k]] = v
+def get_ion_idx(ionization: str) -> int:
+    """map ionization to its index in one hot encoding"""
+    return ion_to_idx[ionization]
+def get_instr_idx(instrument: str) -> int:
+    """map instrument to its index in one hot encoding"""
+    inst = instrument_to_type.get(instrument, "unknown")
+    return instrument_to_idx[inst]
+def has_valid_els(chem_formula: str) -> bool:
+    """has_valid_els"""
+    for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
+        if chem_symbol not in VALID_ELEMENTS:
+            return False
+    return True
+def formula_to_dense(chem_formula: str) -> np.ndarray:
+    """formula_to_dense.
+    Args:
+        chem_formula (str): Input chemical formal
+    Return:
+        np.ndarray of vector
+    """
+    total_onehot = []
+    for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
+        # Convert num to int
+        num = 1 if num == "" else int(num)
+        one_hot = element_to_position[chem_symbol].reshape(1, -1)
+        one_hot_repeats = np.repeat(one_hot, repeats=num, axis=0)
+        total_onehot.append(one_hot_repeats)
+    # Check if null
+    if len(total_onehot) == 0:
+        dense_vec = np.zeros(len(element_to_position))
+    else:
+        dense_vec = np.vstack(total_onehot).sum(0)
+    return dense_vec
+def cross_sum(x, y):
+    """cross_sum."""
+    return (np.expand_dims(x, 0) + np.expand_dims(y, 1)).reshape(-1, y.shape[-1])
+def get_all_subsets_dense(
+    dense_formula: str, element_vectors
+) -> (np.ndarray, np.ndarray):
+    """_summary_
+    Args:
+        dense_formula (str, element_vectors): _description_
+        np (_type_): _description_
+    Returns:
+        _type_: _description_
+    """
+    non_zero = np.argwhere(dense_formula > 0).flatten()
+    vectorized_formula = []
+    for nonzero_ind in non_zero:
+        temp = element_vectors[nonzero_ind] * np.arange(
+            0, dense_formula[nonzero_ind] + 1
+        ).reshape(-1, 1)
+        vectorized_formula.append(temp)
+    zero_vec = np.zeros((1, element_vectors.shape[-1]))
+    cross_prod = reduce(cross_sum, vectorized_formula, zero_vec)
+    cross_prod_inds = rdbe_filter(cross_prod)
+    cross_prod = cross_prod[cross_prod_inds]
+    all_masses = cross_prod.dot(VALID_MONO_MASSES)
+    return cross_prod, all_masses
+def get_all_subsets(chem_formula: str):
+    dense_formula = formula_to_dense(chem_formula)
+    return get_all_subsets_dense(dense_formula, element_vectors=ELEMENT_VECTORS)
+def rdbe_filter(cross_prod):
+    """rdbe_filter.
+    Args:
+        cross_prod:
+    """
+    rdbe_total = 1 + 0.5 * cross_prod.dot(rdbe_mult)
+    filter_inds = np.argwhere(rdbe_total >= 0).flatten()
+    return filter_inds
+def formula_to_dense(chem_formula: str) -> np.ndarray:
+    """formula_to_dense.
+    Args:
+        chem_formula (str): Input chemical formal
+    Return:
+        np.ndarray of vector
+    """
+    total_onehot = []
+    for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
+        # Convert num to int
+        num = 1 if num == "" else int(num)
+        one_hot = element_to_position[chem_symbol].reshape(1, -1)
+        one_hot_repeats = np.repeat(one_hot, repeats=num, axis=0)
+        total_onehot.append(one_hot_repeats)
+    # Check if null
+    if len(total_onehot) == 0:
+        dense_vec = np.zeros(len(element_to_position))
+    else:
+        dense_vec = np.vstack(total_onehot).sum(0)
+    return dense_vec
+def formula_to_dense_mass(chem_formula: str) -> np.ndarray:
+    """formula_to_dense_mass.
+    Return formula including full compound mass
+    Args:
+        chem_formula (str): Input chemical formal
+    Return:
+        np.ndarray of vector
+    """
+    total_onehot = []
+    for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
+        # Convert num to int
+        num = 1 if num == "" else int(num)
+        one_hot = element_to_position_mass[chem_symbol].reshape(1, -1)
+        one_hot_repeats = np.repeat(one_hot, repeats=num, axis=0)
+        total_onehot.append(one_hot_repeats)
+    # Check if null
+    if len(total_onehot) == 0:
+        dense_vec = np.zeros(len(element_to_position_mass["H"]))
+    else:
+        dense_vec = np.vstack(total_onehot).sum(0)
+    return dense_vec
+def formula_to_dense_mass_norm(chem_formula: str) -> np.ndarray:
+    """formula_to_dense_mass_norm.
+    Return formula including full compound mass and normalized
+    Args:
+        chem_formula (str): Input chemical formal
+    Return:
+        np.ndarray of vector
+    """
+    dense_vec = formula_to_dense_mass(chem_formula)
+    dense_vec = dense_vec / NORM_VEC_MASS
+    return dense_vec
+def formula_mass(chem_formula: str) -> float:
+    """get formula mass"""
+    mass = 0
+    for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
+        # Convert num to int
+        num = 1 if num == "" else int(num)
+        mass += ELEMENT_TO_MASS[chem_symbol] * num
+    return mass
+def electron_correct(mass: float) -> float:
+    """subtract the rest mass of an electron"""
+    return mass - ELECTRON_MASS
+def formula_difference(formula_1, formula_2):
+    """formula_1 - formula_2"""
+    form_1 = {
+        chem_symbol: (int(num) if num != "" else 1)
+        for chem_symbol, num in re.findall(CHEM_FORMULA_SIZE, formula_1)
+    }
+    form_2 = {
+        chem_symbol: (int(num) if num != "" else 1)
+        for chem_symbol, num in re.findall(CHEM_FORMULA_SIZE, formula_2)
+    }
+    for k, v in form_2.items():
+        form_1[k] = form_1[k] - form_2[k]
+    out_formula = "".join([f"{k}{v}" for k, v in form_1.items() if v > 0])
+    return out_formula
+def get_mol_from_structure_string(structure_string, structure_type):
+    if structure_type == "InChI":
+        mol = Chem.MolFromInchi(structure_string)
+    else:
+        mol = Chem.MolFromSmiles(structure_string)
+    return mol
+def vec_to_formula(form_vec):
+    """vec_to_formula."""
+    build_str = ""
+    for i in np.argwhere(form_vec > 0).flatten():
+        el = VALID_ELEMENTS[i]
+        ct = int(form_vec[i])
+        new_item = f"{el}{ct}" if ct > 1 else f"{el}"
+        build_str = build_str + new_item
+    return build_str
+def standardize_form(i):
+    """standardize_form."""
+    return vec_to_formula(formula_to_dense(i))
+def standardize_adduct(adduct):
+    """standardize_adduct."""
+    adduct = adduct.replace(" ", "")
+    adduct = ion_remap.get(adduct, adduct)
+    if adduct not in ION_LST:
+        raise ValueError(f"Adduct {adduct} not in ION_LST")
+    return adduct
+def calc_structure_string_type(structure_string):
+    """calc_structure_string_type.
+    Args:
+        structure_string:
+    """
+    structure_type = None
+    if pd.isna(structure_string):
+        structure_type = "empty"
+    elif structure_string.startswith("InChI="):
+        structure_type = "InChI"
+    elif Chem.MolFromSmiles(structure_string) is not None:
+        structure_type = "Smiles"
+    return structure_type
+def uncharged_formula(mol, mol_type="mol") -> str:
+    """Compute uncharged formula"""
+    if mol_type == "mol":
+        chem_formula = CalcMolFormula(mol)
+    elif mol_type == "smiles":
+        mol = Chem.MolFromSmiles(mol)
+        if mol is None:
+            return None
+        chem_formula = CalcMolFormula(mol)
+    else:
+        raise ValueError()
+    return re.findall(r"^([^\+,^\-]*)", chem_formula)[0]
+def form_from_smi(smi: str) -> str:
+    """form_from_smi.
+    Args:
+        smi (str): smi
+    Return:
+        str
+    """
+    mol = Chem.MolFromSmiles(smi)
+    if mol is None:
+        return ""
+    else:
+        return CalcMolFormula(mol)
+def inchikey_from_smiles(smi: str) -> str:
+    """inchikey_from_smiles.
+    Args:
+        smi (str): smi
+    Returns:
+        str:
+    """
+    mol = Chem.MolFromSmiles(smi)
+    if mol is None:
+        return ""
+    else:
+        return Chem.MolToInchiKey(mol)
+def contains_metals(formula: str) -> bool:
+    """returns true if formula contains metals"""
+    METAL_RE = "(Fe|Co|Zn|Rh|Pt|Li)"
+    return len(re.findall(METAL_RE, formula)) > 0
+class SmilesStandardizer(object):
+    """Standardize smiles"""
+    def __init__(self, *args, **kwargs):
+        self.fragment_standardizer = rdMolStandardize.LargestFragmentChooser()
+        self.charge_standardizer = rdMolStandardize.Uncharger()
+    def standardize_smiles(self, smi):
+        """Standardize smiles string"""
+        mol = Chem.MolFromSmiles(smi)
+        out_smi = self.standardize_mol(mol)
+        return out_smi
+    def standardize_mol(self, mol) -> str:
+        """Standardize smiles string"""
+        mol = self.fragment_standardizer.choose(mol)
+        mol = self.charge_standardizer.uncharge(mol)
+        # Round trip to and from inchi to tautomer correct
+        # Also standardize tautomer in the middle
+        output_smi = Chem.MolToSmiles(mol, isomericSmiles=False)
+        return output_smi
+def mass_from_smi(smi: str) -> float:
+    """mass_from_smi.
+    Args:
+        smi (str): smi
+    Return:
+        str
+    """
+    mol = Chem.MolFromSmiles(smi)
+    if mol is None:
+        return 0
+    else:
+        return ExactMolWt(mol)
+def min_formal_from_smi(smi: str):
+    mol = Chem.MolFromSmiles(smi)
+    if mol is None:
+        return 0
+    else:
+        formal = np.array([j.GetFormalCharge() for j in mol.GetAtoms()])
+        return formal.min()
+def max_formal_from_smi(smi: str):
+    mol = Chem.MolFromSmiles(smi)
+    if mol is None:
+        return 0
+    else:
+        formal = np.array([j.GetFormalCharge() for j in mol.GetAtoms()])
+        return formal.max()
+def atoms_from_smi(smi: str) -> int:
+    """atoms_from_smi.
+    Args:
+        smi (str): smi
+    Return:
+        int
+    """
+    mol = Chem.MolFromSmiles(smi)
+    if mol is None:
+        return 0
+    else:
+        return mol.GetNumAtoms()
+def has_valid_els(chem_formula: str) -> bool:
+    """has_valid_els"""
+    for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
+        if chem_symbol not in VALID_ELEMENTS:
+            return False
+    return True
+def add_ion(form: str, ion: str):
+    """add_ion.
+    Args:
+        form (str): form
+        ion (str): ion
+    """
+    ion_vec = ion_to_add_vec[ion]
+    form_vec = formula_to_dense(form)
+    return vec_to_formula(form_vec + ion_vec)
+def achiral_smi(smi: str) -> str:
+    """achiral_smi.
+    Return:
+        isomeric smiles
+    """
+    try:
+        mol = Chem.MolFromSmiles(smi)
+        if mol is not None:
+            smi = Chem.MolToSmiles(mol, isomericSmiles=False)
+            return smi
+        else:
+            return ""
+    except:
+        return ""
+def npclassifer_query(inputs):
+    """npclassifier_query.
+    Args:
+        input: Tuple of name, molecule
+    Return:
+        Dict of name to molecule
+    """
+    import requests
+    spec = inputs[0]
+    endpoint = "https://npclassifier.ucsd.edu/classify"
+    req_data = {"smiles": inputs[1]}
+    out = requests.get(f"{endpoint}", data=req_data)
+    out.raise_for_status()
+    out_json = out.json()
+    return {spec: out_json}
+def clipped_ppm(mass_diff: np.ndarray, parentmass: np.ndarray) -> np.ndarray:
+    """clipped_ppm.
+    Args:
+        mass_diff (np.ndarray): mass_diff
+        parentmass (np.ndarray): parentmass
+    Returns:
+        np.ndarray:
+    """
+    parentmass_copy = parentmass * 1
+    parentmass_copy[parentmass < 200] = 200
+    ppm = mass_diff / parentmass_copy * 1e6
+    return ppm
+def clipped_ppm_single(
+    cls_mass_diff: float,
+    parentmass: float,
+):
+    """clipped_ppm_single.
+    Args:
+        cls_mass_diff (float): cls_mass_diff
+        parentmass (float): parentmass
+    """
+    div_factor = 200 if parentmass < 200 else parentmass
+    cls_ppm = cls_mass_diff / div_factor * 1e6
+    return cls_ppm

mvp/subformula_assign/utils/parallel_utils.py ADDED Viewed

	@@ -0,0 +1,84 @@

+"""parallel_utils.py"""
+import logging
+from multiprocess.context import TimeoutError
+from pathos import multiprocessing as mp
+from tqdm import tqdm
+def simple_parallel(
+    input_list, function, max_cpu=16, timeout=4000, max_retries=3, use_ray: bool = False
+):
+    """Simple parallelization.
+    Use map async and retries in case we get odd stalling behavior.
+    input_list: Input list to op on
+    function: Fn to apply
+    max_cpu: Num cpus
+    timeout: Length of timeout
+    max_retries: Num times to retry this
+    use_ray
+    """
+    # If ray is required. Set to false.
+    if use_ray and False:
+        import ray
+        @ray.remote
+        def ray_func(x):
+            return function(x)
+        return ray.get([ray_func.remote(x) for x in input_list])
+    from multiprocess.context import TimeoutError
+    from pathos import multiprocessing as mp
+    cpus = min(mp.cpu_count(), max_cpu)
+    pool = mp.Pool(processes=cpus)
+    results = pool.map(function, input_list)
+    pool.close()
+    pool.join()
+    return results
+def chunked_parallel(
+    input_list, function, chunks=100, max_cpu=16, timeout=4000, max_retries=3
+):
+    """chunked_parallel.
+    Args:
+        input_list : list of objects to apply function
+        function : Callable with 1 input and returning a single value
+        chunks: number of hcunks
+        max_cpu: Max num cpus
+        timeout: Length of timeout
+        max_retries: Num times to retry this
+    """
+    # Adding it here fixes somessetting disrupted elsewhere
+    def batch_func(list_inputs):
+        outputs = []
+        for i in list_inputs:
+            outputs.append(function(i))
+        return outputs
+    list_len = len(input_list)
+    num_chunks = min(list_len, chunks)
+    step_size = len(input_list) // num_chunks + 1
+    chunked_list = [
+        input_list[i : i + step_size] for i in range(0, len(input_list), step_size)
+    ]
+    list_outputs = simple_parallel(
+        chunked_list,
+        batch_func,
+        max_cpu=max_cpu,
+        timeout=timeout,
+        max_retries=max_retries,
+    )
+    # Unroll
+    full_output = [j for i in list_outputs for j in i]
+    return full_output

mvp/subformula_assign/utils/parse_utils.py ADDED Viewed

	@@ -0,0 +1,295 @@

+""" parse_utils.py """
+from pathlib import Path
+from typing import Tuple, List, Optional
+from itertools import groupby
+from tqdm import tqdm
+import numpy as np
+import pandas as pd
+def parse_spectra(spectra_file: str) -> Tuple[dict, List[Tuple[str, np.ndarray]]]:
+    """parse_spectra.
+    Parses spectra in the SIRIUS format and returns
+    Args:
+        spectra_file (str): Name of spectra file to parse
+    Return:
+        Tuple[dict, List[Tuple[str, np.ndarray]]]: metadata and list of spectra
+            tuples containing name and array
+    """
+    lines = [i.strip() for i in open(spectra_file, "r").readlines()]
+    group_num = 0
+    metadata = {}
+    spectras = []
+    my_iterator = groupby(
+        lines, lambda line: line.startswith(">") or line.startswith("#")
+    )
+    for index, (start_line, lines) in enumerate(my_iterator):
+        group_lines = list(lines)
+        subject_lines = list(next(my_iterator)[1])
+        # Get spectra
+        if group_num > 0:
+            spectra_header = group_lines[0].split(">")[1]
+            peak_data = [
+                [float(x) for x in peak.split()[:2]]
+                for peak in subject_lines
+                if peak.strip()
+            ]
+            # Check if spectra is empty
+            if len(peak_data):
+                peak_data = np.vstack(peak_data)
+                # Add new tuple
+                spectras.append((spectra_header, peak_data))
+        # Get meta data
+        else:
+            entries = {}
+            for i in group_lines:
+                if " " not in i:
+                    continue
+                elif i.startswith("#INSTRUMENT TYPE"):
+                    key = "#INSTRUMENT TYPE"
+                    val = i.split(key)[1].strip()
+                    entries[key[1:]] = val
+                else:
+                    start, end = i.split(" ", 1)
+                    start = start[1:]
+                    while start in entries:
+                        start = f"{start}'"
+                    entries[start] = end
+            metadata.update(entries)
+        group_num += 1
+    metadata["_FILE_PATH"] = spectra_file
+    metadata["_FILE"] = Path(spectra_file).stem
+    return metadata, spectras
+def spec_to_ms_str(
+    spec: List[Tuple[str, np.ndarray]], essential_keys: dict, comments: dict = {}
+) -> str:
+    """spec_to_ms_str.
+    Turn spec ars and info dicts into str for output file
+    Args:
+        spec (List[Tuple[str, np.ndarray]]): spec
+        essential_keys (dict): essential_keys
+        comments (dict): comments
+    Returns:
+        str:
+    """
+    def pair_rows(rows):
+        return "\n".join([f"{i} {j}" for i, j in rows])
+    header = "\n".join(f">{k} {v}" for k, v in essential_keys.items())
+    comments = "\n".join(f"#{k} {v}" for k, v in essential_keys.items())
+    spec_strs = [f">{name}\n{pair_rows(ar)}" for name, ar in spec]
+    spec_str = "\n\n".join(spec_strs)
+    output = f"{header}\n{comments}\n\n{spec_str}"
+    return output
+def build_mgf_str(
+    meta_spec_list: List[Tuple[dict, List[Tuple[str, np.ndarray]]]],
+    merge_charges=True,
+    parent_mass_keys=["PEPMASS", "parentmass", "PRECURSOR_MZ"],
+) -> str:
+    """build_mgf_str.
+    Args:
+        meta_spec_list (List[Tuple[dict, List[Tuple[str, np.ndarray]]]]): meta_spec_list
+    Returns:
+        str:
+    """
+    entries = []
+    for meta, spec in tqdm(meta_spec_list):
+        str_rows = ["BEGIN IONS"]
+        # Try to add precusor mass
+        for i in parent_mass_keys:
+            if i in meta:
+                pep_mass = float(meta.get(i, -100))
+                str_rows.append(f"PEPMASS={pep_mass}")
+                break
+        for k, v in meta.items():
+            str_rows.append(f"{k.upper().replace(' ', '_')}={v}")
+        if merge_charges:
+            spec_ar = np.vstack([i[1] for i in spec])
+            spec_ar = np.vstack([i for i in sorted(spec_ar, key=lambda x: x[0])])
+        else:
+            raise NotImplementedError()
+        str_rows.extend([f"{i} {j}" for i, j in spec_ar])
+        str_rows.append("END IONS")
+        str_out = "\n".join(str_rows)
+        entries.append(str_out)
+    full_out = "\n\n".join(entries)
+    return full_out
+def parse_spectra_msp(
+    mgf_file: str, max_num: Optional[int] = None
+) -> List[Tuple[dict, List[Tuple[str, np.ndarray]]]]:
+    """parse_spectr_msp.
+    Parses spectra in the MSP file format
+    Args:
+        mgf_file (str) : str
+        max_num (Optional[int]): If set, only parse this many
+    Return:
+        List[Tuple[dict, List[Tuple[str, np.ndarray]]]]: metadata and list of spectra
+            tuples containing name and array
+    """
+    key = lambda x: x.strip().startswith("PEPMASS")
+    parsed_spectra = []
+    with open(mgf_file, "r", encoding="utf-8") as fp:
+        for (is_header, group) in tqdm(groupby(fp, key)):
+            if is_header:
+                continue
+            meta = dict()
+            spectra = []
+            # Note: Sometimes we have multiple scans
+            # This mgf has them collapsed
+            cur_spectra_name = "spec"
+            cur_spectra = []
+            group = list(group)
+            for line in group:
+                line = line.strip()
+                if not line:
+                    pass
+                elif ":" in line:
+                    k, v = [i.strip() for i in line.split(":", 1)]
+                    meta[k] = v
+                else:
+                    mz, intens = line.split()
+                    cur_spectra.append((float(mz), float(intens)))
+            if len(cur_spectra) > 0:
+                cur_spectra = np.vstack(cur_spectra)
+                spectra.append((cur_spectra_name, cur_spectra))
+                parsed_spectra.append((meta, spectra))
+            else:
+                pass
+                # print("no spectra found for group: ", "".join(group))
+            if max_num is not None and len(parsed_spectra) > max_num:
+                # print("Breaking")
+                break
+        return parsed_spectra
+def parse_spectra_mgf(
+    mgf_file: str, max_num: Optional[int] = None
+) -> List[Tuple[dict, List[Tuple[str, np.ndarray]]]]:
+    """parse_spectr_mgf.
+    Parses spectra in the MGF file formate, with
+    Args:
+        mgf_file (str) : str
+        max_num (Optional[int]): If set, only parse this many
+    Return:
+        List[Tuple[dict, List[Tuple[str, np.ndarray]]]]: metadata and list of spectra
+            tuples containing name and array
+    """
+    key = lambda x: x.strip() == "BEGIN IONS"
+    parsed_spectra = []
+    with open(mgf_file, "r") as fp:
+        for (is_header, group) in tqdm(groupby(fp, key)):
+            if is_header:
+                continue
+            meta = dict()
+            spectra = []
+            # Note: Sometimes we have multiple scans
+            # This mgf has them collapsed
+            cur_spectra_name = "spec"
+            cur_spectra = []
+            group = list(group)
+            for line in group:
+                line = line.strip()
+                if not line:
+                    pass
+                elif line == "END IONS" or line == "BEGIN IONS":
+                    pass
+                elif "=" in line:
+                    k, v = [i.strip() for i in line.split("=", 1)]
+                    meta[k] = v
+                else:
+                    mz, intens = line.split()
+                    cur_spectra.append((float(mz), float(intens)))
+            if len(cur_spectra) > 0:
+                cur_spectra = np.vstack(cur_spectra)
+                spectra.append((cur_spectra_name, cur_spectra))
+                parsed_spectra.append((meta, spectra))
+            else:
+                pass
+                # print("no spectra found for group: ", "".join(group))
+            if max_num is not None and len(parsed_spectra) > max_num:
+                # print("Breaking")
+                break
+        return parsed_spectra
+def parse_tsv_spectra(spectra_file: str) -> List[Tuple[str, np.ndarray]]:
+    """parse_tsv_spectra.
+    Parses spectra returned from sirius fragmentation tree
+    Args:
+        spectra_file (str): Name of spectra tsv file to parse
+    Return:
+        List[Tuple[str, np.ndarray]]]: list of spectra
+            tuples containing name and array. This is used to maintain
+            consistency with the parse_spectra output
+    """
+    output_spec = []
+    with open(spectra_file, "r") as fp:
+        for index, line in enumerate(fp):
+            if index == 0:
+                continue
+            line = line.strip().split("\t")
+            intensity = float(line[1])
+            exact_mass = float(line[3])
+            output_spec.append([exact_mass, intensity])
+    output_spec = np.array(output_spec)
+    return_obj = [("sirius_spec", output_spec)]
+    return return_obj
+# YZC parse msgym-like formatted data
+def parse_spectra_msgym(df):
+    parsed_spectra = []
+    for _, row in df.iterrows():
+        mzs = [float(m) for m in row['mzs'].split(',')]
+        intensities = [float(i) for i in row['intensities'].split(',')]
+        cur_spectra = [(m, i) for m, i in zip(mzs, intensities)]
+        cur_spectra = np.vstack(cur_spectra)
+        cur_spectra_name = row['spec']
+        meta = {'ID': cur_spectra_name,
+        'parentmass': row['parent_mass']}
+        parsed_spectra.append((meta, [(cur_spectra_name, cur_spectra)]))
+    return parsed_spectra

mvp/subformula_assign/utils/spectra_utils.py ADDED Viewed

	@@ -0,0 +1,326 @@

+""" spectra_utils.py"""
+import logging
+import numpy as np
+from typing import List
+from .chem_utils import (
+    vec_to_formula,
+    get_all_subsets,
+    ion_to_mass,
+    ION_LST,
+    clipped_ppm,
+)
+def bin_spectra(
+    spectras: List[np.ndarray], num_bins: int = 2000, upper_limit: int = 1000
+) -> np.ndarray:
+    """bin_spectra.
+    Args:
+        spectras (List[np.ndarray]): Input list of spectra tuples
+            [(header, spec array)]
+        num_bins (int): Number of discrete bins from [0, upper_limit)
+        upper_limit (int): Max m/z to consider featurizing
+    Return:
+        np.ndarray of shape [channels, num_bins]
+    """
+    bins = np.linspace(0, upper_limit, num=num_bins)
+    binned_spec = np.zeros((len(spectras), len(bins)))
+    for spec_index, spec in enumerate(spectras):
+        # Convert to digitized spectra
+        digitized_mz = np.digitize(spec[:, 0], bins=bins)
+        # Remove all spectral peaks out of range
+        in_range = digitized_mz < len(bins)
+        digitized_mz, spec = digitized_mz[in_range], spec[in_range, :]
+        # Add the current peaks to the spectra
+        # Use a loop rather than vectorize because certain bins have conflicts
+        # based upon resolution
+        for bin_index, spec_val in zip(digitized_mz, spec[:, 1]):
+            binned_spec[spec_index, bin_index] += spec_val
+    return binned_spec
+def merge_norm_spectra(spec_tuples, precision=4) -> np.ndarray:
+    """merge_norm_spectra.
+    Take a list of mz, inten tuple arrays and merge them by 4 digit precision
+    Note this uses _max_ merging
+    """
+    mz_to_inten_pair = {}
+    for i in spec_tuples:
+        for tup in i:
+            mz, inten = tup
+            mz_ind = np.round(mz, precision)
+            cur_pair = mz_to_inten_pair.get(mz_ind)
+            if cur_pair is None:
+                mz_to_inten_pair[mz_ind] = tup
+            elif inten > cur_pair[1]:
+                mz_to_inten_pair[mz_ind] = (mz_ind, inten)
+            else:
+                pass
+    merged_spec = np.vstack([v for k, v in mz_to_inten_pair.items()])
+    merged_spec[:, 1] = merged_spec[:, 1] / merged_spec[:, 1].max()
+    return merged_spec
+def norm_spectrum(binned_spec: np.ndarray) -> np.ndarray:
+    """norm_spectrum.
+    Normalizes each spectral channel to have norm 1
+    This change is made in place
+    Args:
+        binned_spec (np.ndarray) : Vector of spectras
+    Return:
+        np.ndarray where each channel has max(1)
+    """
+    spec_maxes = binned_spec.max(1)
+    non_zero_max = spec_maxes > 0
+    spec_maxes = spec_maxes[non_zero_max]
+    binned_spec[non_zero_max] = binned_spec[non_zero_max] / spec_maxes.reshape(-1, 1)
+    return binned_spec
+def process_spec_file(meta, tuples, precision=4, max_inten=0.001, max_peaks=60):
+    """process_spec_file."""
+    if "parentmass" in meta:
+        parentmass = meta.get("parentmass", None)
+    elif "PARENTMASS" in meta:
+        parentmass = meta.get("PARENTMASS", None)
+    elif "PEPMASS" in meta:
+        parentmass = meta.get("PEPMASS", None)
+    else:
+        logging.debug(f"missing parentmass for spec")
+        parentmass = 1000000
+    parentmass = float(parentmass)
+    # First norm spectra
+    fused_tuples = [x for _, x in tuples if x.size > 0]
+    if len(fused_tuples) == 0:
+        return
+    mz_to_inten_pair = {}
+    new_tuples = []
+    for i in fused_tuples:
+        for tup in i:
+            mz, inten = tup
+            mz_ind = np.round(mz, precision)
+            cur_pair = mz_to_inten_pair.get(mz_ind)
+            if cur_pair is None:
+                mz_to_inten_pair[mz_ind] = tup
+                new_tuples.append(tup)
+            elif inten > cur_pair[1]:
+                cur_pair[1] = inten
+            else:
+                pass
+    merged_spec = np.vstack(new_tuples)
+    merged_spec = merged_spec[merged_spec[:, 0] <= (parentmass + 1)] # could end up removing all peaks?
+    try:
+        merged_spec[:, 1] = merged_spec[:, 1] / merged_spec[:, 1].max()
+    except:
+        return
+    # Sqrt intensities here
+    merged_spec[:, 1] = np.sqrt(merged_spec[:, 1])
+    merged_spec = max_inten_spec(
+        merged_spec, max_num_inten=max_peaks, inten_thresh=max_inten
+    )
+    return merged_spec
+def max_inten_spec(spec, max_num_inten: int = 60, inten_thresh: float = 0):
+    """max_inten_spec.
+    Args:
+        spec: 2D spectra array
+        max_num_inten: Max number of peaks
+        inten_thresh: Min intensity to alloow in returned peak
+    Return:
+        Spec filtered down
+    """
+    spec_masses, spec_intens = spec[:, 0], spec[:, 1]
+    # Make sure to only take max of each formula
+    # Sort by intensity and select top subpeaks
+    new_sort_order = np.argsort(spec_intens)[::-1]
+    if max_num_inten is not None:
+        new_sort_order = new_sort_order[:max_num_inten]
+    spec_masses = spec_masses[new_sort_order]
+    spec_intens = spec_intens[new_sort_order]
+    spec_mask = spec_intens > inten_thresh
+    spec_masses = spec_masses[spec_mask]
+    spec_intens = spec_intens[spec_mask]
+    spec = np.vstack([spec_masses, spec_intens]).transpose(1, 0)
+    return spec
+def max_thresh_spec(spec: np.ndarray, max_peaks=100, inten_thresh=0.003):
+    """max_thresh_spec.
+    Args:
+        spec (np.ndarray): spec
+        max_peaks: Max num peaks to keep
+        inten_thresh: Min inten to keep
+    """
+    spec_masses, spec_intens = spec[:, 0], spec[:, 1]
+    # Make sure to only take max of each formula
+    # Sort by intensity and select top subpeaks
+    new_sort_order = np.argsort(spec_intens)[::-1]
+    new_sort_order = new_sort_order[:max_peaks]
+    spec_masses = spec_masses[new_sort_order]
+    spec_intens = spec_intens[new_sort_order]
+    spec_mask = spec_intens > inten_thresh
+    spec_masses = spec_masses[spec_mask]
+    spec_intens = spec_intens[spec_mask]
+    out_ar = np.vstack([spec_masses, spec_intens]).transpose(1, 0)
+    return out_ar
+def assign_subforms(form, spec, ion_type, mass_diff_thresh=15):
+    """_summary_
+    Args:
+        form (_type_): _description_
+        spec (_type_): _description_
+        ion_type (_type_): _description_
+        mass_diff_thresh (int, optional): _description_. Defaults to 15.
+    Returns:
+        _type_: _description_
+    """
+    try:
+        cross_prod, masses = get_all_subsets(form)
+        spec_masses, spec_intens = spec[:, 0], spec[:, 1]
+        ion_masses = ion_to_mass[ion_type]
+        masses_with_ion = masses + ion_masses
+        ion_types = np.array([ion_type] * len(masses_with_ion))
+        mass_diffs = np.abs(spec_masses[:, None] - masses_with_ion[None, :])
+        formula_inds = mass_diffs.argmin(-1)
+        min_mass_diff = mass_diffs[np.arange(len(mass_diffs)), formula_inds]
+        rel_mass_diff = clipped_ppm(min_mass_diff, spec_masses)
+        # Filter by mass diff threshold (ppm)
+        valid_mask = rel_mass_diff < mass_diff_thresh
+        spec_masses = spec_masses[valid_mask]
+        spec_intens = spec_intens[valid_mask]
+        min_mass_diff = min_mass_diff[valid_mask]
+        rel_mass_diff = rel_mass_diff[valid_mask]
+        formula_inds = formula_inds[valid_mask]
+        formulas = np.array([vec_to_formula(j) for j in cross_prod[formula_inds]])
+        formula_masses = masses_with_ion[formula_inds]
+        ion_types = ion_types[formula_inds]
+        # Build mask for uniqueness on formula and ionization
+        # note that ionization are all the same for one subformula assignment
+        # hence we only need to consider the uniqueness of the formula
+        formula_idx_dict = {}
+        uniq_mask = []
+        for idx, formula in enumerate(formulas):
+            uniq_mask.append(formula not in formula_idx_dict)
+            gather_ind = formula_idx_dict.get(formula, None)
+            if gather_ind is None:
+                continue
+            spec_intens[gather_ind] += spec_intens[idx]
+            formula_idx_dict[formula] = idx
+        spec_masses = spec_masses[uniq_mask]
+        spec_intens = spec_intens[uniq_mask]
+        min_mass_diff = min_mass_diff[uniq_mask]
+        rel_mass_diff = rel_mass_diff[uniq_mask]
+        formula_masses = formula_masses[uniq_mask]
+        formulas = formulas[uniq_mask]
+        ion_types = ion_types[uniq_mask]
+        # To calculate explained intensity, preserve the original normalized
+        # intensity
+        if spec_intens.size == 0:
+            output_tbl = None
+            print(f"No matches for {form}")
+        else:
+            output_tbl = {
+                "mz": list(spec_masses),
+                "ms2_inten": list(spec_intens),
+                "mono_mass": list(formula_masses),
+                "abs_mass_diff": list(min_mass_diff),
+                "mass_diff": list(rel_mass_diff),
+                "formula": list(formulas),
+                "ions": list(ion_types),
+            }
+    except:
+        output_tbl = None
+        print(f"failed to process formula {form}")
+        pass
+    output_dict = {
+        "cand_form": form,
+        "cand_ion": ion_type,
+        "output_tbl": output_tbl,
+    }
+    return output_dict
+def get_output_dict(
+    spec_name: str,
+    spec: np.ndarray,
+    form: str,
+    mass_diff_type: str,
+    mass_diff_thresh: float,
+    ion_type: str,
+) -> dict:
+    """_summary_
+    This function attemps to take an array of mass intensity values and assign
+    formula subsets to subpeaks
+    Args:
+        spec_name (str): _description_
+        spec (np.ndarray): _description_
+        form (str): _description_
+        mass_diff_type (str): _description_
+        mass_diff_thresh (float): _description_
+        ion_type (str): _description_
+    Returns:
+        dict: _description_
+    """
+    assert mass_diff_type == "ppm"
+    # This is the case for some erroneous MS2 files for which proc_spec_file return None
+    # All the MS2 subpeaks in these erroneous MS2 files has mz larger than parentmass
+    output_dict = {"cand_form": form, "cand_ion": ion_type, "output_tbl": None}
+    if spec is not None and ion_type in ION_LST:
+        output_dict = assign_subforms(
+            form, spec, ion_type, mass_diff_thresh=mass_diff_thresh
+        )
+    return output_dict

mvp/utils/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (344 Bytes). View file

mvp/utils/__pycache__/data.cpython-311.pyc ADDED Viewed

Binary file (12.3 kB). View file

mvp/utils/__pycache__/debug.cpython-311.pyc ADDED Viewed

Binary file (1.06 kB). View file

mvp/utils/__pycache__/eval.cpython-311.pyc ADDED Viewed

Binary file (13.8 kB). View file

mvp/utils/__pycache__/general.cpython-311.pyc ADDED Viewed

Binary file (3.77 kB). View file

mvp/utils/__pycache__/loss.cpython-311.pyc ADDED Viewed

Binary file (4.76 kB). View file

mvp/utils/__pycache__/models.cpython-311.pyc ADDED Viewed

Binary file (2.58 kB). View file

mvp/utils/__pycache__/preprocessing.cpython-311.pyc ADDED Viewed

Binary file (12.2 kB). View file