yzhouchen001 committed on
Commit c65d76d · 1 Parent(s): b6aab14

main scripts

Browse files
Files changed (35)
  1. mvp/__pycache__/__init__.cpython-311.pyc +0 -0
  2. mvp/__pycache__/definitions.cpython-311.pyc +0 -0
  3. mvp/data/__pycache__/__init__.cpython-311.pyc +0 -0
  4. mvp/data/__pycache__/data_module.cpython-311.pyc +0 -0
  5. mvp/data/__pycache__/datasets.cpython-311.pyc +0 -0
  6. mvp/data/__pycache__/transforms.cpython-311.pyc +0 -0
  7. mvp/models/__pycache__/__init__.cpython-311.pyc +0 -0
  8. mvp/models/__pycache__/base.cpython-311.pyc +0 -0
  9. mvp/models/__pycache__/contrastive.cpython-311.pyc +0 -0
  10. mvp/models/__pycache__/contrastive_interaction.cpython-311.pyc +0 -0
  11. mvp/models/__pycache__/encoders.cpython-311.pyc +0 -0
  12. mvp/models/__pycache__/mol_encoder.cpython-311.pyc +0 -0
  13. mvp/models/__pycache__/spec_encoder.cpython-311.pyc +0 -0
  14. mvp/params_tmp.yaml +0 -0
  15. mvp/subformula_assign/__pycache__/assign_subformulae.cpython-311.pyc +0 -0
  16. mvp/subformula_assign/assign_subformulae.py +216 -0
  17. mvp/subformula_assign/run.sh +14 -0
  18. mvp/subformula_assign/utils/__init__.py +5 -0
  19. mvp/subformula_assign/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  20. mvp/subformula_assign/utils/__pycache__/chem_utils.cpython-311.pyc +0 -0
  21. mvp/subformula_assign/utils/__pycache__/parallel_utils.cpython-311.pyc +0 -0
  22. mvp/subformula_assign/utils/__pycache__/parse_utils.cpython-311.pyc +0 -0
  23. mvp/subformula_assign/utils/__pycache__/spectra_utils.cpython-311.pyc +0 -0
  24. mvp/subformula_assign/utils/chem_utils.py +616 -0
  25. mvp/subformula_assign/utils/parallel_utils.py +84 -0
  26. mvp/subformula_assign/utils/parse_utils.py +295 -0
  27. mvp/subformula_assign/utils/spectra_utils.py +326 -0
  28. mvp/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  29. mvp/utils/__pycache__/data.cpython-311.pyc +0 -0
  30. mvp/utils/__pycache__/debug.cpython-311.pyc +0 -0
  31. mvp/utils/__pycache__/eval.cpython-311.pyc +0 -0
  32. mvp/utils/__pycache__/general.cpython-311.pyc +0 -0
  33. mvp/utils/__pycache__/loss.cpython-311.pyc +0 -0
  34. mvp/utils/__pycache__/models.cpython-311.pyc +0 -0
  35. mvp/utils/__pycache__/preprocessing.cpython-311.pyc +0 -0
mvp/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (155 Bytes). View file
 
mvp/__pycache__/definitions.cpython-311.pyc ADDED
Binary file (956 Bytes). View file
 
mvp/data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (211 Bytes). View file
 
mvp/data/__pycache__/data_module.cpython-311.pyc ADDED
Binary file (4.34 kB). View file
 
mvp/data/__pycache__/datasets.cpython-311.pyc ADDED
Binary file (24 kB). View file
 
mvp/data/__pycache__/transforms.cpython-311.pyc ADDED
Binary file (12 kB). View file
 
mvp/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (347 Bytes). View file
 
mvp/models/__pycache__/base.cpython-311.pyc ADDED
Binary file (9.6 kB). View file
 
mvp/models/__pycache__/contrastive.cpython-311.pyc ADDED
Binary file (25.1 kB). View file
 
mvp/models/__pycache__/contrastive_interaction.cpython-311.pyc ADDED
Binary file (1.46 kB). View file
 
mvp/models/__pycache__/encoders.cpython-311.pyc ADDED
Binary file (2.32 kB). View file
 
mvp/models/__pycache__/mol_encoder.cpython-311.pyc ADDED
Binary file (3.57 kB). View file
 
mvp/models/__pycache__/spec_encoder.cpython-311.pyc ADDED
Binary file (6.02 kB). View file
 
mvp/params_tmp.yaml ADDED
File without changes
mvp/subformula_assign/__pycache__/assign_subformulae.cpython-311.pyc ADDED
Binary file (10.3 kB). View file
 
mvp/subformula_assign/assign_subformulae.py ADDED
@@ -0,0 +1,216 @@
1
+ """ assign_subformulae.py
2
+
3
+ Copied from https://github.com/samgoldman97/mist/blob/main_v2/src/mist/subformulae/assign_subformulae.py
4
+
5
+ Given a set of spectra and candidates from a labels file, assign subformulae and save to JSON files.
6
+
7
+ """
8
+
9
+ from pathlib import Path
10
+ import argparse
11
+ from functools import partial
12
+ import numpy as np
13
+ import pandas as pd
14
+ import json
15
+ import os
16
+ from tqdm import tqdm
17
+ import utils
18
+
19
+
20
+ def get_args():
21
+ """get args"""
22
+ parser = argparse.ArgumentParser()
23
+ parser.add_argument(
24
+ "--feature-id",
25
+ default="ID",
26
+ help="ID key in mgf input"
27
+ )
28
+ parser.add_argument(
29
+ "--spec-files",
30
+ default="data/paired_spectra/canopus_train/spec_files/",
31
+ help="Spec files; either MGF or directory.",
32
+ )
33
+ parser.add_argument("--output-dir", default=None,
34
+ help="Name of output dir.")
35
+ parser.add_argument(
36
+ "--labels-file",
37
+ default="data/paired_spectra/canopus_train/labels.tsv",
38
+ help="Labels file",
39
+ )
40
+ parser.add_argument(
41
+ "--debug", action="store_true", default=False, help="Debug flag."
42
+ )
43
+ parser.add_argument(
44
+ "--mass-diff-type",
45
+ default="ppm",
46
+ type=str,
47
+ help="Type of mass difference - absolute differece (abs) or relative difference (ppm).",
48
+ )
49
+ parser.add_argument(
50
+ "--mass-diff-thresh",
51
+ action="store",
52
+ default=20,
53
+ type=float,
54
+ help="Threshold of mass difference.",
55
+ )
56
+ parser.add_argument(
57
+ "--inten-thresh",
58
+ action="store",
59
+ default=0.001,
60
+ type=float,
61
+ help="Threshold of MS2 subpeak intensity (normalized to 1).",
62
+ )
63
+ parser.add_argument(
64
+ "--max-formulae",
65
+ action="store",
66
+ default=50,
67
+ type=int,
68
+ help="Max number of peaks to keep",
69
+ )
70
+ parser.add_argument(
71
+ "--num-workers", action="store", default=32, type=int, help="num workers"
72
+ )
73
+ return parser.parse_args()
74
+
75
+
76
+ def process_spec_file(spec_name: str, spec_files: str, max_inten=0.001, max_peaks=60):
77
+ """_summary_
78
+
79
+ Args:
80
+ spec_name (str): Spectrum name (stem of the .ms file under spec_files).
81
+ spec_files (str): Directory containing the .ms spectrum files.
82
+ max_inten (float, optional): Minimum normalized intensity to keep. Defaults to 0.001.
83
+ max_peaks (int, optional): Maximum number of peaks to keep. Defaults to 60.
84
+
85
+ Returns:
86
+ Tuple[str, np.ndarray]: Spectrum name and processed peak array (None if empty).
87
+ """
88
+ spec_file = Path(spec_files) / f"{spec_name}.ms"
89
+
90
+ meta, tuples = utils.parse_spectra(spec_file)
91
+ spec = utils.process_spec_file(meta, tuples, max_inten=max_inten, max_peaks=max_peaks)
92
+ return spec_name, spec
93
+
94
+
95
+ def assign_subforms(spec_files, labels_file,
96
+ mass_diff_thresh: int = 20,
97
+ mass_diff_type: str = "ppm",
98
+ inten_thresh: float = 0.001,
99
+ output_dir=None,
100
+ num_workers: int = 32,
101
+ feature_id="ID",
102
+ max_formulae: int = 50,
103
+ debug=False):
104
+ """_summary_
105
+
106
+ Args:
107
+ spec_files (_type_): _description_
108
+ labels_file (_type_): _description_
109
+ mass_diff_thresh (int, optional): _description_. Defaults to 20.
110
+ mass_diff_type (str, optional): _description_. Defaults to "ppm".
111
+ inten_thresh (float, optional): _description_. Defaults to 0.001.
112
+ output_dir (_type_, optional): _description_. Defaults to None.
113
+ num_workers (int, optional): _description_. Defaults to 32.
114
+ feature_id (str, optional): _description_. Defaults to "ID".
115
+ max_formulae (int, optional): _description_. Defaults to 50.
116
+ debug (bool, optional): _description_. Defaults to False.
117
+
118
+ Raises:
119
+ ValueError: If spec_files is not a directory, an .mgf file, or a .tsv file.
120
+ """
121
+ spec_files = Path(spec_files)
122
+ label_path = Path(labels_file)
123
+
124
+ # Read in labels
125
+ labels_df = pd.read_csv(label_path, sep="\t").astype(str)
126
+ if spec_files.suffix == ".tsv": # YZC msgym-like data
127
+ labels_df.rename(columns={'identifier': 'spec',
128
+ 'adduct': 'ionization'}, inplace=True)
129
+
130
+ if debug:
131
+ labels_df = labels_df[:50]
132
+
133
+ # Define output directory name
134
+ output_dir = Path(output_dir) if output_dir is not None else None
135
+ if output_dir is None:
136
+ subform_dir = label_path.parent / "subformulae"
137
+ output_dir_name = f"subform_{max_formulae}"
138
+ output_dir = subform_dir / output_dir_name
139
+
140
+ output_dir.mkdir(exist_ok=True, parents=True)
141
+
142
+ if spec_files.suffix == ".mgf":
143
+ # Input specs
144
+ parsed_specs = utils.parse_spectra_mgf(spec_files)
145
+ input_specs = [utils.process_spec_file(*i) for i in parsed_specs]
146
+ spec_names = [i[0][feature_id] for i in parsed_specs]
147
+ input_specs = list(zip(spec_names, input_specs))
148
+ elif spec_files.is_dir():
149
+ spec_fn_lst = labels_df["spec"].to_list()
150
+ proc_spec_full = partial(
151
+ process_spec_file,
152
+ spec_files=spec_files,
153
+ max_inten=inten_thresh,
154
+ max_peaks=max_formulae,
155
+ )
156
+ # input_specs = [proc_spec_full(i) for i in tqdm(spec_fn_lst)]
157
+ input_specs = utils.chunked_parallel(
158
+ spec_fn_lst, proc_spec_full, chunks=100, max_cpu=max(num_workers, 1)
159
+ )
160
+
161
+ elif spec_files.suffix == '.tsv':
162
+ parsed_specs = utils.parse_spectra_msgym(labels_df)
163
+ input_specs = [utils.process_spec_file(*i) for i in parsed_specs]
164
+ spec_names = [i[0][feature_id] for i in parsed_specs]
165
+ input_specs = list(zip(spec_names, input_specs))
166
+ else:
167
+ raise ValueError(f"Spec files arg {spec_files} is not a dir or mgf")
168
+
169
+
170
+ # input_specs contains a list of tuples (spec, subpeak tuple array)
171
+ input_specs_dict = {tup[0]: tup[1] for tup in input_specs}
172
+ export_dicts, spec_names = [], []
173
+ for _, row in labels_df.iterrows():
174
+ spec = str(row["spec"])
175
+ new_entry = {
176
+ "spec": input_specs_dict[spec],
177
+ "form": row["formula"],
178
+ "mass_diff_type": mass_diff_type,
179
+ "spec_name": spec,
180
+ "mass_diff_thresh": mass_diff_thresh,
181
+ "ion_type": row["ionization"],
182
+ }
183
+ spec_names.append(spec)
184
+ export_dicts.append(new_entry)
185
+
186
+ # Build dicts
187
+ print(f"There are {len(export_dicts)} spec-cand pairs this spec files")
188
+ def export_wrapper(x): return utils.get_output_dict(**x)
189
+ if debug:
190
+ output_dict_lst = [export_wrapper(i) for i in export_dicts[:10]]
191
+ else:
192
+ output_dict_lst = utils.chunked_parallel(
193
+ export_dicts, export_wrapper, chunks=100, max_cpu=max(num_workers, 1)
194
+ )
195
+ assert len(export_dicts) == len(output_dict_lst)
196
+
197
+ # Write all output jsons to files
198
+ os.makedirs(output_dir, exist_ok=True)
199
+ print(f"Writing output to {output_dir}")
200
+ for output_dict, spec_name in tqdm(zip(output_dict_lst, spec_names)):
201
+ with open(output_dir / f"{spec_name}.json", "w") as f:
202
+ json.dump(output_dict, f, indent=4)
203
+ f.close()
204
+
205
+ if __name__ == "__main__":
206
+ args = get_args()
207
+ assign_subforms(spec_files=args.spec_files,
208
+ labels_file=args.labels_file,
209
+ mass_diff_thresh=args.mass_diff_thresh,
210
+ mass_diff_type=args.mass_diff_type,
211
+ inten_thresh=args.inten_thresh,
212
+ output_dir=args.output_dir,
213
+ num_workers=args.num_workers,
214
+ feature_id=args.feature_id,
215
+ max_formulae=args.max_formulae,
216
+ debug=args.debug)
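For reference, each JSON written by assign_subforms holds the candidate formula, the ion type, and a peak-assignment table (or None when no subpeak could be assigned). A minimal sketch of inspecting one output file follows; the path and spectrum name are hypothetical:

import json
import pandas as pd

# Hypothetical default output location: <labels dir>/subformulae/subform_50/<spec_name>.json
with open("subformulae/subform_50/spec_0001.json") as f:
    result = json.load(f)

print(result["cand_form"], result["cand_ion"])

if result["output_tbl"] is not None:
    # Columns: mz, ms2_inten, mono_mass, abs_mass_diff, mass_diff, formula, ions
    tbl = pd.DataFrame(result["output_tbl"])
    print(tbl.sort_values("ms2_inten", ascending=False).head())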
mvp/subformula_assign/run.sh ADDED
@@ -0,0 +1,14 @@
1
+ # SPEC_FILES="../data/sample/data.tsv"
2
+ # OUTPUT_DIR="/data/sample/subformulae"
3
+ # MAX_FORMULAE=60
4
+ # LABELS_FILE="../data/sample/data.tsv"
5
+
6
+ # python assign_subformulae.py --spec-files $SPEC_FILES --output-dir $OUTPUT_DIR --max-formulae $MAX_FORMULAE --labels-file $LABELS_FILE
7
+
8
+
9
+ SPEC_FILES="/data/yzhouc01/cancer/data.tsv"
10
+ OUTPUT_DIR="/data/yzhouc01/cancer/subformulae"
11
+ MAX_FORMULAE=60
12
+ LABELS_FILE="/data/yzhouc01/cancer/data.tsv"
13
+
14
+ python assign_subformulae.py --spec-files $SPEC_FILES --output-dir $OUTPUT_DIR --max-formulae $MAX_FORMULAE --labels-file $LABELS_FILE
mvp/subformula_assign/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+
2
+ from .parse_utils import *
3
+ from .chem_utils import *
4
+ from .parallel_utils import *
5
+ from .spectra_utils import *
mvp/subformula_assign/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (302 Bytes). View file
 
mvp/subformula_assign/utils/__pycache__/chem_utils.cpython-311.pyc ADDED
Binary file (24.9 kB). View file
 
mvp/subformula_assign/utils/__pycache__/parallel_utils.cpython-311.pyc ADDED
Binary file (3.7 kB). View file
 
mvp/subformula_assign/utils/__pycache__/parse_utils.cpython-311.pyc ADDED
Binary file (16.4 kB). View file
 
mvp/subformula_assign/utils/__pycache__/spectra_utils.cpython-311.pyc ADDED
Binary file (12 kB). View file
 
mvp/subformula_assign/utils/chem_utils.py ADDED
@@ -0,0 +1,616 @@
1
+ """chem_utils.py"""
2
+
3
+ import re
4
+ import numpy as np
5
+ import pandas as pd
6
+ import json
7
+ from functools import reduce
8
+ from collections import defaultdict
9
+
10
+ import torch
11
+ from rdkit import Chem
12
+ from rdkit.Chem import Atom
13
+ from rdkit.Chem.rdMolDescriptors import CalcMolFormula
14
+ from rdkit.Chem.Descriptors import ExactMolWt
15
+ from rdkit.Chem.MolStandardize import rdMolStandardize
16
+
17
+ P_TBL = Chem.GetPeriodicTable()
18
+
19
+ ROUND_FACTOR = 4
20
+
21
+ ELECTRON_MASS = 0.00054858
22
+ CHEM_FORMULA_SIZE = "([A-Z][a-z]*)([0-9]*)"
23
+
24
+ VALID_ELEMENTS = [
25
+ "C",
26
+ "H",
27
+ "As",
28
+ "B",
29
+ "Br",
30
+ "Cl",
31
+ "Co",
32
+ "F",
33
+ "Fe",
34
+ "I",
35
+ "K",
36
+ "N",
37
+ "Na",
38
+ "O",
39
+ "P",
40
+ "S",
41
+ "Se",
42
+ "Si",
43
+ ]
44
+ VALID_ATOM_NUM = [Atom(i).GetAtomicNum() for i in VALID_ELEMENTS]
45
+
46
+
47
+ CHEM_ELEMENT_NUM = len(VALID_ELEMENTS)
48
+
49
+ ATOM_NUM_TO_ONEHOT = torch.zeros((max(VALID_ATOM_NUM) + 1, CHEM_ELEMENT_NUM))
50
+
51
+ # Convert to onehot
52
+ ATOM_NUM_TO_ONEHOT[VALID_ATOM_NUM, torch.arange(CHEM_ELEMENT_NUM)] = 1
53
+
54
+ VALID_MONO_MASSES = np.array(
55
+ [P_TBL.GetMostCommonIsotopeMass(i) for i in VALID_ELEMENTS]
56
+ )
57
+ CHEM_MASSES = VALID_MONO_MASSES[:, None]
58
+
59
+ ELEMENT_VECTORS = np.eye(len(VALID_ELEMENTS))
60
+ ELEMENT_VECTORS_MASS = np.hstack([ELEMENT_VECTORS, CHEM_MASSES])
61
+ ELEMENT_TO_MASS = dict(zip(VALID_ELEMENTS, CHEM_MASSES.squeeze()))
62
+
63
+ ELEMENT_DIM_MASS = len(ELEMENT_VECTORS_MASS[0])
64
+ ELEMENT_DIM = len(ELEMENT_VECTORS[0])
65
+
66
+ # Reasonable normalization vector for elements
67
+ # Estimated by max counts (+ 1 when zero)
68
+ NORM_VEC = np.array([81, 158, 2, 1, 3, 10, 1, 17, 1, 6, 1, 19, 2, 34, 6, 6, 2, 6])
69
+
70
+ NORM_VEC_MASS = np.array(NORM_VEC.tolist() + [1471])
71
+
72
+ # Assume 64 is the highest repeat of any 1 atom
73
+ MAX_ELEMENT_NUM = 64
74
+
75
+ element_to_ind = dict(zip(VALID_ELEMENTS, np.arange(len(VALID_ELEMENTS))))
76
+ element_to_position = dict(zip(VALID_ELEMENTS, ELEMENT_VECTORS))
77
+ element_to_position_mass = dict(zip(VALID_ELEMENTS, ELEMENT_VECTORS_MASS))
78
+
79
+ ION_LST = [
80
+ "[M+H]+",
81
+ "[M+Na]+",
82
+ "[M+K]+",
83
+ "[M-H2O+H]+",
84
+ "[M+H3N+H]+",
85
+ "[M]+",
86
+ "[M-H4O2+H]+",
87
+ "[M-H]-",
88
+ "[M+H2O+H]+"
89
+ ]
90
+
91
+ ion_remap = dict(zip(ION_LST, ION_LST))
92
+ ion_remap.update(
93
+ {
94
+ "[M+NH4]+": "[M+H3N+H]+",
95
+ "M+H": "[M+H]+",
96
+ "M+Na": "[M+Na]+",
97
+ "M+H-H2O": "[M-H2O+H]+",
98
+ "M-H2O+H": "[M-H2O+H]+",
99
+ "M+NH4": "[M+H3N+H]+",
100
+ "M-2H2O+H": "[M-H4O2+H]+",
101
+ "[M-2H2O+H]+": "[M-H4O2+H]+",
102
+ "[M-H]-": "[M-H]-",
103
+ }
104
+ )
105
+
106
+ ion_to_idx = dict(zip(ION_LST, np.arange(len(ION_LST))))
107
+
108
+ ion_to_mass = {
109
+ "[M+H]+": ELEMENT_TO_MASS["H"] - ELECTRON_MASS,
110
+ "[M+Na]+": ELEMENT_TO_MASS["Na"] - ELECTRON_MASS,
111
+ "[M+K]+": ELEMENT_TO_MASS["K"] - ELECTRON_MASS,
112
+ "[M-H2O+H]+": -ELEMENT_TO_MASS["O"] - ELEMENT_TO_MASS["H"] - ELECTRON_MASS,
113
+ "[M+H3N+H]+": ELEMENT_TO_MASS["N"] + ELEMENT_TO_MASS["H"] * 4 - ELECTRON_MASS,
114
+ "[M]+": 0 - ELECTRON_MASS,
115
+ "[M-H4O2+H]+": -ELEMENT_TO_MASS["O"] * 2 - ELEMENT_TO_MASS["H"] * 3 - ELECTRON_MASS,
116
+ "[M-H]-": ELEMENT_TO_MASS["H"] + ELECTRON_MASS,
117
+ "[M+H2O+H]+":ELEMENT_TO_MASS["O"] * 2 + ELEMENT_TO_MASS["H"] * 2 - ELECTRON_MASS,
118
+ }
119
+
120
+ ion_to_add_vec = {
121
+ "[M+H]+": element_to_position["H"],
122
+ "[M+Na]+": element_to_position["Na"],
123
+ "[M+K]+": element_to_position["K"],
124
+ "[M-H2O+H]+": -element_to_position["O"] - element_to_position["H"],
125
+ "[M+H3N+H]+": element_to_position["N"] + element_to_position["H"] * 4,
126
+ "[M]+": np.zeros_like(element_to_position["H"]),
127
+ "[M-H4O2+H]+": -element_to_position["O"] * 2 - element_to_position["H"] * 3,
128
+ "[M+H2O+H]+": element_to_position["O"] + element_to_position["H"] * 2,
129
+
130
+ }
131
+
132
+ instrument_to_type = defaultdict(lambda : "unknown")
133
+ instrument_to_type.update({
134
+ "Thermo Finnigan Velos Orbitrap": "orbitrap",
135
+ "Thermo Finnigan Elite Orbitrap": "orbitrap",
136
+ "Orbitrap Fusion Lumos": "orbitrap",
137
+ "Q-ToF (LCMS)": "qtof",
138
+ "Unknown (LCMS)": "unknown",
139
+ "ion trap": "iontrap",
140
+ "FTICR (LCMS)": "fticr",
141
+ "Bruker Q-ToF (LCMS)": "qtof",
142
+ "Orbitrap (LCMS)": "orbitrap",
143
+ })
144
+
145
+ instruments = sorted(list(set(instrument_to_type.values())))
146
+ max_instr_idx = len(instruments) + 1
147
+ instrument_to_idx = dict(zip(instruments, np.arange(len(instruments))))
148
+
149
+
150
+ # Define rdbe mult
151
+ rdbe_mult = np.zeros_like(ELEMENT_VECTORS[0])
152
+ els = ["C", "N", "P", "H", "Cl", "Br", "I", "F"]
153
+ weights = [2, 1, 1, -1, -1, -1, -1, -1]
154
+ for k, v in zip(els, weights):
155
+ rdbe_mult[element_to_ind[k]] = v
156
+
157
+
158
+ def get_ion_idx(ionization: str) -> int:
159
+ """map ionization to its index in one hot encoding"""
160
+ return ion_to_idx[ionization]
161
+
162
+
163
+ def get_instr_idx(instrument: str) -> int:
164
+ """map instrument to its index in one hot encoding"""
165
+ inst = instrument_to_type.get(instrument, "unknown")
166
+ return instrument_to_idx[inst]
167
+
168
+
169
+ def has_valid_els(chem_formula: str) -> bool:
170
+ """has_valid_els"""
171
+ for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
172
+ if chem_symbol not in VALID_ELEMENTS:
173
+ return False
174
+ return True
175
+
176
+
177
+ def formula_to_dense(chem_formula: str) -> np.ndarray:
178
+ """formula_to_dense.
179
+
180
+ Args:
181
+ chem_formula (str): Input chemical formula
182
+ Return:
183
+ np.ndarray of vector
184
+
185
+ """
186
+ total_onehot = []
187
+ for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
188
+ # Convert num to int
189
+ num = 1 if num == "" else int(num)
190
+ one_hot = element_to_position[chem_symbol].reshape(1, -1)
191
+ one_hot_repeats = np.repeat(one_hot, repeats=num, axis=0)
192
+ total_onehot.append(one_hot_repeats)
193
+
194
+ # Check if null
195
+ if len(total_onehot) == 0:
196
+ dense_vec = np.zeros(len(element_to_position))
197
+ else:
198
+ dense_vec = np.vstack(total_onehot).sum(0)
199
+ return dense_vec
200
+
201
+
202
+ def cross_sum(x, y):
203
+ """cross_sum."""
204
+ return (np.expand_dims(x, 0) + np.expand_dims(y, 1)).reshape(-1, y.shape[-1])
205
+
206
+
207
+ def get_all_subsets_dense(
208
+ dense_formula: np.ndarray, element_vectors
209
+ ) -> (np.ndarray, np.ndarray):
210
+ """_summary_
211
+
212
+ Args:
213
+ dense_formula (np.ndarray): Dense element-count vector of the parent formula.
214
+ element_vectors (np.ndarray): One-hot element basis vectors.
215
+
216
+ Returns:
217
+ (np.ndarray, np.ndarray): Subset formula vectors and their monoisotopic masses.
218
+ """
219
+
220
+ non_zero = np.argwhere(dense_formula > 0).flatten()
221
+
222
+ vectorized_formula = []
223
+ for nonzero_ind in non_zero:
224
+ temp = element_vectors[nonzero_ind] * np.arange(
225
+ 0, dense_formula[nonzero_ind] + 1
226
+ ).reshape(-1, 1)
227
+ vectorized_formula.append(temp)
228
+
229
+ zero_vec = np.zeros((1, element_vectors.shape[-1]))
230
+ cross_prod = reduce(cross_sum, vectorized_formula, zero_vec)
231
+
232
+ cross_prod_inds = rdbe_filter(cross_prod)
233
+ cross_prod = cross_prod[cross_prod_inds]
234
+ all_masses = cross_prod.dot(VALID_MONO_MASSES)
235
+ return cross_prod, all_masses
236
+
237
+
238
+ def get_all_subsets(chem_formula: str):
239
+ dense_formula = formula_to_dense(chem_formula)
240
+ return get_all_subsets_dense(dense_formula, element_vectors=ELEMENT_VECTORS)
241
+
242
+
243
+ def rdbe_filter(cross_prod):
244
+ """rdbe_filter.
245
+ Args:
246
+ cross_prod:
247
+ """
248
+ rdbe_total = 1 + 0.5 * cross_prod.dot(rdbe_mult)
249
+ filter_inds = np.argwhere(rdbe_total >= 0).flatten()
250
+ return filter_inds
251
+
252
+
277
+
278
+
279
+ def formula_to_dense_mass(chem_formula: str) -> np.ndarray:
280
+ """formula_to_dense_mass.
281
+
282
+ Return formula including full compound mass
283
+
284
+ Args:
285
+ chem_formula (str): Input chemical formula
286
+ Return:
287
+ np.ndarray of vector
288
+
289
+ """
290
+ total_onehot = []
291
+ for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
292
+ # Convert num to int
293
+ num = 1 if num == "" else int(num)
294
+ one_hot = element_to_position_mass[chem_symbol].reshape(1, -1)
295
+ one_hot_repeats = np.repeat(one_hot, repeats=num, axis=0)
296
+ total_onehot.append(one_hot_repeats)
297
+
298
+ # Check if null
299
+ if len(total_onehot) == 0:
300
+ dense_vec = np.zeros(len(element_to_position_mass["H"]))
301
+ else:
302
+ dense_vec = np.vstack(total_onehot).sum(0)
303
+
304
+ return dense_vec
305
+
306
+
307
+ def formula_to_dense_mass_norm(chem_formula: str) -> np.ndarray:
308
+ """formula_to_dense_mass_norm.
309
+
310
+ Return formula including full compound mass and normalized
311
+
312
+ Args:
313
+ chem_formula (str): Input chemical formula
314
+ Return:
315
+ np.ndarray of vector
316
+
317
+ """
318
+ dense_vec = formula_to_dense_mass(chem_formula)
319
+ dense_vec = dense_vec / NORM_VEC_MASS
320
+
321
+ return dense_vec
322
+
323
+
324
+ def formula_mass(chem_formula: str) -> float:
325
+ """get formula mass"""
326
+ mass = 0
327
+ for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
328
+ # Convert num to int
329
+ num = 1 if num == "" else int(num)
330
+ mass += ELEMENT_TO_MASS[chem_symbol] * num
331
+ return mass
332
+
333
+
334
+ def electron_correct(mass: float) -> float:
335
+ """subtract the rest mass of an electron"""
336
+ return mass - ELECTRON_MASS
337
+
338
+
339
+ def formula_difference(formula_1, formula_2):
340
+ """formula_1 - formula_2"""
341
+ form_1 = {
342
+ chem_symbol: (int(num) if num != "" else 1)
343
+ for chem_symbol, num in re.findall(CHEM_FORMULA_SIZE, formula_1)
344
+ }
345
+ form_2 = {
346
+ chem_symbol: (int(num) if num != "" else 1)
347
+ for chem_symbol, num in re.findall(CHEM_FORMULA_SIZE, formula_2)
348
+ }
349
+
350
+ for k, v in form_2.items():
351
+ form_1[k] = form_1[k] - form_2[k]
352
+ out_formula = "".join([f"{k}{v}" for k, v in form_1.items() if v > 0])
353
+ return out_formula
354
+
355
+
356
+ def get_mol_from_structure_string(structure_string, structure_type):
357
+ if structure_type == "InChI":
358
+ mol = Chem.MolFromInchi(structure_string)
359
+ else:
360
+ mol = Chem.MolFromSmiles(structure_string)
361
+ return mol
362
+
363
+
364
+ def vec_to_formula(form_vec):
365
+ """vec_to_formula."""
366
+ build_str = ""
367
+ for i in np.argwhere(form_vec > 0).flatten():
368
+ el = VALID_ELEMENTS[i]
369
+ ct = int(form_vec[i])
370
+ new_item = f"{el}{ct}" if ct > 1 else f"{el}"
371
+ build_str = build_str + new_item
372
+ return build_str
373
+
374
+
375
+ def standardize_form(i):
376
+ """standardize_form."""
377
+ return vec_to_formula(formula_to_dense(i))
378
+
379
+
380
+ def standardize_adduct(adduct):
381
+ """standardize_adduct."""
382
+ adduct = adduct.replace(" ", "")
383
+ adduct = ion_remap.get(adduct, adduct)
384
+ if adduct not in ION_LST:
385
+ raise ValueError(f"Adduct {adduct} not in ION_LST")
386
+ return adduct
387
+
388
+
389
+ def calc_structure_string_type(structure_string):
390
+ """calc_structure_string_type.
391
+
392
+ Args:
393
+ structure_string:
394
+ """
395
+ structure_type = None
396
+ if pd.isna(structure_string):
397
+ structure_type = "empty"
398
+ elif structure_string.startswith("InChI="):
399
+ structure_type = "InChI"
400
+ elif Chem.MolFromSmiles(structure_string) is not None:
401
+ structure_type = "Smiles"
402
+ return structure_type
403
+
404
+
405
+ def uncharged_formula(mol, mol_type="mol") -> str:
406
+ """Compute uncharged formula"""
407
+ if mol_type == "mol":
408
+ chem_formula = CalcMolFormula(mol)
409
+ elif mol_type == "smiles":
410
+ mol = Chem.MolFromSmiles(mol)
411
+ if mol is None:
412
+ return None
413
+ chem_formula = CalcMolFormula(mol)
414
+ else:
415
+ raise ValueError()
416
+
417
+ return re.findall(r"^([^\+,^\-]*)", chem_formula)[0]
418
+
419
+
420
+ def form_from_smi(smi: str) -> str:
421
+ """form_from_smi.
422
+
423
+ Args:
424
+ smi (str): smi
425
+
426
+ Return:
427
+ str
428
+ """
429
+ mol = Chem.MolFromSmiles(smi)
430
+ if mol is None:
431
+ return ""
432
+ else:
433
+ return CalcMolFormula(mol)
434
+
435
+
436
+ def inchikey_from_smiles(smi: str) -> str:
437
+ """inchikey_from_smiles.
438
+
439
+ Args:
440
+ smi (str): smi
441
+
442
+ Returns:
443
+ str:
444
+ """
445
+ mol = Chem.MolFromSmiles(smi)
446
+ if mol is None:
447
+ return ""
448
+ else:
449
+ return Chem.MolToInchiKey(mol)
450
+
451
+
452
+ def contains_metals(formula: str) -> bool:
453
+ """returns true if formula contains metals"""
454
+ METAL_RE = "(Fe|Co|Zn|Rh|Pt|Li)"
455
+ return len(re.findall(METAL_RE, formula)) > 0
456
+
457
+
458
+ class SmilesStandardizer(object):
459
+ """Standardize smiles"""
460
+
461
+ def __init__(self, *args, **kwargs):
462
+ self.fragment_standardizer = rdMolStandardize.LargestFragmentChooser()
463
+ self.charge_standardizer = rdMolStandardize.Uncharger()
464
+
465
+ def standardize_smiles(self, smi):
466
+ """Standardize smiles string"""
467
+ mol = Chem.MolFromSmiles(smi)
468
+ out_smi = self.standardize_mol(mol)
469
+ return out_smi
470
+
471
+ def standardize_mol(self, mol) -> str:
472
+ """Standardize smiles string"""
473
+ mol = self.fragment_standardizer.choose(mol)
474
+ mol = self.charge_standardizer.uncharge(mol)
475
+
476
+ # Write the canonical SMILES with stereochemistry stripped
478
+ output_smi = Chem.MolToSmiles(mol, isomericSmiles=False)
479
+ return output_smi
480
+
481
+
482
+ def mass_from_smi(smi: str) -> float:
483
+ """mass_from_smi.
484
+
485
+ Args:
486
+ smi (str): smi
487
+
488
+ Return:
489
+ str
490
+ """
491
+ mol = Chem.MolFromSmiles(smi)
492
+ if mol is None:
493
+ return 0
494
+ else:
495
+ return ExactMolWt(mol)
496
+
497
+
498
+ def min_formal_from_smi(smi: str):
499
+ mol = Chem.MolFromSmiles(smi)
500
+ if mol is None:
501
+ return 0
502
+ else:
503
+ formal = np.array([j.GetFormalCharge() for j in mol.GetAtoms()])
504
+ return formal.min()
505
+
506
+
507
+ def max_formal_from_smi(smi: str):
508
+ mol = Chem.MolFromSmiles(smi)
509
+ if mol is None:
510
+ return 0
511
+ else:
512
+ formal = np.array([j.GetFormalCharge() for j in mol.GetAtoms()])
513
+ return formal.max()
514
+
515
+
516
+ def atoms_from_smi(smi: str) -> int:
517
+ """atoms_from_smi.
518
+
519
+ Args:
520
+ smi (str): smi
521
+
522
+ Return:
523
+ int
524
+ """
525
+ mol = Chem.MolFromSmiles(smi)
526
+ if mol is None:
527
+ return 0
528
+ else:
529
+ return mol.GetNumAtoms()
530
+
531
+
538
+
539
+
540
+ def add_ion(form: str, ion: str):
541
+ """add_ion.
542
+ Args:
543
+ form (str): form
544
+ ion (str): ion
545
+ """
546
+ ion_vec = ion_to_add_vec[ion]
547
+ form_vec = formula_to_dense(form)
548
+ return vec_to_formula(form_vec + ion_vec)
549
+
550
+
551
+ def achiral_smi(smi: str) -> str:
552
+ """achiral_smi.
553
+
554
+ Return:
555
+ achiral (non-isomeric) SMILES
556
+
557
+ """
558
+ try:
559
+ mol = Chem.MolFromSmiles(smi)
560
+ if mol is not None:
561
+ smi = Chem.MolToSmiles(mol, isomericSmiles=False)
562
+ return smi
563
+ else:
564
+ return ""
565
+ except Exception:
566
+ return ""
567
+
568
+
569
+ def npclassifer_query(inputs):
570
+ """npclassifier_query.
571
+
572
+ Args:
573
+ input: Tuple of name, molecule
574
+ Return:
575
+ Dict of name to molecule
576
+ """
577
+ import requests
578
+
579
+ spec = inputs[0]
580
+ endpoint = "https://npclassifier.ucsd.edu/classify"
581
+ req_data = {"smiles": inputs[1]}
582
+ out = requests.get(f"{endpoint}", data=req_data)
583
+ out.raise_for_status()
584
+ out_json = out.json()
585
+ return {spec: out_json}
586
+
587
+
588
+ def clipped_ppm(mass_diff: np.ndarray, parentmass: np.ndarray) -> np.ndarray:
589
+ """clipped_ppm.
590
+
591
+ Args:
592
+ mass_diff (np.ndarray): mass_diff
593
+ parentmass (np.ndarray): parentmass
594
+
595
+ Returns:
596
+ np.ndarray:
597
+ """
598
+ parentmass_copy = parentmass * 1
599
+ parentmass_copy[parentmass < 200] = 200
600
+ ppm = mass_diff / parentmass_copy * 1e6
601
+ return ppm
602
+
603
+
604
+ def clipped_ppm_single(
605
+ cls_mass_diff: float,
606
+ parentmass: float,
607
+ ):
608
+ """clipped_ppm_single.
609
+
610
+ Args:
611
+ cls_mass_diff (float): cls_mass_diff
612
+ parentmass (float): parentmass
613
+ """
614
+ div_factor = 200 if parentmass < 200 else parentmass
615
+ cls_ppm = cls_mass_diff / div_factor * 1e6
616
+ return cls_ppm
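As a quick illustration of the formula helpers defined above, the sketch below round-trips a small formula, enumerates its RDBE-filtered subformula subsets, and adds an adduct. It assumes it is run from mvp/subformula_assign so that the local utils package (and its torch/rdkit dependencies) resolves:

# Run from mvp/subformula_assign so that `utils` is the local package
from utils.chem_utils import (
    formula_to_dense, vec_to_formula, get_all_subsets, add_ion, formula_mass,
)

vec = formula_to_dense("C2H6O")            # dense element-count vector
print(vec_to_formula(vec))                 # -> C2H6O
print(round(formula_mass("C2H6O"), 4))     # monoisotopic mass of the neutral formula

subsets, masses = get_all_subsets("C2H6O") # RDBE-valid subformula vectors and their masses
print(subsets.shape, masses.shape)

print(add_ion("C2H6O", "[M+H]+"))          # formula of the protonated adduct -> C2H7O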
mvp/subformula_assign/utils/parallel_utils.py ADDED
@@ -0,0 +1,84 @@
1
+ """parallel_utils.py"""
2
+ import logging
3
+ from multiprocess.context import TimeoutError
4
+ from pathos import multiprocessing as mp
5
+ from tqdm import tqdm
6
+
7
+
8
+ def simple_parallel(
9
+ input_list, function, max_cpu=16, timeout=4000, max_retries=3, use_ray: bool = False
10
+ ):
11
+ """Simple parallelization.
12
+
13
+ Use map async and retries in case we get odd stalling behavior.
14
+
15
+ input_list: Input list to op on
16
+ function: Fn to apply
17
+ max_cpu: Num cpus
18
+ timeout: Length of timeout
19
+ max_retries: Num times to retry this
20
+ use_ray
21
+
22
+ """
23
+ # Ray branch is intentionally disabled; always fall through to the multiprocessing pool
24
+ if use_ray and False:
25
+ import ray
26
+
27
+ @ray.remote
28
+ def ray_func(x):
29
+ return function(x)
30
+
31
+ return ray.get([ray_func.remote(x) for x in input_list])
32
+
33
+ from multiprocess.context import TimeoutError
34
+ from pathos import multiprocessing as mp
35
+
36
+ cpus = min(mp.cpu_count(), max_cpu)
37
+ pool = mp.Pool(processes=cpus)
38
+ results = pool.map(function, input_list)
39
+ pool.close()
40
+ pool.join()
41
+ return results
42
+
43
+
44
+ def chunked_parallel(
45
+ input_list, function, chunks=100, max_cpu=16, timeout=4000, max_retries=3
46
+ ):
47
+ """chunked_parallel.
48
+
49
+ Args:
50
+ input_list : list of objects to apply function
51
+ function : Callable with 1 input and returning a single value
52
+ chunks: number of chunks
53
+ max_cpu: Max num cpus
54
+ timeout: Length of timeout
55
+ max_retries: Num times to retry this
56
+ """
57
+
58
+ # Adding it here fixes some settings disrupted elsewhere
59
+
60
+ def batch_func(list_inputs):
61
+ outputs = []
62
+ for i in list_inputs:
63
+ outputs.append(function(i))
64
+ return outputs
65
+
66
+ list_len = len(input_list)
67
+ num_chunks = min(list_len, chunks)
68
+ step_size = len(input_list) // num_chunks + 1
69
+
70
+ chunked_list = [
71
+ input_list[i : i + step_size] for i in range(0, len(input_list), step_size)
72
+ ]
73
+
74
+ list_outputs = simple_parallel(
75
+ chunked_list,
76
+ batch_func,
77
+ max_cpu=max_cpu,
78
+ timeout=timeout,
79
+ max_retries=max_retries,
80
+ )
81
+ # Unroll
82
+ full_output = [j for i in list_outputs for j in i]
83
+
84
+ return full_output
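A small usage sketch for chunked_parallel, the helper the assignment script uses to fan work out across CPUs; the square function here is purely illustrative:

# Run from mvp/subformula_assign so that `utils` is the local package
from utils.parallel_utils import chunked_parallel

def square(x):
    return x * x

if __name__ == "__main__":
    inputs = list(range(1000))
    # Inputs are grouped into ~100 chunks; each chunk runs in a pathos worker process
    outputs = chunked_parallel(inputs, square, chunks=100, max_cpu=4)
    assert outputs == [x * x for x in inputs]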
mvp/subformula_assign/utils/parse_utils.py ADDED
@@ -0,0 +1,295 @@
1
+ """ parse_utils.py """
2
+ from pathlib import Path
3
+ from typing import Tuple, List, Optional
4
+ from itertools import groupby
5
+
6
+ from tqdm import tqdm
7
+ import numpy as np
8
+ import pandas as pd
9
+
10
+
11
+ def parse_spectra(spectra_file: str) -> Tuple[dict, List[Tuple[str, np.ndarray]]]:
12
+ """parse_spectra.
13
+
14
+ Parses spectra in the SIRIUS .ms format.
15
+
16
+ Args:
17
+ spectra_file (str): Name of spectra file to parse
18
+ Return:
19
+ Tuple[dict, List[Tuple[str, np.ndarray]]]: metadata and list of spectra
20
+ tuples containing name and array
21
+ """
22
+ lines = [i.strip() for i in open(spectra_file, "r").readlines()]
23
+
24
+ group_num = 0
25
+ metadata = {}
26
+ spectras = []
27
+ my_iterator = groupby(
28
+ lines, lambda line: line.startswith(">") or line.startswith("#")
29
+ )
30
+
31
+ for index, (start_line, lines) in enumerate(my_iterator):
32
+ group_lines = list(lines)
33
+ subject_lines = list(next(my_iterator)[1])
34
+ # Get spectra
35
+ if group_num > 0:
36
+ spectra_header = group_lines[0].split(">")[1]
37
+ peak_data = [
38
+ [float(x) for x in peak.split()[:2]]
39
+ for peak in subject_lines
40
+ if peak.strip()
41
+ ]
42
+ # Check if spectra is empty
43
+ if len(peak_data):
44
+ peak_data = np.vstack(peak_data)
45
+ # Add new tuple
46
+ spectras.append((spectra_header, peak_data))
47
+ # Get meta data
48
+ else:
49
+ entries = {}
50
+ for i in group_lines:
51
+ if " " not in i:
52
+ continue
53
+ elif i.startswith("#INSTRUMENT TYPE"):
54
+ key = "#INSTRUMENT TYPE"
55
+ val = i.split(key)[1].strip()
56
+ entries[key[1:]] = val
57
+ else:
58
+ start, end = i.split(" ", 1)
59
+ start = start[1:]
60
+ while start in entries:
61
+ start = f"{start}'"
62
+ entries[start] = end
63
+
64
+ metadata.update(entries)
65
+ group_num += 1
66
+
67
+ metadata["_FILE_PATH"] = spectra_file
68
+ metadata["_FILE"] = Path(spectra_file).stem
69
+ return metadata, spectras
70
+
71
+
72
+ def spec_to_ms_str(
73
+ spec: List[Tuple[str, np.ndarray]], essential_keys: dict, comments: dict = {}
74
+ ) -> str:
75
+ """spec_to_ms_str.
76
+
77
+ Turn spec ars and info dicts into str for output file
78
+
79
+
80
+ Args:
81
+ spec (List[Tuple[str, np.ndarray]]): spec
82
+ essential_keys (dict): essential_keys
83
+ comments (dict): comments
84
+
85
+ Returns:
86
+ str:
87
+ """
88
+
89
+ def pair_rows(rows):
90
+ return "\n".join([f"{i} {j}" for i, j in rows])
91
+
92
+ header = "\n".join(f">{k} {v}" for k, v in essential_keys.items())
93
+ comments = "\n".join(f"#{k} {v}" for k, v in comments.items())
94
+ spec_strs = [f">{name}\n{pair_rows(ar)}" for name, ar in spec]
95
+ spec_str = "\n\n".join(spec_strs)
96
+ output = f"{header}\n{comments}\n\n{spec_str}"
97
+ return output
98
+
99
+
100
+ def build_mgf_str(
101
+ meta_spec_list: List[Tuple[dict, List[Tuple[str, np.ndarray]]]],
102
+ merge_charges=True,
103
+ parent_mass_keys=["PEPMASS", "parentmass", "PRECURSOR_MZ"],
104
+ ) -> str:
105
+ """build_mgf_str.
106
+
107
+ Args:
108
+ meta_spec_list (List[Tuple[dict, List[Tuple[str, np.ndarray]]]]): meta_spec_list
109
+
110
+ Returns:
111
+ str:
112
+ """
113
+ entries = []
114
+ for meta, spec in tqdm(meta_spec_list):
115
+ str_rows = ["BEGIN IONS"]
116
+
117
+ # Try to add precursor mass
118
+ for i in parent_mass_keys:
119
+ if i in meta:
120
+ pep_mass = float(meta.get(i, -100))
121
+ str_rows.append(f"PEPMASS={pep_mass}")
122
+ break
123
+
124
+ for k, v in meta.items():
125
+ str_rows.append(f"{k.upper().replace(' ', '_')}={v}")
126
+
127
+ if merge_charges:
128
+ spec_ar = np.vstack([i[1] for i in spec])
129
+ spec_ar = np.vstack([i for i in sorted(spec_ar, key=lambda x: x[0])])
130
+ else:
131
+ raise NotImplementedError()
132
+ str_rows.extend([f"{i} {j}" for i, j in spec_ar])
133
+ str_rows.append("END IONS")
134
+
135
+ str_out = "\n".join(str_rows)
136
+ entries.append(str_out)
137
+
138
+ full_out = "\n\n".join(entries)
139
+ return full_out
140
+
141
+
142
+ def parse_spectra_msp(
143
+ mgf_file: str, max_num: Optional[int] = None
144
+ ) -> List[Tuple[dict, List[Tuple[str, np.ndarray]]]]:
145
+ """parse_spectr_msp.
146
+
147
+ Parses spectra in the MSP file format
148
+
149
+ Args:
150
+ mgf_file (str) : str
151
+ max_num (Optional[int]): If set, only parse this many
152
+ Return:
153
+ List[Tuple[dict, List[Tuple[str, np.ndarray]]]]: metadata and list of spectra
154
+ tuples containing name and array
155
+ """
156
+
157
+ key = lambda x: x.strip().startswith("PEPMASS")
158
+ parsed_spectra = []
159
+ with open(mgf_file, "r", encoding="utf-8") as fp:
160
+ for (is_header, group) in tqdm(groupby(fp, key)):
161
+
162
+ if is_header:
163
+ continue
164
+ meta = dict()
165
+ spectra = []
166
+ # Note: Sometimes we have multiple scans
167
+ # This mgf has them collapsed
168
+ cur_spectra_name = "spec"
169
+ cur_spectra = []
170
+ group = list(group)
171
+ for line in group:
172
+ line = line.strip()
173
+ if not line:
174
+ pass
175
+ elif ":" in line:
176
+ k, v = [i.strip() for i in line.split(":", 1)]
177
+ meta[k] = v
178
+ else:
179
+ mz, intens = line.split()
180
+ cur_spectra.append((float(mz), float(intens)))
181
+
182
+ if len(cur_spectra) > 0:
183
+ cur_spectra = np.vstack(cur_spectra)
184
+ spectra.append((cur_spectra_name, cur_spectra))
185
+ parsed_spectra.append((meta, spectra))
186
+ else:
187
+ pass
188
+ # print("no spectra found for group: ", "".join(group))
189
+
190
+ if max_num is not None and len(parsed_spectra) > max_num:
191
+ # print("Breaking")
192
+ break
193
+ return parsed_spectra
194
+
195
+
196
+ def parse_spectra_mgf(
197
+ mgf_file: str, max_num: Optional[int] = None
198
+ ) -> List[Tuple[dict, List[Tuple[str, np.ndarray]]]]:
199
+ """parse_spectr_mgf.
200
+
201
+ Parses spectra in the MGF file format.
202
+
203
+ Args:
204
+ mgf_file (str) : str
205
+ max_num (Optional[int]): If set, only parse this many
206
+ Return:
207
+ List[Tuple[dict, List[Tuple[str, np.ndarray]]]]: metadata and list of spectra
208
+ tuples containing name and array
209
+ """
210
+
211
+ key = lambda x: x.strip() == "BEGIN IONS"
212
+ parsed_spectra = []
213
+ with open(mgf_file, "r") as fp:
214
+
215
+ for (is_header, group) in tqdm(groupby(fp, key)):
216
+
217
+ if is_header:
218
+ continue
219
+
220
+ meta = dict()
221
+ spectra = []
222
+ # Note: Sometimes we have multiple scans
223
+ # This mgf has them collapsed
224
+ cur_spectra_name = "spec"
225
+ cur_spectra = []
226
+ group = list(group)
227
+ for line in group:
228
+ line = line.strip()
229
+ if not line:
230
+ pass
231
+ elif line == "END IONS" or line == "BEGIN IONS":
232
+ pass
233
+ elif "=" in line:
234
+ k, v = [i.strip() for i in line.split("=", 1)]
235
+ meta[k] = v
236
+ else:
237
+ mz, intens = line.split()
238
+ cur_spectra.append((float(mz), float(intens)))
239
+
240
+ if len(cur_spectra) > 0:
241
+ cur_spectra = np.vstack(cur_spectra)
242
+ spectra.append((cur_spectra_name, cur_spectra))
243
+ parsed_spectra.append((meta, spectra))
244
+ else:
245
+ pass
246
+ # print("no spectra found for group: ", "".join(group))
247
+
248
+ if max_num is not None and len(parsed_spectra) > max_num:
249
+ # print("Breaking")
250
+ break
251
+ return parsed_spectra
252
+
253
+
254
+ def parse_tsv_spectra(spectra_file: str) -> List[Tuple[str, np.ndarray]]:
255
+ """parse_tsv_spectra.
256
+
257
+ Parses spectra returned from sirius fragmentation tree
258
+
259
+ Args:
260
+ spectra_file (str): Name of spectra tsv file to parse
261
+ Return:
262
+ List[Tuple[str, np.ndarray]]]: list of spectra
263
+ tuples containing name and array. This is used to maintain
264
+ consistency with the parse_spectra output
265
+ """
266
+ output_spec = []
267
+ with open(spectra_file, "r") as fp:
268
+ for index, line in enumerate(fp):
269
+ if index == 0:
270
+ continue
271
+ line = line.strip().split("\t")
272
+ intensity = float(line[1])
273
+ exact_mass = float(line[3])
274
+ output_spec.append([exact_mass, intensity])
275
+
276
+ output_spec = np.array(output_spec)
277
+ return_obj = [("sirius_spec", output_spec)]
278
+ return return_obj
279
+
280
+ # YZC parse msgym-like formatted data
281
+ def parse_spectra_msgym(df):
282
+
283
+ parsed_spectra = []
284
+ for _, row in df.iterrows():
285
+ mzs = [float(m) for m in row['mzs'].split(',')]
286
+ intensities = [float(i) for i in row['intensities'].split(',')]
287
+ cur_spectra = [(m, i) for m, i in zip(mzs, intensities)]
288
+ cur_spectra = np.vstack(cur_spectra)
289
+ cur_spectra_name = row['spec']
290
+ meta = {'ID': cur_spectra_name,
291
+ 'parentmass': row['parent_mass']}
292
+ parsed_spectra.append((meta, [(cur_spectra_name, cur_spectra)]))
293
+ return parsed_spectra
294
+
295
+
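The msgym-style parser above expects comma-separated mzs and intensities columns; a minimal sketch with a hypothetical one-row DataFrame:

import pandas as pd
# Run from mvp/subformula_assign so that `utils` is the local package
from utils.parse_utils import parse_spectra_msgym

# Hypothetical single spectrum in the msgym-like layout used by the TSV branch
df = pd.DataFrame([{
    "spec": "demo_0001",
    "parent_mass": 181.0707,
    "mzs": "57.0335,85.0284,163.0601",
    "intensities": "0.12,0.40,1.00",
}])

parsed = parse_spectra_msgym(df)
meta, spectra = parsed[0]
print(meta["ID"], meta["parentmass"])    # demo_0001 181.0707
name, peaks = spectra[0]
print(peaks.shape)                       # (3, 2) array of (m/z, intensity) rows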
mvp/subformula_assign/utils/spectra_utils.py ADDED
@@ -0,0 +1,326 @@
1
+ """ spectra_utils.py"""
2
+ import logging
3
+ import numpy as np
4
+ from typing import List
5
+
6
+
7
+ from .chem_utils import (
8
+ vec_to_formula,
9
+ get_all_subsets,
10
+ ion_to_mass,
11
+ ION_LST,
12
+ clipped_ppm,
13
+ )
14
+
15
+
16
+ def bin_spectra(
17
+ spectras: List[np.ndarray], num_bins: int = 2000, upper_limit: int = 1000
18
+ ) -> np.ndarray:
19
+ """bin_spectra.
20
+
21
+ Args:
22
+ spectras (List[np.ndarray]): Input list of spectra tuples
23
+ [(header, spec array)]
24
+ num_bins (int): Number of discrete bins from [0, upper_limit)
25
+ upper_limit (int): Max m/z to consider featurizing
26
+
27
+ Return:
28
+ np.ndarray of shape [channels, num_bins]
29
+ """
30
+ bins = np.linspace(0, upper_limit, num=num_bins)
31
+ binned_spec = np.zeros((len(spectras), len(bins)))
32
+ for spec_index, spec in enumerate(spectras):
33
+
34
+ # Convert to digitized spectra
35
+ digitized_mz = np.digitize(spec[:, 0], bins=bins)
36
+
37
+ # Remove all spectral peaks out of range
38
+ in_range = digitized_mz < len(bins)
39
+ digitized_mz, spec = digitized_mz[in_range], spec[in_range, :]
40
+
41
+ # Add the current peaks to the spectra
42
+ # Use a loop rather than vectorize because certain bins have conflicts
43
+ # based upon resolution
44
+ for bin_index, spec_val in zip(digitized_mz, spec[:, 1]):
45
+ binned_spec[spec_index, bin_index] += spec_val
46
+
47
+ return binned_spec
48
+
49
+
50
+ def merge_norm_spectra(spec_tuples, precision=4) -> np.ndarray:
51
+ """merge_norm_spectra.
52
+
53
+ Take a list of mz, inten tuple arrays and merge them by 4 digit precision
54
+
55
+ Note this uses _max_ merging
56
+
57
+ """
58
+ mz_to_inten_pair = {}
59
+ for i in spec_tuples:
60
+ for tup in i:
61
+ mz, inten = tup
62
+ mz_ind = np.round(mz, precision)
63
+ cur_pair = mz_to_inten_pair.get(mz_ind)
64
+ if cur_pair is None:
65
+ mz_to_inten_pair[mz_ind] = tup
66
+ elif inten > cur_pair[1]:
67
+ mz_to_inten_pair[mz_ind] = (mz_ind, inten)
68
+ else:
69
+ pass
70
+
71
+ merged_spec = np.vstack([v for k, v in mz_to_inten_pair.items()])
72
+ merged_spec[:, 1] = merged_spec[:, 1] / merged_spec[:, 1].max()
73
+ return merged_spec
74
+
75
+
76
+ def norm_spectrum(binned_spec: np.ndarray) -> np.ndarray:
77
+ """norm_spectrum.
78
+
79
+ Normalizes each spectral channel to have norm 1
80
+ This change is made in place
81
+
82
+ Args:
83
+ binned_spec (np.ndarray) : Vector of spectras
84
+
85
+ Return:
86
+ np.ndarray where each channel has max(1)
87
+ """
88
+
89
+ spec_maxes = binned_spec.max(1)
90
+
91
+ non_zero_max = spec_maxes > 0
92
+
93
+ spec_maxes = spec_maxes[non_zero_max]
94
+ binned_spec[non_zero_max] = binned_spec[non_zero_max] / spec_maxes.reshape(-1, 1)
95
+
96
+ return binned_spec
97
+
98
+
99
+ def process_spec_file(meta, tuples, precision=4, max_inten=0.001, max_peaks=60):
100
+ """process_spec_file."""
101
+
102
+ if "parentmass" in meta:
103
+ parentmass = meta.get("parentmass", None)
104
+ elif "PARENTMASS" in meta:
105
+ parentmass = meta.get("PARENTMASS", None)
106
+ elif "PEPMASS" in meta:
107
+ parentmass = meta.get("PEPMASS", None)
108
+ else:
109
+ logging.debug(f"missing parentmass for spec")
110
+ parentmass = 1000000
111
+
112
+ parentmass = float(parentmass)
113
+
114
+ # First norm spectra
115
+ fused_tuples = [x for _, x in tuples if x.size > 0]
116
+
117
+ if len(fused_tuples) == 0:
118
+ return
119
+
120
+ mz_to_inten_pair = {}
121
+ new_tuples = []
122
+ for i in fused_tuples:
123
+ for tup in i:
124
+ mz, inten = tup
125
+ mz_ind = np.round(mz, precision)
126
+ cur_pair = mz_to_inten_pair.get(mz_ind)
127
+ if cur_pair is None:
128
+ mz_to_inten_pair[mz_ind] = tup
129
+ new_tuples.append(tup)
130
+ elif inten > cur_pair[1]:
131
+ cur_pair[1] = inten
132
+ else:
133
+ pass
134
+
135
+ merged_spec = np.vstack(new_tuples)
136
+ merged_spec = merged_spec[merged_spec[:, 0] <= (parentmass + 1)] # could end up removing all peaks?
137
+ try:
138
+ merged_spec[:, 1] = merged_spec[:, 1] / merged_spec[:, 1].max()
139
+ except Exception:
140
+ return
141
+
142
+ # Sqrt intensities here
143
+ merged_spec[:, 1] = np.sqrt(merged_spec[:, 1])
144
+
145
+ merged_spec = max_inten_spec(
146
+ merged_spec, max_num_inten=max_peaks, inten_thresh=max_inten
147
+ )
148
+ return merged_spec
149
+
150
+
151
+ def max_inten_spec(spec, max_num_inten: int = 60, inten_thresh: float = 0):
152
+ """max_inten_spec.
153
+
154
+ Args:
155
+ spec: 2D spectra array
156
+ max_num_inten: Max number of peaks
157
+ inten_thresh: Min intensity to alloow in returned peak
158
+
159
+ Return:
160
+ Spec filtered down
161
+
162
+
163
+ """
164
+ spec_masses, spec_intens = spec[:, 0], spec[:, 1]
165
+
166
+ # Make sure to only take max of each formula
167
+ # Sort by intensity and select top subpeaks
168
+ new_sort_order = np.argsort(spec_intens)[::-1]
169
+ if max_num_inten is not None:
170
+ new_sort_order = new_sort_order[:max_num_inten]
171
+
172
+ spec_masses = spec_masses[new_sort_order]
173
+ spec_intens = spec_intens[new_sort_order]
174
+
175
+ spec_mask = spec_intens > inten_thresh
176
+ spec_masses = spec_masses[spec_mask]
177
+ spec_intens = spec_intens[spec_mask]
178
+ spec = np.vstack([spec_masses, spec_intens]).transpose(1, 0)
179
+ return spec
180
+
181
+
182
+ def max_thresh_spec(spec: np.ndarray, max_peaks=100, inten_thresh=0.003):
183
+ """max_thresh_spec.
184
+
185
+ Args:
186
+ spec (np.ndarray): spec
187
+ max_peaks: Max num peaks to keep
188
+ inten_thresh: Min inten to keep
189
+ """
190
+
191
+ spec_masses, spec_intens = spec[:, 0], spec[:, 1]
192
+
193
+ # Make sure to only take max of each formula
194
+ # Sort by intensity and select top subpeaks
195
+ new_sort_order = np.argsort(spec_intens)[::-1]
196
+ new_sort_order = new_sort_order[:max_peaks]
197
+
198
+ spec_masses = spec_masses[new_sort_order]
199
+ spec_intens = spec_intens[new_sort_order]
200
+
201
+ spec_mask = spec_intens > inten_thresh
202
+ spec_masses = spec_masses[spec_mask]
203
+ spec_intens = spec_intens[spec_mask]
204
+ out_ar = np.vstack([spec_masses, spec_intens]).transpose(1, 0)
205
+ return out_ar
206
+
207
+
208
+ def assign_subforms(form, spec, ion_type, mass_diff_thresh=15):
209
+ """_summary_
210
+
211
+ Args:
212
+ form (_type_): _description_
213
+ spec (_type_): _description_
214
+ ion_type (_type_): _description_
215
+ mass_diff_thresh (int, optional): _description_. Defaults to 15.
216
+
217
+ Returns:
218
+ _type_: _description_
219
+ """
220
+ try:
221
+ cross_prod, masses = get_all_subsets(form)
222
+ spec_masses, spec_intens = spec[:, 0], spec[:, 1]
223
+
224
+ ion_masses = ion_to_mass[ion_type]
225
+ masses_with_ion = masses + ion_masses
226
+ ion_types = np.array([ion_type] * len(masses_with_ion))
227
+
228
+ mass_diffs = np.abs(spec_masses[:, None] - masses_with_ion[None, :])
229
+
230
+ formula_inds = mass_diffs.argmin(-1)
231
+ min_mass_diff = mass_diffs[np.arange(len(mass_diffs)), formula_inds]
232
+ rel_mass_diff = clipped_ppm(min_mass_diff, spec_masses)
233
+
234
+ # Filter by mass diff threshold (ppm)
235
+ valid_mask = rel_mass_diff < mass_diff_thresh
236
+ spec_masses = spec_masses[valid_mask]
237
+ spec_intens = spec_intens[valid_mask]
238
+ min_mass_diff = min_mass_diff[valid_mask]
239
+ rel_mass_diff = rel_mass_diff[valid_mask]
240
+ formula_inds = formula_inds[valid_mask]
241
+
242
+ formulas = np.array([vec_to_formula(j) for j in cross_prod[formula_inds]])
243
+ formula_masses = masses_with_ion[formula_inds]
244
+ ion_types = ion_types[formula_inds]
245
+
246
+ # Build mask for uniqueness on formula and ionization
247
+ # note that ionization are all the same for one subformula assignment
248
+ # hence we only need to consider the uniqueness of the formula
249
+ formula_idx_dict = {}
250
+ uniq_mask = []
251
+ for idx, formula in enumerate(formulas):
252
+ uniq_mask.append(formula not in formula_idx_dict)
253
+ gather_ind = formula_idx_dict.get(formula, None)
254
+ if gather_ind is None:
255
+ continue
256
+ spec_intens[gather_ind] += spec_intens[idx]
257
+ formula_idx_dict[formula] = idx
258
+
259
+ spec_masses = spec_masses[uniq_mask]
260
+ spec_intens = spec_intens[uniq_mask]
261
+ min_mass_diff = min_mass_diff[uniq_mask]
262
+ rel_mass_diff = rel_mass_diff[uniq_mask]
263
+ formula_masses = formula_masses[uniq_mask]
264
+ formulas = formulas[uniq_mask]
265
+ ion_types = ion_types[uniq_mask]
266
+
267
+ # To calculate explained intensity, preserve the original normalized
268
+ # intensity
269
+ if spec_intens.size == 0:
270
+ output_tbl = None
271
+ print(f"No matches for {form}")
272
+ else:
273
+ output_tbl = {
274
+ "mz": list(spec_masses),
275
+ "ms2_inten": list(spec_intens),
276
+ "mono_mass": list(formula_masses),
277
+ "abs_mass_diff": list(min_mass_diff),
278
+ "mass_diff": list(rel_mass_diff),
279
+ "formula": list(formulas),
280
+ "ions": list(ion_types),
281
+ }
282
+ except Exception:
283
+ output_tbl = None
284
+ print(f"failed to process formula {form}")
285
+ pass
286
+ output_dict = {
287
+ "cand_form": form,
288
+ "cand_ion": ion_type,
289
+ "output_tbl": output_tbl,
290
+ }
291
+ return output_dict
292
+
293
+
294
+ def get_output_dict(
295
+ spec_name: str,
296
+ spec: np.ndarray,
297
+ form: str,
298
+ mass_diff_type: str,
299
+ mass_diff_thresh: float,
300
+ ion_type: str,
301
+ ) -> dict:
302
+ """_summary_
303
+
304
+ This function attempts to take an array of mass-intensity values and assign
305
+ formula subsets to subpeaks
306
+
307
+ Args:
308
+ spec_name (str): _description_
309
+ spec (np.ndarray): _description_
310
+ form (str): _description_
311
+ mass_diff_type (str): _description_
312
+ mass_diff_thresh (float): _description_
313
+ ion_type (str): _description_
314
+
315
+ Returns:
316
+ dict: _description_
317
+ """
318
+ assert mass_diff_type == "ppm"
319
+ # spec is None for some erroneous MS2 files for which process_spec_file returns None
+ # (all MS2 subpeaks in those files have m/z larger than the parent mass)
321
+ output_dict = {"cand_form": form, "cand_ion": ion_type, "output_tbl": None}
322
+ if spec is not None and ion_type in ION_LST:
323
+ output_dict = assign_subforms(
324
+ form, spec, ion_type, mass_diff_thresh=mass_diff_thresh
325
+ )
326
+ return output_dict
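To show how get_output_dict ties the pieces together, a short end-to-end sketch on a made-up ethanol-like spectrum (peak values are illustrative only):

import numpy as np
# Run from mvp/subformula_assign so that `utils` is the local package
from utils.spectra_utils import get_output_dict

# Toy [M+H]+ spectrum for C2H6O: rows of (m/z, normalized intensity)
spec = np.array([
    [47.0491, 1.00],   # ~ protonated C2H6O
    [29.0386, 0.35],   # ~ protonated C2H4 fragment
])

out = get_output_dict(
    spec_name="demo_0001",
    spec=spec,
    form="C2H6O",
    mass_diff_type="ppm",
    mass_diff_thresh=20,
    ion_type="[M+H]+",
)
print(out["cand_form"], out["cand_ion"])
if out["output_tbl"] is not None:
    print(out["output_tbl"]["formula"])  # subformula assigned to each retained peak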
mvp/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (344 Bytes). View file
 
mvp/utils/__pycache__/data.cpython-311.pyc ADDED
Binary file (12.3 kB). View file
 
mvp/utils/__pycache__/debug.cpython-311.pyc ADDED
Binary file (1.06 kB). View file
 
mvp/utils/__pycache__/eval.cpython-311.pyc ADDED
Binary file (13.8 kB). View file
 
mvp/utils/__pycache__/general.cpython-311.pyc ADDED
Binary file (3.77 kB). View file
 
mvp/utils/__pycache__/loss.cpython-311.pyc ADDED
Binary file (4.76 kB). View file
 
mvp/utils/__pycache__/models.cpython-311.pyc ADDED
Binary file (2.58 kB). View file
 
mvp/utils/__pycache__/preprocessing.cpython-311.pyc ADDED
Binary file (12.2 kB). View file