Commit
·
f4fea1e
1
Parent(s):
9f2c6cc
Reorder/sort isp stats output in vs_null mode
Browse files
geneformer/in_silico_perturber_stats.py
CHANGED
|
@@ -19,6 +19,7 @@ import logging
|
|
| 19 |
import numpy as np
|
| 20 |
import pandas as pd
|
| 21 |
import pickle
|
|
|
|
| 22 |
import statsmodels.stats.multitest as smt
|
| 23 |
from pathlib import Path
|
| 24 |
from scipy.stats import ranksums
|
|
@@ -66,8 +67,7 @@ def n_detections(token, dict_list):
|
|
| 66 |
def get_fdr(pvalues):
|
| 67 |
return list(smt.multipletests(pvalues, alpha=0.05, method="fdr_bh")[1])
|
| 68 |
|
| 69 |
-
def isp_stats(cos_sims_df, dict_list
|
| 70 |
-
|
| 71 |
random_tuples = []
|
| 72 |
for i in trange(cos_sims_df.shape[0]):
|
| 73 |
token = cos_sims_df["Gene"][i]
|
|
@@ -131,6 +131,40 @@ def isp_stats(cos_sims_df, dict_list, cell_states_to_model):
|
|
| 131 |
|
| 132 |
return cos_sims_full_df
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
class InSilicoPerturberStats:
|
| 135 |
valid_option_dict = {
|
| 136 |
"mode": {"goal_state_shift","vs_null","vs_random"},
|
|
@@ -255,17 +289,16 @@ class InSilicoPerturberStats:
|
|
| 255 |
output_prefix : str
|
| 256 |
Prefix for output .dataset
|
| 257 |
"""
|
| 258 |
-
|
| 259 |
-
self.
|
| 260 |
-
self.gene_id_name_dict = invert_dict(self.gene_name_id_dict)
|
| 261 |
-
|
| 262 |
-
if self.mode == "goal_state_shift":
|
| 263 |
-
dict_list = read_dictionaries(input_data_directory,"cell")
|
| 264 |
-
else:
|
| 265 |
logger.error(
|
| 266 |
-
|
|
|
|
| 267 |
raise
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
| 269 |
# obtain total gene list
|
| 270 |
gene_list = get_gene_list(dict_list)
|
| 271 |
|
|
@@ -278,18 +311,24 @@ class InSilicoPerturberStats:
|
|
| 278 |
self.gene_token_id_dict[genes] \
|
| 279 |
for genes in gene_list]}, \
|
| 280 |
index=[i for i in range(len(gene_list))])
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
|
| 294 |
# save perturbation stats to output_path
|
| 295 |
output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
|
|
|
|
| 19 |
import numpy as np
|
| 20 |
import pandas as pd
|
| 21 |
import pickle
|
| 22 |
+
import random
|
| 23 |
import statsmodels.stats.multitest as smt
|
| 24 |
from pathlib import Path
|
| 25 |
from scipy.stats import ranksums
|
|
|
|
| 67 |
def get_fdr(pvalues):
|
| 68 |
return list(smt.multipletests(pvalues, alpha=0.05, method="fdr_bh")[1])
|
| 69 |
|
| 70 |
+
def isp_stats(cos_sims_df, dict_list):
|
|
|
|
| 71 |
random_tuples = []
|
| 72 |
for i in trange(cos_sims_df.shape[0]):
|
| 73 |
token = cos_sims_df["Gene"][i]
|
|
|
|
| 131 |
|
| 132 |
return cos_sims_full_df
|
| 133 |
|
| 134 |
+
def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
|
| 135 |
+
cos_sims_full_df = cos_sims_df.copy()
|
| 136 |
+
|
| 137 |
+
cos_sims_full_df["Test_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 138 |
+
cos_sims_full_df["Null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 139 |
+
cos_sims_full_df["Test_v_null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 140 |
+
cos_sims_full_df["Test_v_null_pval"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 141 |
+
cos_sims_full_df["Test_v_null_FDR"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 142 |
+
cos_sims_full_df["N_Detections_test"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
|
| 143 |
+
cos_sims_full_df["N_Detections_null"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
|
| 144 |
+
|
| 145 |
+
for i in trange(cos_sims_df.shape[0]):
|
| 146 |
+
token = cos_sims_df["Gene"][i]
|
| 147 |
+
test_shifts = []
|
| 148 |
+
null_shifts = []
|
| 149 |
+
|
| 150 |
+
for dict_i in dict_list:
|
| 151 |
+
token_tuples += dict_i.get((token, "cell_emb"),[])
|
| 152 |
+
|
| 153 |
+
for dict_i in null_dict_list:
|
| 154 |
+
null_tuples += dict_i.get((token, "cell_emb"),[])
|
| 155 |
+
|
| 156 |
+
cos_sims_full_df.loc[i, "Test_avg_shift"] = np.mean(test_shifts)
|
| 157 |
+
cos_sims_full_df.loc[i, "Null_avg_shift"] = np.mean(null_shifts)
|
| 158 |
+
cos_sims_full_df.loc[i, "Test_v_null_avg_shift"] = np.mean(test_shifts)-np.mean(null_shifts)
|
| 159 |
+
cos_sims_full_df.loc[i, "Test_v_null_pval"] = ranksums(test_shifts,
|
| 160 |
+
null_shifts, nan_policy="omit").pvalue
|
| 161 |
+
|
| 162 |
+
cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
|
| 163 |
+
cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)
|
| 164 |
+
|
| 165 |
+
cos_sims_full_df["Test_v_null_FDR"] = get_fdr(cos_sims_full_df["Test_v_null_pval"])
|
| 166 |
+
return cos_sims_full_df
|
| 167 |
+
|
| 168 |
class InSilicoPerturberStats:
|
| 169 |
valid_option_dict = {
|
| 170 |
"mode": {"goal_state_shift","vs_null","vs_random"},
|
|
|
|
| 289 |
output_prefix : str
|
| 290 |
Prefix for output .dataset
|
| 291 |
"""
|
| 292 |
+
|
| 293 |
+
if self.mode not in ["goal_state_shift", "vs_null"]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
logger.error(
|
| 295 |
+
"Currently, only modes available are stats for goal_state_shift \
|
| 296 |
+
and comparing vs a null distribution.")
|
| 297 |
raise
|
| 298 |
+
|
| 299 |
+
self.gene_token_id_dict = invert_dict(self.gene_token_dict)
|
| 300 |
+
self.gene_id_name_dict = invert_dict(self.gene_name_id_dict)
|
| 301 |
+
|
| 302 |
# obtain total gene list
|
| 303 |
gene_list = get_gene_list(dict_list)
|
| 304 |
|
|
|
|
| 311 |
self.gene_token_id_dict[genes] \
|
| 312 |
for genes in gene_list]}, \
|
| 313 |
index=[i for i in range(len(gene_list))])
|
| 314 |
+
|
| 315 |
+
dict_list = read_dictionaries(input_data_directory, "cell")
|
| 316 |
+
if self.mode == "goal_state_shift":
|
| 317 |
+
cos_sims_df = isp_stats(cos_sims_df_initial, dict_list)
|
| 318 |
+
|
| 319 |
+
# quantify number of detections of each gene
|
| 320 |
+
cos_sims_df["N_Detections"] = [n_detections(i, dict_list) for i in cos_sims_df["Gene"]]
|
| 321 |
+
|
| 322 |
+
# sort by shift to desired state
|
| 323 |
+
cos_sims_df = cos_sims_df.sort_values(by=["Shift_from_goal_end",
|
| 324 |
+
"Goal_end_FDR"])
|
| 325 |
+
elif self.mode == "vs_null":
|
| 326 |
+
dict_list = read_dictionaries(input_data_directory, "cell")
|
| 327 |
+
null_dict_list = read_dictionaries(null_dist_data_directory, "cell")
|
| 328 |
+
cos_sims_df = isp_stats_vs_null(cos_sims_df_initial, dict_list,
|
| 329 |
+
null_dict_list)
|
| 330 |
+
cos_sims_df = cos_sims_df.sort_values(by=["Test_v_null_avg_shift",
|
| 331 |
+
"Test_v_null_FDR"])
|
| 332 |
|
| 333 |
# save perturbation stats to output_path
|
| 334 |
output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
|