Christina Theodoris
committed on
Commit
·
3072225
1
Parent(s):
98ce6d7
Add explanation of output columns and sort by largest shift
Browse files
geneformer/in_silico_perturber_stats.py
CHANGED
|
@@ -142,12 +142,12 @@ def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model):
|
|
| 142 |
names=["Gene",
|
| 143 |
"Gene_name",
|
| 144 |
"Ensembl_ID",
|
| 145 |
-
"
|
| 146 |
-
"
|
| 147 |
"Goal_end_vs_random_pval",
|
| 148 |
"Alt_end_vs_random_pval"]
|
| 149 |
if alt_end_state_exists == False:
|
| 150 |
-
names.remove("
|
| 151 |
names.remove("Alt_end_vs_random_pval")
|
| 152 |
cos_sims_full_df = pd.DataFrame(columns=names)
|
| 153 |
|
|
@@ -197,8 +197,9 @@ def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model):
|
|
| 197 |
cos_sims_full_df["N_Detections"] = [n_detections(i, dict_list, "cell", None) for i in cos_sims_full_df["Gene"]]
|
| 198 |
|
| 199 |
# sort by shift to desired state
|
| 200 |
-
cos_sims_full_df = cos_sims_full_df.sort_values(by=["
|
| 201 |
-
"Goal_end_FDR"]
|
|
|
|
| 202 |
|
| 203 |
return cos_sims_full_df
|
| 204 |
|
|
@@ -208,9 +209,9 @@ def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
|
|
| 208 |
|
| 209 |
cos_sims_full_df["Test_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 210 |
cos_sims_full_df["Null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 211 |
-
cos_sims_full_df["
|
| 212 |
-
cos_sims_full_df["
|
| 213 |
-
cos_sims_full_df["
|
| 214 |
cos_sims_full_df["N_Detections_test"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
|
| 215 |
cos_sims_full_df["N_Detections_null"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
|
| 216 |
|
|
@@ -227,17 +228,18 @@ def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
|
|
| 227 |
|
| 228 |
cos_sims_full_df.loc[i, "Test_avg_shift"] = np.mean(test_shifts)
|
| 229 |
cos_sims_full_df.loc[i, "Null_avg_shift"] = np.mean(null_shifts)
|
| 230 |
-
cos_sims_full_df.loc[i, "
|
| 231 |
-
cos_sims_full_df.loc[i, "
|
| 232 |
null_shifts, nan_policy="omit").pvalue
|
| 233 |
|
| 234 |
cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
|
| 235 |
cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)
|
| 236 |
|
| 237 |
-
cos_sims_full_df["
|
| 238 |
|
| 239 |
-
cos_sims_full_df = cos_sims_full_df.sort_values(by=["
|
| 240 |
-
"
|
|
|
|
| 241 |
return cos_sims_full_df
|
| 242 |
|
| 243 |
# stats for identifying perturbations with largest effect within a given set of cells
|
|
@@ -498,6 +500,46 @@ class InSilicoPerturberStats:
|
|
| 498 |
Path to directory where perturbation data will be saved as .csv
|
| 499 |
output_prefix : str
|
| 500 |
Prefix for output .dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
"""
|
| 502 |
|
| 503 |
if self.mode not in ["goal_state_shift", "vs_null", "mixture_model"]:
|
|
|
|
| 142 |
names=["Gene",
|
| 143 |
"Gene_name",
|
| 144 |
"Ensembl_ID",
|
| 145 |
+
"Shift_to_goal_end",
|
| 146 |
+
"Shift_to_alt_end",
|
| 147 |
"Goal_end_vs_random_pval",
|
| 148 |
"Alt_end_vs_random_pval"]
|
| 149 |
if alt_end_state_exists == False:
|
| 150 |
+
names.remove("Shift_to_alt_end")
|
| 151 |
names.remove("Alt_end_vs_random_pval")
|
| 152 |
cos_sims_full_df = pd.DataFrame(columns=names)
|
| 153 |
|
|
|
|
| 197 |
cos_sims_full_df["N_Detections"] = [n_detections(i, dict_list, "cell", None) for i in cos_sims_full_df["Gene"]]
|
| 198 |
|
| 199 |
# sort by shift to desired state
|
| 200 |
+
cos_sims_full_df = cos_sims_full_df.sort_values(by=["Shift_to_goal_end",
|
| 201 |
+
"Goal_end_FDR"],
|
| 202 |
+
ascending=[False,True])
|
| 203 |
|
| 204 |
return cos_sims_full_df
|
| 205 |
|
|
|
|
| 209 |
|
| 210 |
cos_sims_full_df["Test_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 211 |
cos_sims_full_df["Null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 212 |
+
cos_sims_full_df["Test_vs_null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 213 |
+
cos_sims_full_df["Test_vs_null_pval"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 214 |
+
cos_sims_full_df["Test_vs_null_FDR"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
| 215 |
cos_sims_full_df["N_Detections_test"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
|
| 216 |
cos_sims_full_df["N_Detections_null"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
|
| 217 |
|
|
|
|
| 228 |
|
| 229 |
cos_sims_full_df.loc[i, "Test_avg_shift"] = np.mean(test_shifts)
|
| 230 |
cos_sims_full_df.loc[i, "Null_avg_shift"] = np.mean(null_shifts)
|
| 231 |
+
cos_sims_full_df.loc[i, "Test_vs_null_avg_shift"] = np.mean(test_shifts)-np.mean(null_shifts)
|
| 232 |
+
cos_sims_full_df.loc[i, "Test_vs_null_pval"] = ranksums(test_shifts,
|
| 233 |
null_shifts, nan_policy="omit").pvalue
|
| 234 |
|
| 235 |
cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
|
| 236 |
cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)
|
| 237 |
|
| 238 |
+
cos_sims_full_df["Test_vs_null_FDR"] = get_fdr(cos_sims_full_df["Test_vs_null_pval"])
|
| 239 |
|
| 240 |
+
cos_sims_full_df = cos_sims_full_df.sort_values(by=["Test_vs_null_avg_shift",
|
| 241 |
+
"Test_vs_null_FDR"],
|
| 242 |
+
ascending=[False,True])
|
| 243 |
return cos_sims_full_df
|
| 244 |
|
| 245 |
# stats for identifying perturbations with largest effect within a given set of cells
|
|
|
|
| 500 |
Path to directory where perturbation data will be saved as .csv
|
| 501 |
output_prefix : str
|
| 502 |
Prefix for output .dataset
|
| 503 |
+
|
| 504 |
+
Outputs
|
| 505 |
+
----------
|
| 506 |
+
Definition of possible columns in .csv output file.
|
| 507 |
+
|
| 508 |
+
Of note, not all columns will be present in all output files.
|
| 509 |
+
Some columns are specific to particular perturbation modes.
|
| 510 |
+
|
| 511 |
+
"Gene": gene token
|
| 512 |
+
"Gene_name": gene name
|
| 513 |
+
"Ensembl_ID": gene Ensembl ID
|
| 514 |
+
"N_Detections": number of cells in which each gene or gene combination was detected in the input dataset
|
| 515 |
+
|
| 516 |
+
"Shift_to_goal_end": cosine shift from start state towards goal end state in response to given perturbation
|
| 517 |
+
"Shift_to_alt_end": cosine shift from start state towards alternate end state in response to given perturbation
|
| 518 |
+
"Goal_end_vs_random_pval": pvalue of cosine shift from start state towards goal end state by Wilcoxon
|
| 519 |
+
pvalue compares shift caused by perturbing given gene to shift caused by perturbing random genes
|
| 520 |
+
"Alt_end_vs_random_pval": pvalue of cosine shift from start state towards alternate end state by Wilcoxon
|
| 521 |
+
pvalue compares shift caused by perturbing given gene to shift caused by perturbing random genes
|
| 522 |
+
"Goal_end_FDR": Benjamini-Hochberg correction of "Goal_end_vs_random_pval"
|
| 523 |
+
"Alt_end_FDR": Benjamini-Hochberg correction of "Alt_end_vs_random_pval"
|
| 524 |
+
|
| 525 |
+
"Test_avg_shift": average cosine shift in response to given perturbation in cells from test distribution
|
| 526 |
+
"Null_avg_shift": average cosine shift in response to given perturbation in cells from null distribution (e.g. random cells)
|
| 527 |
+
"Test_vs_null_avg_shift": difference in cosine shift in cells from test vs. null distribution
|
| 528 |
+
(i.e. "Test_avg_shift" minus "Null_avg_shift")
|
| 529 |
+
"Test_vs_null_pval": pvalue of cosine shift in test vs. null distribution
|
| 530 |
+
"Test_vs_null_FDR": Benjamini-Hochberg correction of "Test_vs_null_pval"
|
| 531 |
+
"N_Detections_test": "N_Detections" in cells from test distribution
|
| 532 |
+
"N_Detections_null": "N_Detections" in cells from null distribution
|
| 533 |
+
|
| 534 |
+
"Anchor_shift": cosine shift in response to given perturbation of anchor gene
|
| 535 |
+
"Test_token_shift": cosine shift in response to given perturbation of test gene
|
| 536 |
+
"Sum_of_indiv_shifts": sum of cosine shifts in response to individually perturbing test and anchor genes
|
| 537 |
+
"Combo_shift": cosine shift in response to given perturbation of both anchor and test gene(s) in combination
|
| 538 |
+
"Combo_minus_sum_shift": difference of cosine shifts in response to combo perturbation vs. sum of individual perturbations
|
| 539 |
+
(i.e. "Combo_shift" minus "Sum_of_indiv_shifts")
|
| 540 |
+
"Impact_component": whether the given perturbation was modeled to be within the impact component by the mixture model
|
| 541 |
+
1: within impact component; 0: not within impact component
|
| 542 |
+
"Impact_component_percent": percent of cells in which given perturbation was modeled to be within impact component
|
| 543 |
"""
|
| 544 |
|
| 545 |
if self.mode not in ["goal_state_shift", "vs_null", "mixture_model"]:
|