Update the in silico perturber (isp) to allow modeling a single perturbation across multiple cells in batches

Files changed (3) hide show

examples/in_silico_perturbation.ipynb +23 -23
geneformer/in_silico_perturber.py +512 -238
geneformer/in_silico_perturber_stats.py +142 -84

examples/in_silico_perturbation.ipynb CHANGED Viewed

@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "id": "67b44366-f255-4415-a865-6a27a8ffcce7",
    "metadata": {
     "tags": []
@@ -24,21 +24,20 @@
     "# deletion in the dilated cardiomyopathy (dcm) state significantly shifts\n",
     "# the embedding towards non-failing (nf) state\n",
     "isp = InSilicoPerturber(perturb_type=\"delete\",\n",
-    "                      perturb_rank_shift=None,\n",
-    "                      genes_to_perturb=\"all\",\n",
-    "                      combos=0,\n",
-    "                      anchor_gene=None,\n",
-    "                      model_type=\"CellClassifier\",\n",
-    "                      num_classes=3,\n",
-    "                      emb_mode=\"cell\",\n",
-    "                      cell_emb_style=\"mean_pool\",\n",
-    "                      filter_data={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]},\n",
-    "                      cell_states_to_model={\"disease\":([\"dcm\"],[\"nf\"],[\"hcm\"])},\n",
-    "                      max_ncells=2000,\n",
-    "                      emb_layer=0,\n",
-    "                      forward_batch_size=400,\n",
-    "                      nproc=16,\n",
-    "                      save_raw_data=True)"
    ]
   },
   {
@@ -50,22 +49,23 @@
    "source": [
     "# outputs intermediate files from in silico perturbation\n",
     "isp.perturb_data(\"path/to/model\",\n",
-    "               \"path/to/input_data\",\n",
-    "               \"path/to/output_directory\",\n",
-    "               \"output_prefix\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "id": "f8aadabb-516a-4dc0-b307-6de880e64e26",
    "metadata": {},
    "outputs": [],
    "source": [
     "ispstats = InSilicoPerturberStats(mode=\"goal_state_shift\",\n",
-    "                                combos=0,\n",
-    "                                anchor_gene=None,\n",
-    "                                cell_states_to_model={\"disease\":([\"dcm\"],[\"nf\"],[\"hcm\"])})"
    ]
   },
   {

   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "id": "67b44366-f255-4415-a865-6a27a8ffcce7",
    "metadata": {
     "tags": []
     "# deletion in the dilated cardiomyopathy (dcm) state significantly shifts\n",
     "# the embedding towards non-failing (nf) state\n",
     "isp = InSilicoPerturber(perturb_type=\"delete\",\n",
+    "                        perturb_rank_shift=None,\n",
+    "                        genes_to_perturb=\"all\",\n",
+    "                        combos=0,\n",
+    "                        anchor_gene=None,\n",
+    "                        model_type=\"CellClassifier\",\n",
+    "                        num_classes=3,\n",
+    "                        emb_mode=\"cell\",\n",
+    "                        cell_emb_style=\"mean_pool\",\n",
+    "                        filter_data={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]},\n",
+    "                        cell_states_to_model={\"disease\":([\"dcm\"],[\"nf\"],[\"hcm\"])},\n",
+    "                        max_ncells=2000,\n",
+    "                        emb_layer=0,\n",
+    "                        forward_batch_size=400,\n",
+    "                        nproc=16)"
    ]
   },
   {
    "source": [
     "# outputs intermediate files from in silico perturbation\n",
     "isp.perturb_data(\"path/to/model\",\n",
+    "                 \"path/to/input_data\",\n",
+    "                 \"path/to/output_directory\",\n",
+    "                 \"output_prefix\")"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "id": "f8aadabb-516a-4dc0-b307-6de880e64e26",
    "metadata": {},
    "outputs": [],
    "source": [
     "ispstats = InSilicoPerturberStats(mode=\"goal_state_shift\",\n",
+    "                                  genes_perturbed=\"all\",\n",
+    "                                  combos=0,\n",
+    "                                  anchor_gene=None,\n",
+    "                                  cell_states_to_model={\"disease\":([\"dcm\"],[\"nf\"],[\"hcm\"])})"
    ]
   },
   {

geneformer/in_silico_perturber.py CHANGED Viewed

@@ -17,8 +17,7 @@ Usage:
                           max_ncells=None,
                           emb_layer=-1,
                           forward_batch_size=100,
-                          nproc=4,
-                          save_raw_data=False)
   isp.perturb_data("path/to/model",
                    "path/to/input_data",
                    "path/to/output_directory",
@@ -28,7 +27,9 @@ Usage:
 # imports
 import itertools as it
 import logging
 import pickle
 import seaborn as sns; sns.set()
 import torch
 from collections import defaultdict
@@ -47,9 +48,16 @@ def quant_layers(model):
             layer_nums += [int(name.split("layer.")[1].split(".")[0])]
     return int(max(layer_nums))+1
 def flatten_list(megalist):
     return [item for sublist in megalist for item in sublist]
 def forward_pass_single_cell(model, example_cell, layer_to_quant):
     example_cell.set_format(type="torch")
     input_data = example_cell["input_ids"]
@@ -66,15 +74,16 @@ def perturb_emb_by_index(emb, indices):
     mask[indices] = False
     return emb[mask]
-def delete_index(example):
-    indexes = example["perturb_index"]
-    if len(indexes)>1:
-        indexes = flatten_list(indexes)
-    for index in sorted(indexes, reverse=True):
         del example["input_ids"][index]
     return example
-def overexpress_index(example):
     indexes = example["perturb_index"]
     if len(indexes)>1:
         indexes = flatten_list(indexes)
@@ -82,11 +91,19 @@ def overexpress_index(example):
         example["input_ids"].insert(0, example["input_ids"].pop(index))
     return example
 def make_perturbation_batch(example_cell,
                             perturb_type,
                             tokens_to_perturb,
                             anchor_token,
-                            combo_lvl,
                             num_proc):
     if tokens_to_perturb == "all":
         if perturb_type in ["overexpress","activate"]:
@@ -114,21 +131,38 @@ def make_perturbation_batch(example_cell,
             all_indices = [index for index in all_indices if index not in indices_to_perturb]
             indices_to_perturb = [[[j for i in indices_to_perturb for j in i], x] for x in all_indices]
     length = len(indices_to_perturb)
-    perturbation_dataset = Dataset.from_dict({"input_ids": example_cell["input_ids"]*length, "perturb_index": indices_to_perturb})
     if length<400:
         num_proc_i = 1
     else:
         num_proc_i = num_proc
     if perturb_type == "delete":
-        perturbation_dataset = perturbation_dataset.map(delete_index, num_proc=num_proc_i)
     elif perturb_type == "overexpress":
-        perturbation_dataset = perturbation_dataset.map(overexpress_index, num_proc=num_proc_i)
     return perturbation_dataset, indices_to_perturb
-# original cell emb removing the respective perturbed gene emb
-def make_comparison_batch(original_emb, indices_to_perturb):
     all_embs_list = []
-    for indices in indices_to_perturb:
         emb_list = []
         start = 0
         if len(indices)>1 and isinstance(indices[0],list):
@@ -138,28 +172,22 @@ def make_comparison_batch(original_emb, indices_to_perturb):
             start = i+1
         emb_list += [original_emb[start:]]
         all_embs_list += [torch.cat(emb_list)]
     return torch.stack(all_embs_list)
-# perturbed cell emb removing the activated/overexpressed/inhibited gene emb
-# so that only non-perturbed gene embeddings are compared to each other
-# in original or perturbed context
-def make_perturbed_remainder_batch(emb_batch, indices_to_remove):
-    if type(indices_to_remove) == int:
-        indices_to_keep = [i for i in range(emb_batch.size()[1])]
-        indices_to_keep.pop(indices_to_remove)
-        perturbed_remainder_batch = torch.stack([emb[indices_to_keep,:] for emb in emb_batch])
-    elif type(indices_to_remove) == list:
-        perturbed_remainder_batch = torch.stack([make_comparison_batch(emb_batch[i],indices_to_remove[i]) for i in range(len(emb_batch))])
-    return perturbed_remainder_batch
 # average embedding position of goal cell states
 def get_cell_state_avg_embs(model,
                             filtered_input_data,
                             cell_states_to_model,
                             layer_to_quant,
-                            token_dictionary,
                             forward_batch_size,
                             num_proc):
     possible_states = [value[0]+value[1]+value[2] for value in cell_states_to_model.values()][0]
     state_embs_dict = dict()
     for possible_state in possible_states:
@@ -179,7 +207,10 @@ def get_cell_state_avg_embs(model,
             state_minibatch.set_format(type="torch")
             input_data_minibatch = state_minibatch["input_ids"]
-            input_data_minibatch = pad_tensor_list(input_data_minibatch, max_len, token_dictionary)
             with torch.no_grad():
                 outputs = model(
@@ -204,51 +235,131 @@ def quant_cos_sims(model,
                    perturbation_batch,
                    forward_batch_size,
                    layer_to_quant,
-                   original_emb,
                    indices_to_perturb,
                    cell_states_to_model,
-                   state_embs_dict):
     cos = torch.nn.CosineSimilarity(dim=2)
     total_batch_length = len(perturbation_batch)
     if ((total_batch_length-1)/forward_batch_size).is_integer():
         forward_batch_size = forward_batch_size-1
     if cell_states_to_model is None:
-        comparison_batch = make_comparison_batch(original_emb, indices_to_perturb)
         cos_sims = []
     else:
         possible_states = [value[0]+value[1]+value[2] for value in cell_states_to_model.values()][0]
         cos_sims_vs_alt_dict = dict(zip(possible_states,[[] for i in range(len(possible_states))]))
     for i in range(0, total_batch_length, forward_batch_size):
         max_range = min(i+forward_batch_size, total_batch_length)
         perturbation_minibatch = perturbation_batch.select([i for i in range(i, max_range)])
         perturbation_minibatch.set_format(type="torch")
         input_data_minibatch = perturbation_minibatch["input_ids"]
         with torch.no_grad():
             outputs = model(
                 input_ids = input_data_minibatch.to("cuda")
             )
         del input_data_minibatch
         del perturbation_minibatch
-        # cosine similarity between original emb and batch items
         if len(indices_to_perturb)>1:
             minibatch_emb = torch.squeeze(outputs.hidden_states[layer_to_quant])
         else:
             minibatch_emb = outputs.hidden_states[layer_to_quant]
-        if cell_states_to_model is None:
-            minibatch_comparison = comparison_batch[i:max_range]
             if perturb_type == "overexpress":
-                index_to_remove = 0
-                minibatch_emb = make_perturbed_remainder_batch(minibatch_emb, index_to_remove)
-            # elif (perturb_type == "inhibit") or (perturb_type == "activate"):
-            #     index_to_remove = placeholder
-            #     minibatch_emb = make_perturbed_remainder_batch(minibatch_emb, index_to_remove)
             cos_sims += [cos(minibatch_emb, minibatch_comparison).to("cpu")]
         elif cell_states_to_model is not None:
             for state in possible_states:
-                cos_sims_vs_alt_dict[state] += cos_sim_shift(original_emb, minibatch_emb, state_embs_dict[state])
         del outputs
         del minibatch_emb
         if cell_states_to_model is None:
@@ -263,17 +374,55 @@ def quant_cos_sims(model,
         return cos_sims_vs_alt_dict
 # calculate cos sim shift of perturbation with respect to origin and alternative cell
-def cos_sim_shift(original_emb, minibatch_emb, alt_emb):
     cos = torch.nn.CosineSimilarity(dim=2)
-    original_emb = torch.mean(original_emb,dim=0,keepdim=True)[None, :]
     origin_v_end = cos(original_emb,alt_emb)
-    perturb_v_end = cos(torch.mean(minibatch_emb,dim=1,keepdim=True),alt_emb)
     return [(perturb_v_end-origin_v_end).to("cpu")]
 # pad list of tensors and convert to tensor
-def pad_tensor_list(tensor_list, dynamic_or_constant, token_dictionary):
-    pad_token_id = token_dictionary.get("<pad>")
     # Determine maximum tensor length
     if dynamic_or_constant == "dynamic":
@@ -281,15 +430,13 @@ def pad_tensor_list(tensor_list, dynamic_or_constant, token_dictionary):
     elif type(dynamic_or_constant) == int:
         max_len = dynamic_or_constant
     else:
         logger.warning(
                     "If padding style is constant, must provide integer value. " \
-                    "Setting padding to max input size 2048.")
     # pad all tensors to maximum length
-    tensor_list = [torch.nn.functional.pad(tensor, pad=(0,
-                                                   max_len - tensor.numel()),
-                                                   mode='constant',
-                                                   value=pad_token_id) for tensor in tensor_list]
     # return stacked tensors
     return torch.stack(tensor_list)
@@ -299,7 +446,7 @@ class InSilicoPerturber:
         "perturb_type": {"delete","overexpress","inhibit","activate"},
         "perturb_rank_shift": {None, 1, 2, 3},
         "genes_to_perturb": {"all", list},
-        "combos": {0, 1, 2},
         "anchor_gene": {None, str},
         "model_type": {"Pretrained","GeneClassifier","CellClassifier"},
         "num_classes": {int},
@@ -311,7 +458,6 @@ class InSilicoPerturber:
         "emb_layer": {-1, 0},
         "forward_batch_size": {int},
         "nproc": {int},
-        "save_raw_data": {False, True},
     }
     def __init__(
         self,
@@ -330,7 +476,6 @@ class InSilicoPerturber:
         emb_layer=-1,
         forward_batch_size=100,
         nproc=4,
-        save_raw_data=False,
         token_dictionary_file=TOKEN_DICTIONARY_FILE,
     ):
         """
@@ -358,8 +503,10 @@ class InSilicoPerturber:
         genes_to_perturb : "all", list
             Default is perturbing each gene detected in each cell in the dataset.
             Otherwise, may provide a list of ENSEMBL IDs of genes to perturb.
-        combos : {0,1,2}
-            Whether to perturb genes individually (0), in pairs (1), or in triplets (2).
         anchor_gene : None, str
             ENSEMBL ID of gene to use as anchor in combination perturbations.
             For example, if combos=1 and anchor_gene="ENSG00000148400":
@@ -393,8 +540,6 @@ class InSilicoPerturber:
             Batch size for forward pass.
         nproc : int
             Number of CPU processes to use.
-        save_raw_data: {False,True}
-            Whether to save raw perturbation data for each gene/cell.
         token_dictionary_file : Path
             Path to pickle file containing token dictionary (Ensembl ID:token).
         """
@@ -404,6 +549,18 @@ class InSilicoPerturber:
         self.genes_to_perturb = genes_to_perturb
         self.combos = combos
         self.anchor_gene = anchor_gene
         self.model_type = model_type
         self.num_classes = num_classes
         self.emb_mode = emb_mode
@@ -414,7 +571,6 @@ class InSilicoPerturber:
         self.emb_layer = emb_layer
         self.forward_batch_size = forward_batch_size
         self.nproc = nproc
-        self.save_raw_data = save_raw_data
         self.validate_options()
@@ -422,22 +578,39 @@ class InSilicoPerturber:
         with open(token_dictionary_file, "rb") as f:
             self.gene_token_dict = pickle.load(f)
-        if anchor_gene is None:
             self.anchor_token = None
         else:
-            self.anchor_token = [self.gene_token_dict[self.anchor_gene]]
-        if genes_to_perturb == "all":
             self.tokens_to_perturb = "all"
         else:
-            self.tokens_to_perturb = [self.gene_token_dict[gene] for gene in self.genes_to_perturb]
     def validate_options(self):
         # first disallow options under development
         if self.perturb_type in ["inhibit", "activate"]:
             logger.error(
-                f"In silico inhibition and activation currently under developemnt. " \
-                f"Current valid options for 'perturb_type': 'delete' or 'overexpress'"
             )
             raise
@@ -462,7 +635,7 @@ class InSilicoPerturber:
                 f"Valid options for {attr_name}: {valid_options}"
             )
             raise
         if self.perturb_type in ["delete","overexpress"]:
             if self.perturb_rank_shift is not None:
                 if self.perturb_type == "delete":
@@ -538,9 +711,9 @@ class InSilicoPerturber:
         input_data_file : Path
             Path to directory containing .dataset inputs
         output_directory : Path
-            Path to directory where perturbation data will be saved as .csv
         output_prefix : str
-            Prefix for output .dataset
         """
         filtered_input_data = self.load_and_filter(input_data_file)
@@ -555,7 +728,7 @@ class InSilicoPerturber:
                                                       filtered_input_data,
                                                       self.cell_states_to_model,
                                                       layer_to_quant,
-                                                      self.gene_token_dict,
                                                       self.forward_batch_size,
                                                       self.nproc)
             # filter for start state cells
@@ -571,13 +744,6 @@ class InSilicoPerturber:
                               state_embs_dict,
                               output_directory,
                               output_prefix)
-        # if self.save_raw_data is False:
-        #     # delete intermediate dictionaries
-        #     output_dir = os.listdir(output_directory)
-        #     for output_file in output_dir:
-        #         if output_file.endswith("_raw.pickle"):
-        #             os.remove(os.path.join(output_directory, output_file))
     # load data and filter by defined criteria
     def load_and_filter(self, input_data_file):
@@ -632,6 +798,7 @@ class InSilicoPerturber:
                           output_prefix):
         output_path_prefix = f"{output_directory}in_silico_{self.perturb_type}_{output_prefix}_dict_1Kbatch"
         # filter dataset for cells that have tokens to be perturbed
         if self.anchor_token is not None:
@@ -639,183 +806,290 @@ class InSilicoPerturber:
                 return (len(set(example["input_ids"]).intersection(self.anchor_token))==len(self.anchor_token))
             filtered_input_data = filtered_input_data.filter(if_has_tokens_to_perturb, num_proc=self.nproc)
             logger.info(f"# cells with anchor gene: {len(filtered_input_data)}")
-        if self.tokens_to_perturb != "all":
             def if_has_tokens_to_perturb(example):
-                return (len(set(example["input_ids"]).intersection(self.tokens_to_perturb))>self.combos)
             filtered_input_data = filtered_input_data.filter(if_has_tokens_to_perturb, num_proc=self.nproc)
         cos_sims_dict = defaultdict(list)
         pickle_batch = -1
-        for i in trange(len(filtered_input_data)):
-            example_cell = filtered_input_data.select([i])
-            original_emb = forward_pass_single_cell(model, example_cell, layer_to_quant)
-            gene_list = torch.squeeze(example_cell["input_ids"])
-            # reset to original type to prevent downstream issues due to forward_pass_single_cell modifying as torch format in place
-            example_cell = filtered_input_data.select([i])
-            if self.anchor_token is None:
-                for combo_lvl in range(self.combos+1):
                     perturbation_batch, indices_to_perturb = make_perturbation_batch(example_cell,
-                                                                                    self.perturb_type,
-                                                                                    self.tokens_to_perturb,
-                                                                                    self.anchor_token,
-                                                                                    combo_lvl,
-                                                                                    self.nproc)
                     cos_sims_data = quant_cos_sims(model,
                                                    self.perturb_type,
-                                                   perturbation_batch,
-                                                   self.forward_batch_size,
-                                                   layer_to_quant,
-                                                   original_emb,
                                                    indices_to_perturb,
                                                    self.cell_states_to_model,
-                                                   state_embs_dict)
-                    if self.cell_states_to_model is None:
-                        # update cos sims dict
-                        # key is tuple of (perturbed_gene, affected_gene)
-                        # or (perturbed_gene, "cell_emb") for avg cell emb change
-                        cos_sims_data = cos_sims_data.to("cuda")
-                        for j in range(cos_sims_data.shape[0]):
-                            if self.genes_to_perturb != "all":
-                                j_index = torch.tensor(indices_to_perturb[j])
-                                if j_index.shape[0]>1:
-                                    j_index = torch.squeeze(j_index)
-                            else:
-                                j_index = torch.tensor([j])
-                            perturbed_gene = torch.index_select(gene_list, 0, j_index)
-                            if perturbed_gene.shape[0]==1:
-                                perturbed_gene = perturbed_gene.item()
-                            elif perturbed_gene.shape[0]>1:
-                                perturbed_gene = tuple(perturbed_gene.tolist())
-                            cell_cos_sim = torch.mean(cos_sims_data[j]).item()
-                            cos_sims_dict[(perturbed_gene, "cell_emb")] += [cell_cos_sim]
-                            # not_j_index = list(set(i for i in range(gene_list.shape[0])).difference(j_index))
-                            # gene_list_j = torch.index_select(gene_list, 0, j_index)
-                            if self.emb_mode == "cell_and_gene":
-                                for k in range(cos_sims_data.shape[1]):
-                                    cos_sim_value = cos_sims_data[j][k]
-                                    affected_gene = gene_list[k].item()
-                                    cos_sims_dict[(perturbed_gene, affected_gene)] += [cos_sim_value.item()]
-                    else:
-                        # update cos sims dict
-                        # key is tuple of (perturbed_gene, "cell_emb")
-                        # value is list of tuples of cos sims for cell_states_to_model
-                        origin_state_key = [value[0] for value in self.cell_states_to_model.values()][0][0]
-                        cos_sims_origin = cos_sims_data[origin_state_key]
-                        for j in range(cos_sims_origin.shape[0]):
-                            if (self.genes_to_perturb != "all") or (combo_lvl>0):
-                                j_index = torch.tensor(indices_to_perturb[j])
-                                if j_index.shape[0]>1:
-                                    j_index = torch.squeeze(j_index)
-                            else:
-                                j_index = torch.tensor([j])
-                            perturbed_gene = torch.index_select(gene_list, 0, j_index)
-                            if perturbed_gene.shape[0]==1:
-                                perturbed_gene = perturbed_gene.item()
-                            elif perturbed_gene.shape[0]>1:
-                                perturbed_gene = tuple(perturbed_gene.tolist())
-                            data_list = []
-                            for data in list(cos_sims_data.values()):
-                                data_item = data.to("cuda")
-                                cell_data = torch.mean(data_item[j]).item()
-                                data_list += [cell_data]
-                            cos_sims_dict[(perturbed_gene, "cell_emb")] += [tuple(data_list)]
-            elif self.anchor_token is not None:
-                perturbation_batch, indices_to_perturb = make_perturbation_batch(example_cell,
-                                                                                 self.perturb_type,
-                                                                                 self.tokens_to_perturb,
-                                                                                 None,  # first run without anchor token to test individual gene perturbations
-                                                                                 0,
-                                                                                 self.nproc)
-                cos_sims_data = quant_cos_sims(model,
-                                               self.perturb_type,
-                                               perturbation_batch,
-                                               self.forward_batch_size,
-                                               layer_to_quant,
-                                               original_emb,
-                                               indices_to_perturb,
-                                               self.cell_states_to_model,
-                                               state_embs_dict)
-                cos_sims_data = cos_sims_data.to("cuda")
-                combo_perturbation_batch, combo_indices_to_perturb = make_perturbation_batch(example_cell,
-                                                                                             self.perturb_type,
-                                                                                             self.tokens_to_perturb,
-                                                                                             self.anchor_token,
-                                                                                             1,
-                                                                                             self.nproc)
-                combo_cos_sims_data = quant_cos_sims(model,
-                                                     self.perturb_type,
-                                                     combo_perturbation_batch,
-                                                     self.forward_batch_size,
-                                                     layer_to_quant,
-                                                     original_emb,
-                                                     combo_indices_to_perturb,
-                                                     self.cell_states_to_model,
-                                                     state_embs_dict)
-                combo_cos_sims_data = combo_cos_sims_data.to("cuda")
-                # update cos sims dict
-                # key is tuple of (perturbed_gene, "cell_emb") for avg cell emb change
-                anchor_index = example_cell["input_ids"][0].index(self.anchor_token[0])
-                anchor_cell_cos_sim = torch.mean(cos_sims_data[anchor_index]).item()
-                non_anchor_indices = [k for k in range(cos_sims_data.shape[0]) if k != anchor_index]
-                cos_sims_data = cos_sims_data[non_anchor_indices,:]
-                for j in range(cos_sims_data.shape[0]):
-                    if j<anchor_index:
-                        j_index = torch.tensor([j])
-                    else:
-                        j_index = torch.tensor([j+1])
-                    perturbed_gene = torch.index_select(gene_list, 0, j_index)
-                    perturbed_gene = perturbed_gene.item()
-                    cell_cos_sim = torch.mean(cos_sims_data[j]).item()
-                    combo_cos_sim = torch.mean(combo_cos_sims_data[j]).item()
-                    cos_sims_dict[(perturbed_gene, "cell_emb")] += [(anchor_cell_cos_sim, # cos sim anchor gene alone
-                                                                     cell_cos_sim, # cos sim deleted gene alone
-                                                                     combo_cos_sim)] # cos sim anchor gene + deleted gene
-            # save dict to disk every 100 cells
-            if (i/100).is_integer():
-                with open(f"{output_path_prefix}{pickle_batch}_raw.pickle", "wb") as fp:
-                    pickle.dump(cos_sims_dict, fp)
-            # reset and clear memory every 1000 cells
-            if (i/1000).is_integer():
-                pickle_batch = pickle_batch+1
-                # clear memory
-                del perturbed_gene
-                del cos_sims_data
-                if self.cell_states_to_model is None:
-                    del cell_cos_sim
-                if self.cell_states_to_model is not None:
-                    del cell_data
-                    del data_list
-                elif self.anchor_token is None:
-                    if self.emb_mode == "cell_and_gene":
-                        del affected_gene
-                        del cos_sim_value
-                else:
-                    del combo_cos_sim
-                    del combo_cos_sims_data
-                # reset dict
-                del cos_sims_dict
-                cos_sims_dict = defaultdict(list)
-                torch.cuda.empty_cache()
-        # save remainder cells
-        with open(f"{output_path_prefix}{pickle_batch}_raw.pickle", "wb") as fp:
-            pickle.dump(cos_sims_dict, fp)

                           max_ncells=None,
                           emb_layer=-1,
                           forward_batch_size=100,
+                          nproc=4)
   isp.perturb_data("path/to/model",
                    "path/to/input_data",
                    "path/to/output_directory",
 # imports
 import itertools as it
 import logging
+import numpy as np
 import pickle
+import re
 import seaborn as sns; sns.set()
 import torch
 from collections import defaultdict
             layer_nums += [int(name.split("layer.")[1].split(".")[0])]
     return int(max(layer_nums))+1
+def get_model_input_size(model):
+    return int(re.split("\(|,",str(model.bert.embeddings.position_embeddings))[1])
 def flatten_list(megalist):
     return [item for sublist in megalist for item in sublist]
+def measure_length(example):
+    example["length"] = len(example["input_ids"])
+    return example
 def forward_pass_single_cell(model, example_cell, layer_to_quant):
     example_cell.set_format(type="torch")
     input_data = example_cell["input_ids"]
     mask[indices] = False
     return emb[mask]
+def delete_indices(example):
+    indices = example["perturb_index"]
+    if len(indices)>1:
+        indices = flatten_list(indices)
+    for index in sorted(indices, reverse=True):
         del example["input_ids"][index]
     return example
+# for genes_to_perturb = "all", where only genes within the cell are overexpressed
+def overexpress_indices(example):
     indexes = example["perturb_index"]
     if len(indexes)>1:
         indexes = flatten_list(indexes)
         example["input_ids"].insert(0, example["input_ids"].pop(index))
     return example
# for genes_to_perturb = list of genes to overexpress that are not necessarily expressed in cell
def overexpress_tokens(example):
    """Move/insert ``example["tokens_to_perturb"]`` to the front of ``input_ids``.

    If ``example["perturb_index"]`` is not the sentinel ``[-100]``, the listed
    positions are first removed via ``delete_indices`` so the tokens are not
    duplicated. The tokens are then placed at the start of
    ``example["input_ids"]`` in their given order. The list is modified in
    place and the example is returned.
    """
    # -100 indicates tokens to overexpress are not present in rank value encoding
    if example["perturb_index"] != [-100]:
        example = delete_indices(example)
    # One in-place slice assignment instead of a side-effect comprehension of
    # repeated insert(0, ...) calls; the resulting order is identical.
    example["input_ids"][:0] = example["tokens_to_perturb"]
    return example
 def make_perturbation_batch(example_cell,
                             perturb_type,
                             tokens_to_perturb,
                             anchor_token,
+                            combo_lvl,
                             num_proc):
     if tokens_to_perturb == "all":
         if perturb_type in ["overexpress","activate"]:
             all_indices = [index for index in all_indices if index not in indices_to_perturb]
             indices_to_perturb = [[[j for i in indices_to_perturb for j in i], x] for x in all_indices]
     length = len(indices_to_perturb)
+    perturbation_dataset = Dataset.from_dict({"input_ids": example_cell["input_ids"]*length,
+                                              "perturb_index": indices_to_perturb})
     if length<400:
         num_proc_i = 1
     else:
         num_proc_i = num_proc
     if perturb_type == "delete":
+        perturbation_dataset = perturbation_dataset.map(delete_indices, num_proc=num_proc_i)
     elif perturb_type == "overexpress":
+        perturbation_dataset = perturbation_dataset.map(overexpress_indices, num_proc=num_proc_i)
     return perturbation_dataset, indices_to_perturb
+# perturbed cell emb removing the activated/overexpressed/inhibited gene emb
+# so that only non-perturbed gene embeddings are compared to each other
+# in original or perturbed context
+def make_comparison_batch(original_emb_batch, indices_to_perturb, perturb_group):
     all_embs_list = []
+    # if making comparison batch for multiple perturbations in single cell
+    if perturb_group == False:
+        original_emb_list = [original_emb_batch]*len(indices_to_perturb)
+    # if making comparison batch for single perturbation in multiple cells
+    elif perturb_group == True:
+        original_emb_list = original_emb_batch
+    for i in range(len(original_emb_list)):
+        original_emb = original_emb_list[i]
+        indices = indices_to_perturb[i]
+        if indices == [-100]:
+            all_embs_list += [original_emb[:]]
+            continue
         emb_list = []
         start = 0
         if len(indices)>1 and isinstance(indices[0],list):
             start = i+1
         emb_list += [original_emb[start:]]
         all_embs_list += [torch.cat(emb_list)]
+    len_set = set([emb.size()[0] for emb in all_embs_list])
+    if len(len_set) > 1:
+        max_len = max(len_set)
+        all_embs_list = [pad_2d_tensor(emb, None, max_len, 0) for emb in all_embs_list]
     return torch.stack(all_embs_list)
 # average embedding position of goal cell states
 def get_cell_state_avg_embs(model,
                             filtered_input_data,
                             cell_states_to_model,
                             layer_to_quant,
+                            pad_token_id,
                             forward_batch_size,
                             num_proc):
+    model_input_size = get_model_input_size(model)
     possible_states = [value[0]+value[1]+value[2] for value in cell_states_to_model.values()][0]
     state_embs_dict = dict()
     for possible_state in possible_states:
             state_minibatch.set_format(type="torch")
             input_data_minibatch = state_minibatch["input_ids"]
+            input_data_minibatch = pad_tensor_list(input_data_minibatch,
+                                                   max_len,
+                                                   pad_token_id,
+                                                   model_input_size)
             with torch.no_grad():
                 outputs = model(
                    perturbation_batch,
                    forward_batch_size,
                    layer_to_quant,
+                   original_emb,
+                   tokens_to_perturb,
                    indices_to_perturb,
+                   perturb_group,
                    cell_states_to_model,
+                   state_embs_dict,
+                   pad_token_id,
+                   model_input_size,
+                   nproc):
     cos = torch.nn.CosineSimilarity(dim=2)
     total_batch_length = len(perturbation_batch)
     if ((total_batch_length-1)/forward_batch_size).is_integer():
         forward_batch_size = forward_batch_size-1
     if cell_states_to_model is None:
+        if perturb_group == False: # (if perturb_group is True, original_emb is filtered_input_data)
+            comparison_batch = make_comparison_batch(original_emb, indices_to_perturb, perturb_group)
         cos_sims = []
     else:
         possible_states = [value[0]+value[1]+value[2] for value in cell_states_to_model.values()][0]
         cos_sims_vs_alt_dict = dict(zip(possible_states,[[] for i in range(len(possible_states))]))
+    # measure length of each element in perturbation_batch
+    perturbation_batch = perturbation_batch.map(
+            measure_length, num_proc=nproc
+        )
     for i in range(0, total_batch_length, forward_batch_size):
         max_range = min(i+forward_batch_size, total_batch_length)
         perturbation_minibatch = perturbation_batch.select([i for i in range(i, max_range)])
+        # determine if need to pad or truncate batch
+        minibatch_length_set = set(perturbation_minibatch["length"])
+        if (len(minibatch_length_set) > 1) or (max(minibatch_length_set) > model_input_size):
+            needs_pad_or_trunc = True
+        else:
+            needs_pad_or_trunc = False
+        if needs_pad_or_trunc == True:
+            max_len = min(max(minibatch_length_set),model_input_size)
+            def pad_or_trunc_example(example):
+                example["input_ids"] = pad_or_truncate_encoding(example["input_ids"],
+                                                               pad_token_id,
+                                                               max_len)
+                return example
+            perturbation_minibatch = perturbation_minibatch.map(pad_or_trunc_example, num_proc=nproc)
         perturbation_minibatch.set_format(type="torch")
         input_data_minibatch = perturbation_minibatch["input_ids"]
+        # extract embeddings for perturbation minibatch
         with torch.no_grad():
             outputs = model(
                 input_ids = input_data_minibatch.to("cuda")
             )
         del input_data_minibatch
         del perturbation_minibatch
         if len(indices_to_perturb)>1:
             minibatch_emb = torch.squeeze(outputs.hidden_states[layer_to_quant])
         else:
             minibatch_emb = outputs.hidden_states[layer_to_quant]
+        if perturb_type == "overexpress":
+            # remove overexpressed genes to quantify effect on remaining genes
+            if perturb_group == False:
+                overexpressed_to_remove = 1
+            if perturb_group == True:
+                overexpressed_to_remove = len(tokens_to_perturb)
+            minibatch_emb = minibatch_emb[:,overexpressed_to_remove:,:]
+        # if quantifying single perturbation in multiple different cells, pad original batch and extract embs
+        if perturb_group == True:
+            # pad minibatch of original batch to extract embeddings
+            # truncate to the (model input size - # tokens to overexpress) to ensure comparability
+            # since max input size of perturb batch will be reduced by # tokens to overexpress
+            original_minibatch = original_emb.select([i for i in range(i, max_range)])
+            original_minibatch_length_set = set(original_minibatch["length"])
             if perturb_type == "overexpress":
+                new_max_len = model_input_size - len(tokens_to_perturb)
+            else:
+                new_max_len = model_input_size
+            if (len(original_minibatch_length_set) > 1) or (max(original_minibatch_length_set) > new_max_len):
+                original_max_len = min(max(original_minibatch_length_set),new_max_len)
+                def pad_or_trunc_example(example):
+                    example["input_ids"] = pad_or_truncate_encoding(example["input_ids"], pad_token_id, original_max_len)
+                    return example
+                original_minibatch = original_minibatch.map(pad_or_trunc_example, num_proc=nproc)
+            original_minibatch.set_format(type="torch")
+            original_input_data_minibatch = original_minibatch["input_ids"]
+            # extract embeddings for original minibatch
+            with torch.no_grad():
+                original_outputs = model(
+                    input_ids = original_input_data_minibatch.to("cuda")
+                )
+            del original_input_data_minibatch
+            del original_minibatch
+            if len(indices_to_perturb)>1:
+                original_minibatch_emb = torch.squeeze(original_outputs.hidden_states[layer_to_quant])
+            else:
+                original_minibatch_emb = original_outputs.hidden_states[layer_to_quant]
+        # cosine similarity between original emb and batch items
+        if cell_states_to_model is None:
+            if perturb_group == False:
+                minibatch_comparison = comparison_batch[i:max_range]
+            elif perturb_group == True:
+                minibatch_comparison = make_comparison_batch(original_minibatch_emb,
+                                                             indices_to_perturb,
+                                                             perturb_group)
             cos_sims += [cos(minibatch_emb, minibatch_comparison).to("cpu")]
         elif cell_states_to_model is not None:
             for state in possible_states:
+                if perturb_group == False:
+                    cos_sims_vs_alt_dict[state] += cos_sim_shift(original_emb,
+                                                                minibatch_emb,
+                                                                state_embs_dict[state],
+                                                                perturb_group)
+                elif perturb_group == True:
+                    cos_sims_vs_alt_dict[state] += cos_sim_shift(original_minibatch_emb,
+                                                                minibatch_emb,
+                                                                state_embs_dict[state],
+                                                                perturb_group)
         del outputs
         del minibatch_emb
         if cell_states_to_model is None:
         return cos_sims_vs_alt_dict
 # calculate cos sim shift of perturbation with respect to origin and alternative cell
def cos_sim_shift(original_emb, minibatch_emb, alt_emb, perturb_group):
    """Return the change in cosine similarity to ``alt_emb`` caused by perturbation.

    The similarity (along dim 2) between ``alt_emb`` and the mean original
    embedding is subtracted from the similarity between ``alt_emb`` and the
    mean perturbed embedding. The result is moved to CPU and returned wrapped
    in a one-element list, so callers can accumulate it with ``+=``.
    """
    similarity = torch.nn.CosineSimilarity(dim=2)
    # NOTE(review): the original embedding is averaged over dim 0 in both
    # modes, while the perturbed batch is averaged over dim 1. In group mode
    # the original input looks like a batch of cells, so dim 0 would average
    # across cells rather than genes - confirm this is intended.
    origin_mean = original_emb.mean(dim=0, keepdim=True)
    if perturb_group == False:
        # add a leading axis so the single-cell mean lines up with the batch
        origin_mean = origin_mean[None, :]
    perturbed_mean = minibatch_emb.mean(dim=1, keepdim=True)
    shift = similarity(perturbed_mean, alt_emb) - similarity(origin_mean, alt_emb)
    return [shift.to("cpu")]
def pad_list(input_ids, pad_token_id, max_len):
    """Right-pad ``input_ids`` with ``pad_token_id`` up to ``max_len`` entries.

    Padding is done with ``np.pad``, so the return value is a NumPy array
    rather than a Python list.
    """
    n_missing = max_len - len(input_ids)
    return np.pad(
        input_ids,
        (0, n_missing),
        mode="constant",
        constant_values=pad_token_id,
    )
def pad_tensor(tensor, pad_token_id, max_len):
    """Right-pad ``tensor`` along its last dimension with ``pad_token_id``.

    The amount of padding is ``max_len - tensor.numel()``, i.e. the element
    count is used as the current length (this equals the length for a 1D
    tensor).
    """
    right_pad = max_len - tensor.numel()
    return torch.nn.functional.pad(
        tensor,
        pad=(0, right_pad),
        mode="constant",
        value=pad_token_id,
    )
def pad_2d_tensor(tensor, pad_token_id, max_len, dim):
    """Pad a 2D ``tensor`` at the end of dimension ``dim`` up to ``max_len``.

    Parameters
    ----------
    tensor : torch.Tensor
        Tensor to pad; the pad specification covers its last two dimensions.
    pad_token_id : number or None
        Fill value. ``None`` (as passed when padding embeddings in this file)
        is treated as 0.
    max_len : int
        Target size of dimension ``dim``.
    dim : {0, 1}
        Dimension to extend.

    Raises
    ------
    ValueError
        If ``dim`` is neither 0 nor 1 (previously this surfaced as an
        UnboundLocalError on the pad specification).
    """
    if dim not in (0, 1):
        raise ValueError(f"dim must be 0 or 1, got {dim!r}")
    n_pad = max_len - tensor.size()[dim]
    # torch's pad tuple is ordered from the last dimension backwards:
    # (last_left, last_right, second_last_left, second_last_right)
    pad = (0, 0, 0, n_pad) if dim == 0 else (0, n_pad, 0, 0)
    fill_value = 0 if pad_token_id is None else pad_token_id
    return torch.nn.functional.pad(tensor, pad=pad,
                                   mode='constant',
                                   value=fill_value)
def pad_or_truncate_encoding(encoding, pad_token_id, max_len):
    """Return ``encoding`` cut or right-padded to exactly ``max_len`` entries.

    Parameters
    ----------
    encoding : torch.Tensor or list
        Token encoding; for tensors the size of the first dimension is used
        as the length.
    pad_token_id : int
        Value used to fill when the encoding is shorter than ``max_len``.
    max_len : int
        Target length.

    Returns
    -------
    The first ``max_len`` entries when the encoding is too long, the result of
    ``pad_tensor`` / ``pad_list`` when it is too short, and the input object
    unchanged when it already has the target length.

    Raises
    ------
    TypeError
        If ``encoding`` is neither a ``torch.Tensor`` nor a ``list``.
    """
    is_tensor = isinstance(encoding, torch.Tensor)
    if is_tensor:
        # fixed: the length was previously read from an undefined name
        encoding_len = encoding.size()[0]
    elif isinstance(encoding, list):
        encoding_len = len(encoding)
    else:
        raise TypeError(
            f"encoding must be a torch.Tensor or list, got {type(encoding).__name__}"
        )
    if encoding_len > max_len:
        return encoding[0:max_len]
    if encoding_len < max_len:
        if is_tensor:
            return pad_tensor(encoding, pad_token_id, max_len)
        return pad_list(encoding, pad_token_id, max_len)
    return encoding
 # pad list of tensors and convert to tensor
+def pad_tensor_list(tensor_list, dynamic_or_constant, pad_token_id, model_input_size):
     # Determine maximum tensor length
     if dynamic_or_constant == "dynamic":
     elif type(dynamic_or_constant) == int:
         max_len = dynamic_or_constant
     else:
+        max_len = model_input_size
         logger.warning(
                     "If padding style is constant, must provide integer value. " \
+                    f"Setting padding to max input size {model_input_size}.")
     # pad all tensors to maximum length
+    tensor_list = [pad_tensor(tensor, pad_token_id, max_len) for tensor in tensor_list]
     # return stacked tensors
     return torch.stack(tensor_list)
         "perturb_type": {"delete","overexpress","inhibit","activate"},
         "perturb_rank_shift": {None, 1, 2, 3},
         "genes_to_perturb": {"all", list},
+        "combos": {0, 1},
         "anchor_gene": {None, str},
         "model_type": {"Pretrained","GeneClassifier","CellClassifier"},
         "num_classes": {int},
         "emb_layer": {-1, 0},
         "forward_batch_size": {int},
         "nproc": {int},
     }
     def __init__(
         self,
         emb_layer=-1,
         forward_batch_size=100,
         nproc=4,
         token_dictionary_file=TOKEN_DICTIONARY_FILE,
     ):
         """
         genes_to_perturb : "all", list
             Default is perturbing each gene detected in each cell in the dataset.
             Otherwise, may provide a list of ENSEMBL IDs of genes to perturb.
+            If gene list is provided, then perturber will only test perturbing them all together
+            (rather than testing each possible combination of the provided genes).
+        combos : {0,1}
+            Whether to perturb genes individually (0) or in pairs (1).
         anchor_gene : None, str
             ENSEMBL ID of gene to use as anchor in combination perturbations.
             For example, if combos=1 and anchor_gene="ENSG00000148400":
             Batch size for forward pass.
         nproc : int
             Number of CPU processes to use.
         token_dictionary_file : Path
             Path to pickle file containing token dictionary (Ensembl ID:token).
         """
         self.genes_to_perturb = genes_to_perturb
         self.combos = combos
         self.anchor_gene = anchor_gene
+        if self.genes_to_perturb == "all":
+            self.perturb_group = False
+        else:
+            self.perturb_group = True
+            if (self.anchor_gene != None) or (self.combos != 0):
+                self.anchor_gene = None
+                self.combos = 0
+                logger.warning(
+                    "anchor_gene set to None and combos set to 0. " \
+                    "If providing list of genes to perturb, " \
+                    "list of genes_to_perturb will be perturbed together, "\
+                    "without anchor gene or combinations.")
         self.model_type = model_type
         self.num_classes = num_classes
         self.emb_mode = emb_mode
         self.emb_layer = emb_layer
         self.forward_batch_size = forward_batch_size
         self.nproc = nproc
         self.validate_options()
         with open(token_dictionary_file, "rb") as f:
             self.gene_token_dict = pickle.load(f)
+        self.pad_token_id = self.gene_token_dict.get("<pad>")
+        if self.anchor_gene is None:
             self.anchor_token = None
         else:
+            try:
+                self.anchor_token = [self.gene_token_dict[self.anchor_gene]]
+            except KeyError:
+                logger.error(
+                    f"Anchor gene {self.anchor_gene} not in token dictionary."
+                )
+                raise
+        if self.genes_to_perturb == "all":
             self.tokens_to_perturb = "all"
         else:
+            missing_genes = [gene for gene in self.genes_to_perturb if gene not in self.gene_token_dict.keys()]
+            if len(missing_genes) == len(self.genes_to_perturb):
+                logger.error(
+                    "None of the provided genes to perturb are in token dictionary."
+                )
+                raise
+            elif len(missing_genes)>0:
+                logger.warning(
+                    f"Genes to perturb {missing_genes} are not in token dictionary.")
+            self.tokens_to_perturb = [self.gene_token_dict.get(gene) for gene in self.genes_to_perturb]
     def validate_options(self):
         # first disallow options under development
         if self.perturb_type in ["inhibit", "activate"]:
             logger.error(
+                "In silico inhibition and activation currently under development. " \
+                "Current valid options for 'perturb_type': 'delete' or 'overexpress'"
             )
             raise
                 f"Valid options for {attr_name}: {valid_options}"
             )
             raise
         if self.perturb_type in ["delete","overexpress"]:
             if self.perturb_rank_shift is not None:
                 if self.perturb_type == "delete":
         input_data_file : Path
             Path to directory containing .dataset inputs
         output_directory : Path
+            Path to directory where perturbation data will be saved as batched pickle files
         output_prefix : str
+            Prefix for output files
         """
         filtered_input_data = self.load_and_filter(input_data_file)
                                                       filtered_input_data,
                                                       self.cell_states_to_model,
                                                       layer_to_quant,
+                                                      self.pad_token_id,
                                                       self.forward_batch_size,
                                                       self.nproc)
             # filter for start state cells
                               state_embs_dict,
                               output_directory,
                               output_prefix)
     # load data and filter by defined criteria
     def load_and_filter(self, input_data_file):
                           output_prefix):
         output_path_prefix = f"{output_directory}in_silico_{self.perturb_type}_{output_prefix}_dict_1Kbatch"
+        model_input_size = get_model_input_size(model)
         # filter dataset for cells that have tokens to be perturbed
         if self.anchor_token is not None:
                 return (len(set(example["input_ids"]).intersection(self.anchor_token))==len(self.anchor_token))
             filtered_input_data = filtered_input_data.filter(if_has_tokens_to_perturb, num_proc=self.nproc)
             logger.info(f"# cells with anchor gene: {len(filtered_input_data)}")
+        if (self.tokens_to_perturb != "all") and (self.perturb_type != "overexpress"):
+            # minimum # genes needed for perturbation test
+            min_genes = len(self.tokens_to_perturb)
             def if_has_tokens_to_perturb(example):
+                return (len(set(example["input_ids"]).intersection(self.tokens_to_perturb))>min_genes)
             filtered_input_data = filtered_input_data.filter(if_has_tokens_to_perturb, num_proc=self.nproc)
         cos_sims_dict = defaultdict(list)
         pickle_batch = -1
+        # make perturbation batch w/ single perturbation in multiple cells
+        if self.perturb_group == True:
+            def make_group_perturbation_batch(example):
+                example_input_ids = example["input_ids"]
+                example["tokens_to_perturb"] = self.tokens_to_perturb
+                indices_to_perturb = [example_input_ids.index(token) if token in example_input_ids else None for token in self.tokens_to_perturb]
+                indices_to_perturb = [item for item in indices_to_perturb if item is not None]
+                if len(indices_to_perturb) > 0:
+                    example["perturb_index"] = indices_to_perturb
+                else:
+                    # -100 indicates tokens to overexpress are not present in rank value encoding
+                    example["perturb_index"] = [-100]
+                if self.perturb_type == "delete":
+                    example = delete_indices(example)
+                elif self.perturb_type == "overexpress":
+                    example = overexpress_tokens(example)
+                return example
+            perturbation_batch = filtered_input_data.map(make_group_perturbation_batch, num_proc=self.nproc)
+            indices_to_perturb = perturbation_batch["perturb_index"]
+            cos_sims_data = quant_cos_sims(model,
+                                           self.perturb_type,
+                                           perturbation_batch,
+                                           self.forward_batch_size,
+                                           layer_to_quant,
+                                           filtered_input_data,
+                                           self.tokens_to_perturb,
+                                           indices_to_perturb,
+                                           self.perturb_group,
+                                           self.cell_states_to_model,
+                                           state_embs_dict,
+                                           self.pad_token_id,
+                                           model_input_size,
+                                           self.nproc)
+            perturbed_genes = tuple(self.tokens_to_perturb)
+            original_lengths = filtered_input_data["length"]
+            if self.cell_states_to_model is None:
+                # update cos sims dict
+                # key is tuple of (perturbed_gene, affected_gene)
+                # or (perturbed_genes, "cell_emb") for avg cell emb change
+                cos_sims_data = cos_sims_data.to("cuda")
+                max_padded_len = cos_sims_data.shape[1]
+                for j in range(cos_sims_data.shape[0]):
+                    # remove padding before mean pooling cell embedding
+                    original_length = original_lengths[j]
+                    gene_list = filtered_input_data[j]["input_ids"]
+                    indices_removed = indices_to_perturb[j]
+                    padding_to_remove = max_padded_len - (original_length \
+                                                          - len(self.tokens_to_perturb) \
+                                                          - len(indices_removed))
+                    nonpadding_cos_sims_data = cos_sims_data[j][:-padding_to_remove]
+                    cell_cos_sim = torch.mean(nonpadding_cos_sims_data).item()
+                    cos_sims_dict[(perturbed_genes, "cell_emb")] += [cell_cos_sim]
+                    if self.emb_mode == "cell_and_gene":
+                        for k in range(cos_sims_data.shape[1]):
+                            cos_sim_value = nonpadding_cos_sims_data[k]
+                            affected_gene = gene_list[k].item()
+                            cos_sims_dict[(perturbed_genes, affected_gene)] += [cos_sim_value.item()]
+            else:
+                # update cos sims dict
+                # key is tuple of (perturbed_genes, "cell_emb")
+                # value is list of tuples of cos sims for cell_states_to_model
+                origin_state_key = [value[0] for value in self.cell_states_to_model.values()][0][0]
+                cos_sims_origin = cos_sims_data[origin_state_key]
+                for j in range(cos_sims_origin.shape[0]):
+                    original_length = original_lengths[j]
+                    max_padded_len = cos_sims_origin.shape[1]
+                    indices_removed = indices_to_perturb[j]
+                    padding_to_remove = max_padded_len - (original_length \
+                                                          - len(self.tokens_to_perturb) \
+                                                          - len(indices_removed))
+                    data_list = []
+                    for data in list(cos_sims_data.values()):
+                        data_item = data.to("cuda")
+                        nonpadding_data_item = data_item[j][:-padding_to_remove]
+                        cell_data = torch.mean(nonpadding_data_item).item()
+                        data_list += [cell_data]
+                    cos_sims_dict[(perturbed_genes, "cell_emb")] += [tuple(data_list)]
+            with open(f"{output_path_prefix}_raw.pickle", "wb") as fp:
+                pickle.dump(cos_sims_dict, fp)
+        # make perturbation batch w/ multiple perturbations in single cell
+        if self.perturb_group == False:
+            for i in trange(len(filtered_input_data)):
+                example_cell = filtered_input_data.select([i])
+                original_emb = forward_pass_single_cell(model, example_cell, layer_to_quant)
+                gene_list = torch.squeeze(example_cell["input_ids"])
+                # reset to original type to prevent downstream issues due to forward_pass_single_cell modifying as torch format in place
+                example_cell = filtered_input_data.select([i])
+                if self.anchor_token is None:
+                    for combo_lvl in range(self.combos+1):
+                        perturbation_batch, indices_to_perturb = make_perturbation_batch(example_cell,
+                                                                                        self.perturb_type,
+                                                                                        self.tokens_to_perturb,
+                                                                                        self.anchor_token,
+                                                                                        combo_lvl,
+                                                                                        self.nproc)
+                        cos_sims_data = quant_cos_sims(model,
+                                                       self.perturb_type,
+                                                       perturbation_batch,
+                                                       self.forward_batch_size,
+                                                       layer_to_quant,
+                                                       original_emb,
+                                                       self.tokens_to_perturb,
+                                                       indices_to_perturb,
+                                                       self.perturb_group,
+                                                       self.cell_states_to_model,
+                                                       state_embs_dict,
+                                                       self.pad_token_id,
+                                                       model_input_size,
+                                                       self.nproc)
+                        if self.cell_states_to_model is None:
+                            # update cos sims dict
+                            # key is tuple of (perturbed_gene, affected_gene)
+                            # or (perturbed_gene, "cell_emb") for avg cell emb change
+                            cos_sims_data = cos_sims_data.to("cuda")
+                            for j in range(cos_sims_data.shape[0]):
+                                if self.tokens_to_perturb != "all":
+                                    j_index = torch.tensor(indices_to_perturb[j])
+                                    if j_index.shape[0]>1:
+                                        j_index = torch.squeeze(j_index)
+                                else:
+                                    j_index = torch.tensor([j])
+                                perturbed_gene = torch.index_select(gene_list, 0, j_index)
+                                if perturbed_gene.shape[0]==1:
+                                    perturbed_gene = perturbed_gene.item()
+                                elif perturbed_gene.shape[0]>1:
+                                    perturbed_gene = tuple(perturbed_gene.tolist())
+                                cell_cos_sim = torch.mean(cos_sims_data[j]).item()
+                                cos_sims_dict[(perturbed_gene, "cell_emb")] += [cell_cos_sim]
+                                # not_j_index = list(set(i for i in range(gene_list.shape[0])).difference(j_index))
+                                # gene_list_j = torch.index_select(gene_list, 0, j_index)
+                                if self.emb_mode == "cell_and_gene":
+                                    for k in range(cos_sims_data.shape[1]):
+                                        cos_sim_value = cos_sims_data[j][k]
+                                        affected_gene = gene_list[k].item()
+                                        cos_sims_dict[(perturbed_gene, affected_gene)] += [cos_sim_value.item()]
+                        else:
+                            # update cos sims dict
+                            # key is tuple of (perturbed_gene, "cell_emb")
+                            # value is list of tuples of cos sims for cell_states_to_model
+                            origin_state_key = [value[0] for value in self.cell_states_to_model.values()][0][0]
+                            cos_sims_origin = cos_sims_data[origin_state_key]
+                            for j in range(cos_sims_origin.shape[0]):
+                                if (self.tokens_to_perturb != "all") or (combo_lvl>0):
+                                    j_index = torch.tensor(indices_to_perturb[j])
+                                    if j_index.shape[0]>1:
+                                        j_index = torch.squeeze(j_index)
+                                else:
+                                    j_index = torch.tensor([j])
+                                perturbed_gene = torch.index_select(gene_list, 0, j_index)
+                                if perturbed_gene.shape[0]==1:
+                                    perturbed_gene = perturbed_gene.item()
+                                elif perturbed_gene.shape[0]>1:
+                                    perturbed_gene = tuple(perturbed_gene.tolist())
+                                data_list = []
+                                for data in list(cos_sims_data.values()):
+                                    data_item = data.to("cuda")
+                                    cell_data = torch.mean(data_item[j]).item()
+                                    data_list += [cell_data]
+                                cos_sims_dict[(perturbed_gene, "cell_emb")] += [tuple(data_list)]
+                elif self.anchor_token is not None:
                     perturbation_batch, indices_to_perturb = make_perturbation_batch(example_cell,
+                                                                                     self.perturb_type,
+                                                                                     self.tokens_to_perturb,
+                                                                                     None,  # first run without anchor token to test individual gene perturbations
+                                                                                     0,
+                                                                                     self.nproc)
                     cos_sims_data = quant_cos_sims(model,
                                                    self.perturb_type,
+                                                   perturbation_batch,
+                                                   self.forward_batch_size,
+                                                   layer_to_quant,
+                                                   original_emb,
+                                                   self.tokens_to_perturb,
                                                    indices_to_perturb,
+                                                   self.perturb_group,
                                                    self.cell_states_to_model,
+                                                   state_embs_dict,
+                                                   self.pad_token_id,
+                                                   model_input_size,
+                                                   self.nproc)
+                    cos_sims_data = cos_sims_data.to("cuda")
+                    combo_perturbation_batch, combo_indices_to_perturb = make_perturbation_batch(example_cell,
+                                                                                                 self.perturb_type,
+                                                                                                 self.tokens_to_perturb,
+                                                                                                 self.anchor_token,
+                                                                                                 1,
+                                                                                                 self.nproc)
+                    combo_cos_sims_data = quant_cos_sims(model,
+                                                         self.perturb_type,
+                                                         combo_perturbation_batch,
+                                                         self.forward_batch_size,
+                                                         layer_to_quant,
+                                                         original_emb,
+                                                         self.tokens_to_perturb,
+                                                         combo_indices_to_perturb,
+                                                         self.perturb_group,
+                                                         self.cell_states_to_model,
+                                                         state_embs_dict,
+                                                         self.pad_token_id,
+                                                         model_input_size,
+                                                         self.nproc)
+                    combo_cos_sims_data = combo_cos_sims_data.to("cuda")
+                    # update cos sims dict
+                    # key is tuple of (perturbed_gene, "cell_emb") for avg cell emb change
+                    anchor_index = example_cell["input_ids"][0].index(self.anchor_token[0])
+                    anchor_cell_cos_sim = torch.mean(cos_sims_data[anchor_index]).item()
+                    non_anchor_indices = [k for k in range(cos_sims_data.shape[0]) if k != anchor_index]
+                    cos_sims_data = cos_sims_data[non_anchor_indices,:]
+                    for j in range(cos_sims_data.shape[0]):
+                        if j<anchor_index:
+                            j_index = torch.tensor([j])
+                        else:
+                            j_index = torch.tensor([j+1])
+                        perturbed_gene = torch.index_select(gene_list, 0, j_index)
+                        perturbed_gene = perturbed_gene.item()
+                        cell_cos_sim = torch.mean(cos_sims_data[j]).item()
+                        combo_cos_sim = torch.mean(combo_cos_sims_data[j]).item()
+                        cos_sims_dict[(perturbed_gene, "cell_emb")] += [(anchor_cell_cos_sim, # cos sim anchor gene alone
+                                                                         cell_cos_sim, # cos sim deleted gene alone
+                                                                         combo_cos_sim)] # cos sim anchor gene + deleted gene
+                # save dict to disk every 100 cells
+                if (i/100).is_integer():
+                    with open(f"{output_path_prefix}{pickle_batch}_raw.pickle", "wb") as fp:
+                        pickle.dump(cos_sims_dict, fp)
+                # reset and clear memory every 1000 cells
+                if (i/1000).is_integer():
+                    pickle_batch = pickle_batch+1
+                    # clear memory
+                    del perturbed_gene
+                    del cos_sims_data
+                    if self.cell_states_to_model is None:
+                        del cell_cos_sim
+                    if self.cell_states_to_model is not None:
+                        del cell_data
+                        del data_list
+                    elif self.anchor_token is None:
+                        if self.emb_mode == "cell_and_gene":
+                            del affected_gene
+                            del cos_sim_value
+                    else:
+                        del combo_cos_sim
+                        del combo_cos_sims_data
+                    # reset dict
+                    del cos_sims_dict
+                    cos_sims_dict = defaultdict(list)
+                    torch.cuda.empty_cache()
+            # save remainder cells
+            with open(f"{output_path_prefix}{pickle_batch}_raw.pickle", "wb") as fp:
+                pickle.dump(cos_sims_dict, fp)

geneformer/in_silico_perturber_stats.py CHANGED Viewed

@@ -79,6 +79,9 @@ def get_gene_list(dict_list,mode):
     gene_list.sort()
     return gene_list
 def n_detections(token, dict_list, mode, anchor_token):
     cos_sim_megalist = []
     for dict_i in dict_list:
@@ -106,98 +109,130 @@ def get_impact_component(test_value, gaussian_mixture_model):
             impact_component = 1
     return impact_component
 # stats comparing cos sim shifts towards goal state of test perturbations vs random perturbations
-def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model):
     cell_state_key = list(cell_states_to_model.keys())[0]
     if cell_states_to_model[cell_state_key][2] == []:
         alt_end_state_exists = False
     elif (len(cell_states_to_model[cell_state_key][2]) > 0) and (cell_states_to_model[cell_state_key][2] != [None]):
         alt_end_state_exists = True
-    random_tuples = []
-    for i in trange(cos_sims_df.shape[0]):
-        token = cos_sims_df["Gene"][i]
-        for dict_i in dict_list:
-            random_tuples += dict_i.get((token, "cell_emb"),[])
-    if alt_end_state_exists == False:
-        goal_end_random_megalist = [goal_end for start_state,goal_end in random_tuples]
-    elif alt_end_state_exists == True:
-        goal_end_random_megalist = [goal_end for start_state,goal_end,alt_end in random_tuples]
-        alt_end_random_megalist = [alt_end for start_state,goal_end,alt_end in random_tuples]
-    # downsample to improve speed of ranksums
-    if len(goal_end_random_megalist) > 100_000:
-        random.seed(42)
-        goal_end_random_megalist = random.sample(goal_end_random_megalist, k=100_000)
-    if alt_end_state_exists == True:
-        if len(alt_end_random_megalist) > 100_000:
-            random.seed(42)
-            alt_end_random_megalist = random.sample(alt_end_random_megalist, k=100_000)
-    names=["Gene",
-           "Gene_name",
-           "Ensembl_ID",
-           "Shift_to_goal_end",
-           "Shift_to_alt_end",
-           "Goal_end_vs_random_pval",
-           "Alt_end_vs_random_pval"]
-    if alt_end_state_exists == False:
-        names.remove("Shift_to_alt_end")
-        names.remove("Alt_end_vs_random_pval")
-    cos_sims_full_df = pd.DataFrame(columns=names)
-    for i in trange(cos_sims_df.shape[0]):
-        token = cos_sims_df["Gene"][i]
-        name = cos_sims_df["Gene_name"][i]
-        ensembl_id = cos_sims_df["Ensembl_ID"][i]
-        cos_shift_data = []
         for dict_i in dict_list:
             cos_shift_data += dict_i.get((token, "cell_emb"),[])
         if alt_end_state_exists == False:
-            goal_end_cos_sim_megalist = [goal_end for start_state,goal_end in cos_shift_data]
         elif alt_end_state_exists == True:
-            goal_end_cos_sim_megalist = [goal_end for start_state,goal_end,alt_end in cos_shift_data]
-            alt_end_cos_sim_megalist = [alt_end for start_state,goal_end,alt_end in cos_shift_data]
-            mean_alt_end = np.mean(alt_end_cos_sim_megalist)
-            pval_alt_end = ranksums(alt_end_random_megalist,alt_end_cos_sim_megalist).pvalue
-        mean_goal_end = np.mean(goal_end_cos_sim_megalist)
-        pval_goal_end = ranksums(goal_end_random_megalist,goal_end_cos_sim_megalist).pvalue
         if alt_end_state_exists == False:
-            data_i = [token,
-                      name,
-                      ensembl_id,
-                      mean_goal_end,
-                      pval_goal_end]
-        elif alt_end_state_exists == True:
-            data_i = [token,
-                      name,
-                      ensembl_id,
-                      mean_goal_end,
-                      mean_alt_end,
-                      pval_goal_end,
-                      pval_alt_end]
-        cos_sims_df_i = pd.DataFrame(dict(zip(names,data_i)),index=[i])
-        cos_sims_full_df = pd.concat([cos_sims_full_df,cos_sims_df_i])
-    cos_sims_full_df["Goal_end_FDR"] = get_fdr(list(cos_sims_full_df["Goal_end_vs_random_pval"]))
-    if alt_end_state_exists == True:
-        cos_sims_full_df["Alt_end_FDR"] = get_fdr(list(cos_sims_full_df["Alt_end_vs_random_pval"]))
-    # quantify number of detections of each gene
-    cos_sims_full_df["N_Detections"] = [n_detections(i, dict_list, "cell", None) for i in cos_sims_full_df["Gene"]]
-    # sort by shift to desired state
-    cos_sims_full_df = cos_sims_full_df.sort_values(by=["Shift_to_goal_end",
-                                                        "Goal_end_FDR"],
-                                                        ascending=[False,True])
-    return cos_sims_full_df
 # stats comparing cos sim shifts of test perturbations vs null distribution
 def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
@@ -362,7 +397,7 @@ def isp_stats_mixture_model(cos_sims_df, dict_list, combos, anchor_token):
 class InSilicoPerturberStats:
     valid_option_dict = {
-        "mode": {"goal_state_shift","vs_null","mixture_model"},
         "combos": {0,1},
         "anchor_gene": {None, str},
         "cell_states_to_model": {None, dict},
@@ -370,6 +405,7 @@ class InSilicoPerturberStats:
     def __init__(
         self,
         mode="mixture_model",
         combos=0,
         anchor_gene=None,
         cell_states_to_model=None,
@@ -381,11 +417,16 @@ class InSilicoPerturberStats:
         Parameters
         ----------
-        mode : {"goal_state_shift","vs_null","mixture_model"}
             Type of stats.
             "goal_state_shift": perturbation vs. random for desired cell state shift
             "vs_null": perturbation vs. null from provided null distribution dataset
             "mixture_model": perturbation in impact vs. no impact component of mixture model (no goal direction)
         combos : {0,1,2}
             Whether to perturb genes individually (0), in pairs (1), or in triplets (2).
         anchor_gene : None, str
@@ -406,6 +447,7 @@ class InSilicoPerturberStats:
         """
         self.mode = mode
         self.combos = combos
         self.anchor_gene = anchor_gene
         self.cell_states_to_model = cell_states_to_model
@@ -477,6 +519,17 @@ class InSilicoPerturberStats:
                     "in silico perturbation run with anchor gene. Please add " \
                     "anchor gene when using with combos > 0. ")
                 raise
     def get_stats(self,
                   input_data_directory,
@@ -495,7 +548,7 @@ class InSilicoPerturberStats:
         output_directory : Path
             Path to directory where perturbation data will be saved as .csv
         output_prefix : str
-            Prefix for output .dataset
         Outputs
         ----------
@@ -538,11 +591,11 @@ class InSilicoPerturberStats:
         "Impact_component_percent": percent of cells in which given perturbation was modeled to be within impact component
         """
-        if self.mode not in ["goal_state_shift", "vs_null", "mixture_model"]:
             logger.error(
                 "Currently, only modes available are stats for goal_state_shift, " \
-                    "vs_null (comparing to null distribution), and " \
-                    "mixture_model (fitting mixture model for perturbations with or without impact.")
             raise
         self.gene_token_id_dict = invert_dict(self.gene_token_dict)
@@ -562,14 +615,16 @@ class InSilicoPerturberStats:
         cos_sims_df_initial = pd.DataFrame({"Gene": gene_list,
                                             "Gene_name": [self.token_to_gene_name(item) \
                                                           for item in gene_list], \
-                                            "Ensembl_ID": [self.gene_token_id_dict[genes[1]] \
                                                            if isinstance(genes,tuple) else \
                                                            self.gene_token_id_dict[genes] \
                                                            for genes in gene_list]}, \
                                              index=[i for i in range(len(gene_list))])
         if self.mode == "goal_state_shift":
-            cos_sims_df = isp_stats_to_goal_state(cos_sims_df_initial, dict_list, self.cell_states_to_model)
         elif self.mode == "vs_null":
             null_dict_list = read_dictionaries(null_dist_data_directory, "cell", self.anchor_token)
@@ -577,6 +632,9 @@ class InSilicoPerturberStats:
         elif self.mode == "mixture_model":
             cos_sims_df = isp_stats_mixture_model(cos_sims_df_initial, dict_list, self.combos, self.anchor_token)
         # save perturbation stats to output_path
         output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")

     gene_list.sort()
     return gene_list
def token_tuple_to_ensembl_ids(token_tuple, gene_token_id_dict):
    """Translate perturbed token(s) into their ID(s) via ``gene_token_id_dict``.

    Parameters
    ----------
    token_tuple : tuple or single token
        Tokens of the genes perturbed together. A bare, non-iterable token
        is also accepted (perturbation keys are stored as a scalar rather
        than a tuple when only one gene was perturbed).
    gene_token_id_dict : dict
        Lookup from token to gene ID; only ``.get`` is used.

    Returns
    -------
    tuple or single ID
        For an iterable input, a tuple with one entry per token, in order.
        For a bare token, the single looked-up ID.
        Tokens missing from the dictionary map to ``np.nan``.
    """
    try:
        tokens = iter(token_tuple)
    except TypeError:
        # a lone token is not iterable; look it up directly instead of failing
        return gene_token_id_dict.get(token_tuple, np.nan)
    return tuple(gene_token_id_dict.get(token, np.nan) for token in tokens)
 def n_detections(token, dict_list, mode, anchor_token):
     cos_sim_megalist = []
     for dict_i in dict_list:
             impact_component = 1
     return impact_component
+# aggregate data for single perturbation in multiple cells
def isp_aggregate_grouped_perturb(cos_sims_df, dict_list):
    """Collect the cell-embedding cosine shifts of one grouped perturbation.

    Only the first row of ``cos_sims_df`` is consulted: its "Gene" entry is
    the token (or token tuple) whose ``(token, "cell_emb")`` records are
    concatenated across every dictionary in ``dict_list``.

    Parameters
    ----------
    cos_sims_df : pd.DataFrame
        Frame with a "Gene" column.
    dict_list : list of dict
        Dictionaries keyed by ``(token, "cell_emb")`` whose values are lists
        of recorded shifts. Dictionaries lacking the key contribute nothing.

    Returns
    -------
    pd.DataFrame
        Single column "Cosine_shift" holding the concatenated records, in
        ``dict_list`` order. Empty if ``cos_sims_df`` has no rows.
    """
    names = ["Cosine_shift"]
    cos_sims_full_df = pd.DataFrame(columns=names)

    # no perturbation listed: return the empty frame rather than failing on the row lookup
    if cos_sims_df.shape[0] == 0:
        return cos_sims_full_df

    # positional lookup so a non-default index on cos_sims_df cannot break it
    token = cos_sims_df["Gene"].iloc[0]

    cos_shift_data = []
    for dict_i in dict_list:
        cos_shift_data += dict_i.get((token, "cell_emb"), [])
    cos_sims_full_df["Cosine_shift"] = cos_shift_data
    return cos_sims_full_df
 # stats comparing cos sim shifts towards goal state of test perturbations vs random perturbations
def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model, genes_perturbed):
    """Summarize cosine shifts toward the goal (and optional alternate) end state.

    Parameters
    ----------
    cos_sims_df : pd.DataFrame
        Frame with "Gene", "Gene_name" and "Ensembl_ID" columns
        (only "Gene" is read when ``genes_perturbed`` is not "all").
    dict_list : list of dict
        Dictionaries keyed by ``(token, "cell_emb")`` whose values are lists
        of tuples ``(start_state, goal_end)`` or, when an alternate end state
        is modeled, ``(start_state, goal_end, alt_end)``.
    cell_states_to_model : dict
        Only the first key is used; the third element of its value lists the
        alternate end states. ``[]`` and ``[None]`` both mean "none".
    genes_perturbed : "all" or other
        "all": every gene is compared against the pooled shifts of all genes.
        Anything else: a single grouped perturbation, for which the raw
        per-record shifts of the first row's gene are returned (there are no
        random perturbations to compare to).

    Returns
    -------
    pd.DataFrame
        Grouped perturbation: columns "Shift_to_goal_end" and, if an
        alternate end state exists, "Shift_to_alt_end"; one row per record.
        "all": one row per gene with mean shifts, rank-sum p-values versus
        the pooled shifts, FDR columns and "N_Detections", sorted by
        descending "Shift_to_goal_end" then ascending "Goal_end_FDR".
    """
    cell_state_key = list(cell_states_to_model.keys())[0]
    alt_end_states = cell_states_to_model[cell_state_key][2]
    # computed as one boolean so the [None] case is explicitly False
    # (the former if/elif left the flag unassigned for it -> UnboundLocalError)
    alt_end_state_exists = (len(alt_end_states) > 0) and (alt_end_states != [None])

    # for single perturbation in multiple cells, there are no random perturbations to compare to
    if genes_perturbed != "all":
        names = ["Shift_to_goal_end"]
        if alt_end_state_exists:
            names.append("Shift_to_alt_end")
        cos_sims_full_df = pd.DataFrame(columns=names)

        # positional lookup so a non-default index on cos_sims_df cannot break it
        token = cos_sims_df["Gene"].iloc[0]
        goal_end_shifts, alt_end_shifts = _split_goal_alt_shifts(
            _collect_cell_emb_shifts(token, dict_list), alt_end_state_exists)
        cos_sims_full_df["Shift_to_goal_end"] = goal_end_shifts
        if alt_end_state_exists:
            cos_sims_full_df["Shift_to_alt_end"] = alt_end_shifts
        return cos_sims_full_df

    n_genes = cos_sims_df.shape[0]

    # pool the shifts of every gene to serve as the "random perturbation" reference
    random_tuples = []
    for i in trange(n_genes):
        random_tuples += _collect_cell_emb_shifts(cos_sims_df["Gene"].iloc[i], dict_list)
    goal_end_random_megalist, alt_end_random_megalist = _split_goal_alt_shifts(
        random_tuples, alt_end_state_exists)

    # downsample to improve speed of ranksums
    if len(goal_end_random_megalist) > 100_000:
        random.seed(42)
        goal_end_random_megalist = random.sample(goal_end_random_megalist, k=100_000)
    if alt_end_state_exists and len(alt_end_random_megalist) > 100_000:
        random.seed(42)
        alt_end_random_megalist = random.sample(alt_end_random_megalist, k=100_000)

    names = ["Gene", "Gene_name", "Ensembl_ID", "Shift_to_goal_end"]
    if alt_end_state_exists:
        names.append("Shift_to_alt_end")
    names.append("Goal_end_vs_random_pval")
    if alt_end_state_exists:
        names.append("Alt_end_vs_random_pval")

    # gather one row per gene and build the frame once
    # (concatenating a one-row frame per gene copied the whole table every iteration)
    rows = []
    for i in trange(n_genes):
        token = cos_sims_df["Gene"].iloc[i]
        goal_end_shifts, alt_end_shifts = _split_goal_alt_shifts(
            _collect_cell_emb_shifts(token, dict_list), alt_end_state_exists)

        row = [token,
               cos_sims_df["Gene_name"].iloc[i],
               cos_sims_df["Ensembl_ID"].iloc[i],
               np.mean(goal_end_shifts)]
        if alt_end_state_exists:
            row.append(np.mean(alt_end_shifts))
        row.append(ranksums(goal_end_random_megalist, goal_end_shifts).pvalue)
        if alt_end_state_exists:
            row.append(ranksums(alt_end_random_megalist, alt_end_shifts).pvalue)
        rows.append(row)
    cos_sims_full_df = pd.DataFrame(rows, columns=names)

    cos_sims_full_df["Goal_end_FDR"] = get_fdr(list(cos_sims_full_df["Goal_end_vs_random_pval"]))
    if alt_end_state_exists:
        cos_sims_full_df["Alt_end_FDR"] = get_fdr(list(cos_sims_full_df["Alt_end_vs_random_pval"]))

    # quantify number of detections of each gene
    cos_sims_full_df["N_Detections"] = [n_detections(i, dict_list, "cell", None) for i in cos_sims_full_df["Gene"]]

    # sort by shift to desired state
    cos_sims_full_df = cos_sims_full_df.sort_values(by=["Shift_to_goal_end",
                                                        "Goal_end_FDR"],
                                                    ascending=[False, True])
    return cos_sims_full_df


def _collect_cell_emb_shifts(token, dict_list):
    """Concatenate the ``(token, "cell_emb")`` records found across ``dict_list``."""
    shifts = []
    for dict_i in dict_list:
        shifts += dict_i.get((token, "cell_emb"), [])
    return shifts


def _split_goal_alt_shifts(shift_tuples, alt_end_state_exists):
    """Unpack shift tuples into (goal_end list, alt_end list or None)."""
    if alt_end_state_exists:
        goal_end_shifts = [goal_end for start_state, goal_end, alt_end in shift_tuples]
        alt_end_shifts = [alt_end for start_state, goal_end, alt_end in shift_tuples]
        return goal_end_shifts, alt_end_shifts
    return [goal_end for start_state, goal_end in shift_tuples], None
 # stats comparing cos sim shifts of test perturbations vs null distribution
 def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
 class InSilicoPerturberStats:
     valid_option_dict = {
+        "mode": {"goal_state_shift","vs_null","mixture_model","aggregate_data"},
         "combos": {0,1},
         "anchor_gene": {None, str},
         "cell_states_to_model": {None, dict},
     def __init__(
         self,
         mode="mixture_model",
+        genes_perturbed="all",
         combos=0,
         anchor_gene=None,
         cell_states_to_model=None,
         Parameters
         ----------
+        mode : {"goal_state_shift","vs_null","mixture_model","aggregate_data"}
             Type of stats.
             "goal_state_shift": perturbation vs. random for desired cell state shift
             "vs_null": perturbation vs. null from provided null distribution dataset
             "mixture_model": perturbation in impact vs. no impact component of mixture model (no goal direction)
+            "aggregate_data": aggregates cosine shifts for single perturbation in multiple cells
+        genes_perturbed : "all", list
+            Genes perturbed in isp experiment.
+            Default is assuming genes_to_perturb in isp experiment was "all" (each gene in each cell).
+            Otherwise, may provide a list of ENSEMBL IDs of genes perturbed as a group all together.
         combos : {0,1,2}
             Whether to perturb genes individually (0), in pairs (1), or in triplets (2).
         anchor_gene : None, str
         """
         self.mode = mode
+        self.genes_perturbed = genes_perturbed
         self.combos = combos
         self.anchor_gene = anchor_gene
         self.cell_states_to_model = cell_states_to_model
                     "in silico perturbation run with anchor gene. Please add " \
                     "anchor gene when using with combos > 0. ")
                 raise
+        if (self.mode == "mixture_model") and (self.genes_perturbed != "all"):
+            logger.error(
+                    "Mixture model mode requires multiple gene perturbations to fit model " \
+                    "so is incompatible with a single grouped perturbation.")
+            raise
+        if (self.mode == "aggregate_data") and (self.genes_perturbed == "all"):
+            logger.error(
+                    "Simple data aggregation mode is for single perturbation in multiple cells " \
+                    "so is incompatible with a genes_perturbed being 'all'.")
+            raise
     def get_stats(self,
                   input_data_directory,
         output_directory : Path
             Path to directory where perturbation data will be saved as .csv
         output_prefix : str
+            Prefix for output .csv
         Outputs
         ----------
         "Impact_component_percent": percent of cells in which given perturbation was modeled to be within impact component
         """
+        if self.mode not in ["goal_state_shift", "vs_null", "mixture_model","aggregate_data"]:
             logger.error(
                 "Currently, only modes available are stats for goal_state_shift, " \
+                "vs_null (comparing to null distribution), and " \
+                "mixture_model (fitting mixture model for perturbations with or without impact.")
             raise
         self.gene_token_id_dict = invert_dict(self.gene_token_dict)
         cos_sims_df_initial = pd.DataFrame({"Gene": gene_list,
                                             "Gene_name": [self.token_to_gene_name(item) \
                                                           for item in gene_list], \
+                                            "Ensembl_ID": [token_tuple_to_ensembl_ids(genes, self.gene_token_id_dict) \
+                                                           if self.genes_perturbed != "all" else \
+                                                           self.gene_token_id_dict[genes[1]] \
                                                            if isinstance(genes,tuple) else \
                                                            self.gene_token_id_dict[genes] \
                                                            for genes in gene_list]}, \
                                              index=[i for i in range(len(gene_list))])
         if self.mode == "goal_state_shift":
+            cos_sims_df = isp_stats_to_goal_state(cos_sims_df_initial, dict_list, self.cell_states_to_model, self.genes_perturbed)
         elif self.mode == "vs_null":
             null_dict_list = read_dictionaries(null_dist_data_directory, "cell", self.anchor_token)
         elif self.mode == "mixture_model":
             cos_sims_df = isp_stats_mixture_model(cos_sims_df_initial, dict_list, self.combos, self.anchor_token)
+        elif self.mode == "aggregate_data":
+            cos_sims_df = isp_aggregate_grouped_perturb(cos_sims_df_initial, dict_list)
         # save perturbation stats to output_path
         output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")