In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import os
import numpy as np

# Loading the dataset
The dataset was prepared in two steps: `uscase_1_.csv` was cleaned using `clean.ipynb`, and the criteria field was then merged into the CSV using `merged.ipynb`. The resulting output file is `filtered_combined.xlsx`.

The following code loads an Excel file into a pandas DataFrame.
 - `file_name` specifies the path to the Excel file.
 - `pd.read_excel(file_name)` reads the Excel file and stores its content in the DataFrame `df`.

In [3]:
# Loading the dataset
file_name = "../Data/filtered_combined.xlsx"
df = pd.read_excel(file_name)

# Fail fast if the columns that later cells index by name are absent, so a wrong
# or outdated workbook is reported here rather than deep inside the pipeline.
required_columns = {"Combined Column", "nct_id"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{file_name} is missing required column(s): {sorted(missing_columns)}"
    )

# Connecting to the available device

The following code detects if a GPU is available and sets the device accordingly. If a GPU is available, it sets the device to "cuda"; otherwise, it sets the device to "cpu".


In [4]:

# Choose the compute device: CUDA when PyTorch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


# Initializing the BioBERT Model and Tokenizer

In this section, we initialize the BioBERT model and tokenizer. BioBERT is a pre-trained biomedical language representation model designed to handle various biomedical text mining tasks. It is based on BERT (Bidirectional Encoder Representations from Transformers) and has been further pre-trained on large-scale biomedical corpora.

## Model Details

- **Model Name**: `dmis-lab/biobert-base-cased-v1.1`
- **Architecture**: BERT (Bidirectional Encoder Representations from Transformers)
- **Pre-training**: The model has been pre-trained on PubMed abstracts and PMC full-text articles, making it highly suitable for biomedical text mining tasks.

The BioBERT model has the following architecture details:

- **Number of Layers**: 12
- **Hidden Size**: 768
- **Number of Attention Heads**: 12
- **Total Parameters**: 110M

These architectural details make BioBERT a powerful model for understanding and processing biomedical text, leveraging the transformer architecture to capture complex patterns and relationships within the data.

## Code Explanation

The following code initializes the BioBERT model and tokenizer:

```python
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)  
```

### Steps:

1. **Model Name**: We specify the model name `dmis-lab/biobert-base-cased-v1.1`.
2. **Tokenizer Initialization**: We use `AutoTokenizer.from_pretrained(model_name)` to load the tokenizer associated with the BioBERT model. The tokenizer is responsible for converting text into tokens that the model can process.
3. **Model Initialization**: We use `AutoModel.from_pretrained(model_name)` to load the pre-trained BioBERT model. This model is capable of generating embeddings for biomedical text.
4. **Device Assignment**: We move the model to the appropriate device (GPU or CPU) using `.to(device)`. This ensures that the model computations are performed on the available hardware, optimizing performance.

By initializing the BioBERT model and tokenizer, we are now equipped to process and generate embeddings for biomedical text, which will be used in subsequent steps for tasks such as similarity computation and information retrieval.

In [5]:
# Load the BioBERT checkpoint: the tokenizer first, then the encoder weights
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to(device)  # place the encoder's parameters on the selected device


### Initializing Output and Model File Names

In this cell, we are initializing two variables to store the names of the output and model files.

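- `output_file`: This variable is assigned the string `"similar_trials_results.xlsx"`, which indicates that the results of similar trials will be saved in an Excel file with this name. Note that the final cell of this notebook reassigns `output_file` to `"similar_trials_with_nct_id.xlsx"` before writing the results, so that is the file actually produced.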
- `model_file`: This variable is assigned the string `"biobert_embeddings.pt"`, which indicates that the BioBERT model embeddings will be saved in a PyTorch file with this name.

These variables will be used later in the code to save and load the respective files.

In [6]:
# Names of the files this notebook reads from / writes to
model_file = "biobert_embeddings.pt"          # cached embeddings tensor
output_file = "similar_trials_results.xlsx"   # spreadsheet of similar-trial results

# ClinicalTrialsDataset Class

## Description
The `ClinicalTrialsDataset` class is a custom dataset class designed for tokenizing a list of clinical trial texts using a specified tokenizer. It inherits from the `Dataset` class provided by PyTorch.

## Parameters
- `data` (list): A list of texts to be tokenized.
- `tokenizer` (Tokenizer): A tokenizer object used to tokenize the texts.
- `max_length` (int, optional): The maximum length of the tokenized sequences. Default is 512.

## Methods

 `__init__(self, data, tokenizer, max_length=512)`
Initializes the dataset with the provided data, tokenizer, and maximum sequence length.

 `__len__(self)`
Returns the number of texts in the dataset.

 `__getitem__(self, idx)`
Tokenizes the text at the specified index and returns the tokenized encoding as tensors. The tensors are moved to the specified device.

#### Parameters
- `idx` (int): The index of the text to be tokenized.

#### Returns
- `dict`: A dictionary containing the tokenized encoding as tensors, with each tensor moved to the specified device.

In [7]:
# Defining a dataset class for tokenization
class ClinicalTrialsDataset(Dataset):
    """Tokenize clinical-trial texts lazily, one text per index.

    Args:
        data (list[str]): Texts to tokenize; accessed by integer position.
        tokenizer: Callable tokenizer that accepts ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` and returns a mapping of
            tensors with a leading batch dimension of size 1.
        max_length (int, optional): Length every sequence is padded or
            truncated to. Defaults to 512.
        device (torch.device | str | None, optional): When given, every tensor
            returned by ``__getitem__`` is moved to this device. Defaults to
            ``None``, which returns the tensors exactly as the tokenizer
            produced them and leaves device placement to the code consuming
            the batches. Keeping device transfers out of the dataset avoids a
            hidden dependency on a module-level ``device`` variable and keeps
            the dataset usable from ``DataLoader`` worker processes.
    """

    def __init__(self, data, tokenizer, max_length=512, device=None):
        self.data = data  # list of raw texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.device = device

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns:
            dict: One tensor per tokenizer output key, with the leading batch
            dimension removed so that ``DataLoader`` can stack the items.
        """
        encoding = self.tokenizer(
            self.data[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # return_tensors="pt" yields a (1, max_length) batch; drop the batch axis.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        if self.device is not None:
            item = {key: val.to(self.device) for key, val in item.items()}
        return item


## Function: generate_embeddings

### Description
Generates embeddings for a list of texts using the BioBERT model. This function tokenizes the input texts, processes them in batches, and extracts the embeddings from the model's output.

### Parameters
- **texts** (`list` of `str`): A list of input texts to generate embeddings for.
- **tokenizer** (`transformers.PreTrainedTokenizer`): The tokenizer associated with the BioBERT model, used to convert texts into token IDs.
- **model** (`transformers.PreTrainedModel`): The BioBERT model used to generate embeddings.
- **batch_size** (`int`, optional): The number of samples to process in each batch. Default is 16.

### Returns
- **torch.Tensor**: A tensor containing the embeddings for the input texts. The shape of the tensor is `(number_of_texts, embedding_dimension)`.

### Example Usage

```python
embeddings = generate_embeddings(texts, tokenizer, model, batch_size=16)
```

In [8]:
# Generate embeddings using BioBERT
def generate_embeddings(texts, tokenizer, model, batch_size=16):
    """Encode texts into one embedding vector per text.

    Each text is tokenized through ``ClinicalTrialsDataset``, run through
    ``model`` in batches with gradients disabled, and represented by the hidden
    state of its first token (position 0 of ``last_hidden_state``).

    Args:
        texts (list[str]): Texts to embed.
        tokenizer: Tokenizer matching ``model``.
        model: Encoder whose output exposes ``last_hidden_state``. It is
            switched to evaluation mode and left in that mode.
        batch_size (int, optional): Number of texts per forward pass.
            Defaults to 16.

    Returns:
        torch.Tensor: CPU tensor of shape ``(len(texts), hidden_size)``. For an
        empty ``texts`` an empty tensor with zero rows is returned instead of
        raising.
    """
    # Follow the device the model actually lives on rather than a module-level
    # global, so the inputs can never end up on a different device than the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    dataset = ClinicalTrialsDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    cls_chunks = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating embeddings"):
            input_ids = batch["input_ids"].to(model_device)
            attention_mask = batch["attention_mask"].to(model_device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Keep only the first-token vector and move it off the accelerator
            # immediately so device memory does not grow with the dataset.
            cls_chunks.append(outputs.last_hidden_state[:, 0, :].cpu())

    if not cls_chunks:
        # No texts: concatenating an empty list would raise, so return a
        # well-formed empty result instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return torch.empty((0, hidden_size))

    return torch.cat(cls_chunks, dim=0)

# Preprocess the text data
 This code performs the following steps:
1. Creates a new column "Combined_Text" in the DataFrame `df` by filling any missing values in the "Combined Column" with an empty string.
2. Converts the "Combined_Text" column into a list of strings and assigns it to the variable `texts`.

 - `df["Combined Column"].fillna("")`: Replaces any NaN values in the "Combined Column" with an empty string.
 - `df["Combined_Text"].tolist()`: Converts the "Combined_Text" column to a list.

In [9]:
# Build the model input: replace missing entries with "" and flatten to a plain list
combined_text = df["Combined Column"].fillna("")
df["Combined_Text"] = combined_text
texts = combined_text.tolist()

## Loading or Generating Embeddings

This code snippet checks if the embeddings are already saved in a file. If the file exists, it loads the embeddings using `torch.load()`. Otherwise, it generates embeddings for all clinical trials, saves them to a file, and informs the user of the save location.

### Code Explanation:
1. **Check File Existence**: 
   - `os.path.exists(model_file)` checks if the embeddings file already exists.
   - If it exists, the embeddings are loaded, and a message is displayed: `"Loaded embeddings from saved model."`.

2. **Generate and Save Embeddings**:
   - If the file doesn't exist, embeddings are generated using the `generate_embeddings()` function.
   - `torch.save()` saves the generated embeddings to `model_file`.
   - A confirmation message is printed: `"Embeddings saved to {model_file}"`.

In [11]:
# Reuse cached embeddings when they exist and still match the dataset
embeddings = None
if os.path.exists(model_file):
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all this cache holds; map_location="cpu" makes the load
    # independent of the device the file was written from.
    embeddings = torch.load(model_file, map_location="cpu", weights_only=True)
    if embeddings.shape[0] == len(texts):
        print("Loaded embeddings from saved model.")
    else:
        # Rows of `embeddings` are matched to rows of `df` by position, so a
        # cache built from a different dataset must not be used.
        print(
            f"Cached embeddings in {model_file} have {embeddings.shape[0]} rows "
            f"but the dataset has {len(texts)} texts; regenerating."
        )
        embeddings = None

if embeddings is None:
    # Generate embeddings for all clinical trials
    embeddings = generate_embeddings(texts, tokenizer, model)
    torch.save(embeddings, model_file)
    print(f"Embeddings saved to {model_file}")

  embeddings = torch.load(model_file)


Loaded embeddings from saved model.


## Retrieving Top N Similar Clinical Trials

This function computes the top N similar clinical trials based on cosine similarity between embeddings. 

### Code Functionality:
1. **Input Parameters**:
   - `query_embedding`: The embedding of the query trial for which similar trials are to be found.
   - `embeddings`: The embeddings of all clinical trials.
   - `top_n`: The number of similar trials to retrieve (default is 10).

2. **Steps in the Function**:
   - **Convert to CPU and Numpy**:
     - Move `query_embedding` and `embeddings` to CPU (if not already) using `.cpu()` to ensure compatibility with `cosine_similarity`.
     - Convert them to NumPy arrays using `.numpy()`.
   - **Compute Cosine Similarity**:
     - Use `cosine_similarity` from sklearn to compute the similarity between the query embedding and all other embeddings.
   - **Retrieve Indices of Similar Trials**:
     - Sort the similarity scores in descending order with `argsort` and return the indices of the next N trials after the single highest-scoring entry, which is skipped on the assumption that it is the query trial itself.

3. **Output**:
   - The function returns the indices of the top N most similar trials.

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def get_similar_trials(query_embedding, embeddings, top_n=10, exclude_index=None):
    """Return the indices of the trials most similar to a query embedding.

    Args:
        query_embedding (torch.Tensor): Query vector(s), shape ``(dim,)`` or
            ``(n_queries, dim)``.
        embeddings (torch.Tensor): Corpus embeddings, shape ``(n_trials, dim)``.
        top_n (int, optional): Maximum number of neighbours to return per
            query. Defaults to 10.
        exclude_index (int | None, optional): Row of ``embeddings`` that holds
            the query itself. When given, exactly that row is left out of the
            ranking. When ``None`` (default) the original behaviour is kept:
            the single best-ranked row is dropped on the assumption that it is
            the query. That assumption fails if another row ties with the
            query (e.g. duplicate texts) or if the query is not part of
            ``embeddings``, so pass ``exclude_index`` whenever it is known.

    Returns:
        numpy.ndarray: Integer array of shape ``(n_queries, k)`` with
        ``k <= top_n``, ordered from most to least similar.

    Raises:
        ValueError: If ``top_n`` is negative.
        IndexError: If ``exclude_index`` is out of range for ``embeddings``.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # cosine_similarity works on CPU NumPy arrays and requires 2-D inputs.
    query_np = query_embedding.detach().cpu().numpy()
    if query_np.ndim == 1:
        query_np = query_np.reshape(1, -1)
    corpus_np = embeddings.detach().cpu().numpy()

    similarities = cosine_similarity(query_np, corpus_np)
    n_trials = similarities.shape[1]

    if exclude_index is None:
        # Stable sort keeps the ordering of tied scores deterministic.
        ranked = (-similarities).argsort(axis=1, kind="stable")
        # Skip rank 0 (presumed to be the query itself), keep the next top_n.
        return ranked[:, 1:top_n + 1]

    # `similarities` is a fresh array owned by this function, so it is safe to
    # overwrite the query's own column and push it to the very end of the ranking.
    similarities[:, exclude_index] = float("-inf")
    ranked = (-similarities).argsort(axis=1, kind="stable")
    return ranked[:, :min(top_n, n_trials - 1)]


## Specifying Clinical Trials for Evaluation
This list contains the NCT IDs of the trials on which the project is to be tested.

**Attributes**:
   - `evaluation_trials` (list): A list of strings representing the NCT IDs of the trials.

In [13]:
# NCT identifiers of the query trials used for evaluation
evaluation_trials = [
    "NCT00385736",
    "NCT00386607",
    "NCT03518073",
]


### Create a mapping of NCT IDs to indices.

This dictionary comprehension iterates over the `nct_id` column of the DataFrame `df`, 
enumerates the values, and creates a dictionary where each NCT ID is mapped to its 
corresponding index.

**Returns**:

   - `nct_id_to_index`: A dictionary with NCT IDs as keys and their respective row positions in `df` as values.


In [14]:
# Look-up table from NCT ID to the positional row index of that trial in `df`
nct_id_to_index = dict(zip(df["nct_id"], range(len(df))))


## Generating Similar Trials for Evaluation NCT IDs

The following code generates similar trials for a list of evaluation NCT IDs and saves the results to an Excel sheet.

### Code Explanation

1. **Initialize Output List**:
    - `output_data = []`: A list to collect the results for each NCT ID.

2. **Iterate Over Evaluation Trials**:
    - For each `trial_id` in `evaluation_trials`:
      - Check if the `trial_id` exists in the `nct_id_to_index` dictionary.
      - Retrieve the index of the query trial and its embedding.
      - Move the query embedding to the appropriate device (GPU or CPU).

3. **Get Similar Trials**:
    - Use the `get_similar_trials` function to get the indices of similar trials based on cosine similarity.

4. **Retrieve and Process Similar Trials**:
    - Retrieve the similar trials from the DataFrame using the indices.
    - Calculate and store similarity scores for each similar trial.
    - Add the query NCT ID as a new column to track which trial it corresponds to.
    - Append the results to the `output_data` list.

5. **Combine and Save Results**:
    - Combine all results into a single DataFrame.
    - Save the results to an Excel sheet using `pd.ExcelWriter`.

### Code


In [None]:
# Generate similar trials for evaluation NCT IDs
top_n = 10            # neighbours reported per query trial
output_data = []      # one DataFrame of neighbours per query NCT ID
missing_trials = []   # evaluation IDs that are not present in the dataset

for trial_id in evaluation_trials:
    query_idx = nct_id_to_index.get(trial_id)
    if query_idx is None:
        missing_trials.append(trial_id)
        continue

    # get_similar_trials converts its inputs to CPU arrays itself, so the query
    # is taken straight from `embeddings` without an extra device transfer.
    query_embedding = embeddings[query_idx].unsqueeze(0)

    # Request one spare candidate: if the query's own row shows up among the
    # neighbours (possible when another trial ties with it), it is removed
    # below and a full set of top_n neighbours still remains.
    candidate_indices = get_similar_trials(query_embedding, embeddings, top_n=top_n + 1)[0]
    neighbour_indices = [int(i) for i in candidate_indices if int(i) != query_idx][:top_n]
    if not neighbour_indices:
        continue

    # Score all neighbours with a single cosine_similarity call
    query_np = query_embedding.detach().cpu().numpy()
    neighbour_np = embeddings[neighbour_indices].detach().cpu().numpy()
    scores = cosine_similarity(query_np, neighbour_np)[0].tolist()

    # .copy() gives an independent frame, so adding columns neither warns nor
    # touches `df`.
    similar_trials = df.iloc[neighbour_indices].copy()
    similar_trials["Similarity_Score"] = scores

    # Add the NCT ID (trial_id) as a new column to track which trial it corresponds to
    similar_trials["Query_NCT_ID"] = trial_id

    output_data.append(similar_trials)

if missing_trials:
    print(f"Skipped NCT IDs not present in the dataset: {missing_trials}")

if not output_data:
    raise ValueError(
        "None of the evaluation trials produced any results; nothing to save."
    )

# Combine all results into a single DataFrame
final_results = pd.concat(output_data, ignore_index=True)

# Save the results to an Excel sheet
# NOTE: this rebinds `output_file`, replacing the name assigned in the earlier
# file-name configuration cell.
output_file = "similar_trials_with_nct_id.xlsx"
with pd.ExcelWriter(output_file, engine='xlsxwriter') as output_writer:
    final_results.to_excel(output_writer, index=False, sheet_name='Similar Trials')

print(f"Similar trials with NCT IDs saved to {output_file}")
