File size: 5,920 Bytes
fe414e5
 
 
 
7412a0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1622379
 
 
 
 
d4088d4
1622379
 
d4088d4
1622379
 
7412a0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c428aa
7412a0d
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# !pip install streamlit
# !pip install transformers
# !pip install torch
# !pip install scikit-learn
import streamlit as st
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# Load the BioBERT model and tokenizer
@st.cache_resource
def load_model_and_tokenizer():
    model_name = "dmis-lab/biobert-base-cased-v1.1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    return tokenizer, model

# Function to generate embeddings for a single input text
def generate_single_embedding(text, tokenizer, model):
    model.eval()
    with torch.no_grad():
        encoding = tokenizer(
            text,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        encoding = {key: val.squeeze(0).to(device) for key, val in encoding.items()}
        output = model(**encoding)
        return output.last_hidden_state[:, 0, :].cpu().numpy()

# Load the dataset and embeddings
@st.cache_data
def load_data_and_embeddings():
    file_name = "./filtered_combined.xlsx"
    model_file = "./biobert_embeddings.pt"

    df = pd.read_excel(file_name)
    df["Combined_Text"] = df["Combined Column"].fillna("")
    embeddings = torch.load(model_file)
    return df, embeddings

# Function to get top N similar trials
def get_similar_trials(query_embedding, embeddings, top_n=10):
    query_embedding_cpu = query_embedding.cpu().detach().numpy()
    embeddings_cpu = embeddings.cpu().detach().numpy()
    similarities = cosine_similarity(query_embedding_cpu, embeddings_cpu)
    similar_indices = similarities.argsort(axis=1)[:, -top_n-1:-1][:, ::-1]
    return similar_indices, similarities

# Load resources
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer, model = load_model_and_tokenizer()
df, embeddings = load_data_and_embeddings()
def main():
    tokenizer, model = load_model_and_tokenizer()
    st.write("Model and Tokenizer Loaded Successfully!")
    # Add your Streamlit app code here
    # Streamlit GUI
    st.title("Clinical Trials Similarity Finder")
    st.write("Find the most similar clinical trials using BioBERT embeddings.")
    dropdown_data = [
        ("NCT00385736", "Efficacy and Safety of Adalimumab in Subjects With Moderately to Severely Acute Ulcerative Colitis"),
        ("NCT00386607", "A Safety and Tolerability Study of the Combination of Aliskiren/Valsartan in Patients With High Blood Pressure, Followed by Long-term Safety and Tolerability of Aliskiren, Valsartan and Hydrochlorothiazide."),
        ("NCT03518073", "A Study of LY3303560 in Participants With Early Symptomatic Alzheimer's Disease"),
    ]
    st.write("Use the following NCT_IDs for testing the project.")
    # Create a DataFrame for better presentation
    df1 = pd.DataFrame(dropdown_data, columns=["NCT ID", "Study Title"])
    
    # Show the table in the Streamlit app
    st.dataframe(df1)
    # Input method
    # option = st.radio(
    #     "Search by:",
    #     ("NCT ID", "Outcome or Criteria"),
    #     index=0,
    #     help="Choose how you want to search for similar trials."
    # )

    # if option == "NCT ID":
    #     nct_id = st.text_input("Enter NCT ID:", placeholder="e.g., NCT00385736")
    # else:
    #     criteria_text = st.text_area(
    #         "Enter Outcome or Criteria:",
    #         placeholder="e.g., A study evaluating the effects of drug X on Y patients..."
    #     )
    nct_id = st.text_input("Enter NCT ID:", placeholder="e.g., NCT00385736")

    top_n = st.slider("Number of similar trials to retrieve:", min_value=1, max_value=20, value=10)

    if st.button("Find Similar Trials"):
        # if option == "NCT ID" and nct_id:
        #     # Search by NCT ID
        #     nct_id_to_index = {nct_id: idx for idx, nct_id in enumerate(df["nct_id"])}
        #     if nct_id in nct_id_to_index:
        #         query_idx = nct_id_to_index[nct_id]
        #         query_embedding = embeddings[query_idx].unsqueeze(0).to(device)
        #     else:
        #         st.error(f"NCT ID {nct_id} not found in the dataset.")
        #         st.stop()
        # elif option == "Outcome or Criteria" and criteria_text:
        #     # Search by text
        #     query_embedding = torch.tensor(generate_single_embedding(criteria_text, tokenizer, model)).to(device)
        # else:
        #     st.error("Please provide a valid input.")
        #     st.stop()
        if nct_id:
            # Search by NCT ID
            nct_id_to_index = {nct_id: idx for idx, nct_id in enumerate(df["nct_id"])}
            if nct_id in nct_id_to_index:
                query_idx = nct_id_to_index[nct_id]
                query_embedding = embeddings[query_idx].unsqueeze(0).to(device)
            else:
                st.error(f"NCT ID {nct_id} not found in the dataset.")
                st.stop()

        # Get similar trials
        similar_indices, similarities = get_similar_trials(query_embedding, embeddings, top_n=top_n)
        similar_trials = df.iloc[similar_indices[0]].copy()
        similar_trials["Similarity Score"] = [
            similarities[0, idx] for idx in similar_indices[0]
        ]

        # Display results
        st.write("### Top Similar Clinical Trials:")
        st.dataframe(similar_trials[["nct_id", "Study Title", "Similarity Score"]])

        # Download as Excel
        output_file = "similar_trials_results.xlsx"
        similar_trials=similar_trials.drop(columns=['Combined_Text', 'Combined Column'])
        similar_trials.to_excel(output_file, index=False)
        with open(output_file, "rb") as f:
            st.download_button("Download Results as Excel", f, file_name="similar_trials_results.xlsx")


if __name__ == "__main__":
    main()