presidio-de-identify

Sleeping

App Files Files Community

awacke1 committed on Apr 14

Commit

35c70df

verified ·

1 Parent(s): 75fec96

Create app.py

Browse files

Files changed (1) hide show

app.py +167 -0

app.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import base64
+import datetime
+import html
+import io
+import logging
+import os
+
+import dotenv
+import pandas as pd
+import streamlit as st
+import streamlit.components.v1 as components
+from annotated_text import annotated_text
+from streamlit_tags import st_tags
+from PyPDF2 import PdfReader, PdfWriter
+
+from presidio_helpers import (
+    get_supported_entities,
+    analyze,
+    anonymize,
+    annotate,
+    analyzer_engine,
+)
+# Page-level options; this is the first Streamlit call the script makes.
+_page_options = {
+    "page_title": "Presidio PHI De-identification",
+    "layout": "wide",
+    "initial_sidebar_state": "expanded",
+    "menu_items": {"About": "https://microsoft.github.io/presidio/"},
+}
+st.set_page_config(**_page_options)
+# Load settings through python-dotenv before anything reads the environment.
+dotenv.load_dotenv()
+# Named logger for this app.
+logger = logging.getLogger("presidio-streamlit")
+# Sidebar: model choice, de-identification operator, threshold and word lists.
+st.sidebar.header("PHI De-identification with Presidio")
+model_help_text = "Select Named Entity Recognition (NER) model for PHI detection."
+# (display name, model card URL) pairs; the display name is "<package>/<model path>".
+model_list = [
+    ("spaCy/en_core_web_lg", "https://huggingface.co/spacy/en_core_web_lg"),
+    ("HuggingFace/obi/deid_roberta_i2b2", "https://huggingface.co/obi/deid_roberta_i2b2"),
+    ("flair/ner-english-large", "https://huggingface.co/flair/ner-english-large"),
+    ("HuggingFace/StanfordAIMI/stanford-deidentifier-base", "https://huggingface.co/StanfordAIMI/stanford-deidentifier-base"),
+]
+model_names = [name for name, _ in model_list]
+st_model = st.sidebar.selectbox(
+    "NER model package",
+    model_names,
+    index=1,
+    help=model_help_text,
+)
+# Link to the model card of whichever entry is selected.
+selected_model_url = dict(model_list)[st_model]
+st.sidebar.markdown(f"[View model on HuggingFace]({selected_model_url})")
+# Split the display name into its package prefix and the remaining model path.
+st_model_package, _, model_path = st_model.partition("/")
+if st_model_package.lower() in ("spacy", "huggingface"):
+    # For these two packages only the part after the prefix is passed on.
+    st_model = model_path
+analyzer_params = (st_model_package, st_model, "", "")
+st.sidebar.warning("Note: Models might take some time to download.")
+st_operator = st.sidebar.selectbox(
+    "De-identification approach",
+    ["replace", "redact", "mask"],
+    index=0,
+    help="Select PHI manipulation method.",
+)
+# Forwarded to analyze() as score_threshold further down the script.
+st_threshold = st.sidebar.slider(
+    label="Acceptance threshold",
+    min_value=0.0,
+    max_value=1.0,
+    value=0.35,
+)
+st_return_decision_process = st.sidebar.checkbox(
+    "Add analysis explanations",
+    value=False,
+)
+# Optional allow and deny word lists, collapsed by default.
+with st.sidebar.expander("Allowlists and denylists", expanded=False):
+    st_allow_list = st_tags(label="Add words to allowlist", text="Enter word and press enter.")
+    st_deny_list = st_tags(label="Add words to denylist", text="Enter word and press enter.")
+# Main panel: PDF upload and download link on the left, findings table on the right.
+col1, col2 = st.columns(2)
+with col1:
+    st.subheader("Input")
+    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
+    if uploaded_file:
+        # Read PDF: one newline-terminated chunk of text per page.
+        # `or ""` guards against extract_text() returning None for a page
+        # (NOTE(review): reported for some PyPDF2 versions/pages - confirm).
+        pdf_reader = PdfReader(uploaded_file)
+        text = "".join(f"{page.extract_text() or ''}\n" for page in pdf_reader.pages)
+        # Analyze
+        # NOTE(review): the engine returned here was never used by this script;
+        # the call is kept in case presidio_helpers relies on it to warm a
+        # cached engine - confirm, otherwise it can be dropped.
+        analyzer_engine(*analyzer_params)
+        st_analyze_results = analyze(
+            *analyzer_params,
+            text=text,
+            entities=get_supported_entities(*analyzer_params),
+            language="en",
+            score_threshold=st_threshold,
+            return_decision_process=st_return_decision_process,
+            allow_list=st_allow_list,
+            deny_list=st_deny_list,
+        )
+        # Process results; sorted so the message is stable between reruns.
+        phi_types = sorted({res.entity_type for res in st_analyze_results})
+        if phi_types:
+            # Says "Detected", not "Removed": nothing below edits the PDF content.
+            st.success(f"Detected PHI types: {', '.join(phi_types)}")
+        else:
+            st.info("No PHI detected")
+        # Anonymize
+        anonymized_result = anonymize(
+            text=text,
+            operator=st_operator,
+            analyze_results=st_analyze_results,
+        )
+        # NOTE(review): anonymized_result is never written into the output PDF.
+        # The pages added below are the uploaded pages, unchanged, so the
+        # download still contains every detected entity. Producing a truly
+        # de-identified PDF needs a redaction/rendering step this script lacks.
+        pdf_writer = PdfWriter()
+        for page in pdf_reader.pages:
+            pdf_writer.add_page(page)
+        st.warning(
+            "The downloadable PDF is a page-for-page copy of the upload; "
+            "detected PHI has not been removed from it."
+        )
+        # Generate output filename with timestamp
+        timestamp = datetime.datetime.now().strftime("%I%M%p_%d-%m-%y")
+        output_filename = f"{timestamp}_{uploaded_file.name}"
+        # Serialize in memory rather than to the working directory, so uploaded
+        # documents are not left behind on the server's disk.
+        pdf_buffer = io.BytesIO()
+        pdf_writer.write(pdf_buffer)
+        # Generate base64 download link. The file name comes from the upload and
+        # is placed inside raw HTML, so it is escaped first.
+        b64 = base64.b64encode(pdf_buffer.getvalue()).decode()
+        safe_filename = html.escape(output_filename, quote=True)
+        href = f'<a href="data:application/pdf;base64,{b64}" download="{safe_filename}">Download PDF copy</a>'
+        st.markdown(href, unsafe_allow_html=True)
+        # Display findings
+        with col2:
+            st.subheader("Findings")
+            if st_analyze_results:
+                df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
+                # Matched substring for each result, sliced out of the extracted text.
+                df["text"] = [text[res.start:res.end] for res in st_analyze_results]
+                df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
+                    {
+                        "entity_type": "Entity type",
+                        "text": "Text",
+                        "start": "Start",
+                        "end": "End",
+                        "score": "Confidence",
+                    },
+                    axis=1,
+                )
+                if st_return_decision_process:
+                    # Append the explanation columns side by side with the findings.
+                    analysis_explanation_df = pd.DataFrame.from_records(
+                        [r.analysis_explanation.to_dict() for r in st_analyze_results]
+                    )
+                    df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
+                st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
+            else:
+                st.text("No findings")