Spaces:
Runtime error
Runtime error
Update app.py
#1
by
lhoestq
HF Staff
- opened
- analyze.py +3 -5
- app.py +11 -4
analyze.py
CHANGED
|
@@ -11,7 +11,8 @@ Row = dict[str, Any]
|
|
| 11 |
T = TypeVar("T")
|
| 12 |
BATCH_SIZE = 1
|
| 13 |
MAX_TEXT_LENGTH = 500
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
class PresidioEntity(TypedDict):
|
|
@@ -121,16 +122,13 @@ def analyze(
|
|
| 121 |
def presidio_scan_entities(
|
| 122 |
rows: Iterable[Row], scanned_columns: list[str], columns_descriptions: list[str]
|
| 123 |
) -> Iterable[PresidioEntity]:
|
| 124 |
-
global batch_analyzer
|
| 125 |
cache: dict[str, list[RecognizerResult]] = {}
|
| 126 |
-
if batch_analyzer is None:
|
| 127 |
-
batch_analyser = BatchAnalyzerEngine(AnalyzerEngine())
|
| 128 |
rows_with_scanned_columns_only = (
|
| 129 |
{column_name: get_strings(row[column_name])[:MAX_TEXT_LENGTH] for column_name in scanned_columns} for row in rows
|
| 130 |
)
|
| 131 |
for indices, batch in batched(rows_with_scanned_columns_only, BATCH_SIZE, with_indices=True):
|
| 132 |
yield from analyze(
|
| 133 |
-
batch_analyzer=
|
| 134 |
batch=batch,
|
| 135 |
indices=indices,
|
| 136 |
scanned_columns=scanned_columns,
|
|
|
|
| 11 |
T = TypeVar("T")
|
| 12 |
BATCH_SIZE = 1
|
| 13 |
MAX_TEXT_LENGTH = 500
|
| 14 |
+
analyzer = AnalyzerEngine()
|
| 15 |
+
batch_analyzer = BatchAnalyzerEngine(analyzer)
|
| 16 |
|
| 17 |
|
| 18 |
class PresidioEntity(TypedDict):
|
|
|
|
| 122 |
def presidio_scan_entities(
|
| 123 |
rows: Iterable[Row], scanned_columns: list[str], columns_descriptions: list[str]
|
| 124 |
) -> Iterable[PresidioEntity]:
|
|
|
|
| 125 |
cache: dict[str, list[RecognizerResult]] = {}
|
|
|
|
|
|
|
| 126 |
rows_with_scanned_columns_only = (
|
| 127 |
{column_name: get_strings(row[column_name])[:MAX_TEXT_LENGTH] for column_name in scanned_columns} for row in rows
|
| 128 |
)
|
| 129 |
for indices, batch in batched(rows_with_scanned_columns_only, BATCH_SIZE, with_indices=True):
|
| 130 |
yield from analyze(
|
| 131 |
+
batch_analyzer=batch_analyzer,
|
| 132 |
batch=batch,
|
| 133 |
indices=indices,
|
| 134 |
scanned_columns=scanned_columns,
|
app.py
CHANGED
|
@@ -7,7 +7,7 @@ import pandas as pd
|
|
| 7 |
from datasets import Features
|
| 8 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
| 9 |
|
| 10 |
-
from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
|
| 11 |
|
| 12 |
MAX_ROWS = 100
|
| 13 |
T = TypeVar("T")
|
|
@@ -34,7 +34,7 @@ class track_iter:
|
|
| 34 |
self.next_idx += 1
|
| 35 |
yield item
|
| 36 |
|
| 37 |
-
def analyze_dataset(dataset: str) -> pd.DataFrame:
|
| 38 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
| 39 |
if "error" in info_resp:
|
| 40 |
yield "❌ " + info_resp["error"], pd.DataFrame()
|
|
@@ -52,8 +52,9 @@ def analyze_dataset(dataset: str) -> pd.DataFrame:
|
|
| 52 |
for presidio_entity in presidio_scan_entities(
|
| 53 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
| 54 |
):
|
| 55 |
-
|
| 56 |
-
|
|
|
|
| 57 |
yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
| 58 |
|
| 59 |
with gr.Blocks() as demo:
|
|
@@ -65,6 +66,12 @@ with gr.Blocks() as demo:
|
|
| 65 |
placeholder="Search for dataset id on Huggingface",
|
| 66 |
search_type="dataset",
|
| 67 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
]
|
| 69 |
button = gr.Button("Run Presidio Scan")
|
| 70 |
outputs = [
|
|
|
|
| 7 |
from datasets import Features
|
| 8 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
| 9 |
|
| 10 |
+
from analyze import analyzer, get_column_description, get_columns_with_strings, presidio_scan_entities
|
| 11 |
|
| 12 |
MAX_ROWS = 100
|
| 13 |
T = TypeVar("T")
|
|
|
|
| 34 |
self.next_idx += 1
|
| 35 |
yield item
|
| 36 |
|
| 37 |
+
def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFrame:
|
| 38 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
| 39 |
if "error" in info_resp:
|
| 40 |
yield "❌ " + info_resp["error"], pd.DataFrame()
|
|
|
|
| 52 |
for presidio_entity in presidio_scan_entities(
|
| 53 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
| 54 |
):
|
| 55 |
+
if presidio_entity.type in enabled_presidio_entities:
|
| 56 |
+
presidio_entities.append(presidio_entity)
|
| 57 |
+
yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
| 58 |
yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
| 59 |
|
| 60 |
with gr.Blocks() as demo:
|
|
|
|
| 66 |
placeholder="Search for dataset id on Huggingface",
|
| 67 |
search_type="dataset",
|
| 68 |
),
|
| 69 |
+
gr.CheckBoxGroup(
|
| 70 |
+
label="Presidio entities",
|
| 71 |
+
choices=analyzer.get_supported_entities(),
|
| 72 |
+
value=["PERSON", "CREDIT_CARD", "US_SSN", "PHONE_NUMBER", "EMAIL_ADDRESS", "IP_ADDRESS", "US_BANK_NUMBER", "EMAIL", "IBAN_CODE"],
|
| 73 |
+
interative=True,
|
| 74 |
+
),
|
| 75 |
]
|
| 76 |
button = gr.Button("Run Presidio Scan")
|
| 77 |
outputs = [
|