Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,10 +7,6 @@ import phonenumbers
|
|
| 7 |
import pandas as pd
|
| 8 |
import urllib.parse
|
| 9 |
from bs4 import BeautifulSoup
|
| 10 |
-
import json
|
| 11 |
-
from pathlib import Path
|
| 12 |
-
import traceback
|
| 13 |
-
|
| 14 |
|
| 15 |
import torch
|
| 16 |
from transformers import (
|
|
@@ -109,96 +105,15 @@ def dedup_by_url(items):
|
|
| 109 |
# ============================
|
| 110 |
# SEARCH & SCRAPING
|
| 111 |
# ============================
|
| 112 |
-
CACHE_PATH = Path("tmp/google_cse_cache.json")
|
| 113 |
-
CACHE_TTL = 60 * 60 * 24 # 24 hours cache; adjust as needed
|
| 114 |
-
MAX_GOOGLE_RETRIES = 5
|
| 115 |
-
MIN_SECONDS_BETWEEN_CALLS = 1.0 # throttle: 1 sec between Google calls to avoid bursts
|
| 116 |
-
|
| 117 |
-
# load cache (simple file-based)
|
| 118 |
-
def _load_cache():
|
| 119 |
-
try:
|
| 120 |
-
if CACHE_PATH.exists():
|
| 121 |
-
return json.loads(CACHE_PATH.read_text(encoding="utf-8"))
|
| 122 |
-
except Exception:
|
| 123 |
-
pass
|
| 124 |
-
return {}
|
| 125 |
-
|
| 126 |
-
def _save_cache(cache):
|
| 127 |
-
try:
|
| 128 |
-
CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
|
| 129 |
-
CACHE_PATH.write_text(json.dumps(cache), encoding="utf-8")
|
| 130 |
-
except Exception:
|
| 131 |
-
pass
|
| 132 |
-
|
| 133 |
-
# track last call time so we can throttle
|
| 134 |
-
_last_google_call = {"t": 0.0}
|
| 135 |
-
|
| 136 |
def google_search(query, num_results=5):
|
| 137 |
-
"""
|
| 138 |
-
Robust Google CSE caller with caching, exponential backoff on 429, and simple throttling.
|
| 139 |
-
Returns list of {"title","link","snippet"}.
|
| 140 |
-
"""
|
| 141 |
-
global _last_google_call
|
| 142 |
if not API_KEY or not CX or "YOUR_GOOGLE_API_KEY" in API_KEY or "YOUR_CSE_ID" in CX:
|
| 143 |
raise RuntimeError("Google API key and CSE ID must be set as environment variables.")
|
| 144 |
-
|
| 145 |
-
cache = _load_cache()
|
| 146 |
-
cache_key = f"gse::{query}::n{num_results}"
|
| 147 |
-
now = time.time()
|
| 148 |
-
|
| 149 |
-
# Check cache and TTL
|
| 150 |
-
if cache_key in cache:
|
| 151 |
-
entry = cache[cache_key]
|
| 152 |
-
if now - entry.get("ts", 0) < CACHE_TTL:
|
| 153 |
-
# cached
|
| 154 |
-
return entry.get("items", [])
|
| 155 |
-
|
| 156 |
-
# throttle to avoid bursts
|
| 157 |
-
elapsed = now - _last_google_call["t"]
|
| 158 |
-
if elapsed < MIN_SECONDS_BETWEEN_CALLS:
|
| 159 |
-
time.sleep(MIN_SECONDS_BETWEEN_CALLS - elapsed)
|
| 160 |
-
|
| 161 |
url = "https://www.googleapis.com/customsearch/v1"
|
| 162 |
-
params = {"q":
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
r = requests.get(url, params=params, timeout=15)
|
| 168 |
-
_last_google_call["t"] = time.time()
|
| 169 |
-
if r.status_code == 200:
|
| 170 |
-
items = r.json().get("items", []) or []
|
| 171 |
-
parsed = [{"title": i.get("title",""), "link": i.get("link",""), "snippet": i.get("snippet","")} for i in items]
|
| 172 |
-
# write to cache
|
| 173 |
-
cache[cache_key] = {"ts": time.time(), "items": parsed}
|
| 174 |
-
_save_cache(cache)
|
| 175 |
-
return parsed
|
| 176 |
-
elif r.status_code == 429:
|
| 177 |
-
# handle Retry-After if present
|
| 178 |
-
ra = r.headers.get("Retry-After")
|
| 179 |
-
wait = float(ra) if ra and ra.isdigit() else backoff
|
| 180 |
-
print(f"[google_search] 429 -> sleeping {wait}s (attempt {attempt})")
|
| 181 |
-
time.sleep(wait)
|
| 182 |
-
backoff *= 2
|
| 183 |
-
continue
|
| 184 |
-
else:
|
| 185 |
-
r.raise_for_status()
|
| 186 |
-
except requests.HTTPError as e:
|
| 187 |
-
print(f"[google_search HTTPError] {e} (attempt {attempt})")
|
| 188 |
-
if attempt == MAX_GOOGLE_RETRIES:
|
| 189 |
-
raise
|
| 190 |
-
time.sleep(backoff)
|
| 191 |
-
backoff *= 2
|
| 192 |
-
except Exception as e:
|
| 193 |
-
print(f"[google_search error] {e} (attempt {attempt})")
|
| 194 |
-
if attempt == MAX_GOOGLE_RETRIES:
|
| 195 |
-
raise
|
| 196 |
-
time.sleep(backoff)
|
| 197 |
-
backoff *= 2
|
| 198 |
-
|
| 199 |
-
# If we exit loop without return, return empty list
|
| 200 |
-
return []
|
| 201 |
-
# ----------------- end google_search replacement -----------------
|
| 202 |
|
| 203 |
def extract_phones(text, region="GH"):
|
| 204 |
phones = []
|
|
@@ -430,23 +345,9 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
|
|
| 430 |
region = get_region_for_country(country)
|
| 431 |
queries, profs = build_queries(story, country)
|
| 432 |
|
| 433 |
-
|
| 434 |
-
queries, profs = build_queries(story, country)
|
| 435 |
-
|
| 436 |
-
# Deduplicate and limit queries to e.g. top 3
|
| 437 |
-
unique_queries = []
|
| 438 |
-
seen_q = set()
|
| 439 |
-
for q in queries:
|
| 440 |
-
if q not in seen_q:
|
| 441 |
-
seen_q.add(q)
|
| 442 |
-
unique_queries.append(q)
|
| 443 |
-
# limit number of queries to reduce CSE calls
|
| 444 |
-
MAX_QUERIES_PER_STORY = 3
|
| 445 |
-
unique_queries = unique_queries[:MAX_QUERIES_PER_STORY]
|
| 446 |
-
|
| 447 |
-
# Search (serialized, cached, with error handling)
|
| 448 |
search_results = []
|
| 449 |
-
for q in unique_queries:
|
| 450 |
try:
|
| 451 |
items = google_search(q, num_results=results_per_query)
|
| 452 |
for it in items:
|
|
@@ -455,6 +356,10 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
|
|
| 455 |
except Exception as e:
|
| 456 |
print("[search error]", q, e)
|
| 457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
|
| 459 |
# NER on titles/snippets
|
| 460 |
all_people, all_orgs, all_locs = [], [], []
|
|
@@ -581,34 +486,25 @@ def send_ally_ai_email(to_email, subject, body, user_email,
|
|
| 581 |
|
| 582 |
def run_search(story, country):
|
| 583 |
"""
|
| 584 |
-
Robust wrapper
|
| 585 |
-
|
| 586 |
"""
|
| 587 |
-
# default safe return
|
| 588 |
-
default_summary = "No results found (an error occurred or no matches)."
|
| 589 |
-
default_records, default_options, default_anon = [], ["0 — No results (try again)"], "I am seeking confidential support."
|
| 590 |
-
|
| 591 |
try:
|
| 592 |
out = find_professionals_from_story(story, country=country, results_per_query=RESULTS_PER_QUERY)
|
| 593 |
-
# If the function returned None or invalid type, replace with defaults
|
| 594 |
-
if not isinstance(out, dict):
|
| 595 |
-
raise ValueError("find_professionals_from_story returned non-dict or None")
|
| 596 |
except Exception as e:
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
out = {"summary": f"Search failed: {str(e)}", "professionals": [], "queries_used": []}
|
| 601 |
|
| 602 |
pros = out.get("professionals", []) or []
|
| 603 |
|
| 604 |
-
#
|
| 605 |
try:
|
| 606 |
records = pd.DataFrame(pros).to_dict(orient="records") if pros else []
|
| 607 |
-
except Exception as e:
|
| 608 |
-
print("[run_search] DataFrame conversion error:", e)
|
| 609 |
records = []
|
| 610 |
|
| 611 |
-
#
|
| 612 |
options = []
|
| 613 |
for i, r in enumerate(pros):
|
| 614 |
label_contact = r.get("email") if r.get("email") and r.get("email") != "Not found" else (r.get("phone", "No contact"))
|
|
@@ -617,33 +513,37 @@ def run_search(story, country):
|
|
| 617 |
options.append(label)
|
| 618 |
|
| 619 |
if not options:
|
| 620 |
-
options = default_options
|
| 621 |
|
| 622 |
-
#
|
| 623 |
try:
|
| 624 |
-
anon = anonymize_story(story) or default_anon
|
| 625 |
except Exception as e:
|
| 626 |
-
print("[
|
| 627 |
-
anon = default_anon
|
| 628 |
|
| 629 |
-
summary = out.get("summary", default_summary)
|
| 630 |
return summary, records, options, anon
|
| 631 |
|
| 632 |
|
| 633 |
def _on_search(story, country):
|
| 634 |
"""
|
| 635 |
-
Function wired to
|
| 636 |
-
|
| 637 |
[summary_out, results_table, dropdown_sel, anon_out, message_in]
|
| 638 |
"""
|
| 639 |
summary, records, options, anon = run_search(story, country)
|
| 640 |
|
| 641 |
-
#
|
| 642 |
prefill = make_body(anon, story, True, "")
|
| 643 |
|
| 644 |
# Return plain serializable values (not gr.update)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
return summary, records, options, anon, prefill
|
| 646 |
-
# ---------- end replacements ----------
|
| 647 |
|
| 648 |
|
| 649 |
def make_body(anon_text, full_story, use_anon, user_email):
|
|
@@ -798,4 +698,4 @@ with gr.Blocks() as demo:
|
|
| 798 |
user_email_in, sender_email_in, sender_pass_in, logo_url_in],
|
| 799 |
outputs=[status_out, mailto_html_out, eml_file_out])
|
| 800 |
|
| 801 |
-
demo.launch(share=False)
|
|
|
|
| 7 |
import pandas as pd
|
| 8 |
import urllib.parse
|
| 9 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
import torch
|
| 12 |
from transformers import (
|
|
|
|
| 105 |
# ============================
|
| 106 |
# SEARCH & SCRAPING
|
| 107 |
# ============================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
def google_search(query, num_results=5):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
if not API_KEY or not CX or "YOUR_GOOGLE_API_KEY" in API_KEY or "YOUR_CSE_ID" in CX:
|
| 110 |
raise RuntimeError("Google API key and CSE ID must be set as environment variables.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
url = "https://www.googleapis.com/customsearch/v1"
|
| 112 |
+
params = {"q":query, "key":API_KEY, "cx":CX, "num":num_results}
|
| 113 |
+
r = requests.get(url, params=params, timeout=20)
|
| 114 |
+
r.raise_for_status()
|
| 115 |
+
items = r.json().get("items", []) or []
|
| 116 |
+
return [{"title":i.get("title",""), "link":i.get("link",""), "snippet":i.get("snippet","")} for i in items]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
def extract_phones(text, region="GH"):
|
| 119 |
phones = []
|
|
|
|
| 345 |
region = get_region_for_country(country)
|
| 346 |
queries, profs = build_queries(story, country)
|
| 347 |
|
| 348 |
+
# Search
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
search_results = []
|
| 350 |
+
for q in queries:
|
| 351 |
try:
|
| 352 |
items = google_search(q, num_results=results_per_query)
|
| 353 |
for it in items:
|
|
|
|
| 356 |
except Exception as e:
|
| 357 |
print("[search error]", q, e)
|
| 358 |
|
| 359 |
+
search_results = dedup_by_url(search_results)
|
| 360 |
+
if not search_results:
|
| 361 |
+
return {"summary":"No results found. Try a different country or wording.",
|
| 362 |
+
"professionals":[], "queries_used":queries}
|
| 363 |
|
| 364 |
# NER on titles/snippets
|
| 365 |
all_people, all_orgs, all_locs = [], [], []
|
|
|
|
| 486 |
|
| 487 |
def run_search(story, country):
|
| 488 |
"""
|
| 489 |
+
Robust search wrapper: returns (summary, table_records, dropdown_options, anonymized_text).
|
| 490 |
+
Avoids returning gr.update(...) to prevent KeyError during serialization.
|
| 491 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
try:
|
| 493 |
out = find_professionals_from_story(story, country=country, results_per_query=RESULTS_PER_QUERY)
|
|
|
|
|
|
|
|
|
|
| 494 |
except Exception as e:
|
| 495 |
+
err_msg = f"Search failed: {e}"
|
| 496 |
+
placeholder = ["0 — No results (search failed)"]
|
| 497 |
+
return err_msg, [], placeholder, ""
|
|
|
|
| 498 |
|
| 499 |
pros = out.get("professionals", []) or []
|
| 500 |
|
| 501 |
+
# build table records
|
| 502 |
try:
|
| 503 |
records = pd.DataFrame(pros).to_dict(orient="records") if pros else []
|
| 504 |
+
except Exception:
|
|
|
|
| 505 |
records = []
|
| 506 |
|
| 507 |
+
# build dropdown options as list of strings (guarantee at least one)
|
| 508 |
options = []
|
| 509 |
for i, r in enumerate(pros):
|
| 510 |
label_contact = r.get("email") if r.get("email") and r.get("email") != "Not found" else (r.get("phone", "No contact"))
|
|
|
|
| 513 |
options.append(label)
|
| 514 |
|
| 515 |
if not options:
|
| 516 |
+
options = ["0 — No results (try a different country/query)"]
|
| 517 |
|
| 518 |
+
# anonymize safely
|
| 519 |
try:
|
| 520 |
+
anon = anonymize_story(story) or "I am seeking confidential support regarding gender-based violence."
|
| 521 |
except Exception as e:
|
| 522 |
+
print("[anonymize error]", e)
|
| 523 |
+
anon = "I am seeking confidential support regarding gender-based violence."
|
| 524 |
|
| 525 |
+
summary = out.get("summary", "No results found.")
|
| 526 |
return summary, records, options, anon
|
| 527 |
|
| 528 |
|
| 529 |
def _on_search(story, country):
|
| 530 |
"""
|
| 531 |
+
Function wired to the search button.
|
| 532 |
+
Returns exactly 5 outputs to match:
|
| 533 |
[summary_out, results_table, dropdown_sel, anon_out, message_in]
|
| 534 |
"""
|
| 535 |
summary, records, options, anon = run_search(story, country)
|
| 536 |
|
| 537 |
+
# pre-fill message body with anonymized text (user email left empty for now)
|
| 538 |
prefill = make_body(anon, story, True, "")
|
| 539 |
|
| 540 |
# Return plain serializable values (not gr.update)
|
| 541 |
+
# summary -> str
|
| 542 |
+
# records -> list[dict] (or [])
|
| 543 |
+
# options -> list[str] for dropdown (Gradio will accept it)
|
| 544 |
+
# anon -> str
|
| 545 |
+
# prefill -> str (message body)
|
| 546 |
return summary, records, options, anon, prefill
|
|
|
|
| 547 |
|
| 548 |
|
| 549 |
def make_body(anon_text, full_story, use_anon, user_email):
|
|
|
|
| 698 |
user_email_in, sender_email_in, sender_pass_in, logo_url_in],
|
| 699 |
outputs=[status_out, mailto_html_out, eml_file_out])
|
| 700 |
|
| 701 |
+
demo.launch(share=False)
|