Gamortsey committed on
Commit
5916940
·
verified ·
1 Parent(s): ff81c49

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -134
app.py CHANGED
@@ -7,10 +7,6 @@ import phonenumbers
7
  import pandas as pd
8
  import urllib.parse
9
  from bs4 import BeautifulSoup
10
- import json
11
- from pathlib import Path
12
- import traceback
13
-
14
 
15
  import torch
16
  from transformers import (
@@ -109,96 +105,15 @@ def dedup_by_url(items):
109
  # ============================
110
  # SEARCH & SCRAPING
111
  # ============================
112
- CACHE_PATH = Path("tmp/google_cse_cache.json")
113
- CACHE_TTL = 60 * 60 * 24 # 24 hours cache; adjust as needed
114
- MAX_GOOGLE_RETRIES = 5
115
- MIN_SECONDS_BETWEEN_CALLS = 1.0 # throttle: 1 sec between Google calls to avoid bursts
116
-
117
- # load cache (simple file-based)
118
- def _load_cache():
119
- try:
120
- if CACHE_PATH.exists():
121
- return json.loads(CACHE_PATH.read_text(encoding="utf-8"))
122
- except Exception:
123
- pass
124
- return {}
125
-
126
- def _save_cache(cache):
127
- try:
128
- CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
129
- CACHE_PATH.write_text(json.dumps(cache), encoding="utf-8")
130
- except Exception:
131
- pass
132
-
133
- # track last call time so we can throttle
134
- _last_google_call = {"t": 0.0}
135
-
136
  def google_search(query, num_results=5):
137
- """
138
- Robust Google CSE caller with caching, exponential backoff on 429, and simple throttling.
139
- Returns list of {"title","link","snippet"}.
140
- """
141
- global _last_google_call
142
  if not API_KEY or not CX or "YOUR_GOOGLE_API_KEY" in API_KEY or "YOUR_CSE_ID" in CX:
143
  raise RuntimeError("Google API key and CSE ID must be set as environment variables.")
144
-
145
- cache = _load_cache()
146
- cache_key = f"gse::{query}::n{num_results}"
147
- now = time.time()
148
-
149
- # Check cache and TTL
150
- if cache_key in cache:
151
- entry = cache[cache_key]
152
- if now - entry.get("ts", 0) < CACHE_TTL:
153
- # cached
154
- return entry.get("items", [])
155
-
156
- # throttle to avoid bursts
157
- elapsed = now - _last_google_call["t"]
158
- if elapsed < MIN_SECONDS_BETWEEN_CALLS:
159
- time.sleep(MIN_SECONDS_BETWEEN_CALLS - elapsed)
160
-
161
  url = "https://www.googleapis.com/customsearch/v1"
162
- params = {"q": query, "key": API_KEY, "cx": CX, "num": num_results}
163
-
164
- backoff = 1.0
165
- for attempt in range(1, MAX_GOOGLE_RETRIES + 1):
166
- try:
167
- r = requests.get(url, params=params, timeout=15)
168
- _last_google_call["t"] = time.time()
169
- if r.status_code == 200:
170
- items = r.json().get("items", []) or []
171
- parsed = [{"title": i.get("title",""), "link": i.get("link",""), "snippet": i.get("snippet","")} for i in items]
172
- # write to cache
173
- cache[cache_key] = {"ts": time.time(), "items": parsed}
174
- _save_cache(cache)
175
- return parsed
176
- elif r.status_code == 429:
177
- # handle Retry-After if present
178
- ra = r.headers.get("Retry-After")
179
- wait = float(ra) if ra and ra.isdigit() else backoff
180
- print(f"[google_search] 429 -> sleeping {wait}s (attempt {attempt})")
181
- time.sleep(wait)
182
- backoff *= 2
183
- continue
184
- else:
185
- r.raise_for_status()
186
- except requests.HTTPError as e:
187
- print(f"[google_search HTTPError] {e} (attempt {attempt})")
188
- if attempt == MAX_GOOGLE_RETRIES:
189
- raise
190
- time.sleep(backoff)
191
- backoff *= 2
192
- except Exception as e:
193
- print(f"[google_search error] {e} (attempt {attempt})")
194
- if attempt == MAX_GOOGLE_RETRIES:
195
- raise
196
- time.sleep(backoff)
197
- backoff *= 2
198
-
199
- # If we exit loop without return, return empty list
200
- return []
201
- # ----------------- end google_search replacement -----------------
202
 
203
  def extract_phones(text, region="GH"):
204
  phones = []
@@ -430,23 +345,9 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
430
  region = get_region_for_country(country)
431
  queries, profs = build_queries(story, country)
432
 
433
- # Build queries earlier as you already do:
434
- queries, profs = build_queries(story, country)
435
-
436
- # Deduplicate and limit queries to e.g. top 3
437
- unique_queries = []
438
- seen_q = set()
439
- for q in queries:
440
- if q not in seen_q:
441
- seen_q.add(q)
442
- unique_queries.append(q)
443
- # limit number of queries to reduce CSE calls
444
- MAX_QUERIES_PER_STORY = 3
445
- unique_queries = unique_queries[:MAX_QUERIES_PER_STORY]
446
-
447
- # Search (serialized, cached, with error handling)
448
  search_results = []
449
- for q in unique_queries:
450
  try:
451
  items = google_search(q, num_results=results_per_query)
452
  for it in items:
@@ -455,6 +356,10 @@ def find_professionals_from_story(story, country=DEFAULT_COUNTRY, results_per_qu
455
  except Exception as e:
456
  print("[search error]", q, e)
457
 
 
 
 
 
458
 
459
  # NER on titles/snippets
460
  all_people, all_orgs, all_locs = [], [], []
@@ -581,34 +486,25 @@ def send_ally_ai_email(to_email, subject, body, user_email,
581
 
582
  def run_search(story, country):
583
  """
584
- Robust wrapper around find_professionals_from_story.
585
- Always returns: (summary:str, records:list[dict], options:list[str], anonymized_text:str)
586
  """
587
- # default safe return
588
- default_summary = "No results found (an error occurred or no matches)."
589
- default_records, default_options, default_anon = [], ["0 — No results (try again)"], "I am seeking confidential support."
590
-
591
  try:
592
  out = find_professionals_from_story(story, country=country, results_per_query=RESULTS_PER_QUERY)
593
- # If the function returned None or invalid type, replace with defaults
594
- if not isinstance(out, dict):
595
- raise ValueError("find_professionals_from_story returned non-dict or None")
596
  except Exception as e:
597
- # Log the full traceback so you can inspect logs in the Space
598
- print("[run_search] find_professionals_from_story error:", e)
599
- traceback.print_exc()
600
- out = {"summary": f"Search failed: {str(e)}", "professionals": [], "queries_used": []}
601
 
602
  pros = out.get("professionals", []) or []
603
 
604
- # Build table records safely
605
  try:
606
  records = pd.DataFrame(pros).to_dict(orient="records") if pros else []
607
- except Exception as e:
608
- print("[run_search] DataFrame conversion error:", e)
609
  records = []
610
 
611
- # Build dropdown options as plain list
612
  options = []
613
  for i, r in enumerate(pros):
614
  label_contact = r.get("email") if r.get("email") and r.get("email") != "Not found" else (r.get("phone", "No contact"))
@@ -617,33 +513,37 @@ def run_search(story, country):
617
  options.append(label)
618
 
619
  if not options:
620
- options = default_options
621
 
622
- # Anonymize safely (fallback if anonymizer fails)
623
  try:
624
- anon = anonymize_story(story) or default_anon
625
  except Exception as e:
626
- print("[run_search] anonymize_story error:", e)
627
- anon = default_anon
628
 
629
- summary = out.get("summary", default_summary)
630
  return summary, records, options, anon
631
 
632
 
633
  def _on_search(story, country):
634
  """
635
- Function wired to search_btn.click(...)
636
- Must return exactly these outputs:
637
  [summary_out, results_table, dropdown_sel, anon_out, message_in]
638
  """
639
  summary, records, options, anon = run_search(story, country)
640
 
641
- # Prefill message body (user email left blank for now)
642
  prefill = make_body(anon, story, True, "")
643
 
644
  # Return plain serializable values (not gr.update)
 
 
 
 
 
645
  return summary, records, options, anon, prefill
646
- # ---------- end replacements ----------
647
 
648
 
649
  def make_body(anon_text, full_story, use_anon, user_email):
@@ -798,4 +698,4 @@ with gr.Blocks() as demo:
798
  user_email_in, sender_email_in, sender_pass_in, logo_url_in],
799
  outputs=[status_out, mailto_html_out, eml_file_out])
800
 
801
- demo.launch(share=False)
 
7
  import pandas as pd
8
  import urllib.parse
9
  from bs4 import BeautifulSoup
 
 
 
 
10
 
11
  import torch
12
  from transformers import (
 
105
  # ============================
106
  # SEARCH & SCRAPING
107
  # ============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  def google_search(query, num_results=5):
 
 
 
 
 
109
  if not API_KEY or not CX or "YOUR_GOOGLE_API_KEY" in API_KEY or "YOUR_CSE_ID" in CX:
110
  raise RuntimeError("Google API key and CSE ID must be set as environment variables.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  url = "https://www.googleapis.com/customsearch/v1"
112
+ params = {"q":query, "key":API_KEY, "cx":CX, "num":num_results}
113
+ r = requests.get(url, params=params, timeout=20)
114
+ r.raise_for_status()
115
+ items = r.json().get("items", []) or []
116
+ return [{"title":i.get("title",""), "link":i.get("link",""), "snippet":i.get("snippet","")} for i in items]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  def extract_phones(text, region="GH"):
119
  phones = []
 
345
  region = get_region_for_country(country)
346
  queries, profs = build_queries(story, country)
347
 
348
+ # Search
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  search_results = []
350
+ for q in queries:
351
  try:
352
  items = google_search(q, num_results=results_per_query)
353
  for it in items:
 
356
  except Exception as e:
357
  print("[search error]", q, e)
358
 
359
+ search_results = dedup_by_url(search_results)
360
+ if not search_results:
361
+ return {"summary":"No results found. Try a different country or wording.",
362
+ "professionals":[], "queries_used":queries}
363
 
364
  # NER on titles/snippets
365
  all_people, all_orgs, all_locs = [], [], []
 
486
 
487
  def run_search(story, country):
488
  """
489
+ Robust search wrapper: returns (summary, table_records, dropdown_options, anonymized_text).
490
+ Avoids returning gr.update(...) to prevent KeyError during serialization.
491
  """
 
 
 
 
492
  try:
493
  out = find_professionals_from_story(story, country=country, results_per_query=RESULTS_PER_QUERY)
 
 
 
494
  except Exception as e:
495
+ err_msg = f"Search failed: {e}"
496
+ placeholder = ["0 — No results (search failed)"]
497
+ return err_msg, [], placeholder, ""
 
498
 
499
  pros = out.get("professionals", []) or []
500
 
501
+ # build table records
502
  try:
503
  records = pd.DataFrame(pros).to_dict(orient="records") if pros else []
504
+ except Exception:
 
505
  records = []
506
 
507
+ # build dropdown options as list of strings (guarantee at least one)
508
  options = []
509
  for i, r in enumerate(pros):
510
  label_contact = r.get("email") if r.get("email") and r.get("email") != "Not found" else (r.get("phone", "No contact"))
 
513
  options.append(label)
514
 
515
  if not options:
516
+ options = ["0 — No results (try a different country/query)"]
517
 
518
+ # anonymize safely
519
  try:
520
+ anon = anonymize_story(story) or "I am seeking confidential support regarding gender-based violence."
521
  except Exception as e:
522
+ print("[anonymize error]", e)
523
+ anon = "I am seeking confidential support regarding gender-based violence."
524
 
525
+ summary = out.get("summary", "No results found.")
526
  return summary, records, options, anon
527
 
528
 
529
  def _on_search(story, country):
530
  """
531
+ Function wired to the search button.
532
+ Returns exactly 5 outputs to match:
533
  [summary_out, results_table, dropdown_sel, anon_out, message_in]
534
  """
535
  summary, records, options, anon = run_search(story, country)
536
 
537
+ # pre-fill message body with anonymized text (user email left empty for now)
538
  prefill = make_body(anon, story, True, "")
539
 
540
  # Return plain serializable values (not gr.update)
541
+ # summary -> str
542
+ # records -> list[dict] (or [])
543
+ # options -> list[str] for dropdown (Gradio will accept it)
544
+ # anon -> str
545
+ # prefill -> str (message body)
546
  return summary, records, options, anon, prefill
 
547
 
548
 
549
  def make_body(anon_text, full_story, use_anon, user_email):
 
698
  user_email_in, sender_email_in, sender_pass_in, logo_url_in],
699
  outputs=[status_out, mailto_html_out, eml_file_out])
700
 
701
+ demo.launch(share=False)