Mohammedmarzuk17 committed (verified)
Commit ce545c8 · Parent(s): 12551e0

Update app.py

Files changed (1): app.py (+27 -38)
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
 from transformers import pipeline
 import requests, re, datetime
 from concurrent.futures import ThreadPoolExecutor
-from sentence_transformers import SentenceTransformer, util
 
 # ---------------------------
 # Load Models
@@ -16,14 +15,11 @@ claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]
 ai_detect_model_name = "roberta-base-openai-detector"
 ai_detector = pipeline("text-classification", model=ai_detect_model_name, device=-1)
 
-# Semantic similarity model
-sem_model = SentenceTransformer('all-MiniLM-L6-v2')
-
 # ---------------------------
 # Google Search Config
 # ---------------------------
-GOOGLE_API_KEY = "YOUR_GOOGLE_API_KEY"
-GOOGLE_CX = "YOUR_GOOGLE_CX"
+GOOGLE_API_KEY = "AIzaSyAC56onKwR17zd_djUPEfGXQACy9qRjDxw"
+GOOGLE_CX = "87391aed073954cae"
 
 google_quota = {"count": 0, "date": datetime.date.today()}
 GOOGLE_DAILY_LIMIT = 100
@@ -42,9 +38,15 @@ def safe_split_text(text):
 # ---------------------------
 # Claim Extraction
 # ---------------------------
-def extract_claims(page_text, max_claims=20):
+def extract_claims(page_text, max_claims=20, batch_size=50):
+    """
+    Extract top claims from text:
+    - Uses safe_split_text for splitting.
+    - Classifies each piece into factual claim, opinion, or anecdote.
+    """
     sentences = safe_split_text(page_text)
 
+    # Step 1: Function to classify a single sentence
     def classify_sentence(s):
         out = claim_classifier(s, claim_labels)
         label_priority = ["factual claim", "opinion", "personal anecdote"]
@@ -53,12 +55,14 @@ def extract_claims(page_text, max_claims=20):
             return {"text": s, "label": lbl, "score": round(out["scores"][out["labels"].index(lbl)], 3)}
         return None
 
+    # Step 2: Threaded classification
     results = []
     with ThreadPoolExecutor() as executor:
         for r in executor.map(classify_sentence, sentences):
             if r:
                 results.append(r)
 
+    # Step 3: Limit top claims
     results = sorted(results, key=lambda x: -len(x["text"]))[:max_claims]
     return results
 
@@ -66,6 +70,7 @@ def extract_claims(page_text, max_claims=20):
 # AI Text Detection
 # ---------------------------
 def detect_ai(texts):
+    """Detect whether input text is AI-generated or human-written."""
     if isinstance(texts, str):
         texts = [texts]
     results = []
@@ -79,56 +84,40 @@ def detect_ai(texts):
 # ---------------------------
 # Google Evidence Gathering
 # ---------------------------
-def fetch_google_search(claim, num_results=10):
-    """
-    Returns top 3 keyword results as before
-    and top 3 semantic results
-    """
+def fetch_google_search(claim):
     global google_quota
     today = datetime.date.today()
     if google_quota["date"] != today:
         google_quota = {"count": 0, "date": today}
 
     if google_quota["count"] >= GOOGLE_DAILY_LIMIT:
-        return {
-            "keyword_results": ["[Google] Daily quota reached (100 queries)."],
-            "semantic_results": ["[Google] Daily quota reached (100 queries)."]
-        }
+        return ["[Google] Daily quota reached (100 queries)."]
 
     try:
-        url = f"https://www.googleapis.com/customsearch/v1?q={requests.utils.quote(claim)}&key={GOOGLE_API_KEY}&cx={GOOGLE_CX}&num={num_results}"
+        url = f"https://www.googleapis.com/customsearch/v1?q={requests.utils.quote(claim)}&key={GOOGLE_API_KEY}&cx={GOOGLE_CX}"
        r = requests.get(url).json()
         google_quota["count"] += 1
         items = r.get("items", [])
-
-        # Keyword-based results (top 3)
-        keyword_results = [f"{item['title']}: {item['snippet']}" for item in items[:3]]
-
-        # Semantic-based results (top 3 by similarity)
-        semantic_results = []
-        if items:
-            claim_emb = sem_model.encode(claim, convert_to_tensor=True)
-            snippets = [f"{item['title']}: {item['snippet']}" for item in items]
-            snippet_embs = sem_model.encode(snippets, convert_to_tensor=True)
-            sims = util.cos_sim(claim_emb, snippet_embs)[0]
-            top_indices = sims.argsort(descending=True)[:3]
-            semantic_results = [snippets[i] for i in top_indices]
-
-        return {"keyword_results": keyword_results, "semantic_results": semantic_results}
-
-    except Exception as e:
-        return {"keyword_results": [], "semantic_results": []}
+        return [f"{item['title']}: {item['snippet']}" for item in items[:3]]  # top 3 results
+    except Exception:
+        return []
 
 # ---------------------------
 # Unified Predict Function
 # ---------------------------
 def predict(user_text=""):
+    """
+    Runs both:
+    1. Full-text analysis (AI detection on entire input + sentence-based fact-check)
+    2. Claim-extracted analysis (claim split + AI detection + fact-check)
+    """
     if not user_text.strip():
         return {"error": "No text provided."}
 
     # --- Full text analysis ---
     full_ai_result = detect_ai(user_text)
-    # Strict split by '.' to preserve full sentences
+
+    # NEW: Split strictly by '.' to preserve full user input sentences
     dot_sentences = [s.strip() for s in user_text.split('.') if s.strip()]
     full_fact_checking = {s: fetch_google_search(s) for s in dot_sentences}
 
@@ -136,7 +125,7 @@ def predict(user_text=""):
     claims_data = extract_claims(user_text)
     claims_texts = [c["text"] for c in claims_data]
     claims_ai_results = detect_ai(claims_texts) if claims_texts else []
-    claims_fact_checking = {c["text"]: fetch_google_search(c["text"]) for c in claims_data}
+    fact_checking = {c["text"]: fetch_google_search(c["text"]) for c in claims_data}
 
     return {
         "full_text": {
@@ -146,7 +135,7 @@ def predict(user_text=""):
         },
         "claims": claims_data,
         "claims_ai_detection": claims_ai_results,
-        "claims_fact_checking": claims_fact_checking,
+        "claims_fact_checking": fact_checking,
         "google_quota_used": google_quota["count"],
         "google_quota_reset": str(datetime.datetime.combine(
             google_quota["date"] + datetime.timedelta(days=1),
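The claim_classifier used in extract_claims is created outside the changed hunks, so its exact setup is not visible here. A minimal standalone sketch of the same zero-shot pattern; the model choice is an assumption (facebook/bart-large-mnli is the pipeline default):

from transformers import pipeline

# Assumption: claim_classifier is a zero-shot-classification pipeline; the
# model actually used by app.py is not shown in this diff.
claim_classifier = pipeline("zero-shot-classification",
                            model="facebook/bart-large-mnli", device=-1)

claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]
out = claim_classifier("The Eiffel Tower is 330 metres tall.", claim_labels)

# The pipeline returns labels sorted by descending score, e.g.
# {"sequence": "...", "labels": ["factual claim", ...], "scores": [0.93, ...]},
# which is why extract_claims can look up a score via out["labels"].index(lbl).
print(out["labels"][0], round(out["scores"][0], 3))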
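fetch_google_search builds the Custom Search URL by hand with requests.utils.quote. A behavior-equivalent sketch that lets requests do the parameter encoding and adds a timeout (the timeout value is an arbitrary choice, not from the commit):

import requests

def fetch_google_search_sketch(claim, api_key, cx):
    # Same endpoint as in the diff; params= handles URL quoting for us.
    r = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={"q": claim, "key": api_key, "cx": cx},
        timeout=10,  # assumption: the committed code sends the request with no timeout
    )
    items = r.json().get("items", [])
    return [f"{item['title']}: {item['snippet']}" for item in items[:3]]  # top 3 results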
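The commit checks the API key and CX identifier into app.py as string literals. A sketch of the usual alternative, assuming the app runs somewhere (such as a Hugging Face Space) that exposes secrets as environment variables:

import os

# Read credentials from the environment instead of committing them to app.py.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
GOOGLE_CX = os.environ.get("GOOGLE_CX", "")

if not GOOGLE_API_KEY or not GOOGLE_CX:
    raise RuntimeError("Set GOOGLE_API_KEY and GOOGLE_CX before starting the app.")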
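The quota-reset timestamp at the end of predict combines the next day's date with a time of day; the time argument itself falls outside the shown hunk. A worked standalone example, assuming a midnight reset:

import datetime

quota_date = datetime.date(2024, 1, 15)
reset = datetime.datetime.combine(
    quota_date + datetime.timedelta(days=1),
    datetime.time.min,  # assumption: the argument hidden by the hunk boundary is midnight
)
print(reset)  # 2024-01-16 00:00:00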
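The Gradio interface itself also sits outside the changed hunks. Purely as an assumption about the rest of app.py, predict could be wired up along these lines:

import gradio as gr

# Hypothetical wiring; the actual interface code is not part of this diff.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=8, label="Text to analyze"),
    outputs=gr.JSON(label="Analysis"),
)

if __name__ == "__main__":
    demo.launch()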