Nymbo committed on
Commit
36e1ba2
·
verified ·
1 Parent(s): cbe0212

Update Modules/Web_Search.py

Browse files
Files changed (1) hide show
  1. Modules/Web_Search.py +269 -268
Modules/Web_Search.py CHANGED
@@ -1,268 +1,269 @@
1
- from __future__ import annotations
2
-
3
- from typing import Annotated, List
4
-
5
- import gradio as gr
6
- from ddgs import DDGS
7
-
8
- from app import _log_call_end, _log_call_start, _search_rate_limiter, _truncate_for_log
9
-
10
-
11
- def _extract_date_from_snippet(snippet: str) -> str:
12
- if not snippet:
13
- return ""
14
- import re
15
-
16
- date_patterns = [
17
- r"\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b",
18
- r"\b([A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4})\b",
19
- r"\b(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4})\b",
20
- r"\b(\d+\s+(?:day|week|month|year)s?\s+ago)\b",
21
- r"(?:Published|Updated|Posted):\s*([^,\n]+?)(?:[,\n]|$)",
22
- ]
23
- for pattern in date_patterns:
24
- matches = re.findall(pattern, snippet, re.IGNORECASE)
25
- if matches:
26
- return matches[0].strip()
27
- return ""
28
-
29
-
30
- def _format_search_result(result: dict, search_type: str, index: int) -> List[str]:
31
- lines: List[str] = []
32
- if search_type == "text":
33
- title = result.get("title", "").strip()
34
- url = result.get("href", "").strip()
35
- snippet = result.get("body", "").strip()
36
- date = _extract_date_from_snippet(snippet)
37
- lines.append(f"{index}. {title}")
38
- lines.append(f" URL: {url}")
39
- if snippet:
40
- lines.append(f" Summary: {snippet}")
41
- if date:
42
- lines.append(f" Date: {date}")
43
- elif search_type == "news":
44
- title = result.get("title", "").strip()
45
- url = result.get("url", "").strip()
46
- body = result.get("body", "").strip()
47
- date = result.get("date", "").strip()
48
- source = result.get("source", "").strip()
49
- lines.append(f"{index}. {title}")
50
- lines.append(f" URL: {url}")
51
- if source:
52
- lines.append(f" Source: {source}")
53
- if date:
54
- lines.append(f" Date: {date}")
55
- if body:
56
- lines.append(f" Summary: {body}")
57
- elif search_type == "images":
58
- title = result.get("title", "").strip()
59
- image_url = result.get("image", "").strip()
60
- source_url = result.get("url", "").strip()
61
- source = result.get("source", "").strip()
62
- width = result.get("width", "")
63
- height = result.get("height", "")
64
- lines.append(f"{index}. {title}")
65
- lines.append(f" Image: {image_url}")
66
- lines.append(f" Source: {source_url}")
67
- if source:
68
- lines.append(f" Publisher: {source}")
69
- if width and height:
70
- lines.append(f" Dimensions: {width}x{height}")
71
- elif search_type == "videos":
72
- title = result.get("title", "").strip()
73
- description = result.get("description", "").strip()
74
- duration = result.get("duration", "").strip()
75
- published = result.get("published", "").strip()
76
- uploader = result.get("uploader", "").strip()
77
- embed_url = result.get("embed_url", "").strip()
78
- lines.append(f"{index}. {title}")
79
- if embed_url:
80
- lines.append(f" Video: {embed_url}")
81
- if uploader:
82
- lines.append(f" Uploader: {uploader}")
83
- if duration:
84
- lines.append(f" Duration: {duration}")
85
- if published:
86
- lines.append(f" Published: {published}")
87
- if description:
88
- lines.append(f" Description: {description}")
89
- elif search_type == "books":
90
- title = result.get("title", "").strip()
91
- url = result.get("url", "").strip()
92
- body = result.get("body", "").strip()
93
- lines.append(f"{index}. {title}")
94
- lines.append(f" URL: {url}")
95
- if body:
96
- lines.append(f" Description: {body}")
97
- return lines
98
-
99
-
100
- def Web_Search(
101
- query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
102
- max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
103
- page: Annotated[int, "Page number for pagination (1-based, each page contains max_results items)."] = 1,
104
- search_type: Annotated[str, "Type of search: 'text' (web pages), 'news', 'images', 'videos', or 'books'."] = "text",
105
- offset: Annotated[int, "Result offset to start from (overrides page if > 0, for precise continuation)."] = 0,
106
- ) -> str:
107
- _log_call_start("Web_Search", query=query, max_results=max_results, page=page, search_type=search_type, offset=offset)
108
- if not query or not query.strip():
109
- result = "No search query provided. Please enter a search term."
110
- _log_call_end("Web_Search", _truncate_for_log(result))
111
- return result
112
- max_results = max(1, min(20, max_results))
113
- page = max(1, page)
114
- offset = max(0, offset)
115
- valid_types = ["text", "news", "images", "videos", "books"]
116
- if search_type not in valid_types:
117
- search_type = "text"
118
- if offset > 0:
119
- actual_offset = offset
120
- calculated_page = (offset // max_results) + 1
121
- else:
122
- actual_offset = (page - 1) * max_results
123
- calculated_page = page
124
- total_needed = actual_offset + max_results
125
- used_fallback = False
126
- original_search_type = search_type
127
-
128
- def _perform_search(stype: str) -> list[dict]:
129
- try:
130
- _search_rate_limiter.acquire()
131
- with DDGS() as ddgs:
132
- if stype == "text":
133
- raw_gen = ddgs.text(query, max_results=total_needed + 10)
134
- elif stype == "news":
135
- raw_gen = ddgs.news(query, max_results=total_needed + 10)
136
- elif stype == "images":
137
- raw_gen = ddgs.images(query, max_results=total_needed + 10)
138
- elif stype == "videos":
139
- raw_gen = ddgs.videos(query, max_results=total_needed + 10)
140
- else:
141
- raw_gen = ddgs.books(query, max_results=total_needed + 10)
142
- try:
143
- return list(raw_gen)
144
- except Exception as inner_exc:
145
- if "no results" in str(inner_exc).lower() or "not found" in str(inner_exc).lower():
146
- return []
147
- raise inner_exc
148
- except Exception as exc:
149
- error_msg = f"Search failed: {str(exc)[:200]}"
150
- lowered = str(exc).lower()
151
- if "blocked" in lowered or "rate" in lowered:
152
- error_msg = "Search temporarily blocked due to rate limiting. Please try again in a few minutes."
153
- elif "timeout" in lowered:
154
- error_msg = "Search timed out. Please try again with a simpler query."
155
- elif "network" in lowered or "connection" in lowered:
156
- error_msg = "Network connection error. Please check your internet connection and try again."
157
- elif "no results" in lowered or "not found" in lowered:
158
- return []
159
- raise Exception(error_msg)
160
-
161
- try:
162
- raw = _perform_search(search_type)
163
- except Exception as exc:
164
- result = f"Error: {exc}"
165
- _log_call_end("Web_Search", _truncate_for_log(result))
166
- return result
167
-
168
- if not raw and search_type == "news":
169
- try:
170
- raw = _perform_search("text")
171
- if raw:
172
- used_fallback = True
173
- search_type = "text"
174
- except Exception:
175
- pass
176
-
177
- if not raw:
178
- fallback_note = " (also tried 'text' search as fallback)" if original_search_type == "news" and used_fallback else ""
179
- result = f"No {original_search_type} results found for query: {query}{fallback_note}"
180
- _log_call_end("Web_Search", _truncate_for_log(result))
181
- return result
182
-
183
- paginated_results = raw[actual_offset: actual_offset + max_results]
184
- if not paginated_results:
185
- if actual_offset >= len(raw):
186
- result = f"Offset {actual_offset} exceeds available results ({len(raw)} total). Try offset=0 to start from beginning."
187
- else:
188
- result = f"No {original_search_type} results found on page {calculated_page} for query: {query}. Try page 1 or reduce page number."
189
- _log_call_end("Web_Search", _truncate_for_log(result))
190
- return result
191
-
192
- total_available = len(raw)
193
- start_num = actual_offset + 1
194
- end_num = actual_offset + len(paginated_results)
195
- next_offset = actual_offset + len(paginated_results)
196
- search_label = original_search_type.title()
197
- if used_fallback:
198
- search_label += " Text (Smart Fallback)"
199
- pagination_info = f"Page {calculated_page}"
200
- if offset > 0:
201
- pagination_info = f"Offset {actual_offset} (≈ {pagination_info})"
202
- lines = [f"{search_label} search results for: {query}"]
203
- if used_fallback:
204
- lines.append("📍 Note: News search returned no results, automatically searched general web content instead")
205
- lines.append(f"{pagination_info} (results {start_num}-{end_num} of ~{total_available}+ available)\n")
206
- for i, result in enumerate(paginated_results, start_num):
207
- result_lines = _format_search_result(result, search_type, i)
208
- lines.extend(result_lines)
209
- lines.append("")
210
- if total_available > end_num:
211
- lines.append("💡 More results available:")
212
- lines.append(f" • Next page: page={calculated_page + 1}")
213
- lines.append(f" Next offset: offset={next_offset}")
214
- lines.append(f" • Use offset={next_offset} to continue exactly from result {next_offset + 1}")
215
- result = "\n".join(lines)
216
- search_info = f"type={original_search_type}"
217
- if used_fallback:
218
- search_info += "→text"
219
- _log_call_end("Web_Search", f"{search_info} page={calculated_page} offset={actual_offset} results={len(paginated_results)} chars={len(result)}")
220
- return result
221
-
222
-
223
- def build_interface() -> gr.Interface:
224
- return gr.Interface(
225
- fn=Web_Search,
226
- inputs=[
227
- gr.Textbox(label="Query", placeholder="topic OR site:example.com", max_lines=1),
228
- gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
229
- gr.Slider(minimum=1, maximum=10, value=1, step=1, label="Page", info="Page number for pagination (ignored if offset > 0)"),
230
- gr.Radio(
231
- label="Search Type",
232
- choices=["text", "news", "images", "videos", "books"],
233
- value="text",
234
- info="Type of content to search for",
235
- ),
236
- gr.Slider(
237
- minimum=0,
238
- maximum=1000,
239
- value=0,
240
- step=1,
241
- label="Offset",
242
- info="Result offset to start from (overrides page if > 0, use next_offset from previous search)",
243
- ),
244
- ],
245
- outputs=gr.Textbox(label="Search Results", interactive=False, lines=20, max_lines=20),
246
- title="Web Search",
247
- description=(
248
- "<div style=\"text-align:center\">Multi-type web search with readable output format, date detection, and flexible pagination. "
249
- "Supports text, news, images, videos, and books. Features smart fallback for news searches and precise offset control.</div>"
250
- ),
251
- api_description=(
252
- "Run a web search (DuckDuckGo backend) with support for multiple content types and return formatted results. "
253
- "Features smart fallback: if 'news' search returns no results, automatically retries with 'text' search "
254
- "to catch sources like Hacker News that might not appear in news-specific results. "
255
- "Supports advanced search operators: site: for specific domains, quotes for exact phrases, "
256
- "OR for alternatives, and - to exclude terms. Examples: 'Python programming', 'site:example.com', "
257
- "'\"artificial intelligence\"', 'cats -dogs', 'Python OR JavaScript'. "
258
- "Parameters: query (str), max_results (int, 1-20), page (int, 1-based pagination), "
259
- "search_type (str: text/news/images/videos/books), offset (int, result offset for precise continuation). "
260
- "If offset > 0, it overrides the page parameter. Returns appropriately formatted results with metadata, "
261
- "pagination hints, and next_offset information for each content type."
262
- ),
263
- flagging_mode="never",
264
- submit_btn="Search",
265
- )
266
-
267
-
268
- __all__ = ["Web_Search", "build_interface"]
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Annotated, List
4
+
5
+ import gradio as gr
6
+ from ddgs import DDGS
7
+
8
+ from app import _log_call_end, _log_call_start, _search_rate_limiter, _truncate_for_log
9
+ from ._docstrings import autodoc
10
+
11
+
12
# Single source of truth for the LLM-facing tool description: this exact
# string is passed both to @autodoc(summary=...) on Web_Search and to
# gr.Interface(api_description=...) in build_interface, so the two can
# never drift apart.
TOOL_SUMMARY: str = (
    "Run a DuckDuckGo-backed search across text, news, images, videos, or books. "
    "Readable results include pagination hints and next_offset when more results are available; "
    "Use in combination with `Web_Fetch` to navigate the web."
)
18
+
19
+
20
+ def _extract_date_from_snippet(snippet: str) -> str:
21
+ if not snippet:
22
+ return ""
23
+ import re
24
+
25
+ date_patterns = [
26
+ r"\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b",
27
+ r"\b([A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4})\b",
28
+ r"\b(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4})\b",
29
+ r"\b(\d+\s+(?:day|week|month|year)s?\s+ago)\b",
30
+ r"(?:Published|Updated|Posted):\s*([^,\n]+?)(?:[,\n]|$)",
31
+ ]
32
+ for pattern in date_patterns:
33
+ matches = re.findall(pattern, snippet, re.IGNORECASE)
34
+ if matches:
35
+ return matches[0].strip()
36
+ return ""
37
+
38
+
39
+ def _format_search_result(result: dict, search_type: str, index: int) -> List[str]:
40
+ lines: List[str] = []
41
+ if search_type == "text":
42
+ title = result.get("title", "").strip()
43
+ url = result.get("href", "").strip()
44
+ snippet = result.get("body", "").strip()
45
+ date = _extract_date_from_snippet(snippet)
46
+ lines.append(f"{index}. {title}")
47
+ lines.append(f" URL: {url}")
48
+ if snippet:
49
+ lines.append(f" Summary: {snippet}")
50
+ if date:
51
+ lines.append(f" Date: {date}")
52
+ elif search_type == "news":
53
+ title = result.get("title", "").strip()
54
+ url = result.get("url", "").strip()
55
+ body = result.get("body", "").strip()
56
+ date = result.get("date", "").strip()
57
+ source = result.get("source", "").strip()
58
+ lines.append(f"{index}. {title}")
59
+ lines.append(f" URL: {url}")
60
+ if source:
61
+ lines.append(f" Source: {source}")
62
+ if date:
63
+ lines.append(f" Date: {date}")
64
+ if body:
65
+ lines.append(f" Summary: {body}")
66
+ elif search_type == "images":
67
+ title = result.get("title", "").strip()
68
+ image_url = result.get("image", "").strip()
69
+ source_url = result.get("url", "").strip()
70
+ source = result.get("source", "").strip()
71
+ width = result.get("width", "")
72
+ height = result.get("height", "")
73
+ lines.append(f"{index}. {title}")
74
+ lines.append(f" Image: {image_url}")
75
+ lines.append(f" Source: {source_url}")
76
+ if source:
77
+ lines.append(f" Publisher: {source}")
78
+ if width and height:
79
+ lines.append(f" Dimensions: {width}x{height}")
80
+ elif search_type == "videos":
81
+ title = result.get("title", "").strip()
82
+ description = result.get("description", "").strip()
83
+ duration = result.get("duration", "").strip()
84
+ published = result.get("published", "").strip()
85
+ uploader = result.get("uploader", "").strip()
86
+ embed_url = result.get("embed_url", "").strip()
87
+ lines.append(f"{index}. {title}")
88
+ if embed_url:
89
+ lines.append(f" Video: {embed_url}")
90
+ if uploader:
91
+ lines.append(f" Uploader: {uploader}")
92
+ if duration:
93
+ lines.append(f" Duration: {duration}")
94
+ if published:
95
+ lines.append(f" Published: {published}")
96
+ if description:
97
+ lines.append(f" Description: {description}")
98
+ elif search_type == "books":
99
+ title = result.get("title", "").strip()
100
+ url = result.get("url", "").strip()
101
+ body = result.get("body", "").strip()
102
+ lines.append(f"{index}. {title}")
103
+ lines.append(f" URL: {url}")
104
+ if body:
105
+ lines.append(f" Description: {body}")
106
+ return lines
107
+
108
+
109
@autodoc(
    summary=TOOL_SUMMARY,
)
def Web_Search(
    query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
    max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
    page: Annotated[int, "Page number for pagination (1-based, each page contains max_results items)."] = 1,
    search_type: Annotated[str, "Type of search: 'text' (web pages), 'news', 'images', 'videos', or 'books'."] = "text",
    offset: Annotated[int, "Result offset to start from (overrides page if > 0, for precise continuation)."] = 0,
) -> str:
    # NOTE: no docstring here on purpose — @autodoc supplies the LLM-facing
    # description from TOOL_SUMMARY (presumably via __doc__; confirm in
    # ._docstrings before adding one manually).
    _log_call_start("Web_Search", query=query, max_results=max_results, page=page, search_type=search_type, offset=offset)
    if not query or not query.strip():
        result = "No search query provided. Please enter a search term."
        _log_call_end("Web_Search", _truncate_for_log(result))
        return result
    # Clamp/normalize all numeric inputs; unknown search types fall back to "text".
    max_results = max(1, min(20, max_results))
    page = max(1, page)
    offset = max(0, offset)
    valid_types = ["text", "news", "images", "videos", "books"]
    if search_type not in valid_types:
        search_type = "text"
    # An explicit offset overrides page-based pagination; calculated_page is
    # the approximate page label derived from that offset.
    if offset > 0:
        actual_offset = offset
        calculated_page = (offset // max_results) + 1
    else:
        actual_offset = (page - 1) * max_results
        calculated_page = page
    total_needed = actual_offset + max_results
    used_fallback = False
    original_search_type = search_type

    def _perform_search(stype: str) -> list[dict]:
        """Run one rate-limited DDGS query for *stype*; [] means "no results",
        any other failure is re-raised with a friendlier message."""
        try:
            _search_rate_limiter.acquire()
            with DDGS() as ddgs:
                # Over-fetch (+10) beyond the requested window so pagination
                # hints can report whether more results are available.
                if stype == "text":
                    raw_gen = ddgs.text(query, max_results=total_needed + 10)
                elif stype == "news":
                    raw_gen = ddgs.news(query, max_results=total_needed + 10)
                elif stype == "images":
                    raw_gen = ddgs.images(query, max_results=total_needed + 10)
                elif stype == "videos":
                    raw_gen = ddgs.videos(query, max_results=total_needed + 10)
                else:
                    raw_gen = ddgs.books(query, max_results=total_needed + 10)
                try:
                    return list(raw_gen)
                except Exception as inner_exc:
                    # DDGS signals "no results" via exceptions in some versions;
                    # translate that into an empty list rather than an error.
                    if "no results" in str(inner_exc).lower() or "not found" in str(inner_exc).lower():
                        return []
                    raise inner_exc
        except Exception as exc:
            # Map common failure modes onto actionable user-facing messages.
            error_msg = f"Search failed: {str(exc)[:200]}"
            lowered = str(exc).lower()
            if "blocked" in lowered or "rate" in lowered:
                error_msg = "Search temporarily blocked due to rate limiting. Please try again in a few minutes."
            elif "timeout" in lowered:
                error_msg = "Search timed out. Please try again with a simpler query."
            elif "network" in lowered or "connection" in lowered:
                error_msg = "Network connection error. Please check your internet connection and try again."
            elif "no results" in lowered or "not found" in lowered:
                return []
            raise Exception(error_msg)

    try:
        raw = _perform_search(search_type)
    except Exception as exc:
        result = f"Error: {exc}"
        _log_call_end("Web_Search", _truncate_for_log(result))
        return result

    # Smart fallback: empty "news" results are retried as a general "text"
    # search (best-effort — fallback failures are deliberately swallowed).
    if not raw and search_type == "news":
        try:
            raw = _perform_search("text")
            if raw:
                used_fallback = True
                search_type = "text"
        except Exception:
            pass

    if not raw:
        fallback_note = " (also tried 'text' search as fallback)" if original_search_type == "news" and used_fallback else ""
        result = f"No {original_search_type} results found for query: {query}{fallback_note}"
        _log_call_end("Web_Search", _truncate_for_log(result))
        return result

    # Slice the requested window out of the over-fetched result list.
    paginated_results = raw[actual_offset: actual_offset + max_results]
    if not paginated_results:
        if actual_offset >= len(raw):
            result = f"Offset {actual_offset} exceeds available results ({len(raw)} total). Try offset=0 to start from beginning."
        else:
            result = f"No {original_search_type} results found on page {calculated_page} for query: {query}. Try page 1 or reduce page number."
        _log_call_end("Web_Search", _truncate_for_log(result))
        return result

    # Header/pagination bookkeeping for the human-readable output.
    total_available = len(raw)
    start_num = actual_offset + 1
    end_num = actual_offset + len(paginated_results)
    next_offset = actual_offset + len(paginated_results)
    search_label = original_search_type.title()
    if used_fallback:
        search_label += " → Text (Smart Fallback)"
    pagination_info = f"Page {calculated_page}"
    if offset > 0:
        pagination_info = f"Offset {actual_offset} (≈ {pagination_info})"
    lines = [f"{search_label} search results for: {query}"]
    if used_fallback:
        lines.append("📍 Note: News search returned no results, automatically searched general web content instead")
    lines.append(f"{pagination_info} (results {start_num}-{end_num} of ~{total_available}+ available)\n")
    for i, result in enumerate(paginated_results, start_num):
        result_lines = _format_search_result(result, search_type, i)
        lines.extend(result_lines)
        lines.append("")
    # Continuation hints are only shown when the over-fetch proved more
    # results exist beyond this window.
    if total_available > end_num:
        lines.append("💡 More results available:")
        lines.append(f" • Next page: page={calculated_page + 1}")
        lines.append(f" • Next offset: offset={next_offset}")
        lines.append(f" • Use offset={next_offset} to continue exactly from result {next_offset + 1}")
    result = "\n".join(lines)
    search_info = f"type={original_search_type}"
    if used_fallback:
        search_info += "→text"
    _log_call_end("Web_Search", f"{search_info} page={calculated_page} offset={actual_offset} results={len(paginated_results)} chars={len(result)}")
    return result
233
+
234
+
235
def build_interface() -> gr.Interface:
    """Construct the Gradio UI wrapper for Web_Search.

    Inputs mirror Web_Search's signature (query, max_results, page,
    search_type, offset); the API description is TOOL_SUMMARY so the UI and
    the LLM-facing docs stay in sync.
    """
    query_box = gr.Textbox(label="Query", placeholder="topic OR site:example.com", max_lines=1)
    max_results_slider = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results")
    page_slider = gr.Slider(
        minimum=1, maximum=10, value=1, step=1,
        label="Page",
        info="Page number for pagination (ignored if offset > 0)",
    )
    type_radio = gr.Radio(
        label="Search Type",
        choices=["text", "news", "images", "videos", "books"],
        value="text",
        info="Type of content to search for",
    )
    offset_slider = gr.Slider(
        minimum=0,
        maximum=1000,
        value=0,
        step=1,
        label="Offset",
        info="Result offset to start from (overrides page if > 0, use next_offset from previous search)",
    )
    results_box = gr.Textbox(label="Search Results", interactive=False, lines=20, max_lines=20)
    return gr.Interface(
        fn=Web_Search,
        inputs=[query_box, max_results_slider, page_slider, type_radio, offset_slider],
        outputs=results_box,
        title="Web Search",
        description=(
            "<div style=\"text-align:center\">Multi-type web search with readable output format, date detection, and flexible pagination. "
            "Supports text, news, images, videos, and books. Features smart fallback for news searches and precise offset control.</div>"
        ),
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
        submit_btn="Search",
    )
267
+
268
+
269
# Explicit public API: the tool entry point and its Gradio UI factory.
__all__ = ["Web_Search", "build_interface"]