danulr05 committed on
Commit
0aab8f9
·
verified ·
1 Parent(s): e9e0375

Upload 4 files

Browse files

Adding Initial files

Files changed (4) hide show
  1. README.md +81 -13
  2. app.py +387 -0
  3. dynamic_metadata.json +73 -0
  4. requirements.txt +6 -0
README.md CHANGED
@@ -1,13 +1,81 @@
1
- ---
2
- title: Budget Proposals Search Api
3
- emoji: 🔥
4
- colorFrom: gray
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.44.0
8
- app_file: app.py
9
- pinned: false
10
- short_description: Semantic search api for verite budget
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Budget Proposals Semantic Search API
2
+
3
+ A Flask-based API for semantic search of budget proposals using Pinecone vector database and sentence transformers.
4
+
5
+ ## Features
6
+
7
+ - Semantic search of budget proposals using AI embeddings
8
+ - Category-based filtering
9
+ - Relevance scoring and ranking
10
+ - Health check and statistics endpoints
11
+ - CORS enabled for web applications
12
+
13
+ ## API Endpoints
14
+
15
+ ### Search Proposals
16
+ - **POST** `/api/search` - Search with JSON body
17
+ - **GET** `/api/search?query=<search_term>` - Search with query parameter
18
+
19
+ ### Get All Proposals
20
+ - **GET** `/api/proposals` - Get all proposals
21
+ - **GET** `/api/proposals?category_filter=<category>` - Get proposals by category
22
+
23
+ ### Categories
24
+ - **GET** `/api/categories` - Get all available categories
25
+
26
+ ### Health & Stats
27
+ - **GET** `/api/health` - Health check
28
+ - **GET** `/api/stats` - Index statistics
29
+
30
+ ## Environment Variables
31
+
32
+ Set these in your Hugging Face Spaces secrets:
33
+
34
+ - `PINECONE_API_KEY` - Your Pinecone API key (required)
35
+
36
+ ## Usage Examples
37
+
38
+ ### Search Proposals
39
+ ```bash
40
+ curl -X POST https://your-space-url.hf.space/api/search \
41
+ -H "Content-Type: application/json" \
42
+ -d '{"query": "education funding", "top_k": 5}'
43
+ ```
44
+
45
+ ### Get All Proposals
46
+ ```bash
47
+ curl https://your-space-url.hf.space/api/proposals
48
+ ```
49
+
50
+ ### Get Categories
51
+ ```bash
52
+ curl https://your-space-url.hf.space/api/categories
53
+ ```
54
+
55
+ ## Response Format
56
+
57
+ ```json
58
+ {
59
+ "query": "education funding",
60
+ "results": [
61
+ {
62
+ "title": "Education Enhancement Program",
63
+ "summary": "Proposal for improving educational infrastructure...",
64
+ "costLKR": "500,000,000",
65
+ "category": "Education",
66
+ "pdfUrl": "assets/pdfs/education_proposal.pdf",
67
+ "thumbUrl": "assets/thumbs/education_proposal.jpg",
68
+ "score": 0.85,
69
+ "relevance_percentage": 85,
70
+ "file_path": "education_proposal.pdf",
71
+ "id": "doc_123"
72
+ }
73
+ ],
74
+ "total_results": 1,
75
+ "category_filter": null
76
+ }
77
+ ```
78
+
79
+ ## Deployment
80
+
81
+ This API is deployed on Hugging Face Spaces and automatically serves on port 7860.
app.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ from flask_cors import CORS
3
+ from sentence_transformers import SentenceTransformer
4
+ from pinecone import Pinecone
5
+ import os
6
+ import logging
7
+ import json
8
+
9
# Flask application; CORS is enabled for all routes
app = Flask(__name__)
CORS(app)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Pinecone; the API key must be supplied through the environment
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable is required")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Configuration: name of the (new, optimized) index that is queried
INDEX_NAME = "budget-proposals-optimized"

# Load embedding model
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
27
+
28
+ # Load dynamic metadata
29
def load_dynamic_metadata():
    """Read the proposal metadata overrides from ``dynamic_metadata.json``.

    The file is looked up relative to the current working directory.

    Returns:
        The parsed JSON content, or an empty dict when the file does not
        exist or cannot be read or parsed (the failure is logged).
    """
    metadata_file = "dynamic_metadata.json"
    if not os.path.exists(metadata_file):
        return {}
    try:
        with open(metadata_file, 'r', encoding='utf-8') as handle:
            return json.load(handle)
    except Exception as e:
        # Best effort: a broken metadata file must not stop the app from starting
        logger.error(f"Error loading dynamic metadata: {e}")
        return {}


# Metadata overrides, read once when the module is imported
DYNAMIC_METADATA = load_dynamic_metadata()
41
+
42
def get_pinecone_index():
    """Return a handle to the budget proposals Pinecone index.

    Returns:
        The object produced by ``pc.Index(INDEX_NAME)``, or ``None`` when
        obtaining it raises (the error is logged).
    """
    try:
        index = pc.Index(INDEX_NAME)
    except Exception as e:
        logger.error(f"Error accessing Pinecone index: {e}")
        return None
    return index
49
+
50
def _search_relevance_cutoff(max_score):
    """Map the best match score to a ``(threshold, max_docs)`` selection policy."""
    if max_score > 0.6:
        # Specific query: keep documents within 80% of the best score
        return max_score * 0.8, 2
    if max_score > 0.3:
        # Medium query: keep documents within 70% of the best score
        return max_score * 0.7, 3
    # Broad query: keep documents within 50% of the best score.
    # NOTE: for a negative best score this cutoff lies above the score itself,
    # so no document passes it.
    return max_score * 0.5, 5


def _search_proposal_fields(file_path, metadata):
    """Overlay the DYNAMIC_METADATA entry for *file_path* on Pinecone-derived defaults."""
    fields = {
        "title": metadata.get("title", "Unknown Title"),
        "summary": metadata.get("summary", ""),
        "category": metadata.get("category", "Budget Proposal"),
        "costLKR": metadata.get("costLKR", "No Costing Available"),
    }
    # Merging (instead of replacing) keeps a partially filled override entry
    # from raising KeyError and wiping out the whole result list.
    fields.update(DYNAMIC_METADATA.get(file_path, {}))
    return fields


def semantic_search(query: str, top_k=1, category_filter=None):
    """Perform semantic search on budget proposals.

    The query is embedded and the 50 nearest chunks are fetched from Pinecone.
    Chunks are collapsed to one entry per ``file_path`` (keeping the
    best-scoring chunk), and the number of documents returned depends on how
    high the best score is (see ``_search_relevance_cutoff``).

    Args:
        query: Free-text search query.
        top_k: Upper bound on the number of documents returned. It can only
            narrow the score-based limit (2, 3 or 5 documents); values that
            are not positive integers are ignored.
        category_filter: Optional category to restrict the search to; a falsy
            value or ``"All categories"`` disables the filter.

    Returns:
        A list of result dicts ordered by descending score, or an empty list
        when the index is unavailable, nothing matches, or an error occurs
        (errors are logged).
    """
    try:
        pc_index = get_pinecone_index()
        if not pc_index:
            return []

        query_emb = embed_model.encode(query).tolist()

        # Build filter if category is specified
        filter_dict = {"source": "budget_proposals"}
        if category_filter and category_filter != "All categories":
            filter_dict["category"] = category_filter

        # Fetch more chunks than needed so several distinct documents can surface
        res = pc_index.query(
            vector=query_emb,
            top_k=50,
            include_metadata=True,
            filter=filter_dict,
        )

        # Keep the best-scoring chunk per document in a single pass, so the
        # reported id/content belong to the chunk that produced the score.
        best_matches = {}  # file_path -> best match
        for match in res["matches"]:
            file_path = match["metadata"].get("file_path", "")
            current = best_matches.get(file_path)
            if current is None or match["score"] > current["score"]:
                best_matches[file_path] = match

        if not best_matches:
            return []

        ranked = sorted(
            best_matches.items(),
            key=lambda item: item[1]["score"],
            reverse=True,
        )

        threshold, max_docs = _search_relevance_cutoff(ranked[0][1]["score"])
        # Honour the caller's limit; it never widens the score-based limit
        if isinstance(top_k, int) and top_k > 0:
            max_docs = min(max_docs, top_k)

        results = []
        for file_path, match in ranked:
            score = match["score"]
            if len(results) >= max_docs or score < threshold:
                break

            metadata = match["metadata"]
            fields = _search_proposal_fields(file_path, metadata)
            thumb_url = metadata.get("thumbUrl", "")

            results.append({
                "title": fields["title"],
                "summary": fields["summary"],
                "costLKR": fields["costLKR"],
                "category": fields["category"],
                "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
                "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
                "score": score,
                "relevance_percentage": int(score * 100),
                "file_path": file_path,
                "id": match["id"],
                "content": metadata.get("content", ""),  # text of the matched chunk
            })

        return results
    except Exception as e:
        logger.error(f"Search error: {e}")
        return []
156
+
157
def get_all_proposals(category_filter=None):
    """Get all budget proposals (for initial load or when no search query).

    Pinecone is queried with a constant dummy vector and ``top_k=100``, so at
    most 100 chunks are inspected; chunks are collapsed to one entry per
    ``file_path`` in the order Pinecone returns them.

    Args:
        category_filter: Optional category to restrict the listing to; a falsy
            value or ``"All categories"`` disables the filter.

    Returns:
        A list of proposal dicts, each with a fixed score of 1.0, or an empty
        list when the index is unavailable or an error occurs (errors are
        logged).
    """
    try:
        pc_index = get_pinecone_index()
        if not pc_index:
            logger.warning("Pinecone index not available, returning empty list")
            return []

        # Build filter if category is specified
        filter_dict = {"source": "budget_proposals"}
        if category_filter and category_filter != "All categories":
            filter_dict["category"] = category_filter

        # Query with a dummy vector to list documents
        # (all 0.1 instead of 0.0; 384 is the dimension of all-MiniLM-L6-v2)
        dummy_vector = [0.1] * 384
        res = pc_index.query(
            vector=dummy_vector,
            top_k=100,
            include_metadata=True,
            filter=filter_dict,
        )

        logger.info(f"Query returned {len(res['matches'])} matches")

        results = []
        seen_files = set()  # a document is stored as several chunks; list it once

        for match in res["matches"]:
            metadata = match["metadata"]
            file_path = metadata.get("file_path", "")
            if file_path in seen_files:
                continue
            seen_files.add(file_path)

            # Pinecone metadata provides the defaults; the DYNAMIC_METADATA
            # entry, when present, overrides them. Merging (instead of
            # replacing) keeps a partially filled entry from raising KeyError
            # and discarding the whole listing.
            proposal_data = {
                "title": metadata.get("title", "Unknown Title"),
                "summary": metadata.get("summary", ""),
                "category": metadata.get("category", "Budget Proposal"),
                "costLKR": metadata.get("costLKR", "No Costing Available"),
            }
            proposal_data.update(DYNAMIC_METADATA.get(file_path, {}))
            thumb_url = metadata.get("thumbUrl", "")

            results.append({
                "title": proposal_data["title"],
                "summary": proposal_data["summary"],
                "costLKR": proposal_data["costLKR"],
                "category": proposal_data["category"],
                "pdfUrl": f"assets/pdfs/{file_path}" if file_path else "",
                "thumbUrl": f"assets/thumbs/{thumb_url}" if thumb_url else "",
                "score": 1.0,  # Default score for all proposals
                "relevance_percentage": 100,
                "file_path": file_path,
                "id": match["id"],
            })

        return results

    except Exception as e:
        logger.error(f"Error getting all proposals: {e}")
        return []
229
+
230
@app.route('/api/search', methods=['POST'])
def search_proposals():
    """API endpoint for searching budget proposals (POST, JSON body).

    Recognised body keys: ``query`` (search text), ``top_k`` (result limit,
    defaults to 10) and ``category_filter``. An empty ``query`` returns all
    proposals.

    Returns:
        A JSON response with ``query``, ``results``, ``total_results`` and
        ``category_filter``; HTTP 400 when the body is not a JSON object;
        HTTP 500 with an ``error`` message on unexpected failures.
    """
    try:
        # silent=True yields None for a missing, malformed or non-JSON body
        # instead of raising, so the client gets a 400 rather than a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object"}), 400

        # A null or non-string query must not crash on .strip()
        query = str(data.get('query') or '').strip()
        try:
            top_k = int(data.get('top_k', 10))
        except (TypeError, ValueError):
            top_k = 10  # fall back to the default on an unusable value
        category_filter = data.get('category_filter')

        if not query:
            # If no query, return all proposals
            results = get_all_proposals(category_filter)
        else:
            results = semantic_search(query, top_k, category_filter)

        return jsonify({
            "query": query,
            "results": results,
            "total_results": len(results),
            "category_filter": category_filter
        })

    except Exception as e:
        logger.error(f"API error: {e}")
        return jsonify({"error": str(e)}), 500
255
+
256
@app.route('/api/search', methods=['GET'])
def search_proposals_get():
    """API endpoint for searching proposals (GET, query-string parameters).

    Recognised parameters: ``query`` (search text), ``top_k`` (result limit,
    defaults to 10) and ``category_filter``. An empty ``query`` returns all
    proposals.

    Returns:
        A JSON response with ``query``, ``results``, ``total_results`` and
        ``category_filter``; HTTP 500 with an ``error`` message on
        unexpected failures.
    """
    try:
        query = request.args.get('query', '').strip()
        # type=int makes args.get return the default when the value is not a
        # valid integer, instead of raising ValueError and producing a 500.
        top_k = request.args.get('top_k', 10, type=int)
        category_filter = request.args.get('category_filter')

        if not query:
            # If no query, return all proposals
            results = get_all_proposals(category_filter)
        else:
            results = semantic_search(query, top_k, category_filter)

        return jsonify({
            "query": query,
            "results": results,
            "total_results": len(results),
            "category_filter": category_filter
        })

    except Exception as e:
        logger.error(f"API error: {e}")
        return jsonify({"error": str(e)}), 500
280
+
281
@app.route('/api/proposals', methods=['GET'])
def get_proposals():
    """List budget proposals, optionally limited by the ``category_filter`` query parameter.

    Returns:
        A JSON response with ``results``, ``total_results`` and
        ``category_filter``, or HTTP 500 with an ``error`` message on failure.
    """
    try:
        selected_category = request.args.get('category_filter')
        proposals = get_all_proposals(selected_category)
        payload = {
            "results": proposals,
            "total_results": len(proposals),
            "category_filter": selected_category,
        }
        return jsonify(payload)
    except Exception as e:
        logger.error(f"API error: {e}")
        return jsonify({"error": str(e)}), 500
297
+
298
@app.route('/api/categories', methods=['GET'])
def get_categories():
    """Return the sorted list of known proposal categories.

    Categories come from DYNAMIC_METADATA; only when that yields none are
    they collected from the proposals stored in Pinecone instead.

    Returns:
        A JSON response with a ``categories`` list, or HTTP 500 with an
        ``error`` message on failure.
    """
    try:
        # filter(None, ...) drops missing and empty category values
        categories = set(filter(
            None,
            (entry.get("category") for entry in DYNAMIC_METADATA.values()),
        ))

        if not categories:
            # Fallback: derive the categories from the indexed proposals
            categories = set(filter(
                None,
                (proposal.get("category") for proposal in get_all_proposals()),
            ))

        return jsonify({"categories": sorted(categories)})
    except Exception as e:
        logger.error(f"API error: {e}")
        return jsonify({"error": str(e)}), 500
324
+
325
@app.route('/api/health', methods=['GET'])
def health_check():
    """Report whether the Pinecone index is reachable.

    Returns:
        A JSON response with ``status`` "healthy" plus index statistics, or
        HTTP 500 with ``status`` "unhealthy" when the index handle is
        unavailable or reading its statistics raises.
    """
    try:
        pc_index = get_pinecone_index()
        if not pc_index:
            failure = {
                "status": "unhealthy",
                "message": "Cannot connect to Pinecone index",
            }
            return jsonify(failure), 500

        stats = pc_index.describe_index_stats()
        index_stats = {
            "total_vector_count": stats.total_vector_count,
            "dimension": stats.dimension,
            "index_fullness": stats.index_fullness,
        }
        return jsonify({
            "status": "healthy",
            "message": "Budget proposals semantic search API is running",
            "index_stats": index_stats,
        })
    except Exception as e:
        failure = {
            "status": "unhealthy",
            "message": f"Error: {str(e)}",
        }
        return jsonify(failure), 500
351
+
352
@app.route('/api/stats', methods=['GET'])
def get_stats():
    """Return vector count, dimension and fullness of the Pinecone index.

    Returns:
        A JSON response with the three statistics, or HTTP 500 with an
        ``error`` message when the index is unavailable or the lookup raises.
    """
    try:
        pc_index = get_pinecone_index()
        if not pc_index:
            return jsonify({"error": "Cannot connect to Pinecone index"}), 500

        index_stats = pc_index.describe_index_stats()
        payload = {
            "total_vector_count": index_stats.total_vector_count,
            "dimension": index_stats.dimension,
            "index_fullness": index_stats.index_fullness,
        }
        return jsonify(payload)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
368
+
369
@app.route('/', methods=['GET'])
def home():
    """Serve a JSON overview of the API: name, version, endpoints and status."""
    endpoints = {
        "POST /api/search": "Search proposals with JSON body",
        "GET /api/search?query=<search_term>": "Search proposals with query parameter",
        "GET /api/proposals": "Get all proposals",
        "GET /api/categories": "Get all categories",
        "GET /api/health": "Health check",
        "GET /api/stats": "Index statistics",
    }
    overview = {
        "message": "Budget Proposals Semantic Search API",
        "version": "1.0.0",
        "endpoints": endpoints,
        "status": "running",
    }
    return jsonify(overview)
385
+
386
if __name__ == '__main__':
    # Listen on all interfaces on port 7860, with the Flask debugger off
    app.run(host='0.0.0.0', port=7860, debug=False)
dynamic_metadata.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "MLB.pdf": {
3
+ "title": "State Sponsored Maternity Leave Benefits",
4
+ "summary": "Shift maternity leave costs from employers to the state to remove hiring bias against women, boost labour force participation, and align with gender equality goals.",
5
+ "category": "Economic Growth",
6
+ "costLKR": "Cost = LKR 7.5 Billion",
7
+ "thumbUrl": "MLB.jpg",
8
+ "pdfUrl": "MLB.pdf",
9
+ "added_date": "2025-08-24T00:34:55.502282",
10
+ "auto_generated": false
11
+ },
12
+ "Cigs.pdf": {
13
+ "title": "Implement Indexation Formula for Cigarette Taxes",
14
+ "summary": "Introduce annual cigarette tax indexation tied to inflation and GDP growth to reduce affordability, protect revenue, and improve public health.",
15
+ "category": "Revenue Generation",
16
+ "costLKR": "Revenue = LKR 20.3 Billion",
17
+ "thumbUrl": "Cigs.jpg",
18
+ "pdfUrl": "Cigs.pdf",
19
+ "added_date": "2025-08-24T00:34:55.502464",
20
+ "auto_generated": false
21
+ },
22
+ "EPF.pdf": {
23
+ "title": "Reduce EPF Tax Rate from 14% to 10%",
24
+ "summary": "Reduce EPF investment return tax from 14% to 10%, benefiting over 90% of members earning below the tax threshold.",
25
+ "category": "Social Protection & Pensions",
26
+ "costLKR": "Cost = LKR 15.7 Billion",
27
+ "thumbUrl": "EPF.jpg",
28
+ "pdfUrl": "EPF.pdf",
29
+ "added_date": "2025-08-24T00:34:55.502618",
30
+ "auto_generated": false
31
+ },
32
+ "Audit_EPF.pdf": {
33
+ "title": "Conduct Forensic Audit on the EPF Once Every 3 Years",
34
+ "summary": "Mandate a forensic audit of the EPF every three years to expose corruption, improve transparency, and ensure accountability.",
35
+ "category": "Social Protection & Pensions",
36
+ "costLKR": "No Costing Available",
37
+ "thumbUrl": "Audit_EPF.jpg",
38
+ "pdfUrl": "Audit_EPF.pdf",
39
+ "added_date": "2025-08-24T00:34:55.502960",
40
+ "auto_generated": false
41
+ },
42
+ "Discretion.pdf": {
43
+ "title": "Amend Tax Laws to Limit Ministerial Discretion in Tax Changes",
44
+ "summary": "Remove excessive ministerial powers to alter taxes without parliamentary approval, ensuring checks and balances in tax changes.",
45
+ "category": "Governance & Anti-Corruption",
46
+ "costLKR": "No Costing Available",
47
+ "thumbUrl": "Discretion.jpg",
48
+ "pdfUrl": "Discretion.pdf",
49
+ "added_date": "2025-08-24T00:34:55.503162",
50
+ "auto_generated": false
51
+ },
52
+ "Elec.pdf": {
53
+ "title": "Introduce a Rational Method for Electricity Tariffs for Households",
54
+ "summary": "Implement a transparent, cost-based tariff system rewarding lower electricity use with lower rates, ensuring fairness and affordability.",
55
+ "category": "Utilities & Energy Pricing",
56
+ "costLKR": "No Costing Available",
57
+ "thumbUrl": "Elec.jpg",
58
+ "pdfUrl": "Elec.pdf",
59
+ "added_date": "2025-08-24T00:34:55.503456",
60
+ "auto_generated": false
61
+ },
62
+ "1750164001872.pdf": {
63
+ "title": "test",
64
+ "summary": "testing",
65
+ "category": "Other",
66
+ "costLKR": "2bn",
67
+ "thumbUrl": "1750164001872.jpg",
68
+ "pdfUrl": "1750164001872.pdf",
69
+ "added_date": "2025-08-26T00:32:35.274295",
70
+ "auto_generated": false,
71
+ "vectorized": true
72
+ }
73
+ }
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ flask==3.0.0
2
+ flask-cors==4.0.0
3
+ sentence-transformers==2.2.2
4
+ pinecone-client==3.0.0
5
+ torch==2.1.2
6
+ transformers==4.36.2