File size: 19,464 Bytes
ef821d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
"""

Analytics Database Module for Query Logging and Performance Tracking.

Tracks every query, method, answer, and citation for comprehensive analytics.

"""

import sqlite3
import json
import time
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional, Tuple
from pathlib import Path
import logging

from config import DATA_DIR

logger = logging.getLogger(__name__)

# Database path
ANALYTICS_DB = DATA_DIR / "analytics.db"

class AnalyticsDB:
    """Database manager for query analytics and logging."""
    
    def __init__(self):
        self.db_path = ANALYTICS_DB
        self._init_database()
    
    def _init_database(self):
        """Initialize analytics database with required tables."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        # Main queries table
        cursor.execute('''

            CREATE TABLE IF NOT EXISTS queries (

                query_id INTEGER PRIMARY KEY AUTOINCREMENT,

                timestamp TEXT NOT NULL,

                user_query TEXT NOT NULL,

                retrieval_method TEXT NOT NULL,

                answer TEXT NOT NULL,

                response_time_ms REAL,

                num_citations INTEGER DEFAULT 0,

                image_path TEXT,

                error_message TEXT,

                top_k_used INTEGER DEFAULT 5,

                additional_settings TEXT,

                answer_length INTEGER,

                session_id TEXT

            )

        ''')
        
        # Citations table
        cursor.execute('''

            CREATE TABLE IF NOT EXISTS citations (

                citation_id INTEGER PRIMARY KEY AUTOINCREMENT,

                query_id INTEGER NOT NULL,

                source TEXT NOT NULL,

                citation_type TEXT,

                relevance_score REAL,

                bm25_score REAL,

                rerank_score REAL,

                similarity_score REAL,

                url TEXT,

                path TEXT,

                rank INTEGER,

                FOREIGN KEY (query_id) REFERENCES queries (query_id)

            )

        ''')
        
        # Performance metrics table
        cursor.execute('''

            CREATE TABLE IF NOT EXISTS performance_metrics (

                metric_id INTEGER PRIMARY KEY AUTOINCREMENT,

                query_id INTEGER NOT NULL,

                retrieval_time_ms REAL,

                generation_time_ms REAL,

                total_time_ms REAL,

                chunks_retrieved INTEGER,

                tokens_estimated INTEGER,

                FOREIGN KEY (query_id) REFERENCES queries (query_id)

            )

        ''')
        
        conn.commit()
        conn.close()
        logger.info("Analytics database initialized")
    
    def log_query(self, user_query: str, method: str, answer: str, 

                  citations: List[Dict], response_time: float = None,

                  image_path: str = None, error_message: str = None,

                  top_k: int = 5, additional_settings: Dict = None,

                  session_id: str = None) -> int:
        """

        Log a complete query interaction.

        

        Args:

            user_query: The user's question

            method: Retrieval method used

            answer: Generated answer

            citations: List of citation dictionaries

            response_time: Time taken in milliseconds

            image_path: Path to uploaded image (if any)

            error_message: Error message (if any)

            top_k: Number of chunks retrieved

            additional_settings: Method-specific settings

            session_id: Session identifier

            

        Returns:

            query_id: The ID of the logged query

        """
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        try:
            # Insert main query record
            cursor.execute('''

                INSERT INTO queries (

                    timestamp, user_query, retrieval_method, answer, 

                    response_time_ms, num_citations, image_path, error_message,

                    top_k_used, additional_settings, answer_length, session_id

                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)

            ''', (
                datetime.now().isoformat(),
                user_query,
                method,
                answer,
                response_time,
                len(citations),
                image_path,
                error_message,
                top_k,
                json.dumps(additional_settings) if additional_settings else None,
                len(answer),
                session_id
            ))
            
            query_id = cursor.lastrowid
            
            # Insert citations
            for rank, citation in enumerate(citations, 1):
                cursor.execute('''

                    INSERT INTO citations (

                        query_id, source, citation_type, relevance_score,

                        bm25_score, rerank_score, similarity_score, url, path, rank

                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)

                ''', (
                    query_id,
                    citation.get('source', ''),
                    citation.get('type', ''),
                    citation.get('relevance_score'),
                    citation.get('bm25_score'),
                    citation.get('rerank_score'),
                    citation.get('similarity_score'),
                    citation.get('url'),
                    citation.get('path'),
                    rank
                ))
            
            conn.commit()
            logger.info(f"Logged query {query_id} with {len(citations)} citations")
            return query_id
            
        except Exception as e:
            logger.error(f"Error logging query: {e}")
            conn.rollback()
            return None
        finally:
            conn.close()
    
    def get_query_stats(self, days: int = 30) -> Dict[str, Any]:
        """Get comprehensive query statistics."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        since_date = (datetime.now() - timedelta(days=days)).isoformat()
        
        try:
            stats = {}
            
            # Total queries
            cursor.execute('''

                SELECT COUNT(*) FROM queries 

                WHERE timestamp >= ?

            ''', (since_date,))
            stats['total_queries'] = cursor.fetchone()[0]
            
            # Method usage
            cursor.execute('''

                SELECT retrieval_method, COUNT(*) as count

                FROM queries 

                WHERE timestamp >= ?

                GROUP BY retrieval_method

                ORDER BY count DESC

            ''', (since_date,))
            stats['method_usage'] = dict(cursor.fetchall())
            
            # Average response times by method
            cursor.execute('''

                SELECT retrieval_method, AVG(response_time_ms) as avg_time

                FROM queries 

                WHERE timestamp >= ? AND response_time_ms IS NOT NULL

                GROUP BY retrieval_method

            ''', (since_date,))
            stats['avg_response_times'] = dict(cursor.fetchall())
            
            # Citation statistics
            cursor.execute('''

                SELECT AVG(num_citations) as avg_citations, 

                       SUM(num_citations) as total_citations

                FROM queries 

                WHERE timestamp >= ?

            ''', (since_date,))
            result = cursor.fetchone()
            stats['avg_citations'] = result[0] or 0
            stats['total_citations'] = result[1] or 0
            
            # Citation types
            cursor.execute('''

                SELECT c.citation_type, COUNT(*) as count

                FROM citations c

                JOIN queries q ON c.query_id = q.query_id

                WHERE q.timestamp >= ?

                GROUP BY c.citation_type

                ORDER BY count DESC

            ''', (since_date,))
            stats['citation_types'] = dict(cursor.fetchall())
            
            # Error rate
            cursor.execute('''

                SELECT 

                    COUNT(CASE WHEN error_message IS NOT NULL THEN 1 END) as errors,

                    COUNT(*) as total

                FROM queries 

                WHERE timestamp >= ?

            ''', (since_date,))
            result = cursor.fetchone()
            stats['error_rate'] = (result[0] / result[1]) * 100 if result[1] > 0 else 0
            
            # Most common query topics (simple word analysis)
            cursor.execute('''

                SELECT user_query FROM queries 

                WHERE timestamp >= ?

            ''', (since_date,))
            queries = [row[0].lower() for row in cursor.fetchall()]
            
            # Simple keyword extraction
            keywords = {}
            for query in queries:
                words = [word for word in query.split() if len(word) > 3]
                for word in words:
                    keywords[word] = keywords.get(word, 0) + 1
            
            # Top 10 keywords
            stats['top_keywords'] = dict(sorted(keywords.items(), 
                                              key=lambda x: x[1], 
                                              reverse=True)[:10])
            
            return stats
            
        except Exception as e:
            logger.error(f"Error getting query stats: {e}")
            return {}
        finally:
            conn.close()
    
    def get_method_performance(self) -> Dict[str, Dict[str, float]]:
        """Get detailed performance metrics by method."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        try:
            cursor.execute('''

                SELECT 

                    retrieval_method,

                    AVG(response_time_ms) as avg_response_time,

                    AVG(num_citations) as avg_citations,

                    AVG(answer_length) as avg_answer_length,

                    COUNT(*) as query_count

                FROM queries 

                WHERE response_time_ms IS NOT NULL

                GROUP BY retrieval_method

            ''')
            
            results = {}
            for row in cursor.fetchall():
                method, avg_time, avg_cites, avg_length, count = row
                results[method] = {
                    'avg_response_time': avg_time,
                    'avg_citations': avg_cites,
                    'avg_answer_length': avg_length,
                    'query_count': count
                }
            
            return results
            
        except Exception as e:
            logger.error(f"Error getting method performance: {e}")
            return {}
        finally:
            conn.close()
    
    def get_recent_queries(self, limit: int = 20, include_answers: bool = True) -> List[Dict[str, Any]]:
        """Get recent queries with basic information and optionally full answers."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        try:
            if include_answers:
                cursor.execute('''

                    SELECT query_id, timestamp, user_query, retrieval_method, 

                           answer, answer_length, num_citations, response_time_ms, error_message

                    FROM queries 

                    ORDER BY timestamp DESC 

                    LIMIT ?

                ''', (limit,))
                
                columns = ['query_id', 'timestamp', 'query', 'method', 
                          'answer', 'answer_length', 'citations', 'response_time', 'error_message']
            else:
                cursor.execute('''

                    SELECT query_id, timestamp, user_query, retrieval_method, 

                           answer_length, num_citations, response_time_ms

                    FROM queries 

                    ORDER BY timestamp DESC 

                    LIMIT ?

                ''', (limit,))
                
                columns = ['query_id', 'timestamp', 'query', 'method', 
                          'answer_length', 'citations', 'response_time']
            
            return [dict(zip(columns, row)) for row in cursor.fetchall()]
            
        except Exception as e:
            logger.error(f"Error getting recent queries: {e}")
            return []
        finally:
            conn.close()
    
    def get_query_with_citations(self, query_id: int) -> Dict[str, Any]:
        """Get full query details including citations."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        try:
            # Get query details
            cursor.execute('''

                SELECT query_id, timestamp, user_query, retrieval_method, answer,

                       response_time_ms, num_citations, error_message, top_k_used

                FROM queries WHERE query_id = ?

            ''', (query_id,))
            
            query_row = cursor.fetchone()
            if not query_row:
                return {}
            
            query_data = {
                'query_id': query_row[0],
                'timestamp': query_row[1],
                'user_query': query_row[2],
                'method': query_row[3],
                'answer': query_row[4],
                'response_time': query_row[5],
                'num_citations': query_row[6],
                'error_message': query_row[7],
                'top_k_used': query_row[8]
            }
            
            # Get citations
            cursor.execute('''

                SELECT source, citation_type, relevance_score, bm25_score, 

                       rerank_score, similarity_score, url, path, rank

                FROM citations WHERE query_id = ?

                ORDER BY rank

            ''', (query_id,))
            
            citations = []
            for row in cursor.fetchall():
                citation = {
                    'source': row[0],
                    'type': row[1],
                    'relevance_score': row[2],
                    'bm25_score': row[3],
                    'rerank_score': row[4],
                    'similarity_score': row[5],
                    'url': row[6],
                    'path': row[7],
                    'rank': row[8]
                }
                citations.append(citation)
            
            query_data['citations'] = citations
            return query_data
            
        except Exception as e:
            logger.error(f"Error getting query with citations: {e}")
            return {}
        finally:
            conn.close()
    
    def get_query_trends(self, days: int = 30) -> Dict[str, List[Tuple[str, int]]]:
        """Get query trends over time."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        since_date = (datetime.now() - timedelta(days=days)).isoformat()
        
        try:
            # Queries per day
            cursor.execute('''

                SELECT DATE(timestamp) as date, COUNT(*) as count

                FROM queries 

                WHERE timestamp >= ?

                GROUP BY DATE(timestamp)

                ORDER BY date

            ''', (since_date,))
            
            daily_queries = cursor.fetchall()
            
            # Method usage trends
            cursor.execute('''

                SELECT DATE(timestamp) as date, retrieval_method, COUNT(*) as count

                FROM queries 

                WHERE timestamp >= ?

                GROUP BY DATE(timestamp), retrieval_method

                ORDER BY date, retrieval_method

            ''', (since_date,))
            
            method_trends = {}
            for date, method, count in cursor.fetchall():
                if method not in method_trends:
                    method_trends[method] = []
                method_trends[method].append((date, count))
            
            return {
                'daily_queries': daily_queries,
                'method_trends': method_trends
            }
            
        except Exception as e:
            logger.error(f"Error getting query trends: {e}")
            return {}
        finally:
            conn.close()
    
    def get_voice_interaction_stats(self) -> Dict[str, Any]:
        """Get statistics about voice interactions."""
        try:
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            
            # Count voice interactions (those with voice_interaction=true in additional_settings)
            cursor.execute('''

                SELECT COUNT(*) as total_voice_queries

                FROM queries 

                WHERE additional_settings LIKE '%voice_interaction%'

                   OR session_id LIKE 'voice_%'

            ''')
            result = cursor.fetchone()
            total_voice = result[0] if result else 0
            
            # Get voice queries by method
            cursor.execute('''

                SELECT retrieval_method, COUNT(*) as count

                FROM queries 

                WHERE additional_settings LIKE '%voice_interaction%'

                   OR session_id LIKE 'voice_%'

                GROUP BY retrieval_method

            ''')
            voice_by_method = dict(cursor.fetchall())
            
            # Average response time for voice queries
            cursor.execute('''

                SELECT AVG(response_time_ms) as avg_response_time

                FROM queries 

                WHERE (additional_settings LIKE '%voice_interaction%'

                   OR session_id LIKE 'voice_%')

                   AND response_time_ms IS NOT NULL

            ''')
            result = cursor.fetchone()
            avg_response_time = result[0] if result and result[0] else 0
            
            return {
                'total_voice_queries': total_voice,
                'voice_by_method': voice_by_method,
                'avg_voice_response_time': avg_response_time
            }
            
        except Exception as e:
            logger.error(f"Error getting voice interaction stats: {e}")
            return {}
        finally:
            conn.close()

# Global instance
analytics_db = AnalyticsDB()

# Convenience functions
def log_query(user_query: str, method: str, answer: str, citations: List[Dict], 

              **kwargs) -> int:
    """Log a query to the analytics database."""
    return analytics_db.log_query(user_query, method, answer, citations, **kwargs)

def get_analytics_stats(days: int = 30) -> Dict[str, Any]:
    """Get analytics statistics."""
    return analytics_db.get_query_stats(days)

def get_method_performance() -> Dict[str, Dict[str, float]]:
    """Get method performance metrics."""
    return analytics_db.get_method_performance()