Spaces:

ayush2917
/

finance-news-api

Running

App Files Community

ayush2917 committed on Apr 7

Commit

b1e9722

verified ·

1 Parent(s): 1fa9651

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -116

app.py CHANGED Viewed

@@ -7,7 +7,6 @@ from datetime import datetime, timedelta
 import requests
 from threading import Thread, Event
 import logging
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 from typing import Dict, List
 from bs4 import BeautifulSoup
@@ -15,7 +14,7 @@ app = Flask(__name__)
 # Configuration
 NEWS_API_KEY = os.environ.get('NEWS_API_KEY', '352f67b35a544f408c58c74c654cfd7e')
-MAX_NEWS_ARTICLES = 10
 API_CALL_INTERVAL = 10  # seconds
 REFRESH_INTERVAL = 7200  # 2 hours (increased to reduce CPU load)
 CACHE_EXPIRY_DURATION = 3600  # 60 minutes (increased to reduce API calls)
@@ -24,6 +23,7 @@ last_fetch_time = None
 last_api_call = 0
 cached_articles = []
 cache_expiry = None
 # List of Indian finance news websites (reduced to avoid HTTP errors)
 WEBSITES = [
@@ -245,65 +245,12 @@ def calculate_age(published):
     except ValueError:
         return "Unknown time"
-# Chatbot Models (Initialized on-demand)
-qa_pipeline = None
-t5_tokenizer = None
-t5_model = None
-qa_loaded = Event()
-t5_loaded = Event()
-def load_qa_model():
-    global qa_pipeline
-    if not qa_loaded.is_set():
-        logging.info("Loading QA model on-demand...")
-        try:
-            qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased-distilled-squad")
-            logging.info("QA model loaded successfully")
-            qa_loaded.set()
-        except Exception as e:
-            logging.error(f"Failed to load QA model: {str(e)}")
-def load_t5_model():
-    global t5_tokenizer, t5_model
-    if not t5_loaded.is_set():
-        logging.info("Loading Flan-T5 model on-demand...")
-        try:
-            t5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
-            t5_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
-            logging.info("Flan-T5 model loaded successfully")
-            t5_loaded.set()
-        except Exception as e:
-            logging.error(f"Failed to load Flan-T5 model: {str(e)}")
-# Function to generate a 60-80 word description using Flan-T5 (disabled for now)
-def generate_description(title: str, raw_content: str, category: str, current_date_str: str) -> str:
-    # Disabled to reduce CPU usage
-    return raw_content[:200] + "..."
-# Function to generate response using Flan-T5
-def generate_t5_response(prompt: str, max_length: int = 80) -> str:
-    load_t5_model()  # Load Flan-T5 on-demand
-    if not t5_loaded.is_set():
-        return None
-    try:
-        inputs = t5_tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
-        outputs = t5_model.generate(
-            inputs.input_ids,
-            max_length=max_length,
-            min_length=30,
-            temperature=0.7,
-            top_p=0.9,
-            do_sample=True
-        )
-        response = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return response
-    except Exception as e:
-        logging.error(f"Error generating response with Flan-T5: {str(e)}")
-        return None
 # Function to fetch news from websites using BeautifulSoup and requests
 def fetch_news_from_websites() -> List[Dict]:
     articles = []
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
     used_headlines = set()
@@ -319,7 +266,7 @@ def fetch_news_from_websites() -> List[Dict]:
             # Generic selectors (adjust per site)
             news_items = soup.select('h1, h2, h3, .story, .article, .headline, .title')
             for item in news_items:
-                if len(articles) >= 10:  # Reduced limit to 10 articles to lower CPU usage
                     break
                 title = item.get_text(strip=True)[:100]
                 if title and title not in used_headlines:
@@ -350,7 +297,7 @@ def fetch_news_from_websites() -> List[Dict]:
                         })
         except Exception as e:
             logging.error(f"Failed to fetch from {url}: {str(e)}")
-        if len(articles) >= 10:
             break
     return articles
@@ -437,10 +384,10 @@ def fetch_news(query: str = None) -> List[Dict]:
                             'age': calculate_age(article['publishedAt'])
                         })
-            # Fetch additional articles from websites if NewsAPI yields fewer than 10 articles
-            if len(processed) < 10:
                 web_articles = fetch_news_from_websites()
-                processed.extend(web_articles[:10 - len(processed)])
             cached_articles = processed
             cache_expiry = current_time + CACHE_EXPIRY_DURATION
@@ -466,10 +413,14 @@ def fetch_news(query: str = None) -> List[Dict]:
     logging.error("Max retry attempts reached for NewsAPI, returning cached articles")
     return cached_articles
-# Background Refresh Thread
 stop_refresh = Event()
 def refresh_news_periodically():
     while not stop_refresh.is_set():
         with app.app_context():
             fetch_news()
@@ -477,7 +428,8 @@ def refresh_news_periodically():
         time.sleep(REFRESH_INTERVAL)
 refresh_thread = Thread(target=refresh_news_periodically, daemon=True)
-refresh_thread.start()
 # Startup Logic
 with app.app_context():
@@ -535,15 +487,6 @@ def category_news(category_name):
 @app.route('/chat', methods=['POST'])
 def chat():
     logging.info("Received chat request")
-    if not qa_loaded.is_set() and not t5_loaded.is_set():
-        load_qa_model()
-        load_t5_model()
-        if not qa_loaded.is_set() or not t5_loaded.is_set():
-            return jsonify({
-                'response': ['One or more models failed to load. Please try again later.'],
-                'status': 'error'
-            }), 500
     try:
         data = request.get_json()
         if not data or 'message' not in data:
@@ -628,31 +571,18 @@ def chat():
         for article in context_articles:
             article['description'] = article['summary']
-        # Use QA model to extract a factual answer if possible
         qa_answer = None
-        if context_articles and qa_loaded.is_set():
-            context = " ".join([article['content'] for article in context_articles])
-            # Add static knowledge base to context for better QA
-            if topic_info['primary_category'] in FINANCIAL_KNOWLEDGE_BASE:
-                knowledge = FINANCIAL_KNOWLEDGE_BASE[topic_info['primary_category']]
-                context += " " + " ".join(knowledge.values())
-            try:
-                qa_result = qa_pipeline(question=user_input, context=context, max_answer_len=30)
-                qa_answer = qa_result['answer'] if qa_result['score'] > 0.5 else None
-            except Exception as e:
-                logging.error(f"QA model error: {str(e)}")
-        # If QA model fails, use static knowledge base
-        if not qa_answer and topic_info['primary_category'] in FINANCIAL_KNOWLEDGE_BASE:
             knowledge = FINANCIAL_KNOWLEDGE_BASE[topic_info['primary_category']]
             for key, value in knowledge.items():
                 if key in user_input:
                     qa_answer = value
                     break
-        # Use Flan-T5 to generate the summary response
-        summary = "No recent news available."
-        if context_articles:
             # Deduplicate descriptions and limit to unique content
             descriptions = list(dict.fromkeys([article['description'] for article in context_articles]))
             summary = " ".join(descriptions[:2])  # Limit to 2 descriptions to avoid repetition
@@ -666,29 +596,11 @@ def chat():
             knowledge = FINANCIAL_KNOWLEDGE_BASE['Stock Market'].get('nifty trend', '')
             summary = knowledge + " " + summary
-        prompt = f"""You are a financial analyst providing concise answers as of {datetime.now().strftime('%Y-%m-%d')}.
-Query: {user_input}
-Context from recent news: {summary}
-Factual answer (if available): {qa_answer if qa_answer else 'Not found'}
-Provide a summary in 2-3 sentences (each under 30 words)."""
-        logging.info(f"Generated prompt: {prompt[:100]}...")
-        t5_response = generate_t5_response(prompt, max_length=80)
-        if t5_response:
-            summary_response = t5_response
-        else:
-            if qa_answer:
-                summary_response = f"As of {datetime.now().strftime('%b %d, %Y')}, {qa_answer}"
-            else:
-                summary_response = summary
-        # Ensure summary is within 30 words per sentence
-        summary_lines = summary_response.split('\n')
-        summary_lines = [line.strip() for line in summary_lines if line.strip()]
-        summary_lines = [line if len(line.split()) <= 30 else ' '.join(line.split()[:30]) + '.' for line in summary_lines]
         # Construct the response as a list of lines
         response_lines = ["**Summary**"]
         response_lines.extend(summary_lines)
         response_lines.append("")
         response_lines.append("**Investment Recommendations for Indian Investors**")
@@ -777,8 +689,6 @@ def health():
     return jsonify({
         "status": "healthy",
         "refresh_running": refresh_thread.is_alive(),
-        "qa_loaded": qa_loaded.is_set(),
-        "t5_loaded": t5_loaded.is_set(),
         "database": db_status
     })

 import requests
 from threading import Thread, Event
 import logging
 from typing import Dict, List
 from bs4 import BeautifulSoup
 # Configuration
 NEWS_API_KEY = os.environ.get('NEWS_API_KEY', '352f67b35a544f408c58c74c654cfd7e')
+MAX_NEWS_ARTICLES = 5  # Reduced to lower CPU usage during build
 API_CALL_INTERVAL = 10  # seconds
 REFRESH_INTERVAL = 7200  # 2 hours (increased to reduce CPU load)
 CACHE_EXPIRY_DURATION = 3600  # 60 minutes (increased to reduce API calls)
 last_api_call = 0
 cached_articles = []
 cache_expiry = None
+IS_BUILDING = os.environ.get('IS_BUILDING', 'false').lower() == 'true'  # Flag to skip heavy tasks during build
 # List of Indian finance news websites (reduced to avoid HTTP errors)
 WEBSITES = [
     except ValueError:
         return "Unknown time"
 # Function to fetch news from websites using BeautifulSoup and requests
 def fetch_news_from_websites() -> List[Dict]:
+    if IS_BUILDING:
+        logging.info("Skipping web scraping during build phase")
+        return []
     articles = []
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
     used_headlines = set()
             # Generic selectors (adjust per site)
             news_items = soup.select('h1, h2, h3, .story, .article, .headline, .title')
             for item in news_items:
+                if len(articles) >= 5:  # Further reduced limit to 5 articles
                     break
                 title = item.get_text(strip=True)[:100]
                 if title and title not in used_headlines:
                         })
         except Exception as e:
             logging.error(f"Failed to fetch from {url}: {str(e)}")
+        if len(articles) >= 5:
             break
     return articles
                             'age': calculate_age(article['publishedAt'])
                         })
+            # Fetch additional articles from websites if NewsAPI yields fewer than 5 articles
+            if len(processed) < 5 and not IS_BUILDING:
                 web_articles = fetch_news_from_websites()
+                processed.extend(web_articles[:5 - len(processed)])
             cached_articles = processed
             cache_expiry = current_time + CACHE_EXPIRY_DURATION
     logging.error("Max retry attempts reached for NewsAPI, returning cached articles")
     return cached_articles
+# Background Refresh Thread (disabled during build)
 stop_refresh = Event()
 def refresh_news_periodically():
+    if IS_BUILDING:
+        logging.info("Skipping background news refresh during build phase")
+        return
     while not stop_refresh.is_set():
         with app.app_context():
             fetch_news()
         time.sleep(REFRESH_INTERVAL)
 refresh_thread = Thread(target=refresh_news_periodically, daemon=True)
+if not IS_BUILDING:
+    refresh_thread.start()
 # Startup Logic
 with app.app_context():
 @app.route('/chat', methods=['POST'])
 def chat():
     logging.info("Received chat request")
     try:
         data = request.get_json()
         if not data or 'message' not in data:
         for article in context_articles:
             article['description'] = article['summary']
+        # Use static knowledge base for summary
+        summary = "No recent news available."
         qa_answer = None
+        if topic_info['primary_category'] in FINANCIAL_KNOWLEDGE_BASE:
             knowledge = FINANCIAL_KNOWLEDGE_BASE[topic_info['primary_category']]
             for key, value in knowledge.items():
                 if key in user_input:
                     qa_answer = value
+                    summary = value
                     break
+        if context_articles and summary == "No recent news available.":
             # Deduplicate descriptions and limit to unique content
             descriptions = list(dict.fromkeys([article['description'] for article in context_articles]))
             summary = " ".join(descriptions[:2])  # Limit to 2 descriptions to avoid repetition
             knowledge = FINANCIAL_KNOWLEDGE_BASE['Stock Market'].get('nifty trend', '')
             summary = knowledge + " " + summary
         # Construct the response as a list of lines
         response_lines = ["**Summary**"]
+        summary_lines = summary.split('. ')
+        summary_lines = [line.strip() for line in summary_lines if line.strip()]
+        summary_lines = [line if len(line.split()) <= 30 else ' '.join(line.split()[:30]) + '.' for line in summary_lines]
         response_lines.extend(summary_lines)
         response_lines.append("")
         response_lines.append("**Investment Recommendations for Indian Investors**")
     return jsonify({
         "status": "healthy",
         "refresh_running": refresh_thread.is_alive(),
         "database": db_status
     })