Spaces:
Sleeping
Sleeping
| ## Brochure app: takes a company website, collects its links, fetches the content of the relevant pages, and generates a brochure from the combined data | |
| from bs4 import BeautifulSoup | |
| import requests | |
| import json | |
| import gradio as gr | |
| from langchain_groq import ChatGroq | |
| import os | |
| # A class to represent a Webpage | |
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Everything is populated eagerly by ``__init__``:

    url    -- the address that was requested
    body   -- ``response.content`` of the HTTP response
    title  -- string of the <title> tag, or "No title found"
    text   -- text of <body> after script/style/img/input elements are removed
              ("" when the document has no <body>)
    links  -- every non-empty ``href`` found on an <a> tag, exactly as written
              in the page (so entries may be relative)
    """

    def __init__(self, url, timeout=10):
        """
        Fetch ``url`` and parse the response with BeautifulSoup.

        ``timeout`` (seconds) is forwarded to ``requests.get``; without one an
        unresponsive server would block the call indefinitely.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # ``.string`` is None when <title> is empty or contains nested tags;
        # fall back to the placeholder instead of storing None.
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            self.title = title_tag.string
        else:
            self.title = "No title found"

        if soup.body:
            # Drop elements that carry no readable brochure content.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually carry an href value.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and page text formatted as one prompt section."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
| ## Check whether the links are relevant or not | |
def get_links_system_prompt(website):
    """
    Build the system prompt that asks the model to pick brochure-relevant links.

    ``website`` is accepted for symmetry with ``get_links_user_prompt`` but is
    not read here. The returned prompt ends with a JSON example of the
    expected reply shape: an object with a "links" list whose items have
    "type" and "url" keys.
    """
    instructions = (
        "You are provided with a list of links found on a webpage. "
        "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
        "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
        "You should respond in JSON as in this example, do not say anything else:"
    )
    # The example must itself be valid JSON: the model is told to imitate it
    # and the reply is later parsed with json.loads.
    example = """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
    return instructions + example
def get_links_user_prompt(website):
    """
    Build the user prompt listing every link found on ``website``.

    Reads ``website.url`` and ``website.links``; the links are appended one
    per line, unchanged, after the instructions.
    """
    pieces = [
        f"Here is the list of links on the website of {website.url} - ",
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. ",
        "Do not include Terms of Service, Privacy, email links.\n",
        "Links (some might be relative links):\n",
        "\n".join(website.links),
    ]
    return "".join(pieces)
def get_relevant_links(url, llm):
    """
    Ask ``llm`` which links on the page at ``url`` belong in a brochure.

    Scrapes ``url``, sends the link-selection system and user prompts through
    ``llm.invoke`` and returns the reply's ``content`` parsed as JSON.

    Raises ``json.JSONDecodeError`` when no JSON can be recovered from the
    reply.
    """
    website = Website(url)
    messages = [
        ("system", get_links_system_prompt(website)),
        ("human", get_links_user_prompt(website)),
    ]
    reply = llm.invoke(messages).content
    try:
        return json.loads(reply)
    except json.JSONDecodeError:
        # Replies are often wrapped in a markdown code fence or surrounded by
        # a sentence of prose; retry on the outermost {...} span only.
        start = reply.find("{")
        end = reply.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(reply[start:end + 1])
def get_all_details(url, llm):
    """
    Collect the text of the landing page and of every relevant linked page.

    The landing page at ``url`` is scraped first, then ``get_relevant_links``
    selects further pages; each one is scraped and appended under a heading
    taken from the link's "type" value. The chosen links are also printed.

    Returns one string containing all sections.
    """
    # Local import: keeps this function self-contained without touching the
    # file's import block.
    from urllib.parse import urljoin

    sections = ["Landing page:\n", Website(url).get_contents()]
    links = get_relevant_links(url, llm)
    print("Found links:", links)
    for link in links["links"]:
        # The model may echo a relative href (e.g. "/about"); resolve it
        # against the landing page. Absolute URLs pass through unchanged.
        target = urljoin(url, link["url"])
        sections.append(f"\n\n{link['type']}\n")
        sections.append(Website(target).get_contents())
    return "".join(sections)
def get_brochure_system_prompt():
    """
    Return the system prompt for the brochure-writing request.

    The prompt asks for a short markdown brochure aimed at prospective
    customers, investors and recruits.
    """
    # Adjacent literals are used instead of backslash continuations so the
    # space between sentences is explicit ("markdown. Include", not
    # "markdown.Include").
    return (
        "You are an assistant that analyzes the contents of several relevant pages from a company website "
        "and creates a short brochure about the company for prospective customers, investors and recruits. "
        "Respond in markdown. "
        "Include details of company culture, customers and careers/jobs if you have the information."
    )
def get_brochure_user_prompt(company_name, url, llm):
    """
    Build the user prompt for the brochure request.

    The prompt names the company, states the task, and appends the scraped
    page contents returned by ``get_all_details(url, llm)``. The whole prompt
    is cut to at most 20,000 characters.
    """
    max_chars = 20_000
    intro = f"You are looking at a company called: {company_name}\n"
    task = "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    full_prompt = intro + task + get_all_details(url, llm)
    # Slicing leaves shorter prompts untouched.
    return full_prompt[:max_chars]
def create_brochure(company_name, url, llm_choice, groq_api_key):
    """
    Generate a markdown brochure for ``company_name`` from the site at ``url``.

    ``llm_choice`` must be a key of ``llm_options``; ``groq_api_key`` is the
    caller's Groq API key. Returns the ``content`` of the model's reply.
    """
    # Hand the key to the client directly instead of writing it to
    # os.environ: the environment is process-wide, so one user's key would
    # otherwise stay behind for (and race with) other requests.
    llm = ChatGroq(model=llm_options[llm_choice], api_key=groq_api_key)
    messages = [
        ("system", get_brochure_system_prompt()),
        # Building this prompt scrapes the site and makes the link-selection call.
        ("human", get_brochure_user_prompt(company_name, url, llm)),
    ]
    # Single blocking call; the complete reply text is returned.
    return llm.invoke(messages).content
# Maps the label shown in the UI dropdown to the model identifier passed to ChatGroq.
llm_options = {
    "Gemma2": "gemma2-9b-it",
    "LLama": "llama-3.2-3b-preview",
    "Mixtral": "mixtral-8x7b-32768",
}
| # Gradio interface | |
# Input components, listed in the same order as create_brochure's parameters.
interface_inputs = [
    "text",  # Company name
    "text",  # Website URL
    gr.Dropdown(choices=list(llm_options), label="Select LLM"),
    gr.Textbox(type="password", label="Enter Groq API Key"),
]

demo = gr.Interface(
    fn=create_brochure,
    inputs=interface_inputs,
    outputs="markdown",  # The returned brochure text is rendered as markdown
    title="Brochure Generator",
    description="Generate brochures by selecting a company name, URL, and LLM. Provide your Groq API Key.",
)

demo.launch()