Polarisailabs committed on
Commit dc77ca8 · verified · 1 Parent(s): 3416f15

Delete app.py

Files changed (1)
  1. app.py +0 -213
app.py DELETED
@@ -1,213 +0,0 @@
- # -*- coding: utf-8 -*-
- # Install required libraries if running outside Colab
- # !pip install gradio yt-dlp moviepy pillow speechrecognition llama-index lancedb google-generativeai
-
- import gradio as gr
- import yt_dlp
- from moviepy import VideoFileClip
- from pathlib import Path
- import speech_recognition as sr
- from PIL import Image
- import os
- import shutil
- import json
-
- # Pipeline helpers: download_video, video_to_images, video_to_audio,
- # audio_to_text, prepare_video
-
- def plot_images(image_paths):
-     # Keep up to 7 paths that point at real files; Gradio's Gallery renders
-     # the paths directly, so no matplotlib figure is needed here.
-     images_shown = 0
-     img_files = []
-     for img_path in image_paths:
-         if os.path.isfile(img_path):
-             img_files.append(img_path)
-             images_shown += 1
-             if images_shown >= 7:
-                 break
-     return img_files
-
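- # Note: plot_images is defined as a gallery helper but is never called below;
- # get_response assembles its own image list when loading files for Gemini.
-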
- def download_video(video_url, output_video_path="./video_data/"):
-     ydl_opts = {
-         "format": "bestvideo+bestaudio/best",
-         "merge_output_format": "mp4",
-         "outtmpl": os.path.join(output_video_path, "input_vid.mp4"),
-         "noplaylist": True,
-         "quiet": False,
-         # Uncomment and set your cookie file path if required
-         # "cookiefile": "cookies.txt",
-     }
-     Path(output_video_path).mkdir(parents=True, exist_ok=True)
-     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-         info = ydl.extract_info(video_url, download=True)
-         info = ydl.sanitize_info(info)
-     return {
-         "title": info.get("title"),
-         "uploader": info.get("uploader"),
-         "views": info.get("view_count"),
-     }
-
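- # Hypothetical usage (placeholder URL; requires network access):
- #   meta = download_video("https://www.youtube.com/watch?v=<VIDEO_ID>")
- #   print(meta["title"], meta["uploader"], meta["views"])
-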
- def video_to_images(video_path, output_folder):
-     Path(output_folder).mkdir(parents=True, exist_ok=True)
-     clip = VideoFileClip(video_path)
-     clip.write_images_sequence(
-         os.path.join(output_folder, "frame%04d.png"), fps=0.2
-     )
-
- def video_to_audio(video_path, output_audio_path):
-     clip = VideoFileClip(video_path)
-     audio = clip.audio
-     audio.write_audiofile(output_audio_path)
-
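- # fps=0.2 samples one frame every 5 seconds (about 120 PNGs for a
- # 10-minute video); raise it for denser coverage at the cost of disk space.
-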
- def audio_to_text(audio_path):
-     recognizer = sr.Recognizer()
-     try:
-         with sr.AudioFile(audio_path) as source:
-             audio_data = recognizer.record(source)
-             text = recognizer.recognize_google(audio_data)
-             return text
-     except sr.UnknownValueError:
-         print("Google Speech Recognition could not understand the audio.")
-     except sr.RequestError as e:
-         print(f"Could not request results: {e}")
-     return None
-
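- # recognize_google uses Google's free web Speech API: it requires internet
- # access and is most reliable on short clips, so long recordings may need
- # to be split into chunks before transcription.
-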
- def prepare_video(video_url,
-                   output_video_path="./video_data/",
-                   output_folder="./mixed_data/",
-                   output_audio_path="./mixed_data/output_audio.wav"):
-     filepath = os.path.join(output_video_path, "input_vid.mp4")
-     # meta = download_video(video_url, output_video_path)
-     video_to_images(filepath, output_folder)
-     video_to_audio(filepath, output_audio_path)
-     text_data = audio_to_text(output_audio_path)
-     text_path = os.path.join(output_folder, "output_text.txt")
-     with open(text_path, "w") as file:
-         file.write(text_data if text_data else "")
-     os.remove(output_audio_path)
-     # Placeholder metadata while the download step above stays disabled
-     meta = {
-         "title": "test",
-         "uploader": "uploader",
-         "views": "view_count",
-     }
-     return meta, text_data
-
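- # With the download call commented out, prepare_video assumes that
- # ./video_data/input_vid.mp4 already exists on disk; re-enable
- # download_video(...) above to fetch the file and real metadata instead.
-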
- from llama_index.core.indices import MultiModalVectorStoreIndex
- from llama_index.core import SimpleDirectoryReader, StorageContext
- from llama_index.vector_stores.lancedb import LanceDBVectorStore
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
- from llama_index.core import Settings
-
- def create_vector_db(image_txt_folder_path: str):
-     text_store = LanceDBVectorStore(uri="lancedb", table_name="text_collection")
-     image_store = LanceDBVectorStore(uri="lancedb", table_name="image_collection")
-     storage_context = StorageContext.from_defaults(
-         vector_store=text_store, image_store=image_store
-     )
-     Settings.embed_model = HuggingFaceEmbedding(
-         model_name="sentence-transformers/all-MiniLM-L6-v2"
-     )
-     documents = SimpleDirectoryReader(image_txt_folder_path).load_data()
-     index = MultiModalVectorStoreIndex.from_documents(
-         documents,
-         storage_context=storage_context,
-     )
-     retriever_engine = index.as_retriever(
-         similarity_top_k=2, image_similarity_top_k=3
-     )
-     return retriever_engine
-
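- # Sketch of the retrieval setup: transcript text is embedded with MiniLM,
- # while images go through llama-index's default multimodal image embedding
- # (CLIP, unless overridden); both collections persist as LanceDB tables
- # under ./lancedb.
-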
- from llama_index.core.schema import ImageNode
-
- def retrieve(retriever_engine, query_str):
-     retrieval_results = retriever_engine.retrieve(query_str)
-     retrieved_image = []
-     retrieved_text = []
-     for res_node in retrieval_results:
-         if isinstance(res_node.node, ImageNode):
-             retrieved_image.append(res_node.node.metadata["file_path"])
-         else:
-             retrieved_text.append(res_node.text)
-     return retrieved_image, retrieved_text
-
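- # retrieve splits hits by node type: ImageNode results contribute file paths
- # (gallery + Gemini input), while text nodes contribute transcript snippets
- # that become {context_str} in the prompt template below.
-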
- qa_tmpl_str = (
-     "Given the provided information, including relevant images and retrieved "
-     "context from the video, accurately and precisely answer the query "
-     "without any additional prior knowledge.\n"
-     "Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\n"
-     "---------------------\n"
-     "Context: {context_str}\n"
-     "Metadata for video: {metadata_str}\n"
-     "---------------------\n"
-     "Query: {query_str}\n"
-     "Answer: "
- )
-
- import google.generativeai as genai
-
- def get_response(retriever_engine, query_str, metadata_str, output_folder):
-     img, txt = retrieve(retriever_engine=retriever_engine, query_str=query_str)
-     context_str = "".join(txt)
-     prompt = qa_tmpl_str.format(
-         context_str=context_str, query_str=query_str, metadata_str=metadata_str
-     )
-
-     # Read the API key from the environment rather than hard-coding it.
-     GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
-     genai.configure(api_key=GOOGLE_API_KEY)
-     gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')
-
-     content_parts = [prompt]
-     image_paths = []
-     for img_path in img:
-         try:
-             image = Image.open(img_path)
-             content_parts.append(image)
-             image_paths.append(img_path)
-         except Exception as e:
-             print(f"Error loading image {img_path}: {e}")
-
-     response_1 = gemini_model.generate_content(content_parts)
-     result_text = response_1.text if hasattr(response_1, 'text') else str(response_1)
-     return result_text, image_paths
-
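- # The env-var name GOOGLE_API_KEY is a choice made here; export it before
- # launching:
- #   export GOOGLE_API_KEY="your-key-here"
- # (on Hugging Face Spaces, set it as a repository secret instead).
-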
- # Gradio interface function
- def gradio_chat(query):
-     output_video_path = "./video_data/"
-     output_folder = "./mixed_data/"
-     output_audio_path = "./mixed_data/output_audio.wav"
-     video_url = ""  # unused while download_video stays disabled in prepare_video
-
-     try:
-         metadata_vid, text_data = prepare_video(
-             video_url, output_video_path, output_folder, output_audio_path
-         )
-         metadata_str = json.dumps(metadata_vid)
-         retriever_engine = create_vector_db(output_folder)
-         result_text, image_paths = get_response(
-             retriever_engine, query, metadata_str, output_folder
-         )
-         # Cleanup (left disabled so intermediate artifacts can be inspected)
-         # if os.path.exists(output_video_path):
-         #     shutil.rmtree(output_video_path)
-         # if os.path.exists(output_folder):
-         #     shutil.rmtree(output_folder)
-         # Gradio can return text plus images (as a list of file paths)
-         return result_text, image_paths
-     except Exception as e:
-         return f"Error: {str(e)}", []
-
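- # gradio_chat returns (answer_text, image_path_list), matching the Textbox
- # and Gallery outputs declared below; failures surface as an error string
- # plus an empty gallery.
-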
- # Gradio UI
- gradio_ui = gr.Interface(
-     fn=gradio_chat,
-     inputs=[
-         gr.Textbox(label="Try: Beaches in Kerala"),
-     ],
-     outputs=[
-         gr.Textbox(label="Vega Response"),
-         gr.Gallery(label="Relevant Images", allow_preview=True),
-     ],
-     title="",
-     description=""
- )
-
- if __name__ == "__main__":
-     gradio_ui.launch(share=True)
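-
- # Note: share=True additionally exposes a temporary public gradio.live URL;
- # remove the flag to serve the demo on localhost only.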