Spaces:
Paused
Paused
Commit
·
5167b0f
1
Parent(s):
29685bd
updating with summarizer
Browse files
- app.py +35 -6
- requirements.txt +3 -1
app.py
CHANGED
|
@@ -1,6 +1,9 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import yt_dlp as ydlp
|
|
|
|
|
|
|
| 3 |
from whispercpp import Whisper
|
|
|
|
| 4 |
|
| 5 |
def download_audio(youtube_url, output_folder='.'):
|
| 6 |
ydl_opts = {
|
|
@@ -32,21 +35,47 @@ def process_general_transcription(transcription):
|
|
| 32 |
transcript_str = "\n".join(formatted_transcription)
|
| 33 |
|
| 34 |
return transcript_str
|
| 35 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
download_audio(youtube_url)
|
| 37 |
result = w.transcribe("audio.wav")
|
| 38 |
text = w.extract_text(result)
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
| 40 |
with gr.Blocks() as demo:
|
| 41 |
gr.Markdown(
|
| 42 |
"""
|
| 43 |
# CPP Whisperer - Transcribe YouTube Videos
|
| 44 |
|
| 45 |
""")
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import yt_dlp as ydlp
|
| 3 |
+
from transformers import pipeline
|
| 4 |
+
|
| 5 |
from whispercpp import Whisper
|
| 6 |
+
# Summarization pipeline created once at module import; summarizing() calls
# this shared instance for every chunk instead of rebuilding it per request.
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
|
| 7 |
|
| 8 |
def download_audio(youtube_url, output_folder='.'):
|
| 9 |
ydl_opts = {
|
|
|
|
| 35 |
transcript_str = "\n".join(formatted_transcription)
|
| 36 |
|
| 37 |
return transcript_str
|
| 38 |
+
def chunk_to_tokens(text, n, max_words=512):
    """Split *text* into chunks of whitespace-separated words.

    The chunk size starts from ``min(word_count, max_words)`` and is then
    reduced by ``n`` percent, so a larger ``n`` produces smaller chunks.
    Note that "tokens" here are plain words from ``str.split()``, not
    model tokens.

    Args:
        text: The text to split. Words are delimited by any whitespace.
        n: Reduction percentage. Values outside ``0..100`` are clamped, so a
            negative value can never push a chunk past ``max_words``.
        max_words: Upper bound on the number of words per chunk.

    Returns:
        A list of chunk strings, each with its words re-joined by single
        spaces. Empty or whitespace-only text yields an empty list.
    """
    tokens = text.split()
    if not tokens:
        return []

    # Clamp so out-of-range percentages cannot enlarge a chunk beyond the cap.
    percentage = min(max(n, 0), 100)
    max_chunk_size = min(len(tokens), max_words)
    # Keep at least one word per chunk so the range step is never zero.
    token_size = max(1, int(max_chunk_size * (1 - percentage / 100)))

    return [
        " ".join(tokens[start:start + token_size])
        for start in range(0, len(tokens), token_size)
    ]
|
| 47 |
+
def summarizing(text, n):
    """Summarize *text* chunk by chunk and return the combined summary.

    The text is split with ``chunk_to_tokens(text, n)`` and every chunk is
    passed to the module-level ``summarizer`` pipeline.

    Args:
        text: The text to summarize.
        n: Percentage forwarded to ``chunk_to_tokens`` to control chunk size.

    Returns:
        The ``summary_text`` of each chunk, in order, each followed by a
        newline. An input that produces no chunks returns an empty string.
    """
    chunks = chunk_to_tokens(text, n)
    # Chunks are sized in words, not model tokens, so a chunk may tokenize to
    # more than the model accepts; truncation=True asks the pipeline to cut
    # the input down instead of failing on over-long sequences.
    return "".join(
        summarizer(chunk, truncation=True)[0]['summary_text'] + '\n'
        for chunk in chunks
    )
|
| 53 |
+
def transcribe_sum_youtube(youtube_url, n):
    """Download a YouTube video's audio, transcribe it and summarize it.

    Args:
        youtube_url: URL handed to ``download_audio``.
        n: Percentage forwarded to ``summarizing``.

    Returns:
        The string produced by ``summarizing`` for the processed transcript.
    """
    download_audio(youtube_url)
    # NOTE(review): presumably download_audio writes "audio.wav" into the
    # working directory -- confirm against its ydl_opts.
    raw_text = w.extract_text(w.transcribe("audio.wav"))
    return summarizing(process_general_transcription(raw_text), n)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
# Gradio UI: URL + percentage inputs on the left, summary output on the right.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # CPP Whisperer - Transcribe YouTube Videos

        """)
    with gr.Row():
        with gr.Column():

            inp = gr.Textbox(label="Youtube Url", placeholder="Insert YT Url here")
            # gr.Slider takes minimum/maximum/step; the previous
            # min_value/max_value/step_size keywords are not Slider parameters.
            inp2 = gr.Slider(label="Summarization Percentage", minimum=0, maximum=100, step=1)
            result_button_transcribe = gr.Button('Transcribe and Summarize')

        with gr.Column():
            out = gr.Textbox(label="Transcribed and Summarize Text")

    # Wire the button: (url, percentage) -> transcribe_sum_youtube -> output box.
    result_button_transcribe.click(transcribe_sum_youtube, inputs=[inp, inp2], outputs=out)


demo.launch()
|
requirements.txt
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
git+https://github.com/stlukey/whispercpp.py
|
| 2 |
gradio
|
| 3 |
-
yt_dlp
|
|
|
|
|
|
|
|
|
| 1 |
git+https://github.com/stlukey/whispercpp.py
|
| 2 |
gradio
|
| 3 |
+
yt_dlp
|
| 4 |
+
transformers
|
| 5 |
+
torch
|