Spaces:

Shivam29rathore
/

smaller-pegasus

Runtime error

App Files Files Community

smaller-pegasus / app.py

Shivam29rathore

Update app.py

71fdb73 over 3 years ago

raw

history blame contribute delete

4.17 kB

	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
	import pickle
	import torch
	from transformers import PegasusTokenizer, PegasusForConditionalGeneration
	import tensorflow as tf
	from tensorflow.python.lib.io import file_io
	from nltk.tokenize import sent_tokenize


	import io









	# Turn off TensorFlow eager execution for the whole process.
	# NOTE(review): no other TensorFlow call is visible in this file - confirm
	# this switch is still required.
	tf.compat.v1.disable_eager_execution()
	# Load the Pegasus tokenizer and model once, at import time, so that every
	# call to pegasus() reuses the same module-level objects.
	model_name = "human-centered-summarization/financial-summarization-pegasus"
	tokenizer = PegasusTokenizer.from_pretrained(model_name)
	model2 = PegasusForConditionalGeneration.from_pretrained(model_name)


	#tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	#model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


	import nltk
	from finbert_embedding.embedding import FinbertEmbedding
	import pandas as pd
	from nltk.cluster import KMeansClusterer
	import numpy as np
	import os
	from scipy.spatial import distance_matrix
	from tensorflow.python.lib.io import file_io
	import pickle

	# Fetch NLTK's 'punkt' sentence-tokenizer data once at import time;
	# sent_tokenize relies on it.
	nltk.download('punkt')


	def _chunk_sentences(sentences, max_chars=512):
	    '''Group sentences into consecutive chunks of bounded character length.

	    Each sentence is stripped of surrounding whitespace and empty ones are
	    skipped. A chunk is closed as soon as adding the next sentence would
	    bring its running character count to ``max_chars`` or more. A sentence
	    that is longer than the budget on its own becomes a chunk by itself, so
	    no text is dropped and no empty chunk is ever produced.

	    sentences: iterable of sentence strings.
	    max_chars: character budget per chunk.

	    Returns a list of chunks, each a non-empty list of sentence strings.'''
	    chunks = []
	    current = []
	    length = 0
	    for sentence in sentences:
	        sentence = sentence.strip()
	        if not sentence:
	            continue
	        # Only flush a chunk that actually holds something; otherwise an
	        # over-long first sentence would emit an empty chunk.
	        if current and length + len(sentence) >= max_chars:
	            chunks.append(current)
	            current = []
	            length = 0
	        current.append(sentence)
	        length += len(sentence)
	    if current:
	        chunks.append(current)
	    return chunks


	def pegasus(text):
	    '''Summarize a document with the Pegasus model, chunk by chunk.

	    The text is split into sentences with NLTK's sent_tokenize, the
	    sentences are grouped into chunks of fewer than 512 characters, each
	    chunk is summarized independently with the module-level ``model2`` and
	    ``tokenizer``, and the chunk summaries are joined with single spaces.

	    text: the document to summarize.

	    Returns the combined summary as one string. Empty, whitespace-only or
	    None input yields an empty string without touching the model.'''
	    if not text or not text.strip():
	        return ""

	    # Lower bound on the generation budget so short chunks are not cut
	    # down to a handful of tokens.
	    min_summary_tokens = 32
	    # Aim for a summary roughly a quarter the size of its input chunk.
	    compression_ratio = 4

	    chunks = _chunk_sentences(sent_tokenize(text))

	    # Use GPU if available; move the model once instead of once per chunk.
	    device = 'cuda' if torch.cuda.is_available() else 'cpu'
	    model = model2.to(device)

	    # Kept local so that one request cannot see or reset another's results.
	    summaries = []
	    for chunk in chunks:
	        # truncation=True lets the tokenizer cut an over-long chunk down to
	        # its configured maximum length.
	        inputs = tokenizer.encode(' '.join(chunk), truncation=True, return_tensors='pt')
	        inputs = inputs.to(device)
	        # Size the output budget from this chunk's own token count.
	        max_length = max(min_summary_tokens, inputs.shape[-1] // compression_ratio)
	        generated = model.generate(inputs, do_sample=True,
	                                   max_length=max_length, top_k=50, top_p=0.95,
	                                   num_beams=5, early_stopping=True)
	        summaries.extend(
	            tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
	            for ids in generated
	        )

	    # Separate chunk summaries with a space so sentences do not run together.
	    return " ".join(summaries)


	import gradio as gr




	# Build the Gradio UI: a 15-line textbox in, the pegasus() summary out.
	# NOTE(review): gr.inputs / gr.outputs are Gradio's legacy component
	# namespaces; newer releases expose gr.Textbox directly - confirm against
	# the Gradio version this app is pinned to.
	interface1 = gr.Interface(fn=pegasus,
	                          inputs=gr.inputs.Textbox(lines=15, placeholder="Enter your text !!", label='Input-10k Sections'),
	                          outputs=gr.outputs.Textbox(label='Output- Pegasus'))
	# Launch as a separate statement so interface1 stays bound to the Interface
	# object rather than to whatever launch() returns.
	interface1.launch()