legalsummarizer

Running

App Files Files Community

arithescientist committed on May 31, 2022

Commit

be03ede

1 Parent(s): a1065a5

Upload app.py

Browse files

Files changed (1) hide show

app.py +291 -0

app.py ADDED Viewed

	@@ -0,0 +1,291 @@

+#**************** IMPORT PACKAGES ********************
+import flask
+from flask import render_template, jsonify, Flask, redirect, url_for, request, flash
+from flask_cors import CORS, cross_origin
+from werkzeug.utils import secure_filename
+import numpy as np
+import pytesseract as pt
+import pdf2image
+from fpdf import FPDF
+import re
+import nltk
+from nltk.tokenize import sent_tokenize
+from nltk.tokenize import word_tokenize
+import os
+import pdfkit
+import yake
+from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
+from summarizer import Summarizer,TransformerSummarizer
+from transformers import pipelines
+# NOTE(review): the tokenizers used by the routes presumably need NLTK's
+# 'punkt' data; the download below is left disabled, so confirm the data is
+# already installed in the deployment image.
+#nltk.download('punkt')
+print("lets go")
+app = flask.Flask(__name__)
+app.config["DEBUG"] = True
+# flash() stores its messages in the session, and Flask refuses to open a
+# session without a secret key. Prefer a stable key from the environment and
+# fall back to a random per-process key (flashed messages then simply do not
+# survive a restart).
+app.secret_key = os.environ.get('SECRET_KEY') or os.urandom(24)
+# Uploaded documents are stored under this folder (relative to the CWD).
+UPLOAD_FOLDER = './pdfs'
+# Lower-case file extensions accepted by allowed_file().
+ALLOWED_EXTENSIONS = {'txt', 'pdf'}
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+#***************** FLASK *****************************
+# Enable CORS handling for the whole application.
+CORS(app)
+def allowed_file(filename):
+    """Return True when ``filename`` has an extension listed in ALLOWED_EXTENSIONS.
+
+    The extension is the text after the last dot, compared case-insensitively.
+    Names without any dot are rejected.
+    """
+    if '.' not in filename:
+        return False
+    extension = filename.rsplit('.', 1)[1]
+    return extension.lower() in ALLOWED_EXTENSIONS
+# Alternative checkpoints that were tried; only one model_name may be active.
+#model_name = 'laxya007/gpt2_legal'
+#model_name = 'facebook/bart-large-cnn'
+model_name = 'nlpaueb/legal-bert-base-uncased'
+#The setup of huggingface.co
+print("lets go")
+custom_config = AutoConfig.from_pretrained(model_name)
+print("lets go")
+# The configuration is switched to expose hidden states before the model is
+# built from it.
+custom_config.output_hidden_states=True
+print("lets go")
+custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
+print("lets go")
+# This line must stay active: Summarizer below receives custom_model, and
+# leaving it commented out makes the module fail with NameError on import.
+custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
+print("lets go")
+# Module-level summarizer shared by all request handlers.
+bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
+print('Using model {}\n'.format(model_name))
+# Landing page, served at the site root.
+@app.route('/')
+@cross_origin()
+def index():
+    """Render the ``index.html`` template for the root URL."""
+    landing_page = render_template('index.html')
+    return landing_page
+# The route decorator has to be outermost: Flask registers whatever callable
+# @app.route receives, so @cross_origin must already have wrapped the view.
+@app.route('/results')
+@cross_origin()
+def results():
+    """Render the ``results.html`` template."""
+    return render_template('results.html')
+def _require_sentence_count():
+    """Return the mandatory ``number`` query parameter as an int.
+
+    A missing parameter produces Flask's usual 400 response (raised by the
+    ``request.args`` lookup). A value that is not an integer is reported as
+    400 as well instead of escaping as an unhandled ValueError.
+
+    NOTE(review): the value is validated but not yet passed to the
+    summarizer, which keeps using its fixed settings - confirm the intent.
+    """
+    raw_value = request.args['number']
+    try:
+        return int(raw_value)
+    except ValueError:
+        flask.abort(400, description="'number' must be an integer")
+
+
+def _summarize_paragraphs(content):
+    """Summarize each blank-line-separated paragraph of ``content``.
+
+    Returns one string holding every paragraph summary followed by a blank
+    line. Paragraphs with at most one alphabetic token are skipped.
+    """
+    pieces = []
+    for paragraph in content.split("\n\n"):
+        paragraph = paragraph.replace('\n', ' ').replace('\t', '')
+        paragraph = ' '.join(paragraph.split())
+        # Keep purely alphabetic tokens only (drops numbers and punctuation).
+        tokens = [word for word in word_tokenize(paragraph) if word.isalpha()]
+        if len(tokens) <= 1:
+            continue
+        text = ' '.join(tokens)
+        print("\nParagraph:")
+        print(text + "\n")
+        summary = bert_legal_model(text, min_length=8, ratio=0.05)
+        pieces.append(str(summary) + "\n\n")
+        print("Summary:")
+        print(summary)
+    return ''.join(pieces)
+
+
+def _build_report(content):
+    """Return the report text: overall summary plus per-paragraph summaries."""
+    paragraph_summaries = _summarize_paragraphs(content)
+    flattened = content.replace('\n', ' ').replace('\t', '')
+    overall = bert_legal_model(flattened, min_length=8, num_sentences=25)
+    return ("The Summary-- " + str(overall) + "\n\n\n"
+            + "The Larger Summary-- " + paragraph_summaries)
+
+
+def _write_report_pdf(report, output_path="./static/legal.pdf"):
+    """Clean up ``report`` and write it as a one-font PDF to ``output_path``."""
+    # Characters outside latin-1 become '?' here; the first replacement below
+    # then turns every '?' (including genuine question marks) into '.'.
+    printable = report.encode('latin-1', 'replace').decode('latin-1')
+    # Order matters: '-- ' marks the section headings and is expanded last.
+    replacements = (('?', '.'), ('\n', ' '), ('..', '.'), (',.', ','),
+                    ('-- ', '\n\n\n'))
+    for old, new in replacements:
+        printable = printable.replace(old, new)
+    out_dir = os.path.dirname(output_path)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_font("Times", size=12)
+    pdf.multi_cell(190, 10, txt=printable, align='C')
+    pdf.output(output_path)
+
+
+def _ocr_pdf(pdf_path):
+    """Render ``pdf_path`` to page images, OCR them and return the text.
+
+    Each processed page is also saved as ``images/<pdf name>/<index>.jpg``.
+    The last page is left out, as before, except when the document has a
+    single page (skipping it would leave nothing to summarize).
+    """
+    stem = os.path.splitext(os.path.basename(pdf_path))[0]
+    pages = pdf2image.convert_from_path(pdf_path=pdf_path, dpi=400, size=(1654,2340))
+    print("\nProcessing the next {} pages...\n".format(len(pages)))
+    image_dir = os.path.join('images', stem)
+    os.makedirs(image_dir, exist_ok=True)
+    pages_to_read = pages[:-1] if len(pages) > 1 else pages
+    chunks = []
+    for index, page in enumerate(pages_to_read):
+        page.save(os.path.join(image_dir, str(index) + '.jpg'))
+        # OCR the page image with tesseract.
+        chunks.append(pt.image_to_string(page))
+    return ''.join(chunks)
+
+
+@app.route('/predict', methods=['GET', 'POST'])
+def uploads():
+    """Summarize the ``text`` query parameter and render the results page.
+
+    Requires the ``number`` (integer) and ``text`` query parameters. The
+    report is written to ``./static/legal.pdf``. Only GET is implemented;
+    other methods get 405 instead of a view that returns None.
+    """
+    if request.method != 'GET':
+        flask.abort(405)
+    _require_sentence_count()
+    content = str(request.args['text'])
+    _write_report_pdf(_build_report(content))
+    return render_template('results.html')
+
+
+@app.route('/predictpdf', methods=['GET', 'POST'])
+def uploads2():
+    """Summarize an uploaded PDF (form field ``file``) and render the results.
+
+    Requires the ``number`` (integer) query parameter. The upload is stored
+    as ``legal.pdf`` in the configured upload folder, OCR'd, summarized and
+    the report is written to ``./static/legal.pdf``. Requests without a
+    usable PDF are sent back to the index page with a flashed message.
+    """
+    if request.method != 'POST':
+        # Nothing to show for a plain GET; go back to the landing page.
+        return redirect(url_for('index'))
+    _require_sentence_count()
+    if 'file' not in request.files:
+        flash('No file part')
+        return redirect(url_for('index'))
+    upload = request.files['file']
+    # A browser submits an empty part without filename when nothing is chosen.
+    if upload.filename == '':
+        flash('No selected file')
+        return redirect(url_for('index'))
+    # Only PDFs can go through the image/OCR pipeline below.
+    is_pdf = upload.filename.lower().endswith('.pdf')
+    if not (allowed_file(upload.filename) and is_pdf):
+        flash('Only PDF files can be summarized')
+        return redirect(url_for('index'))
+    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+    # The upload is stored under a fixed name; the client-supplied filename
+    # is never used as a path on disk.
+    pdf_path = os.path.join(app.config['UPLOAD_FOLDER'], "legal.pdf")
+    upload.save(pdf_path)
+    print(pdf_path)
+    content = _ocr_pdf(pdf_path)
+    _write_report_pdf(_build_report(content))
+    return render_template('results.html')
+if __name__ == "__main__":
+    # The server listens on every interface, so the interactive debugger
+    # (which allows running arbitrary code from the browser) must not be on
+    # by default. Set FLASK_DEBUG=1 to opt in during local development.
+    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
+    app.run(host='0.0.0.0', port=8000, debug=debug_enabled)