Spaces:
Runtime error
Runtime error
Commit
·
ec64fd3
1
Parent(s):
26691c8
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,7 +11,7 @@ import collections
|
|
| 11 |
from datasets import load_dataset
|
| 12 |
|
| 13 |
dataset_small = load_dataset("pierreguillou/DocLayNet-small")
|
| 14 |
-
|
| 15 |
|
| 16 |
id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
|
| 17 |
label2id = {label:idx for idx,label in id2label.items()}
|
|
@@ -89,7 +89,7 @@ font = ImageFont.load_default()
|
|
| 89 |
dataset_names = ["small", "base"]
|
| 90 |
splits = ["all", "train", "validation", "test"]
|
| 91 |
domains = ["all", "Financial Reports", "Manuals", "Scientific Articles", "Laws & Regulations", "Patents", "Government Tenders"]
|
| 92 |
-
domains_names = [domain_name.lower().replace(" ", "_") for domain_name in domains]
|
| 93 |
categories = labels + ["all"]
|
| 94 |
|
| 95 |
# function to get a random image and all data from DocLayNet
|
|
@@ -113,7 +113,7 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
| 113 |
if domain_name != "all":
|
| 114 |
example = example.filter(lambda example: example["doc_category"] == domain_name)
|
| 115 |
if len(example) == 0:
|
| 116 |
-
msg_error = f'There is no image with at least one
|
| 117 |
example = dict()
|
| 118 |
|
| 119 |
# get category
|
|
@@ -125,16 +125,23 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
| 125 |
if len(idx_list) > 0:
|
| 126 |
example = example.select(idx_list)
|
| 127 |
else:
|
| 128 |
-
msg_error = f'There is no image with at least one
|
| 129 |
example = dict()
|
| 130 |
|
| 131 |
if len(msg_error) > 0:
|
| 132 |
-
# save
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
Image.open(images_wo_content).save("img_paragraphs.png")
|
| 134 |
Image.open(images_wo_content).save("img_lines.png")
|
|
|
|
| 135 |
df_paragraphs_wo_content.to_csv("paragraphs.csv", encoding="utf-8", index=False)
|
| 136 |
df_lines_wo_content.to_csv("lines.csv", encoding="utf-8", index=False)
|
| 137 |
-
|
|
|
|
| 138 |
else:
|
| 139 |
# get random image & PDF data
|
| 140 |
index = random.randint(0, len(example))
|
|
@@ -148,6 +155,9 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
| 148 |
# resize image to original
|
| 149 |
image = image.resize((original_width, original_height))
|
| 150 |
|
|
|
|
|
|
|
|
|
|
| 151 |
# get corresponding annotations
|
| 152 |
texts = example[index]["texts"]
|
| 153 |
bboxes_block = example[index]["bboxes_block"]
|
|
@@ -155,10 +165,6 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
| 155 |
categories = example[index]["categories"]
|
| 156 |
domain = example[index]["doc_category"]
|
| 157 |
|
| 158 |
-
# get list of categories
|
| 159 |
-
categories_unique = sorted(list(set([categories_list for categories_list in categories])))
|
| 160 |
-
categories_unique = [id2label[idx] for idx in categories_unique]
|
| 161 |
-
|
| 162 |
# convert boxes to original
|
| 163 |
original_bboxes_block = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_block]
|
| 164 |
original_bboxes_line = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_line]
|
|
@@ -207,7 +213,7 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
| 207 |
sorted_category_line_list = np.array(category_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
|
| 208 |
sorted_text_line_list = np.array(text_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
|
| 209 |
|
| 210 |
-
# setup images &
|
| 211 |
columns = 2
|
| 212 |
images = [image.copy(), image.copy()]
|
| 213 |
num_imgs = len(images)
|
|
@@ -252,77 +258,79 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
| 252 |
# save
|
| 253 |
df_lines.to_csv("lines.csv", encoding="utf-8", index=False)
|
| 254 |
|
| 255 |
-
msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your
|
| 256 |
|
| 257 |
-
return msg, imgs["paragraphs"], imgs["lines"], "img_paragraphs.png", "img_lines.png", df_paragraphs, df_lines, gr.File.update(value="paragraphs.csv", visible=True), gr.File.update(value="lines.csv", visible=True)
|
| 258 |
|
| 259 |
# gradio APP
|
| 260 |
with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
| 261 |
gr.HTML("""
|
| 262 |
<div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>DocLayNet image viewer</h1></div>
|
| 263 |
-
<div style="margin-top:
|
| 264 |
-
<div><p>It uses the
|
| 265 |
-
<div><p>Make your
|
| 266 |
-
<div><p>
|
| 267 |
-
<div
|
|
|
|
| 268 |
""")
|
| 269 |
with gr.Row():
|
| 270 |
with gr.Column():
|
| 271 |
-
dataset_name_gr = gr.Radio(
|
| 272 |
with gr.Column():
|
| 273 |
split_gr = gr.Dropdown(splits, value="all", label="Split")
|
| 274 |
with gr.Column():
|
| 275 |
domain_gr = gr.Dropdown(domains, value="all", label="Domain")
|
| 276 |
with gr.Column():
|
| 277 |
category_gr = gr.Dropdown(categories, value="all", label="Category")
|
| 278 |
-
btn = gr.Button("Display PDF image")
|
| 279 |
with gr.Row():
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
| 281 |
with gr.Row():
|
| 282 |
-
# with gr.Column():
|
| 283 |
-
# json = gr.JSON(label="JSON")
|
| 284 |
with gr.Column():
|
| 285 |
-
img_paragraphs_file = gr.File(
|
| 286 |
img_paragraphs = gr.Image(type="pil", label="Bounding boxes of labeled paragraphs", visible=True)
|
| 287 |
with gr.Column():
|
| 288 |
-
img_lines_file = gr.File(
|
| 289 |
img_lines = gr.Image(type="pil", label="Bounding boxes of labeled lines", visible=True)
|
| 290 |
with gr.Row():
|
| 291 |
with gr.Column():
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
with gr.Column():
|
| 305 |
-
|
|
|
|
|
|
|
| 306 |
df_lines = gr.Dataframe(
|
| 307 |
headers=["lines", "categories", "texts", "bounding boxes"],
|
| 308 |
datatype=["number", "str", "str", "str"],
|
| 309 |
-
# row_count='dynamic',
|
| 310 |
col_count=(4, "fixed"),
|
| 311 |
-
interactive=False,
|
| 312 |
visible=True,
|
| 313 |
label="Lines data",
|
| 314 |
type="pandas",
|
| 315 |
wrap=True
|
| 316 |
)
|
| 317 |
-
btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
|
| 318 |
|
| 319 |
gr.Markdown("## Example")
|
| 320 |
gr.Examples(
|
| 321 |
[["small", "all", "all", "all"]],
|
| 322 |
[dataset_name_gr, split_gr, domain_gr, category_gr],
|
| 323 |
-
[output_msg, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
|
| 324 |
fn=generate_annotated_image,
|
| 325 |
cache_examples=True,
|
| 326 |
)
|
| 327 |
|
| 328 |
-
demo.launch()
|
|
|
|
| 11 |
from datasets import load_dataset
|
| 12 |
|
| 13 |
dataset_small = load_dataset("pierreguillou/DocLayNet-small")
|
| 14 |
+
dataset_base = load_dataset("pierreguillou/DocLayNet-base")
|
| 15 |
|
| 16 |
id2label = {idx:label for idx,label in enumerate(dataset_small["train"].features["categories"].feature.names)}
|
| 17 |
label2id = {label:idx for idx,label in id2label.items()}
|
|
|
|
| 89 |
dataset_names = ["small", "base"]
|
| 90 |
splits = ["all", "train", "validation", "test"]
|
| 91 |
domains = ["all", "Financial Reports", "Manuals", "Scientific Articles", "Laws & Regulations", "Patents", "Government Tenders"]
|
| 92 |
+
domains_names = [domain_name.lower().replace(" ", "_").replace("&", "and") for domain_name in domains]
|
| 93 |
categories = labels + ["all"]
|
| 94 |
|
| 95 |
# function to get a random image and all data from DocLayNet
|
|
|
|
| 113 |
if domain_name != "all":
|
| 114 |
example = example.filter(lambda example: example["doc_category"] == domain_name)
|
| 115 |
if len(example) == 0:
|
| 116 |
+
msg_error = f'There is no image with at least one labeled bounding box that matches your settings (dataset: "DocLayNet {dataset_name}" / domain: "{domain}" / split: "{split}").'
|
| 117 |
example = dict()
|
| 118 |
|
| 119 |
# get category
|
|
|
|
| 125 |
if len(idx_list) > 0:
|
| 126 |
example = example.select(idx_list)
|
| 127 |
else:
|
| 128 |
+
msg_error = f'There is no image with at least one labeled bounding box that matches your settings (dataset: "DocLayNet {dataset_name}" / split: "{split}" / domain: "{domain}" / category: "{category}").'
|
| 129 |
example = dict()
|
| 130 |
|
| 131 |
if len(msg_error) > 0:
|
| 132 |
+
# save PDF
|
| 133 |
+
rgba = Image.open(images_wo_content)
|
| 134 |
+
rgb = Image.new('RGB', rgba.size, (255, 255, 255)) # white background
|
| 135 |
+
rgb.paste(rgba, mask=rgba.split()[3]) # paste using alpha channel as mask
|
| 136 |
+
rgb.save("wo_content.pdf", 'PDF', resolution=100.0)
|
| 137 |
+
# save image files
|
| 138 |
Image.open(images_wo_content).save("img_paragraphs.png")
|
| 139 |
Image.open(images_wo_content).save("img_lines.png")
|
| 140 |
+
# save csv files
|
| 141 |
df_paragraphs_wo_content.to_csv("paragraphs.csv", encoding="utf-8", index=False)
|
| 142 |
df_lines_wo_content.to_csv("lines.csv", encoding="utf-8", index=False)
|
| 143 |
+
|
| 144 |
+
return msg_error, "wo_content.pdf", images_wo_content, images_wo_content, "img_paragraphs.png", "img_lines.png", df_paragraphs_wo_content, df_lines_wo_content, gr.File.update(value="paragraphs.csv", visible=False), gr.File.update(value="lines.csv", visible=False)
|
| 145 |
else:
|
| 146 |
# get random image & PDF data
|
| 147 |
index = random.randint(0, len(example))
|
|
|
|
| 155 |
# resize image to original
|
| 156 |
image = image.resize((original_width, original_height))
|
| 157 |
|
| 158 |
+
# get pdf of image
|
| 159 |
+
image.save(original_filename)
|
| 160 |
+
|
| 161 |
# get corresponding annotations
|
| 162 |
texts = example[index]["texts"]
|
| 163 |
bboxes_block = example[index]["bboxes_block"]
|
|
|
|
| 165 |
categories = example[index]["categories"]
|
| 166 |
domain = example[index]["doc_category"]
|
| 167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
# convert boxes to original
|
| 169 |
original_bboxes_block = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_block]
|
| 170 |
original_bboxes_line = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_line]
|
|
|
|
| 213 |
sorted_category_line_list = np.array(category_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
|
| 214 |
sorted_text_line_list = np.array(text_line_list)[sorted_original_bboxes_line_list_indexes].tolist()
|
| 215 |
|
| 216 |
+
# setup images & PDF data
|
| 217 |
columns = 2
|
| 218 |
images = [image.copy(), image.copy()]
|
| 219 |
num_imgs = len(images)
|
|
|
|
| 258 |
# save
|
| 259 |
df_lines.to_csv("lines.csv", encoding="utf-8", index=False)
|
| 260 |
|
| 261 |
+
msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your settings.'
|
| 262 |
|
| 263 |
+
return msg, original_filename, imgs["paragraphs"], imgs["lines"], "img_paragraphs.png", "img_lines.png", df_paragraphs, df_lines, gr.File.update(value="paragraphs.csv", visible=True), gr.File.update(value="lines.csv", visible=True)
|
| 264 |
|
| 265 |
# gradio APP
|
| 266 |
with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
| 267 |
gr.HTML("""
|
| 268 |
<div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>DocLayNet image viewer</h1></div>
|
| 269 |
+
<div style="margin-top: 40px"><p>(01/29/2023) This APP is an image viewer of the DocLayNet dataset and a data extraction tool.</p></div>
|
| 270 |
+
<div><p>It uses the datasets <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> (you can also run this APP in Google Colab by running this <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>).</p></div>
|
| 271 |
+
<div><p>Make your settings and the output will show 2 images of a randomly selected PDF with labeled bounding boxes, one of paragraphs and the other of lines, and their corresponding tables of texts with their labels.</p></div>
|
| 272 |
+
<div><p>For example, if you select the domain "laws_and_regulations" and the category "Caption", you will get a random PDF that corresponds to these settings (ie, it will have at least one bounding box labeled with "Caption" in the PDF).</p></div>
|
| 273 |
+
<div style="margin-top: 20px"><p>More information about the DocLayNet datasets and this APP in the 2 following blog posts:</p></div>
|
| 274 |
+
<div><ul><li>- <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li>- <a href="" target="_blank">(01/31/2023) Document AI | DocLayNet image viewer APP</a></li></ul></div>
|
| 275 |
""")
|
| 276 |
with gr.Row():
|
| 277 |
with gr.Column():
|
| 278 |
+
dataset_name_gr = gr.Radio(dataset_names, value="small", label="DocLayNet dataset")
|
| 279 |
with gr.Column():
|
| 280 |
split_gr = gr.Dropdown(splits, value="all", label="Split")
|
| 281 |
with gr.Column():
|
| 282 |
domain_gr = gr.Dropdown(domains, value="all", label="Domain")
|
| 283 |
with gr.Column():
|
| 284 |
category_gr = gr.Dropdown(categories, value="all", label="Category")
|
| 285 |
+
btn = gr.Button("Display labeled PDF image & data")
|
| 286 |
with gr.Row():
|
| 287 |
+
with gr.Column():
|
| 288 |
+
output_msg = gr.Textbox(label="Output message")
|
| 289 |
+
with gr.Column():
|
| 290 |
+
pdf_file = gr.File(visible=True, label="PDF file (original)")
|
| 291 |
with gr.Row():
|
|
|
|
|
|
|
| 292 |
with gr.Column():
|
| 293 |
+
img_paragraphs_file = gr.File(visible=True, label="Image file (labeled paragraphs)")
|
| 294 |
img_paragraphs = gr.Image(type="pil", label="Bounding boxes of labeled paragraphs", visible=True)
|
| 295 |
with gr.Column():
|
| 296 |
+
img_lines_file = gr.File(visible=True, label="Image file (labeled lines)")
|
| 297 |
img_lines = gr.Image(type="pil", label="Bounding boxes of labeled lines", visible=True)
|
| 298 |
with gr.Row():
|
| 299 |
with gr.Column():
|
| 300 |
+
with gr.Row():
|
| 301 |
+
csv_paragraphs = gr.File(visible=False, label="CSV file (paragraphs)")
|
| 302 |
+
with gr.Row():
|
| 303 |
+
df_paragraphs = gr.Dataframe(
|
| 304 |
+
headers=["paragraphs", "categories", "texts", "bounding boxes"],
|
| 305 |
+
datatype=["number", "str", "str", "str"],
|
| 306 |
+
col_count=(4, "fixed"),
|
| 307 |
+
visible=True,
|
| 308 |
+
label="Paragraphs data",
|
| 309 |
+
type="pandas",
|
| 310 |
+
wrap=True
|
| 311 |
+
)
|
| 312 |
with gr.Column():
|
| 313 |
+
with gr.Row():
|
| 314 |
+
csv_lines = gr.File(visible=False, label="CSV file (lines)")
|
| 315 |
+
with gr.Row():
|
| 316 |
df_lines = gr.Dataframe(
|
| 317 |
headers=["lines", "categories", "texts", "bounding boxes"],
|
| 318 |
datatype=["number", "str", "str", "str"],
|
|
|
|
| 319 |
col_count=(4, "fixed"),
|
|
|
|
| 320 |
visible=True,
|
| 321 |
label="Lines data",
|
| 322 |
type="pandas",
|
| 323 |
wrap=True
|
| 324 |
)
|
| 325 |
+
btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, pdf_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
|
| 326 |
|
| 327 |
gr.Markdown("## Example")
|
| 328 |
gr.Examples(
|
| 329 |
[["small", "all", "all", "all"]],
|
| 330 |
[dataset_name_gr, split_gr, domain_gr, category_gr],
|
| 331 |
+
[output_msg, pdf_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
|
| 332 |
fn=generate_annotated_image,
|
| 333 |
cache_examples=True,
|
| 334 |
)
|
| 335 |
|
| 336 |
+
demo.launch()
|