Spaces:
Runtime error
Runtime error
Commit
·
7e13339
1
Parent(s):
ec64fd3
Update app.py
Browse files
app.py
CHANGED
|
@@ -129,19 +129,13 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
| 129 |
example = dict()
|
| 130 |
|
| 131 |
if len(msg_error) > 0:
|
| 132 |
-
# save PDF
|
| 133 |
-
rgba = Image.open(images_wo_content)
|
| 134 |
-
rgb = Image.new('RGB', rgba.size, (255, 255, 255)) # white background
|
| 135 |
-
rgb.paste(rgba, mask=rgba.split()[3]) # paste using alpha channel as mask
|
| 136 |
-
rgb.save("wo_content.pdf", 'PDF', resolution=100.0)
|
| 137 |
# save image files
|
| 138 |
-
Image.open(images_wo_content).save("
|
| 139 |
-
Image.open(images_wo_content).save("img_lines.png")
|
| 140 |
# save csv files
|
| 141 |
-
df_paragraphs_wo_content.to_csv("
|
| 142 |
-
df_lines_wo_content.to_csv("
|
| 143 |
|
| 144 |
-
return msg_error, "wo_content.
|
| 145 |
else:
|
| 146 |
# get random image & PDF data
|
| 147 |
index = random.randint(0, len(example))
|
|
@@ -155,8 +149,9 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
| 155 |
# resize image to original
|
| 156 |
image = image.resize((original_width, original_height))
|
| 157 |
|
| 158 |
-
# get
|
| 159 |
-
|
|
|
|
| 160 |
|
| 161 |
# get corresponding annotations
|
| 162 |
texts = example[index]["texts"]
|
|
@@ -233,8 +228,9 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
| 233 |
if i == 0:
|
| 234 |
imgs["paragraphs"] = img
|
| 235 |
|
| 236 |
-
# save
|
| 237 |
-
|
|
|
|
| 238 |
|
| 239 |
df_paragraphs["paragraphs"] = list(range(len(sorted_original_bboxes_block_list)))
|
| 240 |
df_paragraphs["categories"] = [id2label[label_idx] for label_idx in sorted_category_block_list]
|
|
@@ -242,13 +238,15 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
| 242 |
df_paragraphs["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_block_list]
|
| 243 |
|
| 244 |
# save
|
| 245 |
-
|
|
|
|
| 246 |
|
| 247 |
else:
|
| 248 |
imgs["lines"] = img
|
| 249 |
|
| 250 |
# save
|
| 251 |
-
|
|
|
|
| 252 |
|
| 253 |
df_lines["lines"] = list(range(len(sorted_original_bboxes_line_list)))
|
| 254 |
df_lines["categories"] = [id2label[label_idx] for label_idx in sorted_category_line_list]
|
|
@@ -256,11 +254,12 @@ def generate_annotated_image(dataset_name, split, domain, category):
|
|
| 256 |
df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
|
| 257 |
|
| 258 |
# save
|
| 259 |
-
|
|
|
|
| 260 |
|
| 261 |
msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your settings.'
|
| 262 |
|
| 263 |
-
return msg,
|
| 264 |
|
| 265 |
# gradio APP
|
| 266 |
with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
|
@@ -270,8 +269,7 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
|
| 270 |
<div><p>It uses the datasets <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> (you can also run this APP in Google Colab by running this <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>).</p></div>
|
| 271 |
<div><p>Make your settings and the output will show 2 images of a randomly selected PDF with labeled bounding boxes, one of paragraphs and the other of lines, and their corresponding tables of texts with their labels.</p></div>
|
| 272 |
<div><p>For example, if you select the domain "laws_and_regulations" and the category "Caption", you will get a random PDF that corresponds to these settings (ie, it will have at least one bounding box labeled with "Caption" in the PDF).</p></div>
|
| 273 |
-
<div style="margin-top: 20px"><p>More information about the DocLayNet datasets and this APP in the
|
| 274 |
-
<div><ul><li>- <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li><li>- <a href="" target="_blank">(01/31/2023) Document AI | DocLayNet image viewer APP</a></li></ul></div>
|
| 275 |
""")
|
| 276 |
with gr.Row():
|
| 277 |
with gr.Column():
|
|
@@ -287,7 +285,7 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
|
| 287 |
with gr.Column():
|
| 288 |
output_msg = gr.Textbox(label="Output message")
|
| 289 |
with gr.Column():
|
| 290 |
-
|
| 291 |
with gr.Row():
|
| 292 |
with gr.Column():
|
| 293 |
img_paragraphs_file = gr.File(visible=True, label="Image file (labeled paragraphs)")
|
|
@@ -322,15 +320,15 @@ with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
|
| 322 |
type="pandas",
|
| 323 |
wrap=True
|
| 324 |
)
|
| 325 |
-
btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg,
|
| 326 |
|
| 327 |
gr.Markdown("## Example")
|
| 328 |
gr.Examples(
|
| 329 |
[["small", "all", "all", "all"]],
|
| 330 |
[dataset_name_gr, split_gr, domain_gr, category_gr],
|
| 331 |
-
[output_msg,
|
| 332 |
fn=generate_annotated_image,
|
| 333 |
cache_examples=True,
|
| 334 |
)
|
| 335 |
|
| 336 |
-
demo.launch()
|
|
|
|
| 129 |
example = dict()
|
| 130 |
|
| 131 |
if len(msg_error) > 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
# save image files
|
| 133 |
+
Image.open(images_wo_content).save("wo_content.png")
|
|
|
|
| 134 |
# save csv files
|
| 135 |
+
df_paragraphs_wo_content.to_csv("paragraphs_wo_content.csv", encoding="utf-8", index=False)
|
| 136 |
+
df_lines_wo_content.to_csv("lines_wo_content.csv", encoding="utf-8", index=False)
|
| 137 |
|
| 138 |
+
return msg_error, "wo_content.png", images_wo_content, images_wo_content, "wo_content.png", "wo_content.png", df_paragraphs_wo_content, df_lines_wo_content, gr.File.update(value="paragraphs_wo_content.csv", visible=False), gr.File.update(value="lines_wo_content.csv", visible=False)
|
| 139 |
else:
|
| 140 |
# get random image & PDF data
|
| 141 |
index = random.randint(0, len(example))
|
|
|
|
| 149 |
# resize image to original
|
| 150 |
image = image.resize((original_width, original_height))
|
| 151 |
|
| 152 |
+
# get image of PDF without bounding boxes
|
| 153 |
+
img_file = original_filename.replace(".pdf", ".png")
|
| 154 |
+
image.save(img_file)
|
| 155 |
|
| 156 |
# get corresponding annotations
|
| 157 |
texts = example[index]["texts"]
|
|
|
|
| 228 |
if i == 0:
|
| 229 |
imgs["paragraphs"] = img
|
| 230 |
|
| 231 |
+
# save
|
| 232 |
+
img_paragraphs = "img_paragraphs_" + original_filename.replace(".pdf", ".png")
|
| 233 |
+
img.save(img_paragraphs)
|
| 234 |
|
| 235 |
df_paragraphs["paragraphs"] = list(range(len(sorted_original_bboxes_block_list)))
|
| 236 |
df_paragraphs["categories"] = [id2label[label_idx] for label_idx in sorted_category_block_list]
|
|
|
|
| 238 |
df_paragraphs["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_block_list]
|
| 239 |
|
| 240 |
# save
|
| 241 |
+
csv_paragraphs = "csv_paragraphs_" + original_filename.replace(".pdf", ".csv")
|
| 242 |
+
df_paragraphs.to_csv(csv_paragraphs, encoding="utf-8", index=False)
|
| 243 |
|
| 244 |
else:
|
| 245 |
imgs["lines"] = img
|
| 246 |
|
| 247 |
# save
|
| 248 |
+
img_lines = "img_lines_" + original_filename.replace(".pdf", ".png")
|
| 249 |
+
img.save(img_lines)
|
| 250 |
|
| 251 |
df_lines["lines"] = list(range(len(sorted_original_bboxes_line_list)))
|
| 252 |
df_lines["categories"] = [id2label[label_idx] for label_idx in sorted_category_line_list]
|
|
|
|
| 254 |
df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list]
|
| 255 |
|
| 256 |
# save
|
| 257 |
+
csv_lines = "csv_lines_" + original_filename.replace(".pdf", ".csv")
|
| 258 |
+
df_lines.to_csv(csv_lines, encoding="utf-8", index=False)
|
| 259 |
|
| 260 |
msg = f'The page {page_no} of the PDF "{original_filename}" (domain: "{domain}") matches your settings.'
|
| 261 |
|
| 262 |
+
return msg, img_file, imgs["paragraphs"], imgs["lines"], img_paragraphs, img_lines, df_paragraphs, df_lines, gr.File.update(value=csv_paragraphs, visible=True), gr.File.update(value=csv_lines, visible=True)
|
| 263 |
|
| 264 |
# gradio APP
|
| 265 |
with gr.Blocks(title="DocLayNet image viewer", css=".gradio-container") as demo:
|
|
|
|
| 269 |
<div><p>It uses the datasets <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-small" target="_blank">DocLayNet small</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> (you can also run this APP in Google Colab by running this <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb" target="_blank">notebook</a>).</p></div>
|
| 270 |
<div><p>Make your settings and the output will show 2 images of a randomly selected PDF with labeled bounding boxes, one of paragraphs and the other of lines, and their corresponding tables of texts with their labels.</p></div>
|
| 271 |
<div><p>For example, if you select the domain "laws_and_regulations" and the category "Caption", you will get a random PDF that corresponds to these settings (ie, it will have at least one bounding box labeled with "Caption" in the PDF).</p></div>
|
| 272 |
+
<div style="margin-top: 20px"><p>More information about the DocLayNet datasets and this APP in the following blog post: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">(01/27/2023) Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></div>
|
|
|
|
| 273 |
""")
|
| 274 |
with gr.Row():
|
| 275 |
with gr.Column():
|
|
|
|
| 285 |
with gr.Column():
|
| 286 |
output_msg = gr.Textbox(label="Output message")
|
| 287 |
with gr.Column():
|
| 288 |
+
img_file = gr.File(visible=True, label="Image file of the PDF")
|
| 289 |
with gr.Row():
|
| 290 |
with gr.Column():
|
| 291 |
img_paragraphs_file = gr.File(visible=True, label="Image file (labeled paragraphs)")
|
|
|
|
| 320 |
type="pandas",
|
| 321 |
wrap=True
|
| 322 |
)
|
| 323 |
+
btn.click(generate_annotated_image, inputs=[dataset_name_gr, split_gr, domain_gr, category_gr], outputs=[output_msg, img_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines])
|
| 324 |
|
| 325 |
gr.Markdown("## Example")
|
| 326 |
gr.Examples(
|
| 327 |
[["small", "all", "all", "all"]],
|
| 328 |
[dataset_name_gr, split_gr, domain_gr, category_gr],
|
| 329 |
+
[output_msg, img_file, img_paragraphs, img_lines, img_paragraphs_file, img_lines_file, df_paragraphs, df_lines, csv_paragraphs, csv_lines],
|
| 330 |
fn=generate_annotated_image,
|
| 331 |
cache_examples=True,
|
| 332 |
)
|
| 333 |
|
| 334 |
+
demo.launch()
|