-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprint_examples.py
111 lines (81 loc) · 4.28 KB
/
print_examples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import random
import json
from pdf2image import convert_from_path
from PIL import ImageDraw
from src.paths import PUBLAY, MERGED
from src.const import categories_colors, Categories_names
from src.utils import create_folder
def _tokens_bbox(token_ids, tokens):
    """Return the box enclosing every token whose id appears in ``token_ids``.

    Args:
        token_ids: Collection of token ids taken from a link (``link[2]``).
        tokens: Token records; ``token[0]`` is compared against ``token_ids``
            and ``token[1]`` is read as ``[x0, y0, x1, y1]``.

    Returns:
        ``[min x0, min y0, max x1, max y1]`` over the matching tokens, or
        ``None`` when no token matches (``min``/``max`` would raise
        ``ValueError`` on the empty selection otherwise).
    """
    # Single pass over the tokens instead of one pass per coordinate.
    # Membership is tested against ``token_ids`` as given because the element
    # type of the ids is not visible here (they may not be hashable).
    boxes = [token[1] for token in tokens if token[0] in token_ids]
    if not boxes:
        return None
    return [
        min(box[0] for box in boxes),
        min(box[1] for box in boxes),
        max(box[2] for box in boxes),
        max(box[3] for box in boxes),
    ]


def _save_link_overlay(page_img, tokens, links, category, out_path):
    """Save a copy of ``page_img`` with the links of one category drawn on it.

    Every token box is first blacked out, then each link whose ``link[1]``
    equals ``category`` is drawn as a filled rectangle around its tokens.

    Args:
        page_img: Rendered page (PIL image); it is copied, never modified.
        tokens: Token records of the page.
        links: Link records of the page (``link[1]`` category, ``link[2]``
            token ids).
        category: Category value to select links by.
        out_path: Destination file for the resulting image.
    """
    canvas = page_img.copy()
    draw = ImageDraw.Draw(canvas)
    for token in tokens:
        draw.rectangle(token[1], fill='black')
    for link in links:
        if link[1] != category:
            continue
        bbox = _tokens_bbox(link[2], tokens)
        if bbox is None:
            # The link references no token of this page: nothing to draw.
            continue
        draw.rectangle(bbox, fill=tuple(categories_colors[link[1]]), outline='black', width=4)
    canvas.save(out_path)


def visualize(stats=False, split='test'):
    """Draw the annotations of one randomly chosen PDF into ``visualization/``.

    Writes ``objects.png`` and ``tokens.png``; when the sampled document has
    links it also writes ``grids.png``, ``rows.png`` and ``columns.png``.
    The ``visualization`` folder is not created here and must already exist.

    Args:
        stats: When true, first print the number of pages and tables found in
            the ``test``, ``train`` and ``val`` annotation files.
        split: Name of the annotation file (``<split>.json``) and of the PDF
            sub-folder to sample from. Defaults to ``'test'``, the value that
            used to be hard-coded.
    """
    #? COUNTING DATASET STATISTICS
    if stats:
        for stats_split in ['test', 'train', 'val']:
            with open(MERGED / f'{stats_split}.json', "r") as ann:
                split_annotations = json.load(ann)
            page_objects = split_annotations['objects']
            pages = len(page_objects)
            tables = sum(
                1
                for objs in page_objects.values()
                for obj in objs
                if obj[2] == Categories_names.TABLE.value
            )
            print(f"{stats_split}: # pages {pages} - # tables {tables}")

    #? CHOOSING A RANDOM SAMPLE TO VISUALIZE FROM THE SELECTED SPLIT
    annotations_path = MERGED / f'{split}.json'
    with open(annotations_path, "r") as ann:
        annotations = json.load(ann)
    data = PUBLAY / 'PubLayNet_PDF' / split
    pdf_name = random.choice(list(annotations['objects']))
    pdf_path = data / pdf_name
    print(f'Visualizing {pdf_name}')

    #? EXTRACTING OBJECTS, TOKENS AND LINKS
    objects = annotations['objects'][pdf_name]
    print(objects)

    # Rasterize only the first page, and only once; every overlay below is
    # drawn on its own copy of this image.
    page_img = convert_from_path(pdf_path, first_page=1, last_page=1)[0]

    objects_img = page_img.copy()
    draw = ImageDraw.Draw(objects_img)
    for obj in objects:
        draw.rectangle(obj[1], fill=tuple(categories_colors[obj[2]]), outline='black', width=4)
    objects_img.save('visualization/objects.png')

    tokens = annotations['tokens'][pdf_name]
    tokens_img = page_img.copy()
    draw = ImageDraw.Draw(tokens_img)
    for token in tokens:
        draw.rectangle(token[1], fill=tuple(categories_colors[token[3]]), outline='black', width=4)
    tokens_img.save('visualization/tokens.png')

    links = annotations['links'][pdf_name]

    #? PRINTING EXAMPLES ON VISUALIZATION FOLDER
    if links:
        # GRID CELLS
        _save_link_overlay(page_img, tokens, links,
                           Categories_names.TABLE_GCELL.value, 'visualization/grids.png')
        # ROWS
        _save_link_overlay(page_img, tokens, links,
                           Categories_names.TABLE_ROW.value, 'visualization/rows.png')
        # COLUMNS
        _save_link_overlay(page_img, tokens, links,
                           Categories_names.TABLE_COL.value, 'visualization/columns.png')
# Script entry: the output folder has to exist before visualize() saves into it.
create_folder('visualization')
# Render one random sample; dataset statistics stay disabled (the default).
visualize(stats=False)