fix table desc bugs, add positions to chunks (#91)

This commit is contained in:
KevinHuSh
2024-03-04 14:42:26 +08:00
committed by GitHub
parent 8a726fb04b
commit 685b4d8a95
13 changed files with 117 additions and 46 deletions

View File

@ -83,17 +83,39 @@ def tokenize(d, t, eng):
def tokenize_table(tbls, doc, eng, batch_size=10):
    """Build one chunk dict per table (or per batch of table rows).

    Args:
        tbls: Iterable of ``((img, rows), poss)`` entries. ``rows`` is either
            a single string (markup for the whole table) or a sequence of
            row strings; ``poss`` is forwarded to ``add_positions``.
        doc: Template dict; every produced chunk starts as a deep copy of it,
            so the template itself is never mutated.
        eng: Passed through to ``tokenize``; also selects the separator used
            when joining row strings (``"; "`` when truthy, ``" "`` otherwise).
        batch_size: Maximum number of row strings merged into one chunk.

    Returns:
        A list of chunk dicts, in the order the tables were given.
    """
    res = []
    # add tables
    for (img, rows), poss in tbls:
        # Tables without any content produce no chunk.
        if not rows:
            continue

        if isinstance(rows, str):
            # Whole table given as one string: tokenize it with short
            # angle-bracket tags (at most 12 characters between "<" and ">")
            # stripped, but keep the untouched string as the stored content.
            d = copy.deepcopy(doc)
            r = re.sub(r"<[^<>]{,12}>", "", rows)
            tokenize(d, r, eng)
            d["content_with_weight"] = rows
            d["image"] = img
            add_positions(d, poss)
            res.append(d)
            continue

        # Table given as a sequence of row strings: emit one chunk per
        # batch_size rows, each carrying the same image and positions.
        de = "; " if eng else " "
        for i in range(0, len(rows), batch_size):
            d = copy.deepcopy(doc)
            r = de.join(rows[i:i + batch_size])
            tokenize(d, r, eng)
            d["image"] = img
            add_positions(d, poss)
            res.append(d)
    return res
def add_positions(d, poss):
    """Attach page/position information to chunk dict ``d`` in place.

    Args:
        d: Dict to update.
        poss: Iterable of ``(pn, left, right, top, bottom)`` tuples. When it
            is empty or ``None`` the dict is left untouched.

    After the call ``d`` holds three parallel lists:
        ``page_num_int``  - ``pn + 1`` for every entry,
        ``top_int``       - the ``top`` value of every entry,
        ``position_int``  - ``(pn + 1, left, right, top, bottom)`` tuples.
    """
    if not poss:
        return
    d["page_num_int"] = []
    d["position_int"] = []
    d["top_int"] = []
    for pn, left, right, top, bottom in poss:
        # Stored page numbers are shifted by one relative to the incoming pn.
        d["page_num_int"].append(pn + 1)
        d["top_int"].append(top)
        d["position_int"].append((pn + 1, left, right, top, bottom))
def remove_contents_table(sections, eng=False):
i = 0
while i < len(sections):