fix position extraction bug (#93)

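* fix position extraction bug: offset reported page numbers by `page_from`, and return `None, None` when no positions are parsed and `need_position` is set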

* remove delimiter for naive parser
This commit is contained in:
KevinHuSh
2024-03-04 17:08:35 +08:00
committed by GitHub
parent fae00827e6
commit 7bfaf0df29
11 changed files with 34 additions and 22 deletions

View File

@ -35,6 +35,7 @@ class HuParser:
self.updown_cnt_mdl.set_param({"device": "cuda"})
self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
filename="updown_concat_xgb.model"))
self.page_from = 0
"""
If you have trouble downloading HuggingFace models, -_^ this might help!!
@ -683,7 +684,7 @@ class HuParser:
"layoutno", "")))
left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
poss.append((pn, left, right, top, bott))
poss.append((pn+self.page_from, left, right, top, bott))
return self.page_images[pn] \
.crop((left * ZM, top * ZM,
right * ZM, bott * ZM))
@ -863,6 +864,7 @@ class HuParser:
self.garbages = {}
self.page_cum_height = [0]
self.page_layout = []
self.page_from = page_from
try:
self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
@ -947,7 +949,9 @@ class HuParser:
left, right, top, bottom = float(left), float(
right), float(top), float(bottom)
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
if not poss: return
if not poss:
if need_position: return None, None
return
max_width = np.max([right-left for (_, left, right, _, _) in poss])
GAP = 6
@ -969,7 +973,8 @@ class HuParser:
bottom, self.page_images[pns[0]].size[1])
))
)
positions.append((pns[0], left, right, top, min(
if 0 < ii < len(poss)-1:
positions.append((pns[0]+self.page_from, left, right, top, min(
bottom, self.page_images[pns[0]].size[1])/ZM))
bottom -= self.page_images[pns[0]].size[1]
for pn in pns[1:]:
@ -980,8 +985,9 @@ class HuParser:
self.page_images[pn].size[1])
))
)
positions.append((pn, left, right, 0, min(
bottom, self.page_images[pn].size[1]) / ZM))
if 0 < ii < len(poss) - 1:
positions.append((pn+self.page_from, left, right, 0, min(
bottom, self.page_images[pn].size[1]) / ZM))
bottom -= self.page_images[pn].size[1]
if not imgs: