Feat: PDF vision figure parser supports reading context (#12416)

### What problem does this PR solve?

PDF vision figure parser supports reading context.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2026-01-05 09:55:43 +08:00
committed by GitHub
parent cc8a10376a
commit 4cd4526492
8 changed files with 263 additions and 41 deletions

View File

@ -667,17 +667,42 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
return chunks
def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0):
def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0, return_context=False):
from deepdoc.parser import PdfParser
if table_context_size <=0:
return tabls
return [] if return_context else tabls
page_bucket = defaultdict(list)
for i, (txt, poss) in enumerate(sections):
poss = PdfParser.extract_positions(poss)
for i, item in enumerate(sections):
if isinstance(item, (tuple, list)):
if len(item) > 2:
txt, _sec_id, poss = item[0], item[1], item[2]
else:
txt = item[0] if item else ""
poss = item[1] if len(item) > 1 else ""
else:
txt = item
poss = ""
# Normal: (text, "@@...##") from naive parser -> poss is a position tag string.
# Manual: (text, sec_id, poss_list) -> poss is a list of (page, left, right, top, bottom).
# Paper: (text_with_@@tag, layoutno) -> poss is layoutno; parse from txt when it contains @@ tags.
if isinstance(poss, list):
poss = poss
elif isinstance(poss, str):
if "@@" not in poss and isinstance(txt, str) and "@@" in txt:
poss = txt
poss = PdfParser.extract_positions(poss)
else:
if isinstance(txt, str) and "@@" in txt:
poss = PdfParser.extract_positions(txt)
else:
poss = []
if isinstance(txt, str) and "@@" in txt:
txt = re.sub(r"@@[0-9-]+\t[0-9.\t]+##", "", txt).strip()
for page, left, right, top, bottom in poss:
page = page[0]
page_bucket[page].append(((left, top, right, bottom), txt))
if isinstance(page, list):
page = page[0] if page else 0
page_bucket[page].append(((left, right, top, bottom), txt))
def upper_context(page, i):
txt = ""
@ -720,9 +745,10 @@ def append_context2table_image4pdf(sections: list, tabls: list, table_context_si
return txt
res = []
contexts = []
for (img, tb), poss in tabls:
page, left, top, right, bott = poss[0]
_page, _left, _top, _right, _bott = poss[-1]
page, left, right, top, bott = poss[0]
_page, _left, _right, _top, _bott = poss[-1]
if isinstance(tb, list):
tb = "\n".join(tb)
@ -736,23 +762,34 @@ def append_context2table_image4pdf(sections: list, tabls: list, table_context_si
i = 0
blks = page_bucket.get(page, [])
continue
tb = upper_context(page, i) + tb + lower_context(page+1, 0)
upper = upper_context(page, i)
lower = lower_context(page + 1, 0)
tb = upper + tb + lower
contexts.append((upper.strip(), lower.strip()))
break
(_, t, r, b), txt = blks[i]
(_, _, t, b), txt = blks[i]
if b > top:
break
(_, _t, _r, _b), _txt = blks[i+1]
(_, _, _t, _b), _txt = blks[i+1]
if _t < _bott:
i += 1
continue
tb = upper_context(page, i) + tb + lower_context(page, i)
upper = upper_context(page, i)
lower = lower_context(page, i)
tb = upper + tb + lower
contexts.append((upper.strip(), lower.strip()))
break
if _tb == tb:
tb = upper_context(page, -1) + tb + lower_context(page+1, 0)
upper = upper_context(page, -1)
lower = lower_context(page + 1, 0)
tb = upper + tb + lower
contexts.append((upper.strip(), lower.strip()))
if len(contexts) < len(res) + 1:
contexts.append(("", ""))
res.append(((img, tb), poss))
return res
return contexts if return_context else res
def add_positions(d, poss):