mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-31 23:55:06 +08:00
Feat: PDF vision figure parser supports reading context (#12416)
### What problem does this PR solve? PDF vision figure parser supports reading context. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -667,17 +667,42 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
|
||||
return chunks
|
||||
|
||||
|
||||
def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0):
|
||||
def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0, return_context=False):
|
||||
from deepdoc.parser import PdfParser
|
||||
if table_context_size <=0:
|
||||
return tabls
|
||||
return [] if return_context else tabls
|
||||
|
||||
page_bucket = defaultdict(list)
|
||||
for i, (txt, poss) in enumerate(sections):
|
||||
poss = PdfParser.extract_positions(poss)
|
||||
for i, item in enumerate(sections):
|
||||
if isinstance(item, (tuple, list)):
|
||||
if len(item) > 2:
|
||||
txt, _sec_id, poss = item[0], item[1], item[2]
|
||||
else:
|
||||
txt = item[0] if item else ""
|
||||
poss = item[1] if len(item) > 1 else ""
|
||||
else:
|
||||
txt = item
|
||||
poss = ""
|
||||
# Normal: (text, "@@...##") from naive parser -> poss is a position tag string.
|
||||
# Manual: (text, sec_id, poss_list) -> poss is a list of (page, left, right, top, bottom).
|
||||
# Paper: (text_with_@@tag, layoutno) -> poss is layoutno; parse from txt when it contains @@ tags.
|
||||
if isinstance(poss, list):
|
||||
poss = poss
|
||||
elif isinstance(poss, str):
|
||||
if "@@" not in poss and isinstance(txt, str) and "@@" in txt:
|
||||
poss = txt
|
||||
poss = PdfParser.extract_positions(poss)
|
||||
else:
|
||||
if isinstance(txt, str) and "@@" in txt:
|
||||
poss = PdfParser.extract_positions(txt)
|
||||
else:
|
||||
poss = []
|
||||
if isinstance(txt, str) and "@@" in txt:
|
||||
txt = re.sub(r"@@[0-9-]+\t[0-9.\t]+##", "", txt).strip()
|
||||
for page, left, right, top, bottom in poss:
|
||||
page = page[0]
|
||||
page_bucket[page].append(((left, top, right, bottom), txt))
|
||||
if isinstance(page, list):
|
||||
page = page[0] if page else 0
|
||||
page_bucket[page].append(((left, right, top, bottom), txt))
|
||||
|
||||
def upper_context(page, i):
|
||||
txt = ""
|
||||
@ -720,9 +745,10 @@ def append_context2table_image4pdf(sections: list, tabls: list, table_context_si
|
||||
return txt
|
||||
|
||||
res = []
|
||||
contexts = []
|
||||
for (img, tb), poss in tabls:
|
||||
page, left, top, right, bott = poss[0]
|
||||
_page, _left, _top, _right, _bott = poss[-1]
|
||||
page, left, right, top, bott = poss[0]
|
||||
_page, _left, _right, _top, _bott = poss[-1]
|
||||
if isinstance(tb, list):
|
||||
tb = "\n".join(tb)
|
||||
|
||||
@ -736,23 +762,34 @@ def append_context2table_image4pdf(sections: list, tabls: list, table_context_si
|
||||
i = 0
|
||||
blks = page_bucket.get(page, [])
|
||||
continue
|
||||
tb = upper_context(page, i) + tb + lower_context(page+1, 0)
|
||||
upper = upper_context(page, i)
|
||||
lower = lower_context(page + 1, 0)
|
||||
tb = upper + tb + lower
|
||||
contexts.append((upper.strip(), lower.strip()))
|
||||
break
|
||||
(_, t, r, b), txt = blks[i]
|
||||
(_, _, t, b), txt = blks[i]
|
||||
if b > top:
|
||||
break
|
||||
(_, _t, _r, _b), _txt = blks[i+1]
|
||||
(_, _, _t, _b), _txt = blks[i+1]
|
||||
if _t < _bott:
|
||||
i += 1
|
||||
continue
|
||||
|
||||
tb = upper_context(page, i) + tb + lower_context(page, i)
|
||||
upper = upper_context(page, i)
|
||||
lower = lower_context(page, i)
|
||||
tb = upper + tb + lower
|
||||
contexts.append((upper.strip(), lower.strip()))
|
||||
break
|
||||
|
||||
if _tb == tb:
|
||||
tb = upper_context(page, -1) + tb + lower_context(page+1, 0)
|
||||
upper = upper_context(page, -1)
|
||||
lower = lower_context(page + 1, 0)
|
||||
tb = upper + tb + lower
|
||||
contexts.append((upper.strip(), lower.strip()))
|
||||
if len(contexts) < len(res) + 1:
|
||||
contexts.append(("", ""))
|
||||
res.append(((img, tb), poss))
|
||||
return res
|
||||
return contexts if return_context else res
|
||||
|
||||
|
||||
def add_positions(d, poss):
|
||||
|
||||
Reference in New Issue
Block a user