Feat: add context for figure and table (#11547)

### What problem does this PR solve?

Add context for figure table.



![demo_figure_table_context](https://github.com/user-attachments/assets/61b37fac-e22e-40a4-9665-9396c7b4103e)


`==================()` for demonstrating purpose. 
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-11-27 10:21:44 +08:00
committed by GitHub
parent 7c3c185038
commit 9d8b96c1d0
11 changed files with 373 additions and 74 deletions

View File

@ -20,7 +20,7 @@ import re
from common.constants import ParserType
from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
from common.token_utils import num_tokens_from_string
from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
@ -155,7 +155,7 @@ class Docx(DocxParser):
sum_question = '\n'.join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
tbls = []
for tb in self.doc.tables:
html= "<table>"
@ -231,14 +231,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if isinstance(poss, str):
poss = pdf_parser.extract_positions(poss)
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
pn = first[0]
pn = first[0]
if isinstance(pn, list):
pn = pn[0] # [pn] -> pn
poss[0] = (pn, *first[1:])
return (txt, layoutno, poss)
sections = [_normalize_section(sec) for sec in sections]
@ -247,7 +247,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
@ -310,6 +310,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
@ -325,10 +329,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
d["doc_type_kwd"] = "image"
tokenize(d, text, eng)
res.append(d)
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res
else:
raise NotImplementedError("file type not supported yet(pdf and docx supported)")
if __name__ == "__main__":
import sys