Feat: add context for figure and table (#11547)

### What problem does this PR solve?

Add context for figures and tables.



![demo_figure_table_context](https://github.com/user-attachments/assets/61b37fac-e22e-40a4-9665-9396c7b4103e)


`==================()` is shown for demonstration purposes only.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-11-27 10:21:44 +08:00
committed by GitHub
parent 7c3c185038
commit 9d8b96c1d0
11 changed files with 373 additions and 74 deletions

View File

@ -23,7 +23,7 @@ from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks
tokenize_chunks, attach_media_context
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
@ -175,6 +175,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res