Feat: add context for figure and table (#11547)

### What problem does this PR solve? Add context for figure table. ![demo_figure_table_context](https://github.com/user-attachments/assets/61b37fac-e22e-40a4-9665-9396c7b4103e) `==================()` for demonstrating purpose. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-01-03 02:55:29 +08:00 · 2025-11-27 10:21:44 +08:00
parent 7c3c185038
commit 9d8b96c1d0
11 changed files with 373 additions and 74 deletions
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@ -20,7 +20,7 @@ import re

 from common.constants import ParserType
 from io import BytesIO
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
 from common.token_utils import num_tokens_from_string
 from deepdoc.parser import PdfParser, DocxParser
 from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
@ -155,7 +155,7 @@ class Docx(DocxParser):
            sum_question = '\n'.join(question_stack)
            if sum_question:
                ti_list.append((f'{sum_question}\n{last_answer}', last_image))
-                
+
        tbls = []
        for tb in self.doc.tables:
            html= "<table>"
@ -231,14 +231,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            if isinstance(poss, str):
                poss = pdf_parser.extract_positions(poss)
                first = poss[0]          # tuple: ([pn], x1, x2, y1, y2)
-                pn = first[0]           
+                pn = first[0]

                if isinstance(pn, list):
                    pn = pn[0]           # [pn] -> pn
                    poss[0] = (pn, *first[1:])

            return (txt, layoutno, poss)
-        
+

        sections = [_normalize_section(sec) for sec in sections]

@ -247,7 +247,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

        if name in ["tcadp", "docling", "mineru"]:
            parser_config["chunk_token_num"] = 0
-        
+
        callback(0.8, "Finish parsing.")

        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
@ -310,6 +310,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
        res = tokenize_table(tbls, doc, eng)
        res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+        table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+        image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+        if table_ctx or image_ctx:
+            attach_media_context(res, table_ctx, image_ctx)
        return res

    elif re.search(r"\.docx?$", filename, re.IGNORECASE):
@ -325,10 +329,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                d["doc_type_kwd"] = "image"
            tokenize(d, text, eng)
            res.append(d)
+        table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+        image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+        if table_ctx or image_ctx:
+            attach_media_context(res, table_ctx, image_ctx)
        return res
    else:
        raise NotImplementedError("file type not supported yet(pdf and docx supported)")
-    
+

 if __name__ == "__main__":
    import sys