Feat: PDF vision figure parser supports reading context (#12416)

### What problem does this PR solve?

The PDF vision figure parser now supports reading context: callers pass the parsed text sections to it alongside the tables/figures.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-01-31 15:45:08 +08:00 · 2026-01-05 09:55:43 +08:00
parent cc8a10376a
commit 4cd4526492
8 changed files with 263 additions and 41 deletions
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@ -314,7 +314,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            tk_cnt = num_tokens_from_string(txt)
            if sec_id > -1:
                last_sid = sec_id
-        tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
+        tbls = vision_figure_parser_pdf_wrapper(
+            tbls=tbls,
+            sections=sections,
+            callback=callback,
+            **kwargs,
+        )
        res = tokenize_table(tbls, doc, eng)
        res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
        table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -55,9 +55,12 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
        callback=callback
    )

-    tables = vision_figure_parser_pdf_wrapper(tbls=tables,
-                                              callback=callback,
-                                              **kwargs)
+    tables = vision_figure_parser_pdf_wrapper(
+        tbls=tables,
+        sections=sections,
+        callback=callback,
+        **kwargs,
+    )
    return sections, tables, pdf_parser


--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@ -166,6 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            pdf_parser = Pdf()
            paper = pdf_parser(filename if not binary else binary,
                               from_page=from_page, to_page=to_page, callback=callback)
+            sections = paper.get("sections", [])
        else:
            kwargs.pop("parse_method", None)
            kwargs.pop("mineru_llm_name", None)
@ -192,7 +193,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            }

        tbls = paper["tables"]
-        tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
+        tbls = vision_figure_parser_pdf_wrapper(
+            tbls=tbls,
+            sections=sections,
+            callback=callback,
+            **kwargs,
+        )
        paper["tables"] = tbls
    else:
        raise NotImplementedError("file type not supported yet(pdf supported)")