Feat: add image and table context to pipeline splitter (#12167)

### What problem does this PR solve?

Add image and table context to the pipeline splitter.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-12-25 16:26:51 +08:00 · 2025-12-24 16:58:14 +08:00
parent 44671ea413
commit 9b52ba8061
6 changed files with 31 additions and 44 deletions
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -37,7 +37,6 @@ from rag.app.naive import Docx
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.parser.schema import ParserFromUpstream
 from rag.llm.cv_model import Base as VLM
-from rag.nlp import attach_media_context
 from rag.utils.base64_image import image2id


@ -86,8 +85,6 @@ class ParserParam(ProcessParamBase):
                    "pdf",
                ],
                "output_format": "json",
-                "table_context_size": 0,
-                "image_context_size": 0,
            },
            "spreadsheet": {
                "parse_method": "deepdoc",  # deepdoc/tcadp_parser
@ -97,8 +94,6 @@ class ParserParam(ProcessParamBase):
                    "xlsx",
                    "csv",
                ],
-                "table_context_size": 0,
-                "image_context_size": 0,
            },
            "word": {
                "suffix": [
@ -106,14 +101,10 @@ class ParserParam(ProcessParamBase):
                    "docx",
                ],
                "output_format": "json",
-                "table_context_size": 0,
-                "image_context_size": 0,
            },
            "text&markdown": {
                "suffix": ["md", "markdown", "mdx", "txt"],
                "output_format": "json",
-                "table_context_size": 0,
-                "image_context_size": 0,
            },
            "slides": {
                "parse_method": "deepdoc",  # deepdoc/tcadp_parser
@ -122,8 +113,6 @@ class ParserParam(ProcessParamBase):
                    "ppt",
                ],
                "output_format": "json",
-                "table_context_size": 0,
-                "image_context_size": 0,
            },
            "image": {
                "parse_method": "ocr",
@ -357,11 +346,6 @@ class Parser(ProcessBase):
            elif layout == "table":
                b["doc_type_kwd"] = "table"

-        table_ctx = conf.get("table_context_size", 0) or 0
-        image_ctx = conf.get("image_context_size", 0) or 0
-        if table_ctx or image_ctx:
-            bboxes = attach_media_context(bboxes, table_ctx, image_ctx)
-
        if conf.get("output_format") == "json":
            self.set_output("json", bboxes)
        if conf.get("output_format") == "markdown":
@ -436,11 +420,6 @@ class Parser(ProcessBase):
                    if table:
                        result.append({"text": table, "doc_type_kwd": "table"})

-                table_ctx = conf.get("table_context_size", 0) or 0
-                image_ctx = conf.get("image_context_size", 0) or 0
-                if table_ctx or image_ctx:
-                    result = attach_media_context(result, table_ctx, image_ctx)
-
                self.set_output("json", result)

            elif output_format == "markdown":
@ -476,11 +455,6 @@ class Parser(ProcessBase):
            sections = [{"text": section[0], "image": section[1]} for section in sections if section]
            sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])

-            table_ctx = conf.get("table_context_size", 0) or 0
-            image_ctx = conf.get("image_context_size", 0) or 0
-            if table_ctx or image_ctx:
-                sections = attach_media_context(sections, table_ctx, image_ctx)
-
            self.set_output("json", sections)
        elif conf.get("output_format") == "markdown":
            markdown_text = docx_parser.to_markdown(name, binary=blob)
@ -536,11 +510,6 @@ class Parser(ProcessBase):
                    if table:
                        result.append({"text": table, "doc_type_kwd": "table"})

-                table_ctx = conf.get("table_context_size", 0) or 0
-                image_ctx = conf.get("image_context_size", 0) or 0
-                if table_ctx or image_ctx:
-                    result = attach_media_context(result, table_ctx, image_ctx)
-
                self.set_output("json", result)
        else:
            # Default DeepDOC parser (supports .pptx format)
@ -554,10 +523,6 @@ class Parser(ProcessBase):
            # json
            assert conf.get("output_format") == "json", "have to be json for ppt"
            if conf.get("output_format") == "json":
-                table_ctx = conf.get("table_context_size", 0) or 0
-                image_ctx = conf.get("image_context_size", 0) or 0
-                if table_ctx or image_ctx:
-                    sections = attach_media_context(sections, table_ctx, image_ctx)
                self.set_output("json", sections)

    def _markdown(self, name, blob):
@ -597,11 +562,6 @@ class Parser(ProcessBase):

                json_results.append(json_result)

-            table_ctx = conf.get("table_context_size", 0) or 0
-            image_ctx = conf.get("image_context_size", 0) or 0
-            if table_ctx or image_ctx:
-                json_results = attach_media_context(json_results, table_ctx, image_ctx)
-
            self.set_output("json", json_results)
        else:
            self.set_output("text", "\n".join([section_text for section_text, _ in sections]))