Feat: add context for figure and table (#11547)

### What problem does this PR solve? Add context for figure table. ![demo_figure_table_context](https://github.com/user-attachments/assets/61b37fac-e22e-40a4-9665-9396c7b4103e) `==================()` for demonstrating purpose. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2025-12-08 12:32:30 +08:00 · 2025-11-27 10:21:44 +08:00
parent 7c3c185038
commit 9d8b96c1d0
11 changed files with 373 additions and 74 deletions
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -19,16 +19,16 @@ import random
 import re
 from functools import partial

-import trio
 import numpy as np
+import trio
 from PIL import Image

-from common.constants import LLMType
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.llm_service import LLMBundle
+from common import settings
+from common.constants import LLMType
 from common.misc_utils import get_uuid
-from rag.utils.base64_image import image2id
 from deepdoc.parser import ExcelParser
 from deepdoc.parser.mineru_parser import MinerUParser
 from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
@ -37,7 +37,8 @@ from rag.app.naive import Docx
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.parser.schema import ParserFromUpstream
 from rag.llm.cv_model import Base as VLM
-from common import settings
+from rag.nlp import attach_media_context
+from rag.utils.base64_image import image2id


 class ParserParam(ProcessParamBase):
@ -61,15 +62,18 @@ class ParserParam(ProcessParamBase):
                "json",
            ],
            "image": [
-                "text"
+                "text",
+            ],
+            "email": [
+                "text",
+                "json",
            ],
-            "email": ["text", "json"],
            "text&markdown": [
                "text",
-                "json"
+                "json",
            ],
            "audio": [
-                "json"
+                "json",
            ],
            "video": [],
        }
@ -82,6 +86,8 @@ class ParserParam(ProcessParamBase):
                    "pdf",
                ],
                "output_format": "json",
+                "table_context_size": 0,
+                "image_context_size": 0,
            },
            "spreadsheet": {
                "parse_method": "deepdoc",  # deepdoc/tcadp_parser
@ -91,6 +97,8 @@ class ParserParam(ProcessParamBase):
                    "xlsx",
                    "csv",
                ],
+                "table_context_size": 0,
+                "image_context_size": 0,
            },
            "word": {
                "suffix": [
@ -98,18 +106,24 @@ class ParserParam(ProcessParamBase):
                    "docx",
                ],
                "output_format": "json",
+                "table_context_size": 0,
+                "image_context_size": 0,
            },
            "text&markdown": {
                "suffix": ["md", "markdown", "mdx", "txt"],
                "output_format": "json",
+                "table_context_size": 0,
+                "image_context_size": 0,
            },
            "slides": {
                "parse_method": "deepdoc",  # deepdoc/tcadp_parser
                "suffix": [
                    "pptx",
-                    "ppt"
+                    "ppt",
                ],
                "output_format": "json",
+                "table_context_size": 0,
+                "image_context_size": 0,
            },
            "image": {
                "parse_method": "ocr",
@ -121,13 +135,14 @@ class ParserParam(ProcessParamBase):
            },
            "email": {
                "suffix": [
-                  "eml", "msg"
+                    "eml",
+                    "msg",
                ],
                "fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
                "output_format": "json",
            },
            "audio": {
-                "suffix":[
+                "suffix": [
                    "da",
                    "wave",
                    "wav",
@ -142,15 +157,15 @@ class ParserParam(ProcessParamBase):
                    "realaudio",
                    "vqf",
                    "oggvorbis",
-                    "ape"
+                    "ape",
                ],
                "output_format": "text",
            },
            "video": {
-                "suffix":[
+                "suffix": [
                    "mp4",
                    "avi",
-                    "mkv"
+                    "mkv",
                ],
                "output_format": "text",
            },
@ -253,7 +268,7 @@ class Parser(ProcessBase):
            markdown_image_response_type = conf.get("markdown_image_response_type", "1")
            tcadp_parser = TCADPParser(
                table_result_type=table_result_type,
-                markdown_image_response_type=markdown_image_response_type
+                markdown_image_response_type=markdown_image_response_type,
            )
            sections, _ = tcadp_parser.parse_pdf(
                filepath=name,
@ -261,7 +276,7 @@ class Parser(ProcessBase):
                callback=self.callback,
                file_type="PDF",
                file_start_page=1,
-                file_end_page=1000
+                file_end_page=1000,
            )
            bboxes = []
            for section, position_tag in sections:
@ -269,17 +284,20 @@ class Parser(ProcessBase):
                    # Extract position information from TCADP's position tag
                    # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
                    import re
+
                    match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
                    if match:
                        pn, x0, x1, top, bott = match.groups()
-                        bboxes.append({
-                            "page_number": int(pn.split('-')[0]),  # Take the first page number
-                            "x0": float(x0),
-                            "x1": float(x1),
-                            "top": float(top),
-                            "bottom": float(bott),
-                            "text": section
-                        })
+                        bboxes.append(
+                            {
+                                "page_number": int(pn.split("-")[0]),  # Take the first page number
+                                "x0": float(x0),
+                                "x1": float(x1),
+                                "top": float(top),
+                                "bottom": float(bott),
+                                "text": section,
+                            }
+                        )
                    else:
                        # If no position info, add as text without position
                        bboxes.append({"text": section})
@ -291,7 +309,30 @@ class Parser(ProcessBase):
            bboxes = []
            for t, poss in lines:
                for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
-                    bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
+                    bboxes.append(
+                        {
+                            "page_number": int(pn[0]),
+                            "x0": float(x0),
+                            "x1": float(x1),
+                            "top": float(top),
+                            "bottom": float(bott),
+                            "text": t,
+                        }
+                    )
+
+        for b in bboxes:
+            text_val = b.get("text", "")
+            has_text = isinstance(text_val, str) and text_val.strip()
+            layout = b.get("layout_type")
+            if layout == "figure" or (b.get("image") and not has_text):
+                b["doc_type_kwd"] = "image"
+            elif layout == "table":
+                b["doc_type_kwd"] = "table"
+
+        table_ctx = conf.get("table_context_size", 0) or 0
+        image_ctx = conf.get("image_context_size", 0) or 0
+        if table_ctx or image_ctx:
+            bboxes = attach_media_context(bboxes, table_ctx, image_ctx)

        if conf.get("output_format") == "json":
            self.set_output("json", bboxes)
@ -319,7 +360,7 @@ class Parser(ProcessBase):
            markdown_image_response_type = conf.get("markdown_image_response_type", "1")
            tcadp_parser = TCADPParser(
                table_result_type=table_result_type,
-                markdown_image_response_type=markdown_image_response_type
+                markdown_image_response_type=markdown_image_response_type,
            )
            if not tcadp_parser.check_installation():
                raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@ -337,7 +378,7 @@ class Parser(ProcessBase):
                callback=self.callback,
                file_type=file_type,
                file_start_page=1,
-                file_end_page=1000
+                file_end_page=1000,
            )

            # Process TCADP parser output based on configured output_format
@ -365,7 +406,12 @@ class Parser(ProcessBase):
                # Add tables as text
                for table in tables:
                    if table:
-                        result.append({"text": table})
+                        result.append({"text": table, "doc_type_kwd": "table"})
+
+                table_ctx = conf.get("table_context_size", 0) or 0
+                image_ctx = conf.get("image_context_size", 0) or 0
+                if table_ctx or image_ctx:
+                    result = attach_media_context(result, table_ctx, image_ctx)

                self.set_output("json", result)

@ -400,7 +446,13 @@ class Parser(ProcessBase):
        if conf.get("output_format") == "json":
            sections, tbls = docx_parser(name, binary=blob)
            sections = [{"text": section[0], "image": section[1]} for section in sections if section]
-            sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
+            sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
+
+            table_ctx = conf.get("table_context_size", 0) or 0
+            image_ctx = conf.get("image_context_size", 0) or 0
+            if table_ctx or image_ctx:
+                sections = attach_media_context(sections, table_ctx, image_ctx)
+
            self.set_output("json", sections)
        elif conf.get("output_format") == "markdown":
            markdown_text = docx_parser.to_markdown(name, binary=blob)
@ -420,7 +472,7 @@ class Parser(ProcessBase):
            markdown_image_response_type = conf.get("markdown_image_response_type", "1")
            tcadp_parser = TCADPParser(
                table_result_type=table_result_type,
-                markdown_image_response_type=markdown_image_response_type
+                markdown_image_response_type=markdown_image_response_type,
            )
            if not tcadp_parser.check_installation():
                raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@ -439,7 +491,7 @@ class Parser(ProcessBase):
                callback=self.callback,
                file_type=file_type,
                file_start_page=1,
-                file_end_page=1000
+                file_end_page=1000,
            )

            # Process TCADP parser output - PPT only supports json format
@ -454,7 +506,12 @@ class Parser(ProcessBase):
                # Add tables as text
                for table in tables:
                    if table:
-                        result.append({"text": table})
+                        result.append({"text": table, "doc_type_kwd": "table"})
+
+                table_ctx = conf.get("table_context_size", 0) or 0
+                image_ctx = conf.get("image_context_size", 0) or 0
+                if table_ctx or image_ctx:
+                    result = attach_media_context(result, table_ctx, image_ctx)

                self.set_output("json", result)
        else:
@ -469,6 +526,10 @@ class Parser(ProcessBase):
            # json
            assert conf.get("output_format") == "json", "have to be json for ppt"
            if conf.get("output_format") == "json":
+                table_ctx = conf.get("table_context_size", 0) or 0
+                image_ctx = conf.get("image_context_size", 0) or 0
+                if table_ctx or image_ctx:
+                    sections = attach_media_context(sections, table_ctx, image_ctx)
                self.set_output("json", sections)

    def _markdown(self, name, blob):
@ -508,11 +569,15 @@ class Parser(ProcessBase):

                json_results.append(json_result)

+            table_ctx = conf.get("table_context_size", 0) or 0
+            image_ctx = conf.get("image_context_size", 0) or 0
+            if table_ctx or image_ctx:
+                json_results = attach_media_context(json_results, table_ctx, image_ctx)
+
            self.set_output("json", json_results)
        else:
            self.set_output("text", "\n".join([section_text for section_text, _ in sections]))

-
    def _image(self, name, blob):
        from deepdoc.vision import OCR

@ -588,7 +653,7 @@ class Parser(ProcessBase):
            from email.parser import BytesParser

            msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
-            email_content['metadata'] = {}
+            email_content["metadata"] = {}
            # handle header info
            for header, value in msg.items():
                # get fields like from, to, cc, bcc, date, subject
@ -600,6 +665,7 @@ class Parser(ProcessBase):
            # get body
            if "body" in target_fields:
                body_text, body_html = [], []
+
                def _add_content(m, content_type):
                    def _decode_payload(payload, charset, target_list):
                        try:
@ -641,14 +707,17 @@ class Parser(ProcessBase):
                        if dispositions[0].lower() == "attachment":
                            filename = part.get_filename()
                            payload = part.get_payload(decode=True).decode(part.get_content_charset())
-                            attachments.append({
-                                "filename": filename,
-                                "payload": payload,
-                            })
+                            attachments.append(
+                                {
+                                    "filename": filename,
+                                    "payload": payload,
+                                }
+                            )
                email_content["attachments"] = attachments
        else:
            # handle msg file
            import extract_msg
+
            print("handle a msg file.")
            msg = extract_msg.Message(blob)
            # handle header info
@ -662,9 +731,9 @@ class Parser(ProcessBase):
            }
            email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
            # get metadata
-            email_content['metadata'] = {
-                'message_id': msg.messageId,
-                'in_reply_to': msg.inReplyTo,
+            email_content["metadata"] = {
+                "message_id": msg.messageId,
+                "in_reply_to": msg.inReplyTo,
            }
            # get body
            if "body" in target_fields:
@ -675,29 +744,31 @@ class Parser(ProcessBase):
            if "attachments" in target_fields:
                attachments = []
                for t in msg.attachments:
-                    attachments.append({
-                        "filename": t.name,
-                        "payload": t.data.decode("utf-8")
-                    })
+                    attachments.append(
+                        {
+                            "filename": t.name,
+                            "payload": t.data.decode("utf-8"),
+                        }
+                    )
                email_content["attachments"] = attachments

        if conf["output_format"] == "json":
            self.set_output("json", [email_content])
        else:
-            content_txt = ''
+            content_txt = ""
            for k, v in email_content.items():
                if isinstance(v, str):
                    # basic info
-                    content_txt += f'{k}:{v}' + "\n"
+                    content_txt += f"{k}:{v}" + "\n"
                elif isinstance(v, dict):
                    # metadata
-                    content_txt += f'{k}:{json.dumps(v)}' + "\n"
+                    content_txt += f"{k}:{json.dumps(v)}" + "\n"
                elif isinstance(v, list):
                    # attachments or others
                    for fb in v:
                        if isinstance(fb, dict):
                            # attachments
-                            content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n"
+                            content_txt += f"{fb['filename']}:{fb['payload']}" + "\n"
                        else:
                            # str, usually plain text
                            content_txt += fb