Feat: add context for figure and table (#11547)

### What problem does this PR solve?

Adds surrounding text context to figure and table chunks, controlled by a new `image_context_size` parser setting.



![demo_figure_table_context](https://github.com/user-attachments/assets/61b37fac-e22e-40a4-9665-9396c7b4103e)


The `==================()` markers in the demo are inserted for demonstration purposes only.
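For a concrete picture of the feature, here is a hypothetical before/after of a single figure chunk once context is attached. The separator lines mirror the demo screenshot; the field name `content_with_weight` and the exact layout are illustrative assumptions, not confirmed by this PR.

```python
# Hypothetical before/after of a figure chunk; the separator lines mirror
# the demo screenshot and are for demonstration purposes only.
chunk_before = {
    "doc_type_kwd": "image",
    "content_with_weight": "Figure 3: quarterly revenue by region.",
}

chunk_after = {
    "doc_type_kwd": "image",
    "content_with_weight": (
        "...text that precedes and introduces the figure...\n"
        "==================()\n"
        "Figure 3: quarterly revenue by region.\n"
        "==================()\n"
        "...text that follows and discusses the figure..."
    ),
}
```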
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Yongteng Lei authored this commit on 2025-11-27 10:21:44 +08:00; committed by GitHub.
parent 7c3c185038 · commit 9d8b96c1d0
11 changed files with 373 additions and 74 deletions


```diff
@@ -20,11 +20,11 @@ import re
 import numpy as np
 from PIL import Image
-from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
-from deepdoc.vision import OCR
-from rag.nlp import rag_tokenizer, tokenize
+from common.constants import LLMType
+from common.string_utils import clean_markdown_block
+from deepdoc.vision import OCR
+from rag.nlp import attach_media_context, rag_tokenizer, tokenize
 ocr = OCR()
```
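`clean_markdown_block` is imported because vision models often wrap their answer in a fenced markdown block. A minimal sketch of such a cleaner, assuming it strips a single outer fence; the real implementation in `common/string_utils` may differ:

```python
import re


def clean_markdown_block(text: str) -> str:
    """Strip one outer ```...``` fence (with optional language tag), if present.

    Sketch only; the real clean_markdown_block lives in common/string_utils
    and may behave differently.
    """
    match = re.match(r"^\s*```[\w-]*\n(.*?)\n?```\s*$", text, re.DOTALL)
    return match.group(1) if match else text.strip()
```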
```diff
@@ -39,9 +39,16 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     }
     eng = lang.lower() == "english"
+    parser_config = kwargs.get("parser_config", {}) or {}
+    image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
     if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
         try:
-            doc.update({"doc_type_kwd": "video"})
+            doc.update(
+                {
+                    "doc_type_kwd": "video",
+                }
+            )
             cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang)
             ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
             callback(0.8, "CV LLM respond: %s ..." % ans[:32])
```
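The new `image_context_size` knob is read defensively. A quick illustration of how that clamping expression normalizes whatever a UI or API might send (the key name comes straight from the diff above):

```python
# Behavior of the clamping expression for a few raw config values:
for raw in (512, "256", None, "", -3, 0):
    parser_config = {"image_context_size": raw}
    image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
    print(repr(raw), "->", image_ctx)
# 512 -> 512, '256' -> 256, None -> 0, '' -> 0, -3 -> 0, 0 -> 0
```

Missing, empty, and negative values all collapse to 0, which disables context attachment.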
```diff
@@ -64,7 +71,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     if (eng and len(txt.split()) > 32) or len(txt) > 32:
         tokenize(doc, txt, eng)
         callback(0.8, "OCR results is too long to use CV LLM.")
-        return [doc]
+        return attach_media_context([doc], 0, image_ctx)
     try:
         callback(0.4, "Use CV LLM to describe the picture.")
```
```diff
@@ -76,7 +83,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
         callback(0.8, "CV LLM respond: %s ..." % ans[:32])
         txt += "\n" + ans
         tokenize(doc, txt, eng)
-        return [doc]
+        return attach_media_context([doc], 0, image_ctx)
     except Exception as e:
         callback(prog=-1, msg=str(e))
```
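Both early returns in `chunk` now route through `attach_media_context`. Judging only from these call sites, a plausible reading of the signature is `(docs, table_ctx, image_ctx)`; that reading is an assumption, not something this diff confirms:

```python
# Assumed reading of the call sites above (not confirmed by this diff):
#   attach_media_context(docs, table_ctx, image_ctx) -> list[dict]
# table_ctx is 0 because a standalone picture carries no table content;
# image_ctx is the clamped "image_context_size" from parser_config.
# With image_ctx == 0 the call is presumably a no-op, so the default
# output matches the old plain `return [doc]`.
docs = attach_media_context([doc], 0, image_ctx)
```

Leaving `image_context_size` at its 0 default would keep the output identical to the pre-PR behavior, consistent with the non-breaking checkbox above.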
```diff
@@ -103,7 +110,7 @@ def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
         img_binary.seek(0)
         img_binary.truncate()
         img.save(img_binary, format="PNG")
         img_binary.seek(0)
-        ans = vision_model.describe_with_prompt(img_binary.read(), prompt)
+        ans = clean_markdown_block(vision_model.describe_with_prompt(img_binary.read(), prompt))
         txt += "\n" + ans
```
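The buffer handling in `vision_llm_chunk` deserves a note: the shared `BytesIO` is rewound and truncated before each image is written, then rewound again after `img.save`, otherwise `read()` would start at end-of-file and hand the vision model zero bytes. A self-contained illustration:

```python
from io import BytesIO

from PIL import Image

buf = BytesIO()
img = Image.new("RGB", (8, 8), "white")

buf.seek(0)     # rewind before reusing the buffer
buf.truncate()  # drop any stale bytes from a previous image
img.save(buf, format="PNG")
buf.seek(0)     # rewind again, or read() starts at EOF and returns b""
png_bytes = buf.read()
assert png_bytes.startswith(b"\x89PNG")
```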