Compare commits

...

5 Commits

Author SHA1 Message Date
856201c0f2 Fix ft_title_rag_fine (#11555)
### What problem does this PR solve?

Fix the `ft_title_rag_fine` field name: the `title_sm_tks` field now maps to `docnm@ft_docnm_rag_fine`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-27 10:26:08 +08:00
9d8b96c1d0 Feat: add context for figure and table (#11547)
### What problem does this PR solve?

Add surrounding text context for figure and table chunks.



![demo_figure_table_context](https://github.com/user-attachments/assets/61b37fac-e22e-40a4-9665-9396c7b4103e)


The `==================()` markers in the screenshot are for demonstration purposes only.
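
A minimal sketch of how the new settings surface in `parser_config`, based on the defaults added in this PR (the non-zero values below are illustrative; `0` keeps the previous behaviour of attaching no context):

```python
# Illustrative parser_config enabling the new feature (values are examples only).
parser_config = {
    "chunk_token_num": 512,
    "delimiter": "\n!?。;!?",
    "layout_recognize": "DeepDOC",
    "table_context_size": 128,  # token budget of surrounding text attached to table chunks
    "image_context_size": 64,   # token budget of surrounding text attached to figure/image chunks
}
```
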
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-11-27 10:21:44 +08:00
7c3c185038 Minor style changes (#11554)
### What problem does this PR solve?

### Type of change


- [ ] Documentation Update
2025-11-27 09:42:06 +08:00
a9259917c6 fix(files): replace hard coded status codes with constants (#11544)
### What problem does this PR solve?

Replace the hard-coded HTTP status codes in the file APIs with `RetCode` constants, so that the various error-return paths report consistent, correctly typed status codes.
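
The pattern mirrors the call sites touched below; the wrapper function here is hypothetical and only makes the snippet self-contained:

```python
from api.utils.api_utils import get_json_result
from common.constants import RetCode

def reject_missing_file():
    # Before: return get_json_result(data=False, message='No file part!', code=400)
    # After: a named constant instead of a magic number
    return get_json_result(data=False, message='No file part!', code=RetCode.BAD_REQUEST)
```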

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-27 09:41:24 +08:00
8c28587821 Fix issue where HTML file parsing may lose content. (#11536)
### What problem does this PR solve?

##### Problem Description
When parsing HTML files, some page content may be lost.  
For example, text inside nested `<font>` tags within multiple `<div>`
elements (e.g.,
`<div><font>Text_1</font></div><div><font>Text_2</font></div>`) fails to
be preserved correctly.

###### Root Cause #1: Block ID propagation is interrupted
1. **Block ID generation**: When the parser encounters a `<div>`, it
generates a new `block_id` because `<div>` belongs to `BLOCK_TAGS`.
2. **Recursive processing**: This `block_id` is passed down recursively
to process the `<div>`’s child nodes.
3. **Interruption occurs**: When processing a child `<font>` tag, the
code enters the `else` branch of `read_text_recursively` (since `<font>`
is a Tag).
4. **Bug location**: The first line in this `else` branch explicitly
sets **`block_id = None`**.
- This discards the valid `block_id` inherited from the parent `<div>`.
- Since `<font>` is not in `BLOCK_TAGS`, it does not generate a new
`block_id`, so it passes `None` to its child text nodes.
5. **Consequence**: The extracted text nodes have an empty `block_id` in
their `metadata`. During the subsequent `merge_block_text` step, these
texts cannot be correctly associated with their original `<div>` block
due to the missing ID. As a result, all text from `<font>` tags gets
merged together, which then triggers a second issue during
concatenation.
6. **Solution:** Remove the forced reset of `block_id` to `None`. When the current
tag (e.g., `<font>`) is not a block-level element, it should inherit the
`block_id` passed down from its parent, ensuring consistent ownership across
the hierarchy: `div` → `font` → `text` (see the sketch below).
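
A minimal, self-contained sketch of that ownership rule (a hypothetical simplified walker, not the project's actual `read_text_recursively`; assumes BeautifulSoup is available):

```python
import uuid

from bs4 import BeautifulSoup, NavigableString, Tag

BLOCK_TAGS = {"div", "p", "section"}

def collect(node, block_id, out):
    """Only block-level tags mint a new block_id; inline tags inherit their parent's."""
    if isinstance(node, NavigableString):
        text = str(node).strip()
        if text:
            out.append({"text": text, "block_id": block_id})
    elif isinstance(node, Tag):
        if node.name.lower() in BLOCK_TAGS:
            block_id = str(uuid.uuid1())  # a new block starts here
        # Crucially, there is no "block_id = None" for inline tags like <font>.
        for child in node.children:
            collect(child, block_id, out)

out = []
html = "<div><font>Text_1</font></div><div><font>Text_2</font></div>"
collect(BeautifulSoup(html, "html.parser"), None, out)
# Text_1 and Text_2 now carry two distinct block_ids, so a later merge step
# can keep them attached to their original <div> blocks.
```
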

###### Root Cause #2: Data loss during text concatenation
1. The line `current_content += (" " if current_content else "" +
content)` has a misplaced parenthesis. When `current_content` is
non-empty (`True`):
    - The ternary expression evaluates to `" "` (a single space).
    - The code executes `current_content += " "`.
- **Result**: only a single space is appended; the new `content` string is silently discarded (see the sketch below).
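
A minimal sketch of the precedence issue using plain strings (no project code involved): `+` binds tighter than the conditional expression, so `"" + content` becomes the `else` arm, and the whole parenthesised expression collapses to a single space whenever `current_content` is truthy.

```python
current_content = "already collected text"
content = "new text"

# Buggy: the closing parenthesis sits too late, so this parses as
#   (" ") if current_content else ("" + content)
buggy = current_content + (" " if current_content else "" + content)
assert buggy == "already collected text "          # new content silently dropped

# Fixed: close the conditional before concatenating
fixed = current_content + (" " if current_content else "") + content
assert fixed == "already collected text new text"
```
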

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-27 09:40:10 +08:00
21 changed files with 469 additions and 168 deletions

View File

@ -31,7 +31,7 @@ from api.db.services.file_service import FileService
from api.utils.api_utils import get_json_result from api.utils.api_utils import get_json_result
from api.utils.file_utils import filename_type from api.utils.file_utils import filename_type
from common import settings from common import settings
from common.constants import RetCode
@manager.route('/file/upload', methods=['POST']) # noqa: F821 @manager.route('/file/upload', methods=['POST']) # noqa: F821
@token_required @token_required
@ -86,19 +86,19 @@ async def upload(tenant_id):
pf_id = root_folder["id"] pf_id = root_folder["id"]
if 'file' not in files: if 'file' not in files:
return get_json_result(data=False, message='No file part!', code=400) return get_json_result(data=False, message='No file part!', code=RetCode.BAD_REQUEST)
file_objs = files.getlist('file') file_objs = files.getlist('file')
for file_obj in file_objs: for file_obj in file_objs:
if file_obj.filename == '': if file_obj.filename == '':
return get_json_result(data=False, message='No selected file!', code=400) return get_json_result(data=False, message='No selected file!', code=RetCode.BAD_REQUEST)
file_res = [] file_res = []
try: try:
e, pf_folder = FileService.get_by_id(pf_id) e, pf_folder = FileService.get_by_id(pf_id)
if not e: if not e:
return get_json_result(data=False, message="Can't find this folder!", code=404) return get_json_result(data=False, message="Can't find this folder!", code=RetCode.NOT_FOUND)
for file_obj in file_objs: for file_obj in file_objs:
# Handle file path # Handle file path
@ -114,13 +114,13 @@ async def upload(tenant_id):
if file_len != len_id_list: if file_len != len_id_list:
e, file = FileService.get_by_id(file_id_list[len_id_list - 1]) e, file = FileService.get_by_id(file_id_list[len_id_list - 1])
if not e: if not e:
return get_json_result(data=False, message="Folder not found!", code=404) return get_json_result(data=False, message="Folder not found!", code=RetCode.NOT_FOUND)
last_folder = FileService.create_folder(file, file_id_list[len_id_list - 1], file_obj_names, last_folder = FileService.create_folder(file, file_id_list[len_id_list - 1], file_obj_names,
len_id_list) len_id_list)
else: else:
e, file = FileService.get_by_id(file_id_list[len_id_list - 2]) e, file = FileService.get_by_id(file_id_list[len_id_list - 2])
if not e: if not e:
return get_json_result(data=False, message="Folder not found!", code=404) return get_json_result(data=False, message="Folder not found!", code=RetCode.NOT_FOUND)
last_folder = FileService.create_folder(file, file_id_list[len_id_list - 2], file_obj_names, last_folder = FileService.create_folder(file, file_id_list[len_id_list - 2], file_obj_names,
len_id_list) len_id_list)
@ -202,7 +202,7 @@ async def create(tenant_id):
try: try:
if not FileService.is_parent_folder_exist(pf_id): if not FileService.is_parent_folder_exist(pf_id):
return get_json_result(data=False, message="Parent Folder Doesn't Exist!", code=400) return get_json_result(data=False, message="Parent Folder Doesn't Exist!", code=RetCode.BAD_REQUEST)
if FileService.query(name=req["name"], parent_id=pf_id): if FileService.query(name=req["name"], parent_id=pf_id):
return get_json_result(data=False, message="Duplicated folder name in the same folder.", code=409) return get_json_result(data=False, message="Duplicated folder name in the same folder.", code=409)
@ -306,13 +306,13 @@ def list_files(tenant_id):
try: try:
e, file = FileService.get_by_id(pf_id) e, file = FileService.get_by_id(pf_id)
if not e: if not e:
return get_json_result(message="Folder not found!", code=404) return get_json_result(message="Folder not found!", code=RetCode.NOT_FOUND)
files, total = FileService.get_by_pf_id(tenant_id, pf_id, page_number, items_per_page, orderby, desc, keywords) files, total = FileService.get_by_pf_id(tenant_id, pf_id, page_number, items_per_page, orderby, desc, keywords)
parent_folder = FileService.get_parent_folder(pf_id) parent_folder = FileService.get_parent_folder(pf_id)
if not parent_folder: if not parent_folder:
return get_json_result(message="File not found!", code=404) return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
return get_json_result(data={"total": total, "files": files, "parent_folder": parent_folder.to_json()}) return get_json_result(data={"total": total, "files": files, "parent_folder": parent_folder.to_json()})
except Exception as e: except Exception as e:
@ -392,7 +392,7 @@ def get_parent_folder():
try: try:
e, file = FileService.get_by_id(file_id) e, file = FileService.get_by_id(file_id)
if not e: if not e:
return get_json_result(message="Folder not found!", code=404) return get_json_result(message="Folder not found!", code=RetCode.NOT_FOUND)
parent_folder = FileService.get_parent_folder(file_id) parent_folder = FileService.get_parent_folder(file_id)
return get_json_result(data={"parent_folder": parent_folder.to_json()}) return get_json_result(data={"parent_folder": parent_folder.to_json()})
@ -439,7 +439,7 @@ def get_all_parent_folders(tenant_id):
try: try:
e, file = FileService.get_by_id(file_id) e, file = FileService.get_by_id(file_id)
if not e: if not e:
return get_json_result(message="Folder not found!", code=404) return get_json_result(message="Folder not found!", code=RetCode.NOT_FOUND)
parent_folders = FileService.get_all_parent_folders(file_id) parent_folders = FileService.get_all_parent_folders(file_id)
parent_folders_res = [folder.to_json() for folder in parent_folders] parent_folders_res = [folder.to_json() for folder in parent_folders]
@ -487,34 +487,34 @@ async def rm(tenant_id):
for file_id in file_ids: for file_id in file_ids:
e, file = FileService.get_by_id(file_id) e, file = FileService.get_by_id(file_id)
if not e: if not e:
return get_json_result(message="File or Folder not found!", code=404) return get_json_result(message="File or Folder not found!", code=RetCode.NOT_FOUND)
if not file.tenant_id: if not file.tenant_id:
return get_json_result(message="Tenant not found!", code=404) return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
if file.type == FileType.FOLDER.value: if file.type == FileType.FOLDER.value:
file_id_list = FileService.get_all_innermost_file_ids(file_id, []) file_id_list = FileService.get_all_innermost_file_ids(file_id, [])
for inner_file_id in file_id_list: for inner_file_id in file_id_list:
e, file = FileService.get_by_id(inner_file_id) e, file = FileService.get_by_id(inner_file_id)
if not e: if not e:
return get_json_result(message="File not found!", code=404) return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
settings.STORAGE_IMPL.rm(file.parent_id, file.location) settings.STORAGE_IMPL.rm(file.parent_id, file.location)
FileService.delete_folder_by_pf_id(tenant_id, file_id) FileService.delete_folder_by_pf_id(tenant_id, file_id)
else: else:
settings.STORAGE_IMPL.rm(file.parent_id, file.location) settings.STORAGE_IMPL.rm(file.parent_id, file.location)
if not FileService.delete(file): if not FileService.delete(file):
return get_json_result(message="Database error (File removal)!", code=500) return get_json_result(message="Database error (File removal)!", code=RetCode.SERVER_ERROR)
informs = File2DocumentService.get_by_file_id(file_id) informs = File2DocumentService.get_by_file_id(file_id)
for inform in informs: for inform in informs:
doc_id = inform.document_id doc_id = inform.document_id
e, doc = DocumentService.get_by_id(doc_id) e, doc = DocumentService.get_by_id(doc_id)
if not e: if not e:
return get_json_result(message="Document not found!", code=404) return get_json_result(message="Document not found!", code=RetCode.NOT_FOUND)
tenant_id = DocumentService.get_tenant_id(doc_id) tenant_id = DocumentService.get_tenant_id(doc_id)
if not tenant_id: if not tenant_id:
return get_json_result(message="Tenant not found!", code=404) return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
if not DocumentService.remove_document(doc, tenant_id): if not DocumentService.remove_document(doc, tenant_id):
return get_json_result(message="Database error (Document removal)!", code=500) return get_json_result(message="Database error (Document removal)!", code=RetCode.SERVER_ERROR)
File2DocumentService.delete_by_file_id(file_id) File2DocumentService.delete_by_file_id(file_id)
return get_json_result(data=True) return get_json_result(data=True)
@ -560,23 +560,23 @@ async def rename(tenant_id):
try: try:
e, file = FileService.get_by_id(req["file_id"]) e, file = FileService.get_by_id(req["file_id"])
if not e: if not e:
return get_json_result(message="File not found!", code=404) return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
if file.type != FileType.FOLDER.value and pathlib.Path(req["name"].lower()).suffix != pathlib.Path( if file.type != FileType.FOLDER.value and pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
file.name.lower()).suffix: file.name.lower()).suffix:
return get_json_result(data=False, message="The extension of file can't be changed", code=400) return get_json_result(data=False, message="The extension of file can't be changed", code=RetCode.BAD_REQUEST)
for existing_file in FileService.query(name=req["name"], pf_id=file.parent_id): for existing_file in FileService.query(name=req["name"], pf_id=file.parent_id):
if existing_file.name == req["name"]: if existing_file.name == req["name"]:
return get_json_result(data=False, message="Duplicated file name in the same folder.", code=409) return get_json_result(data=False, message="Duplicated file name in the same folder.", code=409)
if not FileService.update_by_id(req["file_id"], {"name": req["name"]}): if not FileService.update_by_id(req["file_id"], {"name": req["name"]}):
return get_json_result(message="Database error (File rename)!", code=500) return get_json_result(message="Database error (File rename)!", code=RetCode.SERVER_ERROR)
informs = File2DocumentService.get_by_file_id(req["file_id"]) informs = File2DocumentService.get_by_file_id(req["file_id"])
if informs: if informs:
if not DocumentService.update_by_id(informs[0].document_id, {"name": req["name"]}): if not DocumentService.update_by_id(informs[0].document_id, {"name": req["name"]}):
return get_json_result(message="Database error (Document rename)!", code=500) return get_json_result(message="Database error (Document rename)!", code=RetCode.SERVER_ERROR)
return get_json_result(data=True) return get_json_result(data=True)
except Exception as e: except Exception as e:
@ -606,13 +606,13 @@ async def get(tenant_id, file_id):
description: File stream description: File stream
schema: schema:
type: file type: file
404: RetCode.NOT_FOUND:
description: File not found description: File not found
""" """
try: try:
e, file = FileService.get_by_id(file_id) e, file = FileService.get_by_id(file_id)
if not e: if not e:
return get_json_result(message="Document not found!", code=404) return get_json_result(message="Document not found!", code=RetCode.NOT_FOUND)
blob = settings.STORAGE_IMPL.get(file.parent_id, file.location) blob = settings.STORAGE_IMPL.get(file.parent_id, file.location)
if not blob: if not blob:
@ -677,13 +677,13 @@ async def move(tenant_id):
for file_id in file_ids: for file_id in file_ids:
file = files_dict[file_id] file = files_dict[file_id]
if not file: if not file:
return get_json_result(message="File or Folder not found!", code=404) return get_json_result(message="File or Folder not found!", code=RetCode.NOT_FOUND)
if not file.tenant_id: if not file.tenant_id:
return get_json_result(message="Tenant not found!", code=404) return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
fe, _ = FileService.get_by_id(parent_id) fe, _ = FileService.get_by_id(parent_id)
if not fe: if not fe:
return get_json_result(message="Parent Folder not found!", code=404) return get_json_result(message="Parent Folder not found!", code=RetCode.NOT_FOUND)
FileService.move_file(file_ids, parent_id) FileService.move_file(file_ids, parent_id)
return get_json_result(data=True) return get_json_result(data=True)
@ -705,7 +705,7 @@ async def convert(tenant_id):
for file_id in file_ids: for file_id in file_ids:
file = files_set[file_id] file = files_set[file_id]
if not file: if not file:
return get_json_result(message="File not found!", code=404) return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
file_ids_list = [file_id] file_ids_list = [file_id]
if file.type == FileType.FOLDER.value: if file.type == FileType.FOLDER.value:
file_ids_list = FileService.get_all_innermost_file_ids(file_id, []) file_ids_list = FileService.get_all_innermost_file_ids(file_id, [])
@ -716,13 +716,13 @@ async def convert(tenant_id):
doc_id = inform.document_id doc_id = inform.document_id
e, doc = DocumentService.get_by_id(doc_id) e, doc = DocumentService.get_by_id(doc_id)
if not e: if not e:
return get_json_result(message="Document not found!", code=404) return get_json_result(message="Document not found!", code=RetCode.NOT_FOUND)
tenant_id = DocumentService.get_tenant_id(doc_id) tenant_id = DocumentService.get_tenant_id(doc_id)
if not tenant_id: if not tenant_id:
return get_json_result(message="Tenant not found!", code=404) return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
if not DocumentService.remove_document(doc, tenant_id): if not DocumentService.remove_document(doc, tenant_id):
return get_json_result( return get_json_result(
message="Database error (Document removal)!", code=404) message="Database error (Document removal)!", code=RetCode.NOT_FOUND)
File2DocumentService.delete_by_file_id(id) File2DocumentService.delete_by_file_id(id)
# insert # insert
@ -730,11 +730,11 @@ async def convert(tenant_id):
e, kb = KnowledgebaseService.get_by_id(kb_id) e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e: if not e:
return get_json_result( return get_json_result(
message="Can't find this knowledgebase!", code=404) message="Can't find this knowledgebase!", code=RetCode.NOT_FOUND)
e, file = FileService.get_by_id(id) e, file = FileService.get_by_id(id)
if not e: if not e:
return get_json_result( return get_json_result(
message="Can't find this file!", code=404) message="Can't find this file!", code=RetCode.NOT_FOUND)
doc = DocumentService.insert({ doc = DocumentService.insert({
"id": get_uuid(), "id": get_uuid(),

View File

@ -749,7 +749,7 @@ class Knowledgebase(DataBaseModel):
parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True) parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True)
pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True) pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]}) parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
pagerank = IntegerField(default=0, index=False) pagerank = IntegerField(default=0, index=False)
graphrag_task_id = CharField(max_length=32, null=True, help_text="Graph RAG task ID", index=True) graphrag_task_id = CharField(max_length=32, null=True, help_text="Graph RAG task ID", index=True)
@ -774,7 +774,7 @@ class Document(DataBaseModel):
kb_id = CharField(max_length=256, null=False, index=True) kb_id = CharField(max_length=256, null=False, index=True)
parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True) parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True)
pipeline_id = CharField(max_length=32, null=True, help_text="pipeline ID", index=True) pipeline_id = CharField(max_length=32, null=True, help_text="pipeline ID", index=True)
parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]}) parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True) source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True)
type = CharField(max_length=32, null=False, help_text="file extension", index=True) type = CharField(max_length=32, null=False, help_text="file extension", index=True)
created_by = CharField(max_length=32, null=False, help_text="who created it", index=True) created_by = CharField(max_length=32, null=False, help_text="who created it", index=True)

View File

@ -923,7 +923,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
ParserType.AUDIO.value: audio, ParserType.AUDIO.value: audio,
ParserType.EMAIL.value: email ParserType.EMAIL.value: email
} }
parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text"} parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text", "table_context_size": 0, "image_context_size": 0}
exe = ThreadPoolExecutor(max_workers=12) exe = ThreadPoolExecutor(max_workers=12)
threads = [] threads = []
doc_nm = {} doc_nm = {}

View File

@ -313,6 +313,10 @@ def get_parser_config(chunk_method, parser_config):
chunk_method = "naive" chunk_method = "naive"
# Define default configurations for each chunking method # Define default configurations for each chunking method
base_defaults = {
"table_context_size": 0,
"image_context_size": 0,
}
key_mapping = { key_mapping = {
"naive": { "naive": {
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
@ -365,16 +369,19 @@ def get_parser_config(chunk_method, parser_config):
default_config = key_mapping[chunk_method] default_config = key_mapping[chunk_method]
# If no parser_config provided, return default # If no parser_config provided, return default merged with base defaults
if not parser_config: if not parser_config:
return default_config if default_config is None:
return deep_merge(base_defaults, {})
return deep_merge(base_defaults, default_config)
# If parser_config is provided, merge with defaults to ensure required fields exist # If parser_config is provided, merge with defaults to ensure required fields exist
if default_config is None: if default_config is None:
return parser_config return deep_merge(base_defaults, parser_config)
# Ensure raptor and graphrag fields have default values if not provided # Ensure raptor and graphrag fields have default values if not provided
merged_config = deep_merge(default_config, parser_config) merged_config = deep_merge(base_defaults, default_config)
merged_config = deep_merge(merged_config, parser_config)
return merged_config return merged_config
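
For readers following the hunk above, the intended precedence is: `base_defaults` < per-chunk-method defaults < caller-supplied `parser_config`. A sketch with a stand-in `deep_merge` (the project's real helper is imported elsewhere; this one only illustrates the override order, and the default values are illustrative):

```python
def deep_merge(base: dict, override: dict) -> dict:
    """Return a copy of base with override applied recursively (stand-in helper)."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

base_defaults = {"table_context_size": 0, "image_context_size": 0}
naive_defaults = {"layout_recognize": "DeepDOC", "chunk_token_num": 512}
user_config = {"chunk_token_num": 1024, "table_context_size": 128}

merged = deep_merge(deep_merge(base_defaults, naive_defaults), user_config)
# {'table_context_size': 128, 'image_context_size': 0,
#  'layout_recognize': 'DeepDOC', 'chunk_token_num': 1024}
```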

View File

@ -49,6 +49,7 @@ class RetCode(IntEnum, CustomEnum):
RUNNING = 106 RUNNING = 106
PERMISSION_ERROR = 108 PERMISSION_ERROR = 108
AUTHENTICATION_ERROR = 109 AUTHENTICATION_ERROR = 109
BAD_REQUEST = 400
UNAUTHORIZED = 401 UNAUTHORIZED = 401
SERVER_ERROR = 500 SERVER_ERROR = 500
FORBIDDEN = 403 FORBIDDEN = 403

View File

@ -138,7 +138,6 @@ class RAGFlowHtmlParser:
"metadata": {"table_id": table_id, "index": table_list.index(t)}}) "metadata": {"table_id": table_id, "index": table_list.index(t)}})
return table_info_list return table_info_list
else: else:
block_id = None
if str.lower(element.name) in BLOCK_TAGS: if str.lower(element.name) in BLOCK_TAGS:
block_id = str(uuid.uuid1()) block_id = str(uuid.uuid1())
for child in element.children: for child in element.children:
@ -172,7 +171,7 @@ class RAGFlowHtmlParser:
if tag_name == "table": if tag_name == "table":
table_info_list.append(item) table_info_list.append(item)
else: else:
current_content += (" " if current_content else "" + content) current_content += (" " if current_content else "") + content
if current_content: if current_content:
block_content.append(current_content) block_content.append(current_content)
return block_content, table_info_list return block_content, table_info_list

View File

@ -39,8 +39,10 @@ If you have not installed Docker on your local machine (Windows, Mac, or Linux),
This section provides instructions on setting up the RAGFlow server on Linux. If you are on a different operating system, no worries. Most steps are alike. This section provides instructions on setting up the RAGFlow server on Linux. If you are on a different operating system, no worries. Most steps are alike.
1. Ensure `vm.max_map_count` &ge; 262144.
<details> <details>
<summary>1. Ensure <code>vm.max_map_count</code> &ge; 262144:</summary> <summary>Expand to show details:</summary>
`vm.max_map_count`. This value sets the maximum number of memory map areas a process may have. Its default value is 65530. While most applications require fewer than a thousand maps, reducing this value can result in abnormal behaviors, and the system will throw out-of-memory errors when a process reaches the limitation. `vm.max_map_count`. This value sets the maximum number of memory map areas a process may have. Its default value is 65530. While most applications require fewer than a thousand maps, reducing this value can result in abnormal behaviors, and the system will throw out-of-memory errors when a process reaches the limitation.

View File

@ -23,7 +23,7 @@ from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS from rag.app.naive import by_plaintext, PARSERS
from rag.nlp import bullets_category, is_english,remove_contents_table, \ from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks tokenize_chunks, attach_media_context
from rag.nlp import rag_tokenizer from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, HtmlParser from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
@ -175,6 +175,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = tokenize_table(tbls, doc, eng) res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res return res

View File

@ -20,7 +20,7 @@ import re
from common.constants import ParserType from common.constants import ParserType
from io import BytesIO from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
from common.token_utils import num_tokens_from_string from common.token_utils import num_tokens_from_string
from deepdoc.parser import PdfParser, DocxParser from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
@ -310,6 +310,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
res = tokenize_table(tbls, doc, eng) res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res return res
elif re.search(r"\.docx?$", filename, re.IGNORECASE): elif re.search(r"\.docx?$", filename, re.IGNORECASE):
@ -325,6 +329,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
d["doc_type_kwd"] = "image" d["doc_type_kwd"] = "image"
tokenize(d, text, eng) tokenize(d, text, eng)
res.append(d) res.append(d)
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res return res
else: else:
raise NotImplementedError("file type not supported yet(pdf and docx supported)") raise NotImplementedError("file type not supported yet(pdf and docx supported)")

View File

@ -37,7 +37,7 @@ from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
callback = callback callback = callback
@ -616,6 +616,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
parser_config = kwargs.get( parser_config = kwargs.get(
"parser_config", { "parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True}) "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
final_sections = False final_sections = False
doc = { doc = {
"docnm_kwd": filename, "docnm_kwd": filename,
@ -686,6 +688,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
logging.info("naive_merge({}): {}".format(filename, timer() - st)) logging.info("naive_merge({}): {}".format(filename, timer() - st))
res.extend(embed_res) res.extend(embed_res)
res.extend(url_res) res.extend(url_res)
if table_context_size or image_context_size:
attach_media_context(res, table_context_size, image_context_size)
return res return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE): elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@ -947,6 +951,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.extend(embed_res) res.extend(embed_res)
if url_res: if url_res:
res.extend(url_res) res.extend(url_res)
if table_context_size or image_context_size:
attach_media_context(res, table_context_size, image_context_size)
return res return res

View File

@ -20,7 +20,7 @@ import re
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
from common.constants import ParserType from common.constants import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, attach_media_context
from deepdoc.parser import PdfParser from deepdoc.parser import PdfParser
import numpy as np import numpy as np
from rag.app.naive import by_plaintext, PARSERS from rag.app.naive import by_plaintext, PARSERS
@ -234,6 +234,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
chunks.append(txt) chunks.append(txt)
last_sid = sec_id last_sid = sec_id
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res return res

View File

@ -20,11 +20,11 @@ import re
import numpy as np import numpy as np
from PIL import Image from PIL import Image
from common.constants import LLMType
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from deepdoc.vision import OCR from common.constants import LLMType
from rag.nlp import rag_tokenizer, tokenize
from common.string_utils import clean_markdown_block from common.string_utils import clean_markdown_block
from deepdoc.vision import OCR
from rag.nlp import attach_media_context, rag_tokenizer, tokenize
ocr = OCR() ocr = OCR()
@ -39,9 +39,16 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
} }
eng = lang.lower() == "english" eng = lang.lower() == "english"
parser_config = kwargs.get("parser_config", {}) or {}
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS): if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
try: try:
doc.update({"doc_type_kwd": "video"}) doc.update(
{
"doc_type_kwd": "video",
}
)
cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang) cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang)
ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename) ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
callback(0.8, "CV LLM respond: %s ..." % ans[:32]) callback(0.8, "CV LLM respond: %s ..." % ans[:32])
@ -64,7 +71,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
if (eng and len(txt.split()) > 32) or len(txt) > 32: if (eng and len(txt.split()) > 32) or len(txt) > 32:
tokenize(doc, txt, eng) tokenize(doc, txt, eng)
callback(0.8, "OCR results is too long to use CV LLM.") callback(0.8, "OCR results is too long to use CV LLM.")
return [doc] return attach_media_context([doc], 0, image_ctx)
try: try:
callback(0.4, "Use CV LLM to describe the picture.") callback(0.4, "Use CV LLM to describe the picture.")
@ -76,7 +83,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
callback(0.8, "CV LLM respond: %s ..." % ans[:32]) callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans txt += "\n" + ans
tokenize(doc, txt, eng) tokenize(doc, txt, eng)
return [doc] return attach_media_context([doc], 0, image_ctx)
except Exception as e: except Exception as e:
callback(prog=-1, msg=str(e)) callback(prog=-1, msg=str(e))

View File

@ -19,16 +19,16 @@ import random
import re import re
from functools import partial from functools import partial
import trio
import numpy as np import numpy as np
import trio
from PIL import Image from PIL import Image
from common.constants import LLMType
from api.db.services.file2document_service import File2DocumentService from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService from api.db.services.file_service import FileService
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from common import settings
from common.constants import LLMType
from common.misc_utils import get_uuid from common.misc_utils import get_uuid
from rag.utils.base64_image import image2id
from deepdoc.parser import ExcelParser from deepdoc.parser import ExcelParser
from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
@ -37,7 +37,8 @@ from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream from rag.flow.parser.schema import ParserFromUpstream
from rag.llm.cv_model import Base as VLM from rag.llm.cv_model import Base as VLM
from common import settings from rag.nlp import attach_media_context
from rag.utils.base64_image import image2id
class ParserParam(ProcessParamBase): class ParserParam(ProcessParamBase):
@ -61,15 +62,18 @@ class ParserParam(ProcessParamBase):
"json", "json",
], ],
"image": [ "image": [
"text" "text",
],
"email": [
"text",
"json",
], ],
"email": ["text", "json"],
"text&markdown": [ "text&markdown": [
"text", "text",
"json" "json",
], ],
"audio": [ "audio": [
"json" "json",
], ],
"video": [], "video": [],
} }
@ -82,6 +86,8 @@ class ParserParam(ProcessParamBase):
"pdf", "pdf",
], ],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"spreadsheet": { "spreadsheet": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser "parse_method": "deepdoc", # deepdoc/tcadp_parser
@ -91,6 +97,8 @@ class ParserParam(ProcessParamBase):
"xlsx", "xlsx",
"csv", "csv",
], ],
"table_context_size": 0,
"image_context_size": 0,
}, },
"word": { "word": {
"suffix": [ "suffix": [
@ -98,18 +106,24 @@ class ParserParam(ProcessParamBase):
"docx", "docx",
], ],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"text&markdown": { "text&markdown": {
"suffix": ["md", "markdown", "mdx", "txt"], "suffix": ["md", "markdown", "mdx", "txt"],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"slides": { "slides": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser "parse_method": "deepdoc", # deepdoc/tcadp_parser
"suffix": [ "suffix": [
"pptx", "pptx",
"ppt" "ppt",
], ],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"image": { "image": {
"parse_method": "ocr", "parse_method": "ocr",
@ -121,7 +135,8 @@ class ParserParam(ProcessParamBase):
}, },
"email": { "email": {
"suffix": [ "suffix": [
"eml", "msg" "eml",
"msg",
], ],
"fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"], "fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
"output_format": "json", "output_format": "json",
@ -142,7 +157,7 @@ class ParserParam(ProcessParamBase):
"realaudio", "realaudio",
"vqf", "vqf",
"oggvorbis", "oggvorbis",
"ape" "ape",
], ],
"output_format": "text", "output_format": "text",
}, },
@ -150,7 +165,7 @@ class ParserParam(ProcessParamBase):
"suffix": [ "suffix": [
"mp4", "mp4",
"avi", "avi",
"mkv" "mkv",
], ],
"output_format": "text", "output_format": "text",
}, },
@ -253,7 +268,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1") markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser( tcadp_parser = TCADPParser(
table_result_type=table_result_type, table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type markdown_image_response_type=markdown_image_response_type,
) )
sections, _ = tcadp_parser.parse_pdf( sections, _ = tcadp_parser.parse_pdf(
filepath=name, filepath=name,
@ -261,7 +276,7 @@ class Parser(ProcessBase):
callback=self.callback, callback=self.callback,
file_type="PDF", file_type="PDF",
file_start_page=1, file_start_page=1,
file_end_page=1000 file_end_page=1000,
) )
bboxes = [] bboxes = []
for section, position_tag in sections: for section, position_tag in sections:
@ -269,17 +284,20 @@ class Parser(ProcessBase):
# Extract position information from TCADP's position tag # Extract position information from TCADP's position tag
# Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}## # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
import re import re
match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag) match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
if match: if match:
pn, x0, x1, top, bott = match.groups() pn, x0, x1, top, bott = match.groups()
bboxes.append({ bboxes.append(
"page_number": int(pn.split('-')[0]), # Take the first page number {
"page_number": int(pn.split("-")[0]), # Take the first page number
"x0": float(x0), "x0": float(x0),
"x1": float(x1), "x1": float(x1),
"top": float(top), "top": float(top),
"bottom": float(bott), "bottom": float(bott),
"text": section "text": section,
}) }
)
else: else:
# If no position info, add as text without position # If no position info, add as text without position
bboxes.append({"text": section}) bboxes.append({"text": section})
@ -291,7 +309,30 @@ class Parser(ProcessBase):
bboxes = [] bboxes = []
for t, poss in lines: for t, poss in lines:
for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss): for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t}) bboxes.append(
{
"page_number": int(pn[0]),
"x0": float(x0),
"x1": float(x1),
"top": float(top),
"bottom": float(bott),
"text": t,
}
)
for b in bboxes:
text_val = b.get("text", "")
has_text = isinstance(text_val, str) and text_val.strip()
layout = b.get("layout_type")
if layout == "figure" or (b.get("image") and not has_text):
b["doc_type_kwd"] = "image"
elif layout == "table":
b["doc_type_kwd"] = "table"
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
bboxes = attach_media_context(bboxes, table_ctx, image_ctx)
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
self.set_output("json", bboxes) self.set_output("json", bboxes)
@ -319,7 +360,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1") markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser( tcadp_parser = TCADPParser(
table_result_type=table_result_type, table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type markdown_image_response_type=markdown_image_response_type,
) )
if not tcadp_parser.check_installation(): if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.") raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@ -337,7 +378,7 @@ class Parser(ProcessBase):
callback=self.callback, callback=self.callback,
file_type=file_type, file_type=file_type,
file_start_page=1, file_start_page=1,
file_end_page=1000 file_end_page=1000,
) )
# Process TCADP parser output based on configured output_format # Process TCADP parser output based on configured output_format
@ -365,7 +406,12 @@ class Parser(ProcessBase):
# Add tables as text # Add tables as text
for table in tables: for table in tables:
if table: if table:
result.append({"text": table}) result.append({"text": table, "doc_type_kwd": "table"})
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result) self.set_output("json", result)
@ -400,7 +446,13 @@ class Parser(ProcessBase):
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
sections, tbls = docx_parser(name, binary=blob) sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section] sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls]) sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
sections = attach_media_context(sections, table_ctx, image_ctx)
self.set_output("json", sections) self.set_output("json", sections)
elif conf.get("output_format") == "markdown": elif conf.get("output_format") == "markdown":
markdown_text = docx_parser.to_markdown(name, binary=blob) markdown_text = docx_parser.to_markdown(name, binary=blob)
@ -420,7 +472,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1") markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser( tcadp_parser = TCADPParser(
table_result_type=table_result_type, table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type markdown_image_response_type=markdown_image_response_type,
) )
if not tcadp_parser.check_installation(): if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.") raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@ -439,7 +491,7 @@ class Parser(ProcessBase):
callback=self.callback, callback=self.callback,
file_type=file_type, file_type=file_type,
file_start_page=1, file_start_page=1,
file_end_page=1000 file_end_page=1000,
) )
# Process TCADP parser output - PPT only supports json format # Process TCADP parser output - PPT only supports json format
@ -454,7 +506,12 @@ class Parser(ProcessBase):
# Add tables as text # Add tables as text
for table in tables: for table in tables:
if table: if table:
result.append({"text": table}) result.append({"text": table, "doc_type_kwd": "table"})
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result) self.set_output("json", result)
else: else:
@ -469,6 +526,10 @@ class Parser(ProcessBase):
# json # json
assert conf.get("output_format") == "json", "have to be json for ppt" assert conf.get("output_format") == "json", "have to be json for ppt"
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
sections = attach_media_context(sections, table_ctx, image_ctx)
self.set_output("json", sections) self.set_output("json", sections)
def _markdown(self, name, blob): def _markdown(self, name, blob):
@ -508,11 +569,15 @@ class Parser(ProcessBase):
json_results.append(json_result) json_results.append(json_result)
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
json_results = attach_media_context(json_results, table_ctx, image_ctx)
self.set_output("json", json_results) self.set_output("json", json_results)
else: else:
self.set_output("text", "\n".join([section_text for section_text, _ in sections])) self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
def _image(self, name, blob): def _image(self, name, blob):
from deepdoc.vision import OCR from deepdoc.vision import OCR
@ -588,7 +653,7 @@ class Parser(ProcessBase):
from email.parser import BytesParser from email.parser import BytesParser
msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob)) msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
email_content['metadata'] = {} email_content["metadata"] = {}
# handle header info # handle header info
for header, value in msg.items(): for header, value in msg.items():
# get fields like from, to, cc, bcc, date, subject # get fields like from, to, cc, bcc, date, subject
@ -600,6 +665,7 @@ class Parser(ProcessBase):
# get body # get body
if "body" in target_fields: if "body" in target_fields:
body_text, body_html = [], [] body_text, body_html = [], []
def _add_content(m, content_type): def _add_content(m, content_type):
def _decode_payload(payload, charset, target_list): def _decode_payload(payload, charset, target_list):
try: try:
@ -641,14 +707,17 @@ class Parser(ProcessBase):
if dispositions[0].lower() == "attachment": if dispositions[0].lower() == "attachment":
filename = part.get_filename() filename = part.get_filename()
payload = part.get_payload(decode=True).decode(part.get_content_charset()) payload = part.get_payload(decode=True).decode(part.get_content_charset())
attachments.append({ attachments.append(
{
"filename": filename, "filename": filename,
"payload": payload, "payload": payload,
}) }
)
email_content["attachments"] = attachments email_content["attachments"] = attachments
else: else:
# handle msg file # handle msg file
import extract_msg import extract_msg
print("handle a msg file.") print("handle a msg file.")
msg = extract_msg.Message(blob) msg = extract_msg.Message(blob)
# handle header info # handle header info
@ -662,9 +731,9 @@ class Parser(ProcessBase):
} }
email_content.update({k: v for k, v in basic_content.items() if k in target_fields}) email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
# get metadata # get metadata
email_content['metadata'] = { email_content["metadata"] = {
'message_id': msg.messageId, "message_id": msg.messageId,
'in_reply_to': msg.inReplyTo, "in_reply_to": msg.inReplyTo,
} }
# get body # get body
if "body" in target_fields: if "body" in target_fields:
@ -675,29 +744,31 @@ class Parser(ProcessBase):
if "attachments" in target_fields: if "attachments" in target_fields:
attachments = [] attachments = []
for t in msg.attachments: for t in msg.attachments:
attachments.append({ attachments.append(
{
"filename": t.name, "filename": t.name,
"payload": t.data.decode("utf-8") "payload": t.data.decode("utf-8"),
}) }
)
email_content["attachments"] = attachments email_content["attachments"] = attachments
if conf["output_format"] == "json": if conf["output_format"] == "json":
self.set_output("json", [email_content]) self.set_output("json", [email_content])
else: else:
content_txt = '' content_txt = ""
for k, v in email_content.items(): for k, v in email_content.items():
if isinstance(v, str): if isinstance(v, str):
# basic info # basic info
content_txt += f'{k}:{v}' + "\n" content_txt += f"{k}:{v}" + "\n"
elif isinstance(v, dict): elif isinstance(v, dict):
# metadata # metadata
content_txt += f'{k}:{json.dumps(v)}' + "\n" content_txt += f"{k}:{json.dumps(v)}" + "\n"
elif isinstance(v, list): elif isinstance(v, list):
# attachments or others # attachments or others
for fb in v: for fb in v:
if isinstance(fb, dict): if isinstance(fb, dict):
# attachments # attachments
content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n" content_txt += f"{fb['filename']}:{fb['payload']}" + "\n"
else: else:
# str, usually plain text # str, usually plain text
content_txt += fb content_txt += fb

View File

@ -318,6 +318,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d = copy.deepcopy(doc) d = copy.deepcopy(doc)
tokenize(d, rows, eng) tokenize(d, rows, eng)
d["content_with_weight"] = rows d["content_with_weight"] = rows
d["doc_type_kwd"] = "table"
if img: if img:
d["image"] = img d["image"] = img
d["doc_type_kwd"] = "image" d["doc_type_kwd"] = "image"
@ -330,6 +331,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d = copy.deepcopy(doc) d = copy.deepcopy(doc)
r = de.join(rows[i:i + batch_size]) r = de.join(rows[i:i + batch_size])
tokenize(d, r, eng) tokenize(d, r, eng)
d["doc_type_kwd"] = "table"
if img: if img:
d["image"] = img d["image"] = img
d["doc_type_kwd"] = "image" d["doc_type_kwd"] = "image"
@ -338,6 +340,194 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
return res return res
def attach_media_context(chunks, table_context_size=0, image_context_size=0):
"""
Attach surrounding text chunk content to media chunks (table/image).
Best-effort ordering: if positional info exists on any chunk, use it to
order chunks before collecting context; otherwise keep original order.
"""
if not chunks or (table_context_size <= 0 and image_context_size <= 0):
return chunks
def is_image_chunk(ck):
if ck.get("doc_type_kwd") == "image":
return True
text_val = ck.get("content_with_weight") if isinstance(ck.get("content_with_weight"), str) else ck.get("text")
has_text = isinstance(text_val, str) and text_val.strip()
return bool(ck.get("image")) and not has_text
def is_table_chunk(ck):
return ck.get("doc_type_kwd") == "table"
def is_text_chunk(ck):
return not is_image_chunk(ck) and not is_table_chunk(ck)
def get_text(ck):
if isinstance(ck.get("content_with_weight"), str):
return ck["content_with_weight"]
if isinstance(ck.get("text"), str):
return ck["text"]
return ""
def split_sentences(text):
pattern = r"([.。!?!?;:\n])"
parts = re.split(pattern, text)
sentences = []
buf = ""
for p in parts:
if not p:
continue
if re.fullmatch(pattern, p):
buf += p
sentences.append(buf)
buf = ""
else:
buf += p
if buf:
sentences.append(buf)
return sentences
def trim_to_tokens(text, token_budget, from_tail=False):
if token_budget <= 0 or not text:
return ""
sentences = split_sentences(text)
if not sentences:
return ""
collected = []
remaining = token_budget
seq = reversed(sentences) if from_tail else sentences
for s in seq:
tks = num_tokens_from_string(s)
if tks <= 0:
continue
if tks > remaining:
collected.append(s)
break
collected.append(s)
remaining -= tks
if from_tail:
collected = list(reversed(collected))
return "".join(collected)
def extract_position(ck):
pn = None
top = None
left = None
try:
if ck.get("page_num_int"):
pn = ck["page_num_int"][0]
elif ck.get("page_number") is not None:
pn = ck.get("page_number")
if ck.get("top_int"):
top = ck["top_int"][0]
elif ck.get("top") is not None:
top = ck.get("top")
if ck.get("position_int"):
left = ck["position_int"][0][1]
elif ck.get("x0") is not None:
left = ck.get("x0")
except Exception:
pn = top = left = None
return pn, top, left
indexed = list(enumerate(chunks))
positioned_indices = []
unpositioned_indices = []
for idx, ck in indexed:
pn, top, left = extract_position(ck)
if pn is not None and top is not None:
positioned_indices.append((idx, pn, top, left if left is not None else 0))
else:
unpositioned_indices.append(idx)
if positioned_indices:
positioned_indices.sort(key=lambda x: (int(x[1]), int(x[2]), int(x[3]), x[0]))
ordered_indices = [i for i, _, _, _ in positioned_indices] + unpositioned_indices
else:
ordered_indices = [idx for idx, _ in indexed]
total = len(ordered_indices)
for sorted_pos, idx in enumerate(ordered_indices):
ck = chunks[idx]
token_budget = image_context_size if is_image_chunk(ck) else table_context_size if is_table_chunk(ck) else 0
if token_budget <= 0:
continue
prev_ctx = []
remaining_prev = token_budget
for prev_idx in range(sorted_pos - 1, -1, -1):
if remaining_prev <= 0:
break
neighbor_idx = ordered_indices[prev_idx]
if not is_text_chunk(chunks[neighbor_idx]):
break
txt = get_text(chunks[neighbor_idx])
if not txt:
continue
tks = num_tokens_from_string(txt)
if tks <= 0:
continue
if tks > remaining_prev:
txt = trim_to_tokens(txt, remaining_prev, from_tail=True)
tks = num_tokens_from_string(txt)
prev_ctx.append(txt)
remaining_prev -= tks
prev_ctx.reverse()
next_ctx = []
remaining_next = token_budget
for next_idx in range(sorted_pos + 1, total):
if remaining_next <= 0:
break
neighbor_idx = ordered_indices[next_idx]
if not is_text_chunk(chunks[neighbor_idx]):
break
txt = get_text(chunks[neighbor_idx])
if not txt:
continue
tks = num_tokens_from_string(txt)
if tks <= 0:
continue
if tks > remaining_next:
txt = trim_to_tokens(txt, remaining_next, from_tail=False)
tks = num_tokens_from_string(txt)
next_ctx.append(txt)
remaining_next -= tks
if not prev_ctx and not next_ctx:
continue
self_text = get_text(ck)
pieces = [*prev_ctx]
if self_text:
pieces.append(self_text)
pieces.extend(next_ctx)
combined = "\n".join(pieces)
original = ck.get("content_with_weight")
if "content_with_weight" in ck:
ck["content_with_weight"] = combined
elif "text" in ck:
original = ck.get("text")
ck["text"] = combined
if combined != original:
if "content_ltks" in ck:
ck["content_ltks"] = rag_tokenizer.tokenize(combined)
if "content_sm_ltks" in ck:
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck.get("content_ltks", rag_tokenizer.tokenize(combined)))
if positioned_indices:
chunks[:] = [chunks[i] for i in ordered_indices]
return chunks
def add_positions(d, poss): def add_positions(d, poss):
if not poss: if not poss:
return return
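
A small usage sketch of the new helper, assuming it is imported from `rag.nlp` as in the hunk above; the chunk dicts and token budgets are illustrative:

```python
from rag.nlp import attach_media_context  # helper added in this PR

chunks = [
    {"text": "Quarterly revenue grew 12% year over year."},
    {"text": "<table>... revenue by region ...</table>", "doc_type_kwd": "table"},
    {"text": "Growth was driven mainly by the APAC region."},
]

# Give table chunks up to 64 tokens of surrounding text; leave image chunks untouched.
attach_media_context(chunks, table_context_size=64, image_context_size=0)

# The table chunk's "text" now holds the preceding and following sentences
# joined with newlines; plain text chunks are left unchanged.
print(chunks[1]["text"])
```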

View File

@ -69,7 +69,7 @@ def convert_matching_field(field_weightstr: str) -> str:
if field == "docnm_kwd" or field == "title_tks": if field == "docnm_kwd" or field == "title_tks":
field = "docnm@ft_docnm_rag_coarse" field = "docnm@ft_docnm_rag_coarse"
elif field == "title_sm_tks": elif field == "title_sm_tks":
field = "docnm@ft_title_rag_fine" field = "docnm@ft_docnm_rag_fine"
elif field == "important_kwd": elif field == "important_kwd":
field = "important_keywords@ft_important_keywords_rag_coarse" field = "important_keywords@ft_important_keywords_rag_coarse"
elif field == "important_tks": elif field == "important_tks":

View File

@ -42,6 +42,8 @@ DEFAULT_PARSER_CONFIG = {
"auto_keywords": 0, "auto_keywords": 0,
"auto_questions": 0, "auto_questions": 0,
"html4excel": False, "html4excel": False,
"image_context_size": 0,
"table_context_size": 0,
"topn_tags": 3, "topn_tags": 3,
"raptor": { "raptor": {
"use_raptor": True, "use_raptor": True,