Compare commits

5 Commits

856201c0f2 Fix ft_title_rag_fine (#11555)
### What problem does this PR solve?

Fix the matching-field mapping for `title_sm_tks`: it pointed at `docnm@ft_title_rag_fine` instead of the correct `docnm@ft_docnm_rag_fine`.
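
For reference, a condensed sketch of the corrected mapping, based on the `convert_matching_field` hunk near the end of this compare; the `^weight` suffix handling is an assumption added only to make the sketch self-contained:

```python
def convert_matching_field(field_weightstr: str) -> str:
    # Split an optional "^weight" suffix (assumed format, for illustration only).
    field, _, weight = field_weightstr.partition("^")
    if field in ("docnm_kwd", "title_tks"):
        field = "docnm@ft_docnm_rag_coarse"
    elif field == "title_sm_tks":
        # The bug: this previously mapped to "docnm@ft_title_rag_fine",
        # which does not match the fine-grained docnm fulltext index.
        field = "docnm@ft_docnm_rag_fine"
    return f"{field}^{weight}" if weight else field
```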

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-27 10:26:08 +08:00
9d8b96c1d0 Feat: add context for figure and table (#11547)
### What problem does this PR solve?

Add surrounding text context to figure and table chunks.



![demo_figure_table_context](https://github.com/user-attachments/assets/61b37fac-e22e-40a4-9665-9396c7b4103e)


The `==================()` markers in the screenshot are for demonstration purposes only.
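
As the diffs below show, the feature is controlled by two new `parser_config` keys, `table_context_size` and `image_context_size`: token budgets for the context attached to each media chunk (0, the default, disables the feature). A minimal usage sketch with illustrative budget values:

```python
from rag.nlp import attach_media_context

# Chunks as produced upstream by tokenize_table / tokenize_chunks.
chunks = [
    {"content_with_weight": "Revenue grew 12% year over year."},
    {"content_with_weight": "<table>...</table>", "doc_type_kwd": "table"},
]

parser_config = {"table_context_size": 128, "image_context_size": 64}  # illustrative values
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
    # Attaches up to `table_ctx` tokens of neighboring text to table chunks
    # and up to `image_ctx` tokens to image chunks.
    chunks = attach_media_context(chunks, table_ctx, image_ctx)
```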
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-11-27 10:21:44 +08:00
7c3c185038 Minor style changes (#11554)
### What problem does this PR solve?

### Type of change


- [ ] Documentation Update
2025-11-27 09:42:06 +08:00
a9259917c6 fix(files): replace hard-coded status codes with constants (#11544)
### What problem does this PR solve?

Fix error reporting broken by type errors when the various exception responses are returned: the hard-coded integer status codes in the file-management endpoints are replaced with `RetCode` constants.
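
A condensed before/after sketch of the pattern applied throughout the diff below; `check_file_part` is a hypothetical helper standing in for the guard clause inside `upload`:

```python
from api.utils.api_utils import get_json_result
from common.constants import RetCode  # IntEnum: BAD_REQUEST = 400, NOT_FOUND = 404, SERVER_ERROR = 500, ...

def check_file_part(files):
    # Before the fix: code=400 as a bare integer.
    # After the fix: the RetCode constant carries the same value with a named type.
    if "file" not in files:
        return get_json_result(data=False, message="No file part!", code=RetCode.BAD_REQUEST)
    return None
```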

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-27 09:41:24 +08:00
8c28587821 Fix issue where HTML file parsing may lose content. (#11536)
### What problem does this PR solve?

##### Problem Description
When parsing HTML files, some page content may be lost.  
For example, text inside nested `<font>` tags within multiple `<div>`
elements (e.g.,
`<div><font>Text_1</font></div><div><font>Text_2</font></div>`) fails to
be preserved correctly.

###### Root Cause #1: Block ID propagation is interrupted
1. **Block ID generation**: When the parser encounters a `<div>`, it
generates a new `block_id` because `<div>` belongs to `BLOCK_TAGS`.
2. **Recursive processing**: This `block_id` is passed down recursively
to process the `<div>`’s child nodes.
3. **Interruption occurs**: When processing a child `<font>` tag, the
code enters the `else` branch of `read_text_recursively` (since `<font>`
is a Tag).
4. **Bug location**: The first line in this `else` branch explicitly
sets **`block_id = None`**.
- This discards the valid `block_id` inherited from the parent `<div>`.
- Since `<font>` is not in `BLOCK_TAGS`, it does not generate a new
`block_id`, so it passes `None` to its child text nodes.
5. **Consequence**: The extracted text nodes have an empty `block_id` in
their `metadata`. During the subsequent `merge_block_text` step, these
texts cannot be correctly associated with their original `<div>` block
due to the missing ID. As a result, all text from `<font>` tags gets
merged together, which then triggers a second issue during
concatenation.
6. **Solution:** Remove the forced reset of `block_id` to `None`. When the
current tag (e.g., `<font>`) is not a block-level element, it should
inherit the `block_id` passed down from its parent. This ensures
consistent ownership across the hierarchy: `div` → `font` → `text` (see the sketch after this list).
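
A minimal sketch of the fixed recursion, assuming simplified versions of `BLOCK_TAGS` and the text-node handling (the real `read_text_recursively` in `RAGFlowHtmlParser` also collects tables and metadata):

```python
import uuid

from bs4 import BeautifulSoup, NavigableString

BLOCK_TAGS = {"div", "p", "li"}  # simplified for the sketch

def read_text_recursively(element, block_id=None):
    """Collect (text, block_id) pairs; non-block tags inherit the parent's block_id."""
    if isinstance(element, NavigableString):
        text = str(element).strip()
        return [(text, block_id)] if text else []
    # The bug: an unconditional `block_id = None` here severed the inheritance,
    # so text under <font> inside a <div> lost its owning block.
    if element.name and element.name.lower() in BLOCK_TAGS:
        block_id = str(uuid.uuid1())  # a block-level tag starts a new block
    out = []
    for child in element.children:
        out.extend(read_text_recursively(child, block_id))
    return out

soup = BeautifulSoup("<div><font>Text_1</font></div><div><font>Text_2</font></div>", "html.parser")
# Text_1 and Text_2 now carry two distinct block_ids instead of None.
pairs = read_text_recursively(soup)
```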

###### Root Cause #2: Data loss during text concatenation
1. The line `current_content += (" " if current_content else "" +
content)` has a misplaced parenthesis. When `current_content` is
non-empty (`True`):
    - The ternary expression evaluates to `" "` (a single space).
    - The code executes `current_content += " "`.
- **Result**: Only a space is appended; **the new `content` string is
completely discarded** (demonstrated in the sketch below).
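
A self-contained demonstration of the precedence bug and the one-line fix (matching the `RAGFlowHtmlParser` hunk below):

```python
current_content, content = "existing", "new"

# Buggy: the conditional expression binds as
#   " " if current_content else ("" + content)
# so with a non-empty current_content only a space is appended.
buggy = current_content + (" " if current_content else "" + content)
assert buggy == "existing "  # `content` silently discarded

# Fixed: only the separator is conditional; content is always appended.
fixed = current_content + (" " if current_content else "") + content
assert fixed == "existing new"
```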

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-27 09:40:10 +08:00
21 changed files with 469 additions and 168 deletions

View File

@ -31,7 +31,7 @@ from api.db.services.file_service import FileService
from api.utils.api_utils import get_json_result
from api.utils.file_utils import filename_type
from common import settings
from common.constants import RetCode
@manager.route('/file/upload', methods=['POST']) # noqa: F821
@token_required
@ -86,19 +86,19 @@ async def upload(tenant_id):
pf_id = root_folder["id"]
if 'file' not in files:
return get_json_result(data=False, message='No file part!', code=400)
return get_json_result(data=False, message='No file part!', code=RetCode.BAD_REQUEST)
file_objs = files.getlist('file')
for file_obj in file_objs:
if file_obj.filename == '':
return get_json_result(data=False, message='No selected file!', code=400)
return get_json_result(data=False, message='No selected file!', code=RetCode.BAD_REQUEST)
file_res = []
try:
e, pf_folder = FileService.get_by_id(pf_id)
if not e:
return get_json_result(data=False, message="Can't find this folder!", code=404)
return get_json_result(data=False, message="Can't find this folder!", code=RetCode.NOT_FOUND)
for file_obj in file_objs:
# Handle file path
@ -114,13 +114,13 @@ async def upload(tenant_id):
if file_len != len_id_list:
e, file = FileService.get_by_id(file_id_list[len_id_list - 1])
if not e:
return get_json_result(data=False, message="Folder not found!", code=404)
return get_json_result(data=False, message="Folder not found!", code=RetCode.NOT_FOUND)
last_folder = FileService.create_folder(file, file_id_list[len_id_list - 1], file_obj_names,
len_id_list)
else:
e, file = FileService.get_by_id(file_id_list[len_id_list - 2])
if not e:
return get_json_result(data=False, message="Folder not found!", code=404)
return get_json_result(data=False, message="Folder not found!", code=RetCode.NOT_FOUND)
last_folder = FileService.create_folder(file, file_id_list[len_id_list - 2], file_obj_names,
len_id_list)
@ -202,7 +202,7 @@ async def create(tenant_id):
try:
if not FileService.is_parent_folder_exist(pf_id):
return get_json_result(data=False, message="Parent Folder Doesn't Exist!", code=400)
return get_json_result(data=False, message="Parent Folder Doesn't Exist!", code=RetCode.BAD_REQUEST)
if FileService.query(name=req["name"], parent_id=pf_id):
return get_json_result(data=False, message="Duplicated folder name in the same folder.", code=409)
@ -306,13 +306,13 @@ def list_files(tenant_id):
try:
e, file = FileService.get_by_id(pf_id)
if not e:
return get_json_result(message="Folder not found!", code=404)
return get_json_result(message="Folder not found!", code=RetCode.NOT_FOUND)
files, total = FileService.get_by_pf_id(tenant_id, pf_id, page_number, items_per_page, orderby, desc, keywords)
parent_folder = FileService.get_parent_folder(pf_id)
if not parent_folder:
return get_json_result(message="File not found!", code=404)
return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
return get_json_result(data={"total": total, "files": files, "parent_folder": parent_folder.to_json()})
except Exception as e:
@ -392,7 +392,7 @@ def get_parent_folder():
try:
e, file = FileService.get_by_id(file_id)
if not e:
return get_json_result(message="Folder not found!", code=404)
return get_json_result(message="Folder not found!", code=RetCode.NOT_FOUND)
parent_folder = FileService.get_parent_folder(file_id)
return get_json_result(data={"parent_folder": parent_folder.to_json()})
@ -439,7 +439,7 @@ def get_all_parent_folders(tenant_id):
try:
e, file = FileService.get_by_id(file_id)
if not e:
return get_json_result(message="Folder not found!", code=404)
return get_json_result(message="Folder not found!", code=RetCode.NOT_FOUND)
parent_folders = FileService.get_all_parent_folders(file_id)
parent_folders_res = [folder.to_json() for folder in parent_folders]
@ -487,34 +487,34 @@ async def rm(tenant_id):
for file_id in file_ids:
e, file = FileService.get_by_id(file_id)
if not e:
return get_json_result(message="File or Folder not found!", code=404)
return get_json_result(message="File or Folder not found!", code=RetCode.NOT_FOUND)
if not file.tenant_id:
return get_json_result(message="Tenant not found!", code=404)
return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
if file.type == FileType.FOLDER.value:
file_id_list = FileService.get_all_innermost_file_ids(file_id, [])
for inner_file_id in file_id_list:
e, file = FileService.get_by_id(inner_file_id)
if not e:
return get_json_result(message="File not found!", code=404)
return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
settings.STORAGE_IMPL.rm(file.parent_id, file.location)
FileService.delete_folder_by_pf_id(tenant_id, file_id)
else:
settings.STORAGE_IMPL.rm(file.parent_id, file.location)
if not FileService.delete(file):
return get_json_result(message="Database error (File removal)!", code=500)
return get_json_result(message="Database error (File removal)!", code=RetCode.SERVER_ERROR)
informs = File2DocumentService.get_by_file_id(file_id)
for inform in informs:
doc_id = inform.document_id
e, doc = DocumentService.get_by_id(doc_id)
if not e:
return get_json_result(message="Document not found!", code=404)
return get_json_result(message="Document not found!", code=RetCode.NOT_FOUND)
tenant_id = DocumentService.get_tenant_id(doc_id)
if not tenant_id:
return get_json_result(message="Tenant not found!", code=404)
return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
if not DocumentService.remove_document(doc, tenant_id):
return get_json_result(message="Database error (Document removal)!", code=500)
return get_json_result(message="Database error (Document removal)!", code=RetCode.SERVER_ERROR)
File2DocumentService.delete_by_file_id(file_id)
return get_json_result(data=True)
@ -560,23 +560,23 @@ async def rename(tenant_id):
try:
e, file = FileService.get_by_id(req["file_id"])
if not e:
return get_json_result(message="File not found!", code=404)
return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
if file.type != FileType.FOLDER.value and pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
file.name.lower()).suffix:
return get_json_result(data=False, message="The extension of file can't be changed", code=400)
return get_json_result(data=False, message="The extension of file can't be changed", code=RetCode.BAD_REQUEST)
for existing_file in FileService.query(name=req["name"], pf_id=file.parent_id):
if existing_file.name == req["name"]:
return get_json_result(data=False, message="Duplicated file name in the same folder.", code=409)
if not FileService.update_by_id(req["file_id"], {"name": req["name"]}):
return get_json_result(message="Database error (File rename)!", code=500)
return get_json_result(message="Database error (File rename)!", code=RetCode.SERVER_ERROR)
informs = File2DocumentService.get_by_file_id(req["file_id"])
if informs:
if not DocumentService.update_by_id(informs[0].document_id, {"name": req["name"]}):
return get_json_result(message="Database error (Document rename)!", code=500)
return get_json_result(message="Database error (Document rename)!", code=RetCode.SERVER_ERROR)
return get_json_result(data=True)
except Exception as e:
@ -606,13 +606,13 @@ async def get(tenant_id, file_id):
description: File stream
schema:
type: file
404:
RetCode.NOT_FOUND:
description: File not found
"""
try:
e, file = FileService.get_by_id(file_id)
if not e:
return get_json_result(message="Document not found!", code=404)
return get_json_result(message="Document not found!", code=RetCode.NOT_FOUND)
blob = settings.STORAGE_IMPL.get(file.parent_id, file.location)
if not blob:
@ -677,13 +677,13 @@ async def move(tenant_id):
for file_id in file_ids:
file = files_dict[file_id]
if not file:
return get_json_result(message="File or Folder not found!", code=404)
return get_json_result(message="File or Folder not found!", code=RetCode.NOT_FOUND)
if not file.tenant_id:
return get_json_result(message="Tenant not found!", code=404)
return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
fe, _ = FileService.get_by_id(parent_id)
if not fe:
return get_json_result(message="Parent Folder not found!", code=404)
return get_json_result(message="Parent Folder not found!", code=RetCode.NOT_FOUND)
FileService.move_file(file_ids, parent_id)
return get_json_result(data=True)
@ -705,7 +705,7 @@ async def convert(tenant_id):
for file_id in file_ids:
file = files_set[file_id]
if not file:
return get_json_result(message="File not found!", code=404)
return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
file_ids_list = [file_id]
if file.type == FileType.FOLDER.value:
file_ids_list = FileService.get_all_innermost_file_ids(file_id, [])
@ -716,13 +716,13 @@ async def convert(tenant_id):
doc_id = inform.document_id
e, doc = DocumentService.get_by_id(doc_id)
if not e:
return get_json_result(message="Document not found!", code=404)
return get_json_result(message="Document not found!", code=RetCode.NOT_FOUND)
tenant_id = DocumentService.get_tenant_id(doc_id)
if not tenant_id:
return get_json_result(message="Tenant not found!", code=404)
return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
if not DocumentService.remove_document(doc, tenant_id):
return get_json_result(
message="Database error (Document removal)!", code=404)
message="Database error (Document removal)!", code=RetCode.NOT_FOUND)
File2DocumentService.delete_by_file_id(id)
# insert
@ -730,11 +730,11 @@ async def convert(tenant_id):
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
return get_json_result(
message="Can't find this knowledgebase!", code=404)
message="Can't find this knowledgebase!", code=RetCode.NOT_FOUND)
e, file = FileService.get_by_id(id)
if not e:
return get_json_result(
message="Can't find this file!", code=404)
message="Can't find this file!", code=RetCode.NOT_FOUND)
doc = DocumentService.insert({
"id": get_uuid(),

View File

@ -749,7 +749,7 @@ class Knowledgebase(DataBaseModel):
parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True)
pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
pagerank = IntegerField(default=0, index=False)
graphrag_task_id = CharField(max_length=32, null=True, help_text="Graph RAG task ID", index=True)
@ -774,7 +774,7 @@ class Document(DataBaseModel):
kb_id = CharField(max_length=256, null=False, index=True)
parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True)
pipeline_id = CharField(max_length=32, null=True, help_text="pipeline ID", index=True)
parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True)
type = CharField(max_length=32, null=False, help_text="file extension", index=True)
created_by = CharField(max_length=32, null=False, help_text="who created it", index=True)

View File

@ -923,7 +923,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
ParserType.AUDIO.value: audio,
ParserType.EMAIL.value: email
}
parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text"}
parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text", "table_context_size": 0, "image_context_size": 0}
exe = ThreadPoolExecutor(max_workers=12)
threads = []
doc_nm = {}

View File

@ -313,6 +313,10 @@ def get_parser_config(chunk_method, parser_config):
chunk_method = "naive"
# Define default configurations for each chunking method
base_defaults = {
"table_context_size": 0,
"image_context_size": 0,
}
key_mapping = {
"naive": {
"layout_recognize": "DeepDOC",
@ -365,16 +369,19 @@ def get_parser_config(chunk_method, parser_config):
default_config = key_mapping[chunk_method]
# If no parser_config provided, return default
# If no parser_config provided, return default merged with base defaults
if not parser_config:
return default_config
if default_config is None:
return deep_merge(base_defaults, {})
return deep_merge(base_defaults, default_config)
# If parser_config is provided, merge with defaults to ensure required fields exist
if default_config is None:
return parser_config
return deep_merge(base_defaults, parser_config)
# Ensure raptor and graphrag fields have default values if not provided
merged_config = deep_merge(default_config, parser_config)
merged_config = deep_merge(base_defaults, default_config)
merged_config = deep_merge(merged_config, parser_config)
return merged_config

View File

@ -49,6 +49,7 @@ class RetCode(IntEnum, CustomEnum):
RUNNING = 106
PERMISSION_ERROR = 108
AUTHENTICATION_ERROR = 109
BAD_REQUEST = 400
UNAUTHORIZED = 401
SERVER_ERROR = 500
FORBIDDEN = 403

View File

@ -138,7 +138,6 @@ class RAGFlowHtmlParser:
"metadata": {"table_id": table_id, "index": table_list.index(t)}})
return table_info_list
else:
block_id = None
if str.lower(element.name) in BLOCK_TAGS:
block_id = str(uuid.uuid1())
for child in element.children:
@ -172,7 +171,7 @@ class RAGFlowHtmlParser:
if tag_name == "table":
table_info_list.append(item)
else:
current_content += (" " if current_content else "" + content)
current_content += (" " if current_content else "") + content
if current_content:
block_content.append(current_content)
return block_content, table_info_list

View File

@ -39,8 +39,10 @@ If you have not installed Docker on your local machine (Windows, Mac, or Linux),
This section provides instructions on setting up the RAGFlow server on Linux. If you are on a different operating system, no worries. Most steps are alike.
1. Ensure `vm.max_map_count` &ge; 262144.
<details>
<summary>1. Ensure <code>vm.max_map_count</code> &ge; 262144:</summary>
<summary>Expand to show details:</summary>
`vm.max_map_count`. This value sets the maximum number of memory map areas a process may have. Its default value is 65530. While most applications require fewer than a thousand maps, reducing this value can result in abnormal behaviors, and the system will throw out-of-memory errors when a process reaches the limitation.

View File

@ -23,7 +23,7 @@ from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks
tokenize_chunks, attach_media_context
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
@ -175,6 +175,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res

View File

@ -20,7 +20,7 @@ import re
from common.constants import ParserType
from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
from common.token_utils import num_tokens_from_string
from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
@ -310,6 +310,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
@ -325,6 +329,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
d["doc_type_kwd"] = "image"
tokenize(d, text, eng)
res.append(d)
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res
else:
raise NotImplementedError("file type not supported yet(pdf and docx supported)")

View File

@ -37,7 +37,7 @@ from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
callback = callback
@ -616,6 +616,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
final_sections = False
doc = {
"docnm_kwd": filename,
@ -686,6 +688,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
logging.info("naive_merge({}): {}".format(filename, timer() - st))
res.extend(embed_res)
res.extend(url_res)
if table_context_size or image_context_size:
attach_media_context(res, table_context_size, image_context_size)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@ -947,6 +951,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.extend(embed_res)
if url_res:
res.extend(url_res)
if table_context_size or image_context_size:
attach_media_context(res, table_context_size, image_context_size)
return res

View File

@ -20,7 +20,7 @@ import re
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
from common.constants import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, attach_media_context
from deepdoc.parser import PdfParser
import numpy as np
from rag.app.naive import by_plaintext, PARSERS
@ -234,6 +234,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
chunks.append(txt)
last_sid = sec_id
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res

View File

@ -20,11 +20,11 @@ import re
import numpy as np
from PIL import Image
from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.vision import OCR
from rag.nlp import rag_tokenizer, tokenize
from common.constants import LLMType
from common.string_utils import clean_markdown_block
from deepdoc.vision import OCR
from rag.nlp import attach_media_context, rag_tokenizer, tokenize
ocr = OCR()
@ -39,9 +39,16 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
}
eng = lang.lower() == "english"
parser_config = kwargs.get("parser_config", {}) or {}
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
try:
doc.update({"doc_type_kwd": "video"})
doc.update(
{
"doc_type_kwd": "video",
}
)
cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang)
ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
@ -64,7 +71,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
if (eng and len(txt.split()) > 32) or len(txt) > 32:
tokenize(doc, txt, eng)
callback(0.8, "OCR results is too long to use CV LLM.")
return [doc]
return attach_media_context([doc], 0, image_ctx)
try:
callback(0.4, "Use CV LLM to describe the picture.")
@ -76,7 +83,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
return [doc]
return attach_media_context([doc], 0, image_ctx)
except Exception as e:
callback(prog=-1, msg=str(e))

View File

@ -19,16 +19,16 @@ import random
import re
from functools import partial
import trio
import numpy as np
import trio
from PIL import Image
from common.constants import LLMType
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.llm_service import LLMBundle
from common import settings
from common.constants import LLMType
from common.misc_utils import get_uuid
from rag.utils.base64_image import image2id
from deepdoc.parser import ExcelParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
@ -37,7 +37,8 @@ from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream
from rag.llm.cv_model import Base as VLM
from common import settings
from rag.nlp import attach_media_context
from rag.utils.base64_image import image2id
class ParserParam(ProcessParamBase):
@ -61,15 +62,18 @@ class ParserParam(ProcessParamBase):
"json",
],
"image": [
"text"
"text",
],
"email": [
"text",
"json",
],
"email": ["text", "json"],
"text&markdown": [
"text",
"json"
"json",
],
"audio": [
"json"
"json",
],
"video": [],
}
@ -82,6 +86,8 @@ class ParserParam(ProcessParamBase):
"pdf",
],
"output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
},
"spreadsheet": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser
@ -91,6 +97,8 @@ class ParserParam(ProcessParamBase):
"xlsx",
"csv",
],
"table_context_size": 0,
"image_context_size": 0,
},
"word": {
"suffix": [
@ -98,18 +106,24 @@ class ParserParam(ProcessParamBase):
"docx",
],
"output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
},
"text&markdown": {
"suffix": ["md", "markdown", "mdx", "txt"],
"output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
},
"slides": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser
"suffix": [
"pptx",
"ppt"
"ppt",
],
"output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
},
"image": {
"parse_method": "ocr",
@ -121,7 +135,8 @@ class ParserParam(ProcessParamBase):
},
"email": {
"suffix": [
"eml", "msg"
"eml",
"msg",
],
"fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
"output_format": "json",
@ -142,7 +157,7 @@ class ParserParam(ProcessParamBase):
"realaudio",
"vqf",
"oggvorbis",
"ape"
"ape",
],
"output_format": "text",
},
@ -150,7 +165,7 @@ class ParserParam(ProcessParamBase):
"suffix": [
"mp4",
"avi",
"mkv"
"mkv",
],
"output_format": "text",
},
@ -253,7 +268,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(
table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type
markdown_image_response_type=markdown_image_response_type,
)
sections, _ = tcadp_parser.parse_pdf(
filepath=name,
@ -261,7 +276,7 @@ class Parser(ProcessBase):
callback=self.callback,
file_type="PDF",
file_start_page=1,
file_end_page=1000
file_end_page=1000,
)
bboxes = []
for section, position_tag in sections:
@ -269,17 +284,20 @@ class Parser(ProcessBase):
# Extract position information from TCADP's position tag
# Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
import re
match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
if match:
pn, x0, x1, top, bott = match.groups()
bboxes.append({
"page_number": int(pn.split('-')[0]), # Take the first page number
bboxes.append(
{
"page_number": int(pn.split("-")[0]), # Take the first page number
"x0": float(x0),
"x1": float(x1),
"top": float(top),
"bottom": float(bott),
"text": section
})
"text": section,
}
)
else:
# If no position info, add as text without position
bboxes.append({"text": section})
@ -291,7 +309,30 @@ class Parser(ProcessBase):
bboxes = []
for t, poss in lines:
for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
bboxes.append(
{
"page_number": int(pn[0]),
"x0": float(x0),
"x1": float(x1),
"top": float(top),
"bottom": float(bott),
"text": t,
}
)
for b in bboxes:
text_val = b.get("text", "")
has_text = isinstance(text_val, str) and text_val.strip()
layout = b.get("layout_type")
if layout == "figure" or (b.get("image") and not has_text):
b["doc_type_kwd"] = "image"
elif layout == "table":
b["doc_type_kwd"] = "table"
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
bboxes = attach_media_context(bboxes, table_ctx, image_ctx)
if conf.get("output_format") == "json":
self.set_output("json", bboxes)
@ -319,7 +360,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(
table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type
markdown_image_response_type=markdown_image_response_type,
)
if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@ -337,7 +378,7 @@ class Parser(ProcessBase):
callback=self.callback,
file_type=file_type,
file_start_page=1,
file_end_page=1000
file_end_page=1000,
)
# Process TCADP parser output based on configured output_format
@ -365,7 +406,12 @@ class Parser(ProcessBase):
# Add tables as text
for table in tables:
if table:
result.append({"text": table})
result.append({"text": table, "doc_type_kwd": "table"})
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result)
@ -400,7 +446,13 @@ class Parser(ProcessBase):
if conf.get("output_format") == "json":
sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
sections = attach_media_context(sections, table_ctx, image_ctx)
self.set_output("json", sections)
elif conf.get("output_format") == "markdown":
markdown_text = docx_parser.to_markdown(name, binary=blob)
@ -420,7 +472,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(
table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type
markdown_image_response_type=markdown_image_response_type,
)
if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@ -439,7 +491,7 @@ class Parser(ProcessBase):
callback=self.callback,
file_type=file_type,
file_start_page=1,
file_end_page=1000
file_end_page=1000,
)
# Process TCADP parser output - PPT only supports json format
@ -454,7 +506,12 @@ class Parser(ProcessBase):
# Add tables as text
for table in tables:
if table:
result.append({"text": table})
result.append({"text": table, "doc_type_kwd": "table"})
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result)
else:
@ -469,6 +526,10 @@ class Parser(ProcessBase):
# json
assert conf.get("output_format") == "json", "have to be json for ppt"
if conf.get("output_format") == "json":
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
sections = attach_media_context(sections, table_ctx, image_ctx)
self.set_output("json", sections)
def _markdown(self, name, blob):
@ -508,11 +569,15 @@ class Parser(ProcessBase):
json_results.append(json_result)
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
json_results = attach_media_context(json_results, table_ctx, image_ctx)
self.set_output("json", json_results)
else:
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
def _image(self, name, blob):
from deepdoc.vision import OCR
@ -588,7 +653,7 @@ class Parser(ProcessBase):
from email.parser import BytesParser
msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
email_content['metadata'] = {}
email_content["metadata"] = {}
# handle header info
for header, value in msg.items():
# get fields like from, to, cc, bcc, date, subject
@ -600,6 +665,7 @@ class Parser(ProcessBase):
# get body
if "body" in target_fields:
body_text, body_html = [], []
def _add_content(m, content_type):
def _decode_payload(payload, charset, target_list):
try:
@ -641,14 +707,17 @@ class Parser(ProcessBase):
if dispositions[0].lower() == "attachment":
filename = part.get_filename()
payload = part.get_payload(decode=True).decode(part.get_content_charset())
attachments.append({
attachments.append(
{
"filename": filename,
"payload": payload,
})
}
)
email_content["attachments"] = attachments
else:
# handle msg file
import extract_msg
print("handle a msg file.")
msg = extract_msg.Message(blob)
# handle header info
@ -662,9 +731,9 @@ class Parser(ProcessBase):
}
email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
# get metadata
email_content['metadata'] = {
'message_id': msg.messageId,
'in_reply_to': msg.inReplyTo,
email_content["metadata"] = {
"message_id": msg.messageId,
"in_reply_to": msg.inReplyTo,
}
# get body
if "body" in target_fields:
@ -675,29 +744,31 @@ class Parser(ProcessBase):
if "attachments" in target_fields:
attachments = []
for t in msg.attachments:
attachments.append({
attachments.append(
{
"filename": t.name,
"payload": t.data.decode("utf-8")
})
"payload": t.data.decode("utf-8"),
}
)
email_content["attachments"] = attachments
if conf["output_format"] == "json":
self.set_output("json", [email_content])
else:
content_txt = ''
content_txt = ""
for k, v in email_content.items():
if isinstance(v, str):
# basic info
content_txt += f'{k}:{v}' + "\n"
content_txt += f"{k}:{v}" + "\n"
elif isinstance(v, dict):
# metadata
content_txt += f'{k}:{json.dumps(v)}' + "\n"
content_txt += f"{k}:{json.dumps(v)}" + "\n"
elif isinstance(v, list):
# attachments or others
for fb in v:
if isinstance(fb, dict):
# attachments
content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n"
content_txt += f"{fb['filename']}:{fb['payload']}" + "\n"
else:
# str, usually plain text
content_txt += fb

View File

@ -318,6 +318,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d = copy.deepcopy(doc)
tokenize(d, rows, eng)
d["content_with_weight"] = rows
d["doc_type_kwd"] = "table"
if img:
d["image"] = img
d["doc_type_kwd"] = "image"
@ -330,6 +331,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d = copy.deepcopy(doc)
r = de.join(rows[i:i + batch_size])
tokenize(d, r, eng)
d["doc_type_kwd"] = "table"
if img:
d["image"] = img
d["doc_type_kwd"] = "image"
@ -338,6 +340,194 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
return res
def attach_media_context(chunks, table_context_size=0, image_context_size=0):
"""
Attach surrounding text chunk content to media chunks (table/image).
Best-effort ordering: if positional info exists on any chunk, use it to
order chunks before collecting context; otherwise keep original order.
"""
if not chunks or (table_context_size <= 0 and image_context_size <= 0):
return chunks
def is_image_chunk(ck):
if ck.get("doc_type_kwd") == "image":
return True
text_val = ck.get("content_with_weight") if isinstance(ck.get("content_with_weight"), str) else ck.get("text")
has_text = isinstance(text_val, str) and text_val.strip()
return bool(ck.get("image")) and not has_text
def is_table_chunk(ck):
return ck.get("doc_type_kwd") == "table"
def is_text_chunk(ck):
return not is_image_chunk(ck) and not is_table_chunk(ck)
def get_text(ck):
if isinstance(ck.get("content_with_weight"), str):
return ck["content_with_weight"]
if isinstance(ck.get("text"), str):
return ck["text"]
return ""
def split_sentences(text):
pattern = r"([.。!?!?;:\n])"
parts = re.split(pattern, text)
sentences = []
buf = ""
for p in parts:
if not p:
continue
if re.fullmatch(pattern, p):
buf += p
sentences.append(buf)
buf = ""
else:
buf += p
if buf:
sentences.append(buf)
return sentences
def trim_to_tokens(text, token_budget, from_tail=False):
if token_budget <= 0 or not text:
return ""
sentences = split_sentences(text)
if not sentences:
return ""
collected = []
remaining = token_budget
seq = reversed(sentences) if from_tail else sentences
for s in seq:
tks = num_tokens_from_string(s)
if tks <= 0:
continue
if tks > remaining:
collected.append(s)
break
collected.append(s)
remaining -= tks
if from_tail:
collected = list(reversed(collected))
return "".join(collected)
def extract_position(ck):
pn = None
top = None
left = None
try:
if ck.get("page_num_int"):
pn = ck["page_num_int"][0]
elif ck.get("page_number") is not None:
pn = ck.get("page_number")
if ck.get("top_int"):
top = ck["top_int"][0]
elif ck.get("top") is not None:
top = ck.get("top")
if ck.get("position_int"):
left = ck["position_int"][0][1]
elif ck.get("x0") is not None:
left = ck.get("x0")
except Exception:
pn = top = left = None
return pn, top, left
indexed = list(enumerate(chunks))
positioned_indices = []
unpositioned_indices = []
for idx, ck in indexed:
pn, top, left = extract_position(ck)
if pn is not None and top is not None:
positioned_indices.append((idx, pn, top, left if left is not None else 0))
else:
unpositioned_indices.append(idx)
if positioned_indices:
positioned_indices.sort(key=lambda x: (int(x[1]), int(x[2]), int(x[3]), x[0]))
ordered_indices = [i for i, _, _, _ in positioned_indices] + unpositioned_indices
else:
ordered_indices = [idx for idx, _ in indexed]
total = len(ordered_indices)
for sorted_pos, idx in enumerate(ordered_indices):
ck = chunks[idx]
token_budget = image_context_size if is_image_chunk(ck) else table_context_size if is_table_chunk(ck) else 0
if token_budget <= 0:
continue
prev_ctx = []
remaining_prev = token_budget
for prev_idx in range(sorted_pos - 1, -1, -1):
if remaining_prev <= 0:
break
neighbor_idx = ordered_indices[prev_idx]
if not is_text_chunk(chunks[neighbor_idx]):
break
txt = get_text(chunks[neighbor_idx])
if not txt:
continue
tks = num_tokens_from_string(txt)
if tks <= 0:
continue
if tks > remaining_prev:
txt = trim_to_tokens(txt, remaining_prev, from_tail=True)
tks = num_tokens_from_string(txt)
prev_ctx.append(txt)
remaining_prev -= tks
prev_ctx.reverse()
next_ctx = []
remaining_next = token_budget
for next_idx in range(sorted_pos + 1, total):
if remaining_next <= 0:
break
neighbor_idx = ordered_indices[next_idx]
if not is_text_chunk(chunks[neighbor_idx]):
break
txt = get_text(chunks[neighbor_idx])
if not txt:
continue
tks = num_tokens_from_string(txt)
if tks <= 0:
continue
if tks > remaining_next:
txt = trim_to_tokens(txt, remaining_next, from_tail=False)
tks = num_tokens_from_string(txt)
next_ctx.append(txt)
remaining_next -= tks
if not prev_ctx and not next_ctx:
continue
self_text = get_text(ck)
pieces = [*prev_ctx]
if self_text:
pieces.append(self_text)
pieces.extend(next_ctx)
combined = "\n".join(pieces)
original = ck.get("content_with_weight")
if "content_with_weight" in ck:
ck["content_with_weight"] = combined
elif "text" in ck:
original = ck.get("text")
ck["text"] = combined
if combined != original:
if "content_ltks" in ck:
ck["content_ltks"] = rag_tokenizer.tokenize(combined)
if "content_sm_ltks" in ck:
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck.get("content_ltks", rag_tokenizer.tokenize(combined)))
if positioned_indices:
chunks[:] = [chunks[i] for i in ordered_indices]
return chunks
def add_positions(d, poss):
if not poss:
return

View File

@ -69,7 +69,7 @@ def convert_matching_field(field_weightstr: str) -> str:
if field == "docnm_kwd" or field == "title_tks":
field = "docnm@ft_docnm_rag_coarse"
elif field == "title_sm_tks":
field = "docnm@ft_title_rag_fine"
field = "docnm@ft_docnm_rag_fine"
elif field == "important_kwd":
field = "important_keywords@ft_important_keywords_rag_coarse"
elif field == "important_tks":

View File

@ -42,6 +42,8 @@ DEFAULT_PARSER_CONFIG = {
"auto_keywords": 0,
"auto_questions": 0,
"html4excel": False,
"image_context_size": 0,
"table_context_size": 0,
"topn_tags": 3,
"raptor": {
"use_raptor": True,