Compare commits

...

5 Commits

Author SHA1 Message Date
856201c0f2 Fix ft_title_rag_fine (#11555)
### What problem does this PR solve?

Fix the `ft_title_rag_fine` field name: the `title_sm_tks` field now maps to `docnm@ft_docnm_rag_fine`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-27 10:26:08 +08:00
9d8b96c1d0 Feat: add context for figure and table (#11547)
### What problem does this PR solve?

Add surrounding text context for figure and table chunks.



![demo_figure_table_context](https://github.com/user-attachments/assets/61b37fac-e22e-40a4-9665-9396c7b4103e)


The `==================()` markers in the screenshot are for demonstration purposes only.
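
A minimal sketch of how the new settings surface in `parser_config`, based on the defaults added in this PR (the non-zero values below are illustrative; `0` keeps the previous behaviour of attaching no context):

```python
# Illustrative parser_config enabling the new feature (values are examples only).
parser_config = {
    "chunk_token_num": 512,
    "delimiter": "\n!?。;!?",
    "layout_recognize": "DeepDOC",
    "table_context_size": 128,  # token budget of surrounding text attached to table chunks
    "image_context_size": 64,   # token budget of surrounding text attached to figure/image chunks
}
```
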
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-11-27 10:21:44 +08:00
7c3c185038 Minor style changes (#11554)
### What problem does this PR solve?

### Type of change


- [ ] Documentation Update
2025-11-27 09:42:06 +08:00
a9259917c6 fix(files): replace hard coded status codes with constants (#11544)
### What problem does this PR solve?

Replace the hard-coded HTTP status codes in the file APIs with `RetCode` constants, so that the various error-return paths report consistent, correctly typed status codes.
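
The pattern mirrors the call sites touched below; the wrapper function here is hypothetical and only makes the snippet self-contained:

```python
from api.utils.api_utils import get_json_result
from common.constants import RetCode

def reject_missing_file():
    # Before: return get_json_result(data=False, message='No file part!', code=400)
    # After: a named constant instead of a magic number
    return get_json_result(data=False, message='No file part!', code=RetCode.BAD_REQUEST)
```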

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-27 09:41:24 +08:00
8c28587821 Fix issue where HTML file parsing may lose content. (#11536)
### What problem does this PR solve?

##### Problem Description
When parsing HTML files, some page content may be lost.  
For example, text inside nested `<font>` tags within multiple `<div>`
elements (e.g.,
`<div><font>Text_1</font></div><div><font>Text_2</font></div>`) fails to
be preserved correctly.

###### Root Cause #1: Block ID propagation is interrupted
1. **Block ID generation**: When the parser encounters a `<div>`, it
generates a new `block_id` because `<div>` belongs to `BLOCK_TAGS`.
2. **Recursive processing**: This `block_id` is passed down recursively
to process the `<div>`’s child nodes.
3. **Interruption occurs**: When processing a child `<font>` tag, the
code enters the `else` branch of `read_text_recursively` (since `<font>`
is a Tag).
4. **Bug location**: The first line in this `else` branch explicitly
sets **`block_id = None`**.
- This discards the valid `block_id` inherited from the parent `<div>`.
- Since `<font>` is not in `BLOCK_TAGS`, it does not generate a new
`block_id`, so it passes `None` to its child text nodes.
5. **Consequence**: The extracted text nodes have an empty `block_id` in
their `metadata`. During the subsequent `merge_block_text` step, these
texts cannot be correctly associated with their original `<div>` block
due to the missing ID. As a result, all text from `<font>` tags gets
merged together, which then triggers a second issue during
concatenation.
6. **Solution:** Remove the forced reset of `block_id` to `None`. When the current
tag (e.g., `<font>`) is not a block-level element, it should inherit the
`block_id` passed down from its parent, ensuring consistent ownership across
the hierarchy: `div` → `font` → `text` (see the sketch below).
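
A minimal, self-contained sketch of that ownership rule (a hypothetical simplified walker, not the project's actual `read_text_recursively`; assumes BeautifulSoup is available):

```python
import uuid

from bs4 import BeautifulSoup, NavigableString, Tag

BLOCK_TAGS = {"div", "p", "section"}

def collect(node, block_id, out):
    """Only block-level tags mint a new block_id; inline tags inherit their parent's."""
    if isinstance(node, NavigableString):
        text = str(node).strip()
        if text:
            out.append({"text": text, "block_id": block_id})
    elif isinstance(node, Tag):
        if node.name.lower() in BLOCK_TAGS:
            block_id = str(uuid.uuid1())  # a new block starts here
        # Crucially, there is no "block_id = None" for inline tags like <font>.
        for child in node.children:
            collect(child, block_id, out)

out = []
html = "<div><font>Text_1</font></div><div><font>Text_2</font></div>"
collect(BeautifulSoup(html, "html.parser"), None, out)
# Text_1 and Text_2 now carry two distinct block_ids, so a later merge step
# can keep them attached to their original <div> blocks.
```
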

###### Root Cause #2: Data loss during text concatenation
1. The line `current_content += (" " if current_content else "" +
content)` has a misplaced parenthesis. When `current_content` is
non-empty (`True`):
    - The ternary expression evaluates to `" "` (a single space).
    - The code executes `current_content += " "`.
- **Result**: only a single space is appended; the new `content` string is silently discarded (see the sketch below).
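
A minimal sketch of the precedence issue using plain strings (no project code involved): `+` binds tighter than the conditional expression, so `"" + content` becomes the `else` arm, and the whole parenthesised expression collapses to a single space whenever `current_content` is truthy.

```python
current_content = "already collected text"
content = "new text"

# Buggy: the closing parenthesis sits too late, so this parses as
#   (" ") if current_content else ("" + content)
buggy = current_content + (" " if current_content else "" + content)
assert buggy == "already collected text "          # new content silently dropped

# Fixed: close the conditional before concatenating
fixed = current_content + (" " if current_content else "") + content
assert fixed == "already collected text new text"
```
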

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-11-27 09:40:10 +08:00
21 changed files with 469 additions and 168 deletions

View File

@ -31,7 +31,7 @@ from api.db.services.file_service import FileService
from api.utils.api_utils import get_json_result from api.utils.api_utils import get_json_result
from api.utils.file_utils import filename_type from api.utils.file_utils import filename_type
from common import settings from common import settings
from common.constants import RetCode
@manager.route('/file/upload', methods=['POST']) # noqa: F821 @manager.route('/file/upload', methods=['POST']) # noqa: F821
@token_required @token_required
@ -86,19 +86,19 @@ async def upload(tenant_id):
pf_id = root_folder["id"] pf_id = root_folder["id"]
if 'file' not in files: if 'file' not in files:
return get_json_result(data=False, message='No file part!', code=400) return get_json_result(data=False, message='No file part!', code=RetCode.BAD_REQUEST)
file_objs = files.getlist('file') file_objs = files.getlist('file')
for file_obj in file_objs: for file_obj in file_objs:
if file_obj.filename == '': if file_obj.filename == '':
return get_json_result(data=False, message='No selected file!', code=400) return get_json_result(data=False, message='No selected file!', code=RetCode.BAD_REQUEST)
file_res = [] file_res = []
try: try:
e, pf_folder = FileService.get_by_id(pf_id) e, pf_folder = FileService.get_by_id(pf_id)
if not e: if not e:
return get_json_result(data=False, message="Can't find this folder!", code=404) return get_json_result(data=False, message="Can't find this folder!", code=RetCode.NOT_FOUND)
for file_obj in file_objs: for file_obj in file_objs:
# Handle file path # Handle file path
@ -114,13 +114,13 @@ async def upload(tenant_id):
if file_len != len_id_list: if file_len != len_id_list:
e, file = FileService.get_by_id(file_id_list[len_id_list - 1]) e, file = FileService.get_by_id(file_id_list[len_id_list - 1])
if not e: if not e:
return get_json_result(data=False, message="Folder not found!", code=404) return get_json_result(data=False, message="Folder not found!", code=RetCode.NOT_FOUND)
last_folder = FileService.create_folder(file, file_id_list[len_id_list - 1], file_obj_names, last_folder = FileService.create_folder(file, file_id_list[len_id_list - 1], file_obj_names,
len_id_list) len_id_list)
else: else:
e, file = FileService.get_by_id(file_id_list[len_id_list - 2]) e, file = FileService.get_by_id(file_id_list[len_id_list - 2])
if not e: if not e:
return get_json_result(data=False, message="Folder not found!", code=404) return get_json_result(data=False, message="Folder not found!", code=RetCode.NOT_FOUND)
last_folder = FileService.create_folder(file, file_id_list[len_id_list - 2], file_obj_names, last_folder = FileService.create_folder(file, file_id_list[len_id_list - 2], file_obj_names,
len_id_list) len_id_list)
@ -202,7 +202,7 @@ async def create(tenant_id):
try: try:
if not FileService.is_parent_folder_exist(pf_id): if not FileService.is_parent_folder_exist(pf_id):
return get_json_result(data=False, message="Parent Folder Doesn't Exist!", code=400) return get_json_result(data=False, message="Parent Folder Doesn't Exist!", code=RetCode.BAD_REQUEST)
if FileService.query(name=req["name"], parent_id=pf_id): if FileService.query(name=req["name"], parent_id=pf_id):
return get_json_result(data=False, message="Duplicated folder name in the same folder.", code=409) return get_json_result(data=False, message="Duplicated folder name in the same folder.", code=409)
@ -306,13 +306,13 @@ def list_files(tenant_id):
try: try:
e, file = FileService.get_by_id(pf_id) e, file = FileService.get_by_id(pf_id)
if not e: if not e:
return get_json_result(message="Folder not found!", code=404) return get_json_result(message="Folder not found!", code=RetCode.NOT_FOUND)
files, total = FileService.get_by_pf_id(tenant_id, pf_id, page_number, items_per_page, orderby, desc, keywords) files, total = FileService.get_by_pf_id(tenant_id, pf_id, page_number, items_per_page, orderby, desc, keywords)
parent_folder = FileService.get_parent_folder(pf_id) parent_folder = FileService.get_parent_folder(pf_id)
if not parent_folder: if not parent_folder:
return get_json_result(message="File not found!", code=404) return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
return get_json_result(data={"total": total, "files": files, "parent_folder": parent_folder.to_json()}) return get_json_result(data={"total": total, "files": files, "parent_folder": parent_folder.to_json()})
except Exception as e: except Exception as e:
@ -392,7 +392,7 @@ def get_parent_folder():
try: try:
e, file = FileService.get_by_id(file_id) e, file = FileService.get_by_id(file_id)
if not e: if not e:
return get_json_result(message="Folder not found!", code=404) return get_json_result(message="Folder not found!", code=RetCode.NOT_FOUND)
parent_folder = FileService.get_parent_folder(file_id) parent_folder = FileService.get_parent_folder(file_id)
return get_json_result(data={"parent_folder": parent_folder.to_json()}) return get_json_result(data={"parent_folder": parent_folder.to_json()})
@ -439,7 +439,7 @@ def get_all_parent_folders(tenant_id):
try: try:
e, file = FileService.get_by_id(file_id) e, file = FileService.get_by_id(file_id)
if not e: if not e:
return get_json_result(message="Folder not found!", code=404) return get_json_result(message="Folder not found!", code=RetCode.NOT_FOUND)
parent_folders = FileService.get_all_parent_folders(file_id) parent_folders = FileService.get_all_parent_folders(file_id)
parent_folders_res = [folder.to_json() for folder in parent_folders] parent_folders_res = [folder.to_json() for folder in parent_folders]
@ -487,34 +487,34 @@ async def rm(tenant_id):
for file_id in file_ids: for file_id in file_ids:
e, file = FileService.get_by_id(file_id) e, file = FileService.get_by_id(file_id)
if not e: if not e:
return get_json_result(message="File or Folder not found!", code=404) return get_json_result(message="File or Folder not found!", code=RetCode.NOT_FOUND)
if not file.tenant_id: if not file.tenant_id:
return get_json_result(message="Tenant not found!", code=404) return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
if file.type == FileType.FOLDER.value: if file.type == FileType.FOLDER.value:
file_id_list = FileService.get_all_innermost_file_ids(file_id, []) file_id_list = FileService.get_all_innermost_file_ids(file_id, [])
for inner_file_id in file_id_list: for inner_file_id in file_id_list:
e, file = FileService.get_by_id(inner_file_id) e, file = FileService.get_by_id(inner_file_id)
if not e: if not e:
return get_json_result(message="File not found!", code=404) return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
settings.STORAGE_IMPL.rm(file.parent_id, file.location) settings.STORAGE_IMPL.rm(file.parent_id, file.location)
FileService.delete_folder_by_pf_id(tenant_id, file_id) FileService.delete_folder_by_pf_id(tenant_id, file_id)
else: else:
settings.STORAGE_IMPL.rm(file.parent_id, file.location) settings.STORAGE_IMPL.rm(file.parent_id, file.location)
if not FileService.delete(file): if not FileService.delete(file):
return get_json_result(message="Database error (File removal)!", code=500) return get_json_result(message="Database error (File removal)!", code=RetCode.SERVER_ERROR)
informs = File2DocumentService.get_by_file_id(file_id) informs = File2DocumentService.get_by_file_id(file_id)
for inform in informs: for inform in informs:
doc_id = inform.document_id doc_id = inform.document_id
e, doc = DocumentService.get_by_id(doc_id) e, doc = DocumentService.get_by_id(doc_id)
if not e: if not e:
return get_json_result(message="Document not found!", code=404) return get_json_result(message="Document not found!", code=RetCode.NOT_FOUND)
tenant_id = DocumentService.get_tenant_id(doc_id) tenant_id = DocumentService.get_tenant_id(doc_id)
if not tenant_id: if not tenant_id:
return get_json_result(message="Tenant not found!", code=404) return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
if not DocumentService.remove_document(doc, tenant_id): if not DocumentService.remove_document(doc, tenant_id):
return get_json_result(message="Database error (Document removal)!", code=500) return get_json_result(message="Database error (Document removal)!", code=RetCode.SERVER_ERROR)
File2DocumentService.delete_by_file_id(file_id) File2DocumentService.delete_by_file_id(file_id)
return get_json_result(data=True) return get_json_result(data=True)
@ -560,23 +560,23 @@ async def rename(tenant_id):
try: try:
e, file = FileService.get_by_id(req["file_id"]) e, file = FileService.get_by_id(req["file_id"])
if not e: if not e:
return get_json_result(message="File not found!", code=404) return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
if file.type != FileType.FOLDER.value and pathlib.Path(req["name"].lower()).suffix != pathlib.Path( if file.type != FileType.FOLDER.value and pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
file.name.lower()).suffix: file.name.lower()).suffix:
return get_json_result(data=False, message="The extension of file can't be changed", code=400) return get_json_result(data=False, message="The extension of file can't be changed", code=RetCode.BAD_REQUEST)
for existing_file in FileService.query(name=req["name"], pf_id=file.parent_id): for existing_file in FileService.query(name=req["name"], pf_id=file.parent_id):
if existing_file.name == req["name"]: if existing_file.name == req["name"]:
return get_json_result(data=False, message="Duplicated file name in the same folder.", code=409) return get_json_result(data=False, message="Duplicated file name in the same folder.", code=409)
if not FileService.update_by_id(req["file_id"], {"name": req["name"]}): if not FileService.update_by_id(req["file_id"], {"name": req["name"]}):
return get_json_result(message="Database error (File rename)!", code=500) return get_json_result(message="Database error (File rename)!", code=RetCode.SERVER_ERROR)
informs = File2DocumentService.get_by_file_id(req["file_id"]) informs = File2DocumentService.get_by_file_id(req["file_id"])
if informs: if informs:
if not DocumentService.update_by_id(informs[0].document_id, {"name": req["name"]}): if not DocumentService.update_by_id(informs[0].document_id, {"name": req["name"]}):
return get_json_result(message="Database error (Document rename)!", code=500) return get_json_result(message="Database error (Document rename)!", code=RetCode.SERVER_ERROR)
return get_json_result(data=True) return get_json_result(data=True)
except Exception as e: except Exception as e:
@ -606,13 +606,13 @@ async def get(tenant_id, file_id):
description: File stream description: File stream
schema: schema:
type: file type: file
404: RetCode.NOT_FOUND:
description: File not found description: File not found
""" """
try: try:
e, file = FileService.get_by_id(file_id) e, file = FileService.get_by_id(file_id)
if not e: if not e:
return get_json_result(message="Document not found!", code=404) return get_json_result(message="Document not found!", code=RetCode.NOT_FOUND)
blob = settings.STORAGE_IMPL.get(file.parent_id, file.location) blob = settings.STORAGE_IMPL.get(file.parent_id, file.location)
if not blob: if not blob:
@ -677,13 +677,13 @@ async def move(tenant_id):
for file_id in file_ids: for file_id in file_ids:
file = files_dict[file_id] file = files_dict[file_id]
if not file: if not file:
return get_json_result(message="File or Folder not found!", code=404) return get_json_result(message="File or Folder not found!", code=RetCode.NOT_FOUND)
if not file.tenant_id: if not file.tenant_id:
return get_json_result(message="Tenant not found!", code=404) return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
fe, _ = FileService.get_by_id(parent_id) fe, _ = FileService.get_by_id(parent_id)
if not fe: if not fe:
return get_json_result(message="Parent Folder not found!", code=404) return get_json_result(message="Parent Folder not found!", code=RetCode.NOT_FOUND)
FileService.move_file(file_ids, parent_id) FileService.move_file(file_ids, parent_id)
return get_json_result(data=True) return get_json_result(data=True)
@ -705,7 +705,7 @@ async def convert(tenant_id):
for file_id in file_ids: for file_id in file_ids:
file = files_set[file_id] file = files_set[file_id]
if not file: if not file:
return get_json_result(message="File not found!", code=404) return get_json_result(message="File not found!", code=RetCode.NOT_FOUND)
file_ids_list = [file_id] file_ids_list = [file_id]
if file.type == FileType.FOLDER.value: if file.type == FileType.FOLDER.value:
file_ids_list = FileService.get_all_innermost_file_ids(file_id, []) file_ids_list = FileService.get_all_innermost_file_ids(file_id, [])
@ -716,13 +716,13 @@ async def convert(tenant_id):
doc_id = inform.document_id doc_id = inform.document_id
e, doc = DocumentService.get_by_id(doc_id) e, doc = DocumentService.get_by_id(doc_id)
if not e: if not e:
return get_json_result(message="Document not found!", code=404) return get_json_result(message="Document not found!", code=RetCode.NOT_FOUND)
tenant_id = DocumentService.get_tenant_id(doc_id) tenant_id = DocumentService.get_tenant_id(doc_id)
if not tenant_id: if not tenant_id:
return get_json_result(message="Tenant not found!", code=404) return get_json_result(message="Tenant not found!", code=RetCode.NOT_FOUND)
if not DocumentService.remove_document(doc, tenant_id): if not DocumentService.remove_document(doc, tenant_id):
return get_json_result( return get_json_result(
message="Database error (Document removal)!", code=404) message="Database error (Document removal)!", code=RetCode.NOT_FOUND)
File2DocumentService.delete_by_file_id(id) File2DocumentService.delete_by_file_id(id)
# insert # insert
@ -730,11 +730,11 @@ async def convert(tenant_id):
e, kb = KnowledgebaseService.get_by_id(kb_id) e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e: if not e:
return get_json_result( return get_json_result(
message="Can't find this knowledgebase!", code=404) message="Can't find this knowledgebase!", code=RetCode.NOT_FOUND)
e, file = FileService.get_by_id(id) e, file = FileService.get_by_id(id)
if not e: if not e:
return get_json_result( return get_json_result(
message="Can't find this file!", code=404) message="Can't find this file!", code=RetCode.NOT_FOUND)
doc = DocumentService.insert({ doc = DocumentService.insert({
"id": get_uuid(), "id": get_uuid(),

View File

@ -749,7 +749,7 @@ class Knowledgebase(DataBaseModel):
parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True) parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True)
pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True) pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]}) parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
pagerank = IntegerField(default=0, index=False) pagerank = IntegerField(default=0, index=False)
graphrag_task_id = CharField(max_length=32, null=True, help_text="Graph RAG task ID", index=True) graphrag_task_id = CharField(max_length=32, null=True, help_text="Graph RAG task ID", index=True)
@ -774,7 +774,7 @@ class Document(DataBaseModel):
kb_id = CharField(max_length=256, null=False, index=True) kb_id = CharField(max_length=256, null=False, index=True)
parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True) parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True)
pipeline_id = CharField(max_length=32, null=True, help_text="pipeline ID", index=True) pipeline_id = CharField(max_length=32, null=True, help_text="pipeline ID", index=True)
parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]}) parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True) source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True)
type = CharField(max_length=32, null=False, help_text="file extension", index=True) type = CharField(max_length=32, null=False, help_text="file extension", index=True)
created_by = CharField(max_length=32, null=False, help_text="who created it", index=True) created_by = CharField(max_length=32, null=False, help_text="who created it", index=True)

View File

@ -923,7 +923,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
ParserType.AUDIO.value: audio, ParserType.AUDIO.value: audio,
ParserType.EMAIL.value: email ParserType.EMAIL.value: email
} }
parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text"} parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text", "table_context_size": 0, "image_context_size": 0}
exe = ThreadPoolExecutor(max_workers=12) exe = ThreadPoolExecutor(max_workers=12)
threads = [] threads = []
doc_nm = {} doc_nm = {}

View File

@ -313,6 +313,10 @@ def get_parser_config(chunk_method, parser_config):
chunk_method = "naive" chunk_method = "naive"
# Define default configurations for each chunking method # Define default configurations for each chunking method
base_defaults = {
"table_context_size": 0,
"image_context_size": 0,
}
key_mapping = { key_mapping = {
"naive": { "naive": {
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
@ -365,16 +369,19 @@ def get_parser_config(chunk_method, parser_config):
default_config = key_mapping[chunk_method] default_config = key_mapping[chunk_method]
# If no parser_config provided, return default # If no parser_config provided, return default merged with base defaults
if not parser_config: if not parser_config:
return default_config if default_config is None:
return deep_merge(base_defaults, {})
return deep_merge(base_defaults, default_config)
# If parser_config is provided, merge with defaults to ensure required fields exist # If parser_config is provided, merge with defaults to ensure required fields exist
if default_config is None: if default_config is None:
return parser_config return deep_merge(base_defaults, parser_config)
# Ensure raptor and graphrag fields have default values if not provided # Ensure raptor and graphrag fields have default values if not provided
merged_config = deep_merge(default_config, parser_config) merged_config = deep_merge(base_defaults, default_config)
merged_config = deep_merge(merged_config, parser_config)
return merged_config return merged_config
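
For readers following the hunk above, the intended precedence is: `base_defaults` < per-chunk-method defaults < caller-supplied `parser_config`. A sketch with a stand-in `deep_merge` (the project's real helper is imported elsewhere; this one only illustrates the override order, and the default values are illustrative):

```python
def deep_merge(base: dict, override: dict) -> dict:
    """Return a copy of base with override applied recursively (stand-in helper)."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

base_defaults = {"table_context_size": 0, "image_context_size": 0}
naive_defaults = {"layout_recognize": "DeepDOC", "chunk_token_num": 512}
user_config = {"chunk_token_num": 1024, "table_context_size": 128}

merged = deep_merge(deep_merge(base_defaults, naive_defaults), user_config)
# {'table_context_size': 128, 'image_context_size': 0,
#  'layout_recognize': 'DeepDOC', 'chunk_token_num': 1024}
```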

View File

@ -49,6 +49,7 @@ class RetCode(IntEnum, CustomEnum):
RUNNING = 106 RUNNING = 106
PERMISSION_ERROR = 108 PERMISSION_ERROR = 108
AUTHENTICATION_ERROR = 109 AUTHENTICATION_ERROR = 109
BAD_REQUEST = 400
UNAUTHORIZED = 401 UNAUTHORIZED = 401
SERVER_ERROR = 500 SERVER_ERROR = 500
FORBIDDEN = 403 FORBIDDEN = 403

View File

@ -138,7 +138,6 @@ class RAGFlowHtmlParser:
"metadata": {"table_id": table_id, "index": table_list.index(t)}}) "metadata": {"table_id": table_id, "index": table_list.index(t)}})
return table_info_list return table_info_list
else: else:
block_id = None
if str.lower(element.name) in BLOCK_TAGS: if str.lower(element.name) in BLOCK_TAGS:
block_id = str(uuid.uuid1()) block_id = str(uuid.uuid1())
for child in element.children: for child in element.children:
@ -172,7 +171,7 @@ class RAGFlowHtmlParser:
if tag_name == "table": if tag_name == "table":
table_info_list.append(item) table_info_list.append(item)
else: else:
current_content += (" " if current_content else "" + content) current_content += (" " if current_content else "") + content
if current_content: if current_content:
block_content.append(current_content) block_content.append(current_content)
return block_content, table_info_list return block_content, table_info_list

View File

@ -39,8 +39,10 @@ If you have not installed Docker on your local machine (Windows, Mac, or Linux),
This section provides instructions on setting up the RAGFlow server on Linux. If you are on a different operating system, no worries. Most steps are alike. This section provides instructions on setting up the RAGFlow server on Linux. If you are on a different operating system, no worries. Most steps are alike.
1. Ensure `vm.max_map_count` &ge; 262144.
<details> <details>
<summary>1. Ensure <code>vm.max_map_count</code> &ge; 262144:</summary> <summary>Expand to show details:</summary>
`vm.max_map_count`. This value sets the maximum number of memory map areas a process may have. Its default value is 65530. While most applications require fewer than a thousand maps, reducing this value can result in abnormal behaviors, and the system will throw out-of-memory errors when a process reaches the limitation. `vm.max_map_count`. This value sets the maximum number of memory map areas a process may have. Its default value is 65530. While most applications require fewer than a thousand maps, reducing this value can result in abnormal behaviors, and the system will throw out-of-memory errors when a process reaches the limitation.

View File

@ -23,7 +23,7 @@ from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS from rag.app.naive import by_plaintext, PARSERS
from rag.nlp import bullets_category, is_english,remove_contents_table, \ from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks tokenize_chunks, attach_media_context
from rag.nlp import rag_tokenizer from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, HtmlParser from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
@ -175,6 +175,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = tokenize_table(tbls, doc, eng) res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res return res

View File

@ -20,7 +20,7 @@ import re
from common.constants import ParserType from common.constants import ParserType
from io import BytesIO from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
from common.token_utils import num_tokens_from_string from common.token_utils import num_tokens_from_string
from deepdoc.parser import PdfParser, DocxParser from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
@ -310,6 +310,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
res = tokenize_table(tbls, doc, eng) res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res return res
elif re.search(r"\.docx?$", filename, re.IGNORECASE): elif re.search(r"\.docx?$", filename, re.IGNORECASE):
@ -325,6 +329,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
d["doc_type_kwd"] = "image" d["doc_type_kwd"] = "image"
tokenize(d, text, eng) tokenize(d, text, eng)
res.append(d) res.append(d)
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res return res
else: else:
raise NotImplementedError("file type not supported yet(pdf and docx supported)") raise NotImplementedError("file type not supported yet(pdf and docx supported)")

View File

@ -37,7 +37,7 @@ from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
callback = callback callback = callback
@ -616,6 +616,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
parser_config = kwargs.get( parser_config = kwargs.get(
"parser_config", { "parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True}) "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
final_sections = False final_sections = False
doc = { doc = {
"docnm_kwd": filename, "docnm_kwd": filename,
@ -686,6 +688,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
logging.info("naive_merge({}): {}".format(filename, timer() - st)) logging.info("naive_merge({}): {}".format(filename, timer() - st))
res.extend(embed_res) res.extend(embed_res)
res.extend(url_res) res.extend(url_res)
if table_context_size or image_context_size:
attach_media_context(res, table_context_size, image_context_size)
return res return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE): elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@ -947,6 +951,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.extend(embed_res) res.extend(embed_res)
if url_res: if url_res:
res.extend(url_res) res.extend(url_res)
if table_context_size or image_context_size:
attach_media_context(res, table_context_size, image_context_size)
return res return res

View File

@ -20,7 +20,7 @@ import re
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
from common.constants import ParserType from common.constants import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, attach_media_context
from deepdoc.parser import PdfParser from deepdoc.parser import PdfParser
import numpy as np import numpy as np
from rag.app.naive import by_plaintext, PARSERS from rag.app.naive import by_plaintext, PARSERS
@ -234,6 +234,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
chunks.append(txt) chunks.append(txt)
last_sid = sec_id last_sid = sec_id
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res return res

View File

@ -20,11 +20,11 @@ import re
import numpy as np import numpy as np
from PIL import Image from PIL import Image
from common.constants import LLMType
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from deepdoc.vision import OCR from common.constants import LLMType
from rag.nlp import rag_tokenizer, tokenize
from common.string_utils import clean_markdown_block from common.string_utils import clean_markdown_block
from deepdoc.vision import OCR
from rag.nlp import attach_media_context, rag_tokenizer, tokenize
ocr = OCR() ocr = OCR()
@ -39,9 +39,16 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
} }
eng = lang.lower() == "english" eng = lang.lower() == "english"
parser_config = kwargs.get("parser_config", {}) or {}
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS): if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
try: try:
doc.update({"doc_type_kwd": "video"}) doc.update(
{
"doc_type_kwd": "video",
}
)
cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang) cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang)
ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename) ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
callback(0.8, "CV LLM respond: %s ..." % ans[:32]) callback(0.8, "CV LLM respond: %s ..." % ans[:32])
@ -64,7 +71,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
if (eng and len(txt.split()) > 32) or len(txt) > 32: if (eng and len(txt.split()) > 32) or len(txt) > 32:
tokenize(doc, txt, eng) tokenize(doc, txt, eng)
callback(0.8, "OCR results is too long to use CV LLM.") callback(0.8, "OCR results is too long to use CV LLM.")
return [doc] return attach_media_context([doc], 0, image_ctx)
try: try:
callback(0.4, "Use CV LLM to describe the picture.") callback(0.4, "Use CV LLM to describe the picture.")
@ -76,7 +83,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
callback(0.8, "CV LLM respond: %s ..." % ans[:32]) callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans txt += "\n" + ans
tokenize(doc, txt, eng) tokenize(doc, txt, eng)
return [doc] return attach_media_context([doc], 0, image_ctx)
except Exception as e: except Exception as e:
callback(prog=-1, msg=str(e)) callback(prog=-1, msg=str(e))

View File

@ -19,16 +19,16 @@ import random
import re import re
from functools import partial from functools import partial
import trio
import numpy as np import numpy as np
import trio
from PIL import Image from PIL import Image
from common.constants import LLMType
from api.db.services.file2document_service import File2DocumentService from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService from api.db.services.file_service import FileService
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from common import settings
from common.constants import LLMType
from common.misc_utils import get_uuid from common.misc_utils import get_uuid
from rag.utils.base64_image import image2id
from deepdoc.parser import ExcelParser from deepdoc.parser import ExcelParser
from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
@ -37,7 +37,8 @@ from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream from rag.flow.parser.schema import ParserFromUpstream
from rag.llm.cv_model import Base as VLM from rag.llm.cv_model import Base as VLM
from common import settings from rag.nlp import attach_media_context
from rag.utils.base64_image import image2id
class ParserParam(ProcessParamBase): class ParserParam(ProcessParamBase):
@ -61,15 +62,18 @@ class ParserParam(ProcessParamBase):
"json", "json",
], ],
"image": [ "image": [
"text" "text",
],
"email": [
"text",
"json",
], ],
"email": ["text", "json"],
"text&markdown": [ "text&markdown": [
"text", "text",
"json" "json",
], ],
"audio": [ "audio": [
"json" "json",
], ],
"video": [], "video": [],
} }
@ -82,6 +86,8 @@ class ParserParam(ProcessParamBase):
"pdf", "pdf",
], ],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"spreadsheet": { "spreadsheet": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser "parse_method": "deepdoc", # deepdoc/tcadp_parser
@ -91,6 +97,8 @@ class ParserParam(ProcessParamBase):
"xlsx", "xlsx",
"csv", "csv",
], ],
"table_context_size": 0,
"image_context_size": 0,
}, },
"word": { "word": {
"suffix": [ "suffix": [
@ -98,18 +106,24 @@ class ParserParam(ProcessParamBase):
"docx", "docx",
], ],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"text&markdown": { "text&markdown": {
"suffix": ["md", "markdown", "mdx", "txt"], "suffix": ["md", "markdown", "mdx", "txt"],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"slides": { "slides": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser "parse_method": "deepdoc", # deepdoc/tcadp_parser
"suffix": [ "suffix": [
"pptx", "pptx",
"ppt" "ppt",
], ],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"image": { "image": {
"parse_method": "ocr", "parse_method": "ocr",
@ -121,7 +135,8 @@ class ParserParam(ProcessParamBase):
}, },
"email": { "email": {
"suffix": [ "suffix": [
"eml", "msg" "eml",
"msg",
], ],
"fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"], "fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
"output_format": "json", "output_format": "json",
@ -142,7 +157,7 @@ class ParserParam(ProcessParamBase):
"realaudio", "realaudio",
"vqf", "vqf",
"oggvorbis", "oggvorbis",
"ape" "ape",
], ],
"output_format": "text", "output_format": "text",
}, },
@ -150,7 +165,7 @@ class ParserParam(ProcessParamBase):
"suffix": [ "suffix": [
"mp4", "mp4",
"avi", "avi",
"mkv" "mkv",
], ],
"output_format": "text", "output_format": "text",
}, },
@ -253,7 +268,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1") markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser( tcadp_parser = TCADPParser(
table_result_type=table_result_type, table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type markdown_image_response_type=markdown_image_response_type,
) )
sections, _ = tcadp_parser.parse_pdf( sections, _ = tcadp_parser.parse_pdf(
filepath=name, filepath=name,
@ -261,7 +276,7 @@ class Parser(ProcessBase):
callback=self.callback, callback=self.callback,
file_type="PDF", file_type="PDF",
file_start_page=1, file_start_page=1,
file_end_page=1000 file_end_page=1000,
) )
bboxes = [] bboxes = []
for section, position_tag in sections: for section, position_tag in sections:
@ -269,17 +284,20 @@ class Parser(ProcessBase):
# Extract position information from TCADP's position tag # Extract position information from TCADP's position tag
# Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}## # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
import re import re
match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag) match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
if match: if match:
pn, x0, x1, top, bott = match.groups() pn, x0, x1, top, bott = match.groups()
bboxes.append({ bboxes.append(
"page_number": int(pn.split('-')[0]), # Take the first page number {
"page_number": int(pn.split("-")[0]), # Take the first page number
"x0": float(x0), "x0": float(x0),
"x1": float(x1), "x1": float(x1),
"top": float(top), "top": float(top),
"bottom": float(bott), "bottom": float(bott),
"text": section "text": section,
}) }
)
else: else:
# If no position info, add as text without position # If no position info, add as text without position
bboxes.append({"text": section}) bboxes.append({"text": section})
@ -291,7 +309,30 @@ class Parser(ProcessBase):
bboxes = [] bboxes = []
for t, poss in lines: for t, poss in lines:
for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss): for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t}) bboxes.append(
{
"page_number": int(pn[0]),
"x0": float(x0),
"x1": float(x1),
"top": float(top),
"bottom": float(bott),
"text": t,
}
)
for b in bboxes:
text_val = b.get("text", "")
has_text = isinstance(text_val, str) and text_val.strip()
layout = b.get("layout_type")
if layout == "figure" or (b.get("image") and not has_text):
b["doc_type_kwd"] = "image"
elif layout == "table":
b["doc_type_kwd"] = "table"
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
bboxes = attach_media_context(bboxes, table_ctx, image_ctx)
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
self.set_output("json", bboxes) self.set_output("json", bboxes)
@ -319,7 +360,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1") markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser( tcadp_parser = TCADPParser(
table_result_type=table_result_type, table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type markdown_image_response_type=markdown_image_response_type,
) )
if not tcadp_parser.check_installation(): if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.") raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@ -337,7 +378,7 @@ class Parser(ProcessBase):
callback=self.callback, callback=self.callback,
file_type=file_type, file_type=file_type,
file_start_page=1, file_start_page=1,
file_end_page=1000 file_end_page=1000,
) )
# Process TCADP parser output based on configured output_format # Process TCADP parser output based on configured output_format
@ -365,7 +406,12 @@ class Parser(ProcessBase):
# Add tables as text # Add tables as text
for table in tables: for table in tables:
if table: if table:
result.append({"text": table}) result.append({"text": table, "doc_type_kwd": "table"})
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result) self.set_output("json", result)
@ -400,7 +446,13 @@ class Parser(ProcessBase):
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
sections, tbls = docx_parser(name, binary=blob) sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section] sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls]) sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
sections = attach_media_context(sections, table_ctx, image_ctx)
self.set_output("json", sections) self.set_output("json", sections)
elif conf.get("output_format") == "markdown": elif conf.get("output_format") == "markdown":
markdown_text = docx_parser.to_markdown(name, binary=blob) markdown_text = docx_parser.to_markdown(name, binary=blob)
@ -420,7 +472,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1") markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser( tcadp_parser = TCADPParser(
table_result_type=table_result_type, table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type markdown_image_response_type=markdown_image_response_type,
) )
if not tcadp_parser.check_installation(): if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.") raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@ -439,7 +491,7 @@ class Parser(ProcessBase):
callback=self.callback, callback=self.callback,
file_type=file_type, file_type=file_type,
file_start_page=1, file_start_page=1,
file_end_page=1000 file_end_page=1000,
) )
# Process TCADP parser output - PPT only supports json format # Process TCADP parser output - PPT only supports json format
@ -454,7 +506,12 @@ class Parser(ProcessBase):
# Add tables as text # Add tables as text
for table in tables: for table in tables:
if table: if table:
result.append({"text": table}) result.append({"text": table, "doc_type_kwd": "table"})
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result) self.set_output("json", result)
else: else:
@ -469,6 +526,10 @@ class Parser(ProcessBase):
# json # json
assert conf.get("output_format") == "json", "have to be json for ppt" assert conf.get("output_format") == "json", "have to be json for ppt"
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
sections = attach_media_context(sections, table_ctx, image_ctx)
self.set_output("json", sections) self.set_output("json", sections)
def _markdown(self, name, blob): def _markdown(self, name, blob):
@ -508,11 +569,15 @@ class Parser(ProcessBase):
json_results.append(json_result) json_results.append(json_result)
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
json_results = attach_media_context(json_results, table_ctx, image_ctx)
self.set_output("json", json_results) self.set_output("json", json_results)
else: else:
self.set_output("text", "\n".join([section_text for section_text, _ in sections])) self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
def _image(self, name, blob): def _image(self, name, blob):
from deepdoc.vision import OCR from deepdoc.vision import OCR
@ -588,7 +653,7 @@ class Parser(ProcessBase):
from email.parser import BytesParser from email.parser import BytesParser
msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob)) msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
email_content['metadata'] = {} email_content["metadata"] = {}
# handle header info # handle header info
for header, value in msg.items(): for header, value in msg.items():
# get fields like from, to, cc, bcc, date, subject # get fields like from, to, cc, bcc, date, subject
@ -600,6 +665,7 @@ class Parser(ProcessBase):
# get body # get body
if "body" in target_fields: if "body" in target_fields:
body_text, body_html = [], [] body_text, body_html = [], []
def _add_content(m, content_type): def _add_content(m, content_type):
def _decode_payload(payload, charset, target_list): def _decode_payload(payload, charset, target_list):
try: try:
@ -641,14 +707,17 @@ class Parser(ProcessBase):
if dispositions[0].lower() == "attachment": if dispositions[0].lower() == "attachment":
filename = part.get_filename() filename = part.get_filename()
payload = part.get_payload(decode=True).decode(part.get_content_charset()) payload = part.get_payload(decode=True).decode(part.get_content_charset())
attachments.append({ attachments.append(
{
"filename": filename, "filename": filename,
"payload": payload, "payload": payload,
}) }
)
email_content["attachments"] = attachments email_content["attachments"] = attachments
else: else:
# handle msg file # handle msg file
import extract_msg import extract_msg
print("handle a msg file.") print("handle a msg file.")
msg = extract_msg.Message(blob) msg = extract_msg.Message(blob)
# handle header info # handle header info
@ -662,9 +731,9 @@ class Parser(ProcessBase):
} }
email_content.update({k: v for k, v in basic_content.items() if k in target_fields}) email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
# get metadata # get metadata
email_content['metadata'] = { email_content["metadata"] = {
'message_id': msg.messageId, "message_id": msg.messageId,
'in_reply_to': msg.inReplyTo, "in_reply_to": msg.inReplyTo,
} }
# get body # get body
if "body" in target_fields: if "body" in target_fields:
@ -675,29 +744,31 @@ class Parser(ProcessBase):
if "attachments" in target_fields: if "attachments" in target_fields:
attachments = [] attachments = []
for t in msg.attachments: for t in msg.attachments:
attachments.append({ attachments.append(
{
"filename": t.name, "filename": t.name,
"payload": t.data.decode("utf-8") "payload": t.data.decode("utf-8"),
}) }
)
email_content["attachments"] = attachments email_content["attachments"] = attachments
if conf["output_format"] == "json": if conf["output_format"] == "json":
self.set_output("json", [email_content]) self.set_output("json", [email_content])
else: else:
content_txt = '' content_txt = ""
for k, v in email_content.items(): for k, v in email_content.items():
if isinstance(v, str): if isinstance(v, str):
# basic info # basic info
content_txt += f'{k}:{v}' + "\n" content_txt += f"{k}:{v}" + "\n"
elif isinstance(v, dict): elif isinstance(v, dict):
# metadata # metadata
content_txt += f'{k}:{json.dumps(v)}' + "\n" content_txt += f"{k}:{json.dumps(v)}" + "\n"
elif isinstance(v, list): elif isinstance(v, list):
# attachments or others # attachments or others
for fb in v: for fb in v:
if isinstance(fb, dict): if isinstance(fb, dict):
# attachments # attachments
content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n" content_txt += f"{fb['filename']}:{fb['payload']}" + "\n"
else: else:
# str, usually plain text # str, usually plain text
content_txt += fb content_txt += fb

View File

@ -318,6 +318,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d = copy.deepcopy(doc) d = copy.deepcopy(doc)
tokenize(d, rows, eng) tokenize(d, rows, eng)
d["content_with_weight"] = rows d["content_with_weight"] = rows
d["doc_type_kwd"] = "table"
if img: if img:
d["image"] = img d["image"] = img
d["doc_type_kwd"] = "image" d["doc_type_kwd"] = "image"
@ -330,6 +331,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d = copy.deepcopy(doc) d = copy.deepcopy(doc)
r = de.join(rows[i:i + batch_size]) r = de.join(rows[i:i + batch_size])
tokenize(d, r, eng) tokenize(d, r, eng)
d["doc_type_kwd"] = "table"
if img: if img:
d["image"] = img d["image"] = img
d["doc_type_kwd"] = "image" d["doc_type_kwd"] = "image"
@ -338,6 +340,194 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
return res return res
def attach_media_context(chunks, table_context_size=0, image_context_size=0):
"""
Attach surrounding text chunk content to media chunks (table/image).
Best-effort ordering: if positional info exists on any chunk, use it to
order chunks before collecting context; otherwise keep original order.
"""
if not chunks or (table_context_size <= 0 and image_context_size <= 0):
return chunks
def is_image_chunk(ck):
if ck.get("doc_type_kwd") == "image":
return True
text_val = ck.get("content_with_weight") if isinstance(ck.get("content_with_weight"), str) else ck.get("text")
has_text = isinstance(text_val, str) and text_val.strip()
return bool(ck.get("image")) and not has_text
def is_table_chunk(ck):
return ck.get("doc_type_kwd") == "table"
def is_text_chunk(ck):
return not is_image_chunk(ck) and not is_table_chunk(ck)
def get_text(ck):
if isinstance(ck.get("content_with_weight"), str):
return ck["content_with_weight"]
if isinstance(ck.get("text"), str):
return ck["text"]
return ""
def split_sentences(text):
pattern = r"([.。!?!?;:\n])"
parts = re.split(pattern, text)
sentences = []
buf = ""
for p in parts:
if not p:
continue
if re.fullmatch(pattern, p):
buf += p
sentences.append(buf)
buf = ""
else:
buf += p
if buf:
sentences.append(buf)
return sentences
def trim_to_tokens(text, token_budget, from_tail=False):
if token_budget <= 0 or not text:
return ""
sentences = split_sentences(text)
if not sentences:
return ""
collected = []
remaining = token_budget
seq = reversed(sentences) if from_tail else sentences
for s in seq:
tks = num_tokens_from_string(s)
if tks <= 0:
continue
if tks > remaining:
collected.append(s)
break
collected.append(s)
remaining -= tks
if from_tail:
collected = list(reversed(collected))
return "".join(collected)
def extract_position(ck):
pn = None
top = None
left = None
try:
if ck.get("page_num_int"):
pn = ck["page_num_int"][0]
elif ck.get("page_number") is not None:
pn = ck.get("page_number")
if ck.get("top_int"):
top = ck["top_int"][0]
elif ck.get("top") is not None:
top = ck.get("top")
if ck.get("position_int"):
left = ck["position_int"][0][1]
elif ck.get("x0") is not None:
left = ck.get("x0")
except Exception:
pn = top = left = None
return pn, top, left
indexed = list(enumerate(chunks))
positioned_indices = []
unpositioned_indices = []
for idx, ck in indexed:
pn, top, left = extract_position(ck)
if pn is not None and top is not None:
positioned_indices.append((idx, pn, top, left if left is not None else 0))
else:
unpositioned_indices.append(idx)
if positioned_indices:
positioned_indices.sort(key=lambda x: (int(x[1]), int(x[2]), int(x[3]), x[0]))
ordered_indices = [i for i, _, _, _ in positioned_indices] + unpositioned_indices
else:
ordered_indices = [idx for idx, _ in indexed]
total = len(ordered_indices)
for sorted_pos, idx in enumerate(ordered_indices):
ck = chunks[idx]
token_budget = image_context_size if is_image_chunk(ck) else table_context_size if is_table_chunk(ck) else 0
if token_budget <= 0:
continue
prev_ctx = []
remaining_prev = token_budget
for prev_idx in range(sorted_pos - 1, -1, -1):
if remaining_prev <= 0:
break
neighbor_idx = ordered_indices[prev_idx]
if not is_text_chunk(chunks[neighbor_idx]):
break
txt = get_text(chunks[neighbor_idx])
if not txt:
continue
tks = num_tokens_from_string(txt)
if tks <= 0:
continue
if tks > remaining_prev:
txt = trim_to_tokens(txt, remaining_prev, from_tail=True)
tks = num_tokens_from_string(txt)
prev_ctx.append(txt)
remaining_prev -= tks
prev_ctx.reverse()
next_ctx = []
remaining_next = token_budget
for next_idx in range(sorted_pos + 1, total):
if remaining_next <= 0:
break
neighbor_idx = ordered_indices[next_idx]
if not is_text_chunk(chunks[neighbor_idx]):
break
txt = get_text(chunks[neighbor_idx])
if not txt:
continue
tks = num_tokens_from_string(txt)
if tks <= 0:
continue
if tks > remaining_next:
txt = trim_to_tokens(txt, remaining_next, from_tail=False)
tks = num_tokens_from_string(txt)
next_ctx.append(txt)
remaining_next -= tks
if not prev_ctx and not next_ctx:
continue
self_text = get_text(ck)
pieces = [*prev_ctx]
if self_text:
pieces.append(self_text)
pieces.extend(next_ctx)
combined = "\n".join(pieces)
original = ck.get("content_with_weight")
if "content_with_weight" in ck:
ck["content_with_weight"] = combined
elif "text" in ck:
original = ck.get("text")
ck["text"] = combined
if combined != original:
if "content_ltks" in ck:
ck["content_ltks"] = rag_tokenizer.tokenize(combined)
if "content_sm_ltks" in ck:
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck.get("content_ltks", rag_tokenizer.tokenize(combined)))
if positioned_indices:
chunks[:] = [chunks[i] for i in ordered_indices]
return chunks
def add_positions(d, poss): def add_positions(d, poss):
if not poss: if not poss:
return return
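
A small usage sketch of the new helper, assuming it is imported from `rag.nlp` as in the hunk above; the chunk dicts and token budgets are illustrative:

```python
from rag.nlp import attach_media_context  # helper added in this PR

chunks = [
    {"text": "Quarterly revenue grew 12% year over year."},
    {"text": "<table>... revenue by region ...</table>", "doc_type_kwd": "table"},
    {"text": "Growth was driven mainly by the APAC region."},
]

# Give table chunks up to 64 tokens of surrounding text; leave image chunks untouched.
attach_media_context(chunks, table_context_size=64, image_context_size=0)

# The table chunk's "text" now holds the preceding and following sentences
# joined with newlines; plain text chunks are left unchanged.
print(chunks[1]["text"])
```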

View File

@ -69,7 +69,7 @@ def convert_matching_field(field_weightstr: str) -> str:
if field == "docnm_kwd" or field == "title_tks": if field == "docnm_kwd" or field == "title_tks":
field = "docnm@ft_docnm_rag_coarse" field = "docnm@ft_docnm_rag_coarse"
elif field == "title_sm_tks": elif field == "title_sm_tks":
field = "docnm@ft_title_rag_fine" field = "docnm@ft_docnm_rag_fine"
elif field == "important_kwd": elif field == "important_kwd":
field = "important_keywords@ft_important_keywords_rag_coarse" field = "important_keywords@ft_important_keywords_rag_coarse"
elif field == "important_tks": elif field == "important_tks":

View File

@ -42,6 +42,8 @@ DEFAULT_PARSER_CONFIG = {
"auto_keywords": 0, "auto_keywords": 0,
"auto_questions": 0, "auto_questions": 0,
"html4excel": False, "html4excel": False,
"image_context_size": 0,
"table_context_size": 0,
"topn_tags": 3, "topn_tags": 3,
"raptor": { "raptor": {
"use_raptor": True, "use_raptor": True,