diff --git a/api/db/db_models.py b/api/db/db_models.py
index bd3feea64..e60afbef5 100644
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@@ -749,7 +749,7 @@ class Knowledgebase(DataBaseModel):
parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True)
pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
- parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
+ parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
pagerank = IntegerField(default=0, index=False)
graphrag_task_id = CharField(max_length=32, null=True, help_text="Graph RAG task ID", index=True)
@@ -774,7 +774,7 @@ class Document(DataBaseModel):
kb_id = CharField(max_length=256, null=False, index=True)
parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True)
pipeline_id = CharField(max_length=32, null=True, help_text="pipeline ID", index=True)
- parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
+ parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True)
type = CharField(max_length=32, null=False, help_text="file extension", index=True)
created_by = CharField(max_length=32, null=False, help_text="who created it", index=True)
diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py
index 514b3fd87..7b7ef53ec 100644
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -923,7 +923,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
ParserType.AUDIO.value: audio,
ParserType.EMAIL.value: email
}
- parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text"}
+ parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text", "table_context_size": 0, "image_context_size": 0}
exe = ThreadPoolExecutor(max_workers=12)
threads = []
doc_nm = {}
diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py
index cbd2423f2..314211694 100644
--- a/api/utils/api_utils.py
+++ b/api/utils/api_utils.py
@@ -313,6 +313,10 @@ def get_parser_config(chunk_method, parser_config):
chunk_method = "naive"
# Define default configurations for each chunking method
+ base_defaults = {
+ "table_context_size": 0,
+ "image_context_size": 0,
+ }
key_mapping = {
"naive": {
"layout_recognize": "DeepDOC",
@@ -365,16 +369,19 @@ def get_parser_config(chunk_method, parser_config):
default_config = key_mapping[chunk_method]
- # If no parser_config provided, return default
+ # If no parser_config provided, return default merged with base defaults
if not parser_config:
- return default_config
+ if default_config is None:
+ return deep_merge(base_defaults, {})
+ return deep_merge(base_defaults, default_config)
# If parser_config is provided, merge with defaults to ensure required fields exist
if default_config is None:
- return parser_config
+ return deep_merge(base_defaults, parser_config)
# Ensure raptor and graphrag fields have default values if not provided
- merged_config = deep_merge(default_config, parser_config)
+ merged_config = deep_merge(base_defaults, default_config)
+ merged_config = deep_merge(merged_config, parser_config)
return merged_config
diff --git a/rag/app/book.py b/rag/app/book.py
index 5bdaec72d..ca91be149 100644
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -23,7 +23,7 @@ from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
- tokenize_chunks
+ tokenize_chunks, attach_media_context
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
@@ -175,6 +175,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+ table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+ image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+ if table_ctx or image_ctx:
+ attach_media_context(res, table_ctx, image_ctx)
return res
diff --git a/rag/app/manual.py b/rag/app/manual.py
index b3a4ae38d..1eb86a043 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -20,7 +20,7 @@ import re
from common.constants import ParserType
from io import BytesIO
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
from common.token_utils import num_tokens_from_string
from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
@@ -155,7 +155,7 @@ class Docx(DocxParser):
sum_question = '\n'.join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
-
+
tbls = []
for tb in self.doc.tables:
html= "
"
@@ -231,14 +231,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if isinstance(poss, str):
poss = pdf_parser.extract_positions(poss)
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
- pn = first[0]
+ pn = first[0]
if isinstance(pn, list):
pn = pn[0] # [pn] -> pn
poss[0] = (pn, *first[1:])
return (txt, layoutno, poss)
-
+
sections = [_normalize_section(sec) for sec in sections]
@@ -247,7 +247,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
-
+
callback(0.8, "Finish parsing.")
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
@@ -310,6 +310,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+ table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+ image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+ if table_ctx or image_ctx:
+ attach_media_context(res, table_ctx, image_ctx)
return res
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
@@ -325,10 +329,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
d["doc_type_kwd"] = "image"
tokenize(d, text, eng)
res.append(d)
+ table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+ image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+ if table_ctx or image_ctx:
+ attach_media_context(res, table_ctx, image_ctx)
return res
else:
raise NotImplementedError("file type not supported yet(pdf and docx supported)")
-
+
if __name__ == "__main__":
import sys
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 0496c7507..7872ebc22 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -37,7 +37,7 @@ from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
-from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
+from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
callback = callback
@@ -616,6 +616,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+ table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
+ image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
final_sections = False
doc = {
"docnm_kwd": filename,
@@ -686,6 +688,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
logging.info("naive_merge({}): {}".format(filename, timer() - st))
res.extend(embed_res)
res.extend(url_res)
+ if table_context_size or image_context_size:
+ attach_media_context(res, table_context_size, image_context_size)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@@ -947,6 +951,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.extend(embed_res)
if url_res:
res.extend(url_res)
+ if table_context_size or image_context_size:
+ attach_media_context(res, table_context_size, image_context_size)
return res
diff --git a/rag/app/paper.py b/rag/app/paper.py
index 222be0762..d84d5645d 100644
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@@ -20,7 +20,7 @@ import re
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
from common.constants import ParserType
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, attach_media_context
from deepdoc.parser import PdfParser
import numpy as np
from rag.app.naive import by_plaintext, PARSERS
@@ -150,7 +150,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
if re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
-
+
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -234,6 +234,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
chunks.append(txt)
last_sid = sec_id
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+ table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+ image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+ if table_ctx or image_ctx:
+ attach_media_context(res, table_ctx, image_ctx)
return res
diff --git a/rag/app/picture.py b/rag/app/picture.py
index f260104f7..8e7aa4bce 100644
--- a/rag/app/picture.py
+++ b/rag/app/picture.py
@@ -20,11 +20,11 @@ import re
import numpy as np
from PIL import Image
-from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
-from deepdoc.vision import OCR
-from rag.nlp import rag_tokenizer, tokenize
+from common.constants import LLMType
from common.string_utils import clean_markdown_block
+from deepdoc.vision import OCR
+from rag.nlp import attach_media_context, rag_tokenizer, tokenize
ocr = OCR()
@@ -39,9 +39,16 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
}
eng = lang.lower() == "english"
+ parser_config = kwargs.get("parser_config", {}) or {}
+ image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+
if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
try:
- doc.update({"doc_type_kwd": "video"})
+ doc.update(
+ {
+ "doc_type_kwd": "video",
+ }
+ )
cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang)
ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
@@ -64,7 +71,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
if (eng and len(txt.split()) > 32) or len(txt) > 32:
tokenize(doc, txt, eng)
callback(0.8, "OCR results is too long to use CV LLM.")
- return [doc]
+ return attach_media_context([doc], 0, image_ctx)
try:
callback(0.4, "Use CV LLM to describe the picture.")
@@ -76,7 +83,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
- return [doc]
+ return attach_media_context([doc], 0, image_ctx)
except Exception as e:
callback(prog=-1, msg=str(e))
@@ -103,7 +110,7 @@ def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
img_binary.seek(0)
img_binary.truncate()
img.save(img_binary, format="PNG")
-
+
img_binary.seek(0)
ans = clean_markdown_block(vision_model.describe_with_prompt(img_binary.read(), prompt))
txt += "\n" + ans
diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py
index 1a111cc3a..7747448ad 100644
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -19,16 +19,16 @@ import random
import re
from functools import partial
-import trio
import numpy as np
+import trio
from PIL import Image
-from common.constants import LLMType
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.llm_service import LLMBundle
+from common import settings
+from common.constants import LLMType
from common.misc_utils import get_uuid
-from rag.utils.base64_image import image2id
from deepdoc.parser import ExcelParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
@@ -37,7 +37,8 @@ from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream
from rag.llm.cv_model import Base as VLM
-from common import settings
+from rag.nlp import attach_media_context
+from rag.utils.base64_image import image2id
class ParserParam(ProcessParamBase):
@@ -61,15 +62,18 @@ class ParserParam(ProcessParamBase):
"json",
],
"image": [
- "text"
+ "text",
+ ],
+ "email": [
+ "text",
+ "json",
],
- "email": ["text", "json"],
"text&markdown": [
"text",
- "json"
+ "json",
],
"audio": [
- "json"
+ "json",
],
"video": [],
}
@@ -82,6 +86,8 @@ class ParserParam(ProcessParamBase):
"pdf",
],
"output_format": "json",
+ "table_context_size": 0,
+ "image_context_size": 0,
},
"spreadsheet": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser
@@ -91,6 +97,8 @@ class ParserParam(ProcessParamBase):
"xlsx",
"csv",
],
+ "table_context_size": 0,
+ "image_context_size": 0,
},
"word": {
"suffix": [
@@ -98,18 +106,24 @@ class ParserParam(ProcessParamBase):
"docx",
],
"output_format": "json",
+ "table_context_size": 0,
+ "image_context_size": 0,
},
"text&markdown": {
"suffix": ["md", "markdown", "mdx", "txt"],
"output_format": "json",
+ "table_context_size": 0,
+ "image_context_size": 0,
},
"slides": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser
"suffix": [
"pptx",
- "ppt"
+ "ppt",
],
"output_format": "json",
+ "table_context_size": 0,
+ "image_context_size": 0,
},
"image": {
"parse_method": "ocr",
@@ -121,13 +135,14 @@ class ParserParam(ProcessParamBase):
},
"email": {
"suffix": [
- "eml", "msg"
+ "eml",
+ "msg",
],
"fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
"output_format": "json",
},
"audio": {
- "suffix":[
+ "suffix": [
"da",
"wave",
"wav",
@@ -142,15 +157,15 @@ class ParserParam(ProcessParamBase):
"realaudio",
"vqf",
"oggvorbis",
- "ape"
+ "ape",
],
"output_format": "text",
},
"video": {
- "suffix":[
+ "suffix": [
"mp4",
"avi",
- "mkv"
+ "mkv",
],
"output_format": "text",
},
@@ -253,7 +268,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(
table_result_type=table_result_type,
- markdown_image_response_type=markdown_image_response_type
+ markdown_image_response_type=markdown_image_response_type,
)
sections, _ = tcadp_parser.parse_pdf(
filepath=name,
@@ -261,7 +276,7 @@ class Parser(ProcessBase):
callback=self.callback,
file_type="PDF",
file_start_page=1,
- file_end_page=1000
+ file_end_page=1000,
)
bboxes = []
for section, position_tag in sections:
@@ -269,17 +284,20 @@ class Parser(ProcessBase):
# Extract position information from TCADP's position tag
# Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
import re
+
match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
if match:
pn, x0, x1, top, bott = match.groups()
- bboxes.append({
- "page_number": int(pn.split('-')[0]), # Take the first page number
- "x0": float(x0),
- "x1": float(x1),
- "top": float(top),
- "bottom": float(bott),
- "text": section
- })
+ bboxes.append(
+ {
+ "page_number": int(pn.split("-")[0]), # Take the first page number
+ "x0": float(x0),
+ "x1": float(x1),
+ "top": float(top),
+ "bottom": float(bott),
+ "text": section,
+ }
+ )
else:
# If no position info, add as text without position
bboxes.append({"text": section})
@@ -291,7 +309,30 @@ class Parser(ProcessBase):
bboxes = []
for t, poss in lines:
for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
- bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
+ bboxes.append(
+ {
+ "page_number": int(pn[0]),
+ "x0": float(x0),
+ "x1": float(x1),
+ "top": float(top),
+ "bottom": float(bott),
+ "text": t,
+ }
+ )
+
+ for b in bboxes:
+ text_val = b.get("text", "")
+ has_text = isinstance(text_val, str) and text_val.strip()
+ layout = b.get("layout_type")
+ if layout == "figure" or (b.get("image") and not has_text):
+ b["doc_type_kwd"] = "image"
+ elif layout == "table":
+ b["doc_type_kwd"] = "table"
+
+ table_ctx = conf.get("table_context_size", 0) or 0
+ image_ctx = conf.get("image_context_size", 0) or 0
+ if table_ctx or image_ctx:
+ bboxes = attach_media_context(bboxes, table_ctx, image_ctx)
if conf.get("output_format") == "json":
self.set_output("json", bboxes)
@@ -319,7 +360,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(
table_result_type=table_result_type,
- markdown_image_response_type=markdown_image_response_type
+ markdown_image_response_type=markdown_image_response_type,
)
if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@@ -337,7 +378,7 @@ class Parser(ProcessBase):
callback=self.callback,
file_type=file_type,
file_start_page=1,
- file_end_page=1000
+ file_end_page=1000,
)
# Process TCADP parser output based on configured output_format
@@ -365,7 +406,12 @@ class Parser(ProcessBase):
# Add tables as text
for table in tables:
if table:
- result.append({"text": table})
+ result.append({"text": table, "doc_type_kwd": "table"})
+
+ table_ctx = conf.get("table_context_size", 0) or 0
+ image_ctx = conf.get("image_context_size", 0) or 0
+ if table_ctx or image_ctx:
+ result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result)
@@ -400,7 +446,13 @@ class Parser(ProcessBase):
if conf.get("output_format") == "json":
sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
- sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
+ sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
+
+ table_ctx = conf.get("table_context_size", 0) or 0
+ image_ctx = conf.get("image_context_size", 0) or 0
+ if table_ctx or image_ctx:
+ sections = attach_media_context(sections, table_ctx, image_ctx)
+
self.set_output("json", sections)
elif conf.get("output_format") == "markdown":
markdown_text = docx_parser.to_markdown(name, binary=blob)
@@ -420,7 +472,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(
table_result_type=table_result_type,
- markdown_image_response_type=markdown_image_response_type
+ markdown_image_response_type=markdown_image_response_type,
)
if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@@ -439,7 +491,7 @@ class Parser(ProcessBase):
callback=self.callback,
file_type=file_type,
file_start_page=1,
- file_end_page=1000
+ file_end_page=1000,
)
# Process TCADP parser output - PPT only supports json format
@@ -454,7 +506,12 @@ class Parser(ProcessBase):
# Add tables as text
for table in tables:
if table:
- result.append({"text": table})
+ result.append({"text": table, "doc_type_kwd": "table"})
+
+ table_ctx = conf.get("table_context_size", 0) or 0
+ image_ctx = conf.get("image_context_size", 0) or 0
+ if table_ctx or image_ctx:
+ result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result)
else:
@@ -469,6 +526,10 @@ class Parser(ProcessBase):
# json
assert conf.get("output_format") == "json", "have to be json for ppt"
if conf.get("output_format") == "json":
+ table_ctx = conf.get("table_context_size", 0) or 0
+ image_ctx = conf.get("image_context_size", 0) or 0
+ if table_ctx or image_ctx:
+ sections = attach_media_context(sections, table_ctx, image_ctx)
self.set_output("json", sections)
def _markdown(self, name, blob):
@@ -508,11 +569,15 @@ class Parser(ProcessBase):
json_results.append(json_result)
+ table_ctx = conf.get("table_context_size", 0) or 0
+ image_ctx = conf.get("image_context_size", 0) or 0
+ if table_ctx or image_ctx:
+ json_results = attach_media_context(json_results, table_ctx, image_ctx)
+
self.set_output("json", json_results)
else:
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
-
def _image(self, name, blob):
from deepdoc.vision import OCR
@@ -588,7 +653,7 @@ class Parser(ProcessBase):
from email.parser import BytesParser
msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
- email_content['metadata'] = {}
+ email_content["metadata"] = {}
# handle header info
for header, value in msg.items():
# get fields like from, to, cc, bcc, date, subject
@@ -600,6 +665,7 @@ class Parser(ProcessBase):
# get body
if "body" in target_fields:
body_text, body_html = [], []
+
def _add_content(m, content_type):
def _decode_payload(payload, charset, target_list):
try:
@@ -641,14 +707,17 @@ class Parser(ProcessBase):
if dispositions[0].lower() == "attachment":
filename = part.get_filename()
payload = part.get_payload(decode=True).decode(part.get_content_charset())
- attachments.append({
- "filename": filename,
- "payload": payload,
- })
+ attachments.append(
+ {
+ "filename": filename,
+ "payload": payload,
+ }
+ )
email_content["attachments"] = attachments
else:
# handle msg file
import extract_msg
+
print("handle a msg file.")
msg = extract_msg.Message(blob)
# handle header info
@@ -662,9 +731,9 @@ class Parser(ProcessBase):
}
email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
# get metadata
- email_content['metadata'] = {
- 'message_id': msg.messageId,
- 'in_reply_to': msg.inReplyTo,
+ email_content["metadata"] = {
+ "message_id": msg.messageId,
+ "in_reply_to": msg.inReplyTo,
}
# get body
if "body" in target_fields:
@@ -675,29 +744,31 @@ class Parser(ProcessBase):
if "attachments" in target_fields:
attachments = []
for t in msg.attachments:
- attachments.append({
- "filename": t.name,
- "payload": t.data.decode("utf-8")
- })
+ attachments.append(
+ {
+ "filename": t.name,
+ "payload": t.data.decode("utf-8"),
+ }
+ )
email_content["attachments"] = attachments
if conf["output_format"] == "json":
self.set_output("json", [email_content])
else:
- content_txt = ''
+ content_txt = ""
for k, v in email_content.items():
if isinstance(v, str):
# basic info
- content_txt += f'{k}:{v}' + "\n"
+ content_txt += f"{k}:{v}" + "\n"
elif isinstance(v, dict):
# metadata
- content_txt += f'{k}:{json.dumps(v)}' + "\n"
+ content_txt += f"{k}:{json.dumps(v)}" + "\n"
elif isinstance(v, list):
# attachments or others
for fb in v:
if isinstance(fb, dict):
# attachments
- content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n"
+ content_txt += f"{fb['filename']}:{fb['payload']}" + "\n"
else:
# str, usually plain text
content_txt += fb
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 0624309ee..6f36a927a 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -318,6 +318,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d = copy.deepcopy(doc)
tokenize(d, rows, eng)
d["content_with_weight"] = rows
+ d["doc_type_kwd"] = "table"
if img:
d["image"] = img
d["doc_type_kwd"] = "image"
@@ -330,6 +331,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
d = copy.deepcopy(doc)
r = de.join(rows[i:i + batch_size])
tokenize(d, r, eng)
+ d["doc_type_kwd"] = "table"
if img:
d["image"] = img
d["doc_type_kwd"] = "image"
@@ -338,6 +340,194 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
return res
+def attach_media_context(chunks, table_context_size=0, image_context_size=0):
+ """
+ Attach surrounding text chunk content to media chunks (table/image).
+ Best-effort ordering: if positional info exists on any chunk, use it to
+ order chunks before collecting context; otherwise keep original order.
+ """
+ if not chunks or (table_context_size <= 0 and image_context_size <= 0):
+ return chunks
+
+ def is_image_chunk(ck):
+ if ck.get("doc_type_kwd") == "image":
+ return True
+
+ text_val = ck.get("content_with_weight") if isinstance(ck.get("content_with_weight"), str) else ck.get("text")
+ has_text = isinstance(text_val, str) and text_val.strip()
+ return bool(ck.get("image")) and not has_text
+
+ def is_table_chunk(ck):
+ return ck.get("doc_type_kwd") == "table"
+
+ def is_text_chunk(ck):
+ return not is_image_chunk(ck) and not is_table_chunk(ck)
+
+ def get_text(ck):
+ if isinstance(ck.get("content_with_weight"), str):
+ return ck["content_with_weight"]
+ if isinstance(ck.get("text"), str):
+ return ck["text"]
+ return ""
+
+ def split_sentences(text):
+ pattern = r"([.。!?!?;;::\n])"
+ parts = re.split(pattern, text)
+ sentences = []
+ buf = ""
+ for p in parts:
+ if not p:
+ continue
+ if re.fullmatch(pattern, p):
+ buf += p
+ sentences.append(buf)
+ buf = ""
+ else:
+ buf += p
+ if buf:
+ sentences.append(buf)
+ return sentences
+
+ def trim_to_tokens(text, token_budget, from_tail=False):
+ if token_budget <= 0 or not text:
+ return ""
+ sentences = split_sentences(text)
+ if not sentences:
+ return ""
+
+ collected = []
+ remaining = token_budget
+ seq = reversed(sentences) if from_tail else sentences
+ for s in seq:
+ tks = num_tokens_from_string(s)
+ if tks <= 0:
+ continue
+ if tks > remaining:
+ collected.append(s)
+ break
+ collected.append(s)
+ remaining -= tks
+
+ if from_tail:
+ collected = list(reversed(collected))
+ return "".join(collected)
+
+ def extract_position(ck):
+ pn = None
+ top = None
+ left = None
+ try:
+ if ck.get("page_num_int"):
+ pn = ck["page_num_int"][0]
+ elif ck.get("page_number") is not None:
+ pn = ck.get("page_number")
+
+ if ck.get("top_int"):
+ top = ck["top_int"][0]
+ elif ck.get("top") is not None:
+ top = ck.get("top")
+
+ if ck.get("position_int"):
+ left = ck["position_int"][0][1]
+ elif ck.get("x0") is not None:
+ left = ck.get("x0")
+ except Exception:
+ pn = top = left = None
+ return pn, top, left
+
+ indexed = list(enumerate(chunks))
+ positioned_indices = []
+ unpositioned_indices = []
+ for idx, ck in indexed:
+ pn, top, left = extract_position(ck)
+ if pn is not None and top is not None:
+ positioned_indices.append((idx, pn, top, left if left is not None else 0))
+ else:
+ unpositioned_indices.append(idx)
+
+ if positioned_indices:
+ positioned_indices.sort(key=lambda x: (int(x[1]), int(x[2]), int(x[3]), x[0]))
+ ordered_indices = [i for i, _, _, _ in positioned_indices] + unpositioned_indices
+ else:
+ ordered_indices = [idx for idx, _ in indexed]
+
+ total = len(ordered_indices)
+ for sorted_pos, idx in enumerate(ordered_indices):
+ ck = chunks[idx]
+ token_budget = image_context_size if is_image_chunk(ck) else table_context_size if is_table_chunk(ck) else 0
+ if token_budget <= 0:
+ continue
+
+ prev_ctx = []
+ remaining_prev = token_budget
+ for prev_idx in range(sorted_pos - 1, -1, -1):
+ if remaining_prev <= 0:
+ break
+ neighbor_idx = ordered_indices[prev_idx]
+ if not is_text_chunk(chunks[neighbor_idx]):
+ break
+ txt = get_text(chunks[neighbor_idx])
+ if not txt:
+ continue
+ tks = num_tokens_from_string(txt)
+ if tks <= 0:
+ continue
+ if tks > remaining_prev:
+ txt = trim_to_tokens(txt, remaining_prev, from_tail=True)
+ tks = num_tokens_from_string(txt)
+ prev_ctx.append(txt)
+ remaining_prev -= tks
+ prev_ctx.reverse()
+
+ next_ctx = []
+ remaining_next = token_budget
+ for next_idx in range(sorted_pos + 1, total):
+ if remaining_next <= 0:
+ break
+ neighbor_idx = ordered_indices[next_idx]
+ if not is_text_chunk(chunks[neighbor_idx]):
+ break
+ txt = get_text(chunks[neighbor_idx])
+ if not txt:
+ continue
+ tks = num_tokens_from_string(txt)
+ if tks <= 0:
+ continue
+ if tks > remaining_next:
+ txt = trim_to_tokens(txt, remaining_next, from_tail=False)
+ tks = num_tokens_from_string(txt)
+ next_ctx.append(txt)
+ remaining_next -= tks
+
+ if not prev_ctx and not next_ctx:
+ continue
+
+ self_text = get_text(ck)
+ pieces = [*prev_ctx]
+ if self_text:
+ pieces.append(self_text)
+ pieces.extend(next_ctx)
+ combined = "\n".join(pieces)
+
+ original = ck.get("content_with_weight")
+ if "content_with_weight" in ck:
+ ck["content_with_weight"] = combined
+ elif "text" in ck:
+ original = ck.get("text")
+ ck["text"] = combined
+
+ if combined != original:
+ if "content_ltks" in ck:
+ ck["content_ltks"] = rag_tokenizer.tokenize(combined)
+ if "content_sm_ltks" in ck:
+ ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck.get("content_ltks", rag_tokenizer.tokenize(combined)))
+
+ if positioned_indices:
+ chunks[:] = [chunks[i] for i in ordered_indices]
+
+ return chunks
+
+
def add_positions(d, poss):
if not poss:
return
diff --git a/test/testcases/configs.py b/test/testcases/configs.py
index 992e98d5b..a94a627b7 100644
--- a/test/testcases/configs.py
+++ b/test/testcases/configs.py
@@ -42,6 +42,8 @@ DEFAULT_PARSER_CONFIG = {
"auto_keywords": 0,
"auto_questions": 0,
"html4excel": False,
+ "image_context_size": 0,
+ "table_context_size": 0,
"topn_tags": 3,
"raptor": {
"use_raptor": True,
@@ -62,4 +64,4 @@ DEFAULT_PARSER_CONFIG = {
],
"method": "light",
},
-}
\ No newline at end of file
+}