diff --git a/api/db/db_models.py b/api/db/db_models.py index bd3feea64..e60afbef5 100644 --- a/api/db/db_models.py +++ b/api/db/db_models.py @@ -749,7 +749,7 @@ class Knowledgebase(DataBaseModel): parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True) pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True) - parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]}) + parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0}) pagerank = IntegerField(default=0, index=False) graphrag_task_id = CharField(max_length=32, null=True, help_text="Graph RAG task ID", index=True) @@ -774,7 +774,7 @@ class Document(DataBaseModel): kb_id = CharField(max_length=256, null=False, index=True) parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True) pipeline_id = CharField(max_length=32, null=True, help_text="pipeline ID", index=True) - parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]}) + parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0}) source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True) type = CharField(max_length=32, null=False, help_text="file extension", index=True) created_by = CharField(max_length=32, null=False, help_text="who created it", index=True) diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 514b3fd87..7b7ef53ec 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -923,7 +923,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id): ParserType.AUDIO.value: audio, ParserType.EMAIL.value: email } - parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text"} + parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text", "table_context_size": 0, "image_context_size": 0} exe = ThreadPoolExecutor(max_workers=12) threads = [] doc_nm = {} diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py index cbd2423f2..314211694 100644 --- a/api/utils/api_utils.py +++ b/api/utils/api_utils.py @@ -313,6 +313,10 @@ def get_parser_config(chunk_method, parser_config): chunk_method = "naive" # Define default configurations for each chunking method + base_defaults = { + "table_context_size": 0, + "image_context_size": 0, + } key_mapping = { "naive": { "layout_recognize": "DeepDOC", @@ -365,16 +369,19 @@ def get_parser_config(chunk_method, parser_config): default_config = key_mapping[chunk_method] - # If no parser_config provided, return default + # If no parser_config provided, return default merged with base defaults if not parser_config: - return default_config + if default_config is None: + return deep_merge(base_defaults, {}) + return deep_merge(base_defaults, default_config) # If parser_config is provided, merge with defaults to ensure required fields exist if default_config is None: - return parser_config + return deep_merge(base_defaults, parser_config) # Ensure raptor and graphrag fields have default values if not provided - merged_config = deep_merge(default_config, parser_config) + merged_config = deep_merge(base_defaults, default_config) + merged_config = deep_merge(merged_config, parser_config) return merged_config diff --git a/rag/app/book.py b/rag/app/book.py index 5bdaec72d..ca91be149 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -23,7 +23,7 @@ from rag.app import naive from rag.app.naive import by_plaintext, PARSERS from rag.nlp import bullets_category, is_english,remove_contents_table, \ hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ - tokenize_chunks + tokenize_chunks, attach_media_context from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, HtmlParser from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper @@ -175,6 +175,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, res = tokenize_table(tbls, doc, eng) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) + table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0)) + image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) + if table_ctx or image_ctx: + attach_media_context(res, table_ctx, image_ctx) return res diff --git a/rag/app/manual.py b/rag/app/manual.py index b3a4ae38d..1eb86a043 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -20,7 +20,7 @@ import re from common.constants import ParserType from io import BytesIO -from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level +from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context from common.token_utils import num_tokens_from_string from deepdoc.parser import PdfParser, DocxParser from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper @@ -155,7 +155,7 @@ class Docx(DocxParser): sum_question = '\n'.join(question_stack) if sum_question: ti_list.append((f'{sum_question}\n{last_answer}', last_image)) - + tbls = [] for tb in self.doc.tables: html= "" @@ -231,14 +231,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if isinstance(poss, str): poss = pdf_parser.extract_positions(poss) first = poss[0] # tuple: ([pn], x1, x2, y1, y2) - pn = first[0] + pn = first[0] if isinstance(pn, list): pn = pn[0] # [pn] -> pn poss[0] = (pn, *first[1:]) return (txt, layoutno, poss) - + sections = [_normalize_section(sec) for sec in sections] @@ -247,7 +247,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if name in ["tcadp", "docling", "mineru"]: parser_config["chunk_token_num"] = 0 - + callback(0.8, "Finish parsing.") if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03: @@ -310,6 +310,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) res = tokenize_table(tbls, doc, eng) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) + table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0)) + image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) + if table_ctx or image_ctx: + attach_media_context(res, table_ctx, image_ctx) return res elif re.search(r"\.docx?$", filename, re.IGNORECASE): @@ -325,10 +329,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, d["doc_type_kwd"] = "image" tokenize(d, text, eng) res.append(d) + table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0)) + image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) + if table_ctx or image_ctx: + attach_media_context(res, table_ctx, image_ctx) return res else: raise NotImplementedError("file type not supported yet(pdf and docx supported)") - + if __name__ == "__main__": import sys diff --git a/rag/app/naive.py b/rag/app/naive.py index 0496c7507..7872ebc22 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -37,7 +37,7 @@ from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.tcadp_parser import TCADPParser -from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table +from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): callback = callback @@ -616,6 +616,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, parser_config = kwargs.get( "parser_config", { "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True}) + table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0)) + image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0)) final_sections = False doc = { "docnm_kwd": filename, @@ -686,6 +688,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, logging.info("naive_merge({}): {}".format(filename, timer() - st)) res.extend(embed_res) res.extend(url_res) + if table_context_size or image_context_size: + attach_media_context(res, table_context_size, image_context_size) return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): @@ -947,6 +951,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, res.extend(embed_res) if url_res: res.extend(url_res) + if table_context_size or image_context_size: + attach_media_context(res, table_context_size, image_context_size) return res diff --git a/rag/app/paper.py b/rag/app/paper.py index 222be0762..d84d5645d 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -20,7 +20,7 @@ import re from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper from common.constants import ParserType -from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks +from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, attach_media_context from deepdoc.parser import PdfParser import numpy as np from rag.app.naive import by_plaintext, PARSERS @@ -150,7 +150,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) if re.search(r"\.pdf$", filename, re.IGNORECASE): layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") - + if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -234,6 +234,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, chunks.append(txt) last_sid = sec_id res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) + table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0)) + image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) + if table_ctx or image_ctx: + attach_media_context(res, table_ctx, image_ctx) return res diff --git a/rag/app/picture.py b/rag/app/picture.py index f260104f7..8e7aa4bce 100644 --- a/rag/app/picture.py +++ b/rag/app/picture.py @@ -20,11 +20,11 @@ import re import numpy as np from PIL import Image -from common.constants import LLMType from api.db.services.llm_service import LLMBundle -from deepdoc.vision import OCR -from rag.nlp import rag_tokenizer, tokenize +from common.constants import LLMType from common.string_utils import clean_markdown_block +from deepdoc.vision import OCR +from rag.nlp import attach_media_context, rag_tokenizer, tokenize ocr = OCR() @@ -39,9 +39,16 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs): } eng = lang.lower() == "english" + parser_config = kwargs.get("parser_config", {}) or {} + image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) + if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS): try: - doc.update({"doc_type_kwd": "video"}) + doc.update( + { + "doc_type_kwd": "video", + } + ) cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang) ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename) callback(0.8, "CV LLM respond: %s ..." % ans[:32]) @@ -64,7 +71,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs): if (eng and len(txt.split()) > 32) or len(txt) > 32: tokenize(doc, txt, eng) callback(0.8, "OCR results is too long to use CV LLM.") - return [doc] + return attach_media_context([doc], 0, image_ctx) try: callback(0.4, "Use CV LLM to describe the picture.") @@ -76,7 +83,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs): callback(0.8, "CV LLM respond: %s ..." % ans[:32]) txt += "\n" + ans tokenize(doc, txt, eng) - return [doc] + return attach_media_context([doc], 0, image_ctx) except Exception as e: callback(prog=-1, msg=str(e)) @@ -103,7 +110,7 @@ def vision_llm_chunk(binary, vision_model, prompt=None, callback=None): img_binary.seek(0) img_binary.truncate() img.save(img_binary, format="PNG") - + img_binary.seek(0) ans = clean_markdown_block(vision_model.describe_with_prompt(img_binary.read(), prompt)) txt += "\n" + ans diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 1a111cc3a..7747448ad 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -19,16 +19,16 @@ import random import re from functools import partial -import trio import numpy as np +import trio from PIL import Image -from common.constants import LLMType from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.llm_service import LLMBundle +from common import settings +from common.constants import LLMType from common.misc_utils import get_uuid -from rag.utils.base64_image import image2id from deepdoc.parser import ExcelParser from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser @@ -37,7 +37,8 @@ from rag.app.naive import Docx from rag.flow.base import ProcessBase, ProcessParamBase from rag.flow.parser.schema import ParserFromUpstream from rag.llm.cv_model import Base as VLM -from common import settings +from rag.nlp import attach_media_context +from rag.utils.base64_image import image2id class ParserParam(ProcessParamBase): @@ -61,15 +62,18 @@ class ParserParam(ProcessParamBase): "json", ], "image": [ - "text" + "text", + ], + "email": [ + "text", + "json", ], - "email": ["text", "json"], "text&markdown": [ "text", - "json" + "json", ], "audio": [ - "json" + "json", ], "video": [], } @@ -82,6 +86,8 @@ class ParserParam(ProcessParamBase): "pdf", ], "output_format": "json", + "table_context_size": 0, + "image_context_size": 0, }, "spreadsheet": { "parse_method": "deepdoc", # deepdoc/tcadp_parser @@ -91,6 +97,8 @@ class ParserParam(ProcessParamBase): "xlsx", "csv", ], + "table_context_size": 0, + "image_context_size": 0, }, "word": { "suffix": [ @@ -98,18 +106,24 @@ class ParserParam(ProcessParamBase): "docx", ], "output_format": "json", + "table_context_size": 0, + "image_context_size": 0, }, "text&markdown": { "suffix": ["md", "markdown", "mdx", "txt"], "output_format": "json", + "table_context_size": 0, + "image_context_size": 0, }, "slides": { "parse_method": "deepdoc", # deepdoc/tcadp_parser "suffix": [ "pptx", - "ppt" + "ppt", ], "output_format": "json", + "table_context_size": 0, + "image_context_size": 0, }, "image": { "parse_method": "ocr", @@ -121,13 +135,14 @@ class ParserParam(ProcessParamBase): }, "email": { "suffix": [ - "eml", "msg" + "eml", + "msg", ], "fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"], "output_format": "json", }, "audio": { - "suffix":[ + "suffix": [ "da", "wave", "wav", @@ -142,15 +157,15 @@ class ParserParam(ProcessParamBase): "realaudio", "vqf", "oggvorbis", - "ape" + "ape", ], "output_format": "text", }, "video": { - "suffix":[ + "suffix": [ "mp4", "avi", - "mkv" + "mkv", ], "output_format": "text", }, @@ -253,7 +268,7 @@ class Parser(ProcessBase): markdown_image_response_type = conf.get("markdown_image_response_type", "1") tcadp_parser = TCADPParser( table_result_type=table_result_type, - markdown_image_response_type=markdown_image_response_type + markdown_image_response_type=markdown_image_response_type, ) sections, _ = tcadp_parser.parse_pdf( filepath=name, @@ -261,7 +276,7 @@ class Parser(ProcessBase): callback=self.callback, file_type="PDF", file_start_page=1, - file_end_page=1000 + file_end_page=1000, ) bboxes = [] for section, position_tag in sections: @@ -269,17 +284,20 @@ class Parser(ProcessBase): # Extract position information from TCADP's position tag # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}## import re + match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag) if match: pn, x0, x1, top, bott = match.groups() - bboxes.append({ - "page_number": int(pn.split('-')[0]), # Take the first page number - "x0": float(x0), - "x1": float(x1), - "top": float(top), - "bottom": float(bott), - "text": section - }) + bboxes.append( + { + "page_number": int(pn.split("-")[0]), # Take the first page number + "x0": float(x0), + "x1": float(x1), + "top": float(top), + "bottom": float(bott), + "text": section, + } + ) else: # If no position info, add as text without position bboxes.append({"text": section}) @@ -291,7 +309,30 @@ class Parser(ProcessBase): bboxes = [] for t, poss in lines: for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss): - bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t}) + bboxes.append( + { + "page_number": int(pn[0]), + "x0": float(x0), + "x1": float(x1), + "top": float(top), + "bottom": float(bott), + "text": t, + } + ) + + for b in bboxes: + text_val = b.get("text", "") + has_text = isinstance(text_val, str) and text_val.strip() + layout = b.get("layout_type") + if layout == "figure" or (b.get("image") and not has_text): + b["doc_type_kwd"] = "image" + elif layout == "table": + b["doc_type_kwd"] = "table" + + table_ctx = conf.get("table_context_size", 0) or 0 + image_ctx = conf.get("image_context_size", 0) or 0 + if table_ctx or image_ctx: + bboxes = attach_media_context(bboxes, table_ctx, image_ctx) if conf.get("output_format") == "json": self.set_output("json", bboxes) @@ -319,7 +360,7 @@ class Parser(ProcessBase): markdown_image_response_type = conf.get("markdown_image_response_type", "1") tcadp_parser = TCADPParser( table_result_type=table_result_type, - markdown_image_response_type=markdown_image_response_type + markdown_image_response_type=markdown_image_response_type, ) if not tcadp_parser.check_installation(): raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.") @@ -337,7 +378,7 @@ class Parser(ProcessBase): callback=self.callback, file_type=file_type, file_start_page=1, - file_end_page=1000 + file_end_page=1000, ) # Process TCADP parser output based on configured output_format @@ -365,7 +406,12 @@ class Parser(ProcessBase): # Add tables as text for table in tables: if table: - result.append({"text": table}) + result.append({"text": table, "doc_type_kwd": "table"}) + + table_ctx = conf.get("table_context_size", 0) or 0 + image_ctx = conf.get("image_context_size", 0) or 0 + if table_ctx or image_ctx: + result = attach_media_context(result, table_ctx, image_ctx) self.set_output("json", result) @@ -400,7 +446,13 @@ class Parser(ProcessBase): if conf.get("output_format") == "json": sections, tbls = docx_parser(name, binary=blob) sections = [{"text": section[0], "image": section[1]} for section in sections if section] - sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls]) + sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls]) + + table_ctx = conf.get("table_context_size", 0) or 0 + image_ctx = conf.get("image_context_size", 0) or 0 + if table_ctx or image_ctx: + sections = attach_media_context(sections, table_ctx, image_ctx) + self.set_output("json", sections) elif conf.get("output_format") == "markdown": markdown_text = docx_parser.to_markdown(name, binary=blob) @@ -420,7 +472,7 @@ class Parser(ProcessBase): markdown_image_response_type = conf.get("markdown_image_response_type", "1") tcadp_parser = TCADPParser( table_result_type=table_result_type, - markdown_image_response_type=markdown_image_response_type + markdown_image_response_type=markdown_image_response_type, ) if not tcadp_parser.check_installation(): raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.") @@ -439,7 +491,7 @@ class Parser(ProcessBase): callback=self.callback, file_type=file_type, file_start_page=1, - file_end_page=1000 + file_end_page=1000, ) # Process TCADP parser output - PPT only supports json format @@ -454,7 +506,12 @@ class Parser(ProcessBase): # Add tables as text for table in tables: if table: - result.append({"text": table}) + result.append({"text": table, "doc_type_kwd": "table"}) + + table_ctx = conf.get("table_context_size", 0) or 0 + image_ctx = conf.get("image_context_size", 0) or 0 + if table_ctx or image_ctx: + result = attach_media_context(result, table_ctx, image_ctx) self.set_output("json", result) else: @@ -469,6 +526,10 @@ class Parser(ProcessBase): # json assert conf.get("output_format") == "json", "have to be json for ppt" if conf.get("output_format") == "json": + table_ctx = conf.get("table_context_size", 0) or 0 + image_ctx = conf.get("image_context_size", 0) or 0 + if table_ctx or image_ctx: + sections = attach_media_context(sections, table_ctx, image_ctx) self.set_output("json", sections) def _markdown(self, name, blob): @@ -508,11 +569,15 @@ class Parser(ProcessBase): json_results.append(json_result) + table_ctx = conf.get("table_context_size", 0) or 0 + image_ctx = conf.get("image_context_size", 0) or 0 + if table_ctx or image_ctx: + json_results = attach_media_context(json_results, table_ctx, image_ctx) + self.set_output("json", json_results) else: self.set_output("text", "\n".join([section_text for section_text, _ in sections])) - def _image(self, name, blob): from deepdoc.vision import OCR @@ -588,7 +653,7 @@ class Parser(ProcessBase): from email.parser import BytesParser msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob)) - email_content['metadata'] = {} + email_content["metadata"] = {} # handle header info for header, value in msg.items(): # get fields like from, to, cc, bcc, date, subject @@ -600,6 +665,7 @@ class Parser(ProcessBase): # get body if "body" in target_fields: body_text, body_html = [], [] + def _add_content(m, content_type): def _decode_payload(payload, charset, target_list): try: @@ -641,14 +707,17 @@ class Parser(ProcessBase): if dispositions[0].lower() == "attachment": filename = part.get_filename() payload = part.get_payload(decode=True).decode(part.get_content_charset()) - attachments.append({ - "filename": filename, - "payload": payload, - }) + attachments.append( + { + "filename": filename, + "payload": payload, + } + ) email_content["attachments"] = attachments else: # handle msg file import extract_msg + print("handle a msg file.") msg = extract_msg.Message(blob) # handle header info @@ -662,9 +731,9 @@ class Parser(ProcessBase): } email_content.update({k: v for k, v in basic_content.items() if k in target_fields}) # get metadata - email_content['metadata'] = { - 'message_id': msg.messageId, - 'in_reply_to': msg.inReplyTo, + email_content["metadata"] = { + "message_id": msg.messageId, + "in_reply_to": msg.inReplyTo, } # get body if "body" in target_fields: @@ -675,29 +744,31 @@ class Parser(ProcessBase): if "attachments" in target_fields: attachments = [] for t in msg.attachments: - attachments.append({ - "filename": t.name, - "payload": t.data.decode("utf-8") - }) + attachments.append( + { + "filename": t.name, + "payload": t.data.decode("utf-8"), + } + ) email_content["attachments"] = attachments if conf["output_format"] == "json": self.set_output("json", [email_content]) else: - content_txt = '' + content_txt = "" for k, v in email_content.items(): if isinstance(v, str): # basic info - content_txt += f'{k}:{v}' + "\n" + content_txt += f"{k}:{v}" + "\n" elif isinstance(v, dict): # metadata - content_txt += f'{k}:{json.dumps(v)}' + "\n" + content_txt += f"{k}:{json.dumps(v)}" + "\n" elif isinstance(v, list): # attachments or others for fb in v: if isinstance(fb, dict): # attachments - content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n" + content_txt += f"{fb['filename']}:{fb['payload']}" + "\n" else: # str, usually plain text content_txt += fb diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 0624309ee..6f36a927a 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -318,6 +318,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10): d = copy.deepcopy(doc) tokenize(d, rows, eng) d["content_with_weight"] = rows + d["doc_type_kwd"] = "table" if img: d["image"] = img d["doc_type_kwd"] = "image" @@ -330,6 +331,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10): d = copy.deepcopy(doc) r = de.join(rows[i:i + batch_size]) tokenize(d, r, eng) + d["doc_type_kwd"] = "table" if img: d["image"] = img d["doc_type_kwd"] = "image" @@ -338,6 +340,194 @@ def tokenize_table(tbls, doc, eng, batch_size=10): return res +def attach_media_context(chunks, table_context_size=0, image_context_size=0): + """ + Attach surrounding text chunk content to media chunks (table/image). + Best-effort ordering: if positional info exists on any chunk, use it to + order chunks before collecting context; otherwise keep original order. + """ + if not chunks or (table_context_size <= 0 and image_context_size <= 0): + return chunks + + def is_image_chunk(ck): + if ck.get("doc_type_kwd") == "image": + return True + + text_val = ck.get("content_with_weight") if isinstance(ck.get("content_with_weight"), str) else ck.get("text") + has_text = isinstance(text_val, str) and text_val.strip() + return bool(ck.get("image")) and not has_text + + def is_table_chunk(ck): + return ck.get("doc_type_kwd") == "table" + + def is_text_chunk(ck): + return not is_image_chunk(ck) and not is_table_chunk(ck) + + def get_text(ck): + if isinstance(ck.get("content_with_weight"), str): + return ck["content_with_weight"] + if isinstance(ck.get("text"), str): + return ck["text"] + return "" + + def split_sentences(text): + pattern = r"([.。!?!?;;::\n])" + parts = re.split(pattern, text) + sentences = [] + buf = "" + for p in parts: + if not p: + continue + if re.fullmatch(pattern, p): + buf += p + sentences.append(buf) + buf = "" + else: + buf += p + if buf: + sentences.append(buf) + return sentences + + def trim_to_tokens(text, token_budget, from_tail=False): + if token_budget <= 0 or not text: + return "" + sentences = split_sentences(text) + if not sentences: + return "" + + collected = [] + remaining = token_budget + seq = reversed(sentences) if from_tail else sentences + for s in seq: + tks = num_tokens_from_string(s) + if tks <= 0: + continue + if tks > remaining: + collected.append(s) + break + collected.append(s) + remaining -= tks + + if from_tail: + collected = list(reversed(collected)) + return "".join(collected) + + def extract_position(ck): + pn = None + top = None + left = None + try: + if ck.get("page_num_int"): + pn = ck["page_num_int"][0] + elif ck.get("page_number") is not None: + pn = ck.get("page_number") + + if ck.get("top_int"): + top = ck["top_int"][0] + elif ck.get("top") is not None: + top = ck.get("top") + + if ck.get("position_int"): + left = ck["position_int"][0][1] + elif ck.get("x0") is not None: + left = ck.get("x0") + except Exception: + pn = top = left = None + return pn, top, left + + indexed = list(enumerate(chunks)) + positioned_indices = [] + unpositioned_indices = [] + for idx, ck in indexed: + pn, top, left = extract_position(ck) + if pn is not None and top is not None: + positioned_indices.append((idx, pn, top, left if left is not None else 0)) + else: + unpositioned_indices.append(idx) + + if positioned_indices: + positioned_indices.sort(key=lambda x: (int(x[1]), int(x[2]), int(x[3]), x[0])) + ordered_indices = [i for i, _, _, _ in positioned_indices] + unpositioned_indices + else: + ordered_indices = [idx for idx, _ in indexed] + + total = len(ordered_indices) + for sorted_pos, idx in enumerate(ordered_indices): + ck = chunks[idx] + token_budget = image_context_size if is_image_chunk(ck) else table_context_size if is_table_chunk(ck) else 0 + if token_budget <= 0: + continue + + prev_ctx = [] + remaining_prev = token_budget + for prev_idx in range(sorted_pos - 1, -1, -1): + if remaining_prev <= 0: + break + neighbor_idx = ordered_indices[prev_idx] + if not is_text_chunk(chunks[neighbor_idx]): + break + txt = get_text(chunks[neighbor_idx]) + if not txt: + continue + tks = num_tokens_from_string(txt) + if tks <= 0: + continue + if tks > remaining_prev: + txt = trim_to_tokens(txt, remaining_prev, from_tail=True) + tks = num_tokens_from_string(txt) + prev_ctx.append(txt) + remaining_prev -= tks + prev_ctx.reverse() + + next_ctx = [] + remaining_next = token_budget + for next_idx in range(sorted_pos + 1, total): + if remaining_next <= 0: + break + neighbor_idx = ordered_indices[next_idx] + if not is_text_chunk(chunks[neighbor_idx]): + break + txt = get_text(chunks[neighbor_idx]) + if not txt: + continue + tks = num_tokens_from_string(txt) + if tks <= 0: + continue + if tks > remaining_next: + txt = trim_to_tokens(txt, remaining_next, from_tail=False) + tks = num_tokens_from_string(txt) + next_ctx.append(txt) + remaining_next -= tks + + if not prev_ctx and not next_ctx: + continue + + self_text = get_text(ck) + pieces = [*prev_ctx] + if self_text: + pieces.append(self_text) + pieces.extend(next_ctx) + combined = "\n".join(pieces) + + original = ck.get("content_with_weight") + if "content_with_weight" in ck: + ck["content_with_weight"] = combined + elif "text" in ck: + original = ck.get("text") + ck["text"] = combined + + if combined != original: + if "content_ltks" in ck: + ck["content_ltks"] = rag_tokenizer.tokenize(combined) + if "content_sm_ltks" in ck: + ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck.get("content_ltks", rag_tokenizer.tokenize(combined))) + + if positioned_indices: + chunks[:] = [chunks[i] for i in ordered_indices] + + return chunks + + def add_positions(d, poss): if not poss: return diff --git a/test/testcases/configs.py b/test/testcases/configs.py index 992e98d5b..a94a627b7 100644 --- a/test/testcases/configs.py +++ b/test/testcases/configs.py @@ -42,6 +42,8 @@ DEFAULT_PARSER_CONFIG = { "auto_keywords": 0, "auto_questions": 0, "html4excel": False, + "image_context_size": 0, + "table_context_size": 0, "topn_tags": 3, "raptor": { "use_raptor": True, @@ -62,4 +64,4 @@ DEFAULT_PARSER_CONFIG = { ], "method": "light", }, -} \ No newline at end of file +}