From 52f91c23884cc42abcc77c0aed6930a0950450cb Mon Sep 17 00:00:00 2001
From: Kevin Hu
Date: Tue, 30 Dec 2025 20:24:27 +0800
Subject: [PATCH] Refine: image/table context. (#12336)

### What problem does this PR solve?

#12303

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 deepdoc/parser/pdf_parser.py | 10 ++--
 rag/app/naive.py             | 11 +++--
 rag/nlp/__init__.py          | 90 +++++++++++++++++++++++++++++++++++-
 rag/prompts/generator.py     |  4 +-
 rag/svr/task_executor.py     | 12 +++++
 5 files changed, 116 insertions(+), 11 deletions(-)

diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index cb34ba68d..ce6b9298b 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -476,11 +476,13 @@ class RAGFlowPdfParser:
         self.boxes = bxs
 
     def _naive_vertical_merge(self, zoomin=3):
-        bxs = self._assign_column(self.boxes, zoomin)
+        #bxs = self._assign_column(self.boxes, zoomin)
+        bxs = self.boxes
         grouped = defaultdict(list)
 
         for b in bxs:
-            grouped[(b["page_number"], b.get("col_id", 0))].append(b)
+            # grouped[(b["page_number"], b.get("col_id", 0))].append(b)
+            grouped[(b["page_number"], "x")].append(b)
 
         merged_boxes = []
         for (pg, col), bxs in grouped.items():
@@ -551,7 +553,7 @@ class RAGFlowPdfParser:
 
             merged_boxes.extend(bxs)
 
-        self.boxes = sorted(merged_boxes, key=lambda x: (x["page_number"], x.get("col_id", 0), x["top"]))
+        #self.boxes = sorted(merged_boxes, key=lambda x: (x["page_number"], x.get("col_id", 0), x["top"]))
 
     def _final_reading_order_merge(self, zoomin=3):
         if not self.boxes:
@@ -1206,7 +1208,7 @@ class RAGFlowPdfParser:
         start = timer()
         self._text_merge()
         self._concat_downward()
-        #self._naive_vertical_merge(zoomin)
+        self._naive_vertical_merge(zoomin)
         if callback:
             callback(0.92, "Text merged ({:.2f}s)".format(timer() - start))
 
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 200059707..7aa8c8c76 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -40,7 +40,7 @@ from deepdoc.parser.docling_parser import DoclingParser
 from deepdoc.parser.tcadp_parser import TCADPParser
 from common.parser_config_utils import normalize_layout_recognizer
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, \
-    tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
+    tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context, append_context2table_image4pdf
 
 
 def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
@@ -487,7 +487,7 @@ class Pdf(PdfParser):
         tbls = self._extract_table_figure(True, zoomin, True, True)
         self._naive_vertical_merge()
         self._concat_downward()
-        self._final_reading_order_merge()
+        # self._final_reading_order_merge()
         # self._filter_forpages()
         logging.info("layouts cost: {}s".format(timer() - first_start))
         return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
@@ -776,6 +776,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     if not sections and not tables:
         return []
 
+    if table_context_size or image_context_size:
+        tables = append_context2table_image4pdf(sections, tables, image_context_size)
+
     if name in ["tcadp", "docling", "mineru"]:
         parser_config["chunk_token_num"] = 0
 
@@ -1006,8 +1009,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             res.extend(embed_res)
     if url_res:
         res.extend(url_res)
-    if table_context_size or image_context_size:
-        attach_media_context(res, table_context_size, image_context_size)
+    #if table_context_size or image_context_size:
+    #    attach_media_context(res, table_context_size, image_context_size)
 
     return res
 
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 76ba6f4e5..9c613e8ce 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -16,7 +16,7 @@
 
 import logging
 import random
-from collections import Counter
+from collections import Counter, defaultdict
 from common.token_utils import num_tokens_from_string
 import re
 
@@ -667,6 +667,94 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
     return chunks
 
 
+def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0):
+    from deepdoc.parser import PdfParser
+    if table_context_size <=0:
+        return tabls
+
+    page_bucket = defaultdict(list)
+    for i, (txt, poss) in enumerate(sections):
+        poss = PdfParser.extract_positions(poss)
+        for page, left, right, top, bottom in poss:
+            page = page[0]
+            page_bucket[page].append(((left, top, right, bottom), txt))
+
+    def upper_context(page, i):
+        txt = ""
+        if page not in page_bucket:
+            i = -1
+        while num_tokens_from_string(txt) < table_context_size:
+            if i < 0:
+                page -= 1
+                if page < 0 or page not in page_bucket:
+                    break
+                i = len(page_bucket[page]) -1
+            blks = page_bucket[page]
+            (_, _, _, _), cnt = blks[i]
+            txts = re.split(r"([。!??;!\n]|\. )", cnt, flags=re.DOTALL)[::-1]
+            for j in range(0, len(txts), 2):
+                txt = (txts[j+1] if j+1 < len(txts) else "") + txts[j] + txt
+                if num_tokens_from_string(txt) > table_context_size:
+                    break
+            i -= 1
+        return txt
+
+    def lower_context(page, i):
+        txt = ""
+        if page not in page_bucket:
+            return txt
+        while num_tokens_from_string(txt) < table_context_size:
+            if i >= len(page_bucket[page]):
+                page += 1
+                if page not in page_bucket:
+                    break
+                i = 0
+            blks = page_bucket[page]
+            (_, _, _, _), cnt = blks[i]
+            txts = re.split(r"([。!??;!\n]|\. )", cnt, flags=re.DOTALL)
+            for j in range(0, len(txts), 2):
+                txt += txts[j] + (txts[j+1] if j+1 < len(txts) else "")
+                if num_tokens_from_string(txt) > table_context_size:
+                    break
+            i += 1
+        return txt
+
+    res = []
+    for (img, tb), poss in tabls:
+        page, left, top, right, bott = poss[0]
+        _page, _left, _top, _right, _bott = poss[-1]
+        if isinstance(tb, list):
+            tb = "\n".join(tb)
+
+        i = 0
+        blks = page_bucket.get(page, [])
+        _tb = tb
+        while i < len(blks):
+            if i + 1 >= len(blks):
+                if _page > page:
+                    page += 1
+                    i = 0
+                    blks = page_bucket.get(page, [])
+                    continue
+                tb = upper_context(page, i) + tb + lower_context(page+1, 0)
+                break
+            (_, t, r, b), txt = blks[i]
+            if b > top:
+                break
+            (_, _t, _r, _b), _txt = blks[i+1]
+            if _t < _bott:
+                i += 1
+                continue
+
+            tb = upper_context(page, i) + tb + lower_context(page, i)
+            break
+
+        if _tb == tb:
+            tb = upper_context(page, -1) + tb + lower_context(page+1, 0)
+        res.append(((img, tb), poss))
+    return res
+
+
 def add_positions(d, poss):
     if not poss:
         return
diff --git a/rag/prompts/generator.py b/rag/prompts/generator.py
index acc5e582d..b429960eb 100644
--- a/rag/prompts/generator.py
+++ b/rag/prompts/generator.py
@@ -729,6 +729,8 @@ TOC_FROM_TEXT_USER = load_prompt("toc_from_text_user")
 
 # Generate TOC from text chunks with text llms
 async def gen_toc_from_text(txt_info: dict, chat_mdl, callback=None):
+    if callback:
+        callback(msg="")
     try:
         ans = await gen_json(
             PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_SYSTEM).render(),
@@ -738,8 +740,6 @@
             gen_conf={"temperature": 0.0, "top_p": 0.9}
         )
         txt_info["toc"] = ans if ans and not isinstance(ans, str) else []
-        if callback:
-            callback(msg="")
     except Exception as e:
         logging.exception(e)
 
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index cf339b15a..1c6616a94 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -332,6 +332,9 @@ async def build_chunks(task, progress_callback):
     async def doc_keyword_extraction(chat_mdl, d, topn):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "keywords", {"topn": topn})
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             async with chat_limiter:
                 cached = await keyword_extraction(chat_mdl, d["content_with_weight"], topn)
             set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "keywords", {"topn": topn})
@@ -362,6 +365,9 @@ async def build_chunks(task, progress_callback):
     async def doc_question_proposal(chat_mdl, d, topn):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "question", {"topn": topn})
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             async with chat_limiter:
                 cached = await question_proposal(chat_mdl, d["content_with_weight"], topn)
             set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "question", {"topn": topn})
@@ -392,6 +398,9 @@ async def build_chunks(task, progress_callback):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "metadata",
                                task["parser_config"]["metadata"])
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             async with chat_limiter:
                 cached = await gen_metadata(chat_mdl,
                                             metadata_schema(task["parser_config"]["metadata"]),
@@ -457,6 +466,9 @@ async def build_chunks(task, progress_callback):
     async def doc_content_tagging(chat_mdl, d, topn_tags):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], all_tags, {"topn": topn_tags})
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             picked_examples = random.choices(examples, k=2) if len(examples) > 2 else examples
             if not picked_examples:
                 picked_examples.append({"content": "This is an example", TAG_FLD: {'example': 1}})
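
For reviewers, a minimal sketch (not part of the patch) of the sentence-budget idea that `upper_context`/`lower_context` implement in `append_context2table_image4pdf`: split the text blocks around a table on sentence boundaries and keep accumulating sentences until a token budget is exceeded, walking backwards for context above the table and forwards for context below it. `num_tokens`, `take_context`, and `SENT_SPLIT` are illustrative names; `num_tokens` is a crude stand-in for `num_tokens_from_string`.

```python
import re

def num_tokens(s: str) -> int:
    # Hypothetical stand-in for num_tokens_from_string: whitespace tokens.
    return len(s.split())

# Same sentence-boundary pattern the patch uses (CJK and Latin sentence enders).
SENT_SPLIT = r"([。!??;!\n]|\. )"

def take_context(blocks, budget, reverse=False):
    """Accumulate sentences from `blocks` until `budget` tokens is exceeded.

    reverse=True walks sentences backwards from the end of the blocks
    (context above a table); reverse=False walks forwards (context below).
    """
    ctx = ""
    for blk in (reversed(blocks) if reverse else blocks):
        # re.split with a capture group keeps separators at odd indices.
        parts = re.split(SENT_SPLIT, blk, flags=re.DOTALL)
        if reverse:
            parts = parts[::-1]
        for j in range(0, len(parts), 2):
            sep = parts[j + 1] if j + 1 < len(parts) else ""
            # Prepend when walking upward, append when walking downward.
            ctx = (sep + parts[j] + ctx) if reverse else (ctx + parts[j] + sep)
            if num_tokens(ctx) > budget:
                return ctx
    return ctx

# Usage: wrap a table chunk with up to ~8 tokens of text on each side.
above = take_context(["Intro text. The table below lists results."], 8, reverse=True)
below = take_context(["Figures are in thousands. See appendix A."], 8)
print(above + "\n<TABLE HTML>\n" + below)
```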