Refine: image/table context. (#12336)

### What problem does this PR solve?

#12303

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Kevin Hu, 2025-12-30 20:24:27 +08:00, committed by GitHub
parent 348265afc1, commit 52f91c2388
5 changed files with 116 additions and 11 deletions

View File

```diff
@@ -40,7 +40,7 @@ from deepdoc.parser.docling_parser import DoclingParser
 from deepdoc.parser.tcadp_parser import TCADPParser
 from common.parser_config_utils import normalize_layout_recognizer
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, \
-    tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
+    tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context, append_context2table_image4pdf
 def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
@@ -487,7 +487,7 @@ class Pdf(PdfParser):
         tbls = self._extract_table_figure(True, zoomin, True, True)
         self._naive_vertical_merge()
         self._concat_downward()
-        self._final_reading_order_merge()
+        # self._final_reading_order_merge()
         # self._filter_forpages()
         logging.info("layouts cost: {}s".format(timer() - first_start))
         return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
@@ -776,6 +776,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     if not sections and not tables:
         return []
+    if table_context_size or image_context_size:
+        tables = append_context2table_image4pdf(sections, tables, image_context_size)
+
     if name in ["tcadp", "docling", "mineru"]:
         parser_config["chunk_token_num"] = 0
@@ -1006,8 +1009,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         res.extend(embed_res)
     if url_res:
         res.extend(url_res)
-    if table_context_size or image_context_size:
-        attach_media_context(res, table_context_size, image_context_size)
+    #if table_context_size or image_context_size:
+    #    attach_media_context(res, table_context_size, image_context_size)
     return res
```
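
Taken together, the hunks above move media-context handling for PDFs from after chunking (the now-commented `attach_media_context` over finished chunks) to before it (`append_context2table_image4pdf` over the raw sections and tables). A rough sketch of the reordered flow; everything here except the two context functions and the ordering is a stand-in, not the repo's real API:

```python
# Sketch of the reordered chunk() flow for PDFs; the stubs are placeholders.
def append_context2table_image4pdf(sections, tables, context_size):
    return tables  # stand-in: the real one stitches nearby sentences onto each table

def build_chunks(sections, tables):
    # stand-in for the tokenize_table / tokenize_chunks_with_images steps
    return [txt for txt, _ in sections] + [tb for (_, tb), _ in tables]

def chunk_pdf(sections, tables, table_context_size=0, image_context_size=0):
    if table_context_size or image_context_size:
        # New: context is attached to tables/images before tokenization,
        # so it is embedded and indexed together with the table content.
        tables = append_context2table_image4pdf(sections, tables, image_context_size)
    res = build_chunks(sections, tables)
    # Old (disabled above): attach_media_context(res, ...) patched context
    # onto already-built chunks as a post-processing step.
    return res
```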

View File

```diff
@@ -16,7 +16,7 @@
 import logging
 import random
-from collections import Counter
+from collections import Counter, defaultdict
 from common.token_utils import num_tokens_from_string
 import re
@@ -667,6 +667,94 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
     return chunks
+
+
+def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0):
+    from deepdoc.parser import PdfParser
+    if table_context_size <= 0:
+        return tabls
+
+    page_bucket = defaultdict(list)
+    for i, (txt, poss) in enumerate(sections):
+        poss = PdfParser.extract_positions(poss)
+        for page, left, right, top, bottom in poss:
+            page = page[0]
+            page_bucket[page].append(((left, top, right, bottom), txt))
+
+    def upper_context(page, i):
+        txt = ""
+        if page not in page_bucket:
+            i = -1
+        while num_tokens_from_string(txt) < table_context_size:
+            if i < 0:
+                page -= 1
+                if page < 0 or page not in page_bucket:
+                    break
+                i = len(page_bucket[page]) - 1
+            blks = page_bucket[page]
+            (_, _, _, _), cnt = blks[i]
+            txts = re.split(r"([。!?\n]|\. )", cnt, flags=re.DOTALL)[::-1]
+            for j in range(0, len(txts), 2):
+                txt = (txts[j+1] if j+1 < len(txts) else "") + txts[j] + txt
+                if num_tokens_from_string(txt) > table_context_size:
+                    break
+            i -= 1
+        return txt
+
+    def lower_context(page, i):
+        txt = ""
+        if page not in page_bucket:
+            return txt
+        while num_tokens_from_string(txt) < table_context_size:
+            if i >= len(page_bucket[page]):
+                page += 1
+                if page not in page_bucket:
+                    break
+                i = 0
+            blks = page_bucket[page]
+            (_, _, _, _), cnt = blks[i]
+            txts = re.split(r"([。!?\n]|\. )", cnt, flags=re.DOTALL)
+            for j in range(0, len(txts), 2):
+                txt += txts[j] + (txts[j+1] if j+1 < len(txts) else "")
+                if num_tokens_from_string(txt) > table_context_size:
+                    break
+            i += 1
+        return txt
+
+    res = []
+    for (img, tb), poss in tabls:
+        page, left, top, right, bott = poss[0]
+        _page, _left, _top, _right, _bott = poss[-1]
+        if isinstance(tb, list):
+            tb = "\n".join(tb)
+        i = 0
+        blks = page_bucket.get(page, [])
+        _tb = tb
+        while i < len(blks):
+            if i + 1 >= len(blks):
+                if _page > page:
+                    page += 1
+                    i = 0
+                    blks = page_bucket.get(page, [])
+                    continue
+                tb = upper_context(page, i) + tb + lower_context(page+1, 0)
+                break
+            (_, t, r, b), txt = blks[i]
+            if b > top:
+                break
+            (_, _t, _r, _b), _txt = blks[i+1]
+            if _t < _bott:
+                i += 1
+                continue
+            tb = upper_context(page, i) + tb + lower_context(page, i)
+            break
+        if _tb == tb:
+            tb = upper_context(page, -1) + tb + lower_context(page+1, 0)
+        res.append(((img, tb), poss))
+    return res
+
+
 def add_positions(d, poss):
     if not poss:
         return
```
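
Inside the new function, `upper_context` and `lower_context` grow the attached text one sentence at a time, walking backward or forward across page boundaries until a token budget is exceeded. A standalone sketch of just that sentence-budgeting step; `len()` stands in for the repo's `num_tokens_from_string`, and the budget and sample text are invented:

```python
import re

def take_leading_context(text: str, budget: int) -> str:
    # The capturing group makes re.split keep each sentence terminator
    # (CJK 。!?, newline, or ". ") at the odd indices of the result.
    parts = re.split(r"([。!?\n]|\. )", text, flags=re.DOTALL)
    out = ""
    for j in range(0, len(parts), 2):
        # Reattach each sentence with its terminator, then check the budget.
        out += parts[j] + (parts[j + 1] if j + 1 < len(parts) else "")
        if len(out) > budget:  # stand-in for num_tokens_from_string(out)
            break
    return out

print(take_leading_context("First sentence. Second one. Third one.", 20))
# prints "First sentence. Second one. " -- it stops after the first whole
# sentence that pushes the accumulated text past the budget.
```

Keeping the terminators at odd indices is what lets the loop reassemble sentences losslessly, including in `upper_context`, where the same pairs are consumed in reverse.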

View File

```diff
@@ -729,6 +729,8 @@ TOC_FROM_TEXT_USER = load_prompt("toc_from_text_user")
 # Generate TOC from text chunks with text llms
 async def gen_toc_from_text(txt_info: dict, chat_mdl, callback=None):
+    if callback:
+        callback(msg="")
     try:
         ans = await gen_json(
             PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_SYSTEM).render(),
@@ -738,8 +740,6 @@ async def gen_toc_from_text(txt_info: dict, chat_mdl, callback=None):
             gen_conf={"temperature": 0.0, "top_p": 0.9}
         )
         txt_info["toc"] = ans if ans and not isinstance(ans, str) else []
-        if callback:
-            callback(msg="")
     except Exception as e:
         logging.exception(e)
```

View File

```diff
@@ -332,6 +332,9 @@ async def build_chunks(task, progress_callback):
     async def doc_keyword_extraction(chat_mdl, d, topn):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "keywords", {"topn": topn})
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             async with chat_limiter:
                 cached = await keyword_extraction(chat_mdl, d["content_with_weight"], topn)
             set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "keywords", {"topn": topn})
@@ -362,6 +365,9 @@ async def build_chunks(task, progress_callback):
     async def doc_question_proposal(chat_mdl, d, topn):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "question", {"topn": topn})
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             async with chat_limiter:
                 cached = await question_proposal(chat_mdl, d["content_with_weight"], topn)
             set_llm_cache(chat_mdl.llm_name, d["content_with_weight"], cached, "question", {"topn": topn})
@@ -392,6 +398,9 @@ async def build_chunks(task, progress_callback):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], "metadata",
                                task["parser_config"]["metadata"])
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             async with chat_limiter:
                 cached = await gen_metadata(chat_mdl,
                                             metadata_schema(task["parser_config"]["metadata"]),
@@ -457,6 +466,9 @@ async def build_chunks(task, progress_callback):
     async def doc_content_tagging(chat_mdl, d, topn_tags):
         cached = get_llm_cache(chat_mdl.llm_name, d["content_with_weight"], all_tags, {"topn": topn_tags})
         if not cached:
+            if has_canceled(task["id"]):
+                progress_callback(-1, msg="Task has been canceled.")
+                return
             picked_examples = random.choices(examples, k=2) if len(examples) > 2 else examples
             if not picked_examples:
                 picked_examples.append({"content": "This is an example", TAG_FLD: {'example': 1}})
```
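
All four hunks above add the same three-line guard before each uncached LLM call, so a canceled task stops consuming tokens. A minimal sketch of factoring that guard into one helper; `has_canceled` and `progress_callback` are the executor's own functions, stubbed here so the sketch runs standalone, and `check_canceled` itself is hypothetical, not part of this commit:

```python
# Stubs for the task executor's functions, so the sketch is self-contained.
def has_canceled(task_id: str) -> bool:
    return False  # the real one looks up the task's canceled flag

def progress_callback(prog: float, msg: str = "") -> None:
    print(prog, msg)

# Hypothetical helper collapsing the guard repeated in each hunk above.
def check_canceled(task_id: str) -> bool:
    """Report cancellation via the progress callback and return True if canceled."""
    if has_canceled(task_id):
        progress_callback(-1, msg="Task has been canceled.")
        return True
    return False

# Each per-chunk coroutine could then bail out before its LLM call:
#     if check_canceled(task["id"]):
#         return
```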