Feat: add context for figure and table (#11547)

### What problem does this PR solve?

Add context for figure table.



![demo_figure_table_context](https://github.com/user-attachments/assets/61b37fac-e22e-40a4-9665-9396c7b4103e)


`==================()` for demonstrating purpose. 
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-11-27 10:21:44 +08:00
committed by GitHub
parent 7c3c185038
commit 9d8b96c1d0
11 changed files with 373 additions and 74 deletions

View File

@ -23,7 +23,7 @@ from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks
tokenize_chunks, attach_media_context
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
@ -175,6 +175,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res

View File

@ -20,7 +20,7 @@ import re
from common.constants import ParserType
from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
from common.token_utils import num_tokens_from_string
from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
@ -155,7 +155,7 @@ class Docx(DocxParser):
sum_question = '\n'.join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
tbls = []
for tb in self.doc.tables:
html= "<table>"
@ -231,14 +231,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if isinstance(poss, str):
poss = pdf_parser.extract_positions(poss)
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
pn = first[0]
pn = first[0]
if isinstance(pn, list):
pn = pn[0] # [pn] -> pn
poss[0] = (pn, *first[1:])
return (txt, layoutno, poss)
sections = [_normalize_section(sec) for sec in sections]
@ -247,7 +247,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
@ -310,6 +310,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
@ -325,10 +329,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
d["doc_type_kwd"] = "image"
tokenize(d, text, eng)
res.append(d)
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res
else:
raise NotImplementedError("file type not supported yet(pdf and docx supported)")
if __name__ == "__main__":
import sys

View File

@ -37,7 +37,7 @@ from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
callback = callback
@ -616,6 +616,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
final_sections = False
doc = {
"docnm_kwd": filename,
@ -686,6 +688,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
logging.info("naive_merge({}): {}".format(filename, timer() - st))
res.extend(embed_res)
res.extend(url_res)
if table_context_size or image_context_size:
attach_media_context(res, table_context_size, image_context_size)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@ -947,6 +951,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.extend(embed_res)
if url_res:
res.extend(url_res)
if table_context_size or image_context_size:
attach_media_context(res, table_context_size, image_context_size)
return res

View File

@ -20,7 +20,7 @@ import re
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
from common.constants import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, attach_media_context
from deepdoc.parser import PdfParser
import numpy as np
from rag.app.naive import by_plaintext, PARSERS
@ -150,7 +150,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
if re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -234,6 +234,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
chunks.append(txt)
last_sid = sec_id
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if table_ctx or image_ctx:
attach_media_context(res, table_ctx, image_ctx)
return res

View File

@ -20,11 +20,11 @@ import re
import numpy as np
from PIL import Image
from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.vision import OCR
from rag.nlp import rag_tokenizer, tokenize
from common.constants import LLMType
from common.string_utils import clean_markdown_block
from deepdoc.vision import OCR
from rag.nlp import attach_media_context, rag_tokenizer, tokenize
ocr = OCR()
@ -39,9 +39,16 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
}
eng = lang.lower() == "english"
parser_config = kwargs.get("parser_config", {}) or {}
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
try:
doc.update({"doc_type_kwd": "video"})
doc.update(
{
"doc_type_kwd": "video",
}
)
cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang)
ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
@ -64,7 +71,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
if (eng and len(txt.split()) > 32) or len(txt) > 32:
tokenize(doc, txt, eng)
callback(0.8, "OCR results is too long to use CV LLM.")
return [doc]
return attach_media_context([doc], 0, image_ctx)
try:
callback(0.4, "Use CV LLM to describe the picture.")
@ -76,7 +83,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
callback(0.8, "CV LLM respond: %s ..." % ans[:32])
txt += "\n" + ans
tokenize(doc, txt, eng)
return [doc]
return attach_media_context([doc], 0, image_ctx)
except Exception as e:
callback(prog=-1, msg=str(e))
@ -103,7 +110,7 @@ def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
img_binary.seek(0)
img_binary.truncate()
img.save(img_binary, format="PNG")
img_binary.seek(0)
ans = clean_markdown_block(vision_model.describe_with_prompt(img_binary.read(), prompt))
txt += "\n" + ans