Feat: add context for figure and table (#11547)

### What problem does this PR solve?

Add surrounding text context for figure and table chunks. The amount of context is controlled by the new `table_context_size` and `image_context_size` options in `parser_config`.



![demo_figure_table_context](https://github.com/user-attachments/assets/61b37fac-e22e-40a4-9665-9396c7b4103e)


The `==================()` markers in the demo are for demonstration purposes only.
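
The behavior is driven by two new `parser_config` keys, `table_context_size` and `image_context_size`: token budgets for the neighboring text attached before and after each table or figure chunk (0, the default, disables it). A minimal sketch of a config using the new keys and of the defensive read pattern used by the chunkers in the diff below; the concrete values are only examples:

```python
parser_config = {
    "chunk_token_num": 512,
    "layout_recognize": "DeepDOC",
    "table_context_size": 256,   # token budget of neighboring text attached to table chunks
    "image_context_size": 128,   # token budget of neighboring text attached to figure/image chunks
}

# Missing, None, or negative values collapse to 0, and 0 means "attach no context".
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
```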
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Yongteng Lei
Date: 2025-11-27 10:21:44 +08:00 (committed by GitHub)
Parent: 7c3c185038
Commit: 9d8b96c1d0
11 changed files with 373 additions and 74 deletions

View File

@@ -749,7 +749,7 @@ class Knowledgebase(DataBaseModel):
    parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True)
    pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
-   parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
+   parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
    pagerank = IntegerField(default=0, index=False)
    graphrag_task_id = CharField(max_length=32, null=True, help_text="Graph RAG task ID", index=True)
@@ -774,7 +774,7 @@ class Document(DataBaseModel):
    kb_id = CharField(max_length=256, null=False, index=True)
    parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True)
    pipeline_id = CharField(max_length=32, null=True, help_text="pipeline ID", index=True)
-   parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
+   parser_config = JSONField(null=False, default={"pages": [[1, 1000000]], "table_context_size": 0, "image_context_size": 0})
    source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True)
    type = CharField(max_length=32, null=False, help_text="file extension", index=True)
    created_by = CharField(max_length=32, null=False, help_text="who created it", index=True)

View File

@@ -923,7 +923,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
        ParserType.AUDIO.value: audio,
        ParserType.EMAIL.value: email
    }
-   parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text"}
+   parser_config = {"chunk_token_num": 4096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text", "table_context_size": 0, "image_context_size": 0}
    exe = ThreadPoolExecutor(max_workers=12)
    threads = []
    doc_nm = {}

View File

@@ -313,6 +313,10 @@ def get_parser_config(chunk_method, parser_config):
        chunk_method = "naive"
    # Define default configurations for each chunking method
+   base_defaults = {
+       "table_context_size": 0,
+       "image_context_size": 0,
+   }
    key_mapping = {
        "naive": {
            "layout_recognize": "DeepDOC",
@@ -365,16 +369,19 @@ def get_parser_config(chunk_method, parser_config):
    default_config = key_mapping[chunk_method]
-   # If no parser_config provided, return default
+   # If no parser_config provided, return default merged with base defaults
    if not parser_config:
-       return default_config
+       if default_config is None:
+           return deep_merge(base_defaults, {})
+       return deep_merge(base_defaults, default_config)
    # If parser_config is provided, merge with defaults to ensure required fields exist
    if default_config is None:
-       return parser_config
+       return deep_merge(base_defaults, parser_config)
    # Ensure raptor and graphrag fields have default values if not provided
-   merged_config = deep_merge(default_config, parser_config)
+   merged_config = deep_merge(base_defaults, default_config)
+   merged_config = deep_merge(merged_config, parser_config)
    return merged_config
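
In effect, `get_parser_config` now layers three dictionaries: `base_defaults`, then the per-chunk-method defaults, then the caller-supplied `parser_config`, with later layers winning on conflicts. A minimal sketch of that precedence, where `deep_merge` is a simplified stand-in for the project's helper of the same name and the default values are only illustrative:

```python
def deep_merge(base: dict, override: dict) -> dict:
    # Recursively merge `override` into `base`; values from `override` win.
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged


base_defaults = {"table_context_size": 0, "image_context_size": 0}
naive_defaults = {"layout_recognize": "DeepDOC", "chunk_token_num": 512}  # illustrative
user_config = {"table_context_size": 256}

merged = deep_merge(deep_merge(base_defaults, naive_defaults), user_config)
# {'table_context_size': 256, 'image_context_size': 0,
#  'layout_recognize': 'DeepDOC', 'chunk_token_num': 512}
```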

View File

@@ -23,7 +23,7 @@ from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS
from rag.nlp import bullets_category, is_english,remove_contents_table, \
    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
-   tokenize_chunks
+   tokenize_chunks, attach_media_context
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
@@ -175,6 +175,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    res = tokenize_table(tbls, doc, eng)
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+   table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+   image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+   if table_ctx or image_ctx:
+       attach_media_context(res, table_ctx, image_ctx)
    return res

View File

@@ -20,7 +20,7 @@ import re
from common.constants import ParserType
from io import BytesIO
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
from common.token_utils import num_tokens_from_string
from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
@@ -155,7 +155,7 @@ class Docx(DocxParser):
        sum_question = '\n'.join(question_stack)
        if sum_question:
            ti_list.append((f'{sum_question}\n{last_answer}', last_image))
        tbls = []
        for tb in self.doc.tables:
            html= "<table>"
@@ -231,14 +231,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        if isinstance(poss, str):
            poss = pdf_parser.extract_positions(poss)
        first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
        pn = first[0]
        if isinstance(pn, list):
            pn = pn[0]  # [pn] -> pn
        poss[0] = (pn, *first[1:])
        return (txt, layoutno, poss)
    sections = [_normalize_section(sec) for sec in sections]
@@ -247,7 +247,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    if name in ["tcadp", "docling", "mineru"]:
        parser_config["chunk_token_num"] = 0
    callback(0.8, "Finish parsing.")
    if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
@@ -310,6 +310,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
        res = tokenize_table(tbls, doc, eng)
        res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+       table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+       image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+       if table_ctx or image_ctx:
+           attach_media_context(res, table_ctx, image_ctx)
        return res
    elif re.search(r"\.docx?$", filename, re.IGNORECASE):
@@ -325,10 +329,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            d["doc_type_kwd"] = "image"
            tokenize(d, text, eng)
            res.append(d)
+       table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+       image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+       if table_ctx or image_ctx:
+           attach_media_context(res, table_ctx, image_ctx)
        return res
    else:
        raise NotImplementedError("file type not supported yet(pdf and docx supported)")

if __name__ == "__main__":
    import sys

View File

@@ -37,7 +37,7 @@ from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
-from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
+from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context

def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
    callback = callback
@@ -616,6 +616,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+   table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
+   image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
    final_sections = False
    doc = {
        "docnm_kwd": filename,
@@ -686,6 +688,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        logging.info("naive_merge({}): {}".format(filename, timer() - st))
        res.extend(embed_res)
        res.extend(url_res)
+       if table_context_size or image_context_size:
+           attach_media_context(res, table_context_size, image_context_size)
        return res
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@@ -947,6 +951,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        res.extend(embed_res)
        if url_res:
            res.extend(url_res)
+       if table_context_size or image_context_size:
+           attach_media_context(res, table_context_size, image_context_size)
        return res

View File

@@ -20,7 +20,7 @@ import re
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
from common.constants import ParserType
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, attach_media_context
from deepdoc.parser import PdfParser
import numpy as np
from rag.app.naive import by_plaintext, PARSERS
@@ -150,7 +150,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
        if isinstance(layout_recognizer, bool):
            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -234,6 +234,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            chunks.append(txt)
            last_sid = sec_id
        res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+       table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
+       image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
+       if table_ctx or image_ctx:
+           attach_media_context(res, table_ctx, image_ctx)
        return res

View File

@@ -20,11 +20,11 @@ import re
import numpy as np
from PIL import Image
-from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
-from deepdoc.vision import OCR
-from rag.nlp import rag_tokenizer, tokenize
+from common.constants import LLMType
from common.string_utils import clean_markdown_block
+from deepdoc.vision import OCR
+from rag.nlp import attach_media_context, rag_tokenizer, tokenize

ocr = OCR()
@@ -39,9 +39,16 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    }
    eng = lang.lower() == "english"
+   parser_config = kwargs.get("parser_config", {}) or {}
+   image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
    if any(filename.lower().endswith(ext) for ext in VIDEO_EXTS):
        try:
-           doc.update({"doc_type_kwd": "video"})
+           doc.update(
+               {
+                   "doc_type_kwd": "video",
+               }
+           )
            cv_mdl = LLMBundle(tenant_id, llm_type=LLMType.IMAGE2TEXT, lang=lang)
            ans = cv_mdl.chat(system="", history=[], gen_conf={}, video_bytes=binary, filename=filename)
            callback(0.8, "CV LLM respond: %s ..." % ans[:32])
@@ -64,7 +71,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
    if (eng and len(txt.split()) > 32) or len(txt) > 32:
        tokenize(doc, txt, eng)
        callback(0.8, "OCR results is too long to use CV LLM.")
-       return [doc]
+       return attach_media_context([doc], 0, image_ctx)

    try:
        callback(0.4, "Use CV LLM to describe the picture.")
@@ -76,7 +83,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
        callback(0.8, "CV LLM respond: %s ..." % ans[:32])
        txt += "\n" + ans
        tokenize(doc, txt, eng)
-       return [doc]
+       return attach_media_context([doc], 0, image_ctx)
    except Exception as e:
        callback(prog=-1, msg=str(e))
@@ -103,7 +110,7 @@ def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
            img_binary.seek(0)
            img_binary.truncate()
            img.save(img_binary, format="PNG")
            img_binary.seek(0)
            ans = clean_markdown_block(vision_model.describe_with_prompt(img_binary.read(), prompt))
            txt += "\n" + ans

View File

@@ -19,16 +19,16 @@ import random
import re
from functools import partial
-import trio
import numpy as np
+import trio
from PIL import Image
-from common.constants import LLMType
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.llm_service import LLMBundle
+from common import settings
+from common.constants import LLMType
from common.misc_utils import get_uuid
-from rag.utils.base64_image import image2id
from deepdoc.parser import ExcelParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
@@ -37,7 +37,8 @@ from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream
from rag.llm.cv_model import Base as VLM
-from common import settings
+from rag.nlp import attach_media_context
+from rag.utils.base64_image import image2id

class ParserParam(ProcessParamBase):
@@ -61,15 +62,18 @@ class ParserParam(ProcessParamBase):
                "json",
            ],
            "image": [
-               "text"
+               "text",
+           ],
+           "email": [
+               "text",
+               "json",
            ],
-           "email": ["text", "json"],
            "text&markdown": [
                "text",
-               "json"
+               "json",
            ],
            "audio": [
-               "json"
+               "json",
            ],
            "video": [],
        }
@@ -82,6 +86,8 @@ class ParserParam(ProcessParamBase):
                "pdf",
            ],
            "output_format": "json",
+           "table_context_size": 0,
+           "image_context_size": 0,
        },
        "spreadsheet": {
            "parse_method": "deepdoc", # deepdoc/tcadp_parser
@@ -91,6 +97,8 @@
                "xlsx",
                "csv",
            ],
+           "table_context_size": 0,
+           "image_context_size": 0,
        },
        "word": {
            "suffix": [
@@ -98,18 +106,24 @@
                "docx",
            ],
            "output_format": "json",
+           "table_context_size": 0,
+           "image_context_size": 0,
        },
        "text&markdown": {
            "suffix": ["md", "markdown", "mdx", "txt"],
            "output_format": "json",
+           "table_context_size": 0,
+           "image_context_size": 0,
        },
        "slides": {
            "parse_method": "deepdoc", # deepdoc/tcadp_parser
            "suffix": [
                "pptx",
-               "ppt"
+               "ppt",
            ],
            "output_format": "json",
+           "table_context_size": 0,
+           "image_context_size": 0,
        },
        "image": {
            "parse_method": "ocr",
@@ -121,13 +135,14 @@
        },
        "email": {
            "suffix": [
-               "eml", "msg"
+               "eml",
+               "msg",
            ],
            "fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
            "output_format": "json",
        },
        "audio": {
-           "suffix":[
+           "suffix": [
                "da",
                "wave",
                "wav",
@@ -142,15 +157,15 @@
                "realaudio",
                "vqf",
                "oggvorbis",
-               "ape"
+               "ape",
            ],
            "output_format": "text",
        },
        "video": {
-           "suffix":[
+           "suffix": [
                "mp4",
                "avi",
-               "mkv"
+               "mkv",
            ],
            "output_format": "text",
        },
@@ -253,7 +268,7 @@ class Parser(ProcessBase):
            markdown_image_response_type = conf.get("markdown_image_response_type", "1")
            tcadp_parser = TCADPParser(
                table_result_type=table_result_type,
-               markdown_image_response_type=markdown_image_response_type
+               markdown_image_response_type=markdown_image_response_type,
            )
            sections, _ = tcadp_parser.parse_pdf(
                filepath=name,
@@ -261,7 +276,7 @@
                callback=self.callback,
                file_type="PDF",
                file_start_page=1,
-               file_end_page=1000
+               file_end_page=1000,
            )
            bboxes = []
            for section, position_tag in sections:
@@ -269,17 +284,20 @@
                # Extract position information from TCADP's position tag
                # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
                import re
                match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
                if match:
                    pn, x0, x1, top, bott = match.groups()
-                   bboxes.append({
-                       "page_number": int(pn.split('-')[0]), # Take the first page number
-                       "x0": float(x0),
-                       "x1": float(x1),
-                       "top": float(top),
-                       "bottom": float(bott),
-                       "text": section
-                   })
+                   bboxes.append(
+                       {
+                           "page_number": int(pn.split("-")[0]),  # Take the first page number
+                           "x0": float(x0),
+                           "x1": float(x1),
+                           "top": float(top),
+                           "bottom": float(bott),
+                           "text": section,
+                       }
+                   )
                else:
                    # If no position info, add as text without position
                    bboxes.append({"text": section})
@@ -291,7 +309,30 @@
            bboxes = []
            for t, poss in lines:
                for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
-                   bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
+                   bboxes.append(
+                       {
+                           "page_number": int(pn[0]),
+                           "x0": float(x0),
+                           "x1": float(x1),
+                           "top": float(top),
+                           "bottom": float(bott),
+                           "text": t,
+                       }
+                   )
for b in bboxes:
text_val = b.get("text", "")
has_text = isinstance(text_val, str) and text_val.strip()
layout = b.get("layout_type")
if layout == "figure" or (b.get("image") and not has_text):
b["doc_type_kwd"] = "image"
elif layout == "table":
b["doc_type_kwd"] = "table"
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
bboxes = attach_media_context(bboxes, table_ctx, image_ctx)
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
self.set_output("json", bboxes) self.set_output("json", bboxes)
@ -319,7 +360,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1") markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser( tcadp_parser = TCADPParser(
table_result_type=table_result_type, table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type markdown_image_response_type=markdown_image_response_type,
) )
if not tcadp_parser.check_installation(): if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.") raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@ -337,7 +378,7 @@ class Parser(ProcessBase):
callback=self.callback, callback=self.callback,
file_type=file_type, file_type=file_type,
file_start_page=1, file_start_page=1,
file_end_page=1000 file_end_page=1000,
) )
# Process TCADP parser output based on configured output_format # Process TCADP parser output based on configured output_format
@ -365,7 +406,12 @@ class Parser(ProcessBase):
# Add tables as text # Add tables as text
for table in tables: for table in tables:
if table: if table:
result.append({"text": table}) result.append({"text": table, "doc_type_kwd": "table"})
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result) self.set_output("json", result)
@ -400,7 +446,13 @@ class Parser(ProcessBase):
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
sections, tbls = docx_parser(name, binary=blob) sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section] sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls]) sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
sections = attach_media_context(sections, table_ctx, image_ctx)
self.set_output("json", sections) self.set_output("json", sections)
elif conf.get("output_format") == "markdown": elif conf.get("output_format") == "markdown":
markdown_text = docx_parser.to_markdown(name, binary=blob) markdown_text = docx_parser.to_markdown(name, binary=blob)
@ -420,7 +472,7 @@ class Parser(ProcessBase):
markdown_image_response_type = conf.get("markdown_image_response_type", "1") markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser( tcadp_parser = TCADPParser(
table_result_type=table_result_type, table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type markdown_image_response_type=markdown_image_response_type,
) )
if not tcadp_parser.check_installation(): if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.") raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
@ -439,7 +491,7 @@ class Parser(ProcessBase):
callback=self.callback, callback=self.callback,
file_type=file_type, file_type=file_type,
file_start_page=1, file_start_page=1,
file_end_page=1000 file_end_page=1000,
) )
# Process TCADP parser output - PPT only supports json format # Process TCADP parser output - PPT only supports json format
@ -454,7 +506,12 @@ class Parser(ProcessBase):
# Add tables as text # Add tables as text
for table in tables: for table in tables:
if table: if table:
result.append({"text": table}) result.append({"text": table, "doc_type_kwd": "table"})
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result) self.set_output("json", result)
else: else:
@ -469,6 +526,10 @@ class Parser(ProcessBase):
# json # json
assert conf.get("output_format") == "json", "have to be json for ppt" assert conf.get("output_format") == "json", "have to be json for ppt"
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
sections = attach_media_context(sections, table_ctx, image_ctx)
self.set_output("json", sections) self.set_output("json", sections)
def _markdown(self, name, blob): def _markdown(self, name, blob):
@ -508,11 +569,15 @@ class Parser(ProcessBase):
json_results.append(json_result) json_results.append(json_result)
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
json_results = attach_media_context(json_results, table_ctx, image_ctx)
self.set_output("json", json_results) self.set_output("json", json_results)
else: else:
self.set_output("text", "\n".join([section_text for section_text, _ in sections])) self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
def _image(self, name, blob): def _image(self, name, blob):
from deepdoc.vision import OCR from deepdoc.vision import OCR
@ -588,7 +653,7 @@ class Parser(ProcessBase):
from email.parser import BytesParser from email.parser import BytesParser
msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob)) msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
email_content['metadata'] = {} email_content["metadata"] = {}
# handle header info # handle header info
for header, value in msg.items(): for header, value in msg.items():
# get fields like from, to, cc, bcc, date, subject # get fields like from, to, cc, bcc, date, subject
@ -600,6 +665,7 @@ class Parser(ProcessBase):
# get body # get body
if "body" in target_fields: if "body" in target_fields:
body_text, body_html = [], [] body_text, body_html = [], []
def _add_content(m, content_type): def _add_content(m, content_type):
def _decode_payload(payload, charset, target_list): def _decode_payload(payload, charset, target_list):
try: try:
@ -641,14 +707,17 @@ class Parser(ProcessBase):
if dispositions[0].lower() == "attachment": if dispositions[0].lower() == "attachment":
filename = part.get_filename() filename = part.get_filename()
payload = part.get_payload(decode=True).decode(part.get_content_charset()) payload = part.get_payload(decode=True).decode(part.get_content_charset())
attachments.append({ attachments.append(
"filename": filename, {
"payload": payload, "filename": filename,
}) "payload": payload,
}
)
email_content["attachments"] = attachments email_content["attachments"] = attachments
else: else:
# handle msg file # handle msg file
import extract_msg import extract_msg
print("handle a msg file.") print("handle a msg file.")
msg = extract_msg.Message(blob) msg = extract_msg.Message(blob)
# handle header info # handle header info
@ -662,9 +731,9 @@ class Parser(ProcessBase):
} }
email_content.update({k: v for k, v in basic_content.items() if k in target_fields}) email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
# get metadata # get metadata
email_content['metadata'] = { email_content["metadata"] = {
'message_id': msg.messageId, "message_id": msg.messageId,
'in_reply_to': msg.inReplyTo, "in_reply_to": msg.inReplyTo,
} }
# get body # get body
if "body" in target_fields: if "body" in target_fields:
@ -675,29 +744,31 @@ class Parser(ProcessBase):
if "attachments" in target_fields: if "attachments" in target_fields:
attachments = [] attachments = []
for t in msg.attachments: for t in msg.attachments:
attachments.append({ attachments.append(
"filename": t.name, {
"payload": t.data.decode("utf-8") "filename": t.name,
}) "payload": t.data.decode("utf-8"),
}
)
email_content["attachments"] = attachments email_content["attachments"] = attachments
if conf["output_format"] == "json": if conf["output_format"] == "json":
self.set_output("json", [email_content]) self.set_output("json", [email_content])
else: else:
content_txt = '' content_txt = ""
for k, v in email_content.items(): for k, v in email_content.items():
if isinstance(v, str): if isinstance(v, str):
# basic info # basic info
content_txt += f'{k}:{v}' + "\n" content_txt += f"{k}:{v}" + "\n"
elif isinstance(v, dict): elif isinstance(v, dict):
# metadata # metadata
content_txt += f'{k}:{json.dumps(v)}' + "\n" content_txt += f"{k}:{json.dumps(v)}" + "\n"
elif isinstance(v, list): elif isinstance(v, list):
# attachments or others # attachments or others
for fb in v: for fb in v:
if isinstance(fb, dict): if isinstance(fb, dict):
# attachments # attachments
content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n" content_txt += f"{fb['filename']}:{fb['payload']}" + "\n"
else: else:
# str, usually plain text # str, usually plain text
content_txt += fb content_txt += fb

View File

@@ -318,6 +318,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
        d = copy.deepcopy(doc)
        tokenize(d, rows, eng)
        d["content_with_weight"] = rows
+       d["doc_type_kwd"] = "table"
        if img:
            d["image"] = img
            d["doc_type_kwd"] = "image"
@@ -330,6 +331,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
        d = copy.deepcopy(doc)
        r = de.join(rows[i:i + batch_size])
        tokenize(d, r, eng)
+       d["doc_type_kwd"] = "table"
        if img:
            d["image"] = img
            d["doc_type_kwd"] = "image"
@@ -338,6 +340,194 @@
    return res
def attach_media_context(chunks, table_context_size=0, image_context_size=0):
"""
Attach surrounding text chunk content to media chunks (table/image).
Best-effort ordering: if positional info exists on any chunk, use it to
order chunks before collecting context; otherwise keep original order.
"""
if not chunks or (table_context_size <= 0 and image_context_size <= 0):
return chunks
def is_image_chunk(ck):
if ck.get("doc_type_kwd") == "image":
return True
text_val = ck.get("content_with_weight") if isinstance(ck.get("content_with_weight"), str) else ck.get("text")
has_text = isinstance(text_val, str) and text_val.strip()
return bool(ck.get("image")) and not has_text
def is_table_chunk(ck):
return ck.get("doc_type_kwd") == "table"
def is_text_chunk(ck):
return not is_image_chunk(ck) and not is_table_chunk(ck)
def get_text(ck):
if isinstance(ck.get("content_with_weight"), str):
return ck["content_with_weight"]
if isinstance(ck.get("text"), str):
return ck["text"]
return ""
def split_sentences(text):
pattern = r"([.。!?!?;:\n])"
parts = re.split(pattern, text)
sentences = []
buf = ""
for p in parts:
if not p:
continue
if re.fullmatch(pattern, p):
buf += p
sentences.append(buf)
buf = ""
else:
buf += p
if buf:
sentences.append(buf)
return sentences
def trim_to_tokens(text, token_budget, from_tail=False):
if token_budget <= 0 or not text:
return ""
sentences = split_sentences(text)
if not sentences:
return ""
collected = []
remaining = token_budget
seq = reversed(sentences) if from_tail else sentences
for s in seq:
tks = num_tokens_from_string(s)
if tks <= 0:
continue
if tks > remaining:
collected.append(s)
break
collected.append(s)
remaining -= tks
if from_tail:
collected = list(reversed(collected))
return "".join(collected)
def extract_position(ck):
pn = None
top = None
left = None
try:
if ck.get("page_num_int"):
pn = ck["page_num_int"][0]
elif ck.get("page_number") is not None:
pn = ck.get("page_number")
if ck.get("top_int"):
top = ck["top_int"][0]
elif ck.get("top") is not None:
top = ck.get("top")
if ck.get("position_int"):
left = ck["position_int"][0][1]
elif ck.get("x0") is not None:
left = ck.get("x0")
except Exception:
pn = top = left = None
return pn, top, left
indexed = list(enumerate(chunks))
positioned_indices = []
unpositioned_indices = []
for idx, ck in indexed:
pn, top, left = extract_position(ck)
if pn is not None and top is not None:
positioned_indices.append((idx, pn, top, left if left is not None else 0))
else:
unpositioned_indices.append(idx)
if positioned_indices:
positioned_indices.sort(key=lambda x: (int(x[1]), int(x[2]), int(x[3]), x[0]))
ordered_indices = [i for i, _, _, _ in positioned_indices] + unpositioned_indices
else:
ordered_indices = [idx for idx, _ in indexed]
total = len(ordered_indices)
for sorted_pos, idx in enumerate(ordered_indices):
ck = chunks[idx]
token_budget = image_context_size if is_image_chunk(ck) else table_context_size if is_table_chunk(ck) else 0
if token_budget <= 0:
continue
prev_ctx = []
remaining_prev = token_budget
for prev_idx in range(sorted_pos - 1, -1, -1):
if remaining_prev <= 0:
break
neighbor_idx = ordered_indices[prev_idx]
if not is_text_chunk(chunks[neighbor_idx]):
break
txt = get_text(chunks[neighbor_idx])
if not txt:
continue
tks = num_tokens_from_string(txt)
if tks <= 0:
continue
if tks > remaining_prev:
txt = trim_to_tokens(txt, remaining_prev, from_tail=True)
tks = num_tokens_from_string(txt)
prev_ctx.append(txt)
remaining_prev -= tks
prev_ctx.reverse()
next_ctx = []
remaining_next = token_budget
for next_idx in range(sorted_pos + 1, total):
if remaining_next <= 0:
break
neighbor_idx = ordered_indices[next_idx]
if not is_text_chunk(chunks[neighbor_idx]):
break
txt = get_text(chunks[neighbor_idx])
if not txt:
continue
tks = num_tokens_from_string(txt)
if tks <= 0:
continue
if tks > remaining_next:
txt = trim_to_tokens(txt, remaining_next, from_tail=False)
tks = num_tokens_from_string(txt)
next_ctx.append(txt)
remaining_next -= tks
if not prev_ctx and not next_ctx:
continue
self_text = get_text(ck)
pieces = [*prev_ctx]
if self_text:
pieces.append(self_text)
pieces.extend(next_ctx)
combined = "\n".join(pieces)
original = ck.get("content_with_weight")
if "content_with_weight" in ck:
ck["content_with_weight"] = combined
elif "text" in ck:
original = ck.get("text")
ck["text"] = combined
if combined != original:
if "content_ltks" in ck:
ck["content_ltks"] = rag_tokenizer.tokenize(combined)
if "content_sm_ltks" in ck:
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck.get("content_ltks", rag_tokenizer.tokenize(combined)))
if positioned_indices:
chunks[:] = [chunks[i] for i in ordered_indices]
return chunks

def add_positions(d, poss):
    if not poss:
        return
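
To illustrate what the new `attach_media_context` helper does, here is a minimal, hypothetical call; the chunk dictionaries below are made up and the token budgets are deliberately tiny:

```python
from rag.nlp import attach_media_context

chunks = [
    {"text": "Revenue grew 12% year over year.", "page_number": 1, "top": 100, "x0": 50},
    {"text": "<table><tr><td>Q1</td><td>Q2</td></tr></table>",
     "doc_type_kwd": "table", "page_number": 1, "top": 200, "x0": 50},
    {"text": "Most of the growth came from the APAC region.", "page_number": 1, "top": 300, "x0": 50},
]

# Give table chunks up to 32 tokens of surrounding text on each side; images keep no context here.
attach_media_context(chunks, table_context_size=32, image_context_size=0)
# The table chunk's "text" now holds the preceding sentence, the original table HTML,
# and the following sentence, joined with newlines.
```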

View File

@@ -42,6 +42,8 @@ DEFAULT_PARSER_CONFIG = {
    "auto_keywords": 0,
    "auto_questions": 0,
    "html4excel": False,
+   "image_context_size": 0,
+   "table_context_size": 0,
    "topn_tags": 3,
    "raptor": {
        "use_raptor": True,
@@ -62,4 +64,4 @@ DEFAULT_PARSER_CONFIG = {
        ],
        "method": "light",
    },
}