mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Feat: add context for figure and table (#11547)
### What problem does this PR solve? Add context for figure table.  `==================()` for demonstrating purpose. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -19,16 +19,16 @@ import random
|
||||
import re
|
||||
from functools import partial
|
||||
|
||||
import trio
|
||||
import numpy as np
|
||||
import trio
|
||||
from PIL import Image
|
||||
|
||||
from common.constants import LLMType
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from common import settings
|
||||
from common.constants import LLMType
|
||||
from common.misc_utils import get_uuid
|
||||
from rag.utils.base64_image import image2id
|
||||
from deepdoc.parser import ExcelParser
|
||||
from deepdoc.parser.mineru_parser import MinerUParser
|
||||
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
|
||||
@ -37,7 +37,8 @@ from rag.app.naive import Docx
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.parser.schema import ParserFromUpstream
|
||||
from rag.llm.cv_model import Base as VLM
|
||||
from common import settings
|
||||
from rag.nlp import attach_media_context
|
||||
from rag.utils.base64_image import image2id
|
||||
|
||||
|
||||
class ParserParam(ProcessParamBase):
|
||||
@ -61,15 +62,18 @@ class ParserParam(ProcessParamBase):
|
||||
"json",
|
||||
],
|
||||
"image": [
|
||||
"text"
|
||||
"text",
|
||||
],
|
||||
"email": [
|
||||
"text",
|
||||
"json",
|
||||
],
|
||||
"email": ["text", "json"],
|
||||
"text&markdown": [
|
||||
"text",
|
||||
"json"
|
||||
"json",
|
||||
],
|
||||
"audio": [
|
||||
"json"
|
||||
"json",
|
||||
],
|
||||
"video": [],
|
||||
}
|
||||
@ -82,6 +86,8 @@ class ParserParam(ProcessParamBase):
|
||||
"pdf",
|
||||
],
|
||||
"output_format": "json",
|
||||
"table_context_size": 0,
|
||||
"image_context_size": 0,
|
||||
},
|
||||
"spreadsheet": {
|
||||
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
||||
@ -91,6 +97,8 @@ class ParserParam(ProcessParamBase):
|
||||
"xlsx",
|
||||
"csv",
|
||||
],
|
||||
"table_context_size": 0,
|
||||
"image_context_size": 0,
|
||||
},
|
||||
"word": {
|
||||
"suffix": [
|
||||
@ -98,18 +106,24 @@ class ParserParam(ProcessParamBase):
|
||||
"docx",
|
||||
],
|
||||
"output_format": "json",
|
||||
"table_context_size": 0,
|
||||
"image_context_size": 0,
|
||||
},
|
||||
"text&markdown": {
|
||||
"suffix": ["md", "markdown", "mdx", "txt"],
|
||||
"output_format": "json",
|
||||
"table_context_size": 0,
|
||||
"image_context_size": 0,
|
||||
},
|
||||
"slides": {
|
||||
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
||||
"suffix": [
|
||||
"pptx",
|
||||
"ppt"
|
||||
"ppt",
|
||||
],
|
||||
"output_format": "json",
|
||||
"table_context_size": 0,
|
||||
"image_context_size": 0,
|
||||
},
|
||||
"image": {
|
||||
"parse_method": "ocr",
|
||||
@ -121,13 +135,14 @@ class ParserParam(ProcessParamBase):
|
||||
},
|
||||
"email": {
|
||||
"suffix": [
|
||||
"eml", "msg"
|
||||
"eml",
|
||||
"msg",
|
||||
],
|
||||
"fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
|
||||
"output_format": "json",
|
||||
},
|
||||
"audio": {
|
||||
"suffix":[
|
||||
"suffix": [
|
||||
"da",
|
||||
"wave",
|
||||
"wav",
|
||||
@ -142,15 +157,15 @@ class ParserParam(ProcessParamBase):
|
||||
"realaudio",
|
||||
"vqf",
|
||||
"oggvorbis",
|
||||
"ape"
|
||||
"ape",
|
||||
],
|
||||
"output_format": "text",
|
||||
},
|
||||
"video": {
|
||||
"suffix":[
|
||||
"suffix": [
|
||||
"mp4",
|
||||
"avi",
|
||||
"mkv"
|
||||
"mkv",
|
||||
],
|
||||
"output_format": "text",
|
||||
},
|
||||
@ -253,7 +268,7 @@ class Parser(ProcessBase):
|
||||
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||
tcadp_parser = TCADPParser(
|
||||
table_result_type=table_result_type,
|
||||
markdown_image_response_type=markdown_image_response_type
|
||||
markdown_image_response_type=markdown_image_response_type,
|
||||
)
|
||||
sections, _ = tcadp_parser.parse_pdf(
|
||||
filepath=name,
|
||||
@ -261,7 +276,7 @@ class Parser(ProcessBase):
|
||||
callback=self.callback,
|
||||
file_type="PDF",
|
||||
file_start_page=1,
|
||||
file_end_page=1000
|
||||
file_end_page=1000,
|
||||
)
|
||||
bboxes = []
|
||||
for section, position_tag in sections:
|
||||
@ -269,17 +284,20 @@ class Parser(ProcessBase):
|
||||
# Extract position information from TCADP's position tag
|
||||
# Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
|
||||
import re
|
||||
|
||||
match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
|
||||
if match:
|
||||
pn, x0, x1, top, bott = match.groups()
|
||||
bboxes.append({
|
||||
"page_number": int(pn.split('-')[0]), # Take the first page number
|
||||
"x0": float(x0),
|
||||
"x1": float(x1),
|
||||
"top": float(top),
|
||||
"bottom": float(bott),
|
||||
"text": section
|
||||
})
|
||||
bboxes.append(
|
||||
{
|
||||
"page_number": int(pn.split("-")[0]), # Take the first page number
|
||||
"x0": float(x0),
|
||||
"x1": float(x1),
|
||||
"top": float(top),
|
||||
"bottom": float(bott),
|
||||
"text": section,
|
||||
}
|
||||
)
|
||||
else:
|
||||
# If no position info, add as text without position
|
||||
bboxes.append({"text": section})
|
||||
@ -291,7 +309,30 @@ class Parser(ProcessBase):
|
||||
bboxes = []
|
||||
for t, poss in lines:
|
||||
for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
|
||||
bboxes.append({"page_number": int(pn[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
|
||||
bboxes.append(
|
||||
{
|
||||
"page_number": int(pn[0]),
|
||||
"x0": float(x0),
|
||||
"x1": float(x1),
|
||||
"top": float(top),
|
||||
"bottom": float(bott),
|
||||
"text": t,
|
||||
}
|
||||
)
|
||||
|
||||
for b in bboxes:
|
||||
text_val = b.get("text", "")
|
||||
has_text = isinstance(text_val, str) and text_val.strip()
|
||||
layout = b.get("layout_type")
|
||||
if layout == "figure" or (b.get("image") and not has_text):
|
||||
b["doc_type_kwd"] = "image"
|
||||
elif layout == "table":
|
||||
b["doc_type_kwd"] = "table"
|
||||
|
||||
table_ctx = conf.get("table_context_size", 0) or 0
|
||||
image_ctx = conf.get("image_context_size", 0) or 0
|
||||
if table_ctx or image_ctx:
|
||||
bboxes = attach_media_context(bboxes, table_ctx, image_ctx)
|
||||
|
||||
if conf.get("output_format") == "json":
|
||||
self.set_output("json", bboxes)
|
||||
@ -319,7 +360,7 @@ class Parser(ProcessBase):
|
||||
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||
tcadp_parser = TCADPParser(
|
||||
table_result_type=table_result_type,
|
||||
markdown_image_response_type=markdown_image_response_type
|
||||
markdown_image_response_type=markdown_image_response_type,
|
||||
)
|
||||
if not tcadp_parser.check_installation():
|
||||
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||
@ -337,7 +378,7 @@ class Parser(ProcessBase):
|
||||
callback=self.callback,
|
||||
file_type=file_type,
|
||||
file_start_page=1,
|
||||
file_end_page=1000
|
||||
file_end_page=1000,
|
||||
)
|
||||
|
||||
# Process TCADP parser output based on configured output_format
|
||||
@ -365,7 +406,12 @@ class Parser(ProcessBase):
|
||||
# Add tables as text
|
||||
for table in tables:
|
||||
if table:
|
||||
result.append({"text": table})
|
||||
result.append({"text": table, "doc_type_kwd": "table"})
|
||||
|
||||
table_ctx = conf.get("table_context_size", 0) or 0
|
||||
image_ctx = conf.get("image_context_size", 0) or 0
|
||||
if table_ctx or image_ctx:
|
||||
result = attach_media_context(result, table_ctx, image_ctx)
|
||||
|
||||
self.set_output("json", result)
|
||||
|
||||
@ -400,7 +446,13 @@ class Parser(ProcessBase):
|
||||
if conf.get("output_format") == "json":
|
||||
sections, tbls = docx_parser(name, binary=blob)
|
||||
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
|
||||
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
|
||||
sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
|
||||
|
||||
table_ctx = conf.get("table_context_size", 0) or 0
|
||||
image_ctx = conf.get("image_context_size", 0) or 0
|
||||
if table_ctx or image_ctx:
|
||||
sections = attach_media_context(sections, table_ctx, image_ctx)
|
||||
|
||||
self.set_output("json", sections)
|
||||
elif conf.get("output_format") == "markdown":
|
||||
markdown_text = docx_parser.to_markdown(name, binary=blob)
|
||||
@ -420,7 +472,7 @@ class Parser(ProcessBase):
|
||||
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||
tcadp_parser = TCADPParser(
|
||||
table_result_type=table_result_type,
|
||||
markdown_image_response_type=markdown_image_response_type
|
||||
markdown_image_response_type=markdown_image_response_type,
|
||||
)
|
||||
if not tcadp_parser.check_installation():
|
||||
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||
@ -439,7 +491,7 @@ class Parser(ProcessBase):
|
||||
callback=self.callback,
|
||||
file_type=file_type,
|
||||
file_start_page=1,
|
||||
file_end_page=1000
|
||||
file_end_page=1000,
|
||||
)
|
||||
|
||||
# Process TCADP parser output - PPT only supports json format
|
||||
@ -454,7 +506,12 @@ class Parser(ProcessBase):
|
||||
# Add tables as text
|
||||
for table in tables:
|
||||
if table:
|
||||
result.append({"text": table})
|
||||
result.append({"text": table, "doc_type_kwd": "table"})
|
||||
|
||||
table_ctx = conf.get("table_context_size", 0) or 0
|
||||
image_ctx = conf.get("image_context_size", 0) or 0
|
||||
if table_ctx or image_ctx:
|
||||
result = attach_media_context(result, table_ctx, image_ctx)
|
||||
|
||||
self.set_output("json", result)
|
||||
else:
|
||||
@ -469,6 +526,10 @@ class Parser(ProcessBase):
|
||||
# json
|
||||
assert conf.get("output_format") == "json", "have to be json for ppt"
|
||||
if conf.get("output_format") == "json":
|
||||
table_ctx = conf.get("table_context_size", 0) or 0
|
||||
image_ctx = conf.get("image_context_size", 0) or 0
|
||||
if table_ctx or image_ctx:
|
||||
sections = attach_media_context(sections, table_ctx, image_ctx)
|
||||
self.set_output("json", sections)
|
||||
|
||||
def _markdown(self, name, blob):
|
||||
@ -508,11 +569,15 @@ class Parser(ProcessBase):
|
||||
|
||||
json_results.append(json_result)
|
||||
|
||||
table_ctx = conf.get("table_context_size", 0) or 0
|
||||
image_ctx = conf.get("image_context_size", 0) or 0
|
||||
if table_ctx or image_ctx:
|
||||
json_results = attach_media_context(json_results, table_ctx, image_ctx)
|
||||
|
||||
self.set_output("json", json_results)
|
||||
else:
|
||||
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
|
||||
|
||||
|
||||
def _image(self, name, blob):
|
||||
from deepdoc.vision import OCR
|
||||
|
||||
@ -588,7 +653,7 @@ class Parser(ProcessBase):
|
||||
from email.parser import BytesParser
|
||||
|
||||
msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
|
||||
email_content['metadata'] = {}
|
||||
email_content["metadata"] = {}
|
||||
# handle header info
|
||||
for header, value in msg.items():
|
||||
# get fields like from, to, cc, bcc, date, subject
|
||||
@ -600,6 +665,7 @@ class Parser(ProcessBase):
|
||||
# get body
|
||||
if "body" in target_fields:
|
||||
body_text, body_html = [], []
|
||||
|
||||
def _add_content(m, content_type):
|
||||
def _decode_payload(payload, charset, target_list):
|
||||
try:
|
||||
@ -641,14 +707,17 @@ class Parser(ProcessBase):
|
||||
if dispositions[0].lower() == "attachment":
|
||||
filename = part.get_filename()
|
||||
payload = part.get_payload(decode=True).decode(part.get_content_charset())
|
||||
attachments.append({
|
||||
"filename": filename,
|
||||
"payload": payload,
|
||||
})
|
||||
attachments.append(
|
||||
{
|
||||
"filename": filename,
|
||||
"payload": payload,
|
||||
}
|
||||
)
|
||||
email_content["attachments"] = attachments
|
||||
else:
|
||||
# handle msg file
|
||||
import extract_msg
|
||||
|
||||
print("handle a msg file.")
|
||||
msg = extract_msg.Message(blob)
|
||||
# handle header info
|
||||
@ -662,9 +731,9 @@ class Parser(ProcessBase):
|
||||
}
|
||||
email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
|
||||
# get metadata
|
||||
email_content['metadata'] = {
|
||||
'message_id': msg.messageId,
|
||||
'in_reply_to': msg.inReplyTo,
|
||||
email_content["metadata"] = {
|
||||
"message_id": msg.messageId,
|
||||
"in_reply_to": msg.inReplyTo,
|
||||
}
|
||||
# get body
|
||||
if "body" in target_fields:
|
||||
@ -675,29 +744,31 @@ class Parser(ProcessBase):
|
||||
if "attachments" in target_fields:
|
||||
attachments = []
|
||||
for t in msg.attachments:
|
||||
attachments.append({
|
||||
"filename": t.name,
|
||||
"payload": t.data.decode("utf-8")
|
||||
})
|
||||
attachments.append(
|
||||
{
|
||||
"filename": t.name,
|
||||
"payload": t.data.decode("utf-8"),
|
||||
}
|
||||
)
|
||||
email_content["attachments"] = attachments
|
||||
|
||||
if conf["output_format"] == "json":
|
||||
self.set_output("json", [email_content])
|
||||
else:
|
||||
content_txt = ''
|
||||
content_txt = ""
|
||||
for k, v in email_content.items():
|
||||
if isinstance(v, str):
|
||||
# basic info
|
||||
content_txt += f'{k}:{v}' + "\n"
|
||||
content_txt += f"{k}:{v}" + "\n"
|
||||
elif isinstance(v, dict):
|
||||
# metadata
|
||||
content_txt += f'{k}:{json.dumps(v)}' + "\n"
|
||||
content_txt += f"{k}:{json.dumps(v)}" + "\n"
|
||||
elif isinstance(v, list):
|
||||
# attachments or others
|
||||
for fb in v:
|
||||
if isinstance(fb, dict):
|
||||
# attachments
|
||||
content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n"
|
||||
content_txt += f"{fb['filename']}:{fb['payload']}" + "\n"
|
||||
else:
|
||||
# str, usually plain text
|
||||
content_txt += fb
|
||||
|
||||
Reference in New Issue
Block a user