Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
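
Reviewer aid: a minimal sketch of the per-file-type `setups` payload that the new Parser component validates, reconstructed from the diff below. Keys and allowed values are taken from the code; the concrete values are illustrative, not authoritative defaults.

```python
# Illustrative Parser "setups" configuration (a subset of the supported types).
setups = {
    "pdf": {
        "parse_method": "deepdoc",  # "deepdoc" | "plain_text" | a VLM model name
        "lang": "Chinese",
        "suffix": ["pdf"],
        "output_format": "json",
    },
    "text&markdown": {
        "suffix": ["md", "markdown", "mdx", "txt"],
        "output_format": "json",  # or "text"
    },
    "image": {
        "parse_method": "ocr",  # "ocr" | a VLM model name
        "lang": "Chinese",
        "system_prompt": "",
        "suffix": ["jpg", "jpeg", "png", "gif"],
        "output_format": "text",
    },
    "email": {
        "suffix": ["eml", "msg"],
        "fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
        "output_format": "json",  # or "text"
    },
}
```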

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
Kevin Hu authored on 2025-10-09 12:36:19 +08:00, committed by GitHub
parent ef0aecea3b
commit cbf04ee470
490 changed files with 10630 additions and 30688 deletions


@@ -13,20 +13,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import logging
import json
import os
import random
from functools import partial
import trio
import numpy as np
from PIL import Image
from api.db import LLMType
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.llm_service import LLMBundle
from api.utils import get_uuid
from api.utils.base64_image import image2id
from deepdoc.parser import ExcelParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream
from rag.llm.cv_model import Base as VLM
from rag.utils.storage_factory import STORAGE_IMPL
class ParserParam(ProcessParamBase):
@@ -45,12 +53,14 @@ class ParserParam(ProcessParamBase):
"word": [
"json",
],
"ppt": [],
"slides": [
"json",
],
"image": [
"text"
],
"email": [],
"text": [
"email": ["text", "json"],
"text&markdown": [
"text",
"json"
],
@@ -63,7 +73,6 @@
self.setups = {
"pdf": {
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
"llm_id": "",
"lang": "Chinese",
"suffix": [
"pdf",
@@ -85,23 +94,29 @@
],
"output_format": "json",
},
"markdown": {
"suffix": ["md", "markdown"],
"text&markdown": {
"suffix": ["md", "markdown", "mdx", "txt"],
"output_format": "json",
},
"slides": {
"suffix": [
"pptx",
],
"output_format": "json",
},
"ppt": {},
"image": {
"parse_method": ["ocr", "vlm"],
"parse_method": "ocr",
"llm_id": "",
"lang": "Chinese",
"system_prompt": "",
"suffix": ["jpg", "jpeg", "png", "gif"],
"output_format": "json",
"output_format": "text",
},
"email": {},
"text": {
"email": {
"suffix": [
"txt"
"eml", "msg"
],
"fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
"output_format": "json",
},
"audio": {
@@ -131,13 +146,10 @@ class ParserParam(ProcessParamBase):
pdf_config = self.setups.get("pdf", {})
if pdf_config:
pdf_parse_method = pdf_config.get("parse_method", "")
self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])
self.check_empty(pdf_parse_method, "Parse method abnormal.")
if pdf_parse_method not in ["deepdoc", "plain_text"]:
self.check_empty(pdf_config.get("llm_id"), "VLM")
pdf_language = pdf_config.get("lang", "")
self.check_empty(pdf_language, "Language")
if pdf_parse_method.lower() not in ["deepdoc", "plain_text"]:
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
pdf_output_format = pdf_config.get("output_format", "")
self.check_valid_value(pdf_output_format, "PDF output format abnormal.", self.allowed_output_format["pdf"])
@@ -147,32 +159,38 @@ class ParserParam(ProcessParamBase):
spreadsheet_output_format = spreadsheet_config.get("output_format", "")
self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])
doc_config = self.setups.get("doc", "")
doc_config = self.setups.get("word", "")
if doc_config:
doc_output_format = doc_config.get("output_format", "")
self.check_valid_value(doc_output_format, "Word processor document output format abnormal.", self.allowed_output_format["doc"])
self.check_valid_value(doc_output_format, "Word processor document output format abnormal.", self.allowed_output_format["word"])
slides_config = self.setups.get("slides", "")
if slides_config:
slides_output_format = slides_config.get("output_format", "")
self.check_valid_value(slides_output_format, "Slides output format abnormal.", self.allowed_output_format["slides"])
image_config = self.setups.get("image", "")
if image_config:
image_parse_method = image_config.get("parse_method", "")
self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr", "vlm"])
if image_parse_method not in ["ocr"]:
self.check_empty(image_config.get("llm_id"), "VLM")
self.check_empty(image_config.get("lang", ""), "Image VLM language")
image_language = image_config.get("lang", "")
self.check_empty(image_language, "Language")
text_config = self.setups.get("text", "")
text_config = self.setups.get("text&markdown", "")
if text_config:
text_output_format = text_config.get("output_format", "")
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"])
audio_config = self.setups.get("audio", "")
if audio_config:
self.check_empty(audio_config.get("llm_id"), "VLM")
self.check_empty(audio_config.get("llm_id"), "Audio VLM")
audio_language = audio_config.get("lang", "")
self.check_empty(audio_language, "Language")
email_config = self.setups.get("email", "")
if email_config:
email_output_format = email_config.get("output_format", "")
self.check_valid_value(email_output_format, "Email output format abnormal.", self.allowed_output_format["email"])
def get_input_form(self) -> dict[str, dict]:
return {}
@@ -180,21 +198,18 @@ class ParserParam(ProcessParamBase):
class Parser(ProcessBase):
component_name = "Parser"
def _pdf(self, from_upstream: ParserFromUpstream):
def _pdf(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
blob = from_upstream.blob
conf = self._param.setups["pdf"]
self.set_output("output_format", conf["output_format"])
if conf.get("parse_method") == "deepdoc":
if conf.get("parse_method").lower() == "deepdoc":
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
elif conf.get("parse_method") == "plain_text":
elif conf.get("parse_method").lower() == "plain_text":
lines, _ = PlainParser()(blob)
bboxes = [{"text": t} for t, _ in lines]
else:
assert conf.get("llm_id")
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("llm_id"), lang=self._param.setups["pdf"].get("lang"))
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
bboxes = []
for t, poss in lines:
@@ -214,66 +229,63 @@ class Parser(ProcessBase):
mkdn += b.get("text", "") + "\n"
self.set_output("markdown", mkdn)
def _spreadsheet(self, from_upstream: ParserFromUpstream):
def _spreadsheet(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
blob = from_upstream.blob
conf = self._param.setups["spreadsheet"]
self.set_output("output_format", conf["output_format"])
print("spreadsheet {conf=}", flush=True)
spreadsheet_parser = ExcelParser()
if conf.get("output_format") == "html":
html = spreadsheet_parser.html(blob, 1000000000)
self.set_output("html", html)
htmls = spreadsheet_parser.html(blob, 1000000000)
self.set_output("html", htmls[0])
elif conf.get("output_format") == "json":
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
elif conf.get("output_format") == "markdown":
self.set_output("markdown", spreadsheet_parser.markdown(blob))
def _word(self, from_upstream: ParserFromUpstream):
from tika import parser as word_parser
def _word(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
blob = from_upstream.blob
name = from_upstream.name
conf = self._param.setups["word"]
self.set_output("output_format", conf["output_format"])
print("word {conf=}", flush=True)
doc_parsed = word_parser.from_buffer(blob)
sections = []
if doc_parsed.get("content"):
sections = doc_parsed["content"].split("\n")
sections = [{"text": section} for section in sections if section]
else:
logging.warning(f"tika.parser got empty content from {name}.")
docx_parser = Docx()
sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
# json
assert conf.get("output_format") == "json", "output_format has to be json for doc"
if conf.get("output_format") == "json":
self.set_output("json", sections)
def _markdown(self, from_upstream: ParserFromUpstream):
def _slides(self, name, blob):
from deepdoc.parser.ppt_parser import RAGFlowPptParser
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document.")
conf = self._param.setups["slides"]
self.set_output("output_format", conf["output_format"])
ppt_parser = RAGFlowPptParser()
txts = ppt_parser(blob, 0, 100000, None)
sections = [{"text": section} for section in txts if section.strip()]
# json
assert conf.get("output_format") == "json", "output_format has to be json for ppt"
if conf.get("output_format") == "json":
self.set_output("json", sections)
def _markdown(self, name, blob):
from functools import reduce
from rag.app.naive import Markdown as naive_markdown_parser
from rag.nlp import concat_img
self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
blob = from_upstream.blob
name = from_upstream.name
conf = self._param.setups["markdown"]
conf = self._param.setups["text&markdown"]
self.set_output("output_format", conf["output_format"])
markdown_parser = naive_markdown_parser()
sections, tables = markdown_parser(name, blob, separate_tables=False)
# json
assert conf.get("output_format") == "json", "output_format has to be json for markdown"
if conf.get("output_format") == "json":
json_results = []
@@ -291,69 +303,51 @@ class Parser(ProcessBase):
json_results.append(json_result)
self.set_output("json", json_results)
def _text(self, from_upstream: ParserFromUpstream):
from deepdoc.parser.utils import get_text
self.callback(random.randint(1, 5) / 100.0, "Start to work on a text.")
blob = from_upstream.blob
name = from_upstream.name
conf = self._param.setups["text"]
self.set_output("output_format", conf["output_format"])
# parse binary to text
text_content = get_text(name, binary=blob)
if conf.get("output_format") == "json":
result = [{"text": text_content}]
self.set_output("json", result)
else:
result = text_content
self.set_output("text", result)
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
def _image(self, from_upstream: ParserFromUpstream):
def _image(self, name, blob):
from deepdoc.vision import OCR
self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.")
blob = from_upstream.blob
conf = self._param.setups["image"]
self.set_output("output_format", conf["output_format"])
img = Image.open(io.BytesIO(blob)).convert("RGB")
lang = conf["lang"]
if conf["parse_method"] == "ocr":
# use ocr, recognize chars only
ocr = OCR()
bxs = ocr(np.array(img)) # return boxes and recognize result
txt = "\n".join([t[0] for _, t in bxs if t[0]])
else:
lang = conf["lang"]
# use VLM to describe the picture
cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"],lang=lang)
cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["parse_method"], lang=lang)
img_binary = io.BytesIO()
img.save(img_binary, format="JPEG")
img_binary.seek(0)
txt = cv_model.describe(img_binary.read())
system_prompt = conf.get("system_prompt")
if system_prompt:
txt = cv_model.describe_with_prompt(img_binary.read(), system_prompt)
else:
txt = cv_model.describe(img_binary.read())
self.set_output("text", txt)
def _audio(self, from_upstream: ParserFromUpstream):
def _audio(self, name, blob):
import os
import tempfile
self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")
blob = from_upstream.blob
name = from_upstream.name
conf = self._param.setups["audio"]
self.set_output("output_format", conf["output_format"])
lang = conf["lang"]
_, ext = os.path.splitext(name)
tmp_path = ""
with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
tmpf.write(blob)
tmpf.flush()
@@ -364,15 +358,131 @@ class Parser(ProcessBase):
self.set_output("text", txt)
def _email(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.")
email_content = {}
conf = self._param.setups["email"]
target_fields = conf["fields"]
_, ext = os.path.splitext(name)
if ext == ".eml":
# handle eml file
from email import policy
from email.parser import BytesParser
msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
email_content['metadata'] = {}
# handle header info
for header, value in msg.items():
# get fields like from, to, cc, bcc, date, subject
if header.lower() in target_fields:
email_content[header.lower()] = value
# get metadata
elif header.lower() not in ["from", "to", "cc", "bcc", "date", "subject"]:
email_content["metadata"][header.lower()] = value
# get body
if "body" in target_fields:
body_text, body_html = [], []
def _add_content(m, content_type):
if content_type == "text/plain":
body_text.append(
m.get_payload(decode=True).decode(m.get_content_charset() or "utf-8")
)
elif content_type == "text/html":
body_html.append(
m.get_payload(decode=True).decode(m.get_content_charset() or "utf-8")
)
elif "multipart" in content_type:
if m.is_multipart():
for part in m.iter_parts():
_add_content(part, part.get_content_type())
_add_content(msg, msg.get_content_type())
email_content["text"] = body_text
email_content["text_html"] = body_html
# get attachment
if "attachments" in target_fields:
attachments = []
for part in msg.iter_attachments():
content_disposition = part.get("Content-Disposition")
if content_disposition:
dispositions = content_disposition.strip().split(";")
if dispositions[0].lower() == "attachment":
filename = part.get_filename()
payload = part.get_payload(decode=True)
attachments.append({
"filename": filename,
"payload": payload,
})
email_content["attachments"] = attachments
else:
# handle msg file
import extract_msg
print("handle a msg file.")
msg = extract_msg.Message(blob)
# handle header info
basic_content = {
"from": msg.sender,
"to": msg.to,
"cc": msg.cc,
"bcc": msg.bcc,
"date": msg.date,
"subject": msg.subject,
}
email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
# get metadata
email_content['metadata'] = {
'message_id': msg.messageId,
'in_reply_to': msg.inReplyTo,
}
# get body
if "body" in target_fields:
email_content["text"] = msg.body # usually empty. try text_html instead
email_content["text_html"] = msg.htmlBody
# get attachments
if "attachments" in target_fields:
attachments = []
for t in msg.attachments:
attachments.append({
"filename": t.name,
"payload": t.data # binary
})
email_content["attachments"] = attachments
if conf["output_format"] == "json":
self.set_output("json", [email_content])
else:
content_txt = ''
for k, v in email_content.items():
if isinstance(v, str):
# basic info
content_txt += f'{k}:{v}' + "\n"
elif isinstance(v, dict):
# metadata
content_txt += f'{k}:{json.dumps(v)}' + "\n"
elif isinstance(v, list):
# attachments or others
for fb in v:
if isinstance(fb, dict):
# attachments
content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n"
else:
# str, usually plain text
content_txt += fb
self.set_output("text", content_txt)
async def _invoke(self, **kwargs):
function_map = {
"pdf": self._pdf,
"markdown": self._markdown,
"text&markdown": self._markdown,
"spreadsheet": self._spreadsheet,
"slides": self._slides,
"word": self._word,
"text": self._text,
"image": self._image,
"audio": self._audio,
"email": self._email,
}
try:
from_upstream = ParserFromUpstream.model_validate(kwargs)
@@ -380,8 +490,25 @@ class Parser(ProcessBase):
self.set_output("_ERROR", f"Input error: {str(e)}")
return
name = from_upstream.name
if self._canvas._doc_id:
b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
blob = STORAGE_IMPL.get(b, n)
else:
blob = FileService.get_blob(from_upstream.file["created_by"], from_upstream.file["id"])
done = False
for p_type, conf in self._param.setups.items():
if from_upstream.name.split(".")[-1].lower() not in conf.get("suffix", []):
continue
await trio.to_thread.run_sync(function_map[p_type], from_upstream)
await trio.to_thread.run_sync(function_map[p_type], name, blob)
done = True
break
if not done:
raise Exception("No suitable for file extension: `.%s`" % from_upstream.name.split(".")[-1].lower())
outs = self.output()
async with trio.open_nursery() as nursery:
for d in outs.get("json", []):
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())

rag/flow/parser/schema.py

@@ -20,6 +20,5 @@ class ParserFromUpstream(BaseModel):
elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
name: str
blob: bytes
file: dict | None = Field(default=None)
model_config = ConfigDict(populate_by_name=True, extra="forbid")
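
For context, a hedged sketch of the revised `ParserFromUpstream` model and how it is now populated (assumes pydantic v2, which the `ConfigDict`/`model_validate` usage implies; the `file` dict keys `id` and `created_by` mirror the `FileService.get_blob` call in the parser diff above):

```python
from pydantic import BaseModel, ConfigDict, Field

class ParserFromUpstream(BaseModel):
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
    name: str
    # `blob: bytes` is gone; the Parser now resolves bytes from storage itself.
    file: dict | None = Field(default=None)

    model_config = ConfigDict(populate_by_name=True, extra="forbid")

# Upstream components pass a file reference instead of raw bytes:
upstream = ParserFromUpstream.model_validate({
    "name": "report.pdf",
    "file": {"id": "<file-uuid>", "created_by": "<user-uuid>"},  # illustrative ids
})
```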