mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)
### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
This commit is contained in:
@ -13,20 +13,28 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import io
|
||||
import logging
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from functools import partial
|
||||
|
||||
import trio
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from api.db import LLMType
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.db.services.file_service import FileService
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.utils import get_uuid
|
||||
from api.utils.base64_image import image2id
|
||||
from deepdoc.parser import ExcelParser
|
||||
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
|
||||
from rag.app.naive import Docx
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.parser.schema import ParserFromUpstream
|
||||
from rag.llm.cv_model import Base as VLM
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
|
||||
|
||||
class ParserParam(ProcessParamBase):
|
||||
@ -45,12 +53,14 @@ class ParserParam(ProcessParamBase):
|
||||
"word": [
|
||||
"json",
|
||||
],
|
||||
"ppt": [],
|
||||
"slides": [
|
||||
"json",
|
||||
],
|
||||
"image": [
|
||||
"text"
|
||||
],
|
||||
"email": [],
|
||||
"text": [
|
||||
"email": ["text", "json"],
|
||||
"text&markdown": [
|
||||
"text",
|
||||
"json"
|
||||
],
|
||||
@ -63,7 +73,6 @@ class ParserParam(ProcessParamBase):
|
||||
self.setups = {
|
||||
"pdf": {
|
||||
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
|
||||
"llm_id": "",
|
||||
"lang": "Chinese",
|
||||
"suffix": [
|
||||
"pdf",
|
||||
@ -85,23 +94,29 @@ class ParserParam(ProcessParamBase):
|
||||
],
|
||||
"output_format": "json",
|
||||
},
|
||||
"markdown": {
|
||||
"suffix": ["md", "markdown"],
|
||||
"text&markdown": {
|
||||
"suffix": ["md", "markdown", "mdx", "txt"],
|
||||
"output_format": "json",
|
||||
},
|
||||
"slides": {
|
||||
"suffix": [
|
||||
"pptx",
|
||||
],
|
||||
"output_format": "json",
|
||||
},
|
||||
"ppt": {},
|
||||
"image": {
|
||||
"parse_method": ["ocr", "vlm"],
|
||||
"parse_method": "ocr",
|
||||
"llm_id": "",
|
||||
"lang": "Chinese",
|
||||
"system_prompt": "",
|
||||
"suffix": ["jpg", "jpeg", "png", "gif"],
|
||||
"output_format": "json",
|
||||
"output_format": "text",
|
||||
},
|
||||
"email": {},
|
||||
"text": {
|
||||
"email": {
|
||||
"suffix": [
|
||||
"txt"
|
||||
"eml", "msg"
|
||||
],
|
||||
"fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
|
||||
"output_format": "json",
|
||||
},
|
||||
"audio": {
|
||||
@ -131,13 +146,10 @@ class ParserParam(ProcessParamBase):
|
||||
pdf_config = self.setups.get("pdf", {})
|
||||
if pdf_config:
|
||||
pdf_parse_method = pdf_config.get("parse_method", "")
|
||||
self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])
|
||||
self.check_empty(pdf_parse_method, "Parse method abnormal.")
|
||||
|
||||
if pdf_parse_method not in ["deepdoc", "plain_text"]:
|
||||
self.check_empty(pdf_config.get("llm_id"), "VLM")
|
||||
|
||||
pdf_language = pdf_config.get("lang", "")
|
||||
self.check_empty(pdf_language, "Language")
|
||||
if pdf_parse_method.lower() not in ["deepdoc", "plain_text"]:
|
||||
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
|
||||
|
||||
pdf_output_format = pdf_config.get("output_format", "")
|
||||
self.check_valid_value(pdf_output_format, "PDF output format abnormal.", self.allowed_output_format["pdf"])
|
||||
@ -147,32 +159,38 @@ class ParserParam(ProcessParamBase):
|
||||
spreadsheet_output_format = spreadsheet_config.get("output_format", "")
|
||||
self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])
|
||||
|
||||
doc_config = self.setups.get("doc", "")
|
||||
doc_config = self.setups.get("word", "")
|
||||
if doc_config:
|
||||
doc_output_format = doc_config.get("output_format", "")
|
||||
self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["doc"])
|
||||
self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["word"])
|
||||
|
||||
slides_config = self.setups.get("slides", "")
|
||||
if slides_config:
|
||||
slides_output_format = slides_config.get("output_format", "")
|
||||
self.check_valid_value(slides_output_format, "Slides output format abnormal.", self.allowed_output_format["slides"])
|
||||
|
||||
image_config = self.setups.get("image", "")
|
||||
if image_config:
|
||||
image_parse_method = image_config.get("parse_method", "")
|
||||
self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr", "vlm"])
|
||||
if image_parse_method not in ["ocr"]:
|
||||
self.check_empty(image_config.get("llm_id"), "VLM")
|
||||
self.check_empty(image_config.get("lang", ""), "Image VLM language")
|
||||
|
||||
image_language = image_config.get("lang", "")
|
||||
self.check_empty(image_language, "Language")
|
||||
|
||||
text_config = self.setups.get("text", "")
|
||||
text_config = self.setups.get("text&markdown", "")
|
||||
if text_config:
|
||||
text_output_format = text_config.get("output_format", "")
|
||||
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
|
||||
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"])
|
||||
|
||||
audio_config = self.setups.get("audio", "")
|
||||
if audio_config:
|
||||
self.check_empty(audio_config.get("llm_id"), "VLM")
|
||||
self.check_empty(audio_config.get("llm_id"), "Audio VLM")
|
||||
audio_language = audio_config.get("lang", "")
|
||||
self.check_empty(audio_language, "Language")
|
||||
|
||||
email_config = self.setups.get("email", "")
|
||||
if email_config:
|
||||
email_output_format = email_config.get("output_format", "")
|
||||
self.check_valid_value(email_output_format, "Email output format abnormal.", self.allowed_output_format["email"])
|
||||
|
||||
def get_input_form(self) -> dict[str, dict]:
|
||||
return {}
|
||||
|
||||
@ -180,21 +198,18 @@ class ParserParam(ProcessParamBase):
|
||||
class Parser(ProcessBase):
|
||||
component_name = "Parser"
|
||||
|
||||
def _pdf(self, from_upstream: ParserFromUpstream):
|
||||
def _pdf(self, name, blob):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
|
||||
|
||||
blob = from_upstream.blob
|
||||
conf = self._param.setups["pdf"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
if conf.get("parse_method") == "deepdoc":
|
||||
if conf.get("parse_method").lower() == "deepdoc":
|
||||
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
|
||||
elif conf.get("parse_method") == "plain_text":
|
||||
elif conf.get("parse_method").lower() == "plain_text":
|
||||
lines, _ = PlainParser()(blob)
|
||||
bboxes = [{"text": t} for t, _ in lines]
|
||||
else:
|
||||
assert conf.get("llm_id")
|
||||
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("llm_id"), lang=self._param.setups["pdf"].get("lang"))
|
||||
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
|
||||
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
|
||||
bboxes = []
|
||||
for t, poss in lines:
|
||||
@ -214,66 +229,63 @@ class Parser(ProcessBase):
|
||||
mkdn += b.get("text", "") + "\n"
|
||||
self.set_output("markdown", mkdn)
|
||||
|
||||
def _spreadsheet(self, from_upstream: ParserFromUpstream):
|
||||
def _spreadsheet(self, name, blob):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
|
||||
|
||||
blob = from_upstream.blob
|
||||
conf = self._param.setups["spreadsheet"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
print("spreadsheet {conf=}", flush=True)
|
||||
spreadsheet_parser = ExcelParser()
|
||||
if conf.get("output_format") == "html":
|
||||
html = spreadsheet_parser.html(blob, 1000000000)
|
||||
self.set_output("html", html)
|
||||
htmls = spreadsheet_parser.html(blob, 1000000000)
|
||||
self.set_output("html", htmls[0])
|
||||
elif conf.get("output_format") == "json":
|
||||
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
|
||||
elif conf.get("output_format") == "markdown":
|
||||
self.set_output("markdown", spreadsheet_parser.markdown(blob))
|
||||
|
||||
def _word(self, from_upstream: ParserFromUpstream):
|
||||
from tika import parser as word_parser
|
||||
|
||||
def _word(self, name, blob):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
|
||||
|
||||
blob = from_upstream.blob
|
||||
name = from_upstream.name
|
||||
conf = self._param.setups["word"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
print("word {conf=}", flush=True)
|
||||
doc_parsed = word_parser.from_buffer(blob)
|
||||
|
||||
sections = []
|
||||
if doc_parsed.get("content"):
|
||||
sections = doc_parsed["content"].split("\n")
|
||||
sections = [{"text": section} for section in sections if section]
|
||||
else:
|
||||
logging.warning(f"tika.parser got empty content from {name}.")
|
||||
|
||||
docx_parser = Docx()
|
||||
sections, tbls = docx_parser(name, binary=blob)
|
||||
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
|
||||
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
|
||||
# json
|
||||
assert conf.get("output_format") == "json", "have to be json for doc"
|
||||
if conf.get("output_format") == "json":
|
||||
self.set_output("json", sections)
|
||||
|
||||
def _markdown(self, from_upstream: ParserFromUpstream):
|
||||
def _slides(self, name, blob):
|
||||
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
|
||||
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
|
||||
|
||||
conf = self._param.setups["slides"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
ppt_parser = ppt_parser()
|
||||
txts = ppt_parser(blob, 0, 100000, None)
|
||||
|
||||
sections = [{"text": section} for section in txts if section.strip()]
|
||||
|
||||
# json
|
||||
assert conf.get("output_format") == "json", "have to be json for ppt"
|
||||
if conf.get("output_format") == "json":
|
||||
self.set_output("json", sections)
|
||||
|
||||
def _markdown(self, name, blob):
|
||||
from functools import reduce
|
||||
|
||||
from rag.app.naive import Markdown as naive_markdown_parser
|
||||
from rag.nlp import concat_img
|
||||
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
|
||||
|
||||
blob = from_upstream.blob
|
||||
name = from_upstream.name
|
||||
conf = self._param.setups["markdown"]
|
||||
conf = self._param.setups["text&markdown"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
markdown_parser = naive_markdown_parser()
|
||||
sections, tables = markdown_parser(name, blob, separate_tables=False)
|
||||
|
||||
# json
|
||||
assert conf.get("output_format") == "json", "have to be json for doc"
|
||||
if conf.get("output_format") == "json":
|
||||
json_results = []
|
||||
|
||||
@ -291,69 +303,51 @@ class Parser(ProcessBase):
|
||||
json_results.append(json_result)
|
||||
|
||||
self.set_output("json", json_results)
|
||||
|
||||
def _text(self, from_upstream: ParserFromUpstream):
|
||||
from deepdoc.parser.utils import get_text
|
||||
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a text.")
|
||||
|
||||
blob = from_upstream.blob
|
||||
name = from_upstream.name
|
||||
conf = self._param.setups["text"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
# parse binary to text
|
||||
text_content = get_text(name, binary=blob)
|
||||
|
||||
if conf.get("output_format") == "json":
|
||||
result = [{"text": text_content}]
|
||||
self.set_output("json", result)
|
||||
else:
|
||||
result = text_content
|
||||
self.set_output("text", result)
|
||||
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
|
||||
|
||||
def _image(self, from_upstream: ParserFromUpstream):
|
||||
|
||||
def _image(self, name, blob):
|
||||
from deepdoc.vision import OCR
|
||||
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.")
|
||||
|
||||
blob = from_upstream.blob
|
||||
conf = self._param.setups["image"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
img = Image.open(io.BytesIO(blob)).convert("RGB")
|
||||
lang = conf["lang"]
|
||||
|
||||
if conf["parse_method"] == "ocr":
|
||||
# use ocr, recognize chars only
|
||||
ocr = OCR()
|
||||
bxs = ocr(np.array(img)) # return boxes and recognize result
|
||||
txt = "\n".join([t[0] for _, t in bxs if t[0]])
|
||||
|
||||
else:
|
||||
lang = conf["lang"]
|
||||
# use VLM to describe the picture
|
||||
cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"],lang=lang)
|
||||
cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["parse_method"], lang=lang)
|
||||
img_binary = io.BytesIO()
|
||||
img.save(img_binary, format="JPEG")
|
||||
img_binary.seek(0)
|
||||
txt = cv_model.describe(img_binary.read())
|
||||
|
||||
system_prompt = conf.get("system_prompt")
|
||||
if system_prompt:
|
||||
txt = cv_model.describe_with_prompt(img_binary.read(), system_prompt)
|
||||
else:
|
||||
txt = cv_model.describe(img_binary.read())
|
||||
|
||||
self.set_output("text", txt)
|
||||
|
||||
def _audio(self, from_upstream: ParserFromUpstream):
|
||||
def _audio(self, name, blob):
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")
|
||||
|
||||
blob = from_upstream.blob
|
||||
name = from_upstream.name
|
||||
conf = self._param.setups["audio"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
lang = conf["lang"]
|
||||
_, ext = os.path.splitext(name)
|
||||
tmp_path = ""
|
||||
with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
|
||||
tmpf.write(blob)
|
||||
tmpf.flush()
|
||||
@ -364,15 +358,131 @@ class Parser(ProcessBase):
|
||||
|
||||
self.set_output("text", txt)
|
||||
|
||||
def _email(self, name, blob):
|
||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.")
|
||||
|
||||
email_content = {}
|
||||
conf = self._param.setups["email"]
|
||||
target_fields = conf["fields"]
|
||||
|
||||
_, ext = os.path.splitext(name)
|
||||
if ext == ".eml":
|
||||
# handle eml file
|
||||
from email import policy
|
||||
from email.parser import BytesParser
|
||||
|
||||
msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
|
||||
email_content['metadata'] = {}
|
||||
# handle header info
|
||||
for header, value in msg.items():
|
||||
# get fields like from, to, cc, bcc, date, subject
|
||||
if header.lower() in target_fields:
|
||||
email_content[header.lower()] = value
|
||||
# get metadata
|
||||
elif header.lower() not in ["from", "to", "cc", "bcc", "date", "subject"]:
|
||||
email_content["metadata"][header.lower()] = value
|
||||
# get body
|
||||
if "body" in target_fields:
|
||||
body_text, body_html = [], []
|
||||
def _add_content(m, content_type):
|
||||
if content_type == "text/plain":
|
||||
body_text.append(
|
||||
m.get_payload(decode=True).decode(m.get_content_charset())
|
||||
)
|
||||
elif content_type == "text/html":
|
||||
body_html.append(
|
||||
m.get_payload(decode=True).decode(m.get_content_charset())
|
||||
)
|
||||
elif "multipart" in content_type:
|
||||
if m.is_multipart():
|
||||
for part in m.iter_parts():
|
||||
_add_content(part, part.get_content_type())
|
||||
|
||||
_add_content(msg, msg.get_content_type())
|
||||
|
||||
email_content["text"] = body_text
|
||||
email_content["text_html"] = body_html
|
||||
# get attachment
|
||||
if "attachments" in target_fields:
|
||||
attachments = []
|
||||
for part in msg.iter_attachments():
|
||||
content_disposition = part.get("Content-Disposition")
|
||||
if content_disposition:
|
||||
dispositions = content_disposition.strip().split(";")
|
||||
if dispositions[0].lower() == "attachment":
|
||||
filename = part.get_filename()
|
||||
payload = part.get_payload(decode=True)
|
||||
attachments.append({
|
||||
"filename": filename,
|
||||
"payload": payload,
|
||||
})
|
||||
email_content["attachments"] = attachments
|
||||
else:
|
||||
# handle msg file
|
||||
import extract_msg
|
||||
print("handle a msg file.")
|
||||
msg = extract_msg.Message(blob)
|
||||
# handle header info
|
||||
basic_content = {
|
||||
"from": msg.sender,
|
||||
"to": msg.to,
|
||||
"cc": msg.cc,
|
||||
"bcc": msg.bcc,
|
||||
"date": msg.date,
|
||||
"subject": msg.subject,
|
||||
}
|
||||
email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
|
||||
# get metadata
|
||||
email_content['metadata'] = {
|
||||
'message_id': msg.messageId,
|
||||
'in_reply_to': msg.inReplyTo,
|
||||
}
|
||||
# get body
|
||||
if "body" in target_fields:
|
||||
email_content["text"] = msg.body # usually empty. try text_html instead
|
||||
email_content["text_html"] = msg.htmlBody
|
||||
# get attachments
|
||||
if "attachments" in target_fields:
|
||||
attachments = []
|
||||
for t in msg.attachments:
|
||||
attachments.append({
|
||||
"filename": t.name,
|
||||
"payload": t.data # binary
|
||||
})
|
||||
email_content["attachments"] = attachments
|
||||
|
||||
if conf["output_format"] == "json":
|
||||
self.set_output("json", [email_content])
|
||||
else:
|
||||
content_txt = ''
|
||||
for k, v in email_content.items():
|
||||
if isinstance(v, str):
|
||||
# basic info
|
||||
content_txt += f'{k}:{v}' + "\n"
|
||||
elif isinstance(v, dict):
|
||||
# metadata
|
||||
content_txt += f'{k}:{json.dumps(v)}' + "\n"
|
||||
elif isinstance(v, list):
|
||||
# attachments or others
|
||||
for fb in v:
|
||||
if isinstance(fb, dict):
|
||||
# attachments
|
||||
content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n"
|
||||
else:
|
||||
# str, usually plain text
|
||||
content_txt += fb
|
||||
self.set_output("text", content_txt)
|
||||
|
||||
async def _invoke(self, **kwargs):
|
||||
function_map = {
|
||||
"pdf": self._pdf,
|
||||
"markdown": self._markdown,
|
||||
"text&markdown": self._markdown,
|
||||
"spreadsheet": self._spreadsheet,
|
||||
"slides": self._slides,
|
||||
"word": self._word,
|
||||
"text": self._text,
|
||||
"image": self._image,
|
||||
"audio": self._audio,
|
||||
"email": self._email,
|
||||
}
|
||||
try:
|
||||
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
||||
@ -380,8 +490,25 @@ class Parser(ProcessBase):
|
||||
self.set_output("_ERROR", f"Input error: {str(e)}")
|
||||
return
|
||||
|
||||
name = from_upstream.name
|
||||
if self._canvas._doc_id:
|
||||
b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
|
||||
blob = STORAGE_IMPL.get(b, n)
|
||||
else:
|
||||
blob = FileService.get_blob(from_upstream.file["created_by"], from_upstream.file["id"])
|
||||
|
||||
done = False
|
||||
for p_type, conf in self._param.setups.items():
|
||||
if from_upstream.name.split(".")[-1].lower() not in conf.get("suffix", []):
|
||||
continue
|
||||
await trio.to_thread.run_sync(function_map[p_type], from_upstream)
|
||||
await trio.to_thread.run_sync(function_map[p_type], name, blob)
|
||||
done = True
|
||||
break
|
||||
|
||||
if not done:
|
||||
raise Exception("No suitable for file extension: `.%s`" % from_upstream.name.split(".")[-1].lower())
|
||||
|
||||
outs = self.output()
|
||||
async with trio.open_nursery() as nursery:
|
||||
for d in outs.get("json", []):
|
||||
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
|
||||
|
||||
@ -20,6 +20,5 @@ class ParserFromUpstream(BaseModel):
|
||||
elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
|
||||
|
||||
name: str
|
||||
blob: bytes
|
||||
|
||||
file: dict | None = Field(default=None)
|
||||
model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
||||
|
||||
Reference in New Issue
Block a user