mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: Add TCADP parser for PPTX and spreadsheet document types. (#11041)
### What problem does this PR solve?

- Added TCADP Parser configuration fields to PDF, PPT, and spreadsheet parsing forms
- Implemented support for setting table result type (Markdown/HTML) and Markdown image response type (URL/Text)
- Updated TCADP Parser to handle return format settings from configuration or parameters
- Enhanced frontend to dynamically show TCADP options based on selected parsing method
- Modified backend to pass format parameters when calling TCADP API
- Optimized form default value logic for TCADP configuration items
- Updated multilingual resource files for new configuration options

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -147,5 +147,3 @@ user_default_llm:
|
|||||||
# secret_id: 'tencent_secret_id'
|
# secret_id: 'tencent_secret_id'
|
||||||
# secret_key: 'tencent_secret_key'
|
# secret_key: 'tencent_secret_key'
|
||||||
# region: 'tencent_region'
|
# region: 'tencent_region'
|
||||||
# table_result_type: '1'
|
|
||||||
# markdown_image_response_type: '1'
|
|
||||||
|
|||||||
@ -192,12 +192,16 @@ class TencentCloudAPIClient:
|
|||||||
|
|
||||||
|
|
||||||
class TCADPParser(RAGFlowPdfParser):
|
class TCADPParser(RAGFlowPdfParser):
|
||||||
def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou"):
|
def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou",
|
||||||
|
table_result_type: str = None, markdown_image_response_type: str = None):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
# First initialize logger
|
# First initialize logger
|
||||||
self.logger = logging.getLogger(self.__class__.__name__)
|
self.logger = logging.getLogger(self.__class__.__name__)
|
||||||
|
|
||||||
|
# Log received parameters
|
||||||
|
self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}")
|
||||||
|
|
||||||
# Priority: read configuration from RAGFlow configuration system (service_conf.yaml)
|
# Priority: read configuration from RAGFlow configuration system (service_conf.yaml)
|
||||||
try:
|
try:
|
||||||
tcadp_parser = get_base_config("tcadp_config", {})
|
tcadp_parser = get_base_config("tcadp_config", {})
|
||||||
@ -205,14 +209,30 @@ class TCADPParser(RAGFlowPdfParser):
|
|||||||
self.secret_id = secret_id or tcadp_parser.get("secret_id")
|
self.secret_id = secret_id or tcadp_parser.get("secret_id")
|
||||||
self.secret_key = secret_key or tcadp_parser.get("secret_key")
|
self.secret_key = secret_key or tcadp_parser.get("secret_key")
|
||||||
self.region = region or tcadp_parser.get("region", "ap-guangzhou")
|
self.region = region or tcadp_parser.get("region", "ap-guangzhou")
|
||||||
self.table_result_type = tcadp_parser.get("table_result_type", "1")
|
# Set table_result_type and markdown_image_response_type from config or parameters
|
||||||
self.markdown_image_response_type = tcadp_parser.get("markdown_image_response_type", "1")
|
self.table_result_type = table_result_type if table_result_type is not None else tcadp_parser.get("table_result_type", "1")
|
||||||
self.logger.info("[TCADP] Configuration read from service_conf.yaml")
|
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else tcadp_parser.get("markdown_image_response_type", "1")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
|
self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
|
||||||
|
# If config file is empty, use provided parameters or defaults
|
||||||
|
self.secret_id = secret_id
|
||||||
|
self.secret_key = secret_key
|
||||||
|
self.region = region or "ap-guangzhou"
|
||||||
|
self.table_result_type = table_result_type if table_result_type is not None else "1"
|
||||||
|
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"
|
||||||
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
self.logger.info("[TCADP] Configuration module import failed")
|
self.logger.info("[TCADP] Configuration module import failed")
|
||||||
|
# If config file is not available, use provided parameters or defaults
|
||||||
|
self.secret_id = secret_id
|
||||||
|
self.secret_key = secret_key
|
||||||
|
self.region = region or "ap-guangzhou"
|
||||||
|
self.table_result_type = table_result_type if table_result_type is not None else "1"
|
||||||
|
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"
|
||||||
|
|
||||||
|
# Log final values
|
||||||
|
self.logger.info(f"[TCADP] Final values - table_result_type: {self.table_result_type}, markdown_image_response_type: {self.markdown_image_response_type}")
|
||||||
|
|
||||||
if not self.secret_id or not self.secret_key:
|
if not self.secret_id or not self.secret_key:
|
||||||
raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")
|
raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")
|
||||||
@ -400,6 +420,8 @@ class TCADPParser(RAGFlowPdfParser):
|
|||||||
"TableResultType": self.table_result_type,
|
"TableResultType": self.table_result_type,
|
||||||
"MarkdownImageResponseType": self.markdown_image_response_type
|
"MarkdownImageResponseType": self.markdown_image_response_type
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}")
|
||||||
|
|
||||||
result = client.reconstruct_document_sse(
|
result = client.reconstruct_document_sse(
|
||||||
file_type=file_type,
|
file_type=file_type,
|
||||||
|
|||||||
@ -150,5 +150,3 @@ user_default_llm:
|
|||||||
# secret_id: '${TENCENT_SECRET_ID}'
|
# secret_id: '${TENCENT_SECRET_ID}'
|
||||||
# secret_key: '${TENCENT_SECRET_KEY}'
|
# secret_key: '${TENCENT_SECRET_KEY}'
|
||||||
# region: '${TENCENT_REGION}'
|
# region: '${TENCENT_REGION}'
|
||||||
# table_result_type: '1'
|
|
||||||
# markdown_image_response_type: '1'
|
|
||||||
|
|||||||
@ -116,7 +116,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No
|
|||||||
else:
|
else:
|
||||||
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
|
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
|
||||||
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
|
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
|
||||||
|
|
||||||
sections, tables = pdf_parser(
|
sections, tables = pdf_parser(
|
||||||
filename if not binary else binary,
|
filename if not binary else binary,
|
||||||
from_page=from_page,
|
from_page=from_page,
|
||||||
@ -504,7 +504,7 @@ class Markdown(MarkdownParser):
|
|||||||
|
|
||||||
return images if images else None
|
return images if images else None
|
||||||
|
|
||||||
def __call__(self, filename, binary=None, separate_tables=True,delimiter=None):
|
def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
|
||||||
if binary:
|
if binary:
|
||||||
encoding = find_codec(binary)
|
encoding = find_codec(binary)
|
||||||
txt = binary.decode(encoding, errors="ignore")
|
txt = binary.decode(encoding, errors="ignore")
|
||||||
@ -602,7 +602,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
_SerializedRelationships.load_from_xml = load_from_xml_v2
|
_SerializedRelationships.load_from_xml = load_from_xml_v2
|
||||||
sections, tables = Docx()(filename, binary)
|
sections, tables = Docx()(filename, binary)
|
||||||
|
|
||||||
tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback,**kwargs)
|
tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)
|
||||||
|
|
||||||
res = tokenize_table(tables, doc, is_english)
|
res = tokenize_table(tables, doc, is_english)
|
||||||
callback(0.8, "Finish parsing.")
|
callback(0.8, "Finish parsing.")
|
||||||
@ -653,18 +653,47 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
|
|
||||||
if name in ["tcadp", "docling", "mineru"]:
|
if name in ["tcadp", "docling", "mineru"]:
|
||||||
parser_config["chunk_token_num"] = 0
|
parser_config["chunk_token_num"] = 0
|
||||||
|
|
||||||
res = tokenize_table(tables, doc, is_english)
|
res = tokenize_table(tables, doc, is_english)
|
||||||
callback(0.8, "Finish parsing.")
|
callback(0.8, "Finish parsing.")
|
||||||
|
|
||||||
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
|
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
excel_parser = ExcelParser()
|
|
||||||
if parser_config.get("html4excel"):
|
# Check if tcadp_parser is selected for spreadsheet files
|
||||||
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
|
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||||
|
if layout_recognizer == "TCADP Parser":
|
||||||
|
table_result_type = parser_config.get("table_result_type", "1")
|
||||||
|
markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
|
||||||
|
tcadp_parser = TCADPParser(
|
||||||
|
table_result_type=table_result_type,
|
||||||
|
markdown_image_response_type=markdown_image_response_type
|
||||||
|
)
|
||||||
|
if not tcadp_parser.check_installation():
|
||||||
|
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||||
|
return res
|
||||||
|
|
||||||
|
# Determine file type based on extension
|
||||||
|
file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
|
||||||
|
|
||||||
|
sections, tables = tcadp_parser.parse_pdf(
|
||||||
|
filepath=filename,
|
||||||
|
binary=binary,
|
||||||
|
callback=callback,
|
||||||
|
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
|
||||||
|
file_type=file_type
|
||||||
|
)
|
||||||
|
parser_config["chunk_token_num"] = 0
|
||||||
|
res = tokenize_table(tables, doc, is_english)
|
||||||
|
callback(0.8, "Finish parsing.")
|
||||||
else:
|
else:
|
||||||
sections = [(_, "") for _ in excel_parser(binary) if _]
|
# Default DeepDOC parser
|
||||||
parser_config["chunk_token_num"] = 12800
|
excel_parser = ExcelParser()
|
||||||
|
if parser_config.get("html4excel"):
|
||||||
|
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
|
||||||
|
else:
|
||||||
|
sections = [(_, "") for _ in excel_parser(binary) if _]
|
||||||
|
parser_config["chunk_token_num"] = 12800
|
||||||
|
|
||||||
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
|
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
@ -676,7 +705,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
|
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
|
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
|
||||||
sections, tables = markdown_parser(filename, binary, separate_tables=False,delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
|
sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
|
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
|
||||||
|
|||||||
@ -16,6 +16,7 @@ import io
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
import re
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
import trio
|
import trio
|
||||||
@ -83,6 +84,7 @@ class ParserParam(ProcessParamBase):
|
|||||||
"output_format": "json",
|
"output_format": "json",
|
||||||
},
|
},
|
||||||
"spreadsheet": {
|
"spreadsheet": {
|
||||||
|
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
||||||
"output_format": "html",
|
"output_format": "html",
|
||||||
"suffix": [
|
"suffix": [
|
||||||
"xls",
|
"xls",
|
||||||
@ -102,8 +104,10 @@ class ParserParam(ProcessParamBase):
|
|||||||
"output_format": "json",
|
"output_format": "json",
|
||||||
},
|
},
|
||||||
"slides": {
|
"slides": {
|
||||||
|
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
||||||
"suffix": [
|
"suffix": [
|
||||||
"pptx",
|
"pptx",
|
||||||
|
"ppt"
|
||||||
],
|
],
|
||||||
"output_format": "json",
|
"output_format": "json",
|
||||||
},
|
},
|
||||||
@ -245,7 +249,12 @@ class Parser(ProcessBase):
|
|||||||
bboxes.append(box)
|
bboxes.append(box)
|
||||||
elif conf.get("parse_method").lower() == "tcadp parser":
|
elif conf.get("parse_method").lower() == "tcadp parser":
|
||||||
# ADP is a document parsing tool using Tencent Cloud API
|
# ADP is a document parsing tool using Tencent Cloud API
|
||||||
tcadp_parser = TCADPParser()
|
table_result_type = conf.get("table_result_type", "1")
|
||||||
|
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||||
|
tcadp_parser = TCADPParser(
|
||||||
|
table_result_type=table_result_type,
|
||||||
|
markdown_image_response_type=markdown_image_response_type
|
||||||
|
)
|
||||||
sections, _ = tcadp_parser.parse_pdf(
|
sections, _ = tcadp_parser.parse_pdf(
|
||||||
filepath=name,
|
filepath=name,
|
||||||
binary=blob,
|
binary=blob,
|
||||||
@ -301,14 +310,86 @@ class Parser(ProcessBase):
|
|||||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
|
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
|
||||||
conf = self._param.setups["spreadsheet"]
|
conf = self._param.setups["spreadsheet"]
|
||||||
self.set_output("output_format", conf["output_format"])
|
self.set_output("output_format", conf["output_format"])
|
||||||
spreadsheet_parser = ExcelParser()
|
|
||||||
if conf.get("output_format") == "html":
|
parse_method = conf.get("parse_method", "deepdoc")
|
||||||
htmls = spreadsheet_parser.html(blob, 1000000000)
|
|
||||||
self.set_output("html", htmls[0])
|
# Handle TCADP parser
|
||||||
elif conf.get("output_format") == "json":
|
if parse_method.lower() == "tcadp parser":
|
||||||
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
|
table_result_type = conf.get("table_result_type", "1")
|
||||||
elif conf.get("output_format") == "markdown":
|
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||||
self.set_output("markdown", spreadsheet_parser.markdown(blob))
|
tcadp_parser = TCADPParser(
|
||||||
|
table_result_type=table_result_type,
|
||||||
|
markdown_image_response_type=markdown_image_response_type
|
||||||
|
)
|
||||||
|
if not tcadp_parser.check_installation():
|
||||||
|
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||||
|
|
||||||
|
# Determine file type based on extension
|
||||||
|
if re.search(r"\.xlsx?$", name, re.IGNORECASE):
|
||||||
|
file_type = "XLSX"
|
||||||
|
else:
|
||||||
|
file_type = "CSV"
|
||||||
|
|
||||||
|
self.callback(0.2, f"Using TCADP parser for {file_type} file.")
|
||||||
|
sections, tables = tcadp_parser.parse_pdf(
|
||||||
|
filepath=name,
|
||||||
|
binary=blob,
|
||||||
|
callback=self.callback,
|
||||||
|
file_type=file_type,
|
||||||
|
file_start_page=1,
|
||||||
|
file_end_page=1000
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process TCADP parser output based on configured output_format
|
||||||
|
output_format = conf.get("output_format", "html")
|
||||||
|
|
||||||
|
if output_format == "html":
|
||||||
|
# For HTML output, combine sections and tables into HTML
|
||||||
|
html_content = ""
|
||||||
|
for section, position_tag in sections:
|
||||||
|
if section:
|
||||||
|
html_content += section + "\n"
|
||||||
|
for table in tables:
|
||||||
|
if table:
|
||||||
|
html_content += table + "\n"
|
||||||
|
|
||||||
|
self.set_output("html", html_content)
|
||||||
|
|
||||||
|
elif output_format == "json":
|
||||||
|
# For JSON output, create a list of text items
|
||||||
|
result = []
|
||||||
|
# Add sections as text
|
||||||
|
for section, position_tag in sections:
|
||||||
|
if section:
|
||||||
|
result.append({"text": section})
|
||||||
|
# Add tables as text
|
||||||
|
for table in tables:
|
||||||
|
if table:
|
||||||
|
result.append({"text": table})
|
||||||
|
|
||||||
|
self.set_output("json", result)
|
||||||
|
|
||||||
|
elif output_format == "markdown":
|
||||||
|
# For markdown output, combine into markdown
|
||||||
|
md_content = ""
|
||||||
|
for section, position_tag in sections:
|
||||||
|
if section:
|
||||||
|
md_content += section + "\n\n"
|
||||||
|
for table in tables:
|
||||||
|
if table:
|
||||||
|
md_content += table + "\n\n"
|
||||||
|
|
||||||
|
self.set_output("markdown", md_content)
|
||||||
|
else:
|
||||||
|
# Default DeepDOC parser
|
||||||
|
spreadsheet_parser = ExcelParser()
|
||||||
|
if conf.get("output_format") == "html":
|
||||||
|
htmls = spreadsheet_parser.html(blob, 1000000000)
|
||||||
|
self.set_output("html", htmls[0])
|
||||||
|
elif conf.get("output_format") == "json":
|
||||||
|
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
|
||||||
|
elif conf.get("output_format") == "markdown":
|
||||||
|
self.set_output("markdown", spreadsheet_parser.markdown(blob))
|
||||||
|
|
||||||
def _word(self, name, blob):
|
def _word(self, name, blob):
|
||||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
|
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
|
||||||
@ -326,22 +407,69 @@ class Parser(ProcessBase):
|
|||||||
self.set_output("markdown", markdown_text)
|
self.set_output("markdown", markdown_text)
|
||||||
|
|
||||||
def _slides(self, name, blob):
|
def _slides(self, name, blob):
|
||||||
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
|
|
||||||
|
|
||||||
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
|
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
|
||||||
|
|
||||||
conf = self._param.setups["slides"]
|
conf = self._param.setups["slides"]
|
||||||
self.set_output("output_format", conf["output_format"])
|
self.set_output("output_format", conf["output_format"])
|
||||||
|
|
||||||
ppt_parser = ppt_parser()
|
parse_method = conf.get("parse_method", "deepdoc")
|
||||||
txts = ppt_parser(blob, 0, 100000, None)
|
|
||||||
|
|
||||||
sections = [{"text": section} for section in txts if section.strip()]
|
# Handle TCADP parser
|
||||||
|
if parse_method.lower() == "tcadp parser":
|
||||||
|
table_result_type = conf.get("table_result_type", "1")
|
||||||
|
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||||
|
tcadp_parser = TCADPParser(
|
||||||
|
table_result_type=table_result_type,
|
||||||
|
markdown_image_response_type=markdown_image_response_type
|
||||||
|
)
|
||||||
|
if not tcadp_parser.check_installation():
|
||||||
|
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||||
|
|
||||||
# json
|
# Determine file type based on extension
|
||||||
assert conf.get("output_format") == "json", "have to be json for ppt"
|
if re.search(r"\.pptx?$", name, re.IGNORECASE):
|
||||||
if conf.get("output_format") == "json":
|
file_type = "PPTX"
|
||||||
self.set_output("json", sections)
|
else:
|
||||||
|
file_type = "PPT"
|
||||||
|
|
||||||
|
self.callback(0.2, f"Using TCADP parser for {file_type} file.")
|
||||||
|
|
||||||
|
sections, tables = tcadp_parser.parse_pdf(
|
||||||
|
filepath=name,
|
||||||
|
binary=blob,
|
||||||
|
callback=self.callback,
|
||||||
|
file_type=file_type,
|
||||||
|
file_start_page=1,
|
||||||
|
file_end_page=1000
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process TCADP parser output - PPT only supports json format
|
||||||
|
output_format = conf.get("output_format", "json")
|
||||||
|
if output_format == "json":
|
||||||
|
# For JSON output, create a list of text items
|
||||||
|
result = []
|
||||||
|
# Add sections as text
|
||||||
|
for section, position_tag in sections:
|
||||||
|
if section:
|
||||||
|
result.append({"text": section})
|
||||||
|
# Add tables as text
|
||||||
|
for table in tables:
|
||||||
|
if table:
|
||||||
|
result.append({"text": table})
|
||||||
|
|
||||||
|
self.set_output("json", result)
|
||||||
|
else:
|
||||||
|
# Default DeepDOC parser (supports .pptx format)
|
||||||
|
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
|
||||||
|
|
||||||
|
ppt_parser = ppt_parser()
|
||||||
|
txts = ppt_parser(blob, 0, 100000, None)
|
||||||
|
|
||||||
|
sections = [{"text": section} for section in txts if section.strip()]
|
||||||
|
|
||||||
|
# json
|
||||||
|
assert conf.get("output_format") == "json", "have to be json for ppt"
|
||||||
|
if conf.get("output_format") == "json":
|
||||||
|
self.set_output("json", sections)
|
||||||
|
|
||||||
def _markdown(self, name, blob):
|
def _markdown(self, name, blob):
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
@ -579,6 +707,7 @@ class Parser(ProcessBase):
|
|||||||
"video": self._video,
|
"video": self._video,
|
||||||
"email": self._email,
|
"email": self._email,
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
from_upstream = ParserFromUpstream.model_validate(kwargs)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@ -1752,6 +1752,8 @@ The variable aggregation node (originally the variable assignment node) is a cru
|
|||||||
The Indexer will store the content in the corresponding data structures for the selected methods.`,
|
The Indexer will store the content in the corresponding data structures for the selected methods.`,
|
||||||
// file: 'File',
|
// file: 'File',
|
||||||
parserMethod: 'PDF parser',
|
parserMethod: 'PDF parser',
|
||||||
|
tableResultType: 'Table Result Type',
|
||||||
|
markdownImageResponseType: 'Markdown Image Response Type',
|
||||||
// systemPrompt: 'System Prompt',
|
// systemPrompt: 'System Prompt',
|
||||||
systemPromptPlaceholder:
|
systemPromptPlaceholder:
|
||||||
'Enter system prompt for image analysis, if empty the system default value will be used',
|
'Enter system prompt for image analysis, if empty the system default value will be used',
|
||||||
|
|||||||
@ -1629,6 +1629,8 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
|
|||||||
Tokenizer 会根据所选方式将内容存储为对应的数据结构。`,
|
Tokenizer 会根据所选方式将内容存储为对应的数据结构。`,
|
||||||
filenameEmbdWeight: '文件名嵌入权重',
|
filenameEmbdWeight: '文件名嵌入权重',
|
||||||
parserMethod: '解析方法',
|
parserMethod: '解析方法',
|
||||||
|
tableResultType: '表格返回形式',
|
||||||
|
markdownImageResponseType: '图片返回形式',
|
||||||
systemPromptPlaceholder:
|
systemPromptPlaceholder:
|
||||||
'请输入用于图像分析的系统提示词,若为空则使用系统缺省值',
|
'请输入用于图像分析的系统提示词,若为空则使用系统缺省值',
|
||||||
exportJson: '导出 JSON',
|
exportJson: '导出 JSON',
|
||||||
|
|||||||
@ -169,6 +169,7 @@ export const initialParserValues = {
|
|||||||
{
|
{
|
||||||
fileFormat: FileType.Spreadsheet,
|
fileFormat: FileType.Spreadsheet,
|
||||||
output_format: SpreadsheetOutputFormat.Html,
|
output_format: SpreadsheetOutputFormat.Html,
|
||||||
|
parse_method: ParseDocumentType.DeepDOC,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
fileFormat: FileType.Image,
|
fileFormat: FileType.Image,
|
||||||
@ -192,6 +193,7 @@ export const initialParserValues = {
|
|||||||
{
|
{
|
||||||
fileFormat: FileType.PowerPoint,
|
fileFormat: FileType.PowerPoint,
|
||||||
output_format: PptOutputFormat.Json,
|
output_format: PptOutputFormat.Json,
|
||||||
|
parse_method: ParseDocumentType.DeepDOC,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
};
|
};
|
||||||
@ -243,7 +245,7 @@ export const FileTypeSuffixMap = {
|
|||||||
[FileType.Email]: ['eml', 'msg'],
|
[FileType.Email]: ['eml', 'msg'],
|
||||||
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
|
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
|
||||||
[FileType.Docx]: ['doc', 'docx'],
|
[FileType.Docx]: ['doc', 'docx'],
|
||||||
[FileType.PowerPoint]: ['pptx'],
|
[FileType.PowerPoint]: ['pptx', 'ppt'],
|
||||||
[FileType.Video]: ['mp4', 'avi', 'mkv'],
|
[FileType.Video]: ['mp4', 'avi', 'mkv'],
|
||||||
[FileType.Audio]: [
|
[FileType.Audio]: [
|
||||||
'da',
|
'da',
|
||||||
|
|||||||
@ -34,6 +34,8 @@ import { OutputFormatFormField } from './common-form-fields';
|
|||||||
import { EmailFormFields } from './email-form-fields';
|
import { EmailFormFields } from './email-form-fields';
|
||||||
import { ImageFormFields } from './image-form-fields';
|
import { ImageFormFields } from './image-form-fields';
|
||||||
import { PdfFormFields } from './pdf-form-fields';
|
import { PdfFormFields } from './pdf-form-fields';
|
||||||
|
import { PptFormFields } from './ppt-form-fields';
|
||||||
|
import { SpreadsheetFormFields } from './spreadsheet-form-fields';
|
||||||
import { buildFieldNameWithPrefix } from './utils';
|
import { buildFieldNameWithPrefix } from './utils';
|
||||||
import { AudioFormFields, VideoFormFields } from './video-form-fields';
|
import { AudioFormFields, VideoFormFields } from './video-form-fields';
|
||||||
|
|
||||||
@ -41,6 +43,8 @@ const outputList = buildOutputList(initialParserValues.outputs);
|
|||||||
|
|
||||||
const FileFormatWidgetMap = {
|
const FileFormatWidgetMap = {
|
||||||
[FileType.PDF]: PdfFormFields,
|
[FileType.PDF]: PdfFormFields,
|
||||||
|
[FileType.Spreadsheet]: SpreadsheetFormFields,
|
||||||
|
[FileType.PowerPoint]: PptFormFields,
|
||||||
[FileType.Video]: VideoFormFields,
|
[FileType.Video]: VideoFormFields,
|
||||||
[FileType.Audio]: AudioFormFields,
|
[FileType.Audio]: AudioFormFields,
|
||||||
[FileType.Email]: EmailFormFields,
|
[FileType.Email]: EmailFormFields,
|
||||||
@ -65,6 +69,8 @@ export const FormSchema = z.object({
|
|||||||
fields: z.array(z.string()).optional(),
|
fields: z.array(z.string()).optional(),
|
||||||
llm_id: z.string().optional(),
|
llm_id: z.string().optional(),
|
||||||
system_prompt: z.string().optional(),
|
system_prompt: z.string().optional(),
|
||||||
|
table_result_type: z.string().optional(),
|
||||||
|
markdown_image_response_type: z.string().optional(),
|
||||||
}),
|
}),
|
||||||
),
|
),
|
||||||
});
|
});
|
||||||
@ -184,6 +190,8 @@ const ParserForm = ({ node }: INextOperatorForm) => {
|
|||||||
lang: '',
|
lang: '',
|
||||||
fields: [],
|
fields: [],
|
||||||
llm_id: '',
|
llm_id: '',
|
||||||
|
table_result_type: '',
|
||||||
|
markdown_image_response_type: '',
|
||||||
});
|
});
|
||||||
}, [append]);
|
}, [append]);
|
||||||
|
|
||||||
|
|||||||
@ -1,13 +1,30 @@
|
|||||||
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
|
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
|
||||||
|
import {
|
||||||
|
SelectWithSearch,
|
||||||
|
SelectWithSearchFlagOptionType,
|
||||||
|
} from '@/components/originui/select-with-search';
|
||||||
|
import { RAGFlowFormItem } from '@/components/ragflow-form';
|
||||||
import { isEmpty } from 'lodash';
|
import { isEmpty } from 'lodash';
|
||||||
import { useEffect, useMemo } from 'react';
|
import { useEffect, useMemo } from 'react';
|
||||||
import { useFormContext, useWatch } from 'react-hook-form';
|
import { useFormContext, useWatch } from 'react-hook-form';
|
||||||
|
import { useTranslation } from 'react-i18next';
|
||||||
import { LanguageFormField, ParserMethodFormField } from './common-form-fields';
|
import { LanguageFormField, ParserMethodFormField } from './common-form-fields';
|
||||||
import { CommonProps } from './interface';
|
import { CommonProps } from './interface';
|
||||||
import { useSetInitialLanguage } from './use-set-initial-language';
|
import { useSetInitialLanguage } from './use-set-initial-language';
|
||||||
import { buildFieldNameWithPrefix } from './utils';
|
import { buildFieldNameWithPrefix } from './utils';
|
||||||
|
|
||||||
|
const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
|
||||||
|
{ label: 'Markdown', value: '0' },
|
||||||
|
{ label: 'HTML', value: '1' },
|
||||||
|
];
|
||||||
|
|
||||||
|
const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
|
||||||
|
{ label: 'URL', value: '0' },
|
||||||
|
{ label: 'Text', value: '1' },
|
||||||
|
];
|
||||||
|
|
||||||
export function PdfFormFields({ prefix }: CommonProps) {
|
export function PdfFormFields({ prefix }: CommonProps) {
|
||||||
|
const { t } = useTranslation();
|
||||||
const form = useFormContext();
|
const form = useFormContext();
|
||||||
|
|
||||||
const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
|
const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
|
||||||
@ -25,6 +42,12 @@ export function PdfFormFields({ prefix }: CommonProps) {
|
|||||||
);
|
);
|
||||||
}, [parseMethod]);
|
}, [parseMethod]);
|
||||||
|
|
||||||
|
const tcadpOptionsShown = useMemo(() => {
|
||||||
|
return (
|
||||||
|
!isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser
|
||||||
|
);
|
||||||
|
}, [parseMethod]);
|
||||||
|
|
||||||
useSetInitialLanguage({ prefix, languageShown });
|
useSetInitialLanguage({ prefix, languageShown });
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
@ -36,10 +59,68 @@ export function PdfFormFields({ prefix }: CommonProps) {
|
|||||||
}
|
}
|
||||||
}, [form, parseMethodName]);
|
}, [form, parseMethodName]);
|
||||||
|
|
||||||
|
// Set default values for TCADP options when TCADP is selected
|
||||||
|
useEffect(() => {
|
||||||
|
if (tcadpOptionsShown) {
|
||||||
|
const tableResultTypeName = buildFieldNameWithPrefix(
|
||||||
|
'table_result_type',
|
||||||
|
prefix,
|
||||||
|
);
|
||||||
|
const markdownImageResponseTypeName = buildFieldNameWithPrefix(
|
||||||
|
'markdown_image_response_type',
|
||||||
|
prefix,
|
||||||
|
);
|
||||||
|
|
||||||
|
if (isEmpty(form.getValues(tableResultTypeName))) {
|
||||||
|
form.setValue(tableResultTypeName, '1', {
|
||||||
|
shouldValidate: true,
|
||||||
|
shouldDirty: true,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
|
||||||
|
form.setValue(markdownImageResponseTypeName, '1', {
|
||||||
|
shouldValidate: true,
|
||||||
|
shouldDirty: true,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, [tcadpOptionsShown, form, prefix]);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<>
|
<>
|
||||||
<ParserMethodFormField prefix={prefix}></ParserMethodFormField>
|
<ParserMethodFormField prefix={prefix}></ParserMethodFormField>
|
||||||
{languageShown && <LanguageFormField prefix={prefix}></LanguageFormField>}
|
{languageShown && <LanguageFormField prefix={prefix}></LanguageFormField>}
|
||||||
|
{tcadpOptionsShown && (
|
||||||
|
<>
|
||||||
|
<RAGFlowFormItem
|
||||||
|
name={buildFieldNameWithPrefix('table_result_type', prefix)}
|
||||||
|
label={t('flow.tableResultType') || '表格返回形式'}
|
||||||
|
>
|
||||||
|
{(field) => (
|
||||||
|
<SelectWithSearch
|
||||||
|
value={field.value}
|
||||||
|
onChange={field.onChange}
|
||||||
|
options={tableResultTypeOptions}
|
||||||
|
></SelectWithSearch>
|
||||||
|
)}
|
||||||
|
</RAGFlowFormItem>
|
||||||
|
<RAGFlowFormItem
|
||||||
|
name={buildFieldNameWithPrefix(
|
||||||
|
'markdown_image_response_type',
|
||||||
|
prefix,
|
||||||
|
)}
|
||||||
|
label={t('flow.markdownImageResponseType') || '图片返回形式'}
|
||||||
|
>
|
||||||
|
{(field) => (
|
||||||
|
<SelectWithSearch
|
||||||
|
value={field.value}
|
||||||
|
onChange={field.onChange}
|
||||||
|
options={markdownImageResponseTypeOptions}
|
||||||
|
></SelectWithSearch>
|
||||||
|
)}
|
||||||
|
</RAGFlowFormItem>
|
||||||
|
</>
|
||||||
|
)}
|
||||||
</>
|
</>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
125
web/src/pages/agent/form/parser-form/ppt-form-fields.tsx
Normal file
125
web/src/pages/agent/form/parser-form/ppt-form-fields.tsx
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
|
||||||
|
import {
|
||||||
|
SelectWithSearch,
|
||||||
|
SelectWithSearchFlagOptionType,
|
||||||
|
} from '@/components/originui/select-with-search';
|
||||||
|
import { RAGFlowFormItem } from '@/components/ragflow-form';
|
||||||
|
import { isEmpty } from 'lodash';
|
||||||
|
import { useEffect, useMemo } from 'react';
|
||||||
|
import { useFormContext, useWatch } from 'react-hook-form';
|
||||||
|
import { useTranslation } from 'react-i18next';
|
||||||
|
import { ParserMethodFormField } from './common-form-fields';
|
||||||
|
import { CommonProps } from './interface';
|
||||||
|
import { buildFieldNameWithPrefix } from './utils';
|
||||||
|
|
||||||
|
/**
 * Choices offered by the `table_result_type` select.
 * The `value` strings are what gets stored in the form for that field.
 */
const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
  { label: 'Markdown', value: '0' },
  { label: 'HTML', value: '1' },
];
|
||||||
|
|
||||||
|
/**
 * Choices offered by the `markdown_image_response_type` select.
 * The `value` strings are what gets stored in the form for that field.
 */
const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
  { label: 'URL', value: '0' },
  { label: 'Text', value: '1' },
];
|
||||||
|
|
||||||
|
/**
 * Parser settings for PowerPoint files in the agent parser form.
 *
 * Renders the parse-method selector, restricted to DeepDOC and TCADP Parser,
 * and, while TCADP Parser is the selected method, two extra selects for
 * `table_result_type` and `markdown_image_response_type`.
 *
 * Side effects on the surrounding react-hook-form context:
 * - an empty `parse_method` is set to DeepDOC;
 * - once TCADP Parser is selected, each TCADP field that is still empty is
 *   set to `'1'` (the "HTML" / "Text" entries of the option lists).
 *
 * @param prefix - Forwarded to `buildFieldNameWithPrefix` for every field
 *   name used by this component.
 */
export function PptFormFields({ prefix }: CommonProps) {
  const { t } = useTranslation();
  const form = useFormContext();

  // Build each field name once so the effects and the JSX cannot drift apart.
  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
  const tableResultTypeName = buildFieldNameWithPrefix(
    'table_result_type',
    prefix,
  );
  const markdownImageResponseTypeName = buildFieldNameWithPrefix(
    'markdown_image_response_type',
    prefix,
  );

  const parseMethod = useWatch({
    name: parseMethodName,
  });

  // PPT only supports DeepDOC and TCADPParser.
  // Memoized so the child receives the same array instance on every render.
  const optionsWithoutLLM = useMemo(
    () => [
      { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
      {
        label: ParseDocumentType.TCADPParser,
        value: ParseDocumentType.TCADPParser,
      },
    ],
    [],
  );

  const tcadpOptionsShown =
    !isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser;

  // Default the parse method to DeepDOC when nothing has been chosen yet.
  useEffect(() => {
    if (isEmpty(form.getValues(parseMethodName))) {
      form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [form, parseMethodName]);

  // Set default values for TCADP options when TCADP is selected.
  // Values that are already present are left untouched.
  useEffect(() => {
    if (!tcadpOptionsShown) {
      return;
    }

    if (isEmpty(form.getValues(tableResultTypeName))) {
      form.setValue(tableResultTypeName, '1', {
        shouldValidate: true,
        shouldDirty: true,
      });
    }

    if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
      form.setValue(markdownImageResponseTypeName, '1', {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [
    tcadpOptionsShown,
    form,
    tableResultTypeName,
    markdownImageResponseTypeName,
  ]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      ></ParserMethodFormField>
      {tcadpOptionsShown && (
        <>
          {/* The literal after `||` is used only when t() returns a falsy value. */}
          <RAGFlowFormItem
            name={tableResultTypeName}
            label={t('flow.tableResultType') || '表格返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={tableResultTypeOptions}
              ></SelectWithSearch>
            )}
          </RAGFlowFormItem>
          <RAGFlowFormItem
            name={markdownImageResponseTypeName}
            label={t('flow.markdownImageResponseType') || '图片返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={markdownImageResponseTypeOptions}
              ></SelectWithSearch>
            )}
          </RAGFlowFormItem>
        </>
      )}
    </>
  );
}
|
||||||
125
web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx
Normal file
125
web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
|
||||||
|
import {
|
||||||
|
SelectWithSearch,
|
||||||
|
SelectWithSearchFlagOptionType,
|
||||||
|
} from '@/components/originui/select-with-search';
|
||||||
|
import { RAGFlowFormItem } from '@/components/ragflow-form';
|
||||||
|
import { isEmpty } from 'lodash';
|
||||||
|
import { useEffect, useMemo } from 'react';
|
||||||
|
import { useFormContext, useWatch } from 'react-hook-form';
|
||||||
|
import { useTranslation } from 'react-i18next';
|
||||||
|
import { ParserMethodFormField } from './common-form-fields';
|
||||||
|
import { CommonProps } from './interface';
|
||||||
|
import { buildFieldNameWithPrefix } from './utils';
|
||||||
|
|
||||||
|
/**
 * Choices offered by the `table_result_type` select.
 * The `value` strings are what gets stored in the form for that field.
 */
const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
  { label: 'Markdown', value: '0' },
  { label: 'HTML', value: '1' },
];
|
||||||
|
|
||||||
|
/**
 * Choices offered by the `markdown_image_response_type` select.
 * The `value` strings are what gets stored in the form for that field.
 */
const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
  { label: 'URL', value: '0' },
  { label: 'Text', value: '1' },
];
|
||||||
|
|
||||||
|
/**
 * Parser settings for spreadsheet files in the agent parser form.
 *
 * Renders the parse-method selector, restricted to DeepDOC and TCADP Parser,
 * and, while TCADP Parser is the selected method, two extra selects for
 * `table_result_type` and `markdown_image_response_type`.
 *
 * Side effects on the surrounding react-hook-form context:
 * - an empty `parse_method` is set to DeepDOC;
 * - once TCADP Parser is selected, each TCADP field that is still empty is
 *   set to `'1'` (the "HTML" / "Text" entries of the option lists).
 *
 * @param prefix - Forwarded to `buildFieldNameWithPrefix` for every field
 *   name used by this component.
 */
export function SpreadsheetFormFields({ prefix }: CommonProps) {
  const { t } = useTranslation();
  const form = useFormContext();

  // Build each field name once so the effects and the JSX cannot drift apart.
  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
  const tableResultTypeName = buildFieldNameWithPrefix(
    'table_result_type',
    prefix,
  );
  const markdownImageResponseTypeName = buildFieldNameWithPrefix(
    'markdown_image_response_type',
    prefix,
  );

  const parseMethod = useWatch({
    name: parseMethodName,
  });

  // Spreadsheet only supports DeepDOC and TCADPParser.
  // Memoized so the child receives the same array instance on every render.
  const optionsWithoutLLM = useMemo(
    () => [
      { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
      {
        label: ParseDocumentType.TCADPParser,
        value: ParseDocumentType.TCADPParser,
      },
    ],
    [],
  );

  const tcadpOptionsShown =
    !isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser;

  // Default the parse method to DeepDOC when nothing has been chosen yet.
  useEffect(() => {
    if (isEmpty(form.getValues(parseMethodName))) {
      form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [form, parseMethodName]);

  // Set default values for TCADP options when TCADP is selected.
  // Values that are already present are left untouched.
  useEffect(() => {
    if (!tcadpOptionsShown) {
      return;
    }

    if (isEmpty(form.getValues(tableResultTypeName))) {
      form.setValue(tableResultTypeName, '1', {
        shouldValidate: true,
        shouldDirty: true,
      });
    }

    if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
      form.setValue(markdownImageResponseTypeName, '1', {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [
    tcadpOptionsShown,
    form,
    tableResultTypeName,
    markdownImageResponseTypeName,
  ]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      ></ParserMethodFormField>
      {tcadpOptionsShown && (
        <>
          {/* The literal after `||` is used only when t() returns a falsy value. */}
          <RAGFlowFormItem
            name={tableResultTypeName}
            label={t('flow.tableResultType') || '表格返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={tableResultTypeOptions}
              ></SelectWithSearch>
            )}
          </RAGFlowFormItem>
          <RAGFlowFormItem
            name={markdownImageResponseTypeName}
            label={t('flow.markdownImageResponseType') || '图片返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={markdownImageResponseTypeOptions}
              ></SelectWithSearch>
            )}
          </RAGFlowFormItem>
        </>
      )}
    </>
  );
}
|
||||||
@ -214,6 +214,36 @@ function transformParserParams(params: ParserFormSchemaType) {
|
|||||||
parse_method: cur.parse_method,
|
parse_method: cur.parse_method,
|
||||||
lang: cur.lang,
|
lang: cur.lang,
|
||||||
};
|
};
|
||||||
|
// Only include TCADP parameters if TCADP Parser is selected
|
||||||
|
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
|
||||||
|
filteredSetup.table_result_type = cur.table_result_type;
|
||||||
|
filteredSetup.markdown_image_response_type =
|
||||||
|
cur.markdown_image_response_type;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case FileType.Spreadsheet:
|
||||||
|
filteredSetup = {
|
||||||
|
...filteredSetup,
|
||||||
|
parse_method: cur.parse_method,
|
||||||
|
};
|
||||||
|
// Only include TCADP parameters if TCADP Parser is selected
|
||||||
|
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
|
||||||
|
filteredSetup.table_result_type = cur.table_result_type;
|
||||||
|
filteredSetup.markdown_image_response_type =
|
||||||
|
cur.markdown_image_response_type;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case FileType.PowerPoint:
|
||||||
|
filteredSetup = {
|
||||||
|
...filteredSetup,
|
||||||
|
parse_method: cur.parse_method,
|
||||||
|
};
|
||||||
|
// Only include TCADP parameters if TCADP Parser is selected
|
||||||
|
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
|
||||||
|
filteredSetup.table_result_type = cur.table_result_type;
|
||||||
|
filteredSetup.markdown_image_response_type =
|
||||||
|
cur.markdown_image_response_type;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case FileType.Image:
|
case FileType.Image:
|
||||||
filteredSetup = {
|
filteredSetup = {
|
||||||
|
|||||||
0
web/src/pages/data-flow/constant.tsx
Normal file
0
web/src/pages/data-flow/constant.tsx
Normal file
0
web/src/pages/data-flow/form/parser-form/index.tsx
Normal file
0
web/src/pages/data-flow/form/parser-form/index.tsx
Normal file
40
web/src/pages/data-flow/form/parser-form/ppt-form-fields.tsx
Normal file
40
web/src/pages/data-flow/form/parser-form/ppt-form-fields.tsx
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
|
||||||
|
import { isEmpty } from 'lodash';
|
||||||
|
import { useEffect } from 'react';
|
||||||
|
import { useFormContext } from 'react-hook-form';
|
||||||
|
import { ParserMethodFormField } from './common-form-fields';
|
||||||
|
import { CommonProps } from './interface';
|
||||||
|
import { buildFieldNameWithPrefix } from './utils';
|
||||||
|
|
||||||
|
/**
 * Parser settings for PowerPoint files in the data-flow parser form.
 *
 * Renders only the parse-method selector, restricted to DeepDOC and
 * TCADP Parser. On mount (and whenever the field name changes) an empty
 * `parse_method` is set to DeepDOC through the surrounding react-hook-form
 * context.
 *
 * @param prefix - Forwarded to `buildFieldNameWithPrefix` to build the
 *   `parse_method` field name.
 */
export function PptFormFields({ prefix }: CommonProps) {
  const form = useFormContext();

  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);

  // PPT only supports DeepDOC and TCADPParser
  const optionsWithoutLLM = [
    { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
    {
      label: ParseDocumentType.TCADPParser,
      value: ParseDocumentType.TCADPParser,
    },
  ];

  // Default the parse method to DeepDOC when nothing has been chosen yet.
  useEffect(() => {
    if (isEmpty(form.getValues(parseMethodName))) {
      form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [form, parseMethodName]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      ></ParserMethodFormField>
    </>
  );
}
|
||||||
@ -0,0 +1,40 @@
|
|||||||
|
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
|
||||||
|
import { isEmpty } from 'lodash';
|
||||||
|
import { useEffect } from 'react';
|
||||||
|
import { useFormContext } from 'react-hook-form';
|
||||||
|
import { ParserMethodFormField } from './common-form-fields';
|
||||||
|
import { CommonProps } from './interface';
|
||||||
|
import { buildFieldNameWithPrefix } from './utils';
|
||||||
|
|
||||||
|
/**
 * Parser settings for spreadsheet files in the data-flow parser form.
 *
 * Renders only the parse-method selector, restricted to DeepDOC and
 * TCADP Parser. On mount (and whenever the field name changes) an empty
 * `parse_method` is set to DeepDOC through the surrounding react-hook-form
 * context.
 *
 * @param prefix - Forwarded to `buildFieldNameWithPrefix` to build the
 *   `parse_method` field name.
 */
export function SpreadsheetFormFields({ prefix }: CommonProps) {
  const form = useFormContext();

  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);

  // Spreadsheet only supports DeepDOC and TCADPParser
  const optionsWithoutLLM = [
    { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
    {
      label: ParseDocumentType.TCADPParser,
      value: ParseDocumentType.TCADPParser,
    },
  ];

  // Default the parse method to DeepDOC when nothing has been chosen yet.
  useEffect(() => {
    if (isEmpty(form.getValues(parseMethodName))) {
      form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [form, parseMethodName]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      ></ParserMethodFormField>
    </>
  );
}
|
||||||
0
web/src/pages/data-flow/utils.ts
Normal file
0
web/src/pages/data-flow/utils.ts
Normal file
Reference in New Issue
Block a user