diff --git a/conf/service_conf.yaml b/conf/service_conf.yaml
index ec756a149..6b3cef80e 100644
--- a/conf/service_conf.yaml
+++ b/conf/service_conf.yaml
@@ -147,5 +147,3 @@ user_default_llm:
# secret_id: 'tencent_secret_id'
# secret_key: 'tencent_secret_key'
# region: 'tencent_region'
-# table_result_type: '1'
-# markdown_image_response_type: '1'
diff --git a/deepdoc/parser/tcadp_parser.py b/deepdoc/parser/tcadp_parser.py
index 920b6f1a1..8d704baed 100644
--- a/deepdoc/parser/tcadp_parser.py
+++ b/deepdoc/parser/tcadp_parser.py
@@ -192,12 +192,16 @@ class TencentCloudAPIClient:
class TCADPParser(RAGFlowPdfParser):
- def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou"):
+ def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou",
+ table_result_type: str = None, markdown_image_response_type: str = None):
super().__init__()
# First initialize logger
self.logger = logging.getLogger(self.__class__.__name__)
+ # Log received parameters
+ self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}")
+
# Priority: read configuration from RAGFlow configuration system (service_conf.yaml)
try:
tcadp_parser = get_base_config("tcadp_config", {})
@@ -205,14 +209,30 @@ class TCADPParser(RAGFlowPdfParser):
self.secret_id = secret_id or tcadp_parser.get("secret_id")
self.secret_key = secret_key or tcadp_parser.get("secret_key")
self.region = region or tcadp_parser.get("region", "ap-guangzhou")
- self.table_result_type = tcadp_parser.get("table_result_type", "1")
- self.markdown_image_response_type = tcadp_parser.get("markdown_image_response_type", "1")
- self.logger.info("[TCADP] Configuration read from service_conf.yaml")
+ # Explicit constructor arguments take precedence; otherwise fall back to tcadp_config in service_conf.yaml, then to "1"
+ self.table_result_type = table_result_type if table_result_type is not None else tcadp_parser.get("table_result_type", "1")
+ self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else tcadp_parser.get("markdown_image_response_type", "1")
+
else:
self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
+ # tcadp_config is not configured in service_conf.yaml: use the provided parameters or defaults
+ self.secret_id = secret_id
+ self.secret_key = secret_key
+ self.region = region or "ap-guangzhou"
+ self.table_result_type = table_result_type if table_result_type is not None else "1"
+ self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"
except ImportError:
self.logger.info("[TCADP] Configuration module import failed")
+ # The configuration lookup raised ImportError: use the provided parameters or defaults
+ self.secret_id = secret_id
+ self.secret_key = secret_key
+ self.region = region or "ap-guangzhou"
+ self.table_result_type = table_result_type if table_result_type is not None else "1"
+ self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"
+
+ # Log final values
+ self.logger.info(f"[TCADP] Final values - table_result_type: {self.table_result_type}, markdown_image_response_type: {self.markdown_image_response_type}")
if not self.secret_id or not self.secret_key:
raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")
@@ -400,6 +420,8 @@ class TCADPParser(RAGFlowPdfParser):
"TableResultType": self.table_result_type,
"MarkdownImageResponseType": self.markdown_image_response_type
}
+
+ self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}")
result = client.reconstruct_document_sse(
file_type=file_type,
diff --git a/docker/service_conf.yaml.template b/docker/service_conf.yaml.template
index 14b1b71f9..fa85453ab 100644
--- a/docker/service_conf.yaml.template
+++ b/docker/service_conf.yaml.template
@@ -150,5 +150,3 @@ user_default_llm:
# secret_id: '${TENCENT_SECRET_ID}'
# secret_key: '${TENCENT_SECRET_KEY}'
# region: '${TENCENT_REGION}'
-# table_result_type: '1'
-# markdown_image_response_type: '1'
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 293e4a8b9..49dca17af 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -116,7 +116,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No
else:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
-
+
sections, tables = pdf_parser(
filename if not binary else binary,
from_page=from_page,
@@ -504,7 +504,7 @@ class Markdown(MarkdownParser):
return images if images else None
- def __call__(self, filename, binary=None, separate_tables=True,delimiter=None):
+ def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
@@ -602,7 +602,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
_SerializedRelationships.load_from_xml = load_from_xml_v2
sections, tables = Docx()(filename, binary)
- tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback,**kwargs)
+ tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
@@ -653,18 +653,47 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
-
+
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
- excel_parser = ExcelParser()
- if parser_config.get("html4excel"):
- sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+
+ # Check if tcadp_parser is selected for spreadsheet files
+ layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+ if layout_recognizer == "TCADP Parser":
+ table_result_type = parser_config.get("table_result_type", "1")
+ markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
+ tcadp_parser = TCADPParser(
+ table_result_type=table_result_type,
+ markdown_image_response_type=markdown_image_response_type
+ )
+ if not tcadp_parser.check_installation():
+ callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
+ return res
+
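+ # Determine file type based on extension (NOTE(review): r"\.xlsx?$" also matches .xls, so .xls files are sent as "XLSX" - confirm the API accepts that)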
+ file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
+
+ sections, tables = tcadp_parser.parse_pdf(
+ filepath=filename,
+ binary=binary,
+ callback=callback,
+ output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
+ file_type=file_type
+ )
+ parser_config["chunk_token_num"] = 0
+ res = tokenize_table(tables, doc, is_english)
+ callback(0.8, "Finish parsing.")
else:
- sections = [(_, "") for _ in excel_parser(binary) if _]
- parser_config["chunk_token_num"] = 12800
+ # Default DeepDOC parser
+ excel_parser = ExcelParser()
+ if parser_config.get("html4excel"):
+ sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+ else:
+ sections = [(_, "") for _ in excel_parser(binary) if _]
+ parser_config["chunk_token_num"] = 12800
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
@@ -676,7 +705,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
- sections, tables = markdown_parser(filename, binary, separate_tables=False,delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
+ sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py
index e3d95a470..2ba5cfa7b 100644
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -16,6 +16,7 @@ import io
import json
import os
import random
+import re
from functools import partial
import trio
@@ -83,6 +84,7 @@ class ParserParam(ProcessParamBase):
"output_format": "json",
},
"spreadsheet": {
+ "parse_method": "deepdoc", # "deepdoc" (default) or "tcadp parser" (compared case-insensitively)
"output_format": "html",
"suffix": [
"xls",
@@ -102,8 +104,10 @@ class ParserParam(ProcessParamBase):
"output_format": "json",
},
"slides": {
+ "parse_method": "deepdoc", # "deepdoc" (default) or "tcadp parser" (compared case-insensitively)
"suffix": [
"pptx",
+ "ppt"
],
"output_format": "json",
},
@@ -245,7 +249,12 @@ class Parser(ProcessBase):
bboxes.append(box)
elif conf.get("parse_method").lower() == "tcadp parser":
# ADP is a document parsing tool using Tencent Cloud API
- tcadp_parser = TCADPParser()
+ table_result_type = conf.get("table_result_type", "1")
+ markdown_image_response_type = conf.get("markdown_image_response_type", "1")
+ tcadp_parser = TCADPParser(
+ table_result_type=table_result_type,
+ markdown_image_response_type=markdown_image_response_type
+ )
sections, _ = tcadp_parser.parse_pdf(
filepath=name,
binary=blob,
@@ -301,14 +310,86 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
conf = self._param.setups["spreadsheet"]
self.set_output("output_format", conf["output_format"])
- spreadsheet_parser = ExcelParser()
- if conf.get("output_format") == "html":
- htmls = spreadsheet_parser.html(blob, 1000000000)
- self.set_output("html", htmls[0])
- elif conf.get("output_format") == "json":
- self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
- elif conf.get("output_format") == "markdown":
- self.set_output("markdown", spreadsheet_parser.markdown(blob))
+
+ parse_method = conf.get("parse_method", "deepdoc")
+
+ # Handle TCADP parser
+ if parse_method.lower() == "tcadp parser":
+ table_result_type = conf.get("table_result_type", "1")
+ markdown_image_response_type = conf.get("markdown_image_response_type", "1")
+ tcadp_parser = TCADPParser(
+ table_result_type=table_result_type,
+ markdown_image_response_type=markdown_image_response_type
+ )
+ if not tcadp_parser.check_installation():
+ raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
+
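+ # Determine file type based on extension (NOTE(review): r"\.xlsx?$" also matches .xls, so .xls files are sent as "XLSX" - confirm the API accepts that)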
+ if re.search(r"\.xlsx?$", name, re.IGNORECASE):
+ file_type = "XLSX"
+ else:
+ file_type = "CSV"
+
+ self.callback(0.2, f"Using TCADP parser for {file_type} file.")
+ sections, tables = tcadp_parser.parse_pdf(
+ filepath=name,
+ binary=blob,
+ callback=self.callback,
+ file_type=file_type,
+ file_start_page=1,
+ file_end_page=1000
+ )
+
+ # Process TCADP parser output based on configured output_format
+ output_format = conf.get("output_format", "html")
+
+ if output_format == "html":
+ # For HTML output, combine sections and tables into HTML
+ html_content = ""
+ for section, position_tag in sections:
+ if section:
+ html_content += section + "\n"
+ for table in tables:
+ if table:
+ html_content += table + "\n"
+
+ self.set_output("html", html_content)
+
+ elif output_format == "json":
+ # For JSON output, create a list of text items
+ result = []
+ # Add sections as text
+ for section, position_tag in sections:
+ if section:
+ result.append({"text": section})
+ # Add tables as text
+ for table in tables:
+ if table:
+ result.append({"text": table})
+
+ self.set_output("json", result)
+
+ elif output_format == "markdown":
+ # For markdown output, combine into markdown
+ md_content = ""
+ for section, position_tag in sections:
+ if section:
+ md_content += section + "\n\n"
+ for table in tables:
+ if table:
+ md_content += table + "\n\n"
+
+ self.set_output("markdown", md_content)
+ else:
+ # Default DeepDOC parser
+ spreadsheet_parser = ExcelParser()
+ if conf.get("output_format") == "html":
+ htmls = spreadsheet_parser.html(blob, 1000000000)
+ self.set_output("html", htmls[0])
+ elif conf.get("output_format") == "json":
+ self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
+ elif conf.get("output_format") == "markdown":
+ self.set_output("markdown", spreadsheet_parser.markdown(blob))
def _word(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
@@ -326,22 +407,69 @@ class Parser(ProcessBase):
self.set_output("markdown", markdown_text)
def _slides(self, name, blob):
- from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
-
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
conf = self._param.setups["slides"]
self.set_output("output_format", conf["output_format"])
- ppt_parser = ppt_parser()
- txts = ppt_parser(blob, 0, 100000, None)
+ parse_method = conf.get("parse_method", "deepdoc")
- sections = [{"text": section} for section in txts if section.strip()]
+ # Handle TCADP parser
+ if parse_method.lower() == "tcadp parser":
+ table_result_type = conf.get("table_result_type", "1")
+ markdown_image_response_type = conf.get("markdown_image_response_type", "1")
+ tcadp_parser = TCADPParser(
+ table_result_type=table_result_type,
+ markdown_image_response_type=markdown_image_response_type
+ )
+ if not tcadp_parser.check_installation():
+ raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
- # json
- assert conf.get("output_format") == "json", "have to be json for ppt"
- if conf.get("output_format") == "json":
- self.set_output("json", sections)
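+ # Determine file type based on extension (NOTE(review): r"\.pptx?$" matches both .ppt and .pptx, so .ppt files are reported as "PPTX" and never reach the "PPT" branch; use r"\.pptx$" if "PPT" is intended for them)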
+ if re.search(r"\.pptx?$", name, re.IGNORECASE):
+ file_type = "PPTX"
+ else:
+ file_type = "PPT"
+
+ self.callback(0.2, f"Using TCADP parser for {file_type} file.")
+
+ sections, tables = tcadp_parser.parse_pdf(
+ filepath=name,
+ binary=blob,
+ callback=self.callback,
+ file_type=file_type,
+ file_start_page=1,
+ file_end_page=1000
+ )
+
+ # Process TCADP parser output - PPT only supports json format (NOTE(review): any other output_format sets no output here, whereas the DeepDOC branch asserts)
+ output_format = conf.get("output_format", "json")
+ if output_format == "json":
+ # For JSON output, create a list of text items
+ result = []
+ # Add sections as text
+ for section, position_tag in sections:
+ if section:
+ result.append({"text": section})
+ # Add tables as text
+ for table in tables:
+ if table:
+ result.append({"text": table})
+
+ self.set_output("json", result)
+ else:
+ # Default DeepDOC parser (supports .pptx format)
+ from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
+
+ ppt_parser = ppt_parser()
+ txts = ppt_parser(blob, 0, 100000, None)
+
+ sections = [{"text": section} for section in txts if section.strip()]
+
+ # json
+ assert conf.get("output_format") == "json", "have to be json for ppt"
+ if conf.get("output_format") == "json":
+ self.set_output("json", sections)
def _markdown(self, name, blob):
from functools import reduce
@@ -579,6 +707,7 @@ class Parser(ProcessBase):
"video": self._video,
"email": self._email,
}
+
try:
from_upstream = ParserFromUpstream.model_validate(kwargs)
except Exception as e:
diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts
index 38bc6c212..40a6a072c 100644
--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@@ -1752,6 +1752,8 @@ The variable aggregation node (originally the variable assignment node) is a cru
The Indexer will store the content in the corresponding data structures for the selected methods.`,
// file: 'File',
parserMethod: 'PDF parser',
+ tableResultType: 'Table Result Type',
+ markdownImageResponseType: 'Markdown Image Response Type',
// systemPrompt: 'System Prompt',
systemPromptPlaceholder:
'Enter system prompt for image analysis, if empty the system default value will be used',
diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts
index 763bd6304..5b684a131 100644
--- a/web/src/locales/zh.ts
+++ b/web/src/locales/zh.ts
@@ -1629,6 +1629,8 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
Tokenizer 会根据所选方式将内容存储为对应的数据结构。`,
filenameEmbdWeight: '文件名嵌入权重',
parserMethod: '解析方法',
+ tableResultType: '表格返回形式',
+ markdownImageResponseType: '图片返回形式',
systemPromptPlaceholder:
'请输入用于图像分析的系统提示词,若为空则使用系统缺省值',
exportJson: '导出 JSON',
diff --git a/web/src/pages/agent/constant/pipeline.tsx b/web/src/pages/agent/constant/pipeline.tsx
index e098df997..849340bfa 100644
--- a/web/src/pages/agent/constant/pipeline.tsx
+++ b/web/src/pages/agent/constant/pipeline.tsx
@@ -169,6 +169,7 @@ export const initialParserValues = {
{
fileFormat: FileType.Spreadsheet,
output_format: SpreadsheetOutputFormat.Html,
+ parse_method: ParseDocumentType.DeepDOC,
},
{
fileFormat: FileType.Image,
@@ -192,6 +193,7 @@ export const initialParserValues = {
{
fileFormat: FileType.PowerPoint,
output_format: PptOutputFormat.Json,
+ parse_method: ParseDocumentType.DeepDOC,
},
],
};
@@ -243,7 +245,7 @@ export const FileTypeSuffixMap = {
[FileType.Email]: ['eml', 'msg'],
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
[FileType.Docx]: ['doc', 'docx'],
- [FileType.PowerPoint]: ['pptx'],
+ [FileType.PowerPoint]: ['pptx', 'ppt'],
[FileType.Video]: ['mp4', 'avi', 'mkv'],
[FileType.Audio]: [
'da',
diff --git a/web/src/pages/agent/form/parser-form/index.tsx b/web/src/pages/agent/form/parser-form/index.tsx
index 2584c7960..1942b2d05 100644
--- a/web/src/pages/agent/form/parser-form/index.tsx
+++ b/web/src/pages/agent/form/parser-form/index.tsx
@@ -34,6 +34,8 @@ import { OutputFormatFormField } from './common-form-fields';
import { EmailFormFields } from './email-form-fields';
import { ImageFormFields } from './image-form-fields';
import { PdfFormFields } from './pdf-form-fields';
+import { PptFormFields } from './ppt-form-fields';
+import { SpreadsheetFormFields } from './spreadsheet-form-fields';
import { buildFieldNameWithPrefix } from './utils';
import { AudioFormFields, VideoFormFields } from './video-form-fields';
@@ -41,6 +43,8 @@ const outputList = buildOutputList(initialParserValues.outputs);
const FileFormatWidgetMap = {
[FileType.PDF]: PdfFormFields,
+ [FileType.Spreadsheet]: SpreadsheetFormFields,
+ [FileType.PowerPoint]: PptFormFields,
[FileType.Video]: VideoFormFields,
[FileType.Audio]: AudioFormFields,
[FileType.Email]: EmailFormFields,
@@ -65,6 +69,8 @@ export const FormSchema = z.object({
fields: z.array(z.string()).optional(),
llm_id: z.string().optional(),
system_prompt: z.string().optional(),
+ table_result_type: z.string().optional(),
+ markdown_image_response_type: z.string().optional(),
}),
),
});
@@ -184,6 +190,8 @@ const ParserForm = ({ node }: INextOperatorForm) => {
lang: '',
fields: [],
llm_id: '',
+ table_result_type: '',
+ markdown_image_response_type: '',
});
}, [append]);
diff --git a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx
index 020032c5c..82c976f0f 100644
--- a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx
+++ b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx
@@ -1,13 +1,30 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
+import {
+ SelectWithSearch,
+ SelectWithSearchFlagOptionType,
+} from '@/components/originui/select-with-search';
+import { RAGFlowFormItem } from '@/components/ragflow-form';
import { isEmpty } from 'lodash';
import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
+import { useTranslation } from 'react-i18next';
import { LanguageFormField, ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { useSetInitialLanguage } from './use-set-initial-language';
import { buildFieldNameWithPrefix } from './utils';
+const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
+ { label: 'Markdown', value: '0' },
+ { label: 'HTML', value: '1' },
+];
+
+const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
+ { label: 'URL', value: '0' },
+ { label: 'Text', value: '1' },
+];
+
export function PdfFormFields({ prefix }: CommonProps) {
+ const { t } = useTranslation();
const form = useFormContext();
const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
@@ -25,6 +42,12 @@ export function PdfFormFields({ prefix }: CommonProps) {
);
}, [parseMethod]);
+ const tcadpOptionsShown = useMemo(() => {
+ return (
+ !isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser
+ );
+ }, [parseMethod]);
+
useSetInitialLanguage({ prefix, languageShown });
useEffect(() => {
@@ -36,10 +59,68 @@ export function PdfFormFields({ prefix }: CommonProps) {
}
}, [form, parseMethodName]);
+ // Set default values for TCADP options when TCADP is selected
+ useEffect(() => {
+ if (tcadpOptionsShown) {
+ const tableResultTypeName = buildFieldNameWithPrefix(
+ 'table_result_type',
+ prefix,
+ );
+ const markdownImageResponseTypeName = buildFieldNameWithPrefix(
+ 'markdown_image_response_type',
+ prefix,
+ );
+
+ if (isEmpty(form.getValues(tableResultTypeName))) {
+ form.setValue(tableResultTypeName, '1', {
+ shouldValidate: true,
+ shouldDirty: true,
+ });
+ }
+ if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
+ form.setValue(markdownImageResponseTypeName, '1', {
+ shouldValidate: true,
+ shouldDirty: true,
+ });
+ }
+ }
+ }, [tcadpOptionsShown, form, prefix]);
+
return (
<>
{languageShown && }
+ {tcadpOptionsShown && (
+ <>
+
+ {(field) => (
+
+ )}
+
+
+ {(field) => (
+
+ )}
+
+ >
+ )}
>
);
}
diff --git a/web/src/pages/agent/form/parser-form/ppt-form-fields.tsx b/web/src/pages/agent/form/parser-form/ppt-form-fields.tsx
new file mode 100644
index 000000000..18f924959
--- /dev/null
+++ b/web/src/pages/agent/form/parser-form/ppt-form-fields.tsx
@@ -0,0 +1,125 @@
+import { ParseDocumentType } from '@/components/layout-recognize-form-field';
+import {
+ SelectWithSearch,
+ SelectWithSearchFlagOptionType,
+} from '@/components/originui/select-with-search';
+import { RAGFlowFormItem } from '@/components/ragflow-form';
+import { isEmpty } from 'lodash';
+import { useEffect, useMemo } from 'react';
+import { useFormContext, useWatch } from 'react-hook-form';
+import { useTranslation } from 'react-i18next';
+import { ParserMethodFormField } from './common-form-fields';
+import { CommonProps } from './interface';
+import { buildFieldNameWithPrefix } from './utils';
+
+const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
+ { label: 'Markdown', value: '0' },
+ { label: 'HTML', value: '1' },
+];
+
+const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
+ { label: 'URL', value: '0' },
+ { label: 'Text', value: '1' },
+];
+
+export function PptFormFields({ prefix }: CommonProps) {
+ const { t } = useTranslation();
+ const form = useFormContext();
+
+ const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
+
+ const parseMethod = useWatch({
+ name: parseMethodName,
+ });
+
+ // PPT only supports DeepDOC and TCADPParser
+ const optionsWithoutLLM = [
+ { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
+ {
+ label: ParseDocumentType.TCADPParser,
+ value: ParseDocumentType.TCADPParser,
+ },
+ ];
+
+ const tcadpOptionsShown = useMemo(() => {
+ return (
+ !isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser
+ );
+ }, [parseMethod]);
+
+ useEffect(() => {
+ if (isEmpty(form.getValues(parseMethodName))) {
+ form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
+ shouldValidate: true,
+ shouldDirty: true,
+ });
+ }
+ }, [form, parseMethodName]);
+
+ // Set default values for TCADP options when TCADP is selected
+ useEffect(() => {
+ if (tcadpOptionsShown) {
+ const tableResultTypeName = buildFieldNameWithPrefix(
+ 'table_result_type',
+ prefix,
+ );
+ const markdownImageResponseTypeName = buildFieldNameWithPrefix(
+ 'markdown_image_response_type',
+ prefix,
+ );
+
+ if (isEmpty(form.getValues(tableResultTypeName))) {
+ form.setValue(tableResultTypeName, '1', {
+ shouldValidate: true,
+ shouldDirty: true,
+ });
+ }
+ if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
+ form.setValue(markdownImageResponseTypeName, '1', {
+ shouldValidate: true,
+ shouldDirty: true,
+ });
+ }
+ }
+ }, [tcadpOptionsShown, form, prefix]);
+
+ return (
+ <>
+
+ {tcadpOptionsShown && (
+ <>
+
+ {(field) => (
+
+ )}
+
+
+ {(field) => (
+
+ )}
+
+ >
+ )}
+ >
+ );
+}
diff --git a/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx
new file mode 100644
index 000000000..407150991
--- /dev/null
+++ b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx
@@ -0,0 +1,125 @@
+import { ParseDocumentType } from '@/components/layout-recognize-form-field';
+import {
+ SelectWithSearch,
+ SelectWithSearchFlagOptionType,
+} from '@/components/originui/select-with-search';
+import { RAGFlowFormItem } from '@/components/ragflow-form';
+import { isEmpty } from 'lodash';
+import { useEffect, useMemo } from 'react';
+import { useFormContext, useWatch } from 'react-hook-form';
+import { useTranslation } from 'react-i18next';
+import { ParserMethodFormField } from './common-form-fields';
+import { CommonProps } from './interface';
+import { buildFieldNameWithPrefix } from './utils';
+
+const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
+ { label: 'Markdown', value: '0' },
+ { label: 'HTML', value: '1' },
+];
+
+const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
+ { label: 'URL', value: '0' },
+ { label: 'Text', value: '1' },
+];
+
+export function SpreadsheetFormFields({ prefix }: CommonProps) {
+ const { t } = useTranslation();
+ const form = useFormContext();
+
+ const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
+
+ const parseMethod = useWatch({
+ name: parseMethodName,
+ });
+
+ // Spreadsheet only supports DeepDOC and TCADPParser
+ const optionsWithoutLLM = [
+ { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
+ {
+ label: ParseDocumentType.TCADPParser,
+ value: ParseDocumentType.TCADPParser,
+ },
+ ];
+
+ const tcadpOptionsShown = useMemo(() => {
+ return (
+ !isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser
+ );
+ }, [parseMethod]);
+
+ useEffect(() => {
+ if (isEmpty(form.getValues(parseMethodName))) {
+ form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
+ shouldValidate: true,
+ shouldDirty: true,
+ });
+ }
+ }, [form, parseMethodName]);
+
+ // Set default values for TCADP options when TCADP is selected
+ useEffect(() => {
+ if (tcadpOptionsShown) {
+ const tableResultTypeName = buildFieldNameWithPrefix(
+ 'table_result_type',
+ prefix,
+ );
+ const markdownImageResponseTypeName = buildFieldNameWithPrefix(
+ 'markdown_image_response_type',
+ prefix,
+ );
+
+ if (isEmpty(form.getValues(tableResultTypeName))) {
+ form.setValue(tableResultTypeName, '1', {
+ shouldValidate: true,
+ shouldDirty: true,
+ });
+ }
+ if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
+ form.setValue(markdownImageResponseTypeName, '1', {
+ shouldValidate: true,
+ shouldDirty: true,
+ });
+ }
+ }
+ }, [tcadpOptionsShown, form, prefix]);
+
+ return (
+ <>
+
+ {tcadpOptionsShown && (
+ <>
+
+ {(field) => (
+
+ )}
+
+
+ {(field) => (
+
+ )}
+
+ >
+ )}
+ >
+ );
+}
diff --git a/web/src/pages/agent/utils.ts b/web/src/pages/agent/utils.ts
index f40015012..f4e4a4b1d 100644
--- a/web/src/pages/agent/utils.ts
+++ b/web/src/pages/agent/utils.ts
@@ -214,6 +214,36 @@ function transformParserParams(params: ParserFormSchemaType) {
parse_method: cur.parse_method,
lang: cur.lang,
};
+ // Only include TCADP parameters if TCADP Parser is selected
+ if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
+ filteredSetup.table_result_type = cur.table_result_type;
+ filteredSetup.markdown_image_response_type =
+ cur.markdown_image_response_type;
+ }
+ break;
+ case FileType.Spreadsheet:
+ filteredSetup = {
+ ...filteredSetup,
+ parse_method: cur.parse_method,
+ };
+ // Only include TCADP parameters if TCADP Parser is selected
+ if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
+ filteredSetup.table_result_type = cur.table_result_type;
+ filteredSetup.markdown_image_response_type =
+ cur.markdown_image_response_type;
+ }
+ break;
+ case FileType.PowerPoint:
+ filteredSetup = {
+ ...filteredSetup,
+ parse_method: cur.parse_method,
+ };
+ // Only include TCADP parameters if TCADP Parser is selected
+ if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
+ filteredSetup.table_result_type = cur.table_result_type;
+ filteredSetup.markdown_image_response_type =
+ cur.markdown_image_response_type;
+ }
break;
case FileType.Image:
filteredSetup = {
diff --git a/web/src/pages/data-flow/constant.tsx b/web/src/pages/data-flow/constant.tsx
new file mode 100644
index 000000000..e69de29bb
diff --git a/web/src/pages/data-flow/form/parser-form/index.tsx b/web/src/pages/data-flow/form/parser-form/index.tsx
new file mode 100644
index 000000000..e69de29bb
diff --git a/web/src/pages/data-flow/form/parser-form/ppt-form-fields.tsx b/web/src/pages/data-flow/form/parser-form/ppt-form-fields.tsx
new file mode 100644
index 000000000..59b179498
--- /dev/null
+++ b/web/src/pages/data-flow/form/parser-form/ppt-form-fields.tsx
@@ -0,0 +1,40 @@
+import { ParseDocumentType } from '@/components/layout-recognize-form-field';
+import { isEmpty } from 'lodash';
+import { useEffect } from 'react';
+import { useFormContext } from 'react-hook-form';
+import { ParserMethodFormField } from './common-form-fields';
+import { CommonProps } from './interface';
+import { buildFieldNameWithPrefix } from './utils';
+
+export function PptFormFields({ prefix }: CommonProps) {
+ const form = useFormContext();
+
+ const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
+
+ // PPT only supports DeepDOC and TCADPParser
+ const optionsWithoutLLM = [
+ { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
+ {
+ label: ParseDocumentType.TCADPParser,
+ value: ParseDocumentType.TCADPParser,
+ },
+ ];
+
+ useEffect(() => {
+ if (isEmpty(form.getValues(parseMethodName))) {
+ form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
+ shouldValidate: true,
+ shouldDirty: true,
+ });
+ }
+ }, [form, parseMethodName]);
+
+ return (
+ <>
+
+ >
+ );
+}
diff --git a/web/src/pages/data-flow/form/parser-form/spreadsheet-form-fields.tsx b/web/src/pages/data-flow/form/parser-form/spreadsheet-form-fields.tsx
new file mode 100644
index 000000000..443ff6e18
--- /dev/null
+++ b/web/src/pages/data-flow/form/parser-form/spreadsheet-form-fields.tsx
@@ -0,0 +1,40 @@
+import { ParseDocumentType } from '@/components/layout-recognize-form-field';
+import { isEmpty } from 'lodash';
+import { useEffect } from 'react';
+import { useFormContext } from 'react-hook-form';
+import { ParserMethodFormField } from './common-form-fields';
+import { CommonProps } from './interface';
+import { buildFieldNameWithPrefix } from './utils';
+
+export function SpreadsheetFormFields({ prefix }: CommonProps) {
+ const form = useFormContext();
+
+ const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
+
+ // Spreadsheet only supports DeepDOC and TCADPParser
+ const optionsWithoutLLM = [
+ { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
+ {
+ label: ParseDocumentType.TCADPParser,
+ value: ParseDocumentType.TCADPParser,
+ },
+ ];
+
+ useEffect(() => {
+ if (isEmpty(form.getValues(parseMethodName))) {
+ form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
+ shouldValidate: true,
+ shouldDirty: true,
+ });
+ }
+ }, [form, parseMethodName]);
+
+ return (
+ <>
+
+ >
+ );
+}
diff --git a/web/src/pages/data-flow/utils.ts b/web/src/pages/data-flow/utils.ts
new file mode 100644
index 000000000..e69de29bb