Feat: Add TCADP parser for PPTX and spreadsheet document types. (#11041)

### What problem does this PR solve?

- Added TCADP Parser configuration fields to PDF, PPT, and spreadsheet
parsing forms
- Implemented support for setting table result type (Markdown/HTML) and
Markdown image response type (URL/Text)
- Updated TCADP Parser to handle return format settings from
configuration or parameters
- Enhanced frontend to dynamically show TCADP options based on selected
parsing method
- Modified backend to pass format parameters when calling TCADP API
- Optimized form default value logic for TCADP configuration items
- Updated multilingual resource files for new configuration options

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
aidan
2025-11-20 10:08:42 +08:00
committed by GitHub
parent ecf0322165
commit 420c97199a
18 changed files with 668 additions and 37 deletions

View File

@ -147,5 +147,3 @@ user_default_llm:
# secret_id: 'tencent_secret_id' # secret_id: 'tencent_secret_id'
# secret_key: 'tencent_secret_key' # secret_key: 'tencent_secret_key'
# region: 'tencent_region' # region: 'tencent_region'
# table_result_type: '1'
# markdown_image_response_type: '1'

View File

@ -192,12 +192,16 @@ class TencentCloudAPIClient:
class TCADPParser(RAGFlowPdfParser): class TCADPParser(RAGFlowPdfParser):
def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou"): def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou",
table_result_type: str = None, markdown_image_response_type: str = None):
super().__init__() super().__init__()
# First initialize logger # First initialize logger
self.logger = logging.getLogger(self.__class__.__name__) self.logger = logging.getLogger(self.__class__.__name__)
# Log received parameters
self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}")
# Priority: read configuration from RAGFlow configuration system (service_conf.yaml) # Priority: read configuration from RAGFlow configuration system (service_conf.yaml)
try: try:
tcadp_parser = get_base_config("tcadp_config", {}) tcadp_parser = get_base_config("tcadp_config", {})
@ -205,14 +209,30 @@ class TCADPParser(RAGFlowPdfParser):
self.secret_id = secret_id or tcadp_parser.get("secret_id") self.secret_id = secret_id or tcadp_parser.get("secret_id")
self.secret_key = secret_key or tcadp_parser.get("secret_key") self.secret_key = secret_key or tcadp_parser.get("secret_key")
self.region = region or tcadp_parser.get("region", "ap-guangzhou") self.region = region or tcadp_parser.get("region", "ap-guangzhou")
self.table_result_type = tcadp_parser.get("table_result_type", "1") # Set table_result_type and markdown_image_response_type from config or parameters
self.markdown_image_response_type = tcadp_parser.get("markdown_image_response_type", "1") self.table_result_type = table_result_type if table_result_type is not None else tcadp_parser.get("table_result_type", "1")
self.logger.info("[TCADP] Configuration read from service_conf.yaml") self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else tcadp_parser.get("markdown_image_response_type", "1")
else: else:
self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first") self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
# If config file is empty, use provided parameters or defaults
self.secret_id = secret_id
self.secret_key = secret_key
self.region = region or "ap-guangzhou"
self.table_result_type = table_result_type if table_result_type is not None else "1"
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"
except ImportError: except ImportError:
self.logger.info("[TCADP] Configuration module import failed") self.logger.info("[TCADP] Configuration module import failed")
# If config file is not available, use provided parameters or defaults
self.secret_id = secret_id
self.secret_key = secret_key
self.region = region or "ap-guangzhou"
self.table_result_type = table_result_type if table_result_type is not None else "1"
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"
# Log final values
self.logger.info(f"[TCADP] Final values - table_result_type: {self.table_result_type}, markdown_image_response_type: {self.markdown_image_response_type}")
if not self.secret_id or not self.secret_key: if not self.secret_id or not self.secret_key:
raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml") raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")
@ -401,6 +421,8 @@ class TCADPParser(RAGFlowPdfParser):
"MarkdownImageResponseType": self.markdown_image_response_type "MarkdownImageResponseType": self.markdown_image_response_type
} }
self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}")
result = client.reconstruct_document_sse( result = client.reconstruct_document_sse(
file_type=file_type, file_type=file_type,
file_base64=file_base64, file_base64=file_base64,

View File

@ -150,5 +150,3 @@ user_default_llm:
# secret_id: '${TENCENT_SECRET_ID}' # secret_id: '${TENCENT_SECRET_ID}'
# secret_key: '${TENCENT_SECRET_KEY}' # secret_key: '${TENCENT_SECRET_KEY}'
# region: '${TENCENT_REGION}' # region: '${TENCENT_REGION}'
# table_result_type: '1'
# markdown_image_response_type: '1'

View File

@ -504,7 +504,7 @@ class Markdown(MarkdownParser):
return images if images else None return images if images else None
def __call__(self, filename, binary=None, separate_tables=True,delimiter=None): def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
if binary: if binary:
encoding = find_codec(binary) encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore") txt = binary.decode(encoding, errors="ignore")
@ -602,7 +602,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
_SerializedRelationships.load_from_xml = load_from_xml_v2 _SerializedRelationships.load_from_xml = load_from_xml_v2
sections, tables = Docx()(filename, binary) sections, tables = Docx()(filename, binary)
tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback,**kwargs) tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)
res = tokenize_table(tables, doc, is_english) res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
@ -659,12 +659,41 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE): elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
if parser_config.get("html4excel"): # Check if tcadp_parser is selected for spreadsheet files
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _] layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if layout_recognizer == "TCADP Parser":
table_result_type = parser_config.get("table_result_type", "1")
markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(
table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type
)
if not tcadp_parser.check_installation():
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
return res
# Determine file type based on extension
file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
sections, tables = tcadp_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
file_type=file_type
)
parser_config["chunk_token_num"] = 0
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
else: else:
sections = [(_, "") for _ in excel_parser(binary) if _] # Default DeepDOC parser
parser_config["chunk_token_num"] = 12800 excel_parser = ExcelParser()
if parser_config.get("html4excel"):
sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
else:
sections = [(_, "") for _ in excel_parser(binary) if _]
parser_config["chunk_token_num"] = 12800
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
@ -676,7 +705,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128))) markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
sections, tables = markdown_parser(filename, binary, separate_tables=False,delimiter=parser_config.get("delimiter", "\n!?;。;!?")) sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
try: try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)

View File

@ -16,6 +16,7 @@ import io
import json import json
import os import os
import random import random
import re
from functools import partial from functools import partial
import trio import trio
@ -83,6 +84,7 @@ class ParserParam(ProcessParamBase):
"output_format": "json", "output_format": "json",
}, },
"spreadsheet": { "spreadsheet": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser
"output_format": "html", "output_format": "html",
"suffix": [ "suffix": [
"xls", "xls",
@ -102,8 +104,10 @@ class ParserParam(ProcessParamBase):
"output_format": "json", "output_format": "json",
}, },
"slides": { "slides": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser
"suffix": [ "suffix": [
"pptx", "pptx",
"ppt"
], ],
"output_format": "json", "output_format": "json",
}, },
@ -245,7 +249,12 @@ class Parser(ProcessBase):
bboxes.append(box) bboxes.append(box)
elif conf.get("parse_method").lower() == "tcadp parser": elif conf.get("parse_method").lower() == "tcadp parser":
# ADP is a document parsing tool using Tencent Cloud API # ADP is a document parsing tool using Tencent Cloud API
tcadp_parser = TCADPParser() table_result_type = conf.get("table_result_type", "1")
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(
table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type
)
sections, _ = tcadp_parser.parse_pdf( sections, _ = tcadp_parser.parse_pdf(
filepath=name, filepath=name,
binary=blob, binary=blob,
@ -301,14 +310,86 @@ class Parser(ProcessBase):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.") self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
conf = self._param.setups["spreadsheet"] conf = self._param.setups["spreadsheet"]
self.set_output("output_format", conf["output_format"]) self.set_output("output_format", conf["output_format"])
spreadsheet_parser = ExcelParser()
if conf.get("output_format") == "html": parse_method = conf.get("parse_method", "deepdoc")
htmls = spreadsheet_parser.html(blob, 1000000000)
self.set_output("html", htmls[0]) # Handle TCADP parser
elif conf.get("output_format") == "json": if parse_method.lower() == "tcadp parser":
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt]) table_result_type = conf.get("table_result_type", "1")
elif conf.get("output_format") == "markdown": markdown_image_response_type = conf.get("markdown_image_response_type", "1")
self.set_output("markdown", spreadsheet_parser.markdown(blob)) tcadp_parser = TCADPParser(
table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type
)
if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
# Determine file type based on extension
if re.search(r"\.xlsx?$", name, re.IGNORECASE):
file_type = "XLSX"
else:
file_type = "CSV"
self.callback(0.2, f"Using TCADP parser for {file_type} file.")
sections, tables = tcadp_parser.parse_pdf(
filepath=name,
binary=blob,
callback=self.callback,
file_type=file_type,
file_start_page=1,
file_end_page=1000
)
# Process TCADP parser output based on configured output_format
output_format = conf.get("output_format", "html")
if output_format == "html":
# For HTML output, combine sections and tables into HTML
html_content = ""
for section, position_tag in sections:
if section:
html_content += section + "\n"
for table in tables:
if table:
html_content += table + "\n"
self.set_output("html", html_content)
elif output_format == "json":
# For JSON output, create a list of text items
result = []
# Add sections as text
for section, position_tag in sections:
if section:
result.append({"text": section})
# Add tables as text
for table in tables:
if table:
result.append({"text": table})
self.set_output("json", result)
elif output_format == "markdown":
# For markdown output, combine into markdown
md_content = ""
for section, position_tag in sections:
if section:
md_content += section + "\n\n"
for table in tables:
if table:
md_content += table + "\n\n"
self.set_output("markdown", md_content)
else:
# Default DeepDOC parser
spreadsheet_parser = ExcelParser()
if conf.get("output_format") == "html":
htmls = spreadsheet_parser.html(blob, 1000000000)
self.set_output("html", htmls[0])
elif conf.get("output_format") == "json":
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
elif conf.get("output_format") == "markdown":
self.set_output("markdown", spreadsheet_parser.markdown(blob))
def _word(self, name, blob): def _word(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document") self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
@ -326,22 +407,69 @@ class Parser(ProcessBase):
self.set_output("markdown", markdown_text) self.set_output("markdown", markdown_text)
def _slides(self, name, blob): def _slides(self, name, blob):
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document") self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
conf = self._param.setups["slides"] conf = self._param.setups["slides"]
self.set_output("output_format", conf["output_format"]) self.set_output("output_format", conf["output_format"])
ppt_parser = ppt_parser() parse_method = conf.get("parse_method", "deepdoc")
txts = ppt_parser(blob, 0, 100000, None)
sections = [{"text": section} for section in txts if section.strip()] # Handle TCADP parser
if parse_method.lower() == "tcadp parser":
table_result_type = conf.get("table_result_type", "1")
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(
table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type
)
if not tcadp_parser.check_installation():
raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
# json # Determine file type based on extension
assert conf.get("output_format") == "json", "have to be json for ppt" if re.search(r"\.pptx?$", name, re.IGNORECASE):
if conf.get("output_format") == "json": file_type = "PPTX"
self.set_output("json", sections) else:
file_type = "PPT"
self.callback(0.2, f"Using TCADP parser for {file_type} file.")
sections, tables = tcadp_parser.parse_pdf(
filepath=name,
binary=blob,
callback=self.callback,
file_type=file_type,
file_start_page=1,
file_end_page=1000
)
# Process TCADP parser output - PPT only supports json format
output_format = conf.get("output_format", "json")
if output_format == "json":
# For JSON output, create a list of text items
result = []
# Add sections as text
for section, position_tag in sections:
if section:
result.append({"text": section})
# Add tables as text
for table in tables:
if table:
result.append({"text": table})
self.set_output("json", result)
else:
# Default DeepDOC parser (supports .pptx format)
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
ppt_parser = ppt_parser()
txts = ppt_parser(blob, 0, 100000, None)
sections = [{"text": section} for section in txts if section.strip()]
# json
assert conf.get("output_format") == "json", "have to be json for ppt"
if conf.get("output_format") == "json":
self.set_output("json", sections)
def _markdown(self, name, blob): def _markdown(self, name, blob):
from functools import reduce from functools import reduce
@ -579,6 +707,7 @@ class Parser(ProcessBase):
"video": self._video, "video": self._video,
"email": self._email, "email": self._email,
} }
try: try:
from_upstream = ParserFromUpstream.model_validate(kwargs) from_upstream = ParserFromUpstream.model_validate(kwargs)
except Exception as e: except Exception as e:

View File

@ -1752,6 +1752,8 @@ The variable aggregation node (originally the variable assignment node) is a cru
The Indexer will store the content in the corresponding data structures for the selected methods.`, The Indexer will store the content in the corresponding data structures for the selected methods.`,
// file: 'File', // file: 'File',
parserMethod: 'PDF parser', parserMethod: 'PDF parser',
tableResultType: 'Table Result Type',
markdownImageResponseType: 'Markdown Image Response Type',
// systemPrompt: 'System Prompt', // systemPrompt: 'System Prompt',
systemPromptPlaceholder: systemPromptPlaceholder:
'Enter system prompt for image analysis, if empty the system default value will be used', 'Enter system prompt for image analysis, if empty the system default value will be used',

View File

@ -1629,6 +1629,8 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
Tokenizer 会根据所选方式将内容存储为对应的数据结构。`, Tokenizer 会根据所选方式将内容存储为对应的数据结构。`,
filenameEmbdWeight: '文件名嵌入权重', filenameEmbdWeight: '文件名嵌入权重',
parserMethod: '解析方法', parserMethod: '解析方法',
tableResultType: '表格返回形式',
markdownImageResponseType: '图片返回形式',
systemPromptPlaceholder: systemPromptPlaceholder:
'请输入用于图像分析的系统提示词,若为空则使用系统缺省值', '请输入用于图像分析的系统提示词,若为空则使用系统缺省值',
exportJson: '导出 JSON', exportJson: '导出 JSON',

View File

@ -169,6 +169,7 @@ export const initialParserValues = {
{ {
fileFormat: FileType.Spreadsheet, fileFormat: FileType.Spreadsheet,
output_format: SpreadsheetOutputFormat.Html, output_format: SpreadsheetOutputFormat.Html,
parse_method: ParseDocumentType.DeepDOC,
}, },
{ {
fileFormat: FileType.Image, fileFormat: FileType.Image,
@ -192,6 +193,7 @@ export const initialParserValues = {
{ {
fileFormat: FileType.PowerPoint, fileFormat: FileType.PowerPoint,
output_format: PptOutputFormat.Json, output_format: PptOutputFormat.Json,
parse_method: ParseDocumentType.DeepDOC,
}, },
], ],
}; };
@ -243,7 +245,7 @@ export const FileTypeSuffixMap = {
[FileType.Email]: ['eml', 'msg'], [FileType.Email]: ['eml', 'msg'],
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'], [FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
[FileType.Docx]: ['doc', 'docx'], [FileType.Docx]: ['doc', 'docx'],
[FileType.PowerPoint]: ['pptx'], [FileType.PowerPoint]: ['pptx', 'ppt'],
[FileType.Video]: ['mp4', 'avi', 'mkv'], [FileType.Video]: ['mp4', 'avi', 'mkv'],
[FileType.Audio]: [ [FileType.Audio]: [
'da', 'da',

View File

@ -34,6 +34,8 @@ import { OutputFormatFormField } from './common-form-fields';
import { EmailFormFields } from './email-form-fields'; import { EmailFormFields } from './email-form-fields';
import { ImageFormFields } from './image-form-fields'; import { ImageFormFields } from './image-form-fields';
import { PdfFormFields } from './pdf-form-fields'; import { PdfFormFields } from './pdf-form-fields';
import { PptFormFields } from './ppt-form-fields';
import { SpreadsheetFormFields } from './spreadsheet-form-fields';
import { buildFieldNameWithPrefix } from './utils'; import { buildFieldNameWithPrefix } from './utils';
import { AudioFormFields, VideoFormFields } from './video-form-fields'; import { AudioFormFields, VideoFormFields } from './video-form-fields';
@ -41,6 +43,8 @@ const outputList = buildOutputList(initialParserValues.outputs);
const FileFormatWidgetMap = { const FileFormatWidgetMap = {
[FileType.PDF]: PdfFormFields, [FileType.PDF]: PdfFormFields,
[FileType.Spreadsheet]: SpreadsheetFormFields,
[FileType.PowerPoint]: PptFormFields,
[FileType.Video]: VideoFormFields, [FileType.Video]: VideoFormFields,
[FileType.Audio]: AudioFormFields, [FileType.Audio]: AudioFormFields,
[FileType.Email]: EmailFormFields, [FileType.Email]: EmailFormFields,
@ -65,6 +69,8 @@ export const FormSchema = z.object({
fields: z.array(z.string()).optional(), fields: z.array(z.string()).optional(),
llm_id: z.string().optional(), llm_id: z.string().optional(),
system_prompt: z.string().optional(), system_prompt: z.string().optional(),
table_result_type: z.string().optional(),
markdown_image_response_type: z.string().optional(),
}), }),
), ),
}); });
@ -184,6 +190,8 @@ const ParserForm = ({ node }: INextOperatorForm) => {
lang: '', lang: '',
fields: [], fields: [],
llm_id: '', llm_id: '',
table_result_type: '',
markdown_image_response_type: '',
}); });
}, [append]); }, [append]);

View File

@ -1,13 +1,30 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field'; import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import {
SelectWithSearch,
SelectWithSearchFlagOptionType,
} from '@/components/originui/select-with-search';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { isEmpty } from 'lodash'; import { isEmpty } from 'lodash';
import { useEffect, useMemo } from 'react'; import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form'; import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { LanguageFormField, ParserMethodFormField } from './common-form-fields'; import { LanguageFormField, ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface'; import { CommonProps } from './interface';
import { useSetInitialLanguage } from './use-set-initial-language'; import { useSetInitialLanguage } from './use-set-initial-language';
import { buildFieldNameWithPrefix } from './utils'; import { buildFieldNameWithPrefix } from './utils';
const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'Markdown', value: '0' },
{ label: 'HTML', value: '1' },
];
const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'URL', value: '0' },
{ label: 'Text', value: '1' },
];
export function PdfFormFields({ prefix }: CommonProps) { export function PdfFormFields({ prefix }: CommonProps) {
const { t } = useTranslation();
const form = useFormContext(); const form = useFormContext();
const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix); const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
@ -25,6 +42,12 @@ export function PdfFormFields({ prefix }: CommonProps) {
); );
}, [parseMethod]); }, [parseMethod]);
const tcadpOptionsShown = useMemo(() => {
return (
!isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser
);
}, [parseMethod]);
useSetInitialLanguage({ prefix, languageShown }); useSetInitialLanguage({ prefix, languageShown });
useEffect(() => { useEffect(() => {
@ -36,10 +59,68 @@ export function PdfFormFields({ prefix }: CommonProps) {
} }
}, [form, parseMethodName]); }, [form, parseMethodName]);
// Set default values for TCADP options when TCADP is selected
useEffect(() => {
if (tcadpOptionsShown) {
const tableResultTypeName = buildFieldNameWithPrefix(
'table_result_type',
prefix,
);
const markdownImageResponseTypeName = buildFieldNameWithPrefix(
'markdown_image_response_type',
prefix,
);
if (isEmpty(form.getValues(tableResultTypeName))) {
form.setValue(tableResultTypeName, '1', {
shouldValidate: true,
shouldDirty: true,
});
}
if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
form.setValue(markdownImageResponseTypeName, '1', {
shouldValidate: true,
shouldDirty: true,
});
}
}
}, [tcadpOptionsShown, form, prefix]);
return ( return (
<> <>
<ParserMethodFormField prefix={prefix}></ParserMethodFormField> <ParserMethodFormField prefix={prefix}></ParserMethodFormField>
{languageShown && <LanguageFormField prefix={prefix}></LanguageFormField>} {languageShown && <LanguageFormField prefix={prefix}></LanguageFormField>}
{tcadpOptionsShown && (
<>
<RAGFlowFormItem
name={buildFieldNameWithPrefix('table_result_type', prefix)}
label={t('flow.tableResultType') || '表格返回形式'}
>
{(field) => (
<SelectWithSearch
value={field.value}
onChange={field.onChange}
options={tableResultTypeOptions}
></SelectWithSearch>
)}
</RAGFlowFormItem>
<RAGFlowFormItem
name={buildFieldNameWithPrefix(
'markdown_image_response_type',
prefix,
)}
label={t('flow.markdownImageResponseType') || '图片返回形式'}
>
{(field) => (
<SelectWithSearch
value={field.value}
onChange={field.onChange}
options={markdownImageResponseTypeOptions}
></SelectWithSearch>
)}
</RAGFlowFormItem>
</>
)}
</> </>
); );
} }

View File

@ -0,0 +1,125 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import {
SelectWithSearch,
SelectWithSearchFlagOptionType,
} from '@/components/originui/select-with-search';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { isEmpty } from 'lodash';
import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';
// Choices offered by the `table_result_type` select. The option values are
// string codes: '0' = Markdown, '1' = HTML.
const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
  { label: 'Markdown', value: '0' },
  { label: 'HTML', value: '1' },
];
// Choices offered by the `markdown_image_response_type` select. The option
// values are string codes: '0' = URL, '1' = Text.
const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
  { label: 'URL', value: '0' },
  { label: 'Text', value: '1' },
];
/**
 * Form fields for the PowerPoint entry of the parser form.
 *
 * Renders the parse-method selector restricted to DeepDOC and TCADP Parser.
 * While TCADP Parser is the selected method, two additional selects are shown
 * for the `table_result_type` and `markdown_image_response_type` fields, and
 * each of those fields is defaulted to '1' if it is currently empty.
 *
 * @param prefix - Forwarded to `buildFieldNameWithPrefix` to build the form
 *   path of every field rendered here.
 */
export function PptFormFields({ prefix }: CommonProps) {
  const { t } = useTranslation();
  const form = useFormContext();
  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
  const parseMethod = useWatch({
    name: parseMethodName,
  });

  // PPT only supports DeepDOC and TCADPParser. Memoized so the selector
  // receives the same array instance on every render instead of a fresh one.
  const optionsWithoutLLM = useMemo(
    () =>
      [ParseDocumentType.DeepDOC, ParseDocumentType.TCADPParser].map(
        (method) => ({ label: method, value: method }),
      ),
    [],
  );

  // An empty value can never equal the TCADP Parser constant, so a plain
  // comparison is sufficient; no memoization is needed for a boolean.
  const tcadpOptionsShown = parseMethod === ParseDocumentType.TCADPParser;

  // Fall back to DeepDOC when no parse method has been chosen yet.
  useEffect(() => {
    if (isEmpty(form.getValues(parseMethodName))) {
      form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [form, parseMethodName]);

  // Set default values for TCADP options when TCADP is selected
  useEffect(() => {
    if (!tcadpOptionsShown) {
      return;
    }
    const tcadpFieldKeys = [
      'table_result_type',
      'markdown_image_response_type',
    ] as const;
    tcadpFieldKeys.forEach((fieldKey) => {
      const fieldName = buildFieldNameWithPrefix(fieldKey, prefix);
      // Only fill in a default; never overwrite a value the user already set.
      if (isEmpty(form.getValues(fieldName))) {
        form.setValue(fieldName, '1', {
          shouldValidate: true,
          shouldDirty: true,
        });
      }
    });
  }, [tcadpOptionsShown, form, prefix]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      />
      {tcadpOptionsShown && (
        <>
          <RAGFlowFormItem
            name={buildFieldNameWithPrefix('table_result_type', prefix)}
            label={t('flow.tableResultType') || '表格返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={tableResultTypeOptions}
              />
            )}
          </RAGFlowFormItem>
          <RAGFlowFormItem
            name={buildFieldNameWithPrefix(
              'markdown_image_response_type',
              prefix,
            )}
            label={t('flow.markdownImageResponseType') || '图片返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={markdownImageResponseTypeOptions}
              />
            )}
          </RAGFlowFormItem>
        </>
      )}
    </>
  );
}

View File

@ -0,0 +1,125 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import {
SelectWithSearch,
SelectWithSearchFlagOptionType,
} from '@/components/originui/select-with-search';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { isEmpty } from 'lodash';
import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';
// Choices offered by the `table_result_type` select. The option values are
// string codes: '0' = Markdown, '1' = HTML.
const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
  { label: 'Markdown', value: '0' },
  { label: 'HTML', value: '1' },
];
// Choices offered by the `markdown_image_response_type` select. The option
// values are string codes: '0' = URL, '1' = Text.
const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
  { label: 'URL', value: '0' },
  { label: 'Text', value: '1' },
];
/**
 * Form fields for the spreadsheet entry of the parser form.
 *
 * Renders the parse-method selector restricted to DeepDOC and TCADP Parser.
 * While TCADP Parser is the selected method, two additional selects are shown
 * for the `table_result_type` and `markdown_image_response_type` fields, and
 * each of those fields is defaulted to '1' if it is currently empty.
 *
 * @param prefix - Forwarded to `buildFieldNameWithPrefix` to build the form
 *   path of every field rendered here.
 */
export function SpreadsheetFormFields({ prefix }: CommonProps) {
  const { t } = useTranslation();
  const form = useFormContext();
  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
  const parseMethod = useWatch({
    name: parseMethodName,
  });

  // Spreadsheet only supports DeepDOC and TCADPParser. Memoized so the
  // selector receives the same array instance on every render.
  const optionsWithoutLLM = useMemo(
    () =>
      [ParseDocumentType.DeepDOC, ParseDocumentType.TCADPParser].map(
        (method) => ({ label: method, value: method }),
      ),
    [],
  );

  // An empty value can never equal the TCADP Parser constant, so a plain
  // comparison is sufficient; no memoization is needed for a boolean.
  const tcadpOptionsShown = parseMethod === ParseDocumentType.TCADPParser;

  // Fall back to DeepDOC when no parse method has been chosen yet.
  useEffect(() => {
    if (isEmpty(form.getValues(parseMethodName))) {
      form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [form, parseMethodName]);

  // Set default values for TCADP options when TCADP is selected
  useEffect(() => {
    if (!tcadpOptionsShown) {
      return;
    }
    const tcadpFieldKeys = [
      'table_result_type',
      'markdown_image_response_type',
    ] as const;
    tcadpFieldKeys.forEach((fieldKey) => {
      const fieldName = buildFieldNameWithPrefix(fieldKey, prefix);
      // Only fill in a default; never overwrite a value the user already set.
      if (isEmpty(form.getValues(fieldName))) {
        form.setValue(fieldName, '1', {
          shouldValidate: true,
          shouldDirty: true,
        });
      }
    });
  }, [tcadpOptionsShown, form, prefix]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      />
      {tcadpOptionsShown && (
        <>
          <RAGFlowFormItem
            name={buildFieldNameWithPrefix('table_result_type', prefix)}
            label={t('flow.tableResultType') || '表格返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={tableResultTypeOptions}
              />
            )}
          </RAGFlowFormItem>
          <RAGFlowFormItem
            name={buildFieldNameWithPrefix(
              'markdown_image_response_type',
              prefix,
            )}
            label={t('flow.markdownImageResponseType') || '图片返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={markdownImageResponseTypeOptions}
              />
            )}
          </RAGFlowFormItem>
        </>
      )}
    </>
  );
}

View File

@ -214,6 +214,36 @@ function transformParserParams(params: ParserFormSchemaType) {
parse_method: cur.parse_method, parse_method: cur.parse_method,
lang: cur.lang, lang: cur.lang,
}; };
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
filteredSetup.table_result_type = cur.table_result_type;
filteredSetup.markdown_image_response_type =
cur.markdown_image_response_type;
}
break;
case FileType.Spreadsheet:
filteredSetup = {
...filteredSetup,
parse_method: cur.parse_method,
};
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
filteredSetup.table_result_type = cur.table_result_type;
filteredSetup.markdown_image_response_type =
cur.markdown_image_response_type;
}
break;
case FileType.PowerPoint:
filteredSetup = {
...filteredSetup,
parse_method: cur.parse_method,
};
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
filteredSetup.table_result_type = cur.table_result_type;
filteredSetup.markdown_image_response_type =
cur.markdown_image_response_type;
}
break; break;
case FileType.Image: case FileType.Image:
filteredSetup = { filteredSetup = {

View File

View File

@ -0,0 +1,40 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import { isEmpty } from 'lodash';
import { useEffect } from 'react';
import { useFormContext } from 'react-hook-form';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';
/**
 * Parser-form fields for PowerPoint files: a parse-method selector limited to
 * DeepDOC and TCADP Parser. When the `parse_method` field is empty it is
 * initialised to DeepDOC.
 *
 * @param prefix - Forwarded to `buildFieldNameWithPrefix` to build the field path.
 */
export function PptFormFields({ prefix }: CommonProps) {
  const formApi = useFormContext();
  const methodFieldName = buildFieldNameWithPrefix('parse_method', prefix);

  // PPT only supports DeepDOC and TCADPParser
  const optionsWithoutLLM = [
    ParseDocumentType.DeepDOC,
    ParseDocumentType.TCADPParser,
  ].map((method) => ({ label: method, value: method }));

  useEffect(() => {
    // Leave an already-chosen parse method untouched.
    if (!isEmpty(formApi.getValues(methodFieldName))) {
      return;
    }
    formApi.setValue(methodFieldName, ParseDocumentType.DeepDOC, {
      shouldValidate: true,
      shouldDirty: true,
    });
  }, [formApi, methodFieldName]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      />
    </>
  );
}

View File

@ -0,0 +1,40 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import { isEmpty } from 'lodash';
import { useEffect } from 'react';
import { useFormContext } from 'react-hook-form';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';
/**
 * Parser-form fields for spreadsheet files: a parse-method selector limited to
 * DeepDOC and TCADP Parser. When the `parse_method` field is empty it is
 * initialised to DeepDOC.
 *
 * @param prefix - Forwarded to `buildFieldNameWithPrefix` to build the field path.
 */
export function SpreadsheetFormFields({ prefix }: CommonProps) {
  const formApi = useFormContext();
  const methodFieldName = buildFieldNameWithPrefix('parse_method', prefix);

  // Spreadsheet only supports DeepDOC and TCADPParser
  const optionsWithoutLLM = [
    ParseDocumentType.DeepDOC,
    ParseDocumentType.TCADPParser,
  ].map((method) => ({ label: method, value: method }));

  useEffect(() => {
    // Leave an already-chosen parse method untouched.
    if (!isEmpty(formApi.getValues(methodFieldName))) {
      return;
    }
    formApi.setValue(methodFieldName, ParseDocumentType.DeepDOC, {
      shouldValidate: true,
      shouldDirty: true,
    });
  }, [formApi, methodFieldName]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      />
    </>
  );
}

View File