Feat: Add TCADP parser for PPTX and spreadsheet document types. (#11041)

### What problem does this PR solve?

- Added TCADP Parser configuration fields to PDF, PPT, and spreadsheet
parsing forms
- Implemented support for setting table result type (Markdown/HTML) and
Markdown image response type (URL/Text)
- Updated TCADP Parser to handle return format settings from
configuration or parameters
- Enhanced frontend to dynamically show TCADP options based on selected
parsing method
- Modified backend to pass format parameters when calling TCADP API
- Optimized form default value logic for TCADP configuration items
- Updated multilingual resource files for new configuration options

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
aidan
2025-11-20 10:08:42 +08:00
committed by GitHub
parent ecf0322165
commit 420c97199a
18 changed files with 668 additions and 37 deletions

View File

@ -1752,6 +1752,8 @@ The variable aggregation node (originally the variable assignment node) is a cru
The Indexer will store the content in the corresponding data structures for the selected methods.`,
// file: 'File',
parserMethod: 'PDF parser',
tableResultType: 'Table Result Type',
markdownImageResponseType: 'Markdown Image Response Type',
// systemPrompt: 'System Prompt',
systemPromptPlaceholder:
'Enter system prompt for image analysis, if empty the system default value will be used',

View File

@ -1629,6 +1629,8 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
Tokenizer 会根据所选方式将内容存储为对应的数据结构。`,
filenameEmbdWeight: '文件名嵌入权重',
parserMethod: '解析方法',
tableResultType: '表格返回形式',
markdownImageResponseType: '图片返回形式',
systemPromptPlaceholder:
'请输入用于图像分析的系统提示词,若为空则使用系统缺省值',
exportJson: '导出 JSON',

View File

@ -169,6 +169,7 @@ export const initialParserValues = {
{
fileFormat: FileType.Spreadsheet,
output_format: SpreadsheetOutputFormat.Html,
parse_method: ParseDocumentType.DeepDOC,
},
{
fileFormat: FileType.Image,
@ -192,6 +193,7 @@ export const initialParserValues = {
{
fileFormat: FileType.PowerPoint,
output_format: PptOutputFormat.Json,
parse_method: ParseDocumentType.DeepDOC,
},
],
};
@ -243,7 +245,7 @@ export const FileTypeSuffixMap = {
[FileType.Email]: ['eml', 'msg'],
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
[FileType.Docx]: ['doc', 'docx'],
[FileType.PowerPoint]: ['pptx'],
[FileType.PowerPoint]: ['pptx', 'ppt'],
[FileType.Video]: ['mp4', 'avi', 'mkv'],
[FileType.Audio]: [
'da',

View File

@ -34,6 +34,8 @@ import { OutputFormatFormField } from './common-form-fields';
import { EmailFormFields } from './email-form-fields';
import { ImageFormFields } from './image-form-fields';
import { PdfFormFields } from './pdf-form-fields';
import { PptFormFields } from './ppt-form-fields';
import { SpreadsheetFormFields } from './spreadsheet-form-fields';
import { buildFieldNameWithPrefix } from './utils';
import { AudioFormFields, VideoFormFields } from './video-form-fields';
@ -41,6 +43,8 @@ const outputList = buildOutputList(initialParserValues.outputs);
const FileFormatWidgetMap = {
[FileType.PDF]: PdfFormFields,
[FileType.Spreadsheet]: SpreadsheetFormFields,
[FileType.PowerPoint]: PptFormFields,
[FileType.Video]: VideoFormFields,
[FileType.Audio]: AudioFormFields,
[FileType.Email]: EmailFormFields,
@ -65,6 +69,8 @@ export const FormSchema = z.object({
fields: z.array(z.string()).optional(),
llm_id: z.string().optional(),
system_prompt: z.string().optional(),
table_result_type: z.string().optional(),
markdown_image_response_type: z.string().optional(),
}),
),
});
@ -184,6 +190,8 @@ const ParserForm = ({ node }: INextOperatorForm) => {
lang: '',
fields: [],
llm_id: '',
table_result_type: '',
markdown_image_response_type: '',
});
}, [append]);

View File

@ -1,13 +1,30 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import {
SelectWithSearch,
SelectWithSearchFlagOptionType,
} from '@/components/originui/select-with-search';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { isEmpty } from 'lodash';
import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { LanguageFormField, ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { useSetInitialLanguage } from './use-set-initial-language';
import { buildFieldNameWithPrefix } from './utils';
// Choices offered by the `table_result_type` select. The form stores the
// string code in `value` ('0' or '1'), not the human-readable label.
// NOTE(review): presumably these codes are forwarded to the TCADP API as-is — confirm against the backend.
const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'Markdown', value: '0' },
{ label: 'HTML', value: '1' },
];
// Choices offered by the `markdown_image_response_type` select. As above, the
// form stores the string code in `value`, not the label.
const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'URL', value: '0' },
{ label: 'Text', value: '1' },
];
export function PdfFormFields({ prefix }: CommonProps) {
const { t } = useTranslation();
const form = useFormContext();
const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
@ -25,6 +42,12 @@ export function PdfFormFields({ prefix }: CommonProps) {
);
}, [parseMethod]);
const tcadpOptionsShown = useMemo(() => {
return (
!isEmpty(parseMethod) && parseMethod === ParseDocumentType.TCADPParser
);
}, [parseMethod]);
useSetInitialLanguage({ prefix, languageShown });
useEffect(() => {
@ -36,10 +59,68 @@ export function PdfFormFields({ prefix }: CommonProps) {
}
}, [form, parseMethodName]);
// Set default values for TCADP options when TCADP is selected
useEffect(() => {
if (tcadpOptionsShown) {
const tableResultTypeName = buildFieldNameWithPrefix(
'table_result_type',
prefix,
);
const markdownImageResponseTypeName = buildFieldNameWithPrefix(
'markdown_image_response_type',
prefix,
);
if (isEmpty(form.getValues(tableResultTypeName))) {
form.setValue(tableResultTypeName, '1', {
shouldValidate: true,
shouldDirty: true,
});
}
if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
form.setValue(markdownImageResponseTypeName, '1', {
shouldValidate: true,
shouldDirty: true,
});
}
}
}, [tcadpOptionsShown, form, prefix]);
return (
<>
<ParserMethodFormField prefix={prefix}></ParserMethodFormField>
{languageShown && <LanguageFormField prefix={prefix}></LanguageFormField>}
{tcadpOptionsShown && (
<>
<RAGFlowFormItem
name={buildFieldNameWithPrefix('table_result_type', prefix)}
label={t('flow.tableResultType') || '表格返回形式'}
>
{(field) => (
<SelectWithSearch
value={field.value}
onChange={field.onChange}
options={tableResultTypeOptions}
></SelectWithSearch>
)}
</RAGFlowFormItem>
<RAGFlowFormItem
name={buildFieldNameWithPrefix(
'markdown_image_response_type',
prefix,
)}
label={t('flow.markdownImageResponseType') || '图片返回形式'}
>
{(field) => (
<SelectWithSearch
value={field.value}
onChange={field.onChange}
options={markdownImageResponseTypeOptions}
></SelectWithSearch>
)}
</RAGFlowFormItem>
</>
)}
</>
);
}

View File

@ -0,0 +1,125 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import {
SelectWithSearch,
SelectWithSearchFlagOptionType,
} from '@/components/originui/select-with-search';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { isEmpty } from 'lodash';
import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';
// Choices offered by the `table_result_type` select. The form stores the
// string code in `value` ('0' or '1'), not the human-readable label.
// NOTE(review): presumably these codes are forwarded to the TCADP API as-is — confirm against the backend.
const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'Markdown', value: '0' },
{ label: 'HTML', value: '1' },
];
// Choices offered by the `markdown_image_response_type` select. As above, the
// form stores the string code in `value`, not the label.
const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'URL', value: '0' },
{ label: 'Text', value: '1' },
];
/**
 * Parser settings for PowerPoint files.
 *
 * Renders the parse-method selector (restricted to DeepDOC and TCADPParser)
 * and, while the TCADP parser is the selected method, two extra selects for
 * the table result type and the Markdown image response type.
 *
 * @param prefix - Passed to `buildFieldNameWithPrefix` to build the form
 *   field names for this parser setup entry.
 */
export function PptFormFields({ prefix }: CommonProps) {
  const { t } = useTranslation();
  const form = useFormContext();

  // Build every field name once so the effect and the JSX cannot drift apart.
  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
  const tableResultTypeName = buildFieldNameWithPrefix(
    'table_result_type',
    prefix,
  );
  const markdownImageResponseTypeName = buildFieldNameWithPrefix(
    'markdown_image_response_type',
    prefix,
  );

  const parseMethod = useWatch({
    name: parseMethodName,
  });

  // PPT only supports DeepDOC and TCADPParser.
  // Memoized so the child receives the same array reference on every render
  // instead of a freshly allocated one.
  const optionsWithoutLLM = useMemo(
    () => [
      { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
      {
        label: ParseDocumentType.TCADPParser,
        value: ParseDocumentType.TCADPParser,
      },
    ],
    [],
  );

  // A strict equality already rejects empty/undefined values, so no extra
  // emptiness guard or memoization is needed for this boolean.
  const tcadpOptionsShown = parseMethod === ParseDocumentType.TCADPParser;

  // Fall back to DeepDOC when no parse method has been stored yet.
  useEffect(() => {
    if (isEmpty(form.getValues(parseMethodName))) {
      form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [form, parseMethodName]);

  // Set default values for TCADP options when TCADP is selected.
  // Values the user (or a saved configuration) already provided are kept.
  useEffect(() => {
    if (!tcadpOptionsShown) {
      return;
    }
    if (isEmpty(form.getValues(tableResultTypeName))) {
      form.setValue(tableResultTypeName, '1', {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
    if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
      form.setValue(markdownImageResponseTypeName, '1', {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [
    tcadpOptionsShown,
    form,
    tableResultTypeName,
    markdownImageResponseTypeName,
  ]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      ></ParserMethodFormField>
      {tcadpOptionsShown && (
        <>
          <RAGFlowFormItem
            name={tableResultTypeName}
            label={t('flow.tableResultType') || '表格返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={tableResultTypeOptions}
              ></SelectWithSearch>
            )}
          </RAGFlowFormItem>
          <RAGFlowFormItem
            name={markdownImageResponseTypeName}
            label={t('flow.markdownImageResponseType') || '图片返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={markdownImageResponseTypeOptions}
              ></SelectWithSearch>
            )}
          </RAGFlowFormItem>
        </>
      )}
    </>
  );
}

View File

@ -0,0 +1,125 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import {
SelectWithSearch,
SelectWithSearchFlagOptionType,
} from '@/components/originui/select-with-search';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { isEmpty } from 'lodash';
import { useEffect, useMemo } from 'react';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';
// Choices offered by the `table_result_type` select. The form stores the
// string code in `value` ('0' or '1'), not the human-readable label.
// NOTE(review): presumably these codes are forwarded to the TCADP API as-is — confirm against the backend.
const tableResultTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'Markdown', value: '0' },
{ label: 'HTML', value: '1' },
];
// Choices offered by the `markdown_image_response_type` select. As above, the
// form stores the string code in `value`, not the label.
const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [
{ label: 'URL', value: '0' },
{ label: 'Text', value: '1' },
];
/**
 * Parser settings for spreadsheet files.
 *
 * Renders the parse-method selector (restricted to DeepDOC and TCADPParser)
 * and, while the TCADP parser is the selected method, two extra selects for
 * the table result type and the Markdown image response type.
 *
 * @param prefix - Passed to `buildFieldNameWithPrefix` to build the form
 *   field names for this parser setup entry.
 */
export function SpreadsheetFormFields({ prefix }: CommonProps) {
  const { t } = useTranslation();
  const form = useFormContext();

  // Build every field name once so the effect and the JSX cannot drift apart.
  const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix);
  const tableResultTypeName = buildFieldNameWithPrefix(
    'table_result_type',
    prefix,
  );
  const markdownImageResponseTypeName = buildFieldNameWithPrefix(
    'markdown_image_response_type',
    prefix,
  );

  const parseMethod = useWatch({
    name: parseMethodName,
  });

  // Spreadsheet only supports DeepDOC and TCADPParser.
  // Memoized so the child receives the same array reference on every render
  // instead of a freshly allocated one.
  const optionsWithoutLLM = useMemo(
    () => [
      { label: ParseDocumentType.DeepDOC, value: ParseDocumentType.DeepDOC },
      {
        label: ParseDocumentType.TCADPParser,
        value: ParseDocumentType.TCADPParser,
      },
    ],
    [],
  );

  // A strict equality already rejects empty/undefined values, so no extra
  // emptiness guard or memoization is needed for this boolean.
  const tcadpOptionsShown = parseMethod === ParseDocumentType.TCADPParser;

  // Fall back to DeepDOC when no parse method has been stored yet.
  useEffect(() => {
    if (isEmpty(form.getValues(parseMethodName))) {
      form.setValue(parseMethodName, ParseDocumentType.DeepDOC, {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [form, parseMethodName]);

  // Set default values for TCADP options when TCADP is selected.
  // Values the user (or a saved configuration) already provided are kept.
  useEffect(() => {
    if (!tcadpOptionsShown) {
      return;
    }
    if (isEmpty(form.getValues(tableResultTypeName))) {
      form.setValue(tableResultTypeName, '1', {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
    if (isEmpty(form.getValues(markdownImageResponseTypeName))) {
      form.setValue(markdownImageResponseTypeName, '1', {
        shouldValidate: true,
        shouldDirty: true,
      });
    }
  }, [
    tcadpOptionsShown,
    form,
    tableResultTypeName,
    markdownImageResponseTypeName,
  ]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      ></ParserMethodFormField>
      {tcadpOptionsShown && (
        <>
          <RAGFlowFormItem
            name={tableResultTypeName}
            label={t('flow.tableResultType') || '表格返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={tableResultTypeOptions}
              ></SelectWithSearch>
            )}
          </RAGFlowFormItem>
          <RAGFlowFormItem
            name={markdownImageResponseTypeName}
            label={t('flow.markdownImageResponseType') || '图片返回形式'}
          >
            {(field) => (
              <SelectWithSearch
                value={field.value}
                onChange={field.onChange}
                options={markdownImageResponseTypeOptions}
              ></SelectWithSearch>
            )}
          </RAGFlowFormItem>
        </>
      )}
    </>
  );
}

View File

@ -214,6 +214,36 @@ function transformParserParams(params: ParserFormSchemaType) {
parse_method: cur.parse_method,
lang: cur.lang,
};
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
filteredSetup.table_result_type = cur.table_result_type;
filteredSetup.markdown_image_response_type =
cur.markdown_image_response_type;
}
break;
case FileType.Spreadsheet:
filteredSetup = {
...filteredSetup,
parse_method: cur.parse_method,
};
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
filteredSetup.table_result_type = cur.table_result_type;
filteredSetup.markdown_image_response_type =
cur.markdown_image_response_type;
}
break;
case FileType.PowerPoint:
filteredSetup = {
...filteredSetup,
parse_method: cur.parse_method,
};
// Only include TCADP parameters if TCADP Parser is selected
if (cur.parse_method?.toLowerCase() === 'tcadp parser') {
filteredSetup.table_result_type = cur.table_result_type;
filteredSetup.markdown_image_response_type =
cur.markdown_image_response_type;
}
break;
case FileType.Image:
filteredSetup = {

View File

View File

@ -0,0 +1,40 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import { isEmpty } from 'lodash';
import { useEffect } from 'react';
import { useFormContext } from 'react-hook-form';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';
/**
 * Parser settings for PowerPoint files: a parse-method selector limited to
 * DeepDOC and TCADPParser, defaulting to DeepDOC when nothing is stored yet.
 *
 * @param prefix - Passed to `buildFieldNameWithPrefix` to build the form
 *   field name for this parser setup entry.
 */
export function PptFormFields({ prefix }: CommonProps) {
  const formApi = useFormContext();
  const methodFieldName = buildFieldNameWithPrefix('parse_method', prefix);

  // PPT only supports DeepDOC and TCADPParser
  const optionsWithoutLLM = [
    ParseDocumentType.DeepDOC,
    ParseDocumentType.TCADPParser,
  ].map((method) => ({ label: method, value: method }));

  useEffect(() => {
    // Leave an already-chosen parse method untouched.
    if (!isEmpty(formApi.getValues(methodFieldName))) {
      return;
    }
    formApi.setValue(methodFieldName, ParseDocumentType.DeepDOC, {
      shouldValidate: true,
      shouldDirty: true,
    });
  }, [formApi, methodFieldName]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      />
    </>
  );
}

View File

@ -0,0 +1,40 @@
import { ParseDocumentType } from '@/components/layout-recognize-form-field';
import { isEmpty } from 'lodash';
import { useEffect } from 'react';
import { useFormContext } from 'react-hook-form';
import { ParserMethodFormField } from './common-form-fields';
import { CommonProps } from './interface';
import { buildFieldNameWithPrefix } from './utils';
/**
 * Parser settings for spreadsheet files: a parse-method selector limited to
 * DeepDOC and TCADPParser, defaulting to DeepDOC when nothing is stored yet.
 *
 * @param prefix - Passed to `buildFieldNameWithPrefix` to build the form
 *   field name for this parser setup entry.
 */
export function SpreadsheetFormFields({ prefix }: CommonProps) {
  const formApi = useFormContext();
  const methodFieldName = buildFieldNameWithPrefix('parse_method', prefix);

  // Spreadsheet only supports DeepDOC and TCADPParser
  const optionsWithoutLLM = [
    ParseDocumentType.DeepDOC,
    ParseDocumentType.TCADPParser,
  ].map((method) => ({ label: method, value: method }));

  useEffect(() => {
    // Leave an already-chosen parse method untouched.
    if (!isEmpty(formApi.getValues(methodFieldName))) {
      return;
    }
    formApi.setValue(methodFieldName, ParseDocumentType.DeepDOC, {
      shouldValidate: true,
      shouldDirty: true,
    });
  }, [formApi, methodFieldName]);

  return (
    <>
      <ParserMethodFormField
        prefix={prefix}
        optionsWithoutLLM={optionsWithoutLLM}
      />
    </>
  );
}

View File