Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
This commit is contained in:
Kevin Hu
2025-10-09 12:36:19 +08:00
committed by GitHub
parent ef0aecea3b
commit cbf04ee470
490 changed files with 10630 additions and 30688 deletions

View File

@ -18,8 +18,11 @@ import { useFetchKnowledgeBaseConfiguration } from '@/hooks/use-knowledge-reques
import { IModalProps } from '@/interfaces/common';
import { IParserConfig } from '@/interfaces/database/document';
import { IChangeParserConfigRequestBody } from '@/interfaces/request/document';
import {
ChunkMethodItem,
ParseTypeItem,
} from '@/pages/dataset/dataset-setting/configuration/common-item';
import { zodResolver } from '@hookform/resolvers/zod';
import get from 'lodash/get';
import omit from 'lodash/omit';
import {} from 'module';
import { useEffect, useMemo } from 'react';
@ -30,24 +33,17 @@ import {
AutoKeywordsFormField,
AutoQuestionsFormField,
} from '../auto-keywords-form-field';
import { DataFlowSelect } from '../data-pipeline-select';
import { DelimiterFormField } from '../delimiter-form-field';
import { EntityTypesFormField } from '../entity-types-form-field';
import { ExcelToHtmlFormField } from '../excel-to-html-form-field';
import { FormContainer } from '../form-container';
import { LayoutRecognizeFormField } from '../layout-recognize-form-field';
import { MaxTokenNumberFormField } from '../max-token-number-from-field';
import {
UseGraphRagFormField,
showGraphRagItems,
} from '../parse-configuration/graph-rag-form-fields';
import RaptorFormFields, {
showRaptorParseConfiguration,
} from '../parse-configuration/raptor-form-fields';
import { ButtonLoading } from '../ui/button';
import { Input } from '../ui/input';
import { RAGFlowSelect } from '../ui/select';
import { DynamicPageRange } from './dynamic-page-range';
import { useFetchParserListOnMount, useShowAutoKeywords } from './hooks';
import { useShowAutoKeywords } from './hooks';
import {
useDefaultParserValues,
useFillDefaultValueOnMount,
@ -62,6 +58,7 @@ interface IProps
}> {
loading: boolean;
parserId: string;
pipelineId?: string;
parserConfig: IParserConfig;
documentExtension: string;
documentId: string;
@ -80,6 +77,7 @@ export function ChunkMethodDialog({
hideModal,
onOk,
parserId,
pipelineId,
documentExtension,
visible,
parserConfig,
@ -87,8 +85,6 @@ export function ChunkMethodDialog({
}: IProps) {
const { t } = useTranslation();
const { parserList } = useFetchParserListOnMount(documentExtension);
const { data: knowledgeDetails } = useFetchKnowledgeBaseConfiguration();
const useGraphRag = useMemo(() => {
@ -99,46 +95,59 @@ export function ChunkMethodDialog({
const fillDefaultParserValue = useFillDefaultValueOnMount();
const FormSchema = z.object({
parser_id: z
.string()
.min(1, {
message: t('common.pleaseSelect'),
})
.trim(),
parser_config: z.object({
task_page_size: z.coerce.number().optional(),
layout_recognize: z.string().optional(),
chunk_token_num: z.coerce.number().optional(),
delimiter: z.string().optional(),
auto_keywords: z.coerce.number().optional(),
auto_questions: z.coerce.number().optional(),
html4excel: z.boolean().optional(),
raptor: z
.object({
use_raptor: z.boolean().optional(),
prompt: z.string().optional().optional(),
max_token: z.coerce.number().optional(),
threshold: z.coerce.number().optional(),
max_cluster: z.coerce.number().optional(),
random_seed: z.coerce.number().optional(),
const FormSchema = z
.object({
parseType: z.number(),
parser_id: z
.string()
.min(1, {
message: t('common.pleaseSelect'),
})
.optional(),
graphrag: z.object({
use_graphrag: z.boolean().optional(),
.trim(),
pipeline_id: z.string().optional(),
parser_config: z.object({
task_page_size: z.coerce.number().optional(),
layout_recognize: z.string().optional(),
chunk_token_num: z.coerce.number().optional(),
delimiter: z.string().optional(),
auto_keywords: z.coerce.number().optional(),
auto_questions: z.coerce.number().optional(),
html4excel: z.boolean().optional(),
// raptor: z
// .object({
// use_raptor: z.boolean().optional(),
// prompt: z.string().optional().optional(),
// max_token: z.coerce.number().optional(),
// threshold: z.coerce.number().optional(),
// max_cluster: z.coerce.number().optional(),
// random_seed: z.coerce.number().optional(),
// })
// .optional(),
// graphrag: z.object({
// use_graphrag: z.boolean().optional(),
// }),
entity_types: z.array(z.string()).optional(),
pages: z
.array(z.object({ from: z.coerce.number(), to: z.coerce.number() }))
.optional(),
}),
entity_types: z.array(z.string()).optional(),
pages: z
.array(z.object({ from: z.coerce.number(), to: z.coerce.number() }))
.optional(),
}),
});
})
.superRefine((data, ctx) => {
if (data.parseType === 2 && !data.pipeline_id) {
ctx.addIssue({
path: ['pipeline_id'],
message: t('common.pleaseSelect'),
code: 'custom',
});
}
});
const form = useForm<z.infer<typeof FormSchema>>({
resolver: zodResolver(FormSchema),
defaultValues: {
parser_id: parserId,
parser_id: parserId || '',
pipeline_id: pipelineId || '',
parseType: pipelineId ? 2 : 1,
parser_config: defaultParserValues,
},
});
@ -200,17 +209,19 @@ export function ChunkMethodDialog({
const pages =
parserConfig?.pages?.map((x) => ({ from: x[0], to: x[1] })) ?? [];
form.reset({
parser_id: parserId,
parser_id: parserId || '',
pipeline_id: pipelineId || '',
parseType: pipelineId ? 2 : 1,
parser_config: fillDefaultParserValue({
pages: pages.length > 0 ? pages : [{ from: 1, to: 1024 }],
...omit(parserConfig, 'pages'),
graphrag: {
use_graphrag: get(
parserConfig,
'graphrag.use_graphrag',
useGraphRag,
),
},
// graphrag: {
// use_graphrag: get(
// parserConfig,
// 'graphrag.use_graphrag',
// useGraphRag,
// ),
// },
}),
});
}
@ -220,10 +231,20 @@ export function ChunkMethodDialog({
knowledgeDetails.parser_config,
parserConfig,
parserId,
pipelineId,
useGraphRag,
visible,
]);
const parseType = useWatch({
control: form.control,
name: 'parseType',
defaultValue: pipelineId ? 2 : 1,
});
useEffect(() => {
if (parseType === 1) {
form.setValue('pipeline_id', '');
}
}, [parseType, form]);
return (
<Dialog open onOpenChange={hideModal}>
<DialogContent className="max-w-[50vw]">
@ -237,7 +258,17 @@ export function ChunkMethodDialog({
id={FormId}
>
<FormContainer>
<FormField
<ParseTypeItem />
{parseType === 1 && <ChunkMethodItem></ChunkMethodItem>}
{parseType === 2 && (
<DataFlowSelect
isMult={false}
// toDataPipeline={navigateToAgents}
formFieldName="pipeline_id"
/>
)}
{/* <FormField
control={form.control}
name="parser_id"
render={({ field }) => (
@ -252,9 +283,11 @@ export function ChunkMethodDialog({
<FormMessage />
</FormItem>
)}
/>
{showPages && <DynamicPageRange></DynamicPageRange>}
{showPages && layoutRecognize && (
/> */}
{showPages && parseType === 1 && (
<DynamicPageRange></DynamicPageRange>
)}
{showPages && parseType === 1 && layoutRecognize && (
<FormField
control={form.control}
name="parser_config.task_page_size"
@ -279,50 +312,60 @@ export function ChunkMethodDialog({
/>
)}
</FormContainer>
<FormContainer
show={showOne || showMaxTokenNumber}
className="space-y-3"
>
{showOne && <LayoutRecognizeFormField></LayoutRecognizeFormField>}
{showMaxTokenNumber && (
<>
<MaxTokenNumberFormField
max={
selectedTag === DocumentParserType.KnowledgeGraph
? 8192 * 2
: 2048
}
></MaxTokenNumberFormField>
<DelimiterFormField></DelimiterFormField>
</>
)}
</FormContainer>
<FormContainer
show={showAutoKeywords(selectedTag) || showExcelToHtml}
className="space-y-3"
>
{showAutoKeywords(selectedTag) && (
<>
<AutoKeywordsFormField></AutoKeywordsFormField>
<AutoQuestionsFormField></AutoQuestionsFormField>
</>
)}
{showExcelToHtml && <ExcelToHtmlFormField></ExcelToHtmlFormField>}
</FormContainer>
{showRaptorParseConfiguration(
selectedTag as DocumentParserType,
) && (
<FormContainer>
<RaptorFormFields></RaptorFormFields>
</FormContainer>
)}
{showGraphRagItems(selectedTag as DocumentParserType) &&
useGraphRag && (
<FormContainer>
<UseGraphRagFormField></UseGraphRagFormField>
{parseType === 1 && (
<>
<FormContainer
show={showOne || showMaxTokenNumber}
className="space-y-3"
>
{showOne && (
<LayoutRecognizeFormField></LayoutRecognizeFormField>
)}
{showMaxTokenNumber && (
<>
<MaxTokenNumberFormField
max={
selectedTag === DocumentParserType.KnowledgeGraph
? 8192 * 2
: 2048
}
></MaxTokenNumberFormField>
<DelimiterFormField></DelimiterFormField>
</>
)}
</FormContainer>
)}
{showEntityTypes && <EntityTypesFormField></EntityTypesFormField>}
<FormContainer
show={showAutoKeywords(selectedTag) || showExcelToHtml}
className="space-y-3"
>
{showAutoKeywords(selectedTag) && (
<>
<AutoKeywordsFormField></AutoKeywordsFormField>
<AutoQuestionsFormField></AutoQuestionsFormField>
</>
)}
{showExcelToHtml && (
<ExcelToHtmlFormField></ExcelToHtmlFormField>
)}
</FormContainer>
{/* {showRaptorParseConfiguration(
selectedTag as DocumentParserType,
) && (
<FormContainer>
<RaptorFormFields></RaptorFormFields>
</FormContainer>
)} */}
{/* {showGraphRagItems(selectedTag as DocumentParserType) &&
useGraphRag && (
<FormContainer>
<UseGraphRagFormField></UseGraphRagFormField>
</FormContainer>
)} */}
{showEntityTypes && (
<EntityTypesFormField></EntityTypesFormField>
)}
</>
)}
</form>
</Form>
<DialogFooter>