feat: add ingestion pipeline children delimiters configs (#11979)

### What problem does this PR solve?

Add children delimiter settings to the ingestion pipeline configuration.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Jimmy Ben Klieve
2025-12-17 11:18:54 +08:00
committed by GitHub
parent 30019dab9f
commit 2595644dfd
10 changed files with 218 additions and 3 deletions

View File

@ -0,0 +1,116 @@
import { cn } from '@/lib/utils';
import { forwardRef } from 'react';
import { useFormContext } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import {
FormControl,
FormField,
FormItem,
FormLabel,
FormMessage,
} from './ui/form';
import { Input, InputProps } from './ui/input';
import { Switch } from './ui/switch';
/** Value/change contract layered on top of the base input props. */
interface IProps {
  /** Current value supplied by the owner of the input; may be absent. */
  value?: string | undefined;
  /** Called with the new value (not a DOM event) whenever the input changes. */
  onChange?: (val: string | undefined) => void;
}
/**
 * Text input for editing delimiter strings that may contain control
 * characters.
 *
 * The value handed in through `value` can hold real newline, tab and
 * carriage-return characters. They are shown as the two-character escape
 * sequences `\n`, `\t` and `\r`, and whatever is typed is converted back to
 * the real characters before `onChange` is invoked.
 *
 * @param value - Raw delimiter string (with real control characters).
 * @param onChange - Receives the raw string after unescaping the typed text.
 * @param className - Extra classes, merged with the `bg-bg-base` base class.
 */
export const DelimiterInput = forwardRef<HTMLInputElement, InputProps & IProps>(
  (
    { value, onChange, maxLength, defaultValue, className, ...props },
    ref,
  ) => {
    // Escape control characters so they are visible and editable as text.
    const displayValue = value
      ?.replaceAll('\n', '\\n')
      .replaceAll('\t', '\\t')
      .replaceAll('\r', '\\r');

    const handleInputChange = (e: React.ChangeEvent<HTMLInputElement>) => {
      // Turn the typed escape sequences back into the characters they denote.
      const rawValue = e.target.value
        .replaceAll('\\n', '\n')
        .replaceAll('\\t', '\t')
        .replaceAll('\\r', '\r');
      onChange?.(rawValue);
    };

    return (
      <Input
        {...props}
        value={displayValue}
        onChange={handleInputChange}
        maxLength={maxLength}
        defaultValue={defaultValue}
        ref={ref}
        // Applied after the spread so a caller-provided className is merged
        // with the base class instead of replacing it.
        className={cn('bg-bg-base', className)}
      />
    );
  },
);
/**
 * Form section for the children-delimiter option of the parser config.
 *
 * Renders a switch bound to `parser_config.enable_children` and, while that
 * switch is on, a delimiter input bound to `parser_config.children_delimiter`.
 * Must be rendered inside a react-hook-form provider, since it reads the form
 * through `useFormContext`.
 */
export function ChildrenDelimiterForm() {
  const { t } = useTranslation();
  const form = useFormContext();
  const delimiterValue = form.watch('parser_config.children_delimiter');
  // Subscribe to the flag: `getValues` only returns a snapshot and does not
  // cause a re-render, so the delimiter field could stay stale after a toggle.
  const childrenEnabled = form.watch('parser_config.enable_children');

  return (
    <fieldset className="space-y-2">
      <FormField
        control={form.control}
        name="parser_config.enable_children"
        render={({ field: { value, onChange, ...restProps } }) => (
          <FormItem className="items-center space-y-0 ">
            <div className="flex items-center justify-between gap-1">
              <FormLabel>
                {t('knowledgeDetails.enableChildrenDelimiter')}
              </FormLabel>
              <div className="flex-none">
                <FormControl>
                  <Switch
                    checked={value}
                    onCheckedChange={(checked) => {
                      // Seed a newline delimiter when enabling with an empty
                      // delimiter value.
                      if (checked && !delimiterValue) {
                        form.setValue('parser_config.children_delimiter', '\n');
                      }
                      onChange(checked);
                    }}
                    {...restProps}
                  />
                </FormControl>
              </div>
            </div>
          </FormItem>
        )}
      />
      {childrenEnabled && (
        <FormField
          control={form.control}
          name="parser_config.children_delimiter"
          render={({ field }) => (
            <FormItem className="items-center space-y-0 ">
              <div className="flex items-center gap-1">
                <FormLabel
                  required
                  tooltip={t('knowledgeDetails.childrenDelimiterTip')}
                  className="text-sm text-text-secondary whitespace-break-spaces w-1/4"
                >
                  {t('knowledgeDetails.childrenDelimiter')}
                </FormLabel>
                <div className="w-3/4">
                  <FormControl>
                    <DelimiterInput {...field} />
                  </FormControl>
                </div>
              </div>
              <div className="flex pt-1">
                {/* Empty spacer keeps the message aligned under the input column. */}
                <div className="w-1/4"></div>
                <FormMessage />
              </div>
            </FormItem>
          )}
        />
      )}
    </fieldset>
  );
}

View File

@ -34,6 +34,7 @@ import {
AutoKeywordsFormField, AutoKeywordsFormField,
AutoQuestionsFormField, AutoQuestionsFormField,
} from '../auto-keywords-form-field'; } from '../auto-keywords-form-field';
import { ChildrenDelimiterForm } from '../children-delimiter-form';
import { DataFlowSelect } from '../data-pipeline-select'; import { DataFlowSelect } from '../data-pipeline-select';
import { DelimiterFormField } from '../delimiter-form-field'; import { DelimiterFormField } from '../delimiter-form-field';
import { EntityTypesFormField } from '../entity-types-form-field'; import { EntityTypesFormField } from '../entity-types-form-field';
@ -111,6 +112,8 @@ export function ChunkMethodDialog({
layout_recognize: z.string().optional(), layout_recognize: z.string().optional(),
chunk_token_num: z.coerce.number().optional(), chunk_token_num: z.coerce.number().optional(),
delimiter: z.string().optional(), delimiter: z.string().optional(),
enable_children: z.boolean().optional(),
children_delimiter: z.string().optional(),
auto_keywords: z.coerce.number().optional(), auto_keywords: z.coerce.number().optional(),
auto_questions: z.coerce.number().optional(), auto_questions: z.coerce.number().optional(),
html4excel: z.boolean().optional(), html4excel: z.boolean().optional(),
@ -196,6 +199,10 @@ export function ChunkMethodDialog({
...data, ...data,
parser_config: { parser_config: {
...data.parser_config, ...data.parser_config,
// Unset children delimiter if this option is not enabled
children_delimiter: data.parser_config.enable_children
? data.parser_config.children_delimiter
: null,
pages: data.parser_config?.pages?.map((x: any) => [x.from, x.to]) ?? [], pages: data.parser_config?.pages?.map((x: any) => [x.from, x.to]) ?? [],
}, },
}; };
@ -333,6 +340,7 @@ export function ChunkMethodDialog({
} }
></MaxTokenNumberFormField> ></MaxTokenNumberFormField>
<DelimiterFormField></DelimiterFormField> <DelimiterFormField></DelimiterFormField>
<ChildrenDelimiterForm />
</> </>
)} )}
</FormContainer> </FormContainer>

View File

@ -12,6 +12,8 @@ export function useDefaultParserValues() {
layout_recognize: ParseDocumentType.DeepDOC, layout_recognize: ParseDocumentType.DeepDOC,
chunk_token_num: 512, chunk_token_num: 512,
delimiter: '\n', delimiter: '\n',
enable_children: false,
children_delimiter: '\n',
auto_keywords: 0, auto_keywords: 0,
auto_questions: 0, auto_questions: 0,
html4excel: false, html4excel: false,

View File

@ -295,6 +295,11 @@ export default {
delimiter: `Delimiter for text`, delimiter: `Delimiter for text`,
delimiterTip: delimiterTip:
'A delimiter or separator can consist of one or multiple special characters. If it is multiple characters, ensure they are enclosed in backticks( ``). For example, if you configure your delimiters like this: \\n`##`;, then your texts will be separated at line breaks, double hash symbols (##), and semicolons.', 'A delimiter or separator can consist of one or multiple special characters. If it is multiple characters, ensure they are enclosed in backticks( ``). For example, if you configure your delimiters like this: \\n`##`;, then your texts will be separated at line breaks, double hash symbols (##), and semicolons.',
enableChildrenDelimiter: 'Child chunk are used for retrieval',
childrenDelimiter: 'Delimiter for text',
childrenDelimiterTip:
'A delimiter or separator can consist of one or multiple special characters. If it is multiple characters, ensure they are enclosed in backticks( ``). For example, if you configure your delimiters like this: \\n`##`;, then your texts will be separated at line breaks, double hash symbols (##), and semicolons.',
html4excel: 'Excel to HTML', html4excel: 'Excel to HTML',
html4excelTip: `Use with the General chunking method. When disabled, spreadsheets (XLSX or XLS(Excel 97-2003)) in the knowledge base will be parsed into key-value pairs. When enabled, they will be parsed into HTML tables, splitting every 12 rows if the original table has more than 12 rows. See https://ragflow.io/docs/dev/enable_excel2html for details.`, html4excelTip: `Use with the General chunking method. When disabled, spreadsheets (XLSX or XLS(Excel 97-2003)) in the knowledge base will be parsed into key-value pairs. When enabled, they will be parsed into HTML tables, splitting every 12 rows if the original table has more than 12 rows. See https://ragflow.io/docs/dev/enable_excel2html for details.`,
autoKeywords: 'Auto-keyword', autoKeywords: 'Auto-keyword',
@ -779,7 +784,7 @@ This auto-tagging feature enhances retrieval by adding another layer of domain-s
'The base URL of your Confluence instance (e.g., https://your-domain.atlassian.net/wiki)', 'The base URL of your Confluence instance (e.g., https://your-domain.atlassian.net/wiki)',
confluenceSpaceKeyTip: confluenceSpaceKeyTip:
'Optional: Specify a space key to limit syncing to a specific space. Leave empty to sync all accessible spaces. For multiple spaces, separate with commas (e.g., DEV,DOCS,HR)', 'Optional: Specify a space key to limit syncing to a specific space. Leave empty to sync all accessible spaces. For multiple spaces, separate with commas (e.g., DEV,DOCS,HR)',
s3PrefixTip: `Specify the folder path within your S3 bucket to fetch files from. s3PrefixTip: `Specify the folder path within your S3 bucket to fetch files from.
Example: general/v2/`, Example: general/v2/`,
S3CompatibleEndpointUrlTip: `Required for S3 compatible Storage Box. Specify the S3-compatible endpoint URL. S3CompatibleEndpointUrlTip: `Required for S3 compatible Storage Box. Specify the S3-compatible endpoint URL.
Example: https://fsn1.your-objectstorage.com`, Example: https://fsn1.your-objectstorage.com`,
@ -1199,6 +1204,7 @@ Example: Virtual Hosted Style`,
tab: 'Tab', tab: 'Tab',
space: 'Space', space: 'Space',
delimiters: 'Delimiters', delimiters: 'Delimiters',
enableChildrenDelimiters: 'Child chunk are used for retrieval',
merge: 'Merge', merge: 'Merge',
split: 'Split', split: 'Split',
script: 'Script', script: 'Script',

View File

@ -2,7 +2,8 @@ import { DelimiterInput } from '@/components/delimiter-form-field';
import { RAGFlowFormItem } from '@/components/ragflow-form'; import { RAGFlowFormItem } from '@/components/ragflow-form';
import { SliderInputFormField } from '@/components/slider-input-form-field'; import { SliderInputFormField } from '@/components/slider-input-form-field';
import { BlockButton, Button } from '@/components/ui/button'; import { BlockButton, Button } from '@/components/ui/button';
import { Form } from '@/components/ui/form'; import { Form, FormControl, FormField, FormItem } from '@/components/ui/form';
import { Switch } from '@/components/ui/switch';
import { zodResolver } from '@hookform/resolvers/zod'; import { zodResolver } from '@hookform/resolvers/zod';
import { Trash2 } from 'lucide-react'; import { Trash2 } from 'lucide-react';
import { memo } from 'react'; import { memo } from 'react';
@ -26,6 +27,12 @@ export const FormSchema = z.object({
value: z.string().optional(), value: z.string().optional(),
}), }),
), ),
enable_children: z.boolean(),
children_delimiters: z.array(
z.object({
value: z.string().optional(),
}),
),
overlapped_percent: z.number(), // 0.0 - 0.3 , 0% - 30% overlapped_percent: z.number(), // 0.0 - 0.3 , 0% - 30%
}); });
@ -46,6 +53,11 @@ const SplitterForm = ({ node }: INextOperatorForm) => {
control: form.control, control: form.control,
}); });
const childrenDelimiters = useFieldArray({
name: 'children_delimiters',
control: form.control,
});
useWatchFormChange(node?.id, form); useWatchFormChange(node?.id, form);
return ( return (
@ -90,6 +102,59 @@ const SplitterForm = ({ node }: INextOperatorForm) => {
<BlockButton onClick={() => append({ value: '\n' })}> <BlockButton onClick={() => append({ value: '\n' })}>
{t('common.add')} {t('common.add')}
</BlockButton> </BlockButton>
<fieldset>
<div className="mb-2 flex justify-between items-center gap-1">
<span>{t('flow.enableChildrenDelimiters')}</span>
<FormField
control={form.control}
name="enable_children"
render={({ field: { value, onChange, ...restProps } }) => (
<FormItem>
<FormControl>
<Switch
checked={value}
onCheckedChange={onChange}
{...restProps}
/>
</FormControl>
</FormItem>
)}
/>
</div>
{form.getValues('enable_children') && (
<div className="space-y-4">
{childrenDelimiters.fields.map((field, index) => (
<div key={field.id} className="flex items-center gap-2">
<RAGFlowFormItem
name={`children_delimiters.${index}.value`}
label="children_delimiter"
labelClassName="!hidden"
className="flex-auto space-y-0"
>
<DelimiterInput className="!m-0"></DelimiterInput>
</RAGFlowFormItem>
<Button
type="button"
variant="ghost"
onClick={() => childrenDelimiters.remove(index)}
>
<Trash2 />
</Button>
</div>
))}
<BlockButton
onClick={() => childrenDelimiters.append({ value: '\n' })}
>
{t('common.add')}
</BlockButton>
</div>
)}
</fieldset>
</FormWrapper> </FormWrapper>
<div className="p-5"> <div className="p-5">
<Output list={outputList}></Output> <Output list={outputList}></Output>

View File

@ -288,6 +288,11 @@ function transformSplitterParams(params: SplitterFormSchemaType) {
...params, ...params,
overlapped_percent: Number(params.overlapped_percent) / 100, overlapped_percent: Number(params.overlapped_percent) / 100,
delimiters: transformObjectArrayToPureArray(params.delimiters, 'value'), delimiters: transformObjectArrayToPureArray(params.delimiters, 'value'),
// Unset children delimiters if this option is not enabled
children_delimiters: params.enable_children
? transformObjectArrayToPureArray(params.children_delimiters, 'value')
: [],
}; };
} }
@ -713,7 +718,7 @@ export function convertToObjectArray<T extends string | number | boolean>(
/** /**
* convert the following object into a list * convert the following object into a list
* *
* { * {
"product_related": { "product_related": {
"description": "The question is about product usage, appearance and how it works.", "description": "The question is about product usage, appearance and how it works.",

View File

@ -2,6 +2,7 @@ import {
AutoKeywordsFormField, AutoKeywordsFormField,
AutoQuestionsFormField, AutoQuestionsFormField,
} from '@/components/auto-keywords-form-field'; } from '@/components/auto-keywords-form-field';
import { ChildrenDelimiterForm } from '@/components/children-delimiter-form';
import { DelimiterFormField } from '@/components/delimiter-form-field'; import { DelimiterFormField } from '@/components/delimiter-form-field';
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field'; import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field'; import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
@ -21,6 +22,7 @@ export function NaiveConfiguration() {
<MinerUOptionsFormField></MinerUOptionsFormField> <MinerUOptionsFormField></MinerUOptionsFormField>
<MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField> <MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField>
<DelimiterFormField></DelimiterFormField> <DelimiterFormField></DelimiterFormField>
<ChildrenDelimiterForm />
<EnableTocToggle /> <EnableTocToggle />
<OverlappedPercent /> <OverlappedPercent />
</ConfigurationFormContainer> </ConfigurationFormContainer>

View File

@ -24,6 +24,8 @@ export const formSchema = z
layout_recognize: z.string(), layout_recognize: z.string(),
chunk_token_num: z.number(), chunk_token_num: z.number(),
delimiter: z.string(), delimiter: z.string(),
enable_children: z.boolean(),
children_delimiter: z.string(),
auto_keywords: z.number().optional(), auto_keywords: z.number().optional(),
auto_questions: z.number().optional(), auto_questions: z.number().optional(),
html4excel: z.boolean(), html4excel: z.boolean(),

View File

@ -63,6 +63,8 @@ export default function DatasetSettings() {
layout_recognize: DocumentType.DeepDOC, layout_recognize: DocumentType.DeepDOC,
chunk_token_num: 512, chunk_token_num: 512,
delimiter: `\n`, delimiter: `\n`,
enable_children: false,
children_delimiter: `\n`,
auto_keywords: 0, auto_keywords: 0,
auto_questions: 0, auto_questions: 0,
html4excel: false, html4excel: false,

View File

@ -67,6 +67,13 @@ export function SavingButton() {
await saveKnowledgeConfiguration({ await saveKnowledgeConfiguration({
kb_id, kb_id,
...values, ...values,
parser_config: {
...values.parser_config,
// Unset children delimiter if this option is not enabled
children_delimiter: values.parser_config.enable_children
? values.parser_config.children_delimiter
: null,
},
}); });
})(); })();
} }