mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-20 04:39:00 +08:00
feat: add ingestion pipeline children delimiters configs (#11979)
### What problem does this PR solve?

Add children delimiter settings to the ingestion pipeline config.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
116
web/src/components/children-delimiter-form.tsx
Normal file
116
web/src/components/children-delimiter-form.tsx
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
import { cn } from '@/lib/utils';
|
||||||
|
import { forwardRef } from 'react';
|
||||||
|
import { useFormContext } from 'react-hook-form';
|
||||||
|
import { useTranslation } from 'react-i18next';
|
||||||
|
import {
|
||||||
|
FormControl,
|
||||||
|
FormField,
|
||||||
|
FormItem,
|
||||||
|
FormLabel,
|
||||||
|
FormMessage,
|
||||||
|
} from './ui/form';
|
||||||
|
import { Input, InputProps } from './ui/input';
|
||||||
|
import { Switch } from './ui/switch';
|
||||||
|
|
||||||
|
/**
 * Props that `DelimiterInput` layers on top of the base `Input` props.
 */
interface IProps {
  /** Raw delimiter string; may contain real newline, tab and carriage-return characters. */
  value?: string | undefined;
  /** Called with the raw delimiter after typed escape sequences have been decoded. */
  onChange?: (val: string | undefined) => void;
}

/**
 * Text input for editing a delimiter that may contain control characters.
 *
 * The incoming `value` has its newline, tab and carriage-return characters
 * rendered as the two-character escape sequences `\n`, `\t` and `\r`. When the
 * user edits the text, those same sequences are decoded back into the real
 * control characters before `onChange` is invoked.
 *
 * The caller's `className` is merged with the default `bg-bg-base` class
 * instead of replacing it.
 */
export const DelimiterInput = forwardRef<HTMLInputElement, InputProps & IProps>(
  ({ value, onChange, maxLength, defaultValue, className, ...props }, ref) => {
    // Encode control characters so they show up as readable escape sequences.
    const displayValue = value
      ?.replaceAll('\n', '\\n')
      .replaceAll('\t', '\\t')
      .replaceAll('\r', '\\r');

    const handleInputChange = (e: React.ChangeEvent<HTMLInputElement>) => {
      // Decode the escape sequences typed by the user back into control characters.
      const rawValue = e.target.value
        .replaceAll('\\n', '\n')
        .replaceAll('\\t', '\t')
        .replaceAll('\\r', '\r');
      onChange?.(rawValue);
    };

    return (
      <Input
        value={displayValue}
        onChange={handleInputChange}
        maxLength={maxLength}
        defaultValue={defaultValue}
        ref={ref}
        // `className` is pulled out of the rest props above; otherwise the
        // spread below would overwrite the merged class list.
        className={cn('bg-bg-base', className)}
        {...props}
      />
    );
  },
);

// forwardRef wraps an anonymous arrow function; name it for React DevTools.
DelimiterInput.displayName = 'DelimiterInput';
|
||||||
|
|
||||||
|
/**
 * Form section that toggles child-chunk splitting and edits its delimiter.
 *
 * Renders a switch bound to `parser_config.enable_children`. While the switch
 * is on, a `DelimiterInput` bound to `parser_config.children_delimiter` is
 * shown below it. Must be rendered inside a react-hook-form `FormProvider`,
 * because the form instance is obtained through `useFormContext`.
 */
export function ChildrenDelimiterForm() {
  const { t } = useTranslation();
  const form = useFormContext();

  // Both values are read through `watch` so this component re-renders when
  // either one changes. `getValues` would only return a snapshot and would not
  // show/hide the delimiter field when the switch is toggled.
  const delimiterValue = form.watch('parser_config.children_delimiter');
  const childrenEnabled = form.watch('parser_config.enable_children');

  return (
    <fieldset className="space-y-2">
      <FormField
        control={form.control}
        name="parser_config.enable_children"
        render={({ field: { value, onChange, ...restProps } }) => (
          <FormItem className="items-center space-y-0 ">
            <div className="flex items-center justify-between gap-1">
              <FormLabel>
                {t('knowledgeDetails.enableChildrenDelimiter')}
              </FormLabel>

              <div className="flex-none">
                <FormControl>
                  <Switch
                    checked={value}
                    onCheckedChange={(checked) => {
                      // Seed a newline delimiter when the option is switched
                      // on while the delimiter is still empty/unset.
                      if (checked && !delimiterValue) {
                        form.setValue('parser_config.children_delimiter', '\n');
                      }

                      onChange(checked);
                    }}
                    {...restProps}
                  />
                </FormControl>
              </div>
            </div>
          </FormItem>
        )}
      />

      {childrenEnabled && (
        <FormField
          control={form.control}
          name="parser_config.children_delimiter"
          render={({ field }) => (
            <FormItem className="items-center space-y-0 ">
              <div className="flex items-center gap-1">
                <FormLabel
                  required
                  tooltip={t('knowledgeDetails.childrenDelimiterTip')}
                  className="text-sm text-text-secondary whitespace-break-spaces w-1/4"
                >
                  {t('knowledgeDetails.childrenDelimiter')}
                </FormLabel>
                <div className="w-3/4">
                  <FormControl>
                    <DelimiterInput {...field} />
                  </FormControl>
                </div>
              </div>
              <div className="flex pt-1">
                {/* Spacer matching the label column so the message lines up with the input. */}
                <div className="w-1/4"></div>
                <FormMessage />
              </div>
            </FormItem>
          )}
        />
      )}
    </fieldset>
  );
}
|
||||||
@ -34,6 +34,7 @@ import {
|
|||||||
AutoKeywordsFormField,
|
AutoKeywordsFormField,
|
||||||
AutoQuestionsFormField,
|
AutoQuestionsFormField,
|
||||||
} from '../auto-keywords-form-field';
|
} from '../auto-keywords-form-field';
|
||||||
|
import { ChildrenDelimiterForm } from '../children-delimiter-form';
|
||||||
import { DataFlowSelect } from '../data-pipeline-select';
|
import { DataFlowSelect } from '../data-pipeline-select';
|
||||||
import { DelimiterFormField } from '../delimiter-form-field';
|
import { DelimiterFormField } from '../delimiter-form-field';
|
||||||
import { EntityTypesFormField } from '../entity-types-form-field';
|
import { EntityTypesFormField } from '../entity-types-form-field';
|
||||||
@ -111,6 +112,8 @@ export function ChunkMethodDialog({
|
|||||||
layout_recognize: z.string().optional(),
|
layout_recognize: z.string().optional(),
|
||||||
chunk_token_num: z.coerce.number().optional(),
|
chunk_token_num: z.coerce.number().optional(),
|
||||||
delimiter: z.string().optional(),
|
delimiter: z.string().optional(),
|
||||||
|
enable_children: z.boolean().optional(),
|
||||||
|
children_delimiter: z.string().optional(),
|
||||||
auto_keywords: z.coerce.number().optional(),
|
auto_keywords: z.coerce.number().optional(),
|
||||||
auto_questions: z.coerce.number().optional(),
|
auto_questions: z.coerce.number().optional(),
|
||||||
html4excel: z.boolean().optional(),
|
html4excel: z.boolean().optional(),
|
||||||
@ -196,6 +199,10 @@ export function ChunkMethodDialog({
|
|||||||
...data,
|
...data,
|
||||||
parser_config: {
|
parser_config: {
|
||||||
...data.parser_config,
|
...data.parser_config,
|
||||||
|
// Unset children delimiter if this option is not enabled
|
||||||
|
children_delimiter: data.parser_config.enable_children
|
||||||
|
? data.parser_config.children_delimiter
|
||||||
|
: null,
|
||||||
pages: data.parser_config?.pages?.map((x: any) => [x.from, x.to]) ?? [],
|
pages: data.parser_config?.pages?.map((x: any) => [x.from, x.to]) ?? [],
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@ -333,6 +340,7 @@ export function ChunkMethodDialog({
|
|||||||
}
|
}
|
||||||
></MaxTokenNumberFormField>
|
></MaxTokenNumberFormField>
|
||||||
<DelimiterFormField></DelimiterFormField>
|
<DelimiterFormField></DelimiterFormField>
|
||||||
|
<ChildrenDelimiterForm />
|
||||||
</>
|
</>
|
||||||
)}
|
)}
|
||||||
</FormContainer>
|
</FormContainer>
|
||||||
|
|||||||
@ -12,6 +12,8 @@ export function useDefaultParserValues() {
|
|||||||
layout_recognize: ParseDocumentType.DeepDOC,
|
layout_recognize: ParseDocumentType.DeepDOC,
|
||||||
chunk_token_num: 512,
|
chunk_token_num: 512,
|
||||||
delimiter: '\n',
|
delimiter: '\n',
|
||||||
|
enable_children: false,
|
||||||
|
children_delimiter: '\n',
|
||||||
auto_keywords: 0,
|
auto_keywords: 0,
|
||||||
auto_questions: 0,
|
auto_questions: 0,
|
||||||
html4excel: false,
|
html4excel: false,
|
||||||
|
|||||||
@ -295,6 +295,11 @@ export default {
|
|||||||
delimiter: `Delimiter for text`,
|
delimiter: `Delimiter for text`,
|
||||||
delimiterTip:
|
delimiterTip:
|
||||||
'A delimiter or separator can consist of one or multiple special characters. If it is multiple characters, ensure they are enclosed in backticks( ``). For example, if you configure your delimiters like this: \\n`##`;, then your texts will be separated at line breaks, double hash symbols (##), and semicolons.',
|
'A delimiter or separator can consist of one or multiple special characters. If it is multiple characters, ensure they are enclosed in backticks( ``). For example, if you configure your delimiters like this: \\n`##`;, then your texts will be separated at line breaks, double hash symbols (##), and semicolons.',
|
||||||
|
enableChildrenDelimiter: 'Child chunk are used for retrieval',
|
||||||
|
childrenDelimiter: 'Delimiter for text',
|
||||||
|
childrenDelimiterTip:
|
||||||
|
'A delimiter or separator can consist of one or multiple special characters. If it is multiple characters, ensure they are enclosed in backticks( ``). For example, if you configure your delimiters like this: \\n`##`;, then your texts will be separated at line breaks, double hash symbols (##), and semicolons.',
|
||||||
|
|
||||||
html4excel: 'Excel to HTML',
|
html4excel: 'Excel to HTML',
|
||||||
html4excelTip: `Use with the General chunking method. When disabled, spreadsheets (XLSX or XLS(Excel 97-2003)) in the knowledge base will be parsed into key-value pairs. When enabled, they will be parsed into HTML tables, splitting every 12 rows if the original table has more than 12 rows. See https://ragflow.io/docs/dev/enable_excel2html for details.`,
|
html4excelTip: `Use with the General chunking method. When disabled, spreadsheets (XLSX or XLS(Excel 97-2003)) in the knowledge base will be parsed into key-value pairs. When enabled, they will be parsed into HTML tables, splitting every 12 rows if the original table has more than 12 rows. See https://ragflow.io/docs/dev/enable_excel2html for details.`,
|
||||||
autoKeywords: 'Auto-keyword',
|
autoKeywords: 'Auto-keyword',
|
||||||
@ -779,7 +784,7 @@ This auto-tagging feature enhances retrieval by adding another layer of domain-s
|
|||||||
'The base URL of your Confluence instance (e.g., https://your-domain.atlassian.net/wiki)',
|
'The base URL of your Confluence instance (e.g., https://your-domain.atlassian.net/wiki)',
|
||||||
confluenceSpaceKeyTip:
|
confluenceSpaceKeyTip:
|
||||||
'Optional: Specify a space key to limit syncing to a specific space. Leave empty to sync all accessible spaces. For multiple spaces, separate with commas (e.g., DEV,DOCS,HR)',
|
'Optional: Specify a space key to limit syncing to a specific space. Leave empty to sync all accessible spaces. For multiple spaces, separate with commas (e.g., DEV,DOCS,HR)',
|
||||||
s3PrefixTip: `Specify the folder path within your S3 bucket to fetch files from.
|
s3PrefixTip: `Specify the folder path within your S3 bucket to fetch files from.
|
||||||
Example: general/v2/`,
|
Example: general/v2/`,
|
||||||
S3CompatibleEndpointUrlTip: `Required for S3 compatible Storage Box. Specify the S3-compatible endpoint URL.
|
S3CompatibleEndpointUrlTip: `Required for S3 compatible Storage Box. Specify the S3-compatible endpoint URL.
|
||||||
Example: https://fsn1.your-objectstorage.com`,
|
Example: https://fsn1.your-objectstorage.com`,
|
||||||
@ -1199,6 +1204,7 @@ Example: Virtual Hosted Style`,
|
|||||||
tab: 'Tab',
|
tab: 'Tab',
|
||||||
space: 'Space',
|
space: 'Space',
|
||||||
delimiters: 'Delimiters',
|
delimiters: 'Delimiters',
|
||||||
|
enableChildrenDelimiters: 'Child chunk are used for retrieval',
|
||||||
merge: 'Merge',
|
merge: 'Merge',
|
||||||
split: 'Split',
|
split: 'Split',
|
||||||
script: 'Script',
|
script: 'Script',
|
||||||
|
|||||||
@ -2,7 +2,8 @@ import { DelimiterInput } from '@/components/delimiter-form-field';
|
|||||||
import { RAGFlowFormItem } from '@/components/ragflow-form';
|
import { RAGFlowFormItem } from '@/components/ragflow-form';
|
||||||
import { SliderInputFormField } from '@/components/slider-input-form-field';
|
import { SliderInputFormField } from '@/components/slider-input-form-field';
|
||||||
import { BlockButton, Button } from '@/components/ui/button';
|
import { BlockButton, Button } from '@/components/ui/button';
|
||||||
import { Form } from '@/components/ui/form';
|
import { Form, FormControl, FormField, FormItem } from '@/components/ui/form';
|
||||||
|
import { Switch } from '@/components/ui/switch';
|
||||||
import { zodResolver } from '@hookform/resolvers/zod';
|
import { zodResolver } from '@hookform/resolvers/zod';
|
||||||
import { Trash2 } from 'lucide-react';
|
import { Trash2 } from 'lucide-react';
|
||||||
import { memo } from 'react';
|
import { memo } from 'react';
|
||||||
@ -26,6 +27,12 @@ export const FormSchema = z.object({
|
|||||||
value: z.string().optional(),
|
value: z.string().optional(),
|
||||||
}),
|
}),
|
||||||
),
|
),
|
||||||
|
enable_children: z.boolean(),
|
||||||
|
children_delimiters: z.array(
|
||||||
|
z.object({
|
||||||
|
value: z.string().optional(),
|
||||||
|
}),
|
||||||
|
),
|
||||||
overlapped_percent: z.number(), // 0.0 - 0.3 , 0% - 30%
|
overlapped_percent: z.number(), // 0.0 - 0.3 , 0% - 30%
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -46,6 +53,11 @@ const SplitterForm = ({ node }: INextOperatorForm) => {
|
|||||||
control: form.control,
|
control: form.control,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const childrenDelimiters = useFieldArray({
|
||||||
|
name: 'children_delimiters',
|
||||||
|
control: form.control,
|
||||||
|
});
|
||||||
|
|
||||||
useWatchFormChange(node?.id, form);
|
useWatchFormChange(node?.id, form);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
@ -90,6 +102,59 @@ const SplitterForm = ({ node }: INextOperatorForm) => {
|
|||||||
<BlockButton onClick={() => append({ value: '\n' })}>
|
<BlockButton onClick={() => append({ value: '\n' })}>
|
||||||
{t('common.add')}
|
{t('common.add')}
|
||||||
</BlockButton>
|
</BlockButton>
|
||||||
|
|
||||||
|
<fieldset>
|
||||||
|
<div className="mb-2 flex justify-between items-center gap-1">
|
||||||
|
<span>{t('flow.enableChildrenDelimiters')}</span>
|
||||||
|
|
||||||
|
<FormField
|
||||||
|
control={form.control}
|
||||||
|
name="enable_children"
|
||||||
|
render={({ field: { value, onChange, ...restProps } }) => (
|
||||||
|
<FormItem>
|
||||||
|
<FormControl>
|
||||||
|
<Switch
|
||||||
|
checked={value}
|
||||||
|
onCheckedChange={onChange}
|
||||||
|
{...restProps}
|
||||||
|
/>
|
||||||
|
</FormControl>
|
||||||
|
</FormItem>
|
||||||
|
)}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{form.getValues('enable_children') && (
|
||||||
|
<div className="space-y-4">
|
||||||
|
{childrenDelimiters.fields.map((field, index) => (
|
||||||
|
<div key={field.id} className="flex items-center gap-2">
|
||||||
|
<RAGFlowFormItem
|
||||||
|
name={`children_delimiters.${index}.value`}
|
||||||
|
label="children_delimiter"
|
||||||
|
labelClassName="!hidden"
|
||||||
|
className="flex-auto space-y-0"
|
||||||
|
>
|
||||||
|
<DelimiterInput className="!m-0"></DelimiterInput>
|
||||||
|
</RAGFlowFormItem>
|
||||||
|
|
||||||
|
<Button
|
||||||
|
type="button"
|
||||||
|
variant="ghost"
|
||||||
|
onClick={() => childrenDelimiters.remove(index)}
|
||||||
|
>
|
||||||
|
<Trash2 />
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
|
||||||
|
<BlockButton
|
||||||
|
onClick={() => childrenDelimiters.append({ value: '\n' })}
|
||||||
|
>
|
||||||
|
{t('common.add')}
|
||||||
|
</BlockButton>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</fieldset>
|
||||||
</FormWrapper>
|
</FormWrapper>
|
||||||
<div className="p-5">
|
<div className="p-5">
|
||||||
<Output list={outputList}></Output>
|
<Output list={outputList}></Output>
|
||||||
|
|||||||
@ -288,6 +288,11 @@ function transformSplitterParams(params: SplitterFormSchemaType) {
|
|||||||
...params,
|
...params,
|
||||||
overlapped_percent: Number(params.overlapped_percent) / 100,
|
overlapped_percent: Number(params.overlapped_percent) / 100,
|
||||||
delimiters: transformObjectArrayToPureArray(params.delimiters, 'value'),
|
delimiters: transformObjectArrayToPureArray(params.delimiters, 'value'),
|
||||||
|
|
||||||
|
// Unset children delimiters if this option is not enabled
|
||||||
|
children_delimiters: params.enable_children
|
||||||
|
? transformObjectArrayToPureArray(params.children_delimiters, 'value')
|
||||||
|
: [],
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -713,7 +718,7 @@ export function convertToObjectArray<T extends string | number | boolean>(
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* convert the following object into a list
|
* convert the following object into a list
|
||||||
*
|
*
|
||||||
* {
|
* {
|
||||||
"product_related": {
|
"product_related": {
|
||||||
"description": "The question is about product usage, appearance and how it works.",
|
"description": "The question is about product usage, appearance and how it works.",
|
||||||
|
|||||||
@ -2,6 +2,7 @@ import {
|
|||||||
AutoKeywordsFormField,
|
AutoKeywordsFormField,
|
||||||
AutoQuestionsFormField,
|
AutoQuestionsFormField,
|
||||||
} from '@/components/auto-keywords-form-field';
|
} from '@/components/auto-keywords-form-field';
|
||||||
|
import { ChildrenDelimiterForm } from '@/components/children-delimiter-form';
|
||||||
import { DelimiterFormField } from '@/components/delimiter-form-field';
|
import { DelimiterFormField } from '@/components/delimiter-form-field';
|
||||||
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
|
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
|
||||||
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
|
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
|
||||||
@ -21,6 +22,7 @@ export function NaiveConfiguration() {
|
|||||||
<MinerUOptionsFormField></MinerUOptionsFormField>
|
<MinerUOptionsFormField></MinerUOptionsFormField>
|
||||||
<MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField>
|
<MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField>
|
||||||
<DelimiterFormField></DelimiterFormField>
|
<DelimiterFormField></DelimiterFormField>
|
||||||
|
<ChildrenDelimiterForm />
|
||||||
<EnableTocToggle />
|
<EnableTocToggle />
|
||||||
<OverlappedPercent />
|
<OverlappedPercent />
|
||||||
</ConfigurationFormContainer>
|
</ConfigurationFormContainer>
|
||||||
|
|||||||
@ -24,6 +24,8 @@ export const formSchema = z
|
|||||||
layout_recognize: z.string(),
|
layout_recognize: z.string(),
|
||||||
chunk_token_num: z.number(),
|
chunk_token_num: z.number(),
|
||||||
delimiter: z.string(),
|
delimiter: z.string(),
|
||||||
|
enable_children: z.boolean(),
|
||||||
|
children_delimiter: z.string(),
|
||||||
auto_keywords: z.number().optional(),
|
auto_keywords: z.number().optional(),
|
||||||
auto_questions: z.number().optional(),
|
auto_questions: z.number().optional(),
|
||||||
html4excel: z.boolean(),
|
html4excel: z.boolean(),
|
||||||
|
|||||||
@ -63,6 +63,8 @@ export default function DatasetSettings() {
|
|||||||
layout_recognize: DocumentType.DeepDOC,
|
layout_recognize: DocumentType.DeepDOC,
|
||||||
chunk_token_num: 512,
|
chunk_token_num: 512,
|
||||||
delimiter: `\n`,
|
delimiter: `\n`,
|
||||||
|
enable_children: false,
|
||||||
|
children_delimiter: `\n`,
|
||||||
auto_keywords: 0,
|
auto_keywords: 0,
|
||||||
auto_questions: 0,
|
auto_questions: 0,
|
||||||
html4excel: false,
|
html4excel: false,
|
||||||
|
|||||||
@ -67,6 +67,13 @@ export function SavingButton() {
|
|||||||
await saveKnowledgeConfiguration({
|
await saveKnowledgeConfiguration({
|
||||||
kb_id,
|
kb_id,
|
||||||
...values,
|
...values,
|
||||||
|
parser_config: {
|
||||||
|
...values.parser_config,
|
||||||
|
// Unset children delimiter if this option is not enabled
|
||||||
|
children_delimiter: values.parser_config.enable_children
|
||||||
|
? values.parser_config.children_delimiter
|
||||||
|
: null,
|
||||||
|
},
|
||||||
});
|
});
|
||||||
})();
|
})();
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user