Added semi-automatic mode to the metadata filter (#11886)

### What problem does this PR solve?

Retrieval metadata filtering gains a semi-automatic mode, in which users
manually select the metadata keys that the LLM is allowed to use when
generating filter conditions.
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
TeslaZY
2025-12-11 10:45:21 +08:00
committed by GitHub
parent a6afb7dfe2
commit c610bb605a
12 changed files with 246 additions and 3 deletions

View File

@ -136,6 +136,16 @@ class Retrieval(ToolBase, ABC):
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None
elif self._param.meta_data_filter.get("method") == "semi_auto":
selected_keys = self._param.meta_data_filter.get("semi_auto", [])
if selected_keys:
filtered_metas = {key: metas[key] for key in selected_keys if key in metas}
if filtered_metas:
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT)
filters: dict = gen_meta_filter(chat_mdl, filtered_metas, query)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif self._param.meta_data_filter.get("method") == "manual": elif self._param.meta_data_filter.get("method") == "manual":
filters = self._param.meta_data_filter["manual"] filters = self._param.meta_data_filter["manual"]
for flt in filters: for flt in filters:

View File

@ -327,6 +327,40 @@ async def retrieval_test():
local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not local_doc_ids: if not local_doc_ids:
local_doc_ids = None local_doc_ids = None
elif meta_data_filter.get("method") == "semi_auto":
selected_keys = meta_data_filter.get("semi_auto", [])
if selected_keys:
filtered_metas = {key: metas[key] for key in selected_keys if key in metas}
if filtered_metas:
chat_mdl = LLMBundle(user_id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
filters: dict = gen_meta_filter(chat_mdl, filtered_metas, question)
local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not local_doc_ids:
local_doc_ids = None
elif meta_data_filter.get("method") == "manual":
local_doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if meta_data_filter["manual"] and not local_doc_ids:
local_doc_ids = ["-999"]
else:
meta_data_filter = req.get("meta_data_filter")
if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(user_id, LLMType.CHAT)
filters: dict = gen_meta_filter(chat_mdl, metas, question)
local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not local_doc_ids:
local_doc_ids = None
elif meta_data_filter.get("method") == "semi_auto":
selected_keys = meta_data_filter.get("semi_auto", [])
if selected_keys:
filtered_metas = {key: metas[key] for key in selected_keys if key in metas}
if filtered_metas:
chat_mdl = LLMBundle(user_id, LLMType.CHAT)
filters: dict = gen_meta_filter(chat_mdl, filtered_metas, question)
local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not local_doc_ids:
local_doc_ids = None
elif meta_data_filter.get("method") == "manual": elif meta_data_filter.get("method") == "manual":
local_doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) local_doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if meta_data_filter["manual"] and not local_doc_ids: if meta_data_filter["manual"] and not local_doc_ids:

View File

@ -984,10 +984,45 @@ async def retrieval_test_embedded():
local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not local_doc_ids: if not local_doc_ids:
local_doc_ids = None local_doc_ids = None
elif meta_data_filter.get("method") == "semi_auto":
selected_keys = meta_data_filter.get("semi_auto", [])
if selected_keys:
filtered_metas = {key: metas[key] for key in selected_keys if key in metas}
if filtered_metas:
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
filters: dict = gen_meta_filter(chat_mdl, filtered_metas, _question)
local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not local_doc_ids:
local_doc_ids = None
elif meta_data_filter.get("method") == "manual": elif meta_data_filter.get("method") == "manual":
local_doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) local_doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if meta_data_filter["manual"] and not local_doc_ids: if meta_data_filter["manual"] and not local_doc_ids:
local_doc_ids = ["-999"] local_doc_ids = ["-999"]
else:
meta_data_filter = req.get("meta_data_filter")
if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT)
filters: dict = gen_meta_filter(chat_mdl, metas, question)
local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not local_doc_ids:
local_doc_ids = None
elif meta_data_filter.get("method") == "semi_auto":
selected_keys = meta_data_filter.get("semi_auto", [])
if selected_keys:
filtered_metas = {key: metas[key] for key in selected_keys if key in metas}
if filtered_metas:
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT)
filters: dict = gen_meta_filter(chat_mdl, filtered_metas, question)
local_doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not local_doc_ids:
local_doc_ids = None
elif meta_data_filter.get("method") == "manual":
local_doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if meta_data_filter["manual"] and not local_doc_ids:
local_doc_ids = ["-999"]
tenants = UserTenantService.query(user_id=tenant_id) tenants = UserTenantService.query(user_id=tenant_id)
for kb_id in kb_ids: for kb_id in kb_ids:

View File

@ -425,6 +425,15 @@ async def async_chat(dialog, messages, stream=True, **kwargs):
attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not attachments: if not attachments:
attachments = None attachments = None
elif dialog.meta_data_filter.get("method") == "semi_auto":
selected_keys = dialog.meta_data_filter.get("semi_auto", [])
if selected_keys:
filtered_metas = {key: metas[key] for key in selected_keys if key in metas}
if filtered_metas:
filters: dict = gen_meta_filter(chat_mdl, filtered_metas, questions[-1])
attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not attachments:
attachments = None
elif dialog.meta_data_filter.get("method") == "manual": elif dialog.meta_data_filter.get("method") == "manual":
conds = dialog.meta_data_filter["manual"] conds = dialog.meta_data_filter["manual"]
attachments.extend(meta_filter(metas, conds, dialog.meta_data_filter.get("logic", "and"))) attachments.extend(meta_filter(metas, conds, dialog.meta_data_filter.get("logic", "and")))
@ -834,6 +843,15 @@ async def async_ask(question, kb_ids, tenant_id, chat_llm_name=None, search_conf
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None
elif meta_data_filter.get("method") == "semi_auto":
selected_keys = meta_data_filter.get("semi_auto", [])
if selected_keys:
filtered_metas = {key: metas[key] for key in selected_keys if key in metas}
if filtered_metas:
filters: dict = gen_meta_filter(chat_mdl, filtered_metas, question)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif meta_data_filter.get("method") == "manual": elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if meta_data_filter["manual"] and not doc_ids: if meta_data_filter["manual"] and not doc_ids:
@ -909,6 +927,15 @@ async def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None
elif meta_data_filter.get("method") == "semi_auto":
selected_keys = meta_data_filter.get("semi_auto", [])
if selected_keys:
filtered_metas = {key: metas[key] for key in selected_keys if key in metas}
if filtered_metas:
filters: dict = gen_meta_filter(chat_mdl, filtered_metas, question)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids:
doc_ids = None
elif meta_data_filter.get("method") == "manual": elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and"))) doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if meta_data_filter["manual"] and not doc_ids: if meta_data_filter["manual"] and not doc_ids:

View File

@ -5,6 +5,7 @@ import { z } from 'zod';
import { SelectWithSearch } from '../originui/select-with-search'; import { SelectWithSearch } from '../originui/select-with-search';
import { RAGFlowFormItem } from '../ragflow-form'; import { RAGFlowFormItem } from '../ragflow-form';
import { MetadataFilterConditions } from './metadata-filter-conditions'; import { MetadataFilterConditions } from './metadata-filter-conditions';
import { MetadataSemiAutoFields } from './metadata-semi-auto-fields';
type MetadataFilterProps = { type MetadataFilterProps = {
prefix?: string; prefix?: string;
@ -25,6 +26,9 @@ export const MetadataFilterSchema = {
}), }),
) )
.optional(), .optional(),
semi_auto: z
.array(z.string()) // 修改为字符串数组
.optional(),
}) })
.optional(), .optional(),
}; };
@ -76,6 +80,12 @@ export function MetadataFilter({
canReference={canReference} canReference={canReference}
></MetadataFilterConditions> ></MetadataFilterConditions>
)} )}
{hasKnowledge && metadata === DatasetMetadata.SemiAutomatic && (
<MetadataSemiAutoFields
kbIds={kbIds}
prefix={prefix}
></MetadataSemiAutoFields>
)}
</> </>
); );
} }

View File

@ -0,0 +1,100 @@
import { Button } from '@/components/ui/button';
import {
DropdownMenu,
DropdownMenuContent,
DropdownMenuItem,
DropdownMenuTrigger,
} from '@/components/ui/dropdown-menu';
import {
FormControl,
FormField,
FormItem,
FormLabel,
FormMessage,
} from '@/components/ui/form';
import { Input } from '@/components/ui/input';
import { useFetchKnowledgeMetadata } from '@/hooks/use-knowledge-request';
import { Plus, X } from 'lucide-react';
import { useCallback } from 'react';
import { useFieldArray, useFormContext } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
/**
 * Form section for the semi-automatic metadata filter mode.
 *
 * Shows the metadata keys returned by `useFetchKnowledgeMetadata(kbIds)` in a
 * dropdown; picking one appends it to the `<prefix>meta_data_filter.semi_auto`
 * field array. Every selected key is rendered as a read-only input with a
 * remove button next to it.
 *
 * Calls `useFormContext`, so it has to be rendered inside a react-hook-form
 * provider.
 *
 * @param kbIds  IDs handed to `useFetchKnowledgeMetadata` to load the keys.
 * @param prefix Optional string prepended to the form field path.
 */
export function MetadataSemiAutoFields({
  kbIds,
  prefix = '',
}: {
  kbIds: string[];
  prefix?: string;
}) {
  const { t } = useTranslation();
  const form = useFormContext();
  const name = prefix + 'meta_data_filter.semi_auto';
  const metadata = useFetchKnowledgeMetadata(kbIds);

  // Fall back to an empty object so rendering does not throw while the
  // metadata has not been loaded yet.
  const metadataKeys = Object.keys(metadata.data ?? {});

  // NOTE(review): the array holds plain strings; react-hook-form documents
  // `useFieldArray` as not supporting flat (primitive) arrays — confirm the
  // submitted value really is `string[]`.
  const { fields, remove, append } = useFieldArray({
    name,
    control: form.control,
  });

  const add = useCallback(
    (key: string) => () => {
      // Append the key itself (a string), not a wrapping object.
      append(key);
    },
    [append],
  );

  return (
    <section className="flex flex-col gap-2">
      <div className="flex items-center justify-between">
        <FormLabel>{t('chat.metadataKeys')}</FormLabel>
        <DropdownMenu>
          <DropdownMenuTrigger>
            <Button variant={'ghost'} type="button">
              <Plus />
            </Button>
          </DropdownMenuTrigger>
          <DropdownMenuContent className="max-h-[300px] !overflow-y-auto scrollbar-auto">
            {metadataKeys.map((key) => {
              return (
                // Object keys are unique, so the key is a stable React key.
                <DropdownMenuItem key={key} onClick={add(key)}>
                  {key}
                </DropdownMenuItem>
              );
            })}
          </DropdownMenuContent>
        </DropdownMenu>
      </div>
      <div className="space-y-5">
        {fields.map((field, index) => {
          // The field path points straight at the array element.
          const typeField = `${name}.${index}`;
          return (
            <section key={field.id} className="flex gap-2">
              <div className="w-full space-y-2">
                <FormField
                  control={form.control}
                  name={typeField}
                  render={({ field }) => (
                    <FormItem className="flex-1 overflow-hidden">
                      <FormControl>
                        <Input
                          {...field}
                          placeholder={t('common.pleaseInput')}
                          readOnly
                        ></Input>
                      </FormControl>
                      <FormMessage />
                    </FormItem>
                  )}
                />
              </div>
              {/* type="button" keeps this from submitting the enclosing form. */}
              <Button
                variant={'ghost'}
                type="button"
                onClick={() => remove(index)}
              >
                <X className="text-text-sub-title-invert " />
              </Button>
            </section>
          );
        })}
      </div>
    </section>
  );
}

View File

@ -36,5 +36,6 @@ export const EmptyConversationId = 'empty';
export enum DatasetMetadata { export enum DatasetMetadata {
Disabled = 'disabled', Disabled = 'disabled',
Automatic = 'auto', Automatic = 'auto',
SemiAutomatic = 'semi_auto',
Manual = 'manual', Manual = 'manual',
} }

View File

@ -7,6 +7,16 @@ export interface ITestRetrievalRequestBody {
use_kg?: boolean; use_kg?: boolean;
highlight?: boolean; highlight?: boolean;
kb_id?: string[]; kb_id?: string[];
meta_data_filter?: {
logic?: string;
method?: string;
manual?: Array<{
key: string;
op: string;
value: string;
}>;
semi_auto?: string[];
};
} }
export interface IFetchKnowledgeListRequestBody { export interface IFetchKnowledgeListRequestBody {

View File

@ -737,11 +737,13 @@ This auto-tagging feature enhances retrieval by adding another layer of domain-s
metadataTip: metadataTip:
'Metadata filtering is the process of using metadata attributes (such as tags, categories, or access permissions) to refine and control the retrieval of relevant information within a system.', 'Metadata filtering is the process of using metadata attributes (such as tags, categories, or access permissions) to refine and control the retrieval of relevant information within a system.',
conditions: 'Conditions', conditions: 'Conditions',
metadataKeys: 'Filterable items',
addCondition: 'Add condition', addCondition: 'Add condition',
meta: { meta: {
disabled: 'Disabled', disabled: 'Disabled',
auto: 'Automatic', auto: 'Automatic',
manual: 'Manual', manual: 'Manual',
semi_auto: 'Semi-automatic',
}, },
cancel: 'Cancel', cancel: 'Cancel',
chatSetting: 'Chat setting', chatSetting: 'Chat setting',

View File

@ -673,11 +673,13 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
metadataTip: metadataTip:
'元数据过滤是使用元数据属性(例如标签、类别或访问权限)来优化和控制系统内相关信息检索的过程。', '元数据过滤是使用元数据属性(例如标签、类别或访问权限)来优化和控制系统内相关信息检索的过程。',
conditions: '条件', conditions: '条件',
metadataKeys: '可选过滤项',
addCondition: '增加条件', addCondition: '增加条件',
meta: { meta: {
disabled: '禁用', disabled: '禁用',
auto: '自动', auto: '自动',
manual: '手动', manual: '手动',
semi_auto: '半自动',
}, },
cancel: '取消', cancel: '取消',
chatSetting: '聊天设置', chatSetting: '聊天设置',

View File

@ -7,14 +7,18 @@ import { z } from 'zod';
import { CrossLanguageFormField } from '@/components/cross-language-form-field'; import { CrossLanguageFormField } from '@/components/cross-language-form-field';
import { FormContainer } from '@/components/form-container'; import { FormContainer } from '@/components/form-container';
import { import {
initialTopKValue, MetadataFilter,
MetadataFilterSchema,
} from '@/components/metadata-filter';
import {
RerankFormFields, RerankFormFields,
initialTopKValue,
topKSchema, topKSchema,
} from '@/components/rerank'; } from '@/components/rerank';
import { import {
SimilaritySliderFormField,
initialSimilarityThresholdValue, initialSimilarityThresholdValue,
initialVectorSimilarityWeightValue, initialVectorSimilarityWeightValue,
SimilaritySliderFormField,
similarityThresholdSchema, similarityThresholdSchema,
vectorSimilarityWeightSchema, vectorSimilarityWeightSchema,
} from '@/components/similarity-slider'; } from '@/components/similarity-slider';
@ -33,6 +37,7 @@ import { trim } from 'lodash';
import { Send } from 'lucide-react'; import { Send } from 'lucide-react';
import { useEffect } from 'react'; import { useEffect } from 'react';
import { useTranslation } from 'react-i18next'; import { useTranslation } from 'react-i18next';
import { useParams } from 'umi';
type TestingFormProps = Pick< type TestingFormProps = Pick<
ReturnType<typeof useTestRetrieval>, ReturnType<typeof useTestRetrieval>,
@ -45,6 +50,8 @@ export default function TestingForm({
setValues, setValues,
}: TestingFormProps) { }: TestingFormProps) {
const { t } = useTranslation(); const { t } = useTranslation();
const { id } = useParams(); // 正确解构出id参数
const knowledgeBaseId = id; // 现在knowledgeBaseId是字符串类型
const formSchema = z.object({ const formSchema = z.object({
question: z.string().min(1, { question: z.string().min(1, {
@ -54,6 +61,8 @@ export default function TestingForm({
...vectorSimilarityWeightSchema, ...vectorSimilarityWeightSchema,
...topKSchema, ...topKSchema,
use_kg: z.boolean().optional(), use_kg: z.boolean().optional(),
kb_ids: z.array(z.string()).optional(),
...MetadataFilterSchema,
}); });
const form = useForm<z.infer<typeof formSchema>>({ const form = useForm<z.infer<typeof formSchema>>({
@ -63,6 +72,7 @@ export default function TestingForm({
...initialVectorSimilarityWeightValue, ...initialVectorSimilarityWeightValue,
...initialTopKValue, ...initialTopKValue,
use_kg: false, use_kg: false,
kb_ids: [knowledgeBaseId],
}, },
}); });
@ -90,6 +100,8 @@ export default function TestingForm({
<CrossLanguageFormField <CrossLanguageFormField
name={'cross_languages'} name={'cross_languages'}
></CrossLanguageFormField> ></CrossLanguageFormField>
{/* Metadata filter component */}
<MetadataFilter prefix=""></MetadataFilter>
</FormContainer> </FormContainer>
<FormField <FormField
control={form.control} control={form.control}

View File

@ -202,7 +202,7 @@ export default function SearchingView({
<div className="w-full flex flex-col"> <div className="w-full flex flex-col">
<div className="w-full highlightContent"> <div className="w-full highlightContent">
<ImageWithPopover <ImageWithPopover
id={chunk.img_id} id={chunk.image_id || chunk.img_id}
></ImageWithPopover> ></ImageWithPopover>
<Popover> <Popover>
<PopoverTrigger asChild> <PopoverTrigger asChild>