Support operator constraints in semi-automatic metadata filtering (#12956)

### What problem does this PR solve?

#### Summary
This PR enhances the Semi-automatic metadata filtering mode by allowing
users to explicitly pre-define an operator (e.g., `contains`, `=`, `>`)
for each selected metadata key. The LLM still dynamically extracts the
filter value from the user's query, but it is now strictly constrained
to use the operator specified in the UI configuration.

Using this feature is optional. By default, the operator selection is
set to "automatic", which means the LLM chooses the operator itself (the
current behavior).

#### Rationale & Use Case
This enhancement was driven by a concrete challenge I encountered while
working with technical documentation.
In my specific use case, I was trying to filter for software versions
within a technical manual. In this dataset, a single document chunk
often applies to multiple software versions. These versions are stored
as a combined string within the metadata for each chunk.

When using the standard semi-automatic filter, the LLM would
inconsistently choose between the contains and equals operators. When it
chose equals, it would exclude every chunk that applied to more than one
version, even if the version I was searching for was clearly included in
that metadata string. This led to incomplete and frustrating retrieval
results.

By extending the semi-automatic filter to allow pre-defining the
operator for a specific key, I was able to force the use of contains for
the version field. This change immediately led to significantly improved
and more reliable results in my case.

I believe this functionality will be equally useful for others dealing
with "tagged" or multi-value metadata where the relationship between the
query and the field is known, but the specific value needs to remain
dynamic.

#### Key Changes
##### Backend & Core Logic
- `common/metadata_utils.py`: Updated apply_meta_data_filter to support
a mixed data structure for semi_auto (handling both legacy string arrays
and the new object-based format {"key": "...", "op": "..."}).
- `rag/prompts/generator.py`: Extended gen_meta_filter to accept and
pass operator constraints to the LLM.
- `rag/prompts/meta_filter.md`: Updated the system prompt to instruct
the LLM to strictly respect provided operator constraints.

##### Frontend
- `web/src/components/metadata-filter/metadata-semi-auto-fields.tsx`:
Enhanced the UI to include an operator dropdown for each selected
metadata key, utilizing existing operator constants.
- `web/src/components/metadata-filter/index.tsx`: Updated the validation
schema to accommodate the new state structure.

#### Test Plan
- Backward Compatibility: Verified that existing semi-auto filters
stored as simple strings still function correctly.
- Prompt Verification: Confirmed that constraints are correctly rendered
in the LLM system prompt when specified.
- Unit Tests: Added in
`test/unit_test/common/test_apply_semi_auto_meta_data_filter.py`.
- Manual End-to-End:
   - Configured a "Semi-automatic" filter for a "Version" key with the
"contains" operator.
   - Asked a version-specific query.
   - Result:
   
<img width="1173" height="704" alt="Screenshot 2026-02-02 145359"
src="https://github.com/user-attachments/assets/510a6a61-a231-4dc2-a7fe-cdfc07219132"
/>




### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: Philipp Heyken Soares <philipp.heyken-soares@am.ai>
This commit is contained in:
Philipp Heyken Soares
2026-02-03 04:11:34 +01:00
committed by GitHub
parent 7cbe8b5b53
commit ad06c042c4
9 changed files with 173 additions and 69 deletions

View File

@ -145,11 +145,22 @@ async def apply_meta_data_filter(
if not doc_ids: if not doc_ids:
return None return None
elif method == "semi_auto": elif method == "semi_auto":
selected_keys = meta_data_filter.get("semi_auto", []) selected_keys = []
constraints = {}
for item in meta_data_filter.get("semi_auto", []):
if isinstance(item, str):
selected_keys.append(item)
elif isinstance(item, dict):
key = item.get("key")
op = item.get("op")
selected_keys.append(key)
if op:
constraints[key] = op
if selected_keys: if selected_keys:
filtered_metas = {key: metas[key] for key in selected_keys if key in metas} filtered_metas = {key: metas[key] for key in selected_keys if key in metas}
if filtered_metas: if filtered_metas:
filters: dict = await gen_meta_filter(chat_mdl, filtered_metas, question) filters: dict = await gen_meta_filter(chat_mdl, filtered_metas, question, constraints=constraints)
doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and"))) doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids: if not doc_ids:
return None return None

View File

@ -15,18 +15,17 @@
# #
import infinity.rag_tokenizer import infinity.rag_tokenizer
from common import settings
class RagTokenizer(infinity.rag_tokenizer.RagTokenizer): class RagTokenizer(infinity.rag_tokenizer.RagTokenizer):
def tokenize(self, line: str) -> str: def tokenize(self, line: str) -> str:
from common import settings # moved from the top of the file to avoid circular import
if settings.DOC_ENGINE_INFINITY: if settings.DOC_ENGINE_INFINITY:
return line return line
else: else:
return super().tokenize(line) return super().tokenize(line)
def fine_grained_tokenize(self, tks: str) -> str: def fine_grained_tokenize(self, tks: str) -> str:
from common import settings # moved from the top of the file to avoid circular import
if settings.DOC_ENGINE_INFINITY: if settings.DOC_ENGINE_INFINITY:
return tks return tks
else: else:

View File

@ -20,7 +20,6 @@ import math
from collections import OrderedDict, defaultdict from collections import OrderedDict, defaultdict
from dataclasses import dataclass from dataclasses import dataclass
from rag.prompts.generator import relevant_chunks_with_toc
from rag.nlp import rag_tokenizer, query from rag.nlp import rag_tokenizer, query
import numpy as np import numpy as np
from common.doc_store.doc_store_base import MatchDenseExpr, FusionExpr, OrderByExpr, DocStoreConnection from common.doc_store.doc_store_base import MatchDenseExpr, FusionExpr, OrderByExpr, DocStoreConnection
@ -591,6 +590,7 @@ class Dealer:
return {a.replace(".", "_"): max(1, c) for a, c in tag_fea} return {a.replace(".", "_"): max(1, c) for a, c in tag_fea}
async def retrieval_by_toc(self, query: str, chunks: list[dict], tenant_ids: list[str], chat_mdl, topn: int = 6): async def retrieval_by_toc(self, query: str, chunks: list[dict], tenant_ids: list[str], chat_mdl, topn: int = 6):
from rag.prompts.generator import relevant_chunks_with_toc # moved from the top of the file to avoid circular import
if not chunks: if not chunks:
return [] return []
idx_nms = [index_name(tid) for tid in tenant_ids] idx_nms = [index_name(tid) for tid in tenant_ids]

View File

@ -467,7 +467,7 @@ async def rank_memories_async(chat_mdl, goal: str, sub_goal: str, tool_call_summ
return re.sub(r"^.*</think>", "", ans, flags=re.DOTALL) return re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
async def gen_meta_filter(chat_mdl, meta_data: dict, query: str) -> dict: async def gen_meta_filter(chat_mdl, meta_data: dict, query: str, constraints: dict = None) -> dict:
meta_data_structure = {} meta_data_structure = {}
for key, values in meta_data.items(): for key, values in meta_data.items():
meta_data_structure[key] = list(values.keys()) if isinstance(values, dict) else values meta_data_structure[key] = list(values.keys()) if isinstance(values, dict) else values
@ -475,7 +475,8 @@ async def gen_meta_filter(chat_mdl, meta_data: dict, query: str) -> dict:
sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render( sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render(
current_date=datetime.datetime.today().strftime('%Y-%m-%d'), current_date=datetime.datetime.today().strftime('%Y-%m-%d'),
metadata_keys=json.dumps(meta_data_structure), metadata_keys=json.dumps(meta_data_structure),
user_question=query user_question=query,
constraints=json.dumps(constraints) if constraints else None
) )
user_prompt = "Generate filters:" user_prompt = "Generate filters:"
ans = await chat_mdl.async_chat(sys_prompt, [{"role": "user", "content": user_prompt}]) ans = await chat_mdl.async_chat(sys_prompt, [{"role": "user", "content": user_prompt}])

View File

@ -18,12 +18,17 @@ You are a metadata filtering condition generator. Analyze the user's question an
3. **Operator Guide**: 3. **Operator Guide**:
- Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"] - Use these operators only: ["contains", "not contains","in", "not in", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"]
- Date ranges: Break into two conditions (≥ start_date AND < next_month_start) - Date ranges: Break into two conditions (≥ start_date AND < next_month_start)
- Negations: Always use "≠" for exclusion terms ("not", "except", "exclude", "≠") - Negations: Always use "≠" for exclusion terms ("not", "except", "exclude", "≠")
- Implicit logic: Derive unstated filters (e.g., "July" [≥ YYYY-07-01, < YYYY-08-01]) - Implicit logic: Derive unstated filters (e.g., "July" [≥ YYYY-07-01, < YYYY-08-01])
4. **Processing Steps**: 4. **Operator Constraints**:
- If `constraints` are provided, you MUST use the specified operator for the corresponding key.
- Example Constraints: `{"price": ">", "author": "="}`
- If a key is not in `constraints`, choose the most appropriate operator.
5. **Processing Steps**:
a) Identify ALL filterable attributes in the query (both explicit and implicit) a) Identify ALL filterable attributes in the query (both explicit and implicit)
b) For dates: b) For dates:
- Infer missing year from current date if needed - Infer missing year from current date if needed
@ -34,7 +39,7 @@ You are a metadata filtering condition generator. Analyze the user's question an
- Attribute doesn't exist in metadata - Attribute doesn't exist in metadata
- Value has no match in metadata - Value has no match in metadata
5. **Example A**: 6. **Example A**:
- User query: "上市日期七月份的有哪些新品不要蓝色的只看鞋子和帽子" - User query: "上市日期七月份的有哪些新品不要蓝色的只看鞋子和帽子"
- Metadata: { "color": {...}, "listing_date": {...} } - Metadata: { "color": {...}, "listing_date": {...} }
- Output: - Output:
@ -48,7 +53,7 @@ You are a metadata filtering condition generator. Analyze the user's question an
] ]
} }
6. **Example B**: 7. **Example B**:
- User query: "It must be from China or India. Otherwise, it must not be blue or red." - User query: "It must be from China or India. Otherwise, it must not be blue or red."
- Metadata: { "color": {...}, "country": {...} } - Metadata: { "color": {...}, "country": {...} }
- -
@ -61,7 +66,7 @@ You are a metadata filtering condition generator. Analyze the user's question an
] ]
} }
7. **Final Output**: 8. **Final Output**:
- ONLY output valid JSON dictionary - ONLY output valid JSON dictionary
- NO additional text/explanations - NO additional text/explanations
- Json schema is as following: - Json schema is as following:
@ -131,4 +136,7 @@ You are a metadata filtering condition generator. Analyze the user's question an
- Today's date: {{ current_date }} - Today's date: {{ current_date }}
- Available metadata keys: {{ metadata_keys }} - Available metadata keys: {{ metadata_keys }}
- User query: "{{ user_question }}" - User query: "{{ user_question }}"
{% if constraints %}
- Operator constraints: {{ constraints }}
{% endif %}

View File

@ -0,0 +1,53 @@
import pytest
from common.metadata_utils import apply_meta_data_filter
from unittest.mock import MagicMock, AsyncMock, patch
@pytest.mark.asyncio
async def test_apply_meta_data_filter_semi_auto_key():
    """Legacy semi-auto config (plain key strings) yields the matching doc and empty constraints."""
    # Legacy format: "semi_auto" is a plain list of metadata key names.
    filter_config = {"method": "semi_auto", "semi_auto": ["key1", "key2"]}
    metadata_index = {
        "key1": {"val1": ["doc1"]},
        "key2": {"val2": ["doc2"]},
    }
    # Canned answer returned by the patched filter generator.
    llm_reply = {
        "conditions": [{"key": "key1", "op": "=", "value": "val1"}],
        "logic": "and",
    }
    fake_chat_model = MagicMock()

    with patch("rag.prompts.generator.gen_meta_filter", new_callable=AsyncMock) as gen_filter_mock:
        gen_filter_mock.return_value = llm_reply
        matched_ids = await apply_meta_data_filter(filter_config, metadata_index, "find val1", fake_chat_model)

    assert matched_ids == ["doc1"]

    # String-only entries carry no operator, so the constraints mapping must be empty.
    gen_filter_mock.assert_called_once()
    _, call_kwargs = gen_filter_mock.call_args
    assert call_kwargs["constraints"] == {}
@pytest.mark.asyncio
async def test_apply_meta_data_filter_semi_auto_key_and_operator():
    """Mixed semi-auto config (object entry plus plain key) forwards the pinned operator as a constraint."""
    # "key1" uses the object format with an explicit operator; "key2" stays a legacy string entry.
    filter_config = {
        "method": "semi_auto",
        "semi_auto": [{"key": "key1", "op": ">"}, "key2"],
    }
    metadata_index = {
        "key1": {"10": ["doc1"]},
        "key2": {"val2": ["doc2"]},
    }
    # Canned answer returned by the patched filter generator.
    llm_reply = {
        "conditions": [{"key": "key1", "op": ">", "value": "5"}],
        "logic": "and",
    }
    fake_chat_model = MagicMock()

    with patch("rag.prompts.generator.gen_meta_filter", new_callable=AsyncMock) as gen_filter_mock:
        gen_filter_mock.return_value = llm_reply
        matched_ids = await apply_meta_data_filter(filter_config, metadata_index, "find key1 > 5", fake_chat_model)

    assert matched_ids == ["doc1"]

    # Only the entry that specified an operator should appear in the constraints mapping.
    gen_filter_mock.assert_called_once()
    _, call_kwargs = gen_filter_mock.call_args
    assert call_kwargs["constraints"] == {"key1": ">"}

View File

@ -26,7 +26,17 @@ export const MetadataFilterSchema = {
}), }),
) )
.optional(), .optional(),
semi_auto: z.array(z.string()).optional(), semi_auto: z
.array(
z.union([
z.string(),
z.object({
key: z.string(),
op: z.string().optional(),
}),
]),
)
.optional(),
}) })
.optional(), .optional(),
}; };

View File

@ -1,10 +1,4 @@
import { Button } from '@/components/ui/button'; import { Button } from '@/components/ui/button';
import {
DropdownMenu,
DropdownMenuContent,
DropdownMenuItem,
DropdownMenuTrigger,
} from '@/components/ui/dropdown-menu';
import { import {
FormControl, FormControl,
FormField, FormField,
@ -12,12 +6,13 @@ import {
FormLabel, FormLabel,
FormMessage, FormMessage,
} from '@/components/ui/form'; } from '@/components/ui/form';
import { Input } from '@/components/ui/input'; import { useBuildSwitchOperatorOptions } from '@/hooks/logic-hooks/use-build-operator-options';
import { useFetchKnowledgeMetadata } from '@/hooks/use-knowledge-request'; import { useFetchKnowledgeMetadata } from '@/hooks/use-knowledge-request';
import { Plus, X } from 'lucide-react'; import { Plus, X } from 'lucide-react';
import { useCallback } from 'react'; import { useCallback, useMemo } from 'react';
import { useFieldArray, useFormContext } from 'react-hook-form'; import { useFieldArray, useFormContext } from 'react-hook-form';
import { useTranslation } from 'react-i18next'; import { useTranslation } from 'react-i18next';
import { SelectWithSearch } from '../originui/select-with-search';
export function MetadataSemiAutoFields({ export function MetadataSemiAutoFields({
kbIds, kbIds,
@ -36,59 +31,86 @@ export function MetadataSemiAutoFields({
control: form.control, control: form.control,
}); });
const add = useCallback( const add = useCallback(() => {
(key: string) => () => { append({ key: '', op: '' });
append(key); }, [append]);
},
[append], const switchOperatorOptions = useBuildSwitchOperatorOptions();
);
const autoOption = { label: t('chat.meta.auto'), value: '' };
const metadataOptions = useMemo(() => {
return Object.keys(metadata.data || {}).map((key) => ({
label: key,
value: key,
}));
}, [metadata.data]);
return ( return (
<section className="flex flex-col gap-2"> <section className="flex flex-col gap-2">
<div className="flex items-center justify-between"> <div className="flex items-center justify-between">
<FormLabel>{t('chat.metadataKeys')}</FormLabel> <FormLabel>{t('chat.metadataKeys')}</FormLabel>
<DropdownMenu> <Button
<DropdownMenuTrigger> variant={'outline'}
<Button variant={'ghost'} type="button"> type="button"
<Plus /> size="sm"
onClick={add}
className="h-8"
>
<Plus className="mr-2 size-4" />
{t('common.add')}
</Button> </Button>
</DropdownMenuTrigger>
<DropdownMenuContent className="max-h-[300px] !overflow-y-auto scrollbar-auto">
{Object.keys(metadata.data).map((key, idx) => {
return (
<DropdownMenuItem key={idx} onClick={add(key)}>
{key}
</DropdownMenuItem>
);
})}
</DropdownMenuContent>
</DropdownMenu>
</div> </div>
<div className="space-y-5"> <div className="space-y-2">
{fields.map((field, index) => { {fields.map((field, index) => {
const typeField = `${name}.${index}`; const keyField = `${name}.${index}.key`;
const opField = `${name}.${index}.op`;
return ( return (
<section key={field.id} className="flex gap-2"> <section key={field.id} className="flex items-start gap-2">
<div className="w-full space-y-2">
<FormField <FormField
control={form.control} control={form.control}
name={typeField} name={keyField}
render={({ field }) => ( render={({ field }) => (
<FormItem className="flex-1 overflow-hidden"> <FormItem className="flex-[2] overflow-hidden">
<FormControl> <FormControl>
<Input <SelectWithSearch
{...field} {...field}
placeholder={t('common.pleaseInput')} options={metadataOptions}
readOnly placeholder={t('common.pleaseSelect')}
></Input> triggerClassName="bg-bg-input"
value={field.value}
onChange={field.onChange}
/>
</FormControl> </FormControl>
<FormMessage /> <FormMessage />
</FormItem> </FormItem>
)} )}
/> />
</div> <FormField
<Button variant={'ghost'} onClick={() => remove(index)}> control={form.control}
<X className="text-text-sub-title-invert " /> name={opField}
render={({ field }) => (
<FormItem className="flex-1">
<FormControl>
<SelectWithSearch
{...field}
options={[autoOption, ...switchOperatorOptions]}
triggerClassName="bg-bg-input"
value={field.value}
onChange={field.onChange}
/>
</FormControl>
<FormMessage />
</FormItem>
)}
/>
<Button
variant={'ghost'}
size="icon"
onClick={() => remove(index)}
className="mt-0 h-8 w-10"
>
<X className="size-4 text-text-sub-title-invert" />
</Button> </Button>
</section> </section>
); );

View File

@ -159,9 +159,9 @@ export const SelectWithSearch = forwardRef<
triggerClassName, triggerClassName,
)} )}
> >
{value ? ( {selectLabel || value ? (
<span className="flex min-w-0 options-center gap-2"> <span className="flex min-w-0 options-center gap-2">
<span className="leading-none truncate">{selectLabel}</span> <span className="leading-none truncate">{selectLabel || value}</span>
</span> </span>
) : ( ) : (
<span className="text-text-disabled">{placeholder}</span> <span className="text-text-disabled">{placeholder}</span>