Feat: add meta data filter. (#9405)
### What problem does this PR solve?

#8531 #7417 #6761 #6573 #6477

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
@@ -383,8 +383,6 @@ class Dealer:
        vector_column = f"q_{dim}_vec"
        zero_vector = [0.0] * dim
        sim_np = np.array(sim)
        if doc_ids:
            similarity_threshold = 0
        filtered_count = (sim_np >= similarity_threshold).sum()
        ranks["total"] = int(filtered_count)  # Convert from np.int64 to Python int, otherwise JSON serialization fails
        for i in idx:
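The `int()` cast above matters because numpy integer scalars are not accepted by the standard-library JSON encoder. A minimal sketch of the failure and the fix, with made-up values:

```python
import json
import numpy as np

sim_np = np.array([0.8, 0.3, 0.6])
count = (sim_np >= 0.5).sum()  # numpy.int64, not a built-in int

# json.dumps({"total": count}) raises:
#   TypeError: Object of type int64 is not JSON serializable
print(json.dumps({"total": int(count)}))  # {"total": 2}
```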
rag/prompts/meta_filter.md (new file, 53 lines added)
@@ -0,0 +1,53 @@
You are a metadata filtering condition generator. Analyze the user's question and available document metadata to output a JSON array of filter objects. Follow these rules:

1. **Metadata Structure**:
   - Metadata is provided as JSON where keys are attribute names (e.g., "color"), and values are objects mapping attribute values to document IDs.
   - Example:
     {
       "color": {"red": ["doc1"], "blue": ["doc2"]},
       "listing_date": {"2025-07-11": ["doc1"], "2025-08-01": ["doc2"]}
     }

2. **Output Requirements**:
   - Always output a JSON array of filter objects
   - Each object must have:
     "key": (metadata attribute name),
     "value": (string value to compare),
     "op": (operator from allowed list)

3. **Operator Guide**:
   - Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"]
   - Date ranges: Break into two conditions (≥ start_date AND < next_month_start)
   - Negations: Always use "≠" for exclusion terms ("not", "except", "exclude", "≠")
   - Implicit logic: Derive unstated filters (e.g., "July" → [≥ YYYY-07-01, < YYYY-08-01])

4. **Processing Steps**:
   a) Identify ALL filterable attributes in the query (both explicit and implicit)
   b) For dates:
      - Infer missing year from current date if needed
      - Always format dates as "YYYY-MM-DD"
      - Convert ranges: [≥ start, < end]
   c) For values: Match EXACTLY to metadata's value keys
   d) Skip conditions if:
      - Attribute doesn't exist in metadata
      - Value has no match in metadata

5. **Example**:
   - User query: "上市日期七月份的有哪些商品,不要蓝色的" (i.e., "Which products have a July listing date? Exclude blue ones.")
   - Metadata: { "color": {...}, "listing_date": {...} }
   - Output:
     [
       {"key": "listing_date", "value": "2025-07-01", "op": "≥"},
       {"key": "listing_date", "value": "2025-08-01", "op": "<"},
       {"key": "color", "value": "blue", "op": "≠"}
     ]

6. **Final Output**:
   - ONLY output valid JSON array
   - NO additional text/explanations

**Current Task**:
- Today's date: {{current_date}}
- Available metadata keys: {{metadata_keys}}
- User query: "{{user_question}}"
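To make the contract of this prompt concrete, here is a purely illustrative sketch of how filter objects in this format could be applied to the metadata mapping shown above. The `apply_filters` helper is hypothetical and is not part of this PR; the retrieval-side filtering RAGFlow actually performs is not shown in this diff.

```python
# Hypothetical helper (not part of this PR): apply generated filter objects to a
# metadata mapping of the form {attribute: {value: [doc_ids]}} and return the
# document IDs that satisfy every condition (AND semantics).
def apply_filters(metadata: dict, filters: list) -> set:
    kept = {d for attr in metadata.values() for ids in attr.values() for d in ids}
    for f in filters:
        matched = set()
        for value, ids in metadata.get(f["key"], {}).items():
            op, target = f["op"], f["value"]
            ok = (
                (op == "=" and value == target)
                or (op == "≠" and value != target)
                or (op == "≥" and value >= target)  # ISO dates compare correctly as strings
                or (op == "<" and value < target)
                or (op == "contains" and target in value)
            )
            if ok:
                matched.update(ids)
        kept &= matched
    return kept


metadata = {
    "color": {"red": ["doc1"], "blue": ["doc2"]},
    "listing_date": {"2025-07-11": ["doc1"], "2025-08-01": ["doc2"]},
}
filters = [
    {"key": "listing_date", "value": "2025-07-01", "op": "≥"},
    {"key": "listing_date", "value": "2025-08-01", "op": "<"},
    {"key": "color", "value": "blue", "op": "≠"},
]
print(apply_filters(metadata, filters))  # {'doc1'}
```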
@@ -149,6 +149,7 @@ NEXT_STEP = load_prompt("next_step")
REFLECT = load_prompt("reflect")
SUMMARY4MEMORY = load_prompt("summary4memory")
RANK_MEMORY = load_prompt("rank_memory")
META_FILTER = load_prompt("meta_filter")

PROMPT_JINJA_ENV = jinja2.Environment(autoescape=False, trim_blocks=True, lstrip_blocks=True)
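The {{current_date}}, {{metadata_keys}}, and {{user_question}} placeholders in meta_filter.md are filled through this Jinja2 environment. A minimal rendering sketch with made-up values; the template string below is just a stand-in for the loaded prompt file:

```python
import jinja2

env = jinja2.Environment(autoescape=False, trim_blocks=True, lstrip_blocks=True)
template = env.from_string('- Today\'s date: {{current_date}}\n- User query: "{{user_question}}"')
print(template.render(current_date="2025-07-15", user_question="exclude blue products"))
# - Today's date: 2025-07-15
# - User query: "exclude blue products"
```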
@@ -413,3 +414,20 @@ def rank_memories(chat_mdl, goal:str, sub_goal:str, tool_call_summaries: list[st
    ans = chat_mdl.chat(msg[0]["content"], msg[1:], stop="<|stop|>")
    return re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)


def gen_meta_filter(chat_mdl, meta_data: dict, query: str) -> list:
    sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render(
        current_date=datetime.datetime.today().strftime('%Y-%m-%d'),
        metadata_keys=json.dumps(meta_data),
        user_question=query
    )
    user_prompt = "Generate filters:"
    ans = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_prompt}])
    ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
    try:
        ans = json_repair.loads(ans)
        assert isinstance(ans, list), ans
        return ans
    except Exception:
        logging.exception(f"Loading json failure: {ans}")
        return []
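A minimal usage sketch for gen_meta_filter with a stubbed chat model. The FakeChatModel class and the import path are assumptions for illustration only; in RAGFlow the chat_mdl argument is the configured LLM binding.

```python
import json

from rag.prompts import gen_meta_filter  # import path assumed from this diff


# Stand-in for chat_mdl, for illustration only: it ignores the prompts and
# returns a canned JSON array in the shape gen_meta_filter expects to parse.
class FakeChatModel:
    def chat(self, system_prompt, history, **kwargs):
        return json.dumps([{"key": "color", "value": "blue", "op": "≠"}])


meta_data = {"color": {"red": ["doc1"], "blue": ["doc2"]}}
filters = gen_meta_filter(FakeChatModel(), meta_data, "exclude blue products")
print(filters)  # [{'key': 'color', 'value': 'blue', 'op': '≠'}]
```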
@@ -444,7 +444,7 @@ async def embedding(docs, mdl, parser_config=None, callback=None):
        tts = np.concatenate([vts for _ in range(len(tts))], axis=0)
        tk_count += c

-   @timeout(5)
+   @timeout(60)
    def batch_encode(txts):
        nonlocal mdl
        return mdl.encode([truncate(c, mdl.max_length-10) for c in txts])
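This change only relaxes the per-batch time limit from 5 to 60 seconds. For readers unfamiliar with the pattern, here is a minimal sketch of a thread-based timeout decorator; it illustrates the general technique only and is not RAGFlow's own timeout implementation, which may differ.

```python
import concurrent.futures
import functools


def timeout(seconds):
    """Illustrative timeout decorator: run the call in a worker thread and
    raise concurrent.futures.TimeoutError if it exceeds the limit."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
            try:
                return pool.submit(func, *args, **kwargs).result(timeout=seconds)
            finally:
                pool.shutdown(wait=False)  # do not block on a call that is still running
        return wrapper
    return decorator


@timeout(60)
def batch_encode_demo(texts):
    return [len(t) for t in texts]  # stand-in for a real embedding call
```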
@@ -190,3 +190,17 @@ class RAGFlowS3:
            self.__open__()
            time.sleep(1)
        return

    @use_prefix_path
    @use_default_bucket
    def rm_bucket(self, bucket, *args, **kwargs):
        for conn in self.conn:
            try:
                if not conn.bucket_exists(bucket):
                    continue
                for o in conn.list_objects_v2(Bucket=bucket):
                    conn.delete_object(bucket, o.object_name)
                conn.delete_bucket(Bucket=bucket)
                return
            except Exception as e:
                logging.error(f"Fail rm {bucket}: " + str(e))
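For context, S3 rejects deletion of a non-empty bucket, which is why rm_bucket removes every object before deleting the bucket itself. A minimal plain-boto3 sketch of the same empty-then-delete pattern follows; the client setup is an assumption for illustration and is not how RAGFlowS3 manages its connections.

```python
import boto3


def rm_bucket_plain(bucket_name: str) -> None:
    # Plain boto3 illustration of the empty-then-delete pattern used above.
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(bucket_name)
    bucket.objects.all().delete()  # S3 refuses delete_bucket on a non-empty bucket
    bucket.delete()
```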