Feat: add meta data filter. (#9405)
### What problem does this PR solve?

#8531 #7417 #6761 #6573 #6477

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
@@ -383,8 +383,6 @@ class Dealer:
        vector_column = f"q_{dim}_vec"
        zero_vector = [0.0] * dim
        sim_np = np.array(sim)
        if doc_ids:
            similarity_threshold = 0
        filtered_count = (sim_np >= similarity_threshold).sum()
        ranks["total"] = int(filtered_count)  # Convert from np.int64 to Python int, otherwise JSON serialization fails
        for i in idx:
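The `int()` cast above matters because numpy integer scalars are not accepted by the standard-library JSON encoder. A minimal sketch of the failure and the fix, with made-up values:

```python
import json
import numpy as np

sim_np = np.array([0.8, 0.3, 0.6])
count = (sim_np >= 0.5).sum()  # numpy.int64, not a built-in int

# json.dumps({"total": count}) raises:
#   TypeError: Object of type int64 is not JSON serializable
print(json.dumps({"total": int(count)}))  # {"total": 2}
```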
rag/prompts/meta_filter.md (new file, 53 lines added)
@@ -0,0 +1,53 @@
You are a metadata filtering condition generator. Analyze the user's question and available document metadata to output a JSON array of filter objects. Follow these rules:

1. **Metadata Structure**:
   - Metadata is provided as JSON where keys are attribute names (e.g., "color"), and values are objects mapping attribute values to document IDs.
   - Example:
     {
       "color": {"red": ["doc1"], "blue": ["doc2"]},
       "listing_date": {"2025-07-11": ["doc1"], "2025-08-01": ["doc2"]}
     }

2. **Output Requirements**:
   - Always output a JSON array of filter objects
   - Each object must have:
     "key": (metadata attribute name),
     "value": (string value to compare),
     "op": (operator from allowed list)

3. **Operator Guide**:
   - Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"]
   - Date ranges: Break into two conditions (≥ start_date AND < next_month_start)
   - Negations: Always use "≠" for exclusion terms ("not", "except", "exclude", "≠")
   - Implicit logic: Derive unstated filters (e.g., "July" → [≥ YYYY-07-01, < YYYY-08-01])

4. **Processing Steps**:
   a) Identify ALL filterable attributes in the query (both explicit and implicit)
   b) For dates:
      - Infer missing year from current date if needed
      - Always format dates as "YYYY-MM-DD"
      - Convert ranges: [≥ start, < end]
   c) For values: Match EXACTLY to metadata's value keys
   d) Skip conditions if:
      - Attribute doesn't exist in metadata
      - Value has no match in metadata

5. **Example**:
   - User query: "上市日期七月份的有哪些商品,不要蓝色的" (i.e., "Which products have a July listing date? Exclude blue ones.")
   - Metadata: { "color": {...}, "listing_date": {...} }
   - Output:
     [
       {"key": "listing_date", "value": "2025-07-01", "op": "≥"},
       {"key": "listing_date", "value": "2025-08-01", "op": "<"},
       {"key": "color", "value": "blue", "op": "≠"}
     ]

6. **Final Output**:
   - ONLY output valid JSON array
   - NO additional text/explanations

**Current Task**:
- Today's date: {{current_date}}
- Available metadata keys: {{metadata_keys}}
- User query: "{{user_question}}"
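To make the contract of this prompt concrete, here is a purely illustrative sketch of how filter objects in this format could be applied to the metadata mapping shown above. The `apply_filters` helper is hypothetical and is not part of this PR; the retrieval-side filtering RAGFlow actually performs is not shown in this diff.

```python
# Hypothetical helper (not part of this PR): apply generated filter objects to a
# metadata mapping of the form {attribute: {value: [doc_ids]}} and return the
# document IDs that satisfy every condition (AND semantics).
def apply_filters(metadata: dict, filters: list) -> set:
    kept = {d for attr in metadata.values() for ids in attr.values() for d in ids}
    for f in filters:
        matched = set()
        for value, ids in metadata.get(f["key"], {}).items():
            op, target = f["op"], f["value"]
            ok = (
                (op == "=" and value == target)
                or (op == "≠" and value != target)
                or (op == "≥" and value >= target)  # ISO dates compare correctly as strings
                or (op == "<" and value < target)
                or (op == "contains" and target in value)
            )
            if ok:
                matched.update(ids)
        kept &= matched
    return kept


metadata = {
    "color": {"red": ["doc1"], "blue": ["doc2"]},
    "listing_date": {"2025-07-11": ["doc1"], "2025-08-01": ["doc2"]},
}
filters = [
    {"key": "listing_date", "value": "2025-07-01", "op": "≥"},
    {"key": "listing_date", "value": "2025-08-01", "op": "<"},
    {"key": "color", "value": "blue", "op": "≠"},
]
print(apply_filters(metadata, filters))  # {'doc1'}
```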
@@ -149,6 +149,7 @@ NEXT_STEP = load_prompt("next_step")
REFLECT = load_prompt("reflect")
SUMMARY4MEMORY = load_prompt("summary4memory")
RANK_MEMORY = load_prompt("rank_memory")
META_FILTER = load_prompt("meta_filter")

PROMPT_JINJA_ENV = jinja2.Environment(autoescape=False, trim_blocks=True, lstrip_blocks=True)
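The {{current_date}}, {{metadata_keys}}, and {{user_question}} placeholders in meta_filter.md are filled through this Jinja2 environment. A minimal rendering sketch with made-up values; the template string below is just a stand-in for the loaded prompt file:

```python
import jinja2

env = jinja2.Environment(autoescape=False, trim_blocks=True, lstrip_blocks=True)
template = env.from_string('- Today\'s date: {{current_date}}\n- User query: "{{user_question}}"')
print(template.render(current_date="2025-07-15", user_question="exclude blue products"))
# - Today's date: 2025-07-15
# - User query: "exclude blue products"
```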
@@ -413,3 +414,20 @@ def rank_memories(chat_mdl, goal:str, sub_goal:str, tool_call_summaries: list[st
    ans = chat_mdl.chat(msg[0]["content"], msg[1:], stop="<|stop|>")
    return re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)


def gen_meta_filter(chat_mdl, meta_data: dict, query: str) -> list:
    sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render(
        current_date=datetime.datetime.today().strftime('%Y-%m-%d'),
        metadata_keys=json.dumps(meta_data),
        user_question=query
    )
    user_prompt = "Generate filters:"
    ans = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_prompt}])
    ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
    try:
        ans = json_repair.loads(ans)
        assert isinstance(ans, list), ans
        return ans
    except Exception:
        logging.exception(f"Loading json failure: {ans}")
        return []
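A minimal usage sketch for gen_meta_filter with a stubbed chat model. The FakeChatModel class and the import path are assumptions for illustration only; in RAGFlow the chat_mdl argument is the configured LLM binding.

```python
import json

from rag.prompts import gen_meta_filter  # import path assumed from this diff


# Stand-in for chat_mdl, for illustration only: it ignores the prompts and
# returns a canned JSON array in the shape gen_meta_filter expects to parse.
class FakeChatModel:
    def chat(self, system_prompt, history, **kwargs):
        return json.dumps([{"key": "color", "value": "blue", "op": "≠"}])


meta_data = {"color": {"red": ["doc1"], "blue": ["doc2"]}}
filters = gen_meta_filter(FakeChatModel(), meta_data, "exclude blue products")
print(filters)  # [{'key': 'color', 'value': 'blue', 'op': '≠'}]
```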
@@ -444,7 +444,7 @@ async def embedding(docs, mdl, parser_config=None, callback=None):
        tts = np.concatenate([vts for _ in range(len(tts))], axis=0)
        tk_count += c

-   @timeout(5)
+   @timeout(60)
    def batch_encode(txts):
        nonlocal mdl
        return mdl.encode([truncate(c, mdl.max_length-10) for c in txts])
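This change only relaxes the per-batch time limit from 5 to 60 seconds. For readers unfamiliar with the pattern, here is a minimal sketch of a thread-based timeout decorator; it illustrates the general technique only and is not RAGFlow's own timeout implementation, which may differ.

```python
import concurrent.futures
import functools


def timeout(seconds):
    """Illustrative timeout decorator: run the call in a worker thread and
    raise concurrent.futures.TimeoutError if it exceeds the limit."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
            try:
                return pool.submit(func, *args, **kwargs).result(timeout=seconds)
            finally:
                pool.shutdown(wait=False)  # do not block on a call that is still running
        return wrapper
    return decorator


@timeout(60)
def batch_encode_demo(texts):
    return [len(t) for t in texts]  # stand-in for a real embedding call
```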
@@ -190,3 +190,17 @@ class RAGFlowS3:
            self.__open__()
            time.sleep(1)
        return

    @use_prefix_path
    @use_default_bucket
    def rm_bucket(self, bucket, *args, **kwargs):
        for conn in self.conn:
            try:
                if not conn.bucket_exists(bucket):
                    continue
                for o in conn.list_objects_v2(Bucket=bucket):
                    conn.delete_object(bucket, o.object_name)
                conn.delete_bucket(Bucket=bucket)
                return
            except Exception as e:
                logging.error(f"Fail rm {bucket}: " + str(e))
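For context, S3 rejects deletion of a non-empty bucket, which is why rm_bucket removes every object before deleting the bucket itself. A minimal plain-boto3 sketch of the same empty-then-delete pattern follows; the client setup is an assumption for illustration and is not how RAGFlowS3 manages its connections.

```python
import boto3


def rm_bucket_plain(bucket_name: str) -> None:
    # Plain boto3 illustration of the empty-then-delete pattern used above.
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(bucket_name)
    bucket.objects.all().delete()  # S3 refuses delete_bucket on a non-empty bucket
    bucket.delete()
```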