Put document metadata in ES/Infinity (#12826)

### What problem does this PR solve?

Store document metadata in ES/Infinity so that retrieval can filter chunks by per-document metadata conditions.

Metadata index name: `ragflow_doc_meta_{tenant_id}` (one index per tenant).
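For illustration, a minimal sketch of the naming scheme; the helper name and sample tenant id are hypothetical, not part of this PR's API:

    # Hypothetical helper showing how the per-tenant metadata index name is composed.
    def doc_meta_index_name(tenant_id: str) -> str:
        return f"ragflow_doc_meta_{tenant_id}"

    doc_meta_index_name("tenant123")  # -> "ragflow_doc_meta_tenant123"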

### Type of change

- [x] Refactoring
Author: qinling0210
Date: 2026-01-28 13:29:34 +08:00 (committed by GitHub)
parent fd11aca8e5
commit 9a5208976c
24 changed files with 1529 additions and 304 deletions

View File

@@ -0,0 +1,184 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
End-to-end test for metadata filtering during retrieval.
Tests that chunks are only retrieved from documents matching the metadata condition.
"""
import logging
import time

import pytest
import requests

from common import (
    create_dataset,
    delete_datasets,
    list_documents,
    parse_documents,
    retrieval_chunks,
    update_document,
    upload_documents,
)
from configs import HOST_ADDRESS, VERSION
from utils import wait_for


@wait_for(30, 1, "Document parsing timeout")
def _condition_parsing_complete(_auth, dataset_id):
res = list_documents(_auth, dataset_id)
if res["code"] != 0:
return False
for doc in res["data"]["docs"]:
status = doc.get("run", "UNKNOWN")
if status == "FAILED":
pytest.fail(f"Document parsing failed: {doc}")
return False
if status != "DONE":
return False
    return True


@pytest.fixture(scope="function")
def add_dataset_with_metadata(HttpApiAuth):
# First create the dataset
    res = create_dataset(HttpApiAuth, {
        "name": f"test_metadata_{int(time.time())}",
        "chunk_method": "naive"
    })
assert res["code"] == 0, f"Failed to create dataset: {res}"
dataset_id = res["data"]["id"]
    # Then configure the metadata schema via the update_metadata_setting endpoint
metadata_config = {
"type": "object",
"properties": {
"character": {
"description": "Historical figure name",
"type": "string"
},
"era": {
"description": "Historical era",
"type": "string"
},
"achievements": {
"description": "Major achievements",
"type": "array",
"items": {
"type": "string"
}
}
}
}
res = requests.post(
url=f"{HOST_ADDRESS}/{VERSION}/kb/update_metadata_setting",
headers={"Content-Type": "application/json"},
auth=HttpApiAuth,
json={
"kb_id": dataset_id,
"metadata": metadata_config,
"enable_metadata": False
}
).json()
assert res["code"] == 0, f"Failed to configure metadata: {res}"
yield dataset_id
# Cleanup
    delete_datasets(HttpApiAuth, {"ids": [dataset_id]})


@pytest.mark.p2
class TestMetadataWithRetrieval:
"""Test retrieval with metadata filtering."""
def test_retrieval_with_metadata_filter(self, HttpApiAuth, add_dataset_with_metadata, tmp_path):
"""
Test that retrieval respects metadata filters.
Verifies that chunks are only retrieved from documents matching the metadata condition.
"""
dataset_id = add_dataset_with_metadata
# Create two documents with different metadata
content_doc1 = "Document about Zhuge Liang who lived in Three Kingdoms period."
content_doc2 = "Document about Cao Cao who lived in Late Eastern Han Dynasty."
fp1 = tmp_path / "doc1_zhuge_liang.txt"
fp2 = tmp_path / "doc2_cao_cao.txt"
        fp1.write_text(content_doc1, encoding="utf-8")
        fp2.write_text(content_doc2, encoding="utf-8")
# Upload both documents
res = upload_documents(HttpApiAuth, dataset_id, [fp1, fp2])
assert res["code"] == 0, f"Failed to upload documents: {res}"
doc1_id = res["data"][0]["id"]
doc2_id = res["data"][1]["id"]
# Add different metadata to each document
res = update_document(HttpApiAuth, dataset_id, doc1_id, {
"meta_fields": {"character": "Zhuge Liang", "era": "Three Kingdoms"}
})
assert res["code"] == 0, f"Failed to update doc1 metadata: {res}"
res = update_document(HttpApiAuth, dataset_id, doc2_id, {
"meta_fields": {"character": "Cao Cao", "era": "Late Eastern Han"}
})
assert res["code"] == 0, f"Failed to update doc2 metadata: {res}"
# Parse both documents
res = parse_documents(HttpApiAuth, dataset_id, {"document_ids": [doc1_id, doc2_id]})
assert res["code"] == 0, f"Failed to trigger parsing: {res}"
# Wait for parsing to complete
assert _condition_parsing_complete(HttpApiAuth, dataset_id), "Parsing timeout"
# Test retrieval WITH metadata filter for "Zhuge Liang"
res = retrieval_chunks(HttpApiAuth, {
"question": "Zhuge Liang",
"dataset_ids": [dataset_id],
"metadata_condition": {
"logic": "and",
"conditions": [
{
"name": "character",
"comparison_operator": "is",
"value": "Zhuge Liang"
}
]
}
})
assert res["code"] == 0, f"Retrieval with metadata filter failed: {res}"
chunks_with_filter = res["data"]["chunks"]
doc_ids_with_filter = set(chunk.get("document_id", "") for chunk in chunks_with_filter)
logging.info(f"✓ Retrieved {len(chunks_with_filter)} chunks from documents: {doc_ids_with_filter}")
# Verify that filtered results only contain doc1 (Zhuge Liang)
if len(chunks_with_filter) > 0:
assert doc1_id in doc_ids_with_filter, f"Filtered results should contain doc1 (Zhuge Liang), but got: {doc_ids_with_filter}"
assert doc2_id not in doc_ids_with_filter, f"Filtered results should NOT contain doc2 (Cao Cao), but got: {doc_ids_with_filter}"
logging.info("Metadata filter correctly excluded chunks from non-matching documents")
else:
            logging.warning("No chunks retrieved with filter - this might be because no embedding model is configured")
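The `metadata_condition` above reads as "character is Zhuge Liang". A toy evaluator sketching the matching semantics the test relies on; this is an assumption about the server-side behavior, for illustration only:

    # Toy predicate: models only the "is" operator and the "and"/"or" logic used in the test.
    def matches(meta_fields: dict, condition: dict) -> bool:
        checks = [
            c["comparison_operator"] == "is" and meta_fields.get(c["name"]) == c["value"]
            for c in condition["conditions"]
        ]
        return all(checks) if condition.get("logic", "and") == "and" else any(checks)

    # doc1 matches, doc2 does not:
    matches({"character": "Zhuge Liang"}, {"logic": "and", "conditions": [
        {"name": "character", "comparison_operator": "is", "value": "Zhuge Liang"}]})  # True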

View File

@@ -22,7 +22,9 @@
 def _summary_to_counts(summary):
     counts = {}
-    for key, pairs in summary.items():
+    for key, field_data in summary.items():
+        # New format: {key: {"type": "...", "values": [[value, count], ...]}}
+        pairs = field_data["values"]
         counts[key] = {str(k): v for k, v in pairs}
     return counts
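For reference, what the updated helper produces for a summary in the new format; the field name and counts below are illustrative:

    # Illustrative input in the new {key: {"type": ..., "values": ...}} shape.
    summary = {"era": {"type": "string", "values": [["Three Kingdoms", 3], ["Late Eastern Han", 1]]}}
    _summary_to_counts(summary)
    # -> {"era": {"Three Kingdoms": 3, "Late Eastern Han": 1}}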