mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-23 11:36:38 +08:00
## Summary

Fixes #12520 - Deleted chunks should not appear in retrieval/reference results.

## Changes

### Core Fix

- **api/apps/chunk_app.py**: Include `doc_id` in the delete condition to properly scope the delete operation

### Improved Error Handling

- **api/db/services/document_service.py**: Better separation of concerns, with individual try/except blocks and proper logging for each cleanup operation

### Doc Store Updates

- **rag/utils/es_conn.py**: Updated delete query construction to support compound conditions
- **rag/utils/opensearch_conn.py**: Same updates for OpenSearch compatibility

### Tests

- **test/testcases/.../test_retrieval_chunks.py**: Added `TestDeletedChunksNotRetrievable` class with regression tests
- **test/unit/test_delete_query_construction.py**: Unit tests for delete query construction

## Testing

- Added regression tests that verify deleted chunks are not returned by the retrieval API
- Tests cover single-chunk deletion and batch deletion scenarios
This commit is contained in:
@ -303,32 +303,43 @@ class ESConnection(ESConnectionBase):
|
||||
def delete(self, condition: dict, index_name: str, knowledgebase_id: str) -> int:
|
||||
assert "_id" not in condition
|
||||
condition["kb_id"] = knowledgebase_id
|
||||
|
||||
# Build a bool query that combines id filter with other conditions
|
||||
bool_query = Q("bool")
|
||||
|
||||
# Handle chunk IDs if present
|
||||
if "id" in condition:
|
||||
chunk_ids = condition["id"]
|
||||
if not isinstance(chunk_ids, list):
|
||||
chunk_ids = [chunk_ids]
|
||||
if not chunk_ids: # when chunk_ids is empty, delete all
|
||||
qry = Q("match_all")
|
||||
else:
|
||||
qry = Q("ids", values=chunk_ids)
|
||||
if chunk_ids:
|
||||
# Filter by specific chunk IDs
|
||||
bool_query.filter.append(Q("ids", values=chunk_ids))
|
||||
# If chunk_ids is empty, we don't add an ids filter - rely on other conditions
|
||||
|
||||
# Add all other conditions as filters
|
||||
for k, v in condition.items():
|
||||
if k == "id":
|
||||
continue # Already handled above
|
||||
if k == "exists":
|
||||
bool_query.filter.append(Q("exists", field=v))
|
||||
elif k == "must_not":
|
||||
if isinstance(v, dict):
|
||||
for kk, vv in v.items():
|
||||
if kk == "exists":
|
||||
bool_query.must_not.append(Q("exists", field=vv))
|
||||
elif isinstance(v, list):
|
||||
bool_query.must.append(Q("terms", **{k: v}))
|
||||
elif isinstance(v, str) or isinstance(v, int):
|
||||
bool_query.must.append(Q("term", **{k: v}))
|
||||
elif v is not None:
|
||||
raise Exception("Condition value must be int, str or list.")
|
||||
|
||||
# If no filters were added, use match_all (for tenant-wide operations)
|
||||
if not bool_query.filter and not bool_query.must and not bool_query.must_not:
|
||||
qry = Q("match_all")
|
||||
else:
|
||||
qry = Q("bool")
|
||||
for k, v in condition.items():
|
||||
if k == "exists":
|
||||
qry.filter.append(Q("exists", field=v))
|
||||
|
||||
elif k == "must_not":
|
||||
if isinstance(v, dict):
|
||||
for kk, vv in v.items():
|
||||
if kk == "exists":
|
||||
qry.must_not.append(Q("exists", field=vv))
|
||||
|
||||
elif isinstance(v, list):
|
||||
qry.must.append(Q("terms", **{k: v}))
|
||||
elif isinstance(v, str) or isinstance(v, int):
|
||||
qry.must.append(Q("term", **{k: v}))
|
||||
else:
|
||||
raise Exception("Condition value must be int, str or list.")
|
||||
qry = bool_query
|
||||
self.logger.debug("ESConnection.delete query: " + json.dumps(qry.to_dict()))
|
||||
for _ in range(ATTEMPT_TIME):
|
||||
try:
|
||||
|
||||
@ -405,34 +405,45 @@ class OSConnection(DocStoreConnection):
|
||||
return False
|
||||
|
||||
def delete(self, condition: dict, indexName: str, knowledgebaseId: str) -> int:
|
||||
qry = None
|
||||
assert "_id" not in condition
|
||||
condition["kb_id"] = knowledgebaseId
|
||||
|
||||
# Build a bool query that combines id filter with other conditions
|
||||
bool_query = Q("bool")
|
||||
|
||||
# Handle chunk IDs if present
|
||||
if "id" in condition:
|
||||
chunk_ids = condition["id"]
|
||||
if not isinstance(chunk_ids, list):
|
||||
chunk_ids = [chunk_ids]
|
||||
if not chunk_ids: # when chunk_ids is empty, delete all
|
||||
qry = Q("match_all")
|
||||
else:
|
||||
qry = Q("ids", values=chunk_ids)
|
||||
if chunk_ids:
|
||||
# Filter by specific chunk IDs
|
||||
bool_query.filter.append(Q("ids", values=chunk_ids))
|
||||
# If chunk_ids is empty, we don't add an ids filter - rely on other conditions
|
||||
|
||||
# Add all other conditions as filters
|
||||
for k, v in condition.items():
|
||||
if k == "id":
|
||||
continue # Already handled above
|
||||
if k == "exists":
|
||||
bool_query.filter.append(Q("exists", field=v))
|
||||
elif k == "must_not":
|
||||
if isinstance(v, dict):
|
||||
for kk, vv in v.items():
|
||||
if kk == "exists":
|
||||
bool_query.must_not.append(Q("exists", field=vv))
|
||||
elif isinstance(v, list):
|
||||
bool_query.must.append(Q("terms", **{k: v}))
|
||||
elif isinstance(v, str) or isinstance(v, int):
|
||||
bool_query.must.append(Q("term", **{k: v}))
|
||||
elif v is not None:
|
||||
raise Exception("Condition value must be int, str or list.")
|
||||
|
||||
# If no filters were added, use match_all (for tenant-wide operations)
|
||||
if not bool_query.filter and not bool_query.must and not bool_query.must_not:
|
||||
qry = Q("match_all")
|
||||
else:
|
||||
qry = Q("bool")
|
||||
for k, v in condition.items():
|
||||
if k == "exists":
|
||||
qry.filter.append(Q("exists", field=v))
|
||||
|
||||
elif k == "must_not":
|
||||
if isinstance(v, dict):
|
||||
for kk, vv in v.items():
|
||||
if kk == "exists":
|
||||
qry.must_not.append(Q("exists", field=vv))
|
||||
|
||||
elif isinstance(v, list):
|
||||
qry.must.append(Q("terms", **{k: v}))
|
||||
elif isinstance(v, str) or isinstance(v, int):
|
||||
qry.must.append(Q("term", **{k: v}))
|
||||
else:
|
||||
raise Exception("Condition value must be int, str or list.")
|
||||
qry = bool_query
|
||||
logger.debug("OSConnection.delete query: " + json.dumps(qry.to_dict()))
|
||||
for _ in range(ATTEMPT_TIME):
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user