fix: ensure deleted chunks are not returned in retrieval (#12520) (#12546)

## Summary
Fixes #12520 - Deleted chunks should not appear in retrieval/reference
results.

## Changes

### Core Fix
- **api/apps/chunk_app.py**: Include `doc_id` in the delete condition to
properly scope the delete operation

### Improved Error Handling
- **api/db/services/document_service.py**: Better separation of concerns
with individual try/except blocks and proper logging for each cleanup
operation

### Doc Store Updates
- **rag/utils/es_conn.py**: Updated delete query construction to support
compound conditions
- **rag/utils/opensearch_conn.py**: Same updates for OpenSearch
compatibility

### Tests
- **test/testcases/.../test_retrieval_chunks.py**: Added
`TestDeletedChunksNotRetrievable` class with regression tests
- **test/unit/test_delete_query_construction.py**: Unit tests for delete
query construction

## Testing
- Added regression tests that verify deleted chunks are not returned by
the retrieval API
- Tests cover single chunk deletion and batch deletion scenarios
This commit is contained in:
Vedant Madane
2026-01-15 12:15:55 +05:30
committed by GitHub
parent d8192f8f17
commit ac936005e6
6 changed files with 472 additions and 48 deletions

View File

@ -303,32 +303,43 @@ class ESConnection(ESConnectionBase):
def delete(self, condition: dict, index_name: str, knowledgebase_id: str) -> int:
assert "_id" not in condition
condition["kb_id"] = knowledgebase_id
# Build a bool query that combines id filter with other conditions
bool_query = Q("bool")
# Handle chunk IDs if present
if "id" in condition:
chunk_ids = condition["id"]
if not isinstance(chunk_ids, list):
chunk_ids = [chunk_ids]
if not chunk_ids: # when chunk_ids is empty, delete all
qry = Q("match_all")
else:
qry = Q("ids", values=chunk_ids)
if chunk_ids:
# Filter by specific chunk IDs
bool_query.filter.append(Q("ids", values=chunk_ids))
# If chunk_ids is empty, we don't add an ids filter - rely on other conditions
# Add all other conditions as filters
for k, v in condition.items():
if k == "id":
continue # Already handled above
if k == "exists":
bool_query.filter.append(Q("exists", field=v))
elif k == "must_not":
if isinstance(v, dict):
for kk, vv in v.items():
if kk == "exists":
bool_query.must_not.append(Q("exists", field=vv))
elif isinstance(v, list):
bool_query.must.append(Q("terms", **{k: v}))
elif isinstance(v, str) or isinstance(v, int):
bool_query.must.append(Q("term", **{k: v}))
elif v is not None:
raise Exception("Condition value must be int, str or list.")
# If no filters were added, use match_all (for tenant-wide operations)
if not bool_query.filter and not bool_query.must and not bool_query.must_not:
qry = Q("match_all")
else:
qry = Q("bool")
for k, v in condition.items():
if k == "exists":
qry.filter.append(Q("exists", field=v))
elif k == "must_not":
if isinstance(v, dict):
for kk, vv in v.items():
if kk == "exists":
qry.must_not.append(Q("exists", field=vv))
elif isinstance(v, list):
qry.must.append(Q("terms", **{k: v}))
elif isinstance(v, str) or isinstance(v, int):
qry.must.append(Q("term", **{k: v}))
else:
raise Exception("Condition value must be int, str or list.")
qry = bool_query
self.logger.debug("ESConnection.delete query: " + json.dumps(qry.to_dict()))
for _ in range(ATTEMPT_TIME):
try:

View File

@ -405,34 +405,45 @@ class OSConnection(DocStoreConnection):
return False
def delete(self, condition: dict, indexName: str, knowledgebaseId: str) -> int:
qry = None
assert "_id" not in condition
condition["kb_id"] = knowledgebaseId
# Build a bool query that combines id filter with other conditions
bool_query = Q("bool")
# Handle chunk IDs if present
if "id" in condition:
chunk_ids = condition["id"]
if not isinstance(chunk_ids, list):
chunk_ids = [chunk_ids]
if not chunk_ids: # when chunk_ids is empty, delete all
qry = Q("match_all")
else:
qry = Q("ids", values=chunk_ids)
if chunk_ids:
# Filter by specific chunk IDs
bool_query.filter.append(Q("ids", values=chunk_ids))
# If chunk_ids is empty, we don't add an ids filter - rely on other conditions
# Add all other conditions as filters
for k, v in condition.items():
if k == "id":
continue # Already handled above
if k == "exists":
bool_query.filter.append(Q("exists", field=v))
elif k == "must_not":
if isinstance(v, dict):
for kk, vv in v.items():
if kk == "exists":
bool_query.must_not.append(Q("exists", field=vv))
elif isinstance(v, list):
bool_query.must.append(Q("terms", **{k: v}))
elif isinstance(v, str) or isinstance(v, int):
bool_query.must.append(Q("term", **{k: v}))
elif v is not None:
raise Exception("Condition value must be int, str or list.")
# If no filters were added, use match_all (for tenant-wide operations)
if not bool_query.filter and not bool_query.must and not bool_query.must_not:
qry = Q("match_all")
else:
qry = Q("bool")
for k, v in condition.items():
if k == "exists":
qry.filter.append(Q("exists", field=v))
elif k == "must_not":
if isinstance(v, dict):
for kk, vv in v.items():
if kk == "exists":
qry.must_not.append(Q("exists", field=vv))
elif isinstance(v, list):
qry.must.append(Q("terms", **{k: v}))
elif isinstance(v, str) or isinstance(v, int):
qry.must.append(Q("term", **{k: v}))
else:
raise Exception("Condition value must be int, str or list.")
qry = bool_query
logger.debug("OSConnection.delete query: " + json.dumps(qry.to_dict()))
for _ in range(ATTEMPT_TIME):
try: