fix: ensure deleted chunks are not returned in retrieval (#12520) (#12546)

## Summary
Fixes #12520 - Deleted chunks should not appear in retrieval/reference
results.

## Changes

### Core Fix
- **api/apps/chunk_app.py**: Include `doc_id` in the delete condition to
properly scope the delete operation to the owning document

### Improved Error Handling
- **api/db/services/document_service.py**: Better separation of concerns,
with an individual try/except block and proper logging for each cleanup
operation

### Doc Store Updates
- **rag/utils/es_conn.py**: Updated delete query construction to support
compound conditions
- **rag/utils/opensearch_conn.py**: Same updates for OpenSearch
compatibility

### Tests
- **test/testcases/.../test_retrieval_chunks.py**: Added the
`TestDeletedChunksNotRetrievable` class with regression tests
- **test/unit/test_delete_query_construction.py**: Unit tests for delete
query construction

## Testing
- Added regression tests that verify deleted chunks are not returned by
retrieval API
- Tests cover single chunk deletion and batch deletion scenarios
This commit is contained in:
Vedant Madane
2026-01-15 12:15:55 +05:30
committed by GitHub
parent d8192f8f17
commit ac936005e6
6 changed files with 472 additions and 48 deletions

View File

@ -223,7 +223,9 @@ async def rm():
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if not settings.docStoreConn.delete({"id": req["chunk_ids"]},
# Include doc_id in condition to properly scope the delete
condition = {"id": req["chunk_ids"], "doc_id": req["doc_id"]}
if not settings.docStoreConn.delete(condition,
search.index_name(DocumentService.get_tenant_id(req["doc_id"])),
doc.kb_id):
return get_data_error_result(message="Chunk deleting failure")

View File

@ -340,14 +340,35 @@ class DocumentService(CommonService):
def remove_document(cls, doc, tenant_id):
from api.db.services.task_service import TaskService
cls.clear_chunk_num(doc.id)
# Delete tasks first
try:
TaskService.filter_delete([Task.doc_id == doc.id])
except Exception as e:
logging.warning(f"Failed to delete tasks for document {doc.id}: {e}")
# Delete chunk images (non-critical, log and continue)
try:
cls.delete_chunk_images(doc, tenant_id)
except Exception as e:
logging.warning(f"Failed to delete chunk images for document {doc.id}: {e}")
# Delete thumbnail (non-critical, log and continue)
try:
if doc.thumbnail and not doc.thumbnail.startswith(IMG_BASE64_PREFIX):
if settings.STORAGE_IMPL.obj_exist(doc.kb_id, doc.thumbnail):
settings.STORAGE_IMPL.rm(doc.kb_id, doc.thumbnail)
settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
except Exception as e:
logging.warning(f"Failed to delete thumbnail for document {doc.id}: {e}")
# Delete chunks from doc store - this is critical, log errors
try:
settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
except Exception as e:
logging.error(f"Failed to delete chunks from doc store for document {doc.id}: {e}")
# Cleanup knowledge graph references (non-critical, log and continue)
try:
graph_source = settings.docStoreConn.get_fields(
settings.docStoreConn.search(["source_id"], [], {"kb_id": doc.kb_id, "knowledge_graph_kwd": ["graph"]}, [], OrderByExpr(), 0, 1, search.index_name(tenant_id), [doc.kb_id]), ["source_id"]
)
@ -360,8 +381,9 @@ class DocumentService(CommonService):
search.index_name(tenant_id), doc.kb_id)
settings.docStoreConn.delete({"kb_id": doc.kb_id, "knowledge_graph_kwd": ["entity", "relation", "graph", "subgraph", "community_report"], "must_not": {"exists": "source_id"}},
search.index_name(tenant_id), doc.kb_id)
except Exception:
pass
except Exception as e:
logging.warning(f"Failed to cleanup knowledge graph for document {doc.id}: {e}")
return cls.delete_by_id(doc.id)
@classmethod