mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-23 11:36:38 +08:00
## Summary Fixes #12520 - Deleted chunks should not appear in retrieval/reference results. ## Changes ### Core Fix - **api/apps/chunk_app.py**: Include \doc_id\ in delete condition to properly scope the delete operation ### Improved Error Handling - **api/db/services/document_service.py**: Better separation of concerns with individual try-catch blocks and proper logging for each cleanup operation ### Doc Store Updates - **rag/utils/es_conn.py**: Updated delete query construction to support compound conditions - **rag/utils/opensearch_conn.py**: Same updates for OpenSearch compatibility ### Tests - **test/testcases/.../test_retrieval_chunks.py**: Added \TestDeletedChunksNotRetrievable\ class with regression tests - **test/unit/test_delete_query_construction.py**: Unit tests for delete query construction ## Testing - Added regression tests that verify deleted chunks are not returned by retrieval API - Tests cover single chunk deletion and batch deletion scenarios
This commit is contained in:
@ -15,9 +15,10 @@
|
||||
#
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from time import sleep
|
||||
|
||||
import pytest
|
||||
from common import retrieval_chunks
|
||||
from common import add_chunk, delete_chunks, retrieval_chunks
|
||||
from configs import INVALID_API_TOKEN
|
||||
from libs.auth import RAGFlowHttpApiAuth
|
||||
|
||||
@ -310,3 +311,89 @@ class TestChunksRetrieval:
|
||||
responses = list(as_completed(futures))
|
||||
assert len(responses) == count, responses
|
||||
assert all(future.result()["code"] == 0 for future in futures)
|
||||
|
||||
|
||||
class TestDeletedChunksNotRetrievable:
|
||||
"""Regression tests for issue #12520: deleted slices should not appear in retrieval/reference."""
|
||||
|
||||
@pytest.mark.p1
|
||||
def test_deleted_chunk_not_in_retrieval(self, HttpApiAuth, add_document):
|
||||
"""
|
||||
Test that a deleted chunk is not returned by the retrieval API.
|
||||
|
||||
Steps:
|
||||
1. Add a chunk with unique content
|
||||
2. Verify the chunk is retrievable
|
||||
3. Delete the chunk
|
||||
4. Verify the chunk is no longer retrievable
|
||||
"""
|
||||
dataset_id, document_id = add_document
|
||||
|
||||
# Add a chunk with unique content that we can search for
|
||||
unique_content = "UNIQUE_TEST_CONTENT_12520_REGRESSION"
|
||||
res = add_chunk(HttpApiAuth, dataset_id, document_id, {"content": unique_content})
|
||||
assert res["code"] == 0, f"Failed to add chunk: {res}"
|
||||
chunk_id = res["data"]["chunk"]["id"]
|
||||
|
||||
# Wait for indexing to complete
|
||||
sleep(2)
|
||||
|
||||
# Verify the chunk is retrievable
|
||||
payload = {"question": unique_content, "dataset_ids": [dataset_id]}
|
||||
res = retrieval_chunks(HttpApiAuth, payload)
|
||||
assert res["code"] == 0, f"Retrieval failed: {res}"
|
||||
chunk_ids_before = [c["id"] for c in res["data"]["chunks"]]
|
||||
assert chunk_id in chunk_ids_before, f"Chunk {chunk_id} should be retrievable before deletion"
|
||||
|
||||
# Delete the chunk
|
||||
res = delete_chunks(HttpApiAuth, dataset_id, document_id, {"chunk_ids": [chunk_id]})
|
||||
assert res["code"] == 0, f"Failed to delete chunk: {res}"
|
||||
|
||||
# Wait for deletion to propagate
|
||||
sleep(1)
|
||||
|
||||
# Verify the chunk is no longer retrievable
|
||||
res = retrieval_chunks(HttpApiAuth, payload)
|
||||
assert res["code"] == 0, f"Retrieval failed after deletion: {res}"
|
||||
chunk_ids_after = [c["id"] for c in res["data"]["chunks"]]
|
||||
assert chunk_id not in chunk_ids_after, f"Chunk {chunk_id} should NOT be retrievable after deletion"
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_deleted_chunks_batch_not_in_retrieval(self, HttpApiAuth, add_document):
|
||||
"""
|
||||
Test that multiple deleted chunks are not returned by retrieval.
|
||||
"""
|
||||
dataset_id, document_id = add_document
|
||||
|
||||
# Add multiple chunks with unique content
|
||||
chunk_ids = []
|
||||
for i in range(3):
|
||||
unique_content = f"BATCH_DELETE_TEST_CHUNK_{i}_12520"
|
||||
res = add_chunk(HttpApiAuth, dataset_id, document_id, {"content": unique_content})
|
||||
assert res["code"] == 0, f"Failed to add chunk {i}: {res}"
|
||||
chunk_ids.append(res["data"]["chunk"]["id"])
|
||||
|
||||
# Wait for indexing
|
||||
sleep(2)
|
||||
|
||||
# Verify chunks are retrievable
|
||||
payload = {"question": "BATCH_DELETE_TEST_CHUNK", "dataset_ids": [dataset_id]}
|
||||
res = retrieval_chunks(HttpApiAuth, payload)
|
||||
assert res["code"] == 0
|
||||
retrieved_ids_before = [c["id"] for c in res["data"]["chunks"]]
|
||||
for cid in chunk_ids:
|
||||
assert cid in retrieved_ids_before, f"Chunk {cid} should be retrievable before deletion"
|
||||
|
||||
# Delete all chunks
|
||||
res = delete_chunks(HttpApiAuth, dataset_id, document_id, {"chunk_ids": chunk_ids})
|
||||
assert res["code"] == 0, f"Failed to delete chunks: {res}"
|
||||
|
||||
# Wait for deletion to propagate
|
||||
sleep(1)
|
||||
|
||||
# Verify none of the chunks are retrievable
|
||||
res = retrieval_chunks(HttpApiAuth, payload)
|
||||
assert res["code"] == 0
|
||||
retrieved_ids_after = [c["id"] for c in res["data"]["chunks"]]
|
||||
for cid in chunk_ids:
|
||||
assert cid not in retrieved_ids_after, f"Chunk {cid} should NOT be retrievable after deletion"
|
||||
|
||||
Reference in New Issue
Block a user