diff --git a/sdk/python/test/test_http_api/common.py b/sdk/python/test/test_http_api/common.py
index 1204a1a18..a76771364 100644
--- a/sdk/python/test/test_http_api/common.py
+++ b/sdk/python/test/test_http_api/common.py
@@ -35,42 +35,22 @@ DOCUMENT_NAME_LIMIT = 128
 
 
 # DATASET MANAGEMENT
 def create_dataset(auth, payload=None):
-    res = requests.post(
-        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
-        headers=HEADERS,
-        auth=auth,
-        json=payload,
-    )
+    res = requests.post(url=f"{HOST_ADDRESS}{DATASETS_API_URL}", headers=HEADERS, auth=auth, json=payload)
     return res.json()
 
 
 def list_dataset(auth, params=None):
-    res = requests.get(
-        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
-        headers=HEADERS,
-        auth=auth,
-        params=params,
-    )
+    res = requests.get(url=f"{HOST_ADDRESS}{DATASETS_API_URL}", headers=HEADERS, auth=auth, params=params)
     return res.json()
 
 
 def update_dataset(auth, dataset_id, payload=None):
-    res = requests.put(
-        url=f"{HOST_ADDRESS}{DATASETS_API_URL}/{dataset_id}",
-        headers=HEADERS,
-        auth=auth,
-        json=payload,
-    )
+    res = requests.put(url=f"{HOST_ADDRESS}{DATASETS_API_URL}/{dataset_id}", headers=HEADERS, auth=auth, json=payload)
     return res.json()
 
 
 def delete_dataset(auth, payload=None):
-    res = requests.delete(
-        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
-        headers=HEADERS,
-        auth=auth,
-        json=payload,
-    )
+    res = requests.delete(url=f"{HOST_ADDRESS}{DATASETS_API_URL}", headers=HEADERS, auth=auth, json=payload)
     return res.json()
 
@@ -127,12 +107,7 @@ def download_document(auth, dataset_id, document_id, save_path):
 
 def list_documnet(auth, dataset_id, params=None):
     url = f"{HOST_ADDRESS}{FILE_API_URL}".format(dataset_id=dataset_id)
-    res = requests.get(
-        url=url,
-        headers=HEADERS,
-        auth=auth,
-        params=params,
-    )
+    res = requests.get(url=url, headers=HEADERS, auth=auth, params=params)
     return res.json()
 
 
@@ -181,12 +156,7 @@ def add_chunk(auth, dataset_id, document_id, payload=None):
 
 def list_chunks(auth, dataset_id, document_id, params=None):
     url = f"{HOST_ADDRESS}{CHUNK_API_URL}".format(dataset_id=dataset_id, document_id=document_id)
-    res = requests.get(
-        url=url,
-        headers=HEADERS,
-        auth=auth,
-        params=params,
-    )
+    res = requests.get(url=url, headers=HEADERS, auth=auth, params=params)
     return res.json()
 
 
@@ -196,6 +166,12 @@ def update_chunk(auth, dataset_id, document_id, chunk_id, payload=None):
     return res.json()
 
 
+def delete_chunks(auth, dataset_id, document_id, payload=None):
+    url = f"{HOST_ADDRESS}{CHUNK_API_URL}".format(dataset_id=dataset_id, document_id=document_id)
+    res = requests.delete(url=url, headers=HEADERS, auth=auth, json=payload)
+    return res.json()
+
+
 def batch_add_chunks(auth, dataset_id, document_id, num):
     chunk_ids = []
     for i in range(num):
diff --git a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/conftest.py b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/conftest.py
index 8418c8333..69de33600 100644
--- a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/conftest.py
+++ b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/conftest.py
@@ -16,7 +16,7 @@
 
 
 import pytest
-from common import add_chunk, batch_create_datasets, bulk_upload_documents, delete_dataset, list_documnet, parse_documnet
+from common import add_chunk, batch_create_datasets, bulk_upload_documents, delete_chunks, delete_dataset, list_documnet, parse_documnet
 from libs.utils import wait_for
 
 
@@ -62,4 +62,25 @@ def add_chunks(get_http_api_auth, get_dataset_id_and_document_id):
     from time import sleep
 
     sleep(1)
-    yield dataset_id, document_id, chunk_ids
+    return dataset_id, document_id, chunk_ids
+
+
+@pytest.fixture(scope="function")
+def add_chunks_func(get_http_api_auth, get_dataset_id_and_document_id, request):
+    dataset_id, document_id = get_dataset_id_and_document_id
+
+    chunk_ids = []
+    for i in range(4):
+        res = add_chunk(get_http_api_auth, dataset_id, document_id, {"content": f"chunk test {i}"})
+        chunk_ids.append(res["data"]["chunk"]["id"])
+
+    # issues/6487
+    from time import sleep
+
+    sleep(1)
+
+    def cleanup():
+        delete_chunks(get_http_api_auth, dataset_id, document_id, {"chunk_ids": chunk_ids})
+
+    request.addfinalizer(cleanup)
+    return dataset_id, document_id, chunk_ids
diff --git a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py
index 79fd6ec5e..d8691f1f9 100644
--- a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py
+++ b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py
@@ -133,7 +133,6 @@ class TestAddChunk:
             assert False, res
         chunks_count = res["data"]["doc"]["chunk_count"]
         res = add_chunk(get_http_api_auth, dataset_id, document_id, payload)
-        print(res)
         assert res["code"] == expected_code
         if expected_code == 0:
             validate_chunk_details(dataset_id, document_id, payload, res)
diff --git a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_delete_chunks.py b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_delete_chunks.py
new file mode 100644
index 000000000..f466c9584
--- /dev/null
+++ b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_delete_chunks.py
@@ -0,0 +1,208 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import os
+from concurrent.futures import ThreadPoolExecutor
+
+import pytest
+from common import INVALID_API_TOKEN, batch_add_chunks, delete_chunks, list_chunks
+from libs.auth import RAGFlowHttpApiAuth
+
+
+class TestAuthorization:
+    @pytest.mark.parametrize(
+        "auth, expected_code, expected_message",
+        [
+            (None, 0, "`Authorization` can't be empty"),
+            (
+                RAGFlowHttpApiAuth(INVALID_API_TOKEN),
+                109,
+                "Authentication error: API key is invalid!",
+            ),
+        ],
+    )
+    def test_invalid_auth(self, auth, expected_code, expected_message):
+        res = delete_chunks(auth, "dataset_id", "document_id")
+        assert res["code"] == expected_code
+        assert res["message"] == expected_message
+
+
+class TestChunksDeletion:
+    @pytest.mark.parametrize(
+        "dataset_id, expected_code, expected_message",
+        [
+            ("", 100, ""),
+            (
+                "invalid_dataset_id",
+                102,
+                "You don't own the dataset invalid_dataset_id.",
+            ),
+        ],
+    )
+    def test_invalid_dataset_id(self, get_http_api_auth, add_chunks_func, dataset_id, expected_code, expected_message):
+        _, document_id, chunk_ids = add_chunks_func
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, {"chunk_ids": chunk_ids})
+        assert res["code"] == expected_code
+        assert res["message"] == expected_message
+
+    @pytest.mark.parametrize(
+        "document_id, expected_code, expected_message",
+        [
+            ("", 100, ""),
+            pytest.param(
+                "invalid_document_id",
+                100,
+                "LookupError('Document not found which is supposed to be there')",
+                marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="issues/6611"),
+            ),
+            pytest.param(
+                "invalid_document_id",
+                102,
+                "rm_chunk deleted chunks 0, expect 4",
+                marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "elasticsearch"], reason="issues/6611"),
+            ),
+        ],
+    )
+    def test_invalid_document_id(self, get_http_api_auth, add_chunks_func, document_id, expected_code, expected_message):
+        dataset_id, _, chunk_ids = add_chunks_func
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, {"chunk_ids": chunk_ids})
+        assert res["code"] == expected_code
+        assert res["message"] == expected_message
+
+    @pytest.mark.parametrize(
+        "payload",
+        [
+            lambda r: {"chunk_ids": ["invalid_id"] + r},
+            lambda r: {"chunk_ids": r[:1] + ["invalid_id"] + r[1:4]},
+            lambda r: {"chunk_ids": r + ["invalid_id"]},
+        ],
+    )
+    def test_delete_partial_invalid_id(self, get_http_api_auth, add_chunks_func, payload):
+        dataset_id, document_id, chunk_ids = add_chunks_func
+        if callable(payload):
+            payload = payload(chunk_ids)
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, payload)
+        assert res["code"] == 102
+        assert res["message"] == "rm_chunk deleted chunks 4, expect 5"
+
+        res = list_chunks(get_http_api_auth, dataset_id, document_id)
+        if res["code"] != 0:
+            assert False, res
+        assert len(res["data"]["chunks"]) == 1
+        assert res["data"]["total"] == 1
+
+    def test_repeated_deletion(self, get_http_api_auth, add_chunks_func):
+        dataset_id, document_id, chunk_ids = add_chunks_func
+        payload = {"chunk_ids": chunk_ids}
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, payload)
+        assert res["code"] == 0
+
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, payload)
+        assert res["code"] == 102
+        assert res["message"] == "rm_chunk deleted chunks 0, expect 4"
+
+    def test_duplicate_deletion(self, get_http_api_auth, add_chunks_func):
+        dataset_id, document_id, chunk_ids = add_chunks_func
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, {"chunk_ids": chunk_ids * 2})
+        assert res["code"] == 0
+        assert "Duplicate chunk ids" in res["data"]["errors"][0]
+        assert res["data"]["success_count"] == 4
+
+        res = list_chunks(get_http_api_auth, dataset_id, document_id)
+        if res["code"] != 0:
+            assert False, res
+        assert len(res["data"]["chunks"]) == 1
+        assert res["data"]["total"] == 1
+
+    @pytest.mark.slow
+    def test_concurrent_deletion(self, get_http_api_auth, get_dataset_id_and_document_id):
+        chunks_num = 100
+        dataset_id, document_id = get_dataset_id_and_document_id
+        chunk_ids = batch_add_chunks(get_http_api_auth, dataset_id, document_id, chunks_num)
+
+        with ThreadPoolExecutor(max_workers=5) as executor:
+            futures = [
+                executor.submit(
+                    delete_chunks,
+                    get_http_api_auth,
+                    dataset_id,
+                    document_id,
+                    {"chunk_ids": chunk_ids[i : i + 1]},
+                )
+                for i in range(chunks_num)
+            ]
+        responses = [f.result() for f in futures]
+        assert all(r["code"] == 0 for r in responses)
+
+    @pytest.mark.slow
+    def test_delete_1k(self, get_http_api_auth, get_dataset_id_and_document_id):
+        chunks_num = 1_000
+        dataset_id, document_id = get_dataset_id_and_document_id
+        chunk_ids = batch_add_chunks(get_http_api_auth, dataset_id, document_id, chunks_num)
+
+        # issues/6487
+        from time import sleep
+
+        sleep(1)
+
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, {"chunk_ids": chunk_ids})
+        assert res["code"] == 0
+
+        res = list_chunks(get_http_api_auth, dataset_id, document_id)
+        if res["code"] != 0:
+            assert False, res
+        assert len(res["data"]["chunks"]) == 1
+        assert res["data"]["total"] == 1
+
+    @pytest.mark.parametrize(
+        "payload, expected_code, expected_message, remaining",
+        [
+            pytest.param(None, 100, """TypeError("argument of type \'NoneType\' is not iterable")""", 5, marks=pytest.mark.skip),
+            ({"chunk_ids": ["invalid_id"]}, 102, "rm_chunk deleted chunks 0, expect 1", 5),
+            pytest.param(
+                "not json",
+                100,
+                """UnboundLocalError("local variable \'duplicate_messages\' referenced before assignment")""",
+                5,
+                marks=pytest.mark.skip(reason="pull/6376"),
+            ),
+            (lambda r: {"chunk_ids": r[:1]}, 0, "", 4),
+            (lambda r: {"chunk_ids": r}, 0, "", 1),
+            pytest.param({"chunk_ids": []}, 0, "", 5, marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="issues/6607")),
+            pytest.param({"chunk_ids": []}, 102, "rm_chunk deleted chunks 5, expect 0", 0, marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "elasticsearch"], reason="issues/6607")),
+        ],
+    )
+    def test_basic_scenarios(
+        self,
+        get_http_api_auth,
+        add_chunks_func,
+        payload,
+        expected_code,
+        expected_message,
+        remaining,
+    ):
+        dataset_id, document_id, chunk_ids = add_chunks_func
+        if callable(payload):
+            payload = payload(chunk_ids)
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, payload)
+        assert res["code"] == expected_code
+        if res["code"] != 0:
+            assert res["message"] == expected_message
+
+        res = list_chunks(get_http_api_auth, dataset_id, document_id)
+        if res["code"] != 0:
+            assert False, res
+        assert len(res["data"]["chunks"]) == remaining
+        assert res["data"]["total"] == remaining
diff --git a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py
index 3d3cf6a2f..57bad2715 100644
--- a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py
+++ b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py
@@ -54,7 +54,7 @@ class TestUpdatedChunk:
             pytest.param(
                 {"content": 1},
                 100,
-                """TypeError("unsupported operand type(s) for +: \'int\' and \'str\'")""",
+                "TypeError('expected string or bytes-like object')",
                 marks=pytest.mark.skip,
             ),
             ({"content": "update chunk"}, 0, ""),