From 8c9b54db31bc18d6446590503816d9a779af53cd Mon Sep 17 00:00:00 2001 From: cecilia-uu <117628326+cecilia-uu@users.noreply.github.com> Date: Fri, 28 Jun 2024 14:27:57 +0800 Subject: [PATCH] API: completed delete_doc api (#1290) ### What problem does this PR solve? Adds the functionality of deleting documentation ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- api/apps/documents_api.py | 60 ++++++++++++++++++++- sdk/python/ragflow/ragflow.py | 9 ++-- sdk/python/test/test_document.py | 90 ++++++++++++++++++++++++++++++-- 3 files changed, 151 insertions(+), 8 deletions(-) diff --git a/api/apps/documents_api.py b/api/apps/documents_api.py index 7338eeb66..44e8a4160 100644 --- a/api/apps/documents_api.py +++ b/api/apps/documents_api.py @@ -24,6 +24,7 @@ from flask_login import login_required, current_user from api.db import FileType, ParserType from api.db.services import duplicate_name from api.db.services.document_service import DocumentService +from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService from api.settings import RetCode @@ -31,6 +32,8 @@ from api.utils import get_uuid from api.utils.api_utils import construct_json_result from api.utils.file_utils import filename_type, thumbnail from rag.utils.minio_conn import MINIO +from api.db.db_models import Task, File +from api.db import FileType, TaskStatus, ParserType, FileSource MAXIMUM_OF_UPLOADING_FILES = 256 @@ -89,6 +92,7 @@ def upload(dataset_id): # grab all the errs err = [] MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0)) + uploaded_docs_json = [] for file in file_objs: try: # TODO: get this value from the database as some tenants have this limit while others don't @@ -132,6 +136,7 @@ def upload(dataset_id): DocumentService.insert(doc) FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id) + 
uploaded_docs_json.append(doc) except Exception as e: err.append(file.filename + ": " + str(e)) @@ -139,14 +144,65 @@ def upload(dataset_id): # return all the errors return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR) # success + return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS) + +# ----------------------------delete a file----------------------------------------------------- +@manager.route('/<dataset_id>/<document_id>', methods=['DELETE']) +@login_required +def delete(document_id, dataset_id): # both IDs are URL path variables (strings) + # get the root folder + root_folder = FileService.get_root_folder(current_user.id) + # parent file's id + parent_file_id = root_folder["id"] + # consider the new user + FileService.init_knowledgebase_docs(parent_file_id, current_user.id) + # collects the text of any exception raised below + errors = "" + try: + # the document must exist + exist, doc = DocumentService.get_by_id(document_id) + if not exist: + return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR) + # resolve the owning tenant; NOTE(review): only its existence is checked here, not that it is current_user's tenant; confirm + tenant_id = DocumentService.get_tenant_id(document_id) + if not tenant_id: + return construct_json_result(message=f"You cannot delete this document {document_id} due to the authorization" + f" reason!", code=RetCode.AUTHENTICATION_ERROR) + + # look up the dataset the document is actually stored in and its storage location + real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id) + + if real_dataset_id != dataset_id: + return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, " + f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR) + + # stop if the document record could not be removed + if not DocumentService.remove_document(doc, tenant_id): + return construct_json_result( + message="There was an error during the document removal process. 
Please check the status of the " + "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR) + + # fetch the File2Document record associated with the provided document ID. + file_to_doc = File2DocumentService.get_by_document_id(document_id) + # delete the associated File record. NOTE(review): file_to_doc[0] assumes a link row exists; confirm + FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id]) + # delete the File2Document record itself using the document ID. This removes the + # association between the document and the file after the File record has been deleted. + File2DocumentService.delete_by_document_id(document_id) + + # delete it from minio + MINIO.rm(dataset_id, location) + except Exception as e: + errors += str(e) + if errors: + return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR) + return construct_json_result(data=True, code=RetCode.SUCCESS) # ----------------------------upload online files------------------------------------------------ # ----------------------------download a file----------------------------------------------------- -# ----------------------------delete a file----------------------------------------------------- - # ----------------------------enable rename----------------------------------------------------- # ----------------------------list files----------------------------------------------------- diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py index 4277480c6..1e4ec332a 100644 --- a/sdk/python/ragflow/ragflow.py +++ b/sdk/python/ragflow/ragflow.py @@ -101,10 +101,13 @@ class RAGFlow: result_dict = json.loads(res.text) return result_dict - # ----------------------------upload remote files----------------------------------------------------- - # ----------------------------download a file----------------------------------------------------- - # ----------------------------delete a file----------------------------------------------------- + def delete_files(self, 
document_id, dataset_id): + endpoint = f"{self.document_url}/{dataset_id}/{document_id}" + res = requests.delete(endpoint, headers=self.authorization_header) + return res.json() + + # ----------------------------download a file----------------------------------------------------- # ----------------------------enable rename----------------------------------------------------- diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py index f22ebca70..a1f34895c 100644 --- a/sdk/python/test/test_document.py +++ b/sdk/python/test/test_document.py @@ -149,12 +149,96 @@ class TestFile(TestSdk): res = ragflow.upload_local_file(dataset_id, file_paths) assert res['code'] == RetCode.ARGUMENT_ERROR and res['message'] == 'Remote files have not unsupported.' -# ----------------------------upload remote files----------------------------------------------------- +# ----------------------------delete a file----------------------------------------------------- + def test_delete_one_file(self): + """ + Test deleting one file with success. + """ + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_delete_one_file") + dataset_id = created_res['data']['dataset_id'] + file_paths = ["test_data/test.txt"] + res = ragflow.upload_local_file(dataset_id, file_paths) + # get the doc_id + data = res['data'][0] + doc_id = data['id'] + # delete the files + deleted_res = ragflow.delete_files(doc_id, dataset_id) + # assert value + assert deleted_res['code'] == RetCode.SUCCESS and deleted_res['data'] is True + + def test_delete_document_with_not_existing_document(self): + """ + Test deleting a document that does not exist with failure. 
+ """ + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_delete_document_with_not_existing_document") + dataset_id = created_res['data']['dataset_id'] + res = ragflow.delete_files("111", dataset_id) + assert res['code'] == RetCode.DATA_ERROR and res['message'] == 'Document 111 not found!' + + def test_delete_document_with_creating_100_documents_and_deleting_100_documents(self): + """ + Test deleting documents when uploading 100 docs and deleting 100 docs. + """ + # upload 100 docs + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_delete_one_file") + dataset_id = created_res['data']['dataset_id'] + file_paths = ["test_data/test.txt"] * 100 + res = ragflow.upload_local_file(dataset_id, file_paths) + + # get the doc_id + data = res['data'] + for d in data: + doc_id = d['id'] + # delete the files + deleted_res = ragflow.delete_files(doc_id, dataset_id) + # assert value + assert deleted_res['code'] == RetCode.SUCCESS and deleted_res['data'] is True + + def test_delete_document_from_nonexistent_dataset(self): + """ + Test deleting documents from a non-existent dataset + """ + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_delete_one_file") + dataset_id = created_res['data']['dataset_id'] + file_paths = ["test_data/test.txt"] + res = ragflow.upload_local_file(dataset_id, file_paths) + # get the doc_id + data = res['data'][0] + doc_id = data['id'] + # delete the files + deleted_res = ragflow.delete_files(doc_id, "000") + # assert value + assert (deleted_res['code'] == RetCode.ARGUMENT_ERROR and deleted_res['message'] == + f'The document {doc_id} is not in the dataset: 000, but in the dataset: {dataset_id}.') + + def test_delete_document_which_is_located_in_other_dataset(self): + """ + Test deleting a document which is located in other dataset. 
+ """ + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + # upload a document + created_res = ragflow.create_dataset("test_delete_document_which_is_located_in_other_dataset") + created_res_id = created_res['data']['dataset_id'] + file_paths = ["test_data/test.txt"] + res = ragflow.upload_local_file(created_res_id, file_paths) + # other dataset + other_res = ragflow.create_dataset("other_dataset") + other_dataset_id = other_res['data']['dataset_id'] + # get the doc_id + data = res['data'][0] + doc_id = data['id'] + # delete the files from the other dataset + deleted_res = ragflow.delete_files(doc_id, other_dataset_id) + # assert value + assert (deleted_res['code'] == RetCode.ARGUMENT_ERROR and deleted_res['message'] == + f'The document {doc_id} is not in the dataset: {other_dataset_id}, but in the dataset: {created_res_id}.') # ----------------------------download a file----------------------------------------------------- -# ----------------------------delete a file----------------------------------------------------- - # ----------------------------enable rename----------------------------------------------------- # ----------------------------list files-----------------------------------------------------