Mirror of https://github.com/infiniflow/ragflow.git, synced 2025-12-08 20:42:30 +08:00
Feat: Add Duplicate ID Check and Update Deletion Logic (#6376)
- Introduce the `check_duplicate_ids` function in `dataset.py` and `doc.py` to check for and handle duplicate IDs.
- Update the deletion operation to ensure that when deleting datasets and documents, error messages regarding duplicate IDs can be returned.
- Implement the `check_duplicate_ids` function in `api_utils.py` to return unique IDs and error messages for duplicate IDs.

### What problem does this PR solve?

Close https://github.com/infiniflow/ragflow/issues/6234

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: wenju.li <wenju.li@deepctr.cn>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
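The `check_duplicate_ids` helper itself lives in `api_utils.py` and is not shown in the diff below. Based only on the behaviour described above (take a list of IDs, return the de-duplicated IDs plus one error message per duplicated ID) and on the call sites in the diff (`check_duplicate_ids(id_list, "dataset")`, `check_duplicate_ids(doc_list, "document")`, `check_duplicate_ids(req["chunk_ids"], "chunk")`), a minimal sketch could look like this; the implementation details and message wording in the real `api_utils.py` may differ:

```python
from collections import Counter


def check_duplicate_ids(ids, id_type="item"):
    """Return (unique_ids, duplicate_messages) for a list of IDs.

    unique_ids keeps the first occurrence of each ID in its original order;
    duplicate_messages holds one human-readable message per duplicated ID.
    """
    counts = Counter(ids)
    duplicate_messages = [
        f"Duplicate {id_type} ids: {dup_id}" for dup_id, n in counts.items() if n > 1
    ]
    unique_ids = list(dict.fromkeys(ids))  # de-duplicate while preserving order
    return unique_ids, duplicate_messages
```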
dataset.py:

@@ -30,7 +30,7 @@ from api.utils.api_utils import (
     token_required,
     get_error_data_result,
     valid,
-    get_parser_config, valid_parser_config, dataset_readonly_fields,
+    get_parser_config, valid_parser_config, dataset_readonly_fields,check_duplicate_ids
 )


@@ -237,7 +237,7 @@ def delete(tenant_id):
     if not req:
         ids = None
     else:
-        ids = set(req.get("ids"))
+        ids = req.get("ids")
     if not ids:
         id_list = []
         kbs = KnowledgebaseService.query(tenant_id=tenant_id)
@@ -245,6 +245,9 @@ def delete(tenant_id):
             id_list.append(kb.id)
     else:
         id_list = ids
+    unique_id_list, duplicate_messages = check_duplicate_ids(id_list, "dataset")
+    id_list = unique_id_list

     for id in id_list:
         kbs = KnowledgebaseService.query(id=id, tenant_id=tenant_id)
         if not kbs:
@@ -276,6 +279,11 @@ def delete(tenant_id):
             )
         else:
             return get_error_data_result(message="; ".join(errors))
+    if duplicate_messages:
+        if success_count > 0:
+            return get_result(message=f"Partially deleted {success_count} datasets with {len(duplicate_messages)} errors", data={"success_count": success_count, "errors": duplicate_messages},)
+        else:
+            return get_error_data_result(message=";".join(duplicate_messages))
     return get_result(code=settings.RetCode.SUCCESS)

doc.py:

@@ -36,7 +36,7 @@ from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.utils.api_utils import construct_json_result, get_parser_config
+from api.utils.api_utils import construct_json_result, get_parser_config, check_duplicate_ids
 from rag.nlp import search
 from rag.prompts import keyword_extraction
 from rag.app.tag import label_question
@@ -584,7 +584,7 @@ def delete(tenant_id, dataset_id):
     if not req:
         doc_ids = None
     else:
-        doc_ids = set(req.get("ids"))
+        doc_ids = req.get("ids")
     if not doc_ids:
         doc_list = []
         docs = DocumentService.query(kb_id=dataset_id)
@@ -592,11 +592,16 @@ def delete(tenant_id, dataset_id):
             doc_list.append(doc.id)
     else:
         doc_list = doc_ids

+    unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_list, "document")
+    doc_list = unique_doc_ids

     root_folder = FileService.get_root_folder(tenant_id)
     pf_id = root_folder["id"]
     FileService.init_knowledgebase_docs(pf_id, tenant_id)
     errors = ""
     not_found = []
+    success_count = 0
     for doc_id in doc_list:
         try:
             e, doc = DocumentService.get_by_id(doc_id)
@@ -624,6 +629,7 @@ def delete(tenant_id, dataset_id):
             File2DocumentService.delete_by_document_id(doc_id)

             STORAGE_IMPL.rm(b, n)
+            success_count += 1
         except Exception as e:
             errors += str(e)

@@ -633,6 +639,12 @@ def delete(tenant_id, dataset_id):
     if errors:
         return get_result(message=errors, code=settings.RetCode.SERVER_ERROR)

+    if duplicate_messages:
+        if success_count > 0:
+            return get_result(message=f"Partially deleted {success_count} datasets with {len(duplicate_messages)} errors", data={"success_count": success_count, "errors": duplicate_messages},)
+        else:
+            return get_error_data_result(message=";".join(duplicate_messages))

     return get_result()


@@ -680,8 +692,13 @@ def parse(tenant_id, dataset_id):
     req = request.json
     if not req.get("document_ids"):
         return get_error_data_result("`document_ids` is required")
+    doc_list = req.get("document_ids")
+    unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_list, "document")
+    doc_list = unique_doc_ids

     not_found = []
-    for id in set(req["document_ids"]):
+    success_count = 0
+    for id in doc_list:
         doc = DocumentService.query(id=id, kb_id=dataset_id)
         if not doc:
             not_found.append(id)
@@ -701,9 +718,14 @@ def parse(tenant_id, dataset_id):
         doc["tenant_id"] = tenant_id
         bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
         queue_tasks(doc, bucket, name, 0)

+        success_count += 1
     if not_found:
         return get_result(message=f"Documents not found: {not_found}", code=settings.RetCode.DATA_ERROR)
+    if duplicate_messages:
+        if success_count > 0:
+            return get_result(message=f"Partially parsed {success_count} documents with {len(duplicate_messages)} errors", data={"success_count": success_count, "errors": duplicate_messages},)
+        else:
+            return get_error_data_result(message=";".join(duplicate_messages))

     return get_result()


@@ -750,9 +772,15 @@ def stop_parsing(tenant_id, dataset_id):
     if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
         return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
     req = request.json

     if not req.get("document_ids"):
         return get_error_data_result("`document_ids` is required")
-    for id in req["document_ids"]:
+    doc_list = req.get("document_ids")
+    unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_list, "document")
+    doc_list = unique_doc_ids

+    success_count = 0
+    for id in doc_list:
         doc = DocumentService.query(id=id, kb_id=dataset_id)
         if not doc:
             return get_error_data_result(message=f"You don't own the document {id}.")
@@ -763,6 +791,12 @@ def stop_parsing(tenant_id, dataset_id):
         info = {"run": "2", "progress": 0, "chunk_num": 0}
         DocumentService.update_by_id(id, info)
         settings.docStoreConn.delete({"doc_id": doc[0].id}, search.index_name(tenant_id), dataset_id)
+        success_count += 1
+    if duplicate_messages:
+        if success_count > 0:
+            return get_result(message=f"Partially stopped {success_count} documents with {len(duplicate_messages)} errors", data={"success_count": success_count, "errors": duplicate_messages},)
+        else:
+            return get_error_data_result(message=";".join(duplicate_messages))
     return get_result()


@@ -1125,12 +1159,15 @@ def rm_chunk(tenant_id, dataset_id, document_id):
     req = request.json
     condition = {"doc_id": document_id}
     if "chunk_ids" in req:
-        condition["id"] = req["chunk_ids"]
+        unique_chunk_ids, duplicate_messages = check_duplicate_ids(req["chunk_ids"], "chunk")
+        condition["id"] = unique_chunk_ids
     chunk_number = settings.docStoreConn.delete(condition, search.index_name(tenant_id), dataset_id)
     if chunk_number != 0:
         DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0)
-    if "chunk_ids" in req and chunk_number != len(req["chunk_ids"]):
-        return get_error_data_result(message=f"rm_chunk deleted chunks {chunk_number}, expect {len(req['chunk_ids'])}")
+    if "chunk_ids" in req and chunk_number != len(unique_chunk_ids):
+        return get_error_data_result(message=f"rm_chunk deleted chunks {chunk_number}, expect {len(unique_chunk_ids)}")
+    if duplicate_messages:
+        return get_result(message=f"Partially deleted {chunk_number} chunks with {len(duplicate_messages)} errors", data={"success_count": chunk_number, "errors": duplicate_messages},)
     return get_result(message=f"deleted {chunk_number} chunks")
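For illustration, this is how the helper sketched above would be consumed, mirroring the pattern the updated endpoints follow (de-duplicate first, then report a partial success when duplicates were present). The IDs and the success-count arithmetic are invented for the example and assume the sketch defined earlier:

```python
doc_ids = ["doc_a", "doc_b", "doc_b", "doc_c", "doc_c"]

unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_ids, "document")
print(unique_doc_ids)      # ['doc_a', 'doc_b', 'doc_c']
print(duplicate_messages)  # one message per duplicated ID (doc_b and doc_c)

success_count = len(unique_doc_ids)  # pretend every unique document was processed
if duplicate_messages:
    # Same shape as the partial-success responses added in this PR.
    print(f"Partially deleted {success_count} documents with {len(duplicate_messages)} errors")
```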