# Fix bugs in API (#3103)

### What problem does this PR solve?

Fix bugs in API

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
```diff
@@ -46,6 +46,9 @@ from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.storage_factory import STORAGE_IMPL
+import os
+
 
 MAXIMUM_OF_UPLOADING_FILES = 256
 
 
 @manager.route('/datasets/<dataset_id>/documents', methods=['POST'])
 @token_required
```
```diff
@@ -58,11 +61,21 @@ def upload(dataset_id, tenant_id):
         if file_obj.filename == '':
             return get_result(
                 retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
-
+    # total size
+    total_size = 0
+    for file_obj in file_objs:
+        file_obj.seek(0, os.SEEK_END)
+        total_size += file_obj.tell()
+        file_obj.seek(0)
+    MAX_TOTAL_FILE_SIZE=10*1024*1024
+    if total_size > MAX_TOTAL_FILE_SIZE:
+        return get_result(
+            retmsg=f'Total file size exceeds 10MB limit! ({total_size / (1024 * 1024):.2f} MB)',
+            retcode=RetCode.ARGUMENT_ERROR)
     e, kb = KnowledgebaseService.get_by_id(dataset_id)
     if not e:
         raise LookupError(f"Can't find the dataset with ID {dataset_id}!")
-    err, files = FileService.upload_document(kb, file_objs, tenant_id)
+    err, files= FileService.upload_document(kb, file_objs, tenant_id)
     if err:
         return get_result(
             retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
```
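The size guard added here measures each upload by seeking to the end of the stream and reading the offset, then rewinding so the later upload handling still sees the full file; this avoids trusting a client-supplied Content-Length. A minimal standalone sketch of the pattern, using `io.BytesIO` in place of Werkzeug's `FileStorage` (both expose the same `seek`/`tell` interface):

```python
import io
import os

# Cap matching the PR: 10 MB across all uploaded files combined.
MAX_TOTAL_FILE_SIZE = 10 * 1024 * 1024

def total_upload_size(file_objs):
    """Sum the sizes of seekable file-like objects without reading them."""
    total = 0
    for f in file_objs:
        f.seek(0, os.SEEK_END)   # jump to the end of the stream
        total += f.tell()        # offset at the end == size in bytes
        f.seek(0)                # rewind so later reads see the whole file
    return total

# io.BytesIO stands in for werkzeug's FileStorage, which is also seekable.
files = [io.BytesIO(b"x" * 1024), io.BytesIO(b"y" * 2048)]
size = total_upload_size(files)
print(size)                        # 3072
print(size > MAX_TOTAL_FILE_SIZE)  # False -> the upload may proceed
```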
```diff
@@ -140,6 +153,7 @@ def update_doc(tenant_id, dataset_id, document_id):
     if not e:
         return get_error_data_result(retmsg="Document not found!")
     req["parser_config"] = get_parser_config(req["chunk_method"], req.get("parser_config"))
+    DocumentService.update_parser_config(doc.id, req["parser_config"])
     if doc.token_num > 0:
         e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
                                                 doc.process_duation * -1)
```
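The new `update_parser_config` call persists the normalized configuration that `get_parser_config` produced. The helper itself sits outside this diff; a hedged sketch of the fall-back-to-defaults behavior it plausibly implements, with purely illustrative default values:

```python
# Illustrative defaults only; ragflow's actual per-method defaults differ.
DEFAULT_PARSER_CONFIGS = {
    "naive": {"chunk_token_num": 128, "delimiter": "\n"},
    "qa": {"raptor": {"use_raptor": False}},
}

def get_parser_config_sketch(chunk_method, user_config):
    """Fall back to the chunk method's defaults when the caller sent none."""
    if user_config:
        return user_config
    return DEFAULT_PARSER_CONFIGS.get(chunk_method, {})

print(get_parser_config_sketch("naive", None))
# {'chunk_token_num': 128, 'delimiter': '\n'}
```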
```diff
@@ -210,10 +224,10 @@ def list_docs(dataset_id, tenant_id):
         }
         renamed_doc = {}
         for key, value in doc.items():
+            if key =="run":
+                renamed_doc["run"]=run_mapping.get(str(value))
             new_key = key_mapping.get(key, key)
             renamed_doc[new_key] = value
-            if key =="run":
-                renamed_doc["run"]=run_mapping.get(value)
         renamed_doc_list.append(renamed_doc)
     return get_result(data={"total": tol, "docs": renamed_doc_list})
```
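The bug fixed here is a key-type mismatch: `run_mapping` is keyed by strings, but the stored `run` value can arrive from the ORM as an integer, so `run_mapping.get(value)` silently returns `None`. Wrapping the value in `str()` normalizes the lookup. A short sketch, with an illustrative mapping since the real `run_mapping` lies outside this hunk:

```python
# Illustrative mapping; the real run_mapping is defined outside this hunk.
run_mapping = {"0": "UNSTART", "1": "RUNNING", "2": "CANCEL"}

value = 1                           # e.g. an integer column from the ORM
print(run_mapping.get(value))       # None -- an int key never matches "1"
print(run_mapping.get(str(value)))  # 'RUNNING' -- normalized lookup
```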
```diff
@@ -280,14 +294,11 @@ def parse(tenant_id,dataset_id):
         doc = DocumentService.query(id=id,kb_id=dataset_id)
         if not doc:
             return get_error_data_result(retmsg=f"You don't own the document {id}.")
-        if doc[0].progress != 0.0:
-            return get_error_data_result("Can't stop parsing document with progress at 0 or 100")
         info = {"run": "1", "progress": 0}
         info["progress_msg"] = ""
         info["chunk_num"] = 0
         info["token_num"] = 0
         DocumentService.update_by_id(id, info)
-        # if str(req["run"]) == TaskStatus.CANCEL.value:
         ELASTICSEARCH.deleteByQuery(
             Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
         TaskService.filter_delete([Task.doc_id == id])
```
```diff
@@ -312,10 +323,8 @@ def stop_parsing(tenant_id,dataset_id):
             return get_error_data_result(retmsg=f"You don't own the document {id}.")
         if doc[0].progress == 100.0 or doc[0].progress == 0.0:
             return get_error_data_result("Can't stop parsing document with progress at 0 or 100")
-        info = {"run": "2", "progress": 0}
+        info = {"run": "2", "progress": 0,"chunk_num":0}
         DocumentService.update_by_id(id, info)
-        # if str(req["run"]) == TaskStatus.CANCEL.value:
-        tenant_id = DocumentService.get_tenant_id(id)
         ELASTICSEARCH.deleteByQuery(
             Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
     return get_result()
```
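Both endpoints follow the same pattern: write a status record that resets the progress counters, then purge any chunks already indexed for the document so the next run starts clean. A toy sketch of that reset logic, with plain dicts standing in for `DocumentService` and the Elasticsearch index:

```python
# In-memory stand-ins for the document store and the chunk index.
documents = {"doc1": {"run": "3", "progress": 1.0, "chunk_num": 42, "token_num": 900}}
chunk_index = {"c1": {"doc_id": "doc1"}, "c2": {"doc_id": "doc1"}}

def reset_for_reparse(doc_id):
    """Mirror the parse endpoint: mark the doc RUNNING and zero its counters."""
    documents[doc_id].update(
        {"run": "1", "progress": 0, "progress_msg": "", "chunk_num": 0, "token_num": 0})
    # Equivalent of ELASTICSEARCH.deleteByQuery on doc_id: drop stale chunks.
    for cid in [cid for cid, c in chunk_index.items() if c["doc_id"] == doc_id]:
        del chunk_index[cid]

reset_for_reparse("doc1")
print(documents["doc1"]["run"], len(chunk_index))  # 1 0
```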
```diff
@@ -355,10 +364,10 @@ def list_chunks(tenant_id,dataset_id,document_id):
     doc=doc.to_dict()
     renamed_doc = {}
     for key, value in doc.items():
+        if key == "run":
+            renamed_doc["run"] = run_mapping.get(str(value))
         new_key = key_mapping.get(key, key)
         renamed_doc[new_key] = value
-        if key == "run":
-            renamed_doc["run"] = run_mapping.get(str(value))
     res = {"total": sres.total, "chunks": [], "doc": renamed_doc}
     origin_chunks = []
     sign = 0
```
```diff
@@ -398,12 +407,17 @@ def list_chunks(tenant_id,dataset_id,document_id):
             "content_with_weight": "content",
             "doc_id": "document_id",
             "important_kwd": "important_keywords",
-            "img_id": "image_id"
+            "img_id": "image_id",
+            "available_int":"available"
         }
         renamed_chunk = {}
         for key, value in chunk.items():
             new_key = key_mapping.get(key, key)
             renamed_chunk[new_key] = value
+        if renamed_chunk["available"] == "0":
+            renamed_chunk["available"] = False
+        if renamed_chunk["available"] == "1":
+            renamed_chunk["available"] = True
         res["chunks"].append(renamed_chunk)
     return get_result(data=res)
```
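The new `available_int` handling converts the string flags stored in the index into proper JSON booleans. Note the comparisons are against the strings `"0"` and `"1"`, so an integer-typed value would slip through both branches; a slightly more defensive variant normalizes first (a sketch, not the PR's code):

```python
def normalize_available(value):
    """Map a stored availability flag ("0"/"1" or 0/1) to a bool.

    The PR compares against the strings "0" and "1"; coercing to str
    first also covers integer-typed values.
    """
    return str(value) == "1" if str(value) in ("0", "1") else value

for raw in ("0", "1", 0, 1):
    print(repr(raw), "->", normalize_available(raw))
# '0' -> False, '1' -> True, 0 -> False, 1 -> True
```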
```diff
@@ -441,7 +455,7 @@ def add_chunk(tenant_id,dataset_id,document_id):
     embd_id = DocumentService.get_embd_id(document_id)
     embd_mdl = TenantLLMService.model_instance(
         tenant_id, LLMType.EMBEDDING.value, embd_id)
-
+    print(embd_mdl,flush=True)
     v, c = embd_mdl.encode([doc.name, req["content"]])
     v = 0.1 * v[0] + 0.9 * v[1]
     d["q_%d_vec" % len(v)] = v.tolist()
```
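The chunk vector stored here is a fixed 10/90 blend of the document-title embedding and the chunk-content embedding, so retrieval leans mostly on content while keeping a little title signal. A self-contained numpy sketch of the combination, with random vectors standing in for the embedding model's output:

```python
import numpy as np

rng = np.random.default_rng(0)

# Stand-ins for embd_mdl.encode([doc.name, req["content"]]): one vector
# for the document title, one for the chunk content.
title_vec, content_vec = rng.normal(size=(2, 768))

# Same weighting as the endpoint: 10% title, 90% content.
v = 0.1 * title_vec + 0.9 * content_vec

# Stored under a dimension-tagged field name, e.g. "q_768_vec".
field = "q_%d_vec" % len(v)
print(field, len(v.tolist()))  # q_768_vec 768
```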
```diff
@@ -459,7 +473,7 @@ def add_chunk(tenant_id,dataset_id,document_id):
         "kb_id": "dataset_id",
         "create_timestamp_flt": "create_timestamp",
         "create_time": "create_time",
-        "document_keyword": "document",
+        "document_keyword": "document"
     }
     renamed_chunk = {}
     for key, value in d.items():
```
```diff
@@ -480,12 +494,18 @@ def rm_chunk(tenant_id,dataset_id,document_id):
         return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
     doc = doc[0]
     req = request.json
-    if not req.get("chunk_ids"):
-        return get_error_data_result("`chunk_ids` is required")
     query = {
         "doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
     sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
-    for chunk_id in req.get("chunk_ids"):
+    if not req:
+        chunk_ids=None
+    else:
+        chunk_ids=req.get("chunk_ids")
+    if not chunk_ids:
+        chunk_list=sres.ids
+    else:
+        chunk_list=chunk_ids
+    for chunk_id in chunk_list:
         if chunk_id not in sres.ids:
             return get_error_data_result(f"Chunk {chunk_id} not found")
     if not ELASTICSEARCH.deleteByQuery(
```
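The rewritten `rm_chunk` treats a missing or empty `chunk_ids` as "delete every chunk of the document": when the request body is absent the ids default to `None`, and a falsy value falls back to the full id list returned by the search. A condensed sketch of that selection logic (raising where the endpoint returns an error result):

```python
def select_chunks_to_delete(req, indexed_ids):
    """Pick chunk ids to delete: explicit ids if given, else all of them."""
    chunk_ids = req.get("chunk_ids") if req else None
    chunk_list = chunk_ids or indexed_ids   # None/empty -> delete everything
    missing = [cid for cid in chunk_list if cid not in set(indexed_ids)]
    if missing:
        raise LookupError(f"Chunk {missing[0]} not found")
    return chunk_list

indexed = ["c1", "c2", "c3"]
print(select_chunks_to_delete({"chunk_ids": ["c2"]}, indexed))  # ['c2']
print(select_chunks_to_delete(None, indexed))                   # ['c1', 'c2', 'c3']
```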