Fix issues in API (#3008)

### What problem does this PR solve?

Fix several issues in the HTTP API:

- Accept the built-in embedding models (e.g. `BAAI/bge-large-zh-v1.5`, `text-embedding-v3`) when creating or updating a dataset, in addition to the models configured for the tenant (see the example request below).
- Correct the error message returned when `embedding_model` is changed on a non-empty dataset.
- Merge `parser_config` updates into the existing configuration instead of replacing it.
- Remove a redundant tenant lookup when updating a document.
- Allow deleting all documents of a dataset when no `ids` are supplied.
- Reject a parse request for a document whose parsing has already started.
- Translate the `run` status to its label and rename keys (`chunk_count`, `dataset_id`, `token_count`, `chunk_method`) in the document returned by `list_chunks`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
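
A quick way to exercise the embedding-model fix, as referenced above (a sketch only: the `/api/v1/datasets` route, the bearer-token auth, and the local address are assumptions, not taken from this diff):

```python
import requests

# Creating a dataset with one of the built-in embedding models should now
# succeed even if that model is not configured for the tenant.
resp = requests.post(
    "http://localhost:9380/api/v1/datasets",          # address/route assumed
    headers={"Authorization": "Bearer <API_KEY>"},
    json={"name": "demo", "embedding_model": "BAAI/bge-large-zh-v1.5"},
)
print(resp.status_code, resp.json())
```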

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
Author: liuhua
Committed: 2024-10-24 20:10:47 +08:00 (via GitHub)
Parent: 161c7a231b
Commit: 648f8e81d1
5 changed files with 187 additions and 205 deletions


@@ -64,7 +64,12 @@ def create(tenant_id):
if not req.get("embedding_model"):
req['embedding_model'] = t.embd_id
else:
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
"BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
"nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
"text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model"))\
and req.get("embedding_model") not in valid_embedding_models:
return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
key_mapping = {
"chunk_num": "chunk_count",
@@ -133,6 +138,9 @@ def update(tenant_id,dataset_id):
return get_error_data_result(
retmsg="Can't change `tenant_id`.")
e, kb = KnowledgebaseService.get_by_id(dataset_id)
if "parser_config" in req:
    kb.parser_config.update(req["parser_config"])  # merge into the stored config; update() mutates in place
    req["parser_config"] = kb.parser_config
if "chunk_count" in req:
if req["chunk_count"] != kb.chunk_num:
return get_error_data_result(
@@ -153,10 +161,15 @@ def update(tenant_id,dataset_id):
if "embedding_model" in req:
if kb.chunk_num != 0 and req['embedding_model'] != kb.embd_id:
return get_error_data_result(
retmsg="If `chunk_count` is not 0, `embedding_method` is not changeable.")
retmsg="If `chunk_count` is not 0, `embedding_model` is not changeable.")
if not req.get("embedding_model"):
return get_error_data_result("`embedding_model` can't be empty")
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
"BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
"nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
"text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model"))\
and req.get("embedding_model") not in valid_embedding_models:
return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
req['embd_id'] = req.pop('embedding_model')
if "name" in req:
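
The whitelist-plus-tenant check above now appears verbatim in both `create()` and `update()`. A minimal sketch of how the shared logic reads when pulled into one place (the helper name and constant are ours, not part of the diff; `TenantLLMService` is the service class used above, its import omitted here):

```python
BUILTIN_EMBEDDING_MODELS = [
    "BAAI/bge-large-zh-v1.5", "BAAI/bge-base-en-v1.5", "BAAI/bge-large-en-v1.5",
    "BAAI/bge-small-en-v1.5", "BAAI/bge-small-zh-v1.5",
    "jinaai/jina-embeddings-v2-base-en", "jinaai/jina-embeddings-v2-small-en",
    "nomic-ai/nomic-embed-text-v1.5", "sentence-transformers/all-MiniLM-L6-v2",
    "text-embedding-v2", "text-embedding-v3", "maidalun1020/bce-embedding-base_v1",
]

def is_known_embedding_model(tenant_id, name):
    # A model is accepted if it is built in, or if the tenant has it configured.
    if name in BUILTIN_EMBEDDING_MODELS:
        return True
    return bool(TenantLLMService.query(
        tenant_id=tenant_id, model_type="embedding", llm_name=name))
```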


@@ -163,9 +163,6 @@ def update_doc(tenant_id, dataset_id, document_id):
doc.process_duation * -1)
if not e:
return get_error_data_result(retmsg="Document not found!")
tenant_id = DocumentService.get_tenant_id(req["id"])
if not tenant_id:
return get_error_data_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
@@ -245,14 +242,22 @@ def delete(tenant_id,dataset_id):
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
req = request.json
if not req.get("ids"):
return get_error_data_result(retmsg="`ids` is required")
doc_ids = req["ids"]
if not req:
doc_ids=None
else:
doc_ids=req.get("ids")
if not doc_ids:
doc_list = []
docs=DocumentService.query(kb_id=dataset_id)
for doc in docs:
doc_list.append(doc.id)
else:
doc_list=doc_ids
root_folder = FileService.get_root_folder(tenant_id)
pf_id = root_folder["id"]
FileService.init_knowledgebase_docs(pf_id, tenant_id)
errors = ""
for doc_id in doc_ids:
for doc_id in doc_list:
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
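
With the change above, an empty request body (or one without `ids`) now means "delete every document in the dataset". A condensed sketch of that selection step (the helper name is ours; `DocumentService.query` is the call used in the diff, its import omitted here):

```python
def docs_to_delete(req, dataset_id):
    # Explicit ids win; otherwise fall back to every document in the dataset.
    doc_ids = req.get("ids") if req else None
    if doc_ids:
        return doc_ids
    return [doc.id for doc in DocumentService.query(kb_id=dataset_id)]
```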
@@ -290,8 +295,11 @@ def parse(tenant_id,dataset_id):
if not req.get("document_ids"):
return get_error_data_result("`document_ids` is required")
for id in req["document_ids"]:
if not DocumentService.query(id=id,kb_id=dataset_id):
doc = DocumentService.query(id=id,kb_id=dataset_id)
if not doc:
return get_error_data_result(retmsg=f"You don't own the document {id}.")
if doc[0].progress != 0.0:
    return get_error_data_result(f"Can't parse document {id}: parsing has already started")
info = {"run": "1", "progress": 0}
info["progress_msg"] = ""
info["chunk_num"] = 0
@@ -349,7 +357,27 @@ def list_chunks(tenant_id,dataset_id,document_id):
"doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
key_mapping = {
"chunk_num": "chunk_count",
"kb_id": "dataset_id",
"token_num": "token_count",
"parser_id": "chunk_method"
}
run_mapping = {
"0": "UNSTART",
"1": "RUNNING",
"2": "CANCEL",
"3": "DONE",
"4": "FAIL"
}
doc = doc.to_dict()
renamed_doc = {}
for key, value in doc.items():
    if key == "run":
        value = run_mapping.get(str(value))  # map the numeric run state to its label
    new_key = key_mapping.get(key, key)
    renamed_doc[new_key] = value
res = {"total": sres.total, "chunks": [], "doc": renamed_doc}
origin_chunks = []
sign = 0
for id in sres.ids:
@@ -388,7 +416,7 @@ def list_chunks(tenant_id,dataset_id,document_id):
"content_with_weight": "content",
"doc_id": "document_id",
"important_kwd": "important_keywords",
"img_id": "image_id",
"img_id": "image_id"
}
renamed_chunk = {}
for key, value in chunk.items():
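
Both renaming loops in `list_chunks` follow the same pattern; condensed into one helper it reads roughly like this (the function name is ours, the mappings are the ones defined above):

```python
def rename_keys(record, key_mapping, run_mapping):
    # Rename keys via key_mapping; translate the numeric "run" state to its label.
    renamed = {}
    for key, value in record.items():
        if key == "run":
            value = run_mapping.get(str(value))
        renamed[key_mapping.get(key, key)] = value
    return renamed
```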