Fix issues in API (#3008)

### What problem does this PR solve?

Fix several issues in the HTTP API:

- Accept the built-in embedding models (e.g. `BAAI/bge-large-zh-v1.5`, `text-embedding-v3`) when creating or updating a dataset, in addition to the models configured for the tenant (see the example request below).
- Correct the error message returned when `embedding_model` is changed on a non-empty dataset.
- Merge `parser_config` updates into the existing configuration instead of replacing it.
- Remove a redundant tenant lookup when updating a document.
- Allow deleting all documents of a dataset when no `ids` are supplied.
- Reject a parse request for a document whose parsing has already started.
- Translate the `run` status to its label and rename keys (`chunk_count`, `dataset_id`, `token_count`, `chunk_method`) in the document returned by `list_chunks`.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
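
A quick way to exercise the embedding-model fix, as referenced above (a sketch only: the `/api/v1/datasets` route, the bearer-token auth, and the local address are assumptions, not taken from this diff):

```python
import requests

# Creating a dataset with one of the built-in embedding models should now
# succeed even if that model is not configured for the tenant.
resp = requests.post(
    "http://localhost:9380/api/v1/datasets",          # address/route assumed
    headers={"Authorization": "Bearer <API_KEY>"},
    json={"name": "demo", "embedding_model": "BAAI/bge-large-zh-v1.5"},
)
print(resp.status_code, resp.json())
```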

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
Author: liuhua
Committed: 2024-10-24 20:10:47 +08:00 (via GitHub)
Parent: 161c7a231b
Commit: 648f8e81d1
5 changed files with 187 additions and 205 deletions


@@ -64,7 +64,12 @@ def create(tenant_id):
if not req.get("embedding_model"):
req['embedding_model'] = t.embd_id
else:
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
"BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
"nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
"text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model"))\
and req.get("embedding_model") not in valid_embedding_models:
return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
key_mapping = {
"chunk_num": "chunk_count",
@@ -133,6 +138,9 @@ def update(tenant_id,dataset_id):
return get_error_data_result(
retmsg="Can't change `tenant_id`.")
e, kb = KnowledgebaseService.get_by_id(dataset_id)
if "parser_config" in req:
    kb.parser_config.update(req["parser_config"])  # merge into the stored config; update() mutates in place
    req["parser_config"] = kb.parser_config
if "chunk_count" in req:
if req["chunk_count"] != kb.chunk_num:
return get_error_data_result(
@@ -153,10 +161,15 @@ def update(tenant_id,dataset_id):
if "embedding_model" in req:
if kb.chunk_num != 0 and req['embedding_model'] != kb.embd_id:
return get_error_data_result(
retmsg="If `chunk_count` is not 0, `embedding_method` is not changeable.")
retmsg="If `chunk_count` is not 0, `embedding_model` is not changeable.")
if not req.get("embedding_model"):
return get_error_data_result("`embedding_model` can't be empty")
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model")):
valid_embedding_models=["BAAI/bge-large-zh-v1.5","BAAI/bge-base-en-v1.5","BAAI/bge-large-en-v1.5","BAAI/bge-small-en-v1.5",
"BAAI/bge-small-zh-v1.5","jinaai/jina-embeddings-v2-base-en","jinaai/jina-embeddings-v2-small-en",
"nomic-ai/nomic-embed-text-v1.5","sentence-transformers/all-MiniLM-L6-v2","text-embedding-v2",
"text-embedding-v3","maidalun1020/bce-embedding-base_v1"]
if not TenantLLMService.query(tenant_id=tenant_id,model_type="embedding", llm_name=req.get("embedding_model"))\
and req.get("embedding_model") not in valid_embedding_models:
return get_error_data_result(f"`embedding_model` {req.get('embedding_model')} doesn't exist")
req['embd_id'] = req.pop('embedding_model')
if "name" in req:
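
The whitelist-plus-tenant check above now appears verbatim in both `create()` and `update()`. A minimal sketch of how the shared logic reads when pulled into one place (the helper name and constant are ours, not part of the diff; `TenantLLMService` is the service class used above, its import omitted here):

```python
BUILTIN_EMBEDDING_MODELS = [
    "BAAI/bge-large-zh-v1.5", "BAAI/bge-base-en-v1.5", "BAAI/bge-large-en-v1.5",
    "BAAI/bge-small-en-v1.5", "BAAI/bge-small-zh-v1.5",
    "jinaai/jina-embeddings-v2-base-en", "jinaai/jina-embeddings-v2-small-en",
    "nomic-ai/nomic-embed-text-v1.5", "sentence-transformers/all-MiniLM-L6-v2",
    "text-embedding-v2", "text-embedding-v3", "maidalun1020/bce-embedding-base_v1",
]

def is_known_embedding_model(tenant_id, name):
    # A model is accepted if it is built in, or if the tenant has it configured.
    if name in BUILTIN_EMBEDDING_MODELS:
        return True
    return bool(TenantLLMService.query(
        tenant_id=tenant_id, model_type="embedding", llm_name=name))
```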


@@ -163,9 +163,6 @@ def update_doc(tenant_id, dataset_id, document_id):
doc.process_duation * -1)
if not e:
return get_error_data_result(retmsg="Document not found!")
tenant_id = DocumentService.get_tenant_id(req["id"])
if not tenant_id:
return get_error_data_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
@@ -245,14 +242,22 @@ def delete(tenant_id,dataset_id):
if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
req = request.json
if not req.get("ids"):
return get_error_data_result(retmsg="`ids` is required")
doc_ids = req["ids"]
if not req:
doc_ids=None
else:
doc_ids=req.get("ids")
if not doc_ids:
doc_list = []
docs=DocumentService.query(kb_id=dataset_id)
for doc in docs:
doc_list.append(doc.id)
else:
doc_list=doc_ids
root_folder = FileService.get_root_folder(tenant_id)
pf_id = root_folder["id"]
FileService.init_knowledgebase_docs(pf_id, tenant_id)
errors = ""
for doc_id in doc_ids:
for doc_id in doc_list:
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
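
With the change above, an empty request body (or one without `ids`) now means "delete every document in the dataset". A condensed sketch of that selection step (the helper name is ours; `DocumentService.query` is the call used in the diff, its import omitted here):

```python
def docs_to_delete(req, dataset_id):
    # Explicit ids win; otherwise fall back to every document in the dataset.
    doc_ids = req.get("ids") if req else None
    if doc_ids:
        return doc_ids
    return [doc.id for doc in DocumentService.query(kb_id=dataset_id)]
```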
@@ -290,8 +295,11 @@ def parse(tenant_id,dataset_id):
if not req.get("document_ids"):
return get_error_data_result("`document_ids` is required")
for id in req["document_ids"]:
if not DocumentService.query(id=id,kb_id=dataset_id):
doc = DocumentService.query(id=id,kb_id=dataset_id)
if not doc:
return get_error_data_result(retmsg=f"You don't own the document {id}.")
if doc[0].progress != 0.0:
    return get_error_data_result(f"Can't parse document {id}: parsing has already started")
info = {"run": "1", "progress": 0}
info["progress_msg"] = ""
info["chunk_num"] = 0
@@ -349,7 +357,27 @@ def list_chunks(tenant_id,dataset_id,document_id):
"doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
key_mapping = {
"chunk_num": "chunk_count",
"kb_id": "dataset_id",
"token_num": "token_count",
"parser_id": "chunk_method"
}
run_mapping = {
"0": "UNSTART",
"1": "RUNNING",
"2": "CANCEL",
"3": "DONE",
"4": "FAIL"
}
doc = doc.to_dict()
renamed_doc = {}
for key, value in doc.items():
    if key == "run":
        value = run_mapping.get(str(value))  # map the numeric run state to its label
    new_key = key_mapping.get(key, key)
    renamed_doc[new_key] = value
res = {"total": sres.total, "chunks": [], "doc": renamed_doc}
origin_chunks = []
sign = 0
for id in sres.ids:
@@ -388,7 +416,7 @@ def list_chunks(tenant_id,dataset_id,document_id):
"content_with_weight": "content",
"doc_id": "document_id",
"important_kwd": "important_keywords",
"img_id": "image_id",
"img_id": "image_id"
}
renamed_chunk = {}
for key, value in chunk.items():
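
Both renaming loops in `list_chunks` follow the same pattern; condensed into one helper it reads roughly like this (the function name is ours, the mappings are the ones defined above):

```python
def rename_keys(record, key_mapping, run_mapping):
    # Rename keys via key_mapping; translate the numeric "run" state to its label.
    renamed = {}
    for key, value in record.items():
        if key == "run":
            value = run_mapping.get(str(value))
        renamed[key_mapping.get(key, key)] = value
    return renamed
```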