### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Kevin Hu
Date: 2025-01-09 17:07:21 +08:00
Committed by: GitHub
Parent commit: f892d7d426
Commit: c5da3cdd97
30 changed files with 736 additions and 202 deletions


@@ -73,7 +73,8 @@ def create(tenant_id):
     chunk_method:
       type: string
       enum: ["naive", "manual", "qa", "table", "paper", "book", "laws",
-             "presentation", "picture", "one", "knowledge_graph", "email"]
+             "presentation", "picture", "one", "knowledge_graph", "email", "tag"
+             ]
       description: Chunking method.
     parser_config:
       type: object
@@ -108,6 +109,7 @@ def create(tenant_id):
         "one",
         "knowledge_graph",
         "email",
+        "tag"
     ]
     check_validation = valid(
         permission,
@@ -302,7 +304,8 @@ def update(tenant_id, dataset_id):
     chunk_method:
       type: string
       enum: ["naive", "manual", "qa", "table", "paper", "book", "laws",
-             "presentation", "picture", "one", "knowledge_graph", "email"]
+             "presentation", "picture", "one", "knowledge_graph", "email", "tag"
+             ]
       description: Updated chunking method.
     parser_config:
       type: object
@@ -339,6 +342,7 @@ def update(tenant_id, dataset_id):
         "one",
         "knowledge_graph",
         "email",
+        "tag"
     ]
     check_validation = valid(
         permission,
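The hunks above add "tag" to the `chunk_method` enum in the API docstrings and to the validated value list in the dataset create and update handlers. As a minimal sketch of exercising the new value over HTTP with `requests`: the base URL, bearer-token header, endpoint path, and the `name` field below are assumptions about the deployment, not taken from this diff.

```python
import requests

# Assumed endpoint and auth scheme for the RAGFlow HTTP API; adjust to your deployment.
BASE_URL = "http://localhost:9380"   # assumed deployment URL
API_KEY = "YOUR_API_KEY"             # hypothetical placeholder

resp = requests.post(
    f"{BASE_URL}/api/v1/datasets",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={
        "name": "tagged_kb",      # assumed field name
        "chunk_method": "tag",    # newly accepted value from this change
    },
)
resp.raise_for_status()
print(resp.json())
```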


@@ -16,6 +16,7 @@
 from flask import request, jsonify
 from api.db import LLMType, ParserType
+from api.db.services.dialog_service import label_question
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMBundle
 from api import settings
@@ -54,7 +55,8 @@ def retrieval(tenant_id):
         page_size=top,
         similarity_threshold=similarity_threshold,
         vector_similarity_weight=0.3,
-        top=top
+        top=top,
+        rank_feature=label_question(question, [kb])
     )
     records = []
     for c in ranks["chunks"]:
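In the retrieval endpoint above, `label_question(question, [kb])` is now passed as `rank_feature`, so tag signals derived from the question can influence chunk ranking. The sketch below is an illustration of that idea only, not RAGFlow's actual implementation; the tag-weight mapping and the boost formula are assumptions.

```python
# Toy illustration: re-rank retrieved chunks with question-derived tag weights.
# All names and the boost formula below are assumptions for illustration only.

def toy_label_question(question: str, kb_tags: list[str]) -> dict[str, float]:
    """Assign a weight to each knowledge-base tag that appears in the question."""
    q = question.lower()
    return {tag: 1.0 for tag in kb_tags if tag.lower() in q}

def rerank_with_tags(chunks: list[dict], rank_feature: dict[str, float],
                     tag_weight: float = 0.2) -> list[dict]:
    """Add a tag-match bonus to each chunk's similarity score and re-sort."""
    for c in chunks:
        bonus = sum(rank_feature.get(t, 0.0) for t in c.get("tags", []))
        c["score"] = c["similarity"] + tag_weight * bonus
    return sorted(chunks, key=lambda c: c["score"], reverse=True)

# Example: the tag bonus lifts the on-topic chunk above a slightly higher raw score.
chunks = [
    {"content": "GPU setup guide", "similarity": 0.62, "tags": ["gpu"]},
    {"content": "Billing FAQ", "similarity": 0.64, "tags": ["billing"]},
]
feature = toy_label_question("How do I configure the GPU driver?", ["gpu", "billing"])
print(rerank_with_tags(chunks, feature)[0]["content"])  # -> "GPU setup guide"
```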


@@ -16,7 +16,7 @@
 import pathlib
 import datetime
-from api.db.services.dialog_service import keyword_extraction
+from api.db.services.dialog_service import keyword_extraction, label_question
 from rag.app.qa import rmPrefix, beAdoc
 from rag.nlp import rag_tokenizer
 from api.db import LLMType, ParserType
@@ -276,6 +276,7 @@ def update_doc(tenant_id, dataset_id, document_id):
         "one",
         "knowledge_graph",
         "email",
+        "tag"
     }
     if req.get("chunk_method") not in valid_chunk_method:
         return get_error_data_result(
@@ -1355,6 +1356,7 @@ def retrieval_test(tenant_id):
         doc_ids,
         rerank_mdl=rerank_mdl,
         highlight=highlight,
+        rank_feature=label_question(question, kbs)
     )
     for c in ranks["chunks"]:
         c.pop("vector", None)
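The last two hunks let a document be switched to the new "tag" chunking method in `update_doc` and feed `label_question(question, kbs)` into `retrieval_test` as a ranking feature. A minimal sketch of driving both over HTTP follows; the routes, auth header, and request field names are assumptions inferred from the handler signatures, not taken verbatim from this diff.

```python
import requests

BASE_URL = "http://localhost:9380"   # assumed deployment URL
API_KEY = "YOUR_API_KEY"             # hypothetical placeholder
HEADERS = {"Authorization": f"Bearer {API_KEY}"}

dataset_id, document_id = "<dataset_id>", "<document_id>"  # placeholders

# Switch an existing document to the newly accepted "tag" chunking method
# (route assumed from the update_doc(tenant_id, dataset_id, document_id) signature).
requests.put(
    f"{BASE_URL}/api/v1/datasets/{dataset_id}/documents/{document_id}",
    headers=HEADERS,
    json={"chunk_method": "tag"},
).raise_for_status()

# Run a retrieval request; with this change the question is also labeled
# against the datasets' tags and used as a ranking feature server-side.
resp = requests.post(
    f"{BASE_URL}/api/v1/retrieval",
    headers=HEADERS,
    json={"question": "How do I configure the GPU driver?",
          "dataset_ids": [dataset_id]},     # assumed field names
)
resp.raise_for_status()
print(resp.json())
```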