Feat: add meta data filter. (#9405)

### What problem does this PR solve?

#8531 
#7417 
#6761 
#6573
#6477

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2025-08-12 14:12:56 +08:00
committed by GitHub
parent 3ccaa06031
commit 153e430b00
11 changed files with 184 additions and 4 deletions

View File

@ -51,6 +51,7 @@ def set_dialog():
similarity_threshold = req.get("similarity_threshold", 0.1)
vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
llm_setting = req.get("llm_setting", {})
meta_data_filter = req.get("meta_data_filter", {})
prompt_config = req["prompt_config"]
if not is_create:
@ -85,6 +86,7 @@ def set_dialog():
"llm_id": llm_id,
"llm_setting": llm_setting,
"prompt_config": prompt_config,
"meta_data_filter": meta_data_filter,
"top_n": top_n,
"top_k": top_k,
"rerank_id": rerank_id,

View File

@ -681,6 +681,11 @@ def set_meta():
return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
try:
meta = json.loads(req["meta"])
if not isinstance(meta, dict):
return get_json_result(data=False, message="Only dictionary type supported.", code=settings.RetCode.ARGUMENT_ERROR)
for k,v in meta.items():
if not isinstance(v, str) and not isinstance(v, int) and not isinstance(v, float):
return get_json_result(data=False, message=f"The type is not supported: {v}", code=settings.RetCode.ARGUMENT_ERROR)
except Exception as e:
return get_json_result(data=False, message=f"Json syntax error: {e}", code=settings.RetCode.ARGUMENT_ERROR)
if not isinstance(meta, dict):

View File

@ -351,6 +351,7 @@ def knowledge_graph(kb_id):
obj["graph"]["edges"] = sorted(filtered_edges, key=lambda x: x.get("weight", 0), reverse=True)[:128]
return get_json_result(data=obj)
@manager.route('/<kb_id>/knowledge_graph', methods=['DELETE']) # noqa: F821
@login_required
def delete_knowledge_graph(kb_id):
@ -364,3 +365,17 @@ def delete_knowledge_graph(kb_id):
settings.docStoreConn.delete({"knowledge_graph_kwd": ["graph", "subgraph", "entity", "relation"]}, search.index_name(kb.tenant_id), kb_id)
return get_json_result(data=True)
@manager.route("/get_meta", methods=["GET"]) # noqa: F821
@login_required
def get_meta():
kb_ids = request.args.get("kb_ids", "").split(",")
for kb_id in kb_ids:
if not KnowledgebaseService.accessible(kb_id, current_user.id):
return get_json_result(
data=False,
message='No authorization.',
code=settings.RetCode.AUTHENTICATION_ERROR
)
return get_json_result(data=DocumentService.get_meta_by_kbs(kb_ids))

View File

@ -744,6 +744,7 @@ class Dialog(DataBaseModel):
null=False,
default={"system": "", "prologue": "Hi! I'm your assistant, what can I do for you?", "parameters": [], "empty_response": "Sorry! No relevant content was found in the knowledge base!"},
)
meta_data_filter = JSONField(null=True, default={})
similarity_threshold = FloatField(default=0.2)
vector_similarity_weight = FloatField(default=0.3)
@ -1015,4 +1016,8 @@ def migrate_db():
migrate(migrator.add_column("api_4_conversation", "errors", TextField(null=True, help_text="errors")))
except Exception:
pass
try:
migrate(migrator.add_column("dialog", "meta_data_filter", JSONField(null=True, default={})))
except Exception:
pass
logging.disable(logging.NOTSET)

View File

@ -30,6 +30,7 @@ from api import settings
from api.db import LLMType, ParserType, StatusEnum
from api.db.db_models import DB, Dialog
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.langfuse_service import TenantLangfuseService
from api.db.services.llm_service import LLMBundle, TenantLLMService
@ -38,6 +39,7 @@ from rag.app.resume import forbidden_select_fields4resume
from rag.app.tag import label_question
from rag.nlp.search import index_name
from rag.prompts import chunks_format, citation_prompt, cross_languages, full_question, kb_prompt, keyword_extraction, message_fit_in
from rag.prompts.prompts import gen_meta_filter
from rag.utils import num_tokens_from_string, rmSpace
from rag.utils.tavily_conn import Tavily
@ -250,6 +252,46 @@ def repair_bad_citation_formats(answer: str, kbinfos: dict, idx: set):
return answer, idx
def meta_filter(metas: dict, filters: list[dict]):
doc_ids = []
def filter_out(v2docs, operator, value):
nonlocal doc_ids
for input,docids in v2docs.items():
try:
input = float(input)
value = float(value)
except Exception:
input = str(input)
value = str(value)
for conds in [
(operator == "contains", str(value).lower() in str(input).lower()),
(operator == "not contains", str(value).lower() not in str(input).lower()),
(operator == "start with", str(input).lower().startswith(str(value).lower())),
(operator == "end with", str(input).lower().endswith(str(value).lower())),
(operator == "empty", not input),
(operator == "not empty", input),
(operator == "=", input == value),
(operator == "", input != value),
(operator == ">", input > value),
(operator == "<", input < value),
(operator == "", input >= value),
(operator == "", input <= value),
]:
try:
if all(conds):
doc_ids.extend(docids)
except Exception:
pass
for k, v2docs in metas.items():
for f in filters:
if k != f["key"]:
continue
filter_out(v2docs, f["op"], f["value"])
return doc_ids
def chat(dialog, messages, stream=True, **kwargs):
assert messages[-1]["role"] == "user", "The last content of this conversation is not from user."
if not dialog.kb_ids and not dialog.prompt_config.get("tavily_api_key"):
@ -287,9 +329,10 @@ def chat(dialog, messages, stream=True, **kwargs):
retriever = settings.retrievaler
questions = [m["content"] for m in messages if m["role"] == "user"][-3:]
attachments = kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None
attachments = kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else []
if "doc_ids" in messages[-1]:
attachments = messages[-1]["doc_ids"]
prompt_config = dialog.prompt_config
field_map = KnowledgebaseService.get_field_map(dialog.kb_ids)
# try to use sql if field mapping is good to go
@ -316,6 +359,14 @@ def chat(dialog, messages, stream=True, **kwargs):
if prompt_config.get("cross_languages"):
questions = [cross_languages(dialog.tenant_id, dialog.llm_id, questions[0], prompt_config["cross_languages"])]
if dialog.meta_data_filter:
metas = DocumentService.get_meta_by_kbs(dialog.kb_ids)
if dialog.meta_data_filter.get("method") == "auto":
filters = gen_meta_filter(chat_mdl, metas, questions[-1])
attachments.extend(meta_filter(metas, filters))
elif dialog.meta_data_filter.get("method") == "manual":
attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"]))
if prompt_config.get("keyword", False):
questions[-1] += keyword_extraction(chat_mdl, questions[-1])

View File

@ -574,6 +574,25 @@ class DocumentService(CommonService):
def update_meta_fields(cls, doc_id, meta_fields):
return cls.update_by_id(doc_id, {"meta_fields": meta_fields})
@classmethod
@DB.connection_context()
def get_meta_by_kbs(cls, kb_ids):
fields = [
cls.model.id,
cls.model.meta_fields,
]
meta = {}
for r in cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids)):
doc_id = r.id
for k,v in r.meta_fields.items():
if k not in meta:
meta[k] = {}
v = str(v)
if v not in meta[k]:
meta[k][v] = []
meta[k][v].append(doc_id)
return meta
@classmethod
@DB.connection_context()
def update_progress(cls):