Add graphrag (#1793)

### What problem does this PR solve?

#1594

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Commit 152072f900 (parent 80032b1fc0)
Author: Kevin Hu, committed via GitHub
Date: 2024-08-02 18:51:14 +08:00
74 changed files with 2522 additions and 105 deletions


```diff
@@ -85,6 +85,7 @@ class ParserType(StrEnum):
     PICTURE = "picture"
     ONE = "one"
     AUDIO = "audio"
+    KG = "knowledge_graph"


 class FileSource(StrEnum):
```
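The new `KG` member registers the knowledge-graph parser type. A side note on why a `StrEnum` works well here: members compare equal to plain strings, so `parser_id` values read straight from the database can be tested against the enum. A minimal sketch, assuming Python 3.11's `enum.StrEnum` (the `StrEnum` in this file may instead be the project's own `str`-based enum helper):

```python
from enum import StrEnum  # assumption: Python 3.11+

class ParserType(StrEnum):
    PICTURE = "picture"
    ONE = "one"
    AUDIO = "audio"
    KG = "knowledge_graph"  # the member this commit adds

# StrEnum members are str instances, so raw DB strings compare directly:
assert ParserType.KG == "knowledge_graph"
```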


```diff
@@ -122,7 +122,7 @@ def init_llm_factory():
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
     TenantService.filter_update([1 == 1], {
-        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio"})
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
```
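For context, `parser_ids` is a single comma-separated string of `parser_id:display name` pairs, and the migration above appends `knowledge_graph:Knowledge Graph` to it. A hypothetical helper (not part of this commit) showing how such a string decodes:

```python
def parse_parser_ids(s: str) -> dict[str, str]:
    # split "id:Display Name" pairs; maxsplit=1 keeps display names intact
    return dict(item.split(":", 1) for item in s.split(","))

pairs = parse_parser_ids("naive:General,qa:Q&A,audio:Audio,knowledge_graph:Knowledge Graph")
assert pairs["knowledge_graph"] == "Knowledge Graph"
```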
```diff
@@ -145,7 +145,7 @@ def init_llm_factory():
     """
     drop table llm;
     drop table llm_factories;
-    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio';
+    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph';
     alter table knowledgebase modify avatar longtext;
     alter table user modify avatar longtext;
     alter table dialog modify icon longtext;
```

```diff
@@ -153,7 +153,7 @@ def init_llm_factory():


 def add_graph_templates():
-    dir = os.path.join(get_project_base_directory(), "graph", "templates")
+    dir = os.path.join(get_project_base_directory(), "agent", "templates")
     for fnm in os.listdir(dir):
         try:
             cnvs = json.load(open(os.path.join(dir, fnm), "r"))
```
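The only change here re-points the template directory from `graph/templates` to `agent/templates`. For illustration, a hedged sketch of the loading loop using a context manager so the file handle is closed; the except clause is an assumption, since only the `try:` line is visible in this hunk:

```python
import json
import os

def load_templates(base_dir: str):
    # same directory layout as the commit above
    tpl_dir = os.path.join(base_dir, "agent", "templates")
    for fnm in os.listdir(tpl_dir):
        try:
            with open(os.path.join(tpl_dir, fnm), "r") as f:
                yield fnm, json.load(f)
        except (OSError, json.JSONDecodeError):
            continue  # skip unreadable or non-JSON files (assumed behaviour)
```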


```diff
@@ -18,12 +18,12 @@ import json
 import re
 from copy import deepcopy
-from api.db import LLMType
+from api.db import LLMType, ParserType
 from api.db.db_models import Dialog, Conversation
 from api.db.services.common_service import CommonService
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
-from api.settings import chat_logger, retrievaler
+from api.settings import chat_logger, retrievaler, kg_retrievaler
 from rag.app.resume import forbidden_select_fields4resume
 from rag.nlp import keyword_extraction
 from rag.nlp.search import index_name
```

```diff
@@ -101,6 +101,9 @@ def chat(dialog, messages, stream=True, **kwargs):
         yield {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []}
         return {"answer": "**ERROR**: Knowledge bases use different embedding models.", "reference": []}

+    is_kg = all([kb.parser_id == ParserType.KG for kb in kbs])
+    retr = retrievaler if not is_kg else kg_retrievaler
+
     questions = [m["content"] for m in messages if m["role"] == "user"]
     embd_mdl = LLMBundle(dialog.tenant_id, LLMType.EMBEDDING, embd_nms[0])
     if llm_id2llm_type(dialog.llm_id) == "image2text":
```
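This is the core dispatch of the PR: when every selected knowledge base was built with the knowledge-graph parser, `chat` swaps the dense retriever (`retrievaler`) for `kg_retrievaler`. A self-contained sketch of that selection logic with stand-in names (`KB` and `pick_retriever` are illustrative, not RAGFlow API):

```python
from dataclasses import dataclass

@dataclass
class KB:  # stand-in for RAGFlow's knowledgebase record
    parser_id: str

def pick_retriever(kbs, dense_retriever, kg_retriever):
    # use KG retrieval only when *every* selected KB used the knowledge_graph parser
    is_kg = all(kb.parser_id == "knowledge_graph" for kb in kbs)
    return kg_retriever if is_kg else dense_retriever

assert pick_retriever([KB("knowledge_graph")], "dense", "kg") == "kg"
assert pick_retriever([KB("knowledge_graph"), KB("naive")], "dense", "kg") == "dense"
```

One subtlety of the expression as committed: `all()` over an empty sequence is `True`, so it relies on at least one knowledge base being attached to the dialog.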
```diff
@@ -138,7 +141,7 @@ def chat(dialog, messages, stream=True, **kwargs):
     else:
         if prompt_config.get("keyword", False):
             questions[-1] += keyword_extraction(chat_mdl, questions[-1])
-        kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
+        kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
                                         dialog.similarity_threshold,
                                         dialog.vector_similarity_weight,
                                         doc_ids=kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None,
```

```diff
@@ -147,7 +150,7 @@ def chat(dialog, messages, stream=True, **kwargs):
         #self-rag
         if dialog.prompt_config.get("self_rag") and not relevant(dialog.tenant_id, dialog.llm_id, questions[-1], knowledges):
             questions[-1] = rewrite(dialog.tenant_id, dialog.llm_id, questions[-1])
-            kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
+            kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
                                             dialog.similarity_threshold,
                                             dialog.vector_similarity_weight,
                                             doc_ids=kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None,
```
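Both retrieval call sites now go through `retr`, so the self-RAG path covers knowledge-graph KBs as well. For readers unfamiliar with the pattern, a conceptual sketch of one self-RAG round trip, with `retrieve`, `relevant`, and `rewrite` passed in as stand-ins for the helpers used above:

```python
def self_rag_round(question, retrieve, relevant, rewrite):
    """One self-RAG pass: retrieve, check relevance, optionally rewrite and retry."""
    chunks = retrieve(question)
    if not relevant(question, chunks):
        question = rewrite(question)  # e.g. expand or decontextualise the query
        chunks = retrieve(question)
    return question, chunks
```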
```diff
@@ -179,7 +182,7 @@ def chat(dialog, messages, stream=True, **kwargs):
         nonlocal prompt_config, knowledges, kwargs, kbinfos
         refs = []
         if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
-            answer, idx = retrievaler.insert_citations(answer,
+            answer, idx = retr.insert_citations(answer,
                                                        [ck["content_ltks"]
                                                         for ck in kbinfos["chunks"]],
                                                        [ck["vector"]
```


```diff
@@ -139,6 +139,8 @@ def queue_tasks(doc, bucket, name):
     page_size = doc["parser_config"].get("task_page_size", 22)
     if doc["parser_id"] == "one":
         page_size = 1000000000
+    if doc["parser_id"] == "knowledge_graph":
+        page_size = 1000000000
     if not do_layout:
         page_size = 1000000000
     page_ranges = doc["parser_config"].get("pages")
```
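Giving the knowledge-graph parser an effectively infinite `page_size` collapses the whole document into a single task, presumably because graph extraction needs to see the full text at once; the `one` parser already used the same trick. A minimal sketch (illustrative, not the project's exact splitting code) of how `page_size` drives the task windows:

```python
def split_into_tasks(total_pages: int, page_size: int):
    # chunk pages into [from_page, to_page) windows of page_size
    return [{"from_page": p, "to_page": min(p + page_size, total_pages)}
            for p in range(0, total_pages, page_size)]

print(split_into_tasks(30, 22))          # two tasks for a typical parser
print(split_into_tasks(30, 1000000000))  # one task: KG parses the whole document
```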