Optimize graphrag again (#6513)

### What problem does this PR solve?

Removed set_entity and set_relation to avoid accessing the doc engine during graph computation.
Introduced GraphChange to avoid writing unchanged chunks.

### Type of change

- [x] Performance Improvement
2026-01-31 23:55:06 +08:00 · 2025-03-26 15:34:42 +08:00
parent 7a677cb095
commit 6bf26e2a81
19 changed files with 466 additions and 530 deletions
--- a/graphrag/general/index.py
+++ b/graphrag/general/index.py
@ -15,11 +15,11 @@
 #
 import json
 import logging
-from functools import partial
 import networkx as nx
 import trio

 from api import settings
+from api.utils import get_uuid
 from graphrag.light.graph_extractor import GraphExtractor as LightKGExt
 from graphrag.general.graph_extractor import GraphExtractor as GeneralKGExt
 from graphrag.general.community_reports_extractor import CommunityReportsExtractor
@ -27,32 +27,15 @@ from graphrag.entity_resolution import EntityResolution
 from graphrag.general.extractor import Extractor
 from graphrag.utils import (
    graph_merge,
-    set_entity,
-    get_relation,
-    set_relation,
-    get_entity,
    get_graph,
    set_graph,
    chunk_id,
-    update_nodes_pagerank_nhop_neighbour,
    does_graph_contains,
-    get_graph_doc_ids,
+    tidy_graph,
+    GraphChange,
 )
 from rag.nlp import rag_tokenizer, search
-from rag.utils.redis_conn import REDIS_CONN
-
-
-def graphrag_task_set(tenant_id, kb_id, doc_id) -> bool:
-    key = f"graphrag:{tenant_id}:{kb_id}"
-    ok = REDIS_CONN.set(key, doc_id, exp=3600 * 24)
-    if not ok:
-        raise Exception(f"Faild to set the {key} to {doc_id}")
-
-
-def graphrag_task_get(tenant_id, kb_id) -> str | None:
-    key = f"graphrag:{tenant_id}:{kb_id}"
-    doc_id = REDIS_CONN.get(key)
-    return doc_id
+from rag.utils.redis_conn import RedisDistributedLock


 async def run_graphrag(
@ -72,7 +55,7 @@ async def run_graphrag(
    ):
        chunks.append(d["content_with_weight"])

-    graph, doc_ids = await update_graph(
+    subgraph = await generate_subgraph(
        LightKGExt
        if row["parser_config"]["graphrag"]["method"] != "general"
        else GeneralKGExt,
@ -86,14 +69,26 @@ async def run_graphrag(
        embedding_model,
        callback,
    )
-    if not graph:
+    new_graph = None
+    if subgraph:
+        new_graph = await merge_subgraph(
+            tenant_id,
+            kb_id,
+            doc_id,
+            subgraph,
+            embedding_model,
+            callback,
+        )
+
+    if not with_resolution or not with_community:
        return
-    if with_resolution or with_community:
-        graphrag_task_set(tenant_id, kb_id, doc_id)
-    if with_resolution:
+
+    if new_graph is None:
+        new_graph = await get_graph(tenant_id, kb_id)
+
+    if with_resolution and new_graph is not None:
        await resolve_entities(
-            graph,
-            doc_ids,
+            new_graph,
            tenant_id,
            kb_id,
            doc_id,
@ -101,10 +96,9 @@ async def run_graphrag(
            embedding_model,
            callback,
        )
-    if with_community:
+    if with_community and new_graph is not None:
        await extract_community(
-            graph,
-            doc_ids,
+            new_graph,
            tenant_id,
            kb_id,
            doc_id,
@ -117,7 +111,7 @@ async def run_graphrag(
    return


-async def update_graph(
+async def generate_subgraph(
    extractor: Extractor,
    tenant_id: str,
    kb_id: str,
@ -131,34 +125,41 @@ async def update_graph(
 ):
    contains = await does_graph_contains(tenant_id, kb_id, doc_id)
    if contains:
-        callback(msg=f"Graph already contains {doc_id}, cancel myself")
-        return None, None
+        callback(msg=f"Graph already contains {doc_id}")
+        return None
    start = trio.current_time()
    ext = extractor(
        llm_bdl,
        language=language,
        entity_types=entity_types,
-        get_entity=partial(get_entity, tenant_id, kb_id),
-        set_entity=partial(set_entity, tenant_id, kb_id, embed_bdl),
-        get_relation=partial(get_relation, tenant_id, kb_id),
-        set_relation=partial(set_relation, tenant_id, kb_id, embed_bdl),
    )
    ents, rels = await ext(doc_id, chunks, callback)
    subgraph = nx.Graph()
-    for en in ents:
-        subgraph.add_node(en["entity_name"], entity_type=en["entity_type"])
+    for ent in ents:
+        assert "description" in ent, f"entity {ent} does not have description"
+        ent["source_id"] = [doc_id]
+        subgraph.add_node(ent["entity_name"], **ent)

+    ignored_rels = 0
    for rel in rels:
+        assert "description" in rel, f"relation {rel} does not have description"
+        if not subgraph.has_node(rel["src_id"]) or not subgraph.has_node(rel["tgt_id"]):
+            ignored_rels += 1
+            continue
+        rel["source_id"] = [doc_id]
        subgraph.add_edge(
            rel["src_id"],
            rel["tgt_id"],
-            weight=rel["weight"],
-            # description=rel["description"]
+            **rel,
        )
-    # TODO: infinity doesn't support array search
+    if ignored_rels:
+        callback(msg=f"ignored {ignored_rels} relations due to missing entities.")
+    tidy_graph(subgraph, callback)
+
+    subgraph.graph["source_id"] = [doc_id]
    chunk = {
        "content_with_weight": json.dumps(
-            nx.node_link_data(subgraph, edges="edges"), ensure_ascii=False, indent=2
+            nx.node_link_data(subgraph, edges="edges"), ensure_ascii=False
        ),
        "knowledge_graph_kwd": "subgraph",
        "kb_id": kb_id,
@ -167,6 +168,11 @@ async def update_graph(
        "removed_kwd": "N",
    }
    cid = chunk_id(chunk)
+    await trio.to_thread.run_sync(
+        lambda: settings.docStoreConn.delete(
+            {"knowledge_graph_kwd": "subgraph", "source_id": doc_id}, search.index_name(tenant_id), kb_id
+        )
+    )
    await trio.to_thread.run_sync(
        lambda: settings.docStoreConn.insert(
            [{"id": cid, **chunk}], search.index_name(tenant_id), kb_id
@ -174,39 +180,49 @@ async def update_graph(
    )
    now = trio.current_time()
    callback(msg=f"generated subgraph for doc {doc_id} in {now - start:.2f} seconds.")
-    start = now
+    return subgraph

+async def merge_subgraph(
+    tenant_id: str,
+    kb_id: str,
+    doc_id: str,
+    subgraph: nx.Graph,
+    embedding_model,
+    callback,
+):
+    graphrag_task_lock = RedisDistributedLock("graphrag_task", lock_value=doc_id, timeout=600)
    while True:
+        if graphrag_task_lock.acquire():
+            break
+        callback(msg=f"merge_subgraph {doc_id} is waiting graphrag_task_lock")
+        await trio.sleep(10)
+
+    start = trio.current_time()
+    change = GraphChange()
+    old_graph = await get_graph(tenant_id, kb_id)
+    if old_graph is not None:
+        logging.info("Merge with an exiting graph...................")
+        tidy_graph(old_graph, callback)
+        new_graph = graph_merge(old_graph, subgraph, change)
+    else:
        new_graph = subgraph
-        now_docids = set([doc_id])
-        old_graph, old_doc_ids = await get_graph(tenant_id, kb_id)
-        if old_graph is not None:
-            logging.info("Merge with an exiting graph...................")
-            new_graph = graph_merge(old_graph, subgraph)
-        await update_nodes_pagerank_nhop_neighbour(tenant_id, kb_id, new_graph, 2)
-        if old_doc_ids:
-            for old_doc_id in old_doc_ids:
-                now_docids.add(old_doc_id)
-        old_doc_ids2 = await get_graph_doc_ids(tenant_id, kb_id)
-        delta_doc_ids = set(old_doc_ids2) - set(old_doc_ids)
-        if delta_doc_ids:
-            callback(
-                msg="The global graph has changed during merging, try again"
-            )
-            await trio.sleep(1)
-            continue
-        break
-    await set_graph(tenant_id, kb_id, new_graph, list(now_docids))
+        change.added_updated_nodes = set(new_graph.nodes())
+        change.added_updated_edges = set(new_graph.edges())
+    pr = nx.pagerank(new_graph)
+    for node_name, pagerank in pr.items():
+        new_graph.nodes[node_name]["pagerank"] = pagerank
+
+    await set_graph(tenant_id, kb_id, embedding_model, new_graph, change, callback)
+    graphrag_task_lock.release()
    now = trio.current_time()
    callback(
        msg=f"merging subgraph for doc {doc_id} into the global graph done in {now - start:.2f} seconds."
    )
-    return new_graph, now_docids
+    return new_graph


 async def resolve_entities(
    graph,
-    doc_ids,
    tenant_id: str,
    kb_id: str,
    doc_id: str,
@ -214,74 +230,30 @@ async def resolve_entities(
    embed_bdl,
    callback,
 ):
-    working_doc_id = graphrag_task_get(tenant_id, kb_id)
-    if doc_id != working_doc_id:
-        callback(
-            msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
-        )
-        return
+    graphrag_task_lock = RedisDistributedLock("graphrag_task", lock_value=doc_id, timeout=600)
+    while True:
+        if graphrag_task_lock.acquire():
+            break
+        await trio.sleep(10)
+
    start = trio.current_time()
    er = EntityResolution(
        llm_bdl,
-        get_entity=partial(get_entity, tenant_id, kb_id),
-        set_entity=partial(set_entity, tenant_id, kb_id, embed_bdl),
-        get_relation=partial(get_relation, tenant_id, kb_id),
-        set_relation=partial(set_relation, tenant_id, kb_id, embed_bdl),
    )
    reso = await er(graph, callback=callback)
    graph = reso.graph
-    callback(msg=f"Graph resolution removed {len(reso.removed_entities)} nodes.")
-    await update_nodes_pagerank_nhop_neighbour(tenant_id, kb_id, graph, 2)
+    change = reso.change
+    callback(msg=f"Graph resolution removed {len(change.removed_nodes)} nodes and {len(change.removed_edges)} edges.")
    callback(msg="Graph resolution updated pagerank.")

-    working_doc_id = graphrag_task_get(tenant_id, kb_id)
-    if doc_id != working_doc_id:
-        callback(
-            msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
-        )
-        return
-    await set_graph(tenant_id, kb_id, graph, doc_ids)
-
-    await trio.to_thread.run_sync(
-        lambda: settings.docStoreConn.delete(
-            {
-                "knowledge_graph_kwd": "relation",
-                "kb_id": kb_id,
-                "from_entity_kwd": reso.removed_entities,
-            },
-            search.index_name(tenant_id),
-            kb_id,
-        )
-    )
-    await trio.to_thread.run_sync(
-        lambda: settings.docStoreConn.delete(
-            {
-                "knowledge_graph_kwd": "relation",
-                "kb_id": kb_id,
-                "to_entity_kwd": reso.removed_entities,
-            },
-            search.index_name(tenant_id),
-            kb_id,
-        )
-    )
-    await trio.to_thread.run_sync(
-        lambda: settings.docStoreConn.delete(
-            {
-                "knowledge_graph_kwd": "entity",
-                "kb_id": kb_id,
-                "entity_kwd": reso.removed_entities,
-            },
-            search.index_name(tenant_id),
-            kb_id,
-        )
-    )
+    await set_graph(tenant_id, kb_id, embed_bdl, graph, change, callback)
+    graphrag_task_lock.release()
    now = trio.current_time()
    callback(msg=f"Graph resolution done in {now - start:.2f}s.")


 async def extract_community(
    graph,
-    doc_ids,
    tenant_id: str,
    kb_id: str,
    doc_id: str,
@ -289,49 +261,34 @@ async def extract_community(
    embed_bdl,
    callback,
 ):
-    working_doc_id = graphrag_task_get(tenant_id, kb_id)
-    if doc_id != working_doc_id:
-        callback(
-            msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
-        )
-        return
+    graphrag_task_lock = RedisDistributedLock("graphrag_task", lock_value=doc_id, timeout=600)
+    while True:
+        if graphrag_task_lock.acquire():
+            break
+        await trio.sleep(10)
+
    start = trio.current_time()
    ext = CommunityReportsExtractor(
        llm_bdl,
-        get_entity=partial(get_entity, tenant_id, kb_id),
-        set_entity=partial(set_entity, tenant_id, kb_id, embed_bdl),
-        get_relation=partial(get_relation, tenant_id, kb_id),
-        set_relation=partial(set_relation, tenant_id, kb_id, embed_bdl),
    )
    cr = await ext(graph, callback=callback)
    community_structure = cr.structured_output
    community_reports = cr.output
-    working_doc_id = graphrag_task_get(tenant_id, kb_id)
-    if doc_id != working_doc_id:
-        callback(
-            msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
-        )
-        return
-    await set_graph(tenant_id, kb_id, graph, doc_ids)
+    doc_ids = graph.graph["source_id"]

    now = trio.current_time()
    callback(
        msg=f"Graph extracted {len(cr.structured_output)} communities in {now - start:.2f}s."
    )
    start = now
-    await trio.to_thread.run_sync(
-        lambda: settings.docStoreConn.delete(
-            {"knowledge_graph_kwd": "community_report", "kb_id": kb_id},
-            search.index_name(tenant_id),
-            kb_id,
-        )
-    )
+    chunks = []
    for stru, rep in zip(community_structure, community_reports):
        obj = {
            "report": rep,
            "evidences": "\n".join([f["explanation"] for f in stru["findings"]]),
        }
        chunk = {
+            "id": get_uuid(),
            "docnm_kwd": stru["title"],
            "title_tks": rag_tokenizer.tokenize(stru["title"]),
            "content_with_weight": json.dumps(obj, ensure_ascii=False),
@ -349,17 +306,23 @@ async def extract_community(
        chunk["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(
            chunk["content_ltks"]
        )
-        # try:
-        #    ebd, _ = embed_bdl.encode([", ".join(community["entities"])])
-        #    chunk["q_%d_vec" % len(ebd[0])] = ebd[0]
-        # except Exception as e:
-        #    logging.exception(f"Fail to embed entity relation: {e}")
-        await trio.to_thread.run_sync(
-            lambda: settings.docStoreConn.insert(
-                [{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id)
-            )
-        )
+        chunks.append(chunk)

+    await trio.to_thread.run_sync(
+        lambda: settings.docStoreConn.delete(
+            {"knowledge_graph_kwd": "community_report", "kb_id": kb_id},
+            search.index_name(tenant_id),
+            kb_id,
+        )
+    )
+    es_bulk_size = 4
+    for b in range(0, len(chunks), es_bulk_size):
+        doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(tenant_id), kb_id))
+        if doc_store_result:
+            error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
+            raise Exception(error_message)
+
+    graphrag_task_lock.release()
    now = trio.current_time()
    callback(
        msg=f"Graph indexed {len(cr.structured_output)} communities in {now - start:.2f}s."