Fix: whole knowledge graph lost after removing any document in the knowledge base (#7151)

### What problem does this PR solve? When you remove any document from a knowledge base that uses a knowledge graph, the graph's `removed_kwd` is set to "Y". However, in the function `graphrag.utils.get_graph`, the `rebuild_graph` call is skipped and `None` is returned directly when `removed_kwd=Y`, so the residual part of the graph is abandoned (although the old entity data still exists in the db). Besides, the Infinity instance actually skips deleting graph components' `source_id` when a document is removed, which may result in a wrong graph after a rebuild. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
2026-01-23 11:36:38 +08:00 · 2025-04-30 09:43:17 +08:00
parent 538a408608
commit ab27609a64
4 changed files with 137 additions and 70 deletions
--- a/graphrag/general/index.py
+++ b/graphrag/general/index.py
@ -204,7 +204,7 @@ async def merge_subgraph(
 ):
    start = trio.current_time()
    change = GraphChange()
-    old_graph = await get_graph(tenant_id, kb_id)
+    old_graph = await get_graph(tenant_id, kb_id, subgraph.graph["source_id"])
    if old_graph is not None:
        logging.info("Merge with an exiting graph...................")
        tidy_graph(old_graph, callback)
--- a/graphrag/utils.py
+++ b/graphrag/utils.py
@ -406,32 +406,33 @@ async def get_graph_doc_ids(tenant_id, kb_id) -> list[str]:
    return doc_ids


-async def get_graph(tenant_id, kb_id):
+async def get_graph(tenant_id, kb_id, exclude_rebuild=None):
    conds = {
-        "fields": ["content_with_weight", "source_id"],
-        "removed_kwd": "N",
+        "fields": ["content_with_weight", "removed_kwd", "source_id"],
        "size": 1,
        "knowledge_graph_kwd": ["graph"]
    }
    res = await trio.to_thread.run_sync(lambda: settings.retrievaler.search(conds, search.index_name(tenant_id), [kb_id]))
-    if res.total == 0:
-        return None
-    for id in res.ids:
-        try:
-            g = json_graph.node_link_graph(json.loads(res.field[id]["content_with_weight"]), edges="edges")
-            if "source_id" not in g.graph:
-                g.graph["source_id"] = res.field[id]["source_id"]
-            return g
-        except Exception:
-            continue
-    result = await rebuild_graph(tenant_id, kb_id)
+    if not res.total == 0:
+        for id in res.ids:
+            try:
+                if res.field[id]["removed_kwd"] == "N":
+                    g = json_graph.node_link_graph(json.loads(res.field[id]["content_with_weight"]), edges="edges")
+                    if "source_id" not in g.graph:
+                        g.graph["source_id"] = res.field[id]["source_id"]
+                else:
+                    g = await rebuild_graph(tenant_id, kb_id, exclude_rebuild)
+                return g
+            except Exception:
+                continue
+    result = None
    return result


 async def set_graph(tenant_id: str, kb_id: str, embd_mdl, graph: nx.Graph, change: GraphChange, callback):
    start = trio.current_time()

-    await trio.to_thread.run_sync(lambda: settings.docStoreConn.delete({"knowledge_graph_kwd": ["graph"]}, search.index_name(tenant_id), kb_id))
+    await trio.to_thread.run_sync(lambda: settings.docStoreConn.delete({"knowledge_graph_kwd": ["graph", "subgraph"]}, search.index_name(tenant_id), kb_id))

    if change.removed_nodes:
        await trio.to_thread.run_sync(lambda: settings.docStoreConn.delete({"knowledge_graph_kwd": ["entity"], "entity_kwd": sorted(change.removed_nodes)}, search.index_name(tenant_id), kb_id))
@ -454,6 +455,23 @@ async def set_graph(tenant_id: str, kb_id: str, embd_mdl, graph: nx.Graph, chang
        "available_int": 0,
        "removed_kwd": "N"
    }]
+    
+    # generate updated subgraphs
+    for source in graph.graph["source_id"]:
+        subgraph = graph.subgraph([n for n in graph.nodes if source in graph.nodes[n]["source_id"]]).copy()
+        subgraph.graph["source_id"] = [source]
+        for n in subgraph.nodes:
+            subgraph.nodes[n]["source_id"] = [source]
+        chunks.append({
+            "id": get_uuid(),
+            "content_with_weight": json.dumps(nx.node_link_data(subgraph, edges="edges"), ensure_ascii=False),
+            "knowledge_graph_kwd": "subgraph",
+            "kb_id": kb_id,
+            "source_id": [source],
+            "available_int": 0,
+            "removed_kwd": "N"
+        })
+    
    async with trio.open_nursery() as nursery:
        for node in change.added_updated_nodes:
            node_attrs = graph.nodes[node]
@ -554,48 +572,45 @@ def flat_uniq_list(arr, key):
    return list(set(res))


-async def rebuild_graph(tenant_id, kb_id):
+async def rebuild_graph(tenant_id, kb_id, exclude_rebuild=None):
    graph = nx.Graph()
-    src_ids = set()
-    flds = ["entity_kwd", "from_entity_kwd", "to_entity_kwd", "knowledge_graph_kwd", "content_with_weight", "source_id"]
+    flds = ["knowledge_graph_kwd", "content_with_weight", "source_id"]
    bs = 256
    for i in range(0, 1024*bs, bs):
        es_res = await trio.to_thread.run_sync(lambda: settings.docStoreConn.search(flds, [],
-                                 {"kb_id": kb_id, "knowledge_graph_kwd": ["entity"]},
+                                 {"kb_id": kb_id, "knowledge_graph_kwd": ["subgraph"]},
                                 [],
                                 OrderByExpr(),
                                 i, bs, search.index_name(tenant_id), [kb_id]
                                 ))
-        tot = settings.docStoreConn.getTotal(es_res)
-        if tot == 0:
+        # tot = settings.docStoreConn.getTotal(es_res)
+        es_res = settings.docStoreConn.getFields(es_res, flds)
+
+        if len(es_res) == 0:
            break

-        es_res = settings.docStoreConn.getFields(es_res, flds)
        for id, d in es_res.items():
-            assert d["knowledge_graph_kwd"] == "relation"
-            src_ids.update(d.get("source_id", []))
-            attrs = json.load(d["content_with_weight"])
-            graph.add_node(d["entity_kwd"], **attrs)
+            assert d["knowledge_graph_kwd"] == "subgraph"
+            if isinstance(exclude_rebuild, list):
+                if sum([n in d["source_id"] for n in exclude_rebuild]):
+                    continue
+            elif exclude_rebuild in d["source_id"]:
+                continue
+            
+            next_graph = json_graph.node_link_graph(json.loads(d["content_with_weight"]), edges="edges")
+            merged_graph = nx.compose(graph, next_graph)
+            merged_source = {
+                n: graph.nodes[n]["source_id"] + next_graph.nodes[n]["source_id"]
+                for n in graph.nodes & next_graph.nodes
+            }
+            nx.set_node_attributes(merged_graph, merged_source, "source_id")
+            if "source_id" in graph.graph:
+                merged_graph.graph["source_id"] = graph.graph["source_id"] + next_graph.graph["source_id"]
+            else:
+                merged_graph.graph["source_id"] = next_graph.graph["source_id"]
+            graph = merged_graph

-    for i in range(0, 1024*bs, bs):
-        es_res = await trio.to_thread.run_sync(lambda: settings.docStoreConn.search(flds, [],
-                                 {"kb_id": kb_id, "knowledge_graph_kwd": ["relation"]},
-                                 [],
-                                 OrderByExpr(),
-                                 i, bs, search.index_name(tenant_id), [kb_id]
-                                 ))
-        tot = settings.docStoreConn.getTotal(es_res)
-        if tot == 0:
-            return None
-
-        es_res = settings.docStoreConn.getFields(es_res, flds)
-        for id, d in es_res.items():
-            assert d["knowledge_graph_kwd"] == "relation"
-            src_ids.update(d.get("source_id", []))
-            if graph.has_node(d["from_entity_kwd"]) and graph.has_node(d["to_entity_kwd"]):
-                attrs = json.load(d["content_with_weight"])
-                graph.add_edge(d["from_entity_kwd"], d["to_entity_kwd"], **attrs)
-
-    src_ids = sorted(src_ids)
-    graph.graph["source_id"] = src_ids
+    if len(graph.nodes) == 0:
+        return None
+    graph.graph["source_id"] = sorted(graph.graph["source_id"])
    return graph