Optimize graphrag again (#6513)

### What problem does this PR solve?

Removed `set_entity` and `set_relation` to avoid accessing the doc engine during graph computation.
Introduced `GraphChange` to avoid writing unchanged chunks.

### Type of change

- [x] Performance Improvement
2026-01-04 03:25:30 +08:00 · 2025-03-26 15:34:42 +08:00
parent 7a677cb095
commit 6bf26e2a81
19 changed files with 466 additions and 530 deletions
--- a/graphrag/entity_resolution.py
+++ b/graphrag/entity_resolution.py
@ -16,7 +16,6 @@
 import logging
 import itertools
 import re
-import time
 from dataclasses import dataclass
 from typing import Any, Callable

@ -28,7 +27,7 @@ from rag.nlp import is_english
 import editdistance
 from graphrag.entity_resolution_prompt import ENTITY_RESOLUTION_PROMPT
 from rag.llm.chat_model import Base as CompletionLLM
-from graphrag.utils import perform_variable_replacements, chat_limiter
+from graphrag.utils import perform_variable_replacements, chat_limiter, GraphChange

 DEFAULT_RECORD_DELIMITER = "##"
 DEFAULT_ENTITY_INDEX_DELIMITER = "<|>"
@ -39,7 +38,7 @@ DEFAULT_RESOLUTION_RESULT_DELIMITER = "&&"
 class EntityResolutionResult:
    """Entity resolution result class definition."""
    graph: nx.Graph
-    removed_entities: list
+    change: GraphChange


 class EntityResolution(Extractor):
@ -54,12 +53,8 @@ class EntityResolution(Extractor):
    def __init__(
            self,
            llm_invoker: CompletionLLM,
-            get_entity: Callable | None = None,
-            set_entity: Callable | None = None,
-            get_relation: Callable | None = None,
-            set_relation: Callable | None = None
    ):
-        super().__init__(llm_invoker, get_entity=get_entity, set_entity=set_entity, get_relation=get_relation, set_relation=set_relation)
+        super().__init__(llm_invoker)
        """Init method definition."""
        self._llm = llm_invoker
        self._resolution_prompt = ENTITY_RESOLUTION_PROMPT
@ -84,8 +79,8 @@ class EntityResolution(Extractor):
                                                   or DEFAULT_RESOLUTION_RESULT_DELIMITER,
        }

-        nodes = graph.nodes
-        entity_types = list(set(graph.nodes[node].get('entity_type', '-') for node in nodes))
+        nodes = sorted(graph.nodes())
+        entity_types = sorted(set(graph.nodes[node].get('entity_type', '-') for node in nodes))
        node_clusters = {entity_type: [] for entity_type in entity_types}

        for node in nodes:
@ -105,54 +100,22 @@ class EntityResolution(Extractor):
                nursery.start_soon(lambda: self._resolve_candidate(candidate_resolution_i, resolution_result))
        callback(msg=f"Resolved {num_candidates} candidate pairs, {len(resolution_result)} of them are selected to merge.")

+        change = GraphChange()
        connect_graph = nx.Graph()
-        removed_entities = []
        connect_graph.add_edges_from(resolution_result)
-        all_entities_data = []
-        all_relationships_data = []
-        all_remove_nodes = []
-
        async with trio.open_nursery() as nursery:
            for sub_connect_graph in nx.connected_components(connect_graph):
-                sub_connect_graph = connect_graph.subgraph(sub_connect_graph)
-                remove_nodes = list(sub_connect_graph.nodes)
-                keep_node = remove_nodes.pop()
-                all_remove_nodes.append(remove_nodes)
-                nursery.start_soon(lambda: self._merge_nodes(keep_node, self._get_entity_(remove_nodes), all_entities_data))
-                for remove_node in remove_nodes:
-                    removed_entities.append(remove_node)
-                    remove_node_neighbors = graph[remove_node]
-                    remove_node_neighbors = list(remove_node_neighbors)
-                    for remove_node_neighbor in remove_node_neighbors:
-                        rel = self._get_relation_(remove_node, remove_node_neighbor)
-                        if graph.has_edge(remove_node, remove_node_neighbor):
-                            graph.remove_edge(remove_node, remove_node_neighbor)
-                        if remove_node_neighbor == keep_node:
-                            if graph.has_edge(keep_node, remove_node):
-                                graph.remove_edge(keep_node, remove_node)
-                            continue
-                        if not rel:
-                            continue
-                        if graph.has_edge(keep_node, remove_node_neighbor):
-                            nursery.start_soon(lambda: self._merge_edges(keep_node, remove_node_neighbor, [rel], all_relationships_data))
-                        else:
-                            pair = sorted([keep_node, remove_node_neighbor])
-                            graph.add_edge(pair[0], pair[1], weight=rel['weight'])
-                            self._set_relation_(pair[0], pair[1],
-                                            dict(
-                                                    src_id=pair[0],
-                                                    tgt_id=pair[1],
-                                                    weight=rel['weight'],
-                                                    description=rel['description'],
-                                                    keywords=[],
-                                                    source_id=rel.get("source_id", ""),
-                                                    metadata={"created_at": time.time()}
-                                            ))
-                    graph.remove_node(remove_node)
+                merging_nodes = list(sub_connect_graph.nodes)
+                nursery.start_soon(lambda: self._merge_graph_nodes(graph, merging_nodes, change))
+
+        # Update pagerank
+        pr = nx.pagerank(graph)
+        for node_name, pagerank in pr.items():
+            graph.nodes[node_name]["pagerank"] = pagerank

        return EntityResolutionResult(
            graph=graph,
-            removed_entities=removed_entities
+            change=change,
        )

    async def _resolve_candidate(self, candidate_resolution_i, resolution_result):
--- a/graphrag/general/community_report_prompt.py
+++ b/graphrag/general/community_report_prompt.py
@ -2,7 +2,7 @@
 # Licensed under the MIT License
 """
 Reference:
- - [graphrag](https://github.com/microsoft/graphrag)
+ - [GraphRAG](https://github.com/microsoft/graphrag/blob/main/graphrag/prompts/index/community_report.py)
 """

 COMMUNITY_REPORT_PROMPT = """
--- a/graphrag/general/community_reports_extractor.py
+++ b/graphrag/general/community_reports_extractor.py
@ -40,13 +40,9 @@ class CommunityReportsExtractor(Extractor):
    def __init__(
            self,
            llm_invoker: CompletionLLM,
-            get_entity: Callable | None = None,
-            set_entity: Callable | None = None,
-            get_relation: Callable | None = None,
-            set_relation: Callable | None = None,
            max_report_length: int | None = None,
    ):
-        super().__init__(llm_invoker, get_entity=get_entity, set_entity=set_entity, get_relation=get_relation, set_relation=set_relation)
+        super().__init__(llm_invoker)
        """Init method definition."""
        self._llm = llm_invoker
        self._extraction_prompt = COMMUNITY_REPORT_PROMPT
@ -63,21 +59,28 @@ class CommunityReportsExtractor(Extractor):
        over, token_count = 0, 0
        async def extract_community_report(community):
            nonlocal res_str, res_dict, over, token_count
-            cm_id, ents = community
-            weight = ents["weight"]
-            ents = ents["nodes"]
-            ent_df = pd.DataFrame(self._get_entity_(ents)).dropna()
-            if ent_df.empty or "entity_name" not in ent_df.columns:
+            cm_id, cm = community
+            weight = cm["weight"]
+            ents = cm["nodes"]
+            if len(ents) < 2:
                return
-            ent_df["entity"] = ent_df["entity_name"]
-            del ent_df["entity_name"]
-            rela_df = pd.DataFrame(self._get_relation_(list(ent_df["entity"]), list(ent_df["entity"]), 10000))
-            if rela_df.empty:
-                return
-            rela_df["source"] = rela_df["src_id"]
-            rela_df["target"] = rela_df["tgt_id"]
-            del rela_df["src_id"]
-            del rela_df["tgt_id"]
+            ent_list = [{"entity": ent, "description": graph.nodes[ent]["description"]} for ent in ents]
+            ent_df = pd.DataFrame(ent_list)
+
+            rela_list = []
+            k = 0
+            for i in range(0, len(ents)):
+                if k >= 10000:
+                    break
+                for j in range(i + 1, len(ents)):
+                    if k >= 10000:
+                        break
+                    edge = graph.get_edge_data(ents[i], ents[j])
+                    if edge is None:
+                        continue
+                    rela_list.append({"source": ents[i], "target": ents[j], "description": edge["description"]})
+                    k += 1
+            rela_df = pd.DataFrame(rela_list)

            prompt_variables = {
                "entity_df": ent_df.to_csv(index_label="id"),
--- a/graphrag/general/extractor.py
+++ b/graphrag/general/extractor.py
@ -19,10 +19,11 @@ from collections import defaultdict, Counter
 from copy import deepcopy
 from typing import Callable
 import trio
+import networkx as nx

 from graphrag.general.graph_prompt import SUMMARIZE_DESCRIPTIONS_PROMPT
 from graphrag.utils import get_llm_cache, set_llm_cache, handle_single_entity_extraction, \
-    handle_single_relationship_extraction, split_string_by_multi_markers, flat_uniq_list, chat_limiter
+    handle_single_relationship_extraction, split_string_by_multi_markers, flat_uniq_list, chat_limiter, get_from_to, GraphChange
 from rag.llm.chat_model import Base as CompletionLLM
 from rag.prompts import message_fit_in
 from rag.utils import truncate
@ -40,18 +41,10 @@ class Extractor:
        llm_invoker: CompletionLLM,
        language: str | None = "English",
        entity_types: list[str] | None = None,
-        get_entity: Callable | None = None,
-        set_entity: Callable | None = None,
-        get_relation: Callable | None = None,
-        set_relation: Callable | None = None,
    ):
        self._llm = llm_invoker
        self._language = language
        self._entity_types = entity_types or DEFAULT_ENTITY_TYPES
-        self._get_entity_ = get_entity
-        self._set_entity_ = set_entity
-        self._get_relation_ = get_relation
-        self._set_relation_ = set_relation

    def _chat(self, system, history, gen_conf):
        hist = deepcopy(history)
@ -152,25 +145,15 @@ class Extractor:
    async def _merge_nodes(self, entity_name: str, entities: list[dict], all_relationships_data):
        if not entities:
            return
-        already_entity_types = []
-        already_source_ids = []
-        already_description = []
-
-        already_node = self._get_entity_(entity_name)
-        if already_node:
-            already_entity_types.append(already_node["entity_type"])
-            already_source_ids.extend(already_node["source_id"])
-            already_description.append(already_node["description"])
-
        entity_type = sorted(
            Counter(
-                [dp["entity_type"] for dp in entities] + already_entity_types
+                [dp["entity_type"] for dp in entities]
            ).items(),
            key=lambda x: x[1],
            reverse=True,
        )[0][0]
        description = GRAPH_FIELD_SEP.join(
-            sorted(set([dp["description"] for dp in entities] + already_description))
+            sorted(set([dp["description"] for dp in entities]))
        )
        already_source_ids = flat_uniq_list(entities, "source_id")
        description = await self._handle_entity_relation_summary(entity_name, description)
@ -180,7 +163,6 @@ class Extractor:
            source_id=already_source_ids,
        )
        node_data["entity_name"] = entity_name
-        self._set_entity_(entity_name, node_data)
        all_relationships_data.append(node_data)

    async def _merge_edges(
@ -192,36 +174,11 @@ class Extractor:
    ):
        if not edges_data:
            return
-        already_weights = []
-        already_source_ids = []
-        already_description = []
-        already_keywords = []
-
-        relation = self._get_relation_(src_id, tgt_id)
-        if relation:
-            already_weights = [relation["weight"]]
-            already_source_ids = relation["source_id"]
-            already_description = [relation["description"]]
-            already_keywords = relation["keywords"]
-
-        weight = sum([dp["weight"] for dp in edges_data] + already_weights)
-        description = GRAPH_FIELD_SEP.join(
-            sorted(set([dp["description"] for dp in edges_data] + already_description))
-        )
-        keywords = flat_uniq_list(edges_data, "keywords") + already_keywords
-        source_id = flat_uniq_list(edges_data, "source_id") + already_source_ids
-
-        for need_insert_id in [src_id, tgt_id]:
-            if self._get_entity_(need_insert_id):
-                continue
-            self._set_entity_(need_insert_id, {
-                        "source_id": source_id,
-                        "description": description,
-                        "entity_type": 'UNKNOWN'
-                    })
-        description = await self._handle_entity_relation_summary(
-            f"({src_id}, {tgt_id})", description
-        )
+        weight = sum([edge["weight"] for edge in edges_data])
+        description = GRAPH_FIELD_SEP.join(sorted(set([edge["description"] for edge in edges_data])))
+        description = await self._handle_entity_relation_summary(f"{src_id} -> {tgt_id}", description)
+        keywords = flat_uniq_list(edges_data, "keywords")
+        source_id = flat_uniq_list(edges_data, "source_id")
        edge_data = dict(
            src_id=src_id,
            tgt_id=tgt_id,
@ -230,9 +187,41 @@ class Extractor:
            weight=weight,
            source_id=source_id
        )
-        self._set_relation_(src_id, tgt_id, edge_data)
-        if all_relationships_data is not None:
-            all_relationships_data.append(edge_data)
+        all_relationships_data.append(edge_data)
+
+    async def _merge_graph_nodes(self, graph: nx.Graph, nodes: list[str], change: GraphChange):
+        if len(nodes) <= 1:
+            return
+        change.added_updated_nodes.add(nodes[0])
+        change.removed_nodes.extend(nodes[1:])
+        nodes_set = set(nodes)
+        node0_attrs = graph.nodes[nodes[0]]
+        node0_neighbors = set(graph.neighbors(nodes[0]))
+        for node1 in nodes[1:]:
+            # Merge two nodes, keep "entity_name", "entity_type", "page_rank" unchanged.
+            node1_attrs = graph.nodes[node1]
+            node0_attrs["description"] += f"{GRAPH_FIELD_SEP}{node1_attrs['description']}"
+            for attr in ["keywords", "source_id"]:
+                node0_attrs[attr] = sorted(set(node0_attrs[attr].extend(node1_attrs[attr])))
+            for neighbor in graph.neighbors(node1):
+                change.removed_edges.add(get_from_to(node1, neighbor))
+                if neighbor not in nodes_set:
+                    edge1_attrs = graph.get_edge_data(node1, neighbor)
+                    if neighbor in node0_neighbors:
+                        # Merge two edges
+                        change.added_updated_edges.add(get_from_to(nodes[0], neighbor))
+                        edge0_attrs = graph.get_edge_data(nodes[0], neighbor)
+                        edge0_attrs["weight"] += edge1_attrs["weight"]
+                        edge0_attrs["description"] += f"{GRAPH_FIELD_SEP}{edge1_attrs['description']}"
+                        edge0_attrs["keywords"] = list(set(edge0_attrs["keywords"].extend(edge1_attrs["keywords"])))
+                        edge0_attrs["source_id"] = list(set(edge0_attrs["source_id"].extend(edge1_attrs["source_id"])))
+                        edge0_attrs["description"] = await self._handle_entity_relation_summary(f"({nodes[0]}, {neighbor})", edge0_attrs["description"])
+                        graph.add_edge(nodes[0], neighbor, **edge0_attrs)
+                    else:
+                        graph.add_edge(nodes[0], neighbor, **edge1_attrs)
+            graph.remove_node(node1)
+        node0_attrs["description"] = await self._handle_entity_relation_summary(nodes[0], node0_attrs["description"])
+        graph.nodes[nodes[0]].update(node0_attrs)

    async def _handle_entity_relation_summary(
            self,
--- a/graphrag/general/graph_extractor.py
+++ b/graphrag/general/graph_extractor.py
@ -6,7 +6,7 @@ Reference:
 """

 import re
-from typing import Any, Callable
+from typing import Any
 from dataclasses import dataclass
 import tiktoken
 import trio
@ -53,10 +53,6 @@ class GraphExtractor(Extractor):
        llm_invoker: CompletionLLM,
        language: str | None = "English",
        entity_types: list[str] | None = None,
-        get_entity: Callable | None = None,
-        set_entity: Callable | None = None,
-        get_relation: Callable | None = None,
-        set_relation: Callable | None = None,
        tuple_delimiter_key: str | None = None,
        record_delimiter_key: str | None = None,
        input_text_key: str | None = None,
@ -66,7 +62,7 @@ class GraphExtractor(Extractor):
        max_gleanings: int | None = None,
        on_error: ErrorHandlerFn | None = None,
    ):
-        super().__init__(llm_invoker, language, entity_types, get_entity, set_entity, get_relation, set_relation)
+        super().__init__(llm_invoker, language, entity_types)
        """Init method definition."""
        # TODO: streamline construction
        self._llm = llm_invoker
--- a/graphrag/general/graph_prompt.py
+++ b/graphrag/general/graph_prompt.py
@ -2,7 +2,7 @@
 # Licensed under the MIT License
 """
 Reference:
- - [graphrag](https://github.com/microsoft/graphrag)
+ - [GraphRAG](https://github.com/microsoft/graphrag/blob/main/graphrag/prompts/index/extract_graph.py)
 """

 GRAPH_EXTRACTION_PROMPT = """
--- a/graphrag/general/index.py
+++ b/graphrag/general/index.py
@ -15,11 +15,11 @@
 #
 import json
 import logging
-from functools import partial
 import networkx as nx
 import trio

 from api import settings
+from api.utils import get_uuid
 from graphrag.light.graph_extractor import GraphExtractor as LightKGExt
 from graphrag.general.graph_extractor import GraphExtractor as GeneralKGExt
 from graphrag.general.community_reports_extractor import CommunityReportsExtractor
@ -27,32 +27,15 @@ from graphrag.entity_resolution import EntityResolution
 from graphrag.general.extractor import Extractor
 from graphrag.utils import (
    graph_merge,
-    set_entity,
-    get_relation,
-    set_relation,
-    get_entity,
    get_graph,
    set_graph,
    chunk_id,
-    update_nodes_pagerank_nhop_neighbour,
    does_graph_contains,
-    get_graph_doc_ids,
+    tidy_graph,
+    GraphChange,
 )
 from rag.nlp import rag_tokenizer, search
-from rag.utils.redis_conn import REDIS_CONN
-
-
-def graphrag_task_set(tenant_id, kb_id, doc_id) -> bool:
-    key = f"graphrag:{tenant_id}:{kb_id}"
-    ok = REDIS_CONN.set(key, doc_id, exp=3600 * 24)
-    if not ok:
-        raise Exception(f"Faild to set the {key} to {doc_id}")
-
-
-def graphrag_task_get(tenant_id, kb_id) -> str | None:
-    key = f"graphrag:{tenant_id}:{kb_id}"
-    doc_id = REDIS_CONN.get(key)
-    return doc_id
+from rag.utils.redis_conn import RedisDistributedLock


 async def run_graphrag(
@ -72,7 +55,7 @@ async def run_graphrag(
    ):
        chunks.append(d["content_with_weight"])

-    graph, doc_ids = await update_graph(
+    subgraph = await generate_subgraph(
        LightKGExt
        if row["parser_config"]["graphrag"]["method"] != "general"
        else GeneralKGExt,
@ -86,14 +69,26 @@ async def run_graphrag(
        embedding_model,
        callback,
    )
-    if not graph:
+    new_graph = None
+    if subgraph:
+        new_graph = await merge_subgraph(
+            tenant_id,
+            kb_id,
+            doc_id,
+            subgraph,
+            embedding_model,
+            callback,
+        )
+
+    if not with_resolution or not with_community:
        return
-    if with_resolution or with_community:
-        graphrag_task_set(tenant_id, kb_id, doc_id)
-    if with_resolution:
+
+    if new_graph is None:
+        new_graph = await get_graph(tenant_id, kb_id)
+
+    if with_resolution and new_graph is not None:
        await resolve_entities(
-            graph,
-            doc_ids,
+            new_graph,
            tenant_id,
            kb_id,
            doc_id,
@ -101,10 +96,9 @@ async def run_graphrag(
            embedding_model,
            callback,
        )
-    if with_community:
+    if with_community and new_graph is not None:
        await extract_community(
-            graph,
-            doc_ids,
+            new_graph,
            tenant_id,
            kb_id,
            doc_id,
@ -117,7 +111,7 @@ async def run_graphrag(
    return


-async def update_graph(
+async def generate_subgraph(
    extractor: Extractor,
    tenant_id: str,
    kb_id: str,
@ -131,34 +125,41 @@ async def update_graph(
 ):
    contains = await does_graph_contains(tenant_id, kb_id, doc_id)
    if contains:
-        callback(msg=f"Graph already contains {doc_id}, cancel myself")
-        return None, None
+        callback(msg=f"Graph already contains {doc_id}")
+        return None
    start = trio.current_time()
    ext = extractor(
        llm_bdl,
        language=language,
        entity_types=entity_types,
-        get_entity=partial(get_entity, tenant_id, kb_id),
-        set_entity=partial(set_entity, tenant_id, kb_id, embed_bdl),
-        get_relation=partial(get_relation, tenant_id, kb_id),
-        set_relation=partial(set_relation, tenant_id, kb_id, embed_bdl),
    )
    ents, rels = await ext(doc_id, chunks, callback)
    subgraph = nx.Graph()
-    for en in ents:
-        subgraph.add_node(en["entity_name"], entity_type=en["entity_type"])
+    for ent in ents:
+        assert "description" in ent, f"entity {ent} does not have description"
+        ent["source_id"] = [doc_id]
+        subgraph.add_node(ent["entity_name"], **ent)

+    ignored_rels = 0
    for rel in rels:
+        assert "description" in rel, f"relation {rel} does not have description"
+        if not subgraph.has_node(rel["src_id"]) or not subgraph.has_node(rel["tgt_id"]):
+            ignored_rels += 1
+            continue
+        rel["source_id"] = [doc_id]
        subgraph.add_edge(
            rel["src_id"],
            rel["tgt_id"],
-            weight=rel["weight"],
-            # description=rel["description"]
+            **rel,
        )
-    # TODO: infinity doesn't support array search
+    if ignored_rels:
+        callback(msg=f"ignored {ignored_rels} relations due to missing entities.")
+    tidy_graph(subgraph, callback)
+
+    subgraph.graph["source_id"] = [doc_id]
    chunk = {
        "content_with_weight": json.dumps(
-            nx.node_link_data(subgraph, edges="edges"), ensure_ascii=False, indent=2
+            nx.node_link_data(subgraph, edges="edges"), ensure_ascii=False
        ),
        "knowledge_graph_kwd": "subgraph",
        "kb_id": kb_id,
@ -167,6 +168,11 @@ async def update_graph(
        "removed_kwd": "N",
    }
    cid = chunk_id(chunk)
+    await trio.to_thread.run_sync(
+        lambda: settings.docStoreConn.delete(
+            {"knowledge_graph_kwd": "subgraph", "source_id": doc_id}, search.index_name(tenant_id), kb_id
+        )
+    )
    await trio.to_thread.run_sync(
        lambda: settings.docStoreConn.insert(
            [{"id": cid, **chunk}], search.index_name(tenant_id), kb_id
@ -174,39 +180,49 @@ async def update_graph(
    )
    now = trio.current_time()
    callback(msg=f"generated subgraph for doc {doc_id} in {now - start:.2f} seconds.")
-    start = now
+    return subgraph

+async def merge_subgraph(
+    tenant_id: str,
+    kb_id: str,
+    doc_id: str,
+    subgraph: nx.Graph,
+    embedding_model,
+    callback,
+):
+    graphrag_task_lock = RedisDistributedLock("graphrag_task", lock_value=doc_id, timeout=600)
    while True:
+        if graphrag_task_lock.acquire():
+            break
+        callback(msg=f"merge_subgraph {doc_id} is waiting graphrag_task_lock")
+        await trio.sleep(10)
+
+    start = trio.current_time()
+    change = GraphChange()
+    old_graph = await get_graph(tenant_id, kb_id)
+    if old_graph is not None:
+        logging.info("Merge with an exiting graph...................")
+        tidy_graph(old_graph, callback)
+        new_graph = graph_merge(old_graph, subgraph, change)
+    else:
        new_graph = subgraph
-        now_docids = set([doc_id])
-        old_graph, old_doc_ids = await get_graph(tenant_id, kb_id)
-        if old_graph is not None:
-            logging.info("Merge with an exiting graph...................")
-            new_graph = graph_merge(old_graph, subgraph)
-        await update_nodes_pagerank_nhop_neighbour(tenant_id, kb_id, new_graph, 2)
-        if old_doc_ids:
-            for old_doc_id in old_doc_ids:
-                now_docids.add(old_doc_id)
-        old_doc_ids2 = await get_graph_doc_ids(tenant_id, kb_id)
-        delta_doc_ids = set(old_doc_ids2) - set(old_doc_ids)
-        if delta_doc_ids:
-            callback(
-                msg="The global graph has changed during merging, try again"
-            )
-            await trio.sleep(1)
-            continue
-        break
-    await set_graph(tenant_id, kb_id, new_graph, list(now_docids))
+        change.added_updated_nodes = set(new_graph.nodes())
+        change.added_updated_edges = set(new_graph.edges())
+    pr = nx.pagerank(new_graph)
+    for node_name, pagerank in pr.items():
+        new_graph.nodes[node_name]["pagerank"] = pagerank
+
+    await set_graph(tenant_id, kb_id, embedding_model, new_graph, change, callback)
+    graphrag_task_lock.release()
    now = trio.current_time()
    callback(
        msg=f"merging subgraph for doc {doc_id} into the global graph done in {now - start:.2f} seconds."
    )
-    return new_graph, now_docids
+    return new_graph


 async def resolve_entities(
    graph,
-    doc_ids,
    tenant_id: str,
    kb_id: str,
    doc_id: str,
@ -214,74 +230,30 @@ async def resolve_entities(
    embed_bdl,
    callback,
 ):
-    working_doc_id = graphrag_task_get(tenant_id, kb_id)
-    if doc_id != working_doc_id:
-        callback(
-            msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
-        )
-        return
+    graphrag_task_lock = RedisDistributedLock("graphrag_task", lock_value=doc_id, timeout=600)
+    while True:
+        if graphrag_task_lock.acquire():
+            break
+        await trio.sleep(10)
+
    start = trio.current_time()
    er = EntityResolution(
        llm_bdl,
-        get_entity=partial(get_entity, tenant_id, kb_id),
-        set_entity=partial(set_entity, tenant_id, kb_id, embed_bdl),
-        get_relation=partial(get_relation, tenant_id, kb_id),
-        set_relation=partial(set_relation, tenant_id, kb_id, embed_bdl),
    )
    reso = await er(graph, callback=callback)
    graph = reso.graph
-    callback(msg=f"Graph resolution removed {len(reso.removed_entities)} nodes.")
-    await update_nodes_pagerank_nhop_neighbour(tenant_id, kb_id, graph, 2)
+    change = reso.change
+    callback(msg=f"Graph resolution removed {len(change.removed_nodes)} nodes and {len(change.removed_edges)} edges.")
    callback(msg="Graph resolution updated pagerank.")

-    working_doc_id = graphrag_task_get(tenant_id, kb_id)
-    if doc_id != working_doc_id:
-        callback(
-            msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
-        )
-        return
-    await set_graph(tenant_id, kb_id, graph, doc_ids)
-
-    await trio.to_thread.run_sync(
-        lambda: settings.docStoreConn.delete(
-            {
-                "knowledge_graph_kwd": "relation",
-                "kb_id": kb_id,
-                "from_entity_kwd": reso.removed_entities,
-            },
-            search.index_name(tenant_id),
-            kb_id,
-        )
-    )
-    await trio.to_thread.run_sync(
-        lambda: settings.docStoreConn.delete(
-            {
-                "knowledge_graph_kwd": "relation",
-                "kb_id": kb_id,
-                "to_entity_kwd": reso.removed_entities,
-            },
-            search.index_name(tenant_id),
-            kb_id,
-        )
-    )
-    await trio.to_thread.run_sync(
-        lambda: settings.docStoreConn.delete(
-            {
-                "knowledge_graph_kwd": "entity",
-                "kb_id": kb_id,
-                "entity_kwd": reso.removed_entities,
-            },
-            search.index_name(tenant_id),
-            kb_id,
-        )
-    )
+    await set_graph(tenant_id, kb_id, embed_bdl, graph, change, callback)
+    graphrag_task_lock.release()
    now = trio.current_time()
    callback(msg=f"Graph resolution done in {now - start:.2f}s.")


 async def extract_community(
    graph,
-    doc_ids,
    tenant_id: str,
    kb_id: str,
    doc_id: str,
@ -289,49 +261,34 @@ async def extract_community(
    embed_bdl,
    callback,
 ):
-    working_doc_id = graphrag_task_get(tenant_id, kb_id)
-    if doc_id != working_doc_id:
-        callback(
-            msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
-        )
-        return
+    graphrag_task_lock = RedisDistributedLock("graphrag_task", lock_value=doc_id, timeout=600)
+    while True:
+        if graphrag_task_lock.acquire():
+            break
+        await trio.sleep(10)
+
    start = trio.current_time()
    ext = CommunityReportsExtractor(
        llm_bdl,
-        get_entity=partial(get_entity, tenant_id, kb_id),
-        set_entity=partial(set_entity, tenant_id, kb_id, embed_bdl),
-        get_relation=partial(get_relation, tenant_id, kb_id),
-        set_relation=partial(set_relation, tenant_id, kb_id, embed_bdl),
    )
    cr = await ext(graph, callback=callback)
    community_structure = cr.structured_output
    community_reports = cr.output
-    working_doc_id = graphrag_task_get(tenant_id, kb_id)
-    if doc_id != working_doc_id:
-        callback(
-            msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
-        )
-        return
-    await set_graph(tenant_id, kb_id, graph, doc_ids)
+    doc_ids = graph.graph["source_id"]

    now = trio.current_time()
    callback(
        msg=f"Graph extracted {len(cr.structured_output)} communities in {now - start:.2f}s."
    )
    start = now
-    await trio.to_thread.run_sync(
-        lambda: settings.docStoreConn.delete(
-            {"knowledge_graph_kwd": "community_report", "kb_id": kb_id},
-            search.index_name(tenant_id),
-            kb_id,
-        )
-    )
+    chunks = []
    for stru, rep in zip(community_structure, community_reports):
        obj = {
            "report": rep,
            "evidences": "\n".join([f["explanation"] for f in stru["findings"]]),
        }
        chunk = {
+            "id": get_uuid(),
            "docnm_kwd": stru["title"],
            "title_tks": rag_tokenizer.tokenize(stru["title"]),
            "content_with_weight": json.dumps(obj, ensure_ascii=False),
@ -349,17 +306,23 @@ async def extract_community(
        chunk["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(
            chunk["content_ltks"]
        )
-        # try:
-        #    ebd, _ = embed_bdl.encode([", ".join(community["entities"])])
-        #    chunk["q_%d_vec" % len(ebd[0])] = ebd[0]
-        # except Exception as e:
-        #    logging.exception(f"Fail to embed entity relation: {e}")
-        await trio.to_thread.run_sync(
-            lambda: settings.docStoreConn.insert(
-                [{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id)
-            )
-        )
+        chunks.append(chunk)

+    await trio.to_thread.run_sync(
+        lambda: settings.docStoreConn.delete(
+            {"knowledge_graph_kwd": "community_report", "kb_id": kb_id},
+            search.index_name(tenant_id),
+            kb_id,
+        )
+    )
+    es_bulk_size = 4
+    for b in range(0, len(chunks), es_bulk_size):
+        doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(tenant_id), kb_id))
+        if doc_store_result:
+            error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
+            raise Exception(error_message)
+
+    graphrag_task_lock.release()
    now = trio.current_time()
    callback(
        msg=f"Graph indexed {len(cr.structured_output)} communities in {now - start:.2f}s."
--- a/graphrag/general/leiden.py
+++ b/graphrag/general/leiden.py
@ -100,7 +100,8 @@ def run(graph: nx.Graph, args: dict[str, Any]) -> dict[int, dict[str, dict]]:
        logging.debug(
            "Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc
        )
-    if not graph.nodes():
+    nodes = set(graph.nodes())
+    if not nodes:
        return {}

    node_id_to_community_map = _compute_leiden_communities(
@ -120,7 +121,7 @@ def run(graph: nx.Graph, args: dict[str, Any]) -> dict[int, dict[str, dict]]:
        result = {}
        results_by_level[level] = result
        for node_id, raw_community_id in node_id_to_community_map[level].items():
-            if node_id not in graph.nodes:
+            if node_id not in nodes:
                logging.warning(f"Node {node_id} not found in the graph.")
                continue
            community_id = str(raw_community_id)
--- a/graphrag/light/graph_extractor.py
+++ b/graphrag/light/graph_extractor.py
@ -5,7 +5,7 @@ Reference:
 - [graphrag](https://github.com/microsoft/graphrag)
 """
 import re
-from typing import Any, Callable
+from typing import Any
 from dataclasses import dataclass
 from graphrag.general.extractor import Extractor, ENTITY_EXTRACTION_MAX_GLEANINGS
 from graphrag.light.graph_prompt import PROMPTS
@ -33,14 +33,10 @@ class GraphExtractor(Extractor):
        llm_invoker: CompletionLLM,
        language: str | None = "English",
        entity_types: list[str] | None = None,
-        get_entity: Callable | None = None,
-        set_entity: Callable | None = None,
-        get_relation: Callable | None = None,
-        set_relation: Callable | None = None,
        example_number: int = 2,
        max_gleanings: int | None = None,
    ):
-        super().__init__(llm_invoker, language, entity_types, get_entity, set_entity, get_relation, set_relation)
+        super().__init__(llm_invoker, language, entity_types)
        """Init method definition."""
        self._max_gleanings = (
            max_gleanings
--- a/graphrag/light/graph_prompt.py
+++ b/graphrag/light/graph_prompt.py
@ -1,7 +1,7 @@
 # Licensed under the MIT License
 """
 Reference:
- - [LightRag](https://github.com/HKUDS/LightRAG)
+ - [LightRAG](https://github.com/HKUDS/LightRAG/blob/main/lightrag/prompt.py)
 """


--- a/graphrag/utils.py
+++ b/graphrag/utils.py
@ -12,26 +12,37 @@ import logging
 import re
 import time
 from collections import defaultdict
-from copy import deepcopy
 from hashlib import md5
 from typing import Any, Callable
 import os
 import trio
+from typing import Set, Tuple

 import networkx as nx
 import numpy as np
 import xxhash
 from networkx.readwrite import json_graph
+import dataclasses

 from api import settings
+from api.utils import get_uuid
 from rag.nlp import search, rag_tokenizer
 from rag.utils.doc_store_conn import OrderByExpr
 from rag.utils.redis_conn import REDIS_CONN

+GRAPH_FIELD_SEP = "<SEP>"
+
 ErrorHandlerFn = Callable[[BaseException | None, str | None, dict | None], None]

 chat_limiter = trio.CapacityLimiter(int(os.environ.get('MAX_CONCURRENT_CHATS', 10)))

+@dataclasses.dataclass
+class GraphChange:
+    """Record of the nodes and edges of a graph that were removed or added/updated.
+
+    Every field defaults to its own empty set, so a fresh instance describes
+    "no change" and can be filled in incrementally.
+    """
+    # Names of nodes that were removed from the graph.
+    removed_nodes: Set[str] = dataclasses.field(default_factory=set)
+    # Names of nodes that were added to the graph or whose attributes were updated.
+    added_updated_nodes: Set[str] = dataclasses.field(default_factory=set)
+    # (from, to) node-name pairs of edges that were removed from the graph.
+    removed_edges: Set[Tuple[str, str]] = dataclasses.field(default_factory=set)
+    # (from, to) node-name pairs of edges that were added or updated.
+    added_updated_edges: Set[Tuple[str, str]] = dataclasses.field(default_factory=set)
+
 def perform_variable_replacements(
    input: str, history: list[dict] | None = None, variables: dict | None = None
 ) -> str:
@ -146,24 +157,74 @@ def set_tags_to_cache(kb_ids, tags):
    k = hasher.hexdigest()
    REDIS_CONN.set(k, json.dumps(tags).encode("utf-8"), 600)

+def tidy_graph(graph: nx.Graph, callback):
+    """
+    Drop nodes and edges that lack an essential attribute.
+
+    Nodes and edges whose attributes do not contain both "description" and
+    "source_id" are removed from `graph` in place. Every edge that is visited
+    gets an empty "keywords" list when it has none. When `callback` is truthy
+    it is told how many nodes / edges were purged.
+    """
+    essential_attrs = ("description", "source_id")
+
+    def is_valid_node(node_attrs: dict) -> bool:
+        return all(name in node_attrs for name in essential_attrs)
+
+    purged_nodes = [name for name, attrs in graph.nodes(data=True) if not is_valid_node(attrs)]
+    graph.remove_nodes_from(purged_nodes)
+    if purged_nodes and callback:
+        callback(msg=f"Purged {len(purged_nodes)} nodes from graph due to missing essential attributes.")

-def graph_merge(g1, g2):
-    g = g2.copy()
-    for n, attr in g1.nodes(data=True):
-        if n not in g2.nodes():
-            g.add_node(n, **attr)
+    # The same attribute check is applied to edge attributes.
+    purged_edges = []
+    for head, tail, edge_attrs in graph.edges(data=True):
+        if not is_valid_node(edge_attrs):
+            purged_edges.append((head, tail))
+        edge_attrs.setdefault("keywords", [])
+    graph.remove_edges_from(purged_edges)
+    if purged_edges and callback:
+        callback(msg=f"Purged {len(purged_edges)} edges from graph due to missing essential attributes.")
+
+def get_from_to(node1, node2):
+    """Return the two nodes as a tuple ordered smallest first.
+
+    When `node1` does not compare less than `node2` (including when they are
+    equal) the result is `(node2, node1)`.
+    """
+    return (node1, node2) if node1 < node2 else (node2, node1)
+
+def graph_merge(g1: nx.Graph, g2: nx.Graph, change: GraphChange):
+    """Merge graph g2 into g1 in place and return g1.
+
+    Every node and edge of g2 is recorded in `change` as added/updated.
+    Nodes and edges missing from g1 are copied over with g2's attributes.
+    For ones already present, g2's description is appended after
+    GRAPH_FIELD_SEP, its source_id (and, for edges, keywords) is appended,
+    and edge weights are summed. Afterwards each node's "rank" is set to its
+    degree in g1 and g2's graph-level source_id is appended to g1's.
+    """
+    for node_name, attr in g2.nodes(data=True):
+        change.added_updated_nodes.add(node_name)
+        if not g1.has_node(node_name):
+            g1.add_node(node_name, **attr)
            continue
+        node = g1.nodes[node_name]
+        node["description"] += GRAPH_FIELD_SEP + attr["description"]
+        # A node's source_id indicates which chunks it came from.
+        # NOTE(review): presumably a list, so this extends it without de-duplicating - confirm.
+        node["source_id"] += attr["source_id"]

-    for source, target, attr in g1.edges(data=True):
-        if g.has_edge(source, target):
-            g[source][target].update({"weight": attr.get("weight", 0)+1})
+    for source, target, attr in g2.edges(data=True):
+        # Edges are recorded under an order-independent (smaller, larger) key.
+        change.added_updated_edges.add(get_from_to(source, target))
+        edge = g1.get_edge_data(source, target)
+        if edge is None:
+            g1.add_edge(source, target, **attr)
            continue
-        g.add_edge(source, target)#, **attr)
-
-    for node_degree in g.degree:
-        g.nodes[str(node_degree[0])]["rank"] = int(node_degree[1])
-    return g
+        # A g2 edge without a weight contributes 0; the existing g1 edge must already have one.
+        edge["weight"] += attr.get("weight", 0)
+        edge["description"] += GRAPH_FIELD_SEP + attr["description"]
+        edge["keywords"] += attr["keywords"]
+        # An edge's source_id indicates which chunks it came from.
+        edge["source_id"] += attr["source_id"]

+    # Recompute every node's rank as its degree in the merged graph.
+    for node_degree in g1.degree:
+        g1.nodes[str(node_degree[0])]["rank"] = int(node_degree[1])
+    # A graph's source_id indicates which documents it came from.
+    if "source_id" not in g1.graph:
+        g1.graph["source_id"] = []
+    g1.graph["source_id"] += g2.graph.get("source_id", [])
+    return g1

 def compute_args_hash(*args):
    return md5(str(args).encode()).hexdigest()
@ -237,55 +298,10 @@ def is_float_regex(value):
 def chunk_id(chunk):
    return xxhash.xxh64((chunk["content_with_weight"] + chunk["kb_id"]).encode("utf-8")).hexdigest()

-def get_entity_cache(tenant_id, kb_id, ent_name) -> str | list[str]:
-    hasher = xxhash.xxh64()
-    hasher.update(str(tenant_id).encode("utf-8"))
-    hasher.update(str(kb_id).encode("utf-8"))
-    hasher.update(str(ent_name).encode("utf-8"))

-    k = hasher.hexdigest()
-    bin = REDIS_CONN.get(k)
-    if not bin:
-        return
-    return json.loads(bin)
-
-
-def set_entity_cache(tenant_id, kb_id, ent_name, content_with_weight):
-    hasher = xxhash.xxh64()
-    hasher.update(str(tenant_id).encode("utf-8"))
-    hasher.update(str(kb_id).encode("utf-8"))
-    hasher.update(str(ent_name).encode("utf-8"))
-
-    k = hasher.hexdigest()
-    REDIS_CONN.set(k, content_with_weight.encode("utf-8"), 3600)
-
-
-def get_entity(tenant_id, kb_id, ent_name):
-    cache = get_entity_cache(tenant_id, kb_id, ent_name)
-    if cache:
-        return cache
-    conds = {
-        "fields": ["content_with_weight"],
-        "entity_kwd": ent_name,
-        "size": 10000,
-        "knowledge_graph_kwd": ["entity"]
-    }
-    res = []
-    es_res = settings.retrievaler.search(conds, search.index_name(tenant_id), [kb_id])
-    for id in es_res.ids:
-        try:
-            if isinstance(ent_name, str):
-                set_entity_cache(tenant_id, kb_id, ent_name, es_res.field[id]["content_with_weight"])
-                return json.loads(es_res.field[id]["content_with_weight"])
-            res.append(json.loads(es_res.field[id]["content_with_weight"]))
-        except Exception:
-            continue
-
-    return res
-
-
-def set_entity(tenant_id, kb_id, embd_mdl, ent_name, meta):
+async def graph_node_to_chunk(kb_id, embd_mdl, ent_name, meta, chunks):
    chunk = {
+        "id": get_uuid(),
        "important_kwd": [ent_name],
        "title_tks": rag_tokenizer.tokenize(ent_name),
        "entity_kwd": ent_name,
@ -293,28 +309,19 @@ def set_entity(tenant_id, kb_id, embd_mdl, ent_name, meta):
        "entity_type_kwd": meta["entity_type"],
        "content_with_weight": json.dumps(meta, ensure_ascii=False),
        "content_ltks": rag_tokenizer.tokenize(meta["description"]),
-        "source_id": list(set(meta["source_id"])),
+        "source_id": meta["source_id"],
        "kb_id": kb_id,
        "available_int": 0
    }
    chunk["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(chunk["content_ltks"])
-    set_entity_cache(tenant_id, kb_id, ent_name, chunk["content_with_weight"])
-    res = settings.retrievaler.search({"entity_kwd": ent_name, "size": 1, "fields": []},
-                                      search.index_name(tenant_id), [kb_id])
-    if res.ids:
-        settings.docStoreConn.update({"entity_kwd": ent_name}, chunk, search.index_name(tenant_id), kb_id)
-    else:
-        ebd = get_embed_cache(embd_mdl.llm_name, ent_name)
-        if ebd is None:
-            try:
-                ebd, _ = embd_mdl.encode([ent_name])
-                ebd = ebd[0]
-                set_embed_cache(embd_mdl.llm_name, ent_name, ebd)
-            except Exception as e:
-                logging.exception(f"Fail to embed entity: {e}")
-        if ebd is not None:
-            chunk["q_%d_vec" % len(ebd)] = ebd
-        settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id), kb_id)
+    ebd = get_embed_cache(embd_mdl.llm_name, ent_name)
+    if ebd is None:
+        ebd, _ = await trio.to_thread.run_sync(lambda: embd_mdl.encode([ent_name]))
+        ebd = ebd[0]
+        set_embed_cache(embd_mdl.llm_name, ent_name, ebd)
+    assert ebd is not None
+    chunk["q_%d_vec" % len(ebd)] = ebd
+    chunks.append(chunk)


 def get_relation(tenant_id, kb_id, from_ent_name, to_ent_name, size=1):
@ -344,40 +351,30 @@ def get_relation(tenant_id, kb_id, from_ent_name, to_ent_name, size=1):
    return res


-def set_relation(tenant_id, kb_id, embd_mdl, from_ent_name, to_ent_name, meta):
+async def graph_edge_to_chunk(kb_id, embd_mdl, from_ent_name, to_ent_name, meta, chunks):
+    """Build the "relation" chunk for one graph edge and append it to `chunks`.
+
+    The chunk gets a fresh UUID id, the edge attributes `meta` serialized as
+    JSON, tokenized description, keywords, source ids and integer weight.
+    Its embedding is taken from the embed cache or, on a miss, computed with
+    `embd_mdl.encode` in a worker thread and then cached. Nothing is written
+    to the doc store here.
+    """
     chunk = {
+        "id": get_uuid(),
         "from_entity_kwd": from_ent_name,
         "to_entity_kwd": to_ent_name,
         "knowledge_graph_kwd": "relation",
         "content_with_weight": json.dumps(meta, ensure_ascii=False),
         "content_ltks": rag_tokenizer.tokenize(meta["description"]),
         "important_kwd": meta["keywords"],
-        "source_id": list(set(meta["source_id"])),
+        "source_id": meta["source_id"],
         "weight_int": int(meta["weight"]),
         "kb_id": kb_id,
         "available_int": 0
     }
     chunk["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(chunk["content_ltks"])
-    res = settings.retrievaler.search({"from_entity_kwd": to_ent_name, "to_entity_kwd": to_ent_name, "size": 1, "fields": []},
-                                      search.index_name(tenant_id), [kb_id])
-
-    if res.ids:
-        settings.docStoreConn.update({"from_entity_kwd": from_ent_name, "to_entity_kwd": to_ent_name},
-                                 chunk,
-                                 search.index_name(tenant_id), kb_id)
-    else:
-        txt = f"{from_ent_name}->{to_ent_name}"
-        ebd = get_embed_cache(embd_mdl.llm_name, txt)
-        if ebd is None:
-            try:
-                ebd, _ = embd_mdl.encode([txt+f": {meta['description']}"])
-                ebd = ebd[0]
-                set_embed_cache(embd_mdl.llm_name, txt, ebd)
-            except Exception as e:
-                logging.exception(f"Fail to embed entity relation: {e}")
-        if ebd is not None:
-            chunk["q_%d_vec" % len(ebd)] = ebd
-        settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id), kb_id)
+    # NOTE(review): the cache key is only "from->to", while the encoded text also
+    # contains the description, so a cached vector is reused after the description changes.
+    txt = f"{from_ent_name}->{to_ent_name}"
+    ebd = get_embed_cache(embd_mdl.llm_name, txt)
+    if ebd is None:
+        # encode() is run in a worker thread; an encoding failure propagates to the caller.
+        ebd, _ = await trio.to_thread.run_sync(lambda: embd_mdl.encode([txt+f": {meta['description']}"]))
+        ebd = ebd[0]
+        set_embed_cache(embd_mdl.llm_name, txt, ebd)
+    assert ebd is not None
+    # The vector field name carries the embedding dimension, e.g. "q_<dim>_vec".
+    chunk["q_%d_vec" % len(ebd)] = ebd
+    chunks.append(chunk)

 async def does_graph_contains(tenant_id, kb_id, doc_id):
    # Get doc_ids of graph
@ -418,33 +415,68 @@ async def get_graph(tenant_id, kb_id):
    }
    res = await trio.to_thread.run_sync(lambda: settings.retrievaler.search(conds, search.index_name(tenant_id), [kb_id]))
    if res.total == 0:
-        return None, []
+        return None
    for id in res.ids:
        try:
-            return json_graph.node_link_graph(json.loads(res.field[id]["content_with_weight"]), edges="edges"), \
-                   res.field[id]["source_id"]
+            g = json_graph.node_link_graph(json.loads(res.field[id]["content_with_weight"]), edges="edges")
+            if "source_id" not in g.graph:
+                g.graph["source_id"] = res.field[id]["source_id"]
+            return g
        except Exception:
            continue
    result = await rebuild_graph(tenant_id, kb_id)
    return result


-async def set_graph(tenant_id, kb_id, graph, docids):
-    chunk = {
-        "content_with_weight": json.dumps(nx.node_link_data(graph, edges="edges"), ensure_ascii=False,
-                                          indent=2),
+async def set_graph(tenant_id: str, kb_id: str, embd_mdl, graph: nx.Graph, change: GraphChange, callback):
+    """Write `graph` and the entries named in `change` to the doc store.
+
+    The stored whole-graph chunk is always replaced. Entity and relation
+    chunks are touched only for the nodes and edges listed in `change`:
+    chunks of removed entries are deleted, and chunks of added/updated
+    entries are deleted and re-created (chunk ids are fresh UUIDs, so an old
+    copy would otherwise remain next to the new one). Chunks of unchanged
+    nodes and edges are left alone.
+
+    `callback`, when truthy, receives progress messages.
+
+    Raises:
+        Exception: if the doc store reports an error for an insert batch.
+    """
+    start = trio.current_time()
+    index_name = search.index_name(tenant_id)
+
+    def delete_chunks(condition: dict):
+        return settings.docStoreConn.delete(condition, index_name, kb_id)
+
+    def delete_relation(from_node: str, to_node: str):
+        return delete_chunks({"knowledge_graph_kwd": ["relation"], "from_entity_kwd": from_node, "to_entity_kwd": to_node})
+
+    await trio.to_thread.run_sync(delete_chunks, {"knowledge_graph_kwd": ["graph"]})
+
+    # Stale chunks are those of removed entries plus the old copies of updated ones.
+    stale_nodes = sorted(change.removed_nodes | change.added_updated_nodes)
+    if stale_nodes:
+        await trio.to_thread.run_sync(delete_chunks, {"knowledge_graph_kwd": ["entity"], "entity_kwd": stale_nodes})
+
+    stale_edges = change.removed_edges | change.added_updated_edges
+    if stale_edges:
+        async with trio.open_nursery() as nursery:
+            for from_node, to_node in stale_edges:
+                # start_soon needs an async function; passing the arguments explicitly
+                # also binds this iteration's edge instead of the loop's last one.
+                nursery.start_soon(trio.to_thread.run_sync, delete_relation, from_node, to_node)
+    now = trio.current_time()
+    if callback:
+        callback(msg=f"set_graph removed {len(change.removed_nodes)} nodes and {len(change.removed_edges)} edges from index in {now - start:.2f}s.")
+    start = now
+
+    # The first chunk stores the whole graph; one chunk per changed node/edge follows.
+    chunks = [{
+        "id": get_uuid(),
+        "content_with_weight": json.dumps(nx.node_link_data(graph, edges="edges"), ensure_ascii=False),
        "knowledge_graph_kwd": "graph",
        "kb_id": kb_id,
-        "source_id": list(docids),
+        "source_id": graph.graph.get("source_id", []),
        "available_int": 0,
        "removed_kwd": "N"
-    }     
-    res = await trio.to_thread.run_sync(lambda: settings.retrievaler.search({"knowledge_graph_kwd": "graph", "size": 1, "fields": []}, search.index_name(tenant_id), [kb_id]))
-    if res.ids:
-        await trio.to_thread.run_sync(lambda: settings.docStoreConn.update({"knowledge_graph_kwd": "graph"}, chunk,
-                                     search.index_name(tenant_id), kb_id))
-    else:
-        await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id), kb_id))
+    }]
+    async with trio.open_nursery() as nursery:
+        for node in change.added_updated_nodes:
+            nursery.start_soon(graph_node_to_chunk, kb_id, embd_mdl, node, graph.nodes[node], chunks)
+        for from_node, to_node in change.added_updated_edges:
+            nursery.start_soon(graph_edge_to_chunk, kb_id, embd_mdl, from_node, to_node, graph.edges[from_node, to_node], chunks)
+    now = trio.current_time()
+    if callback:
+        callback(msg=f"set_graph converted graph change to {len(chunks)} chunks in {now - start:.2f}s.")
+    start = now
+
+    es_bulk_size = 4
+    for b in range(0, len(chunks), es_bulk_size):
+        doc_store_result = await trio.to_thread.run_sync(settings.docStoreConn.insert, chunks[b:b + es_bulk_size], index_name, kb_id)
+        # A truthy result from insert() is treated as an error report.
+        if doc_store_result:
+            error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
+            raise Exception(error_message)
+    now = trio.current_time()
+    if callback:
+        callback(msg=f"set_graph added/updated {len(change.added_updated_nodes)} nodes and {len(change.added_updated_edges)} edges from index in {now - start:.2f}s.")


 def is_continuous_subsequence(subseq, seq):
@ -489,67 +521,6 @@ def merge_tuples(list1, list2):
    return result


-async def update_nodes_pagerank_nhop_neighbour(tenant_id, kb_id, graph, n_hop):
-    def n_neighbor(id):
-        nonlocal graph, n_hop
-        count = 0
-        source_edge = list(graph.edges(id))
-        if not source_edge:
-            return []
-        count = count + 1
-        while count < n_hop:
-            count = count + 1
-            sc_edge = deepcopy(source_edge)
-            source_edge = []
-            for pair in sc_edge:
-                append_edge = list(graph.edges(pair[-1]))
-                for tuples in merge_tuples([pair], append_edge):
-                    source_edge.append(tuples)
-        nbrs = []
-        for path in source_edge:
-            n = {"path": path, "weights": []}
-            wts = nx.get_edge_attributes(graph, 'weight')
-            for i in range(len(path)-1):
-                f, t = path[i], path[i+1]
-                n["weights"].append(wts.get((f, t), 0))
-            nbrs.append(n)
-        return nbrs
-
-    pr = nx.pagerank(graph)
-    try:
-        async with trio.open_nursery() as nursery:
-            for n, p in pr.items():
-                graph.nodes[n]["pagerank"] = p
-                nursery.start_soon(lambda: trio.to_thread.run_sync(lambda: settings.docStoreConn.update({"entity_kwd": n, "kb_id": kb_id},
-                                                {"rank_flt": p,
-                                                "n_hop_with_weight": json.dumps((n), ensure_ascii=False)},
-                                                search.index_name(tenant_id), kb_id)))
-    except Exception as e:
-        logging.exception(e)
-
-    ty2ents = defaultdict(list)
-    for p, r in sorted(pr.items(), key=lambda x: x[1], reverse=True):
-        ty = graph.nodes[p].get("entity_type")
-        if not ty or len(ty2ents[ty]) > 12:
-            continue
-        ty2ents[ty].append(p)
-
-    chunk = {
-        "content_with_weight": json.dumps(ty2ents, ensure_ascii=False),
-        "kb_id": kb_id,
-        "knowledge_graph_kwd": "ty2ents",
-        "available_int": 0
-    }
-    res = await trio.to_thread.run_sync(lambda: settings.retrievaler.search({"knowledge_graph_kwd": "ty2ents", "size": 1, "fields": []},
-                                      search.index_name(tenant_id), [kb_id]))
-    if res.ids:
-        await trio.to_thread.run_sync(lambda: settings.docStoreConn.update({"knowledge_graph_kwd": "ty2ents"},
-                                     chunk,
-                                     search.index_name(tenant_id), kb_id))
-    else:
-        await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id), kb_id))
-
-
 async def get_entity_type2sampels(idxnms, kb_ids: list):
    es_res = await trio.to_thread.run_sync(lambda: settings.retrievaler.search({"knowledge_graph_kwd": "ty2ents", "kb_id": kb_ids,
                                       "size": 10000,
@ -584,33 +555,46 @@ def flat_uniq_list(arr, key):

 async def rebuild_graph(tenant_id, kb_id):
+    """Rebuild the knowledge graph of `kb_id` from its entity and relation chunks.
+
+    Entity chunks are paged first and become nodes; relation chunks are paged
+    next and become edges, but only when both endpoints were found as nodes.
+    Node and edge attributes come from each chunk's JSON "content_with_weight".
+    The sorted union of all chunk source ids is stored in
+    `graph.graph["source_id"]`.
+
+    Returns the rebuilt graph, or None when neither entity nor relation
+    chunks exist.
+    """
    graph = nx.Graph()
-    src_ids = []
-    flds = ["entity_kwd", "entity_type_kwd", "from_entity_kwd", "to_entity_kwd", "weight_int", "knowledge_graph_kwd", "source_id"]
+    src_ids = set()
+    flds = ["entity_kwd", "from_entity_kwd", "to_entity_kwd", "knowledge_graph_kwd", "content_with_weight", "source_id"]
    bs = 256
-    for i in range(0, 39*bs, bs):
+    for i in range(0, 1024*bs, bs):
        es_res = await trio.to_thread.run_sync(lambda: settings.docStoreConn.search(flds, [],
-                                 {"kb_id": kb_id, "knowledge_graph_kwd": ["entity", "relation"]},
+                                 {"kb_id": kb_id, "knowledge_graph_kwd": ["entity"]},
                                 [],
                                 OrderByExpr(),
                                 i, bs, search.index_name(tenant_id), [kb_id]
                                 ))
        tot = settings.docStoreConn.getTotal(es_res)
        if tot == 0:
-            return None, None
+            break

        es_res = settings.docStoreConn.getFields(es_res, flds)
        for id, d in es_res.items():
-            src_ids.extend(d.get("source_id", []))
-            if d["knowledge_graph_kwd"] == "entity":
-                graph.add_node(d["entity_kwd"], entity_type=d["entity_type_kwd"])
-            elif "from_entity_kwd" in d and "to_entity_kwd" in d:
-                graph.add_edge(
-                    d["from_entity_kwd"],
-                    d["to_entity_kwd"],
-                    weight=int(d["weight_int"])
-                )
+            # This query only asks for entity chunks.
+            assert d["knowledge_graph_kwd"] == "entity"
+            src_ids.update(d.get("source_id", []))
+            # content_with_weight is a JSON string, so loads() (not load()) is required.
+            attrs = json.loads(d["content_with_weight"])
+            graph.add_node(d["entity_kwd"], **attrs)
+        # A short page is the last one; stop instead of paging past the results.
+        if len(es_res) < bs:
+            break

-        if len(es_res.keys()) < 128:
-            return graph, list(set(src_ids))
+    for i in range(0, 1024*bs, bs):
+        es_res = await trio.to_thread.run_sync(lambda: settings.docStoreConn.search(flds, [],
+                                 {"kb_id": kb_id, "knowledge_graph_kwd": ["relation"]},
+                                 [],
+                                 OrderByExpr(),
+                                 i, bs, search.index_name(tenant_id), [kb_id]
+                                 ))
+        tot = settings.docStoreConn.getTotal(es_res)
+        if tot == 0:
+            # No relations must not discard the nodes rebuilt above.
+            break

-    return graph, list(set(src_ids))
+        es_res = settings.docStoreConn.getFields(es_res, flds)
+        for id, d in es_res.items():
+            assert d["knowledge_graph_kwd"] == "relation"
+            src_ids.update(d.get("source_id", []))
+            # Relations whose endpoints were not rebuilt as nodes are skipped.
+            if graph.has_node(d["from_entity_kwd"]) and graph.has_node(d["to_entity_kwd"]):
+                attrs = json.loads(d["content_with_weight"])
+                graph.add_edge(d["from_entity_kwd"], d["to_entity_kwd"], **attrs)
+        if len(es_res) < bs:
+            break
+
+    if graph.number_of_nodes() == 0 and not src_ids:
+        # Nothing was indexed for this knowledge base.
+        return None
+    src_ids = sorted(src_ids)
+    graph.graph["source_id"] = src_ids
+    return graph