Optimize graphrag again (#6513)

### What problem does this PR solve?

Removed set_entity and set_relation to avoid accessing the doc engine during
graph computation.
Introduced GraphChange to avoid writing unchanged chunks.

### Type of change

- [x] Performance Improvement
This commit is contained in:
Zhichang Yu
2025-03-26 15:34:42 +08:00
committed by GitHub
parent 7a677cb095
commit 6bf26e2a81
19 changed files with 466 additions and 530 deletions

View File

@ -2,7 +2,7 @@
# Licensed under the MIT License
"""
Reference:
- [graphrag](https://github.com/microsoft/graphrag)
- [GraphRAG](https://github.com/microsoft/graphrag/blob/main/graphrag/prompts/index/community_report.py)
"""
COMMUNITY_REPORT_PROMPT = """

View File

@ -40,13 +40,9 @@ class CommunityReportsExtractor(Extractor):
def __init__(
self,
llm_invoker: CompletionLLM,
get_entity: Callable | None = None,
set_entity: Callable | None = None,
get_relation: Callable | None = None,
set_relation: Callable | None = None,
max_report_length: int | None = None,
):
super().__init__(llm_invoker, get_entity=get_entity, set_entity=set_entity, get_relation=get_relation, set_relation=set_relation)
super().__init__(llm_invoker)
"""Init method definition."""
self._llm = llm_invoker
self._extraction_prompt = COMMUNITY_REPORT_PROMPT
@ -63,21 +59,28 @@ class CommunityReportsExtractor(Extractor):
over, token_count = 0, 0
async def extract_community_report(community):
nonlocal res_str, res_dict, over, token_count
cm_id, ents = community
weight = ents["weight"]
ents = ents["nodes"]
ent_df = pd.DataFrame(self._get_entity_(ents)).dropna()
if ent_df.empty or "entity_name" not in ent_df.columns:
cm_id, cm = community
weight = cm["weight"]
ents = cm["nodes"]
if len(ents) < 2:
return
ent_df["entity"] = ent_df["entity_name"]
del ent_df["entity_name"]
rela_df = pd.DataFrame(self._get_relation_(list(ent_df["entity"]), list(ent_df["entity"]), 10000))
if rela_df.empty:
return
rela_df["source"] = rela_df["src_id"]
rela_df["target"] = rela_df["tgt_id"]
del rela_df["src_id"]
del rela_df["tgt_id"]
ent_list = [{"entity": ent, "description": graph.nodes[ent]["description"]} for ent in ents]
ent_df = pd.DataFrame(ent_list)
rela_list = []
k = 0
for i in range(0, len(ents)):
if k >= 10000:
break
for j in range(i + 1, len(ents)):
if k >= 10000:
break
edge = graph.get_edge_data(ents[i], ents[j])
if edge is None:
continue
rela_list.append({"source": ents[i], "target": ents[j], "description": edge["description"]})
k += 1
rela_df = pd.DataFrame(rela_list)
prompt_variables = {
"entity_df": ent_df.to_csv(index_label="id"),

View File

@ -19,10 +19,11 @@ from collections import defaultdict, Counter
from copy import deepcopy
from typing import Callable
import trio
import networkx as nx
from graphrag.general.graph_prompt import SUMMARIZE_DESCRIPTIONS_PROMPT
from graphrag.utils import get_llm_cache, set_llm_cache, handle_single_entity_extraction, \
handle_single_relationship_extraction, split_string_by_multi_markers, flat_uniq_list, chat_limiter
handle_single_relationship_extraction, split_string_by_multi_markers, flat_uniq_list, chat_limiter, get_from_to, GraphChange
from rag.llm.chat_model import Base as CompletionLLM
from rag.prompts import message_fit_in
from rag.utils import truncate
@ -40,18 +41,10 @@ class Extractor:
llm_invoker: CompletionLLM,
language: str | None = "English",
entity_types: list[str] | None = None,
get_entity: Callable | None = None,
set_entity: Callable | None = None,
get_relation: Callable | None = None,
set_relation: Callable | None = None,
):
self._llm = llm_invoker
self._language = language
self._entity_types = entity_types or DEFAULT_ENTITY_TYPES
self._get_entity_ = get_entity
self._set_entity_ = set_entity
self._get_relation_ = get_relation
self._set_relation_ = set_relation
def _chat(self, system, history, gen_conf):
hist = deepcopy(history)
@ -152,25 +145,15 @@ class Extractor:
async def _merge_nodes(self, entity_name: str, entities: list[dict], all_relationships_data):
if not entities:
return
already_entity_types = []
already_source_ids = []
already_description = []
already_node = self._get_entity_(entity_name)
if already_node:
already_entity_types.append(already_node["entity_type"])
already_source_ids.extend(already_node["source_id"])
already_description.append(already_node["description"])
entity_type = sorted(
Counter(
[dp["entity_type"] for dp in entities] + already_entity_types
[dp["entity_type"] for dp in entities]
).items(),
key=lambda x: x[1],
reverse=True,
)[0][0]
description = GRAPH_FIELD_SEP.join(
sorted(set([dp["description"] for dp in entities] + already_description))
sorted(set([dp["description"] for dp in entities]))
)
already_source_ids = flat_uniq_list(entities, "source_id")
description = await self._handle_entity_relation_summary(entity_name, description)
@ -180,7 +163,6 @@ class Extractor:
source_id=already_source_ids,
)
node_data["entity_name"] = entity_name
self._set_entity_(entity_name, node_data)
all_relationships_data.append(node_data)
async def _merge_edges(
@ -192,36 +174,11 @@ class Extractor:
):
if not edges_data:
return
already_weights = []
already_source_ids = []
already_description = []
already_keywords = []
relation = self._get_relation_(src_id, tgt_id)
if relation:
already_weights = [relation["weight"]]
already_source_ids = relation["source_id"]
already_description = [relation["description"]]
already_keywords = relation["keywords"]
weight = sum([dp["weight"] for dp in edges_data] + already_weights)
description = GRAPH_FIELD_SEP.join(
sorted(set([dp["description"] for dp in edges_data] + already_description))
)
keywords = flat_uniq_list(edges_data, "keywords") + already_keywords
source_id = flat_uniq_list(edges_data, "source_id") + already_source_ids
for need_insert_id in [src_id, tgt_id]:
if self._get_entity_(need_insert_id):
continue
self._set_entity_(need_insert_id, {
"source_id": source_id,
"description": description,
"entity_type": 'UNKNOWN'
})
description = await self._handle_entity_relation_summary(
f"({src_id}, {tgt_id})", description
)
weight = sum([edge["weight"] for edge in edges_data])
description = GRAPH_FIELD_SEP.join(sorted(set([edge["description"] for edge in edges_data])))
description = await self._handle_entity_relation_summary(f"{src_id} -> {tgt_id}", description)
keywords = flat_uniq_list(edges_data, "keywords")
source_id = flat_uniq_list(edges_data, "source_id")
edge_data = dict(
src_id=src_id,
tgt_id=tgt_id,
@ -230,9 +187,41 @@ class Extractor:
weight=weight,
source_id=source_id
)
self._set_relation_(src_id, tgt_id, edge_data)
if all_relationships_data is not None:
all_relationships_data.append(edge_data)
all_relationships_data.append(edge_data)
async def _merge_graph_nodes(self, graph: nx.Graph, nodes: list[str], change: GraphChange):
    """Merge every node in ``nodes[1:]`` into ``nodes[0]``, in place on ``graph``.

    The surviving node keeps its own attributes except for:
      * ``description`` - the merged nodes' descriptions are appended with
        ``GRAPH_FIELD_SEP`` and the result is passed through
        ``_handle_entity_relation_summary``;
      * ``keywords`` / ``source_id`` - replaced by the sorted union of the
        values of all merged nodes (an attribute missing on both sides is
        left untouched).

    Each edge of a merged node is recorded in ``change.removed_edges``. If its
    other endpoint is not itself being merged, the edge is re-attached to
    ``nodes[0]``: either combined with an existing ``nodes[0]`` edge (weights
    summed, descriptions joined and summarised, keywords/source_id unioned) or
    added with the merged node's edge attributes.

    Args:
        graph: graph to modify.
        nodes: names of the nodes to merge; ``nodes[0]`` survives. Nothing is
            done when fewer than two names are given.
        change: accumulator that receives the added/updated and removed
            nodes and edges.
    """
    if len(nodes) <= 1:
        return
    change.added_updated_nodes.add(nodes[0])
    # NOTE(review): `.extend` assumes `change.removed_nodes` is a list; if
    # GraphChange declares it as a set this must be `.update` - confirm.
    change.removed_nodes.extend(nodes[1:])
    nodes_set = set(nodes)
    node0_attrs = graph.nodes[nodes[0]]
    # Snapshot of the survivor's neighbours taken before any merging.
    node0_neighbors = set(graph.neighbors(nodes[0]))
    for node1 in nodes[1:]:
        # Merge two nodes, keep "entity_name", "entity_type", "page_rank" unchanged.
        node1_attrs = graph.nodes[node1]
        node0_attrs["description"] += f"{GRAPH_FIELD_SEP}{node1_attrs['description']}"
        for attr in ["keywords", "source_id"]:
            if attr not in node0_attrs and attr not in node1_attrs:
                continue
            # list.extend() returns None, so the union is built explicitly
            # instead of wrapping the result of extend() in set().
            node0_attrs[attr] = sorted(set(node0_attrs.get(attr, [])) | set(node1_attrs.get(attr, [])))
        for neighbor in graph.neighbors(node1):
            change.removed_edges.add(get_from_to(node1, neighbor))
            if neighbor not in nodes_set:
                edge1_attrs = graph.get_edge_data(node1, neighbor)
                if neighbor in node0_neighbors:
                    # Merge two edges
                    change.added_updated_edges.add(get_from_to(nodes[0], neighbor))
                    edge0_attrs = graph.get_edge_data(nodes[0], neighbor)
                    edge0_attrs["weight"] += edge1_attrs["weight"]
                    edge0_attrs["description"] += f"{GRAPH_FIELD_SEP}{edge1_attrs['description']}"
                    for attr in ["keywords", "source_id"]:
                        edge0_attrs[attr] = sorted(set(edge0_attrs[attr]) | set(edge1_attrs[attr]))
                    edge0_attrs["description"] = await self._handle_entity_relation_summary(f"({nodes[0]}, {neighbor})", edge0_attrs["description"])
                    graph.add_edge(nodes[0], neighbor, **edge0_attrs)
                else:
                    # NOTE(review): this new edge is not added to
                    # change.added_updated_edges nor to node0_neighbors, so a
                    # later merged node sharing this neighbour overwrites it
                    # rather than merging - confirm whether that is intended.
                    graph.add_edge(nodes[0], neighbor, **edge1_attrs)
        graph.remove_node(node1)
    node0_attrs["description"] = await self._handle_entity_relation_summary(nodes[0], node0_attrs["description"])
    graph.nodes[nodes[0]].update(node0_attrs)
async def _handle_entity_relation_summary(
self,

View File

@ -6,7 +6,7 @@ Reference:
"""
import re
from typing import Any, Callable
from typing import Any
from dataclasses import dataclass
import tiktoken
import trio
@ -53,10 +53,6 @@ class GraphExtractor(Extractor):
llm_invoker: CompletionLLM,
language: str | None = "English",
entity_types: list[str] | None = None,
get_entity: Callable | None = None,
set_entity: Callable | None = None,
get_relation: Callable | None = None,
set_relation: Callable | None = None,
tuple_delimiter_key: str | None = None,
record_delimiter_key: str | None = None,
input_text_key: str | None = None,
@ -66,7 +62,7 @@ class GraphExtractor(Extractor):
max_gleanings: int | None = None,
on_error: ErrorHandlerFn | None = None,
):
super().__init__(llm_invoker, language, entity_types, get_entity, set_entity, get_relation, set_relation)
super().__init__(llm_invoker, language, entity_types)
"""Init method definition."""
# TODO: streamline construction
self._llm = llm_invoker

View File

@ -2,7 +2,7 @@
# Licensed under the MIT License
"""
Reference:
- [graphrag](https://github.com/microsoft/graphrag)
- [GraphRAG](https://github.com/microsoft/graphrag/blob/main/graphrag/prompts/index/extract_graph.py)
"""
GRAPH_EXTRACTION_PROMPT = """

View File

@ -15,11 +15,11 @@
#
import json
import logging
from functools import partial
import networkx as nx
import trio
from api import settings
from api.utils import get_uuid
from graphrag.light.graph_extractor import GraphExtractor as LightKGExt
from graphrag.general.graph_extractor import GraphExtractor as GeneralKGExt
from graphrag.general.community_reports_extractor import CommunityReportsExtractor
@ -27,32 +27,15 @@ from graphrag.entity_resolution import EntityResolution
from graphrag.general.extractor import Extractor
from graphrag.utils import (
graph_merge,
set_entity,
get_relation,
set_relation,
get_entity,
get_graph,
set_graph,
chunk_id,
update_nodes_pagerank_nhop_neighbour,
does_graph_contains,
get_graph_doc_ids,
tidy_graph,
GraphChange,
)
from rag.nlp import rag_tokenizer, search
from rag.utils.redis_conn import REDIS_CONN
def graphrag_task_set(tenant_id, kb_id, doc_id) -> bool:
    """Store ``doc_id`` under the ``graphrag:{tenant_id}:{kb_id}`` Redis key.

    The key is written through ``REDIS_CONN.set`` with ``exp=3600 * 24``.

    Returns:
        True when the write is reported as successful.

    Raises:
        Exception: if ``REDIS_CONN.set`` returns a falsy result.
    """
    key = f"graphrag:{tenant_id}:{kb_id}"
    ok = REDIS_CONN.set(key, doc_id, exp=3600 * 24)
    if not ok:
        raise Exception(f"Faild to set the {key} to {doc_id}")
    # Honour the declared ``-> bool`` return type instead of returning None.
    return True
def graphrag_task_get(tenant_id, kb_id) -> str | None:
    """Return whatever ``REDIS_CONN.get`` yields for the ``graphrag:{tenant_id}:{kb_id}`` key."""
    return REDIS_CONN.get(f"graphrag:{tenant_id}:{kb_id}")
from rag.utils.redis_conn import RedisDistributedLock
async def run_graphrag(
@ -72,7 +55,7 @@ async def run_graphrag(
):
chunks.append(d["content_with_weight"])
graph, doc_ids = await update_graph(
subgraph = await generate_subgraph(
LightKGExt
if row["parser_config"]["graphrag"]["method"] != "general"
else GeneralKGExt,
@ -86,14 +69,26 @@ async def run_graphrag(
embedding_model,
callback,
)
if not graph:
new_graph = None
if subgraph:
new_graph = await merge_subgraph(
tenant_id,
kb_id,
doc_id,
subgraph,
embedding_model,
callback,
)
if not with_resolution or not with_community:
return
if with_resolution or with_community:
graphrag_task_set(tenant_id, kb_id, doc_id)
if with_resolution:
if new_graph is None:
new_graph = await get_graph(tenant_id, kb_id)
if with_resolution and new_graph is not None:
await resolve_entities(
graph,
doc_ids,
new_graph,
tenant_id,
kb_id,
doc_id,
@ -101,10 +96,9 @@ async def run_graphrag(
embedding_model,
callback,
)
if with_community:
if with_community and new_graph is not None:
await extract_community(
graph,
doc_ids,
new_graph,
tenant_id,
kb_id,
doc_id,
@ -117,7 +111,7 @@ async def run_graphrag(
return
async def update_graph(
async def generate_subgraph(
extractor: Extractor,
tenant_id: str,
kb_id: str,
@ -131,34 +125,41 @@ async def update_graph(
):
contains = await does_graph_contains(tenant_id, kb_id, doc_id)
if contains:
callback(msg=f"Graph already contains {doc_id}, cancel myself")
return None, None
callback(msg=f"Graph already contains {doc_id}")
return None
start = trio.current_time()
ext = extractor(
llm_bdl,
language=language,
entity_types=entity_types,
get_entity=partial(get_entity, tenant_id, kb_id),
set_entity=partial(set_entity, tenant_id, kb_id, embed_bdl),
get_relation=partial(get_relation, tenant_id, kb_id),
set_relation=partial(set_relation, tenant_id, kb_id, embed_bdl),
)
ents, rels = await ext(doc_id, chunks, callback)
subgraph = nx.Graph()
for en in ents:
subgraph.add_node(en["entity_name"], entity_type=en["entity_type"])
for ent in ents:
assert "description" in ent, f"entity {ent} does not have description"
ent["source_id"] = [doc_id]
subgraph.add_node(ent["entity_name"], **ent)
ignored_rels = 0
for rel in rels:
assert "description" in rel, f"relation {rel} does not have description"
if not subgraph.has_node(rel["src_id"]) or not subgraph.has_node(rel["tgt_id"]):
ignored_rels += 1
continue
rel["source_id"] = [doc_id]
subgraph.add_edge(
rel["src_id"],
rel["tgt_id"],
weight=rel["weight"],
# description=rel["description"]
**rel,
)
# TODO: infinity doesn't support array search
if ignored_rels:
callback(msg=f"ignored {ignored_rels} relations due to missing entities.")
tidy_graph(subgraph, callback)
subgraph.graph["source_id"] = [doc_id]
chunk = {
"content_with_weight": json.dumps(
nx.node_link_data(subgraph, edges="edges"), ensure_ascii=False, indent=2
nx.node_link_data(subgraph, edges="edges"), ensure_ascii=False
),
"knowledge_graph_kwd": "subgraph",
"kb_id": kb_id,
@ -167,6 +168,11 @@ async def update_graph(
"removed_kwd": "N",
}
cid = chunk_id(chunk)
await trio.to_thread.run_sync(
lambda: settings.docStoreConn.delete(
{"knowledge_graph_kwd": "subgraph", "source_id": doc_id}, search.index_name(tenant_id), kb_id
)
)
await trio.to_thread.run_sync(
lambda: settings.docStoreConn.insert(
[{"id": cid, **chunk}], search.index_name(tenant_id), kb_id
@ -174,39 +180,49 @@ async def update_graph(
)
now = trio.current_time()
callback(msg=f"generated subgraph for doc {doc_id} in {now - start:.2f} seconds.")
start = now
return subgraph
async def merge_subgraph(
tenant_id: str,
kb_id: str,
doc_id: str,
subgraph: nx.Graph,
embedding_model,
callback,
):
graphrag_task_lock = RedisDistributedLock("graphrag_task", lock_value=doc_id, timeout=600)
while True:
if graphrag_task_lock.acquire():
break
callback(msg=f"merge_subgraph {doc_id} is waiting graphrag_task_lock")
await trio.sleep(10)
start = trio.current_time()
change = GraphChange()
old_graph = await get_graph(tenant_id, kb_id)
if old_graph is not None:
logging.info("Merge with an exiting graph...................")
tidy_graph(old_graph, callback)
new_graph = graph_merge(old_graph, subgraph, change)
else:
new_graph = subgraph
now_docids = set([doc_id])
old_graph, old_doc_ids = await get_graph(tenant_id, kb_id)
if old_graph is not None:
logging.info("Merge with an exiting graph...................")
new_graph = graph_merge(old_graph, subgraph)
await update_nodes_pagerank_nhop_neighbour(tenant_id, kb_id, new_graph, 2)
if old_doc_ids:
for old_doc_id in old_doc_ids:
now_docids.add(old_doc_id)
old_doc_ids2 = await get_graph_doc_ids(tenant_id, kb_id)
delta_doc_ids = set(old_doc_ids2) - set(old_doc_ids)
if delta_doc_ids:
callback(
msg="The global graph has changed during merging, try again"
)
await trio.sleep(1)
continue
break
await set_graph(tenant_id, kb_id, new_graph, list(now_docids))
change.added_updated_nodes = set(new_graph.nodes())
change.added_updated_edges = set(new_graph.edges())
pr = nx.pagerank(new_graph)
for node_name, pagerank in pr.items():
new_graph.nodes[node_name]["pagerank"] = pagerank
await set_graph(tenant_id, kb_id, embedding_model, new_graph, change, callback)
graphrag_task_lock.release()
now = trio.current_time()
callback(
msg=f"merging subgraph for doc {doc_id} into the global graph done in {now - start:.2f} seconds."
)
return new_graph, now_docids
return new_graph
async def resolve_entities(
graph,
doc_ids,
tenant_id: str,
kb_id: str,
doc_id: str,
@ -214,74 +230,30 @@ async def resolve_entities(
embed_bdl,
callback,
):
working_doc_id = graphrag_task_get(tenant_id, kb_id)
if doc_id != working_doc_id:
callback(
msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
)
return
graphrag_task_lock = RedisDistributedLock("graphrag_task", lock_value=doc_id, timeout=600)
while True:
if graphrag_task_lock.acquire():
break
await trio.sleep(10)
start = trio.current_time()
er = EntityResolution(
llm_bdl,
get_entity=partial(get_entity, tenant_id, kb_id),
set_entity=partial(set_entity, tenant_id, kb_id, embed_bdl),
get_relation=partial(get_relation, tenant_id, kb_id),
set_relation=partial(set_relation, tenant_id, kb_id, embed_bdl),
)
reso = await er(graph, callback=callback)
graph = reso.graph
callback(msg=f"Graph resolution removed {len(reso.removed_entities)} nodes.")
await update_nodes_pagerank_nhop_neighbour(tenant_id, kb_id, graph, 2)
change = reso.change
callback(msg=f"Graph resolution removed {len(change.removed_nodes)} nodes and {len(change.removed_edges)} edges.")
callback(msg="Graph resolution updated pagerank.")
working_doc_id = graphrag_task_get(tenant_id, kb_id)
if doc_id != working_doc_id:
callback(
msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
)
return
await set_graph(tenant_id, kb_id, graph, doc_ids)
await trio.to_thread.run_sync(
lambda: settings.docStoreConn.delete(
{
"knowledge_graph_kwd": "relation",
"kb_id": kb_id,
"from_entity_kwd": reso.removed_entities,
},
search.index_name(tenant_id),
kb_id,
)
)
await trio.to_thread.run_sync(
lambda: settings.docStoreConn.delete(
{
"knowledge_graph_kwd": "relation",
"kb_id": kb_id,
"to_entity_kwd": reso.removed_entities,
},
search.index_name(tenant_id),
kb_id,
)
)
await trio.to_thread.run_sync(
lambda: settings.docStoreConn.delete(
{
"knowledge_graph_kwd": "entity",
"kb_id": kb_id,
"entity_kwd": reso.removed_entities,
},
search.index_name(tenant_id),
kb_id,
)
)
await set_graph(tenant_id, kb_id, embed_bdl, graph, change, callback)
graphrag_task_lock.release()
now = trio.current_time()
callback(msg=f"Graph resolution done in {now - start:.2f}s.")
async def extract_community(
graph,
doc_ids,
tenant_id: str,
kb_id: str,
doc_id: str,
@ -289,49 +261,34 @@ async def extract_community(
embed_bdl,
callback,
):
working_doc_id = graphrag_task_get(tenant_id, kb_id)
if doc_id != working_doc_id:
callback(
msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
)
return
graphrag_task_lock = RedisDistributedLock("graphrag_task", lock_value=doc_id, timeout=600)
while True:
if graphrag_task_lock.acquire():
break
await trio.sleep(10)
start = trio.current_time()
ext = CommunityReportsExtractor(
llm_bdl,
get_entity=partial(get_entity, tenant_id, kb_id),
set_entity=partial(set_entity, tenant_id, kb_id, embed_bdl),
get_relation=partial(get_relation, tenant_id, kb_id),
set_relation=partial(set_relation, tenant_id, kb_id, embed_bdl),
)
cr = await ext(graph, callback=callback)
community_structure = cr.structured_output
community_reports = cr.output
working_doc_id = graphrag_task_get(tenant_id, kb_id)
if doc_id != working_doc_id:
callback(
msg=f"Another graphrag task of doc_id {working_doc_id} is working on this kb, cancel myself"
)
return
await set_graph(tenant_id, kb_id, graph, doc_ids)
doc_ids = graph.graph["source_id"]
now = trio.current_time()
callback(
msg=f"Graph extracted {len(cr.structured_output)} communities in {now - start:.2f}s."
)
start = now
await trio.to_thread.run_sync(
lambda: settings.docStoreConn.delete(
{"knowledge_graph_kwd": "community_report", "kb_id": kb_id},
search.index_name(tenant_id),
kb_id,
)
)
chunks = []
for stru, rep in zip(community_structure, community_reports):
obj = {
"report": rep,
"evidences": "\n".join([f["explanation"] for f in stru["findings"]]),
}
chunk = {
"id": get_uuid(),
"docnm_kwd": stru["title"],
"title_tks": rag_tokenizer.tokenize(stru["title"]),
"content_with_weight": json.dumps(obj, ensure_ascii=False),
@ -349,17 +306,23 @@ async def extract_community(
chunk["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(
chunk["content_ltks"]
)
# try:
# ebd, _ = embed_bdl.encode([", ".join(community["entities"])])
# chunk["q_%d_vec" % len(ebd[0])] = ebd[0]
# except Exception as e:
# logging.exception(f"Fail to embed entity relation: {e}")
await trio.to_thread.run_sync(
lambda: settings.docStoreConn.insert(
[{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id)
)
)
chunks.append(chunk)
await trio.to_thread.run_sync(
lambda: settings.docStoreConn.delete(
{"knowledge_graph_kwd": "community_report", "kb_id": kb_id},
search.index_name(tenant_id),
kb_id,
)
)
es_bulk_size = 4
for b in range(0, len(chunks), es_bulk_size):
doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(tenant_id), kb_id))
if doc_store_result:
error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
raise Exception(error_message)
graphrag_task_lock.release()
now = trio.current_time()
callback(
msg=f"Graph indexed {len(cr.structured_output)} communities in {now - start:.2f}s."

View File

@ -100,7 +100,8 @@ def run(graph: nx.Graph, args: dict[str, Any]) -> dict[int, dict[str, dict]]:
logging.debug(
"Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc
)
if not graph.nodes():
nodes = set(graph.nodes())
if not nodes:
return {}
node_id_to_community_map = _compute_leiden_communities(
@ -120,7 +121,7 @@ def run(graph: nx.Graph, args: dict[str, Any]) -> dict[int, dict[str, dict]]:
result = {}
results_by_level[level] = result
for node_id, raw_community_id in node_id_to_community_map[level].items():
if node_id not in graph.nodes:
if node_id not in nodes:
logging.warning(f"Node {node_id} not found in the graph.")
continue
community_id = str(raw_community_id)