Fix: whole knowledge graph lost after removing any document in the knowledge base (#7151)

### What problem does this PR solve?

When you removed any document in a knowledge base using knowledge graph,
the graph's `removed_kwd` is set to "Y".
However, in the function `graphrag.utils.get_gaph`, `rebuild_graph`
method is passed and directly return `None` while `removed_kwd=Y`,
making residual part of the graph abandoned (but old entity data still
exist in db).

Besides, infinity instance actually pass deleting graph components'
`source_id` when removing document. It may cause wrong graph after
rebuild.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
alkscr
2025-04-30 09:43:17 +08:00
committed by GitHub
parent 538a408608
commit ab27609a64
4 changed files with 137 additions and 70 deletions

View File

@ -42,6 +42,11 @@ from rag.utils.doc_store_conn import (
logger = logging.getLogger('ragflow.infinity_conn')
def field_keyword(field_name: str):
# The "docnm_kwd" field is always a string, not list.
if field_name == "source_id" or (field_name.endswith("_kwd") and field_name != "docnm_kwd" and field_name != "knowledge_graph_kwd"):
return True
return False
def equivalent_condition_to_str(condition: dict, table_instance=None) -> str | None:
assert "_id" not in condition
@ -64,7 +69,20 @@ def equivalent_condition_to_str(condition: dict, table_instance=None) -> str | N
for k, v in condition.items():
if not isinstance(k, str) or k in ["kb_id"] or not v:
continue
if isinstance(v, list):
if field_keyword(k):
if isinstance(v, list):
inCond = list()
for item in v:
if isinstance(item, str):
item = item.replace("'","''")
inCond.append(f"filter_fulltext('{k}', '{item}')")
if inCond:
strInCond = " or ".join(inCond)
strInCond = f"({strInCond})"
cond.append(strInCond)
else:
cond.append(f"filter_fulltext('{k}', '{v}')")
elif isinstance(v, list):
inCond = list()
for item in v:
if isinstance(item, str):
@ -118,7 +136,7 @@ class InfinityConnection(DocStoreConnection):
logger.info(f"Use Infinity {infinity_uri} as the doc engine.")
for _ in range(24):
try:
connPool = ConnectionPool(infinity_uri)
connPool = ConnectionPool(infinity_uri, max_size=32)
inf_conn = connPool.get_conn()
res = inf_conn.show_current_node()
if res.error_code == ErrorCode.OK and res.server_status in ["started", "alive"]:
@ -173,12 +191,6 @@ class InfinityConnection(DocStoreConnection):
ConflictType.Ignore,
)
def field_keyword(self, field_name: str):
# The "docnm_kwd" field is always a string, not list.
if field_name == "source_id" or (field_name.endswith("_kwd") and field_name != "docnm_kwd" and field_name != "knowledge_graph_kwd"):
return True
return False
"""
Database operations
"""
@ -487,7 +499,7 @@ class InfinityConnection(DocStoreConnection):
assert "_id" not in d
assert "id" in d
for k, v in d.items():
if self.field_keyword(k):
if field_keyword(k):
if isinstance(v, list):
d[k] = "###".join(v)
else:
@ -534,9 +546,15 @@ class InfinityConnection(DocStoreConnection):
table_instance = db_instance.get_table(table_name)
#if "exists" in condition:
# del condition["exists"]
clmns = {}
if table_instance:
for n, ty, de, _ in table_instance.show_columns().rows():
clmns[n] = (ty, de)
filter = equivalent_condition_to_str(condition, table_instance)
removeValue = {}
for k, v in list(newValue.items()):
if self.field_keyword(k):
if field_keyword(k):
if isinstance(v, list):
newValue[k] = "###".join(v)
else:
@ -554,13 +572,42 @@ class InfinityConnection(DocStoreConnection):
assert isinstance(v, list)
newValue[k] = "_".join(f"{num:08x}" for num in v)
elif k == "remove":
del newValue[k]
if v in [PAGERANK_FLD]:
newValue[v] = 0
if isinstance(v, str):
assert v in clmns, f"'{v}' should be in '{clmns}'."
ty, de = clmns[v]
if ty.lower().find("cha"):
if not de:
de = ""
newValue[v] = de
else:
for kk, vv in v.items():
removeValue[kk] = vv
del newValue[k]
else:
newValue[k] = v
remove_opt = {} # "[k,new_value]": [id_to_update, ...]
if removeValue:
col_to_remove = list(removeValue.keys())
row_to_opt = table_instance.output(col_to_remove + ['id']).filter(filter).to_df()
logger.debug(f"INFINITY search table {str(table_name)}, filter {filter}, result: {str(row_to_opt[0])}")
row_to_opt = self.getFields(row_to_opt, col_to_remove)
for id, old_v in row_to_opt.items():
for k, remove_v in removeValue.items():
if remove_v in old_v[k]:
new_v = old_v[k].copy()
new_v.remove(remove_v)
kv_key = json.dumps([k, new_v])
if kv_key not in remove_opt:
remove_opt[kv_key] = [id]
else:
remove_opt[kv_key].append(id)
logger.debug(f"INFINITY update table {table_name}, filter {filter}, newValue {newValue}.")
for update_kv, ids in remove_opt.items():
k, v = json.loads(update_kv)
table_instance.update(filter + " AND id in ({0})".format(",".join([f"'{id}'" for id in ids])), {k:"###".join(v)})
table_instance.update(filter, newValue)
self.connPool.release_conn(inf_conn)
return True
@ -613,7 +660,7 @@ class InfinityConnection(DocStoreConnection):
for column in res2.columns:
k = column.lower()
if self.field_keyword(k):
if field_keyword(k):
res2[column] = res2[column].apply(lambda v:[kwd for kwd in v.split("###") if kwd])
elif k == "position_int":
def to_position_int(v):