From f56940139855c183167cd7cd1397ac767a30f1cb Mon Sep 17 00:00:00 2001 From: Stephen Hu Date: Fri, 11 Jul 2025 18:21:39 +0800 Subject: [PATCH] Fix: better_handle_different_types (#8775) ### What problem does this PR solve? https://github.com/infiniflow/ragflow/issues/8719#issuecomment-3055883271 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/nlp/__init__.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 9a24d48b2..fd302fac3 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -589,17 +589,21 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。 dels = get_delimiters(delimiter) for text, image in zip(texts, images): - splited_sec = re.split(r"(%s)" % dels, text) - for sub_sec in splited_sec: - if re.match(f"^{dels}$", sub_sec): - continue # if text is tuple, unpack it if isinstance(text, tuple): text_str = text[0] text_pos = text[1] if len(text) > 1 else "" - add_chunk(text_str, image, text_pos) + splited_sec = re.split(r"(%s)" % dels, text_str) + for sub_sec in splited_sec: + if re.match(f"^{dels}$", sub_sec): + continue + add_chunk(sub_sec, image, text_pos) else: - add_chunk(text, image) + splited_sec = re.split(r"(%s)" % dels, text) + for sub_sec in splited_sec: + if re.match(f"^{dels}$", sub_sec): + continue + add_chunk(sub_sec, image) return cks, result_images