From 7264fb6978815c0153d71b2dae0cacc981260421 Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Mon, 17 Nov 2025 19:38:26 +0800 Subject: [PATCH] Fix: concat images in word document. (#11310) ### What problem does this PR solve? Fix: concatenate images in Word documents. Partially resolves the issues reported in #11063. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/nlp/__init__.py | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index de7c2ce60..f61019377 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -723,47 +723,40 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"): if not sections: return [], [] - cks = [""] - images = [None] - tk_nums = [0] + cks = [] + images = [] + tk_nums = [] def add_chunk(t, image, pos=""): - nonlocal cks, tk_nums, delimiter + nonlocal cks, images, tk_nums tnum = num_tokens_from_string(t) if tnum < 8: pos = "" - if cks[-1] == "" or tk_nums[-1] > chunk_token_num: - if t.find(pos) < 0: + + if not cks or tk_nums[-1] > chunk_token_num: + # new chunk + if pos and t.find(pos) < 0: t += pos cks.append(t) images.append(image) tk_nums.append(tnum) else: - if cks[-1].find(pos) < 0: + # add to last chunk + if pos and cks[-1].find(pos) < 0: t += pos cks[-1] += t images[-1] = concat_img(images[-1], image) tk_nums[-1] += tnum dels = get_delimiters(delimiter) - line = "" - for sec, image in sections: - if not image: - line += sec + "\n" - continue - split_sec = re.split(r"(%s)" % dels, line + sec) - for sub_sec in split_sec: - if re.match(f"^{dels}$", sub_sec): - continue - add_chunk("\n"+sub_sec, image,"") - line = "" + pattern = r"(%s)" % dels - if line: - split_sec = re.split(r"(%s)" % dels, line) + for sec, image in sections: + split_sec = re.split(pattern, sec) for sub_sec in split_sec: - if re.match(f"^{dels}$", sub_sec): + if not sub_sec or re.match(f"^{dels}$", sub_sec): continue - 
add_chunk("\n"+sub_sec, image,"") + add_chunk("\n" + sub_sec, image, "") return cks, images