From 93f5df716f5a8459bef42f3a58d21186bebc971a Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Fri, 30 May 2025 17:20:53 +0800 Subject: [PATCH] Fix: order chunks from docx by positions. (#7979) ### What problem does this PR solve? #7934 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/nlp/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 06f49623e..f88c059a5 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -279,12 +279,13 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None): def tokenize_chunks_with_images(chunks, doc, eng, images): res = [] # wrap up as es documents - for ck, image in zip(chunks, images): + for ii, (ck, image) in enumerate(zip(chunks, images)): if len(ck.strip()) == 0: continue logging.debug("-- {}".format(ck)) d = copy.deepcopy(doc) d["image"] = image + add_positions(d, [[ii]*5]) tokenize(d, ck, eng) res.append(d) return res