From 93f5df716f5a8459bef42f3a58d21186bebc971a Mon Sep 17 00:00:00 2001
From: Kevin Hu <kevinhu.sh@gmail.com>
Date: Fri, 30 May 2025 17:20:53 +0800
Subject: [PATCH] Fix: order chunks from docx by positions. (#7979)

### What problem does this PR solve?

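Issue #7934: chunks parsed from docx files are now ordered by position. Each chunk built by `tokenize_chunks_with_images` records its index in the chunk list as its position via `add_positions`.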

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 rag/nlp/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 06f49623e..f88c059a5 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -279,12 +279,13 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
 def tokenize_chunks_with_images(chunks, doc, eng, images):
     res = []
     # wrap up as es documents
-    for ck, image in zip(chunks, images):
+    for ii, (ck, image) in enumerate(zip(chunks, images)):
         if len(ck.strip()) == 0:
             continue
         logging.debug("-- {}".format(ck))
         d = copy.deepcopy(doc)
         d["image"] = image
+        add_positions(d, [[ii]*5])
         tokenize(d, ck, eng)
         res.append(d)
     return res