Support displaying images in the chunks of docx files when using general parser (#1253)

### What problem does this PR solve? Support displaying images in chunks of docx files when using general parser ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-01-31 23:55:06 +08:00 · 2024-06-24 16:29:36 +08:00
parent 9a0736b20f
commit 38bd02f402
3 changed files with 121 additions and 28 deletions
--- a/rag/nlp/init.py
+++ b/rag/nlp/init.py
@ -24,6 +24,7 @@ import copy
 import roman_numbers as r
 from word2number import w2n
 from cn2an import cn2an
+from PIL import Image

 all_codecs = [
    'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@ -246,6 +247,19 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser):
    return res


+def tokenize_chunks_docx(chunks, doc, eng, images):
+    res = []
+    # wrap up as es documents
+    for ck, image in zip(chunks, images):
+        if len(ck.strip()) == 0:continue
+        print("--", ck)
+        d = copy.deepcopy(doc)
+        d["image"] = image
+        tokenize(d, ck, eng)
+        res.append(d)
+    return res
+
+
 def tokenize_table(tbls, doc, eng, batch_size=10):
    res = []
    # add tables
@ -504,4 +518,54 @@ def docx_question_level(p):
    if p.style.name.startswith('Heading'):
        return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
    else:
-        return 0, re.sub(r"\u3000", " ", p.text).strip()
+        return 0, re.sub(r"\u3000", " ", p.text).strip()
+    
+def concat_img(img1, img2):
+    if img1 and not img2:
+        return img1
+    if not img1 and img2:
+        return img2
+    if not img1 and not img2:
+        return None
+    width1, height1 = img1.size
+    width2, height2 = img2.size
+
+    new_width = max(width1, width2)
+    new_height = height1 + height2
+    new_image = Image.new('RGB', (new_width, new_height))
+
+    new_image.paste(img1, (0, 0))
+    new_image.paste(img2, (0, height1))
+
+    return new_image
+
+def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。；！？"):
+    if not sections:
+        return []
+
+    cks = [""]
+    images = [None]
+    tk_nums = [0]
+
+    def add_chunk(t, image, pos=""):
+        nonlocal cks, tk_nums, delimiter
+        tnum = num_tokens_from_string(t)
+        if tnum < 8:
+            pos = ""
+        if tk_nums[-1] > chunk_token_num:
+            if t.find(pos) < 0:
+                t += pos
+            cks.append(t)
+            images.append(image)
+            tk_nums.append(tnum)
+        else:
+            if cks[-1].find(pos) < 0:
+                t += pos
+            cks[-1] += t
+            images[-1] = concat_img(images[-1], image)
+            tk_nums[-1] += tnum
+
+    for sec, image in sections:
+        add_chunk(sec, image, '')
+
+    return cks, images