Feat: Markdown add image (#7124)

### What problem does this PR solve? https://github.com/infiniflow/ragflow/issues/6984 1. The Markdown parser can now extract images. 2. The Naive chunker now handles images when processing Markdown. 3. Improved chunk merging. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2026-02-01 16:15:07 +08:00 · 2025-04-25 18:35:28 +08:00
parent fef44a71c5
commit 1662c7eda3
4 changed files with 120 additions and 18 deletions
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -276,8 +276,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
        res.append(d)
    return res

-
-def tokenize_chunks_docx(chunks, doc, eng, images):
+def tokenize_chunks_with_images(chunks, doc, eng, images):
    res = []
    # wrap up as es documents
    for ck, image in zip(chunks, images):
@@ -290,7 +289,6 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
        res.append(d)
    return res

-
 def tokenize_table(tbls, doc, eng, batch_size=10):
    res = []
    # add tables
@@ -539,7 +537,46 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。；！？"):
        add_chunk(sec, pos)

    return cks
+    

+def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。；！？"):
+    """Merge text sections into chunks and combine the images that go with them.
+
+    A section joins the current chunk unless that chunk already holds more than
+    ``chunk_token_num`` tokens, in which case it opens a new chunk. The limit is
+    soft: a chunk may overshoot it by the section that pushed it over.
+
+    Args:
+        texts: Section strings, or tuples whose first item is the string.
+        images: One image (or None) per section, aligned with ``texts``.
+        chunk_token_num: Token count above which a chunk stops growing.
+        delimiter: Unused; kept so the signature mirrors ``naive_merge``.
+
+    Returns:
+        ``(chunks, chunk_images)``, two lists of equal length; ``([], [])`` if
+        ``texts`` is empty or its length differs from that of ``images``.
+    """
+    if not texts or len(texts) != len(images):
+        return [], []
+    # Check every section, not just the first, so mixed str/tuple input works.
+    texts = [t[0] if isinstance(t, tuple) else t for t in texts]
+    cks = [""]
+    result_images = [None]
+    tk_nums = [0]
+    for text, image in zip(texts, images):
+        tnum = num_tokens_from_string(text)
+        if tk_nums[-1] > chunk_token_num:
+            cks.append(text)
+            result_images.append(image)
+            tk_nums.append(tnum)
+            continue
+        cks[-1] += text
+        if result_images[-1] is None:
+            result_images[-1] = image
+        else:
+            result_images[-1] = concat_img(result_images[-1], image)
+        tk_nums[-1] += tnum
+    return cks, result_images

 def docx_question_level(p, bull=-1):
    txt = re.sub(r"\u3000", " ", p.text).strip()