Support displaying images in the chunks of docx files when using general parser (#1253)

### What problem does this PR solve?

Support displaying images in chunks of docx files when using the general parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-01-31 23:55:06 +08:00 · 2024-06-24 16:29:36 +08:00
parent 9a0736b20f
commit 38bd02f402
3 changed files with 121 additions and 28 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -16,16 +16,28 @@ from docx import Document
 from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
-from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
-
+from PIL import Image
+from functools import reduce

 class Docx(DocxParser):
    def __init__(self):
        pass

+    def get_picture(self, document, paragraph):
+        img = paragraph._element.xpath('.//pic:pic')
+        if not img:
+            return None
+        img = img[0]
+        embed = img.xpath('.//a:blip/@r:embed')[0]
+        related_part = document.part.related_parts[embed]
+        image = related_part.image
+        image = Image.open(BytesIO(image.blob)).convert('RGB')
+        return image
+
    def __clean(self, line):
        line = re.sub(r"\u3000", " ", line).strip()
        return line
@@ -35,17 +47,41 @@ class Docx(DocxParser):
            filename) if not binary else Document(BytesIO(binary))
        pn = 0
        lines = []
+        last_image = None
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
-            if from_page <= pn < to_page and p.text.strip():
-                lines.append(self.__clean(p.text))
+            if from_page <= pn < to_page:
+                current_image = None
+                if p.text.strip():
+                    if p.style.name == 'Caption':
+                        former_image = None
+                        if lines and lines[-1][1] and lines[-1][2] != 'Caption':
+                            former_image = lines[-1][1].pop()
+                        elif last_image:
+                            former_image = last_image
+                            last_image = None
+                        lines.append((self.__clean(p.text), [former_image], p.style.name))
+                    else:
+                        current_image = self.get_picture(self.doc, p)
+                        image_list = [current_image]
+                        if last_image:
+                            image_list.insert(0, last_image)
+                            last_image = None
+                        lines.append((self.__clean(p.text), image_list, p.style.name))
+                else:
+                    if current_image := self.get_picture(self.doc, p):
+                        if lines:
+                            lines[-1][1].append(current_image)
+                        else:
+                            last_image = current_image
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1
+        new_line = [(line[0], reduce(concat_img, line[1])) for line in lines]
        tbls = []
        for tb in self.doc.tables:
            html= "<table>"
@@ -64,7 +100,7 @@ class Docx(DocxParser):
                html += "</tr>"
            html += "</table>"
            tbls.append(((None, html), ""))
-        return [(l, "") for l in lines if l], tbls
+        return new_line, tbls


 class Pdf(PdfParser):
@@ -123,8 +159,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections, tbls = Docx()(filename, binary)
-        res = tokenize_table(tbls, doc, eng)
+        res = tokenize_table(tbls, doc, eng)    # just for table
+
        callback(0.8, "Finish parsing.")
+        st = timer()
+
+        chunks, images = naive_merge_docx(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。；！？"))
+
+        res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
+        cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
+        return res

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf(