Feat: Pipeline Docx file supports Markdown output (#10439)

### What problem does this PR solve? Pipeline Docx file supports Markdown output. <img width="1242" height="755" alt="image" src="https://github.com/user-attachments/assets/63cca75b-20b9-4a90-a01c-c0c2fccf1f2a" /> <img width="1227" height="717" alt="image" src="https://github.com/user-attachments/assets/0dcb94b2-7ba0-48d5-9231-dc6e5c4b4192" /> ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-02-02 00:25:06 +08:00 · 2025-10-10 09:39:15 +08:00
parent d931c33ced
commit 8aabc2807c
4 changed files with 256 additions and 6 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -256,6 +256,49 @@ class Docx(DocxParser):
            tbls.append(((None, html), ""))
        return new_line, tbls

+    def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
+        """
+        This function uses mammoth, licensed under the BSD 2-Clause License.
+        """
+
+        import base64
+        import uuid
+
+        import mammoth
+        from markdownify import markdownify
+
+        docx_file = BytesIO(binary) if binary else open(filename, "rb")
+
+        def _convert_image_to_base64(image):
+            try:
+                with image.open() as image_file:
+                    image_bytes = image_file.read()
+                encoded = base64.b64encode(image_bytes).decode("utf-8")
+                base64_url = f"data:{image.content_type};base64,{encoded}"
+
+                alt_name = "image"
+                alt_name = f"img_{uuid.uuid4().hex[:8]}"
+
+                return {"src": base64_url, "alt": alt_name}
+            except Exception as e:
+                logging.warning(f"Failed to convert image to base64: {e}")
+                return {"src": "", "alt": "image"}
+
+        try:
+            if inline_images:
+                result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
+            else:
+                result = mammoth.convert_to_html(docx_file)
+
+            html = result.value
+
+            markdown_text = markdownify(html)
+            return markdown_text
+
+        finally:
+            if not binary:
+                docx_file.close()
+

 class Pdf(PdfParser):
    def __init__(self):
@ -512,7 +555,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
        except Exception:
            vision_model = None
-        
+
        if vision_model:
            # Process images for each section
            section_images = []