Feat: Pipeline Docx file supports Markdown output (#10439)

### What problem does this PR solve?

The pipeline's Docx (Word) parser now supports Markdown output.

<img width="1242" height="755" alt="image" src="https://github.com/user-attachments/assets/63cca75b-20b9-4a90-a01c-c0c2fccf1f2a" />
<img width="1227" height="717" alt="image" src="https://github.com/user-attachments/assets/0dcb94b2-7ba0-48d5-9231-dc6e5c4b4192" />

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-01-23 03:26:53 +08:00 · 2025-10-10 09:39:15 +08:00
parent d931c33ced
commit 8aabc2807c
4 changed files with 256 additions and 6 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -256,6 +256,49 @@ class Docx(DocxParser):
            tbls.append(((None, html), ""))
        return new_line, tbls

+    def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
+        """Convert a DOCX document to Markdown (mammoth -> HTML -> markdownify).
+
+        This function uses mammoth, licensed under the BSD 2-Clause License.
+
+        Args:
+            filename: Path opened in binary mode when ``binary`` is falsy.
+            binary: Raw DOCX bytes; takes precedence over ``filename``.
+            inline_images: If true, images are embedded as base64 ``data:``
+                URIs with generated alt text; otherwise no image converter is
+                passed and mammoth's default image handling applies.
+
+        Returns:
+            The Markdown text produced by ``markdownify`` from mammoth's HTML.
+        """
+        # Function-scope imports: loaded when this method runs, not at module import.
+        import base64
+        import uuid
+
+        import mammoth
+        from markdownify import markdownify
+
+        def _convert_image_to_base64(image):
+            # Best effort: a failing image yields an empty src instead of raising.
+            try:
+                with image.open() as image_file:
+                    encoded = base64.b64encode(image_file.read()).decode("utf-8")
+                src = f"data:{image.content_type};base64,{encoded}"
+                return {"src": src, "alt": f"img_{uuid.uuid4().hex[:8]}"}
+            except Exception as e:
+                logging.warning(f"Failed to convert image to base64: {e}")
+                return {"src": "", "alt": "image"}
+
+        options = {}
+        if inline_images:
+            options["convert_image"] = mammoth.images.img_element(_convert_image_to_base64)
+
+        # Both stream types are context managers, so the source is always closed.
+        with (BytesIO(binary) if binary else open(filename, "rb")) as docx_file:
+            result = mammoth.convert_to_html(docx_file, **options)
+
+        return markdownify(result.value)
+

 class Pdf(PdfParser):
    def __init__(self):
@@ -512,7 +555,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
        except Exception:
            vision_model = None
-        
+
        if vision_model:
            # Process images for each section
            section_images = []
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -52,6 +52,7 @@ class ParserParam(ProcessParamBase):
            ],
            "word": [
                "json",
+                "markdown",
            ],
            "slides": [
                "json",
@@ -247,13 +248,15 @@ class Parser(ProcessBase):
        conf = self._param.setups["word"]
        self.set_output("output_format", conf["output_format"])
        docx_parser = Docx()
-        sections, tbls = docx_parser(name, binary=blob)
-        sections = [{"text": section[0], "image": section[1]} for section in sections if section]
-        sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
-        # json
-        assert conf.get("output_format") == "json", "have to be json for doc"
+
        if conf.get("output_format") == "json":
+            sections, tbls = docx_parser(name, binary=blob)
+            sections = [{"text": section[0], "image": section[1]} for section in sections if section]
+            sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
            self.set_output("json", sections)
+        elif conf.get("output_format") == "markdown":
+            markdown_text = docx_parser.to_markdown(name, binary=blob)
+            self.set_output("markdown", markdown_text)

    def _slides(self, name, blob):
        from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser