Feat: Pipeline Docx file supports Markdown output (#10439)

### What problem does this PR solve?

Pipeline Docx file supports Markdown output.

<img width="1242" height="755" alt="image"
src="https://github.com/user-attachments/assets/63cca75b-20b9-4a90-a01c-c0c2fccf1f2a"
/>

<img width="1227" height="717" alt="image"
src="https://github.com/user-attachments/assets/0dcb94b2-7ba0-48d5-9231-dc6e5c4b4192"
/>


### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-10-10 09:39:15 +08:00
committed by GitHub
parent d931c33ced
commit 8aabc2807c
4 changed files with 256 additions and 6 deletions

View File

@ -52,6 +52,7 @@ class ParserParam(ProcessParamBase):
],
"word": [
"json",
"markdown",
],
"slides": [
"json",
@ -247,13 +248,15 @@ class Parser(ProcessBase):
conf = self._param.setups["word"]
self.set_output("output_format", conf["output_format"])
docx_parser = Docx()
sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
# json
assert conf.get("output_format") == "json", "have to be json for doc"
if conf.get("output_format") == "json":
sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
self.set_output("json", sections)
elif conf.get("output_format") == "markdown":
markdown_text = docx_parser.to_markdown(name, binary=blob)
self.set_output("markdown", markdown_text)
def _slides(self, name, blob):
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser