mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: Pipeline Docx file supports Markdown output (#10439)
### What problem does this PR solve?

Pipeline Docx file supports Markdown output.

<img width="1242" height="755" alt="image" src="https://github.com/user-attachments/assets/63cca75b-20b9-4a90-a01c-c0c2fccf1f2a" />
<img width="1227" height="717" alt="image" src="https://github.com/user-attachments/assets/0dcb94b2-7ba0-48d5-9231-dc6e5c4b4192" />

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -256,6 +256,49 @@ class Docx(DocxParser):
|
||||
tbls.append(((None, html), ""))
|
||||
return new_line, tbls
|
||||
|
||||
def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
    """
    Convert a .docx document to Markdown text.

    This function uses mammoth, licensed under the BSD 2-Clause License.

    The document is rendered to HTML with ``mammoth.convert_to_html`` and
    the resulting HTML is converted to Markdown with ``markdownify``.

    Args:
        filename: Path of the .docx file. Only opened (in binary mode) when
            ``binary`` is falsy.
        binary: Raw bytes of the .docx file. When truthy it is wrapped in a
            ``BytesIO`` and used instead of ``filename``.
        inline_images: When true, images go through a custom handler that
            embeds each image as a base64 ``data:`` URI and assigns a random
            ``img_<8 hex chars>`` alt text. When false, mammoth is called
            without a custom image handler.
            NOTE(review): mammoth's default image handling appears to embed
            images as data URIs as well, so ``False`` may not actually drop
            image data from the output -- confirm against the mammoth docs.

    Returns:
        The Markdown string returned by ``markdownify``.
    """
    import base64
    import uuid

    import mammoth
    from markdownify import markdownify

    def _convert_image_to_base64(image):
        """Build the ``<img>`` attributes for one embedded document image."""
        try:
            with image.open() as image_file:
                image_bytes = image_file.read()
            encoded = base64.b64encode(image_bytes).decode("utf-8")
            base64_url = f"data:{image.content_type};base64,{encoded}"
            # A short random suffix gives every image a distinct alt text.
            alt_name = f"img_{uuid.uuid4().hex[:8]}"
            return {"src": base64_url, "alt": alt_name}
        except Exception as e:
            # Best effort: one unreadable image must not abort the whole
            # conversion, so fall back to an empty placeholder element.
            logging.warning("Failed to convert image to base64: %s", e)
            return {"src": "", "alt": "image"}

    # Both BytesIO and real file objects are context managers, so the source
    # stream is always released, even when the conversion raises.
    with (BytesIO(binary) if binary else open(filename, "rb")) as docx_file:
        if inline_images:
            result = mammoth.convert_to_html(
                docx_file,
                convert_image=mammoth.images.img_element(_convert_image_to_base64),
            )
        else:
            result = mammoth.convert_to_html(docx_file)

        return markdownify(result.value)
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
def __init__(self):
|
||||
@ -512,7 +555,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
|
||||
except Exception:
|
||||
vision_model = None
|
||||
|
||||
|
||||
if vision_model:
|
||||
# Process images for each section
|
||||
section_images = []
|
||||
|
||||
@ -52,6 +52,7 @@ class ParserParam(ProcessParamBase):
|
||||
],
|
||||
"word": [
|
||||
"json",
|
||||
"markdown",
|
||||
],
|
||||
"slides": [
|
||||
"json",
|
||||
@ -247,13 +248,15 @@ class Parser(ProcessBase):
|
||||
conf = self._param.setups["word"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
docx_parser = Docx()
|
||||
sections, tbls = docx_parser(name, binary=blob)
|
||||
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
|
||||
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
|
||||
# json
|
||||
assert conf.get("output_format") == "json", "have to be json for doc"
|
||||
|
||||
if conf.get("output_format") == "json":
|
||||
sections, tbls = docx_parser(name, binary=blob)
|
||||
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
|
||||
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
|
||||
self.set_output("json", sections)
|
||||
elif conf.get("output_format") == "markdown":
|
||||
markdown_text = docx_parser.to_markdown(name, binary=blob)
|
||||
self.set_output("markdown", markdown_text)
|
||||
|
||||
def _slides(self, name, blob):
|
||||
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
|
||||
|
||||
Reference in New Issue
Block a user