Feat: Pipeline Docx file supports Markdown output (#10439)

### What problem does this PR solve?

Adds Markdown as an output format for DOCX (Word) files in the ingestion pipeline parser, alongside the existing JSON output.

<img width="1242" height="755" alt="image"
src="https://github.com/user-attachments/assets/63cca75b-20b9-4a90-a01c-c0c2fccf1f2a"
/>

<img width="1227" height="717" alt="image"
src="https://github.com/user-attachments/assets/0dcb94b2-7ba0-48d5-9231-dc6e5c4b4192"
/>


### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-10-10 09:39:15 +08:00
committed by GitHub
parent d931c33ced
commit 8aabc2807c
4 changed files with 256 additions and 6 deletions

View File

@ -133,6 +133,8 @@ dependencies = [
"litellm>=1.74.15.post1", "litellm>=1.74.15.post1",
"flask-mail>=0.10.0", "flask-mail>=0.10.0",
"lark>=1.2.2", "lark>=1.2.2",
"mammoth>=1.11.0",
"markdownify>=1.2.0",
] ]
[project.optional-dependencies] [project.optional-dependencies]

View File

@ -256,6 +256,49 @@ class Docx(DocxParser):
tbls.append(((None, html), "")) tbls.append(((None, html), ""))
return new_line, tbls return new_line, tbls
def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
    """
    Convert a DOCX document to Markdown text.

    The document is rendered to HTML with mammoth, and that HTML is then
    converted to Markdown with markdownify.

    This function uses mammoth, licensed under the BSD 2-Clause License.

    Args:
        filename: Path of the DOCX file. Only opened when ``binary`` is falsy.
        binary: Raw bytes of the DOCX file. Takes precedence over ``filename``.
        inline_images: When True, images are passed through a custom converter
            that emits them as base64 ``data:`` URLs. When False, mammoth is
            called without a custom image converter.

    Returns:
        The Markdown string produced from mammoth's HTML output.
    """
    import base64
    import uuid

    import mammoth
    from markdownify import markdownify

    def _convert_image_to_base64(image):
        # Callback for mammoth's img_element: returns the <img> attributes.
        try:
            with image.open() as image_file:
                image_bytes = image_file.read()
            encoded = base64.b64encode(image_bytes).decode("utf-8")
            return {
                "src": f"data:{image.content_type};base64,{encoded}",
                # Short random alt text so each image gets a distinct label.
                "alt": f"img_{uuid.uuid4().hex[:8]}",
            }
        except Exception as e:
            # Best effort: one unreadable image must not abort the whole
            # conversion, so log it and emit an empty placeholder instead.
            logging.warning("Failed to convert image to base64: %s", e)
            return {"src": "", "alt": "image"}

    # Both BytesIO and real file objects are context managers, so the stream
    # is always released, even when the conversion raises.
    with (BytesIO(binary) if binary else open(filename, "rb")) as docx_file:
        if inline_images:
            result = mammoth.convert_to_html(
                docx_file,
                convert_image=mammoth.images.img_element(_convert_image_to_base64),
            )
        else:
            result = mammoth.convert_to_html(docx_file)

    return markdownify(result.value)
class Pdf(PdfParser): class Pdf(PdfParser):
def __init__(self): def __init__(self):
@ -512,7 +555,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.2, "Visual model detected. Attempting to enhance figure extraction...") callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
except Exception: except Exception:
vision_model = None vision_model = None
if vision_model: if vision_model:
# Process images for each section # Process images for each section
section_images = [] section_images = []

View File

@ -52,6 +52,7 @@ class ParserParam(ProcessParamBase):
], ],
"word": [ "word": [
"json", "json",
"markdown",
], ],
"slides": [ "slides": [
"json", "json",
@ -247,13 +248,15 @@ class Parser(ProcessBase):
conf = self._param.setups["word"] conf = self._param.setups["word"]
self.set_output("output_format", conf["output_format"]) self.set_output("output_format", conf["output_format"])
docx_parser = Docx() docx_parser = Docx()
sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
# json
assert conf.get("output_format") == "json", "have to be json for doc"
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
self.set_output("json", sections) self.set_output("json", sections)
elif conf.get("output_format") == "markdown":
markdown_text = docx_parser.to_markdown(name, binary=blob)
self.set_output("markdown", markdown_text)
def _slides(self, name, blob): def _slides(self, name, blob):
from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser

202
uv.lock generated
View File

@ -831,6 +831,15 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/1c/3d/3e04a822b8615904269f7126d8b019ae5c3b5c3c78397ec8bab056b02099/cn2an-0.5.22-py3-none-any.whl", hash = "sha256:cba4c8f305b43da01f50696047cca3116c727424ac62338da6a3426e01454f3e" }, { url = "https://mirrors.aliyun.com/pypi/packages/1c/3d/3e04a822b8615904269f7126d8b019ae5c3b5c3c78397ec8bab056b02099/cn2an-0.5.22-py3-none-any.whl", hash = "sha256:cba4c8f305b43da01f50696047cca3116c727424ac62338da6a3426e01454f3e" },
] ]
[[package]]
name = "cobble"
version = "0.1.4"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/54/7a/a507c709be2c96e1bb6102eb7b7f4026c5e5e223ef7d745a17d239e9d844/cobble-0.1.4.tar.gz", hash = "sha256:de38be1539992c8a06e569630717c485a5f91be2192c461ea2b220607dfa78aa" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/d5/e1/3714a2f371985215c219c2a70953d38e3eed81ef165aed061d21de0e998b/cobble-0.1.4-py3-none-any.whl", hash = "sha256:36c91b1655e599fd428e2b95fdd5f0da1ca2e9f1abb0bc871dec21a0e78a2b44" },
]
[[package]] [[package]]
name = "cohere" name = "cohere"
version = "5.6.2" version = "5.6.2"
@ -861,6 +870,15 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6" }, { url = "https://mirrors.aliyun.com/pypi/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6" },
] ]
[[package]]
name = "colorclass"
version = "2.2.2"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/d7/1a/31ff00a33569a3b59d65bbdc445c73e12f92ad28195b7ace299f68b9af70/colorclass-2.2.2.tar.gz", hash = "sha256:6d4fe287766166a98ca7bc6f6312daf04a0481b1eda43e7173484051c0ab4366" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/30/b6/daf3e2976932da4ed3579cff7a30a53d22ea9323ee4f0d8e43be60454897/colorclass-2.2.2-py2.py3-none-any.whl", hash = "sha256:6f10c273a0ef7a1150b1120b6095cbdd68e5cf36dfd5d0fc957a2500bbf99a55" },
]
[[package]] [[package]]
name = "coloredlogs" name = "coloredlogs"
version = "15.0.1" version = "15.0.1"
@ -873,6 +891,15 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934" }, { url = "https://mirrors.aliyun.com/pypi/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934" },
] ]
[[package]]
name = "compressed-rtf"
version = "1.0.7"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/b7/0c/929a4e8ef9d7143f54d77dadb5f370cc7b98534b1bd6e1124d0abe8efb24/compressed_rtf-1.0.7.tar.gz", hash = "sha256:7c30859334839f3cdc7d10796af5b434bb326b9df7cb5a65e95a8eacb2951b0e" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/07/1d/62f5bf92e12335eb63517f42671ed78512d48bbc69e02a942dd7b90f03f0/compressed_rtf-1.0.7-py3-none-any.whl", hash = "sha256:b7904921d78c67a0a4b7fff9fb361a00ae2b447b6edca010ce321cd98fa0fcc0" },
]
[[package]] [[package]]
name = "contourpy" name = "contourpy"
version = "1.3.2" version = "1.3.2"
@ -1322,6 +1349,23 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/fc/da/8376678b4a9ae0f9418d93df9c9cf851dced49c95ceb38daac6651e38f7a/duckduckgo_search-7.5.5-py3-none-any.whl", hash = "sha256:c71a0661aa436f215d9a05d653af424affb58825ab3e79f3b788053cbdee9ebc" }, { url = "https://mirrors.aliyun.com/pypi/packages/fc/da/8376678b4a9ae0f9418d93df9c9cf851dced49c95ceb38daac6651e38f7a/duckduckgo_search-7.5.5-py3-none-any.whl", hash = "sha256:c71a0661aa436f215d9a05d653af424affb58825ab3e79f3b788053cbdee9ebc" },
] ]
[[package]]
name = "easygui"
version = "0.98.3"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/cc/ad/e35f7a30272d322be09dc98592d2f55d27cc933a7fde8baccbbeb2bd9409/easygui-0.98.3.tar.gz", hash = "sha256:d653ff79ee1f42f63b5a090f2f98ce02335d86ad8963b3ce2661805cafe99a04" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/8e/a7/b276ff776533b423710a285c8168b52551cb2ab0855443131fdc7fd8c16f/easygui-0.98.3-py2.py3-none-any.whl", hash = "sha256:33498710c68b5376b459cd3fc48d1d1f33822139eb3ed01defbc0528326da3ba" },
]
[[package]]
name = "ebcdic"
version = "1.1.1"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/0d/2f/633031205333bee5f9f93761af8268746aa75f38754823aabb8570eb245b/ebcdic-1.1.1-py2.py3-none-any.whl", hash = "sha256:33b4cb729bc2d0bf46cc1847b0e5946897cb8d3f53520c5b9aa5fa98d7e735f1" },
]
[[package]] [[package]]
name = "editdistance" name = "editdistance"
version = "0.8.1" version = "0.8.1"
@ -1435,6 +1479,26 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10" }, { url = "https://mirrors.aliyun.com/pypi/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10" },
] ]
[[package]]
name = "extract-msg"
version = "0.41.5"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "chardet" },
{ name = "compressed-rtf" },
{ name = "ebcdic" },
{ name = "imapclient" },
{ name = "olefile" },
{ name = "red-black-tree-mod" },
{ name = "rtfde" },
{ name = "tzlocal" },
]
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/ef/fa/67443d9b9f505c32cba96e34745223378b84cd4795c387310788cc8b6d7d/extract_msg-0.41.5.tar.gz", hash = "sha256:99d4fdc0c0912c836370bf9fbb6e77558bb978499c1b5fdd31634684e323885c" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/be/e2/f0ed8df3907ad6e90e762d8e90adb4e25d12fea851a8371611fa14405782/extract_msg-0.41.5-py2.py3-none-any.whl", hash = "sha256:ad70dcdab3701b0fae554168c9642ad4ebef7f2ec283313c55e895a6518911e5" },
]
[[package]] [[package]]
name = "fake-http-header" name = "fake-http-header"
version = "0.3.5" version = "0.3.5"
@ -2575,6 +2639,18 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b" }, { url = "https://mirrors.aliyun.com/pypi/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b" },
] ]
[[package]]
name = "imapclient"
version = "2.3.1"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
dependencies = [
{ name = "six" },
]
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/19/d8/a4a0337d5e39a0569d89793d5053d7535eefd9b8756df4e10dc114caf3c2/IMAPClient-2.3.1.zip", hash = "sha256:26ea995664fae3a88b878ebce2aff7402931697b86658b7882043ddb01b0e6ba" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/13/9c/b2890e73bc9eee53fe63218e3f3cb774a6beefdb7b5c47928a81cc3b3c13/IMAPClient-2.3.1-py2.py3-none-any.whl", hash = "sha256:057f28025d2987c63e065afb0e4370b0b850b539b0e1494cea0427e88130108c" },
]
[[package]] [[package]]
name = "importlib-metadata" name = "importlib-metadata"
version = "8.7.0" version = "8.7.0"
@ -2902,6 +2978,15 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/2d/00/d90b10b962b4277f5e64a78b6609968859ff86889f5b898c1a778c06ec00/lark-1.2.2-py3-none-any.whl", hash = "sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c" }, { url = "https://mirrors.aliyun.com/pypi/packages/2d/00/d90b10b962b4277f5e64a78b6609968859ff86889f5b898c1a778c06ec00/lark-1.2.2-py3-none-any.whl", hash = "sha256:c2276486b02f0f1b90be155f2c8ba4a8e194d42775786db622faccd652d8e80c" },
] ]
[[package]]
name = "lark-parser"
version = "0.12.0"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/5a/ee/fd1192d7724419ddfe15b6f17d1c8742800d4de917c0adac3b6aaf22e921/lark-parser-0.12.0.tar.gz", hash = "sha256:15967db1f1214013dca65b1180745047b9be457d73da224fcda3d9dd4e96a138" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/76/00/90f05db333fe1aa6b6ffea83a35425b7d53ea95c8bba0b1597f226cf1d5f/lark_parser-0.12.0-py2.py3-none-any.whl", hash = "sha256:0eaf30cb5ba787fe404d73a7d6e61df97b21d5a63ac26c5008c78a494373c675" },
]
[[package]] [[package]]
name = "litellm" name = "litellm"
version = "1.75.5.post1" version = "1.75.5.post1"
@ -3069,6 +3154,18 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/6c/e1/0686c91738f3e6c2e1a243e0fdd4371667c4d2e5009b0a3605806c2aa020/lz4-4.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:2f4f2965c98ab254feddf6b5072854a6935adab7bc81412ec4fe238f07b85f62" }, { url = "https://mirrors.aliyun.com/pypi/packages/6c/e1/0686c91738f3e6c2e1a243e0fdd4371667c4d2e5009b0a3605806c2aa020/lz4-4.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:2f4f2965c98ab254feddf6b5072854a6935adab7bc81412ec4fe238f07b85f62" },
] ]
[[package]]
name = "mammoth"
version = "1.11.0"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
dependencies = [
{ name = "cobble" },
]
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/ed/3c/a58418d2af00f2da60d4a51e18cd0311307b72d48d2fffec36a97b4a5e44/mammoth-1.11.0.tar.gz", hash = "sha256:a0f59e442f34d5b6447f4b0999306cbf3e67aaabfa8cb516f878fb1456744637" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/ca/54/2e39566a131b13f6d8d193f974cb6a34e81bb7cc2fa6f7e03de067b36588/mammoth-1.11.0-py2.py3-none-any.whl", hash = "sha256:c077ab0d450bd7c0c6ecd529a23bf7e0fa8190c929e28998308ff4eada3f063b" },
]
[[package]] [[package]]
name = "markdown" name = "markdown"
version = "3.6" version = "3.6"
@ -3099,6 +3196,19 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/47/2b/dac4143951a16c0c03e8fe217c9fa784838d02a29c52ef0e8b265befea8f/markdown_to_json-2.1.1-py3-none-any.whl", hash = "sha256:c73b8a3ac7fbde65463dbaeba8bb925d1d54377cbb01a064cd65e1f3e394bd62" }, { url = "https://mirrors.aliyun.com/pypi/packages/47/2b/dac4143951a16c0c03e8fe217c9fa784838d02a29c52ef0e8b265befea8f/markdown_to_json-2.1.1-py3-none-any.whl", hash = "sha256:c73b8a3ac7fbde65463dbaeba8bb925d1d54377cbb01a064cd65e1f3e394bd62" },
] ]
[[package]]
name = "markdownify"
version = "1.2.0"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "six" },
]
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/83/1b/6f2697b51eaca81f08852fd2734745af15718fea10222a1d40f8a239c4ea/markdownify-1.2.0.tar.gz", hash = "sha256:f6c367c54eb24ee953921804dfe6d6575c5e5b42c643955e7242034435de634c" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/6a/e2/7af643acb4cae0741dffffaa7f3f7c9e7ab4046724543ba1777c401d821c/markdownify-1.2.0-py3-none-any.whl", hash = "sha256:48e150a1c4993d4d50f282f725c0111bd9eb25645d41fa2f543708fd44161351" },
]
[[package]] [[package]]
name = "markupsafe" name = "markupsafe"
version = "3.0.2" version = "3.0.2"
@ -3386,6 +3496,19 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/b1/ef/27dd35a7049c9a4f4211c6cd6a8c9db0a50647546f003a5867827ec45391/msgspec-0.19.0-cp312-cp312-win_amd64.whl", hash = "sha256:067f0de1c33cfa0b6a8206562efdf6be5985b988b53dd244a8e06f993f27c8c0" }, { url = "https://mirrors.aliyun.com/pypi/packages/b1/ef/27dd35a7049c9a4f4211c6cd6a8c9db0a50647546f003a5867827ec45391/msgspec-0.19.0-cp312-cp312-win_amd64.whl", hash = "sha256:067f0de1c33cfa0b6a8206562efdf6be5985b988b53dd244a8e06f993f27c8c0" },
] ]
[[package]]
name = "msoffcrypto-tool"
version = "5.4.2"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
dependencies = [
{ name = "cryptography" },
{ name = "olefile" },
]
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/d2/b7/0fd6573157e0ec60c0c470e732ab3322fba4d2834fd24e1088d670522a01/msoffcrypto_tool-5.4.2.tar.gz", hash = "sha256:44b545adba0407564a0cc3d6dde6ca36b7c0fdf352b85bca51618fa1d4817370" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/03/54/7f6d3d9acad083dae8c22d9ab483b657359a1bf56fee1d7af88794677707/msoffcrypto_tool-5.4.2-py3-none-any.whl", hash = "sha256:274fe2181702d1e5a107ec1b68a4c9fea997a44972ae1cc9ae0cb4f6a50fef0e" },
]
[[package]] [[package]]
name = "multidict" name = "multidict"
version = "6.6.3" version = "6.6.3"
@ -3735,6 +3858,29 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/9e/4e/0d0c945463719429b7bd21dece907ad0bde437a2ff12b9b12fee94722ab0/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6574241a3ec5fdc9334353ab8c479fe75841dbe8f4532a8fc97ce63503330ba1" }, { url = "https://mirrors.aliyun.com/pypi/packages/9e/4e/0d0c945463719429b7bd21dece907ad0bde437a2ff12b9b12fee94722ab0/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6574241a3ec5fdc9334353ab8c479fe75841dbe8f4532a8fc97ce63503330ba1" },
] ]
[[package]]
name = "olefile"
version = "0.46"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/34/81/e1ac43c6b45b4c5f8d9352396a14144bba52c8fec72a80f425f6a4d653ad/olefile-0.46.zip", hash = "sha256:133b031eaf8fd2c9399b78b8bc5b8fcbe4c31e85295749bb17a87cba8f3c3964" }
[[package]]
name = "oletools"
version = "0.60.2"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
dependencies = [
{ name = "colorclass" },
{ name = "easygui" },
{ name = "msoffcrypto-tool", marker = "(platform_python_implementation != 'PyPy' and sys_platform == 'darwin') or (platform_python_implementation != 'PyPy' and sys_platform == 'win32') or (sys_platform != 'darwin' and sys_platform != 'win32')" },
{ name = "olefile" },
{ name = "pcodedmp" },
{ name = "pyparsing" },
]
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/5c/2f/037f40e44706d542b94a2312ccc33ee2701ebfc9a83b46b55263d49ce55a/oletools-0.60.2.zip", hash = "sha256:ad452099f4695ffd8855113f453348200d195ee9fa341a09e197d66ee7e0b2c3" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/ac/ff/05257b7183279b80ecec6333744de23f48f0faeeba46c93e6d13ce835515/oletools-0.60.2-py2.py3-none-any.whl", hash = "sha256:72ad8bd748fd0c4e7b5b4733af770d11543ebb2bf2697455f99f975fcd50cc96" },
]
[[package]] [[package]]
name = "ollama" name = "ollama"
version = "0.2.1" version = "0.2.1"
@ -4197,6 +4343,19 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/87/2b/b50d3d08ea0fc419c183a84210571eba005328efa62b6b98bc28e9ead32a/patsy-1.0.1-py2.py3-none-any.whl", hash = "sha256:751fb38f9e97e62312e921a1954b81e1bb2bcda4f5eeabaf94db251ee791509c" }, { url = "https://mirrors.aliyun.com/pypi/packages/87/2b/b50d3d08ea0fc419c183a84210571eba005328efa62b6b98bc28e9ead32a/patsy-1.0.1-py2.py3-none-any.whl", hash = "sha256:751fb38f9e97e62312e921a1954b81e1bb2bcda4f5eeabaf94db251ee791509c" },
] ]
[[package]]
name = "pcodedmp"
version = "1.2.6"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
dependencies = [
{ name = "oletools" },
{ name = "win-unicode-console", marker = "platform_python_implementation != 'PyPy' and sys_platform == 'win32'" },
]
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/3d/20/6d461e29135f474408d0d7f95b2456a9ba245560768ee51b788af10f7429/pcodedmp-1.2.6.tar.gz", hash = "sha256:025f8c809a126f45a082ffa820893e6a8d990d9d7ddb68694b5a9f0a6dbcd955" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/ba/72/b380fb5c89d89c3afafac8cf02a71a45f4f4a4f35531ca949a34683962d1/pcodedmp-1.2.6-py2.py3-none-any.whl", hash = "sha256:4441f7c0ab4cbda27bd4668db3b14f36261d86e5059ce06c0828602cbe1c4278" },
]
[[package]] [[package]]
name = "pdfminer-six" name = "pdfminer-six"
version = "20221105" version = "20221105"
@ -5309,6 +5468,7 @@ dependencies = [
{ name = "elastic-transport" }, { name = "elastic-transport" },
{ name = "elasticsearch" }, { name = "elasticsearch" },
{ name = "elasticsearch-dsl" }, { name = "elasticsearch-dsl" },
{ name = "extract-msg" },
{ name = "filelock" }, { name = "filelock" },
{ name = "flasgger" }, { name = "flasgger" },
{ name = "flask" }, { name = "flask" },
@ -5331,8 +5491,10 @@ dependencies = [
{ name = "langfuse" }, { name = "langfuse" },
{ name = "lark" }, { name = "lark" },
{ name = "litellm" }, { name = "litellm" },
{ name = "mammoth" },
{ name = "markdown" }, { name = "markdown" },
{ name = "markdown-to-json" }, { name = "markdown-to-json" },
{ name = "markdownify" },
{ name = "mcp" }, { name = "mcp" },
{ name = "mini-racer" }, { name = "mini-racer" },
{ name = "minio" }, { name = "minio" },
@ -5462,6 +5624,7 @@ requires-dist = [
{ name = "elastic-transport", specifier = "==8.12.0" }, { name = "elastic-transport", specifier = "==8.12.0" },
{ name = "elasticsearch", specifier = "==8.12.1" }, { name = "elasticsearch", specifier = "==8.12.1" },
{ name = "elasticsearch-dsl", specifier = "==8.12.0" }, { name = "elasticsearch-dsl", specifier = "==8.12.0" },
{ name = "extract-msg", specifier = ">=0.39.0" },
{ name = "fastembed", marker = "(platform_machine != 'x86_64' and extra == 'full') or (sys_platform == 'darwin' and extra == 'full')", specifier = ">=0.3.6,<0.4.0" }, { name = "fastembed", marker = "(platform_machine != 'x86_64' and extra == 'full') or (sys_platform == 'darwin' and extra == 'full')", specifier = ">=0.3.6,<0.4.0" },
{ name = "fastembed-gpu", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'full'", specifier = ">=0.3.6,<0.4.0" }, { name = "fastembed-gpu", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'full'", specifier = ">=0.3.6,<0.4.0" },
{ name = "filelock", specifier = "==3.15.4" }, { name = "filelock", specifier = "==3.15.4" },
@ -5487,8 +5650,10 @@ requires-dist = [
{ name = "langfuse", specifier = ">=2.60.0" }, { name = "langfuse", specifier = ">=2.60.0" },
{ name = "lark", specifier = ">=1.2.2" }, { name = "lark", specifier = ">=1.2.2" },
{ name = "litellm", specifier = ">=1.74.15.post1" }, { name = "litellm", specifier = ">=1.74.15.post1" },
{ name = "mammoth", specifier = ">=1.11.0" },
{ name = "markdown", specifier = "==3.6" }, { name = "markdown", specifier = "==3.6" },
{ name = "markdown-to-json", specifier = "==2.1.1" }, { name = "markdown-to-json", specifier = "==2.1.1" },
{ name = "markdownify", specifier = ">=1.2.0" },
{ name = "mcp", specifier = ">=1.9.4" }, { name = "mcp", specifier = ">=1.9.4" },
{ name = "mini-racer", specifier = ">=0.12.4,<0.13.0" }, { name = "mini-racer", specifier = ">=0.12.4,<0.13.0" },
{ name = "minio", specifier = "==7.2.4" }, { name = "minio", specifier = "==7.2.4" },
@ -5641,6 +5806,12 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/c2/5a/2f2e7fc026d5e64b5408aa3fbe0296a6407b8481196cae4daacacb3a3ae0/readerwriterlock-1.0.9-py3-none-any.whl", hash = "sha256:8c4b704e60d15991462081a27ef46762fea49b478aa4426644f2146754759ca7" }, { url = "https://mirrors.aliyun.com/pypi/packages/c2/5a/2f2e7fc026d5e64b5408aa3fbe0296a6407b8481196cae4daacacb3a3ae0/readerwriterlock-1.0.9-py3-none-any.whl", hash = "sha256:8c4b704e60d15991462081a27ef46762fea49b478aa4426644f2146754759ca7" },
] ]
[[package]]
name = "red-black-tree-mod"
version = "1.20"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/34/12/944f61bc67a1e918953741c0b3b75a28f96d8060d08fd3614233309ced3b/red-black-tree-mod-1.20.tar.gz", hash = "sha256:2448e6fc9cbf1be204c753f352c6ee49aa8156dbf1faa57dfc26bd7705077e0a" }
[[package]] [[package]]
name = "referencing" name = "referencing"
version = "0.36.2" version = "0.36.2"
@ -5894,6 +6065,19 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762" }, { url = "https://mirrors.aliyun.com/pypi/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762" },
] ]
[[package]]
name = "rtfde"
version = "0.0.2"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
dependencies = [
{ name = "lark-parser" },
{ name = "oletools" },
]
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/81/ea/28f5ab6b46a072887c8c8fd8c8a1f7b54025fc4bb2e09024668ea6686044/RTFDE-0.0.2.tar.gz", hash = "sha256:b86b5d734950fe8745a5b89133f50554252dbd67c6d1b9265e23ee140e7ea8a2" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/5d/3f/39ba5a72620c43656bc80cb1f7afe0d498df4a48947d75ea0ca0752ffbf4/RTFDE-0.0.2-py3-none-any.whl", hash = "sha256:18386e4f060cee12a2a8035b0acf0cc99689f5dff1bf347bab7e92351860a21d" },
]
[[package]] [[package]]
name = "ruamel-base" name = "ruamel-base"
version = "1.0.0" version = "1.0.0"
@ -6901,6 +7085,18 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8" }, { url = "https://mirrors.aliyun.com/pypi/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8" },
] ]
[[package]]
name = "tzlocal"
version = "5.3.1"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
dependencies = [
{ name = "tzdata", marker = "sys_platform == 'win32'" },
]
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/8b/2e/c14812d3d4d9cd1773c6be938f89e5735a1f11a9f184ac3639b93cef35d5/tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd" }
wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/c2/14/e2a54fabd4f08cd7af1c07030603c3356b74da07f7cc056e600436edfa17/tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d" },
]
[[package]] [[package]]
name = "umap-learn" name = "umap-learn"
version = "0.5.6" version = "0.5.6"
@ -7145,6 +7341,12 @@ dependencies = [
] ]
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz", hash = "sha256:db0fad1829fdd441b1852306e9856398204dc0786d2996dd2e0c8bb8e26133b2" } sdist = { url = "https://mirrors.aliyun.com/pypi/packages/67/35/25e68fbc99e672127cc6fbb14b8ec1ba3dfef035bf1e4c90f78f24a80b7d/wikipedia-1.4.0.tar.gz", hash = "sha256:db0fad1829fdd441b1852306e9856398204dc0786d2996dd2e0c8bb8e26133b2" }
[[package]]
name = "win-unicode-console"
version = "0.5"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/89/8d/7aad74930380c8972ab282304a2ff45f3d4927108bb6693cabcc9fc6a099/win_unicode_console-0.5.zip", hash = "sha256:d4142d4d56d46f449d6f00536a73625a871cba040f0bc1a2e305a04578f07d1e" }
[[package]] [[package]]
name = "win32-setctime" name = "win32-setctime"
version = "1.2.0" version = "1.2.0"