Feat: advanced markdown parsing (#9607)

### What problem does this PR solve? Using AST parsing to handle markdown more accurately, preventing components from being cut off by chunking. #9564 <img width="1746" height="993" alt="image" src="https://github.com/user-attachments/assets/4aaf4bf6-5714-4d48-a9cf-864f59633f7f" /> <img width="1739" height="982" alt="image" src="https://github.com/user-attachments/assets/dc00233f-7a55-434f-bbb7-74ce7f57a6cf" /> <img width="559" height="100" alt="image" src="https://github.com/user-attachments/assets/4a556b5b-d9c6-4544-a486-8ac342bd504e" /> ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-01-31 15:45:08 +08:00 · 2025-08-21 09:36:18 +08:00
parent 4080f6a54a
commit 382458ace7
3 changed files with 196 additions and 28 deletions
--- a/deepdoc/parser/init.py
+++ b/deepdoc/parser/init.py
@ -14,13 +14,15 @@
 #  limitations under the License.
 #

-from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
 from .docx_parser import RAGFlowDocxParser as DocxParser
 from .excel_parser import RAGFlowExcelParser as ExcelParser
-from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
 from .json_parser import RAGFlowJsonParser as JsonParser
+from .markdown_parser import MarkdownElementExtractor
 from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .pdf_parser import PlainParser
+from .pdf_parser import RAGFlowPdfParser as PdfParser
+from .ppt_parser import RAGFlowPptParser as PptParser
 from .txt_parser import RAGFlowTxtParser as TxtParser

 __all__ = [
@ -33,4 +35,6 @@ __all__ = [
    "JsonParser",
    "MarkdownParser",
    "TxtParser",
-]
+    "MarkdownElementExtractor",
+]
+