mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: advanced markdown parsing (#9607)
### What problem does this PR solve? Using AST parsing to handle markdown more accurately, preventing components from being cut off by chunking. #9564 <img width="1746" height="993" alt="image" src="https://github.com/user-attachments/assets/4aaf4bf6-5714-4d48-a9cf-864f59633f7f" /> <img width="1739" height="982" alt="image" src="https://github.com/user-attachments/assets/dc00233f-7a55-434f-bbb7-74ce7f57a6cf" /> <img width="559" height="100" alt="image" src="https://github.com/user-attachments/assets/4a556b5b-d9c6-4544-a486-8ac342bd504e" /> ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -14,13 +14,15 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
|
||||
from .docx_parser import RAGFlowDocxParser as DocxParser
|
||||
from .excel_parser import RAGFlowExcelParser as ExcelParser
|
||||
from .ppt_parser import RAGFlowPptParser as PptParser
|
||||
from .html_parser import RAGFlowHtmlParser as HtmlParser
|
||||
from .json_parser import RAGFlowJsonParser as JsonParser
|
||||
from .markdown_parser import MarkdownElementExtractor
|
||||
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
|
||||
from .pdf_parser import PlainParser
|
||||
from .pdf_parser import RAGFlowPdfParser as PdfParser
|
||||
from .ppt_parser import RAGFlowPptParser as PptParser
|
||||
from .txt_parser import RAGFlowTxtParser as TxtParser
|
||||
|
||||
__all__ = [
|
||||
@ -33,4 +35,6 @@ __all__ = [
|
||||
"JsonParser",
|
||||
"MarkdownParser",
|
||||
"TxtParser",
|
||||
]
|
||||
"MarkdownElementExtractor",
|
||||
]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user