mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: advanced markdown parsing (#9607)
### What problem does this PR solve?

This PR uses AST parsing to handle Markdown more accurately, preventing components from being cut off by chunking. #9564

<img width="1746" height="993" alt="image" src="https://github.com/user-attachments/assets/4aaf4bf6-5714-4d48-a9cf-864f59633f7f" />
<img width="1739" height="982" alt="image" src="https://github.com/user-attachments/assets/dc00233f-7a55-434f-bbb7-74ce7f57a6cf" />
<img width="559" height="100" alt="image" src="https://github.com/user-attachments/assets/4a556b5b-d9c6-4544-a486-8ac342bd504e" />

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@@ -30,7 +30,7 @@ from tika import parser
|
||||
|
||||
from api.db import LLMType
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
|
||||
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
|
||||
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
|
||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
|
||||
@@ -350,17 +350,14 @@ class Markdown(MarkdownParser):
|
||||
else:
|
||||
with open(filename, "r") as f:
|
||||
txt = f.read()
|
||||
|
||||
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
|
||||
sections = []
|
||||
|
||||
extractor = MarkdownElementExtractor(txt)
|
||||
element_sections = extractor.extract_elements()
|
||||
sections = [(element, "") for element in element_sections]
|
||||
|
||||
tbls = []
|
||||
for sec in remainder.split("\n"):
|
||||
if sec.strip().find("#") == 0:
|
||||
sections.append((sec, ""))
|
||||
elif sections and sections[-1][0].strip().find("#") == 0:
|
||||
sec_, _ = sections.pop(-1)
|
||||
sections.append((sec_ + "\n" + sec, ""))
|
||||
else:
|
||||
sections.append((sec, ""))
|
||||
for table in tables:
|
||||
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
|
||||
return sections, tbls
|
||||
|
||||
Reference in New Issue
Block a user