Fix:enhance delimiters in markdown parser (#10896)

### What problem does this PR solve?
issue:
[#10890](https://github.com/infiniflow/ragflow/issues/10890)
change:
enhance delimiters in markdown parser
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
buua436
2025-10-30 17:36:51 +08:00
committed by GitHub
parent 5d79912274
commit bb9504d1cc
2 changed files with 17 additions and 4 deletions

View File

@ -117,11 +117,24 @@ class MarkdownElementExtractor:
self.markdown_content = markdown_content
self.lines = markdown_content.split("\n")
def extract_elements(self):
def get_delimiters(self,delimiters):
toks = re.findall(r"`([^`]+)`", delimiters)
toks = sorted(set(toks), key=lambda x: -len(x))
return "|".join(re.escape(t) for t in toks if t)
def extract_elements(self,delimiter=None):
"""Extract individual elements (headers, code blocks, lists, etc.)"""
sections = []
i = 0
dels=""
if delimiter:
dels = self.get_delimiters(delimiter)
if len(dels) > 0:
text = "\n".join(self.lines)
parts = re.split(dels, text)
sections = [p.strip() for p in parts if p and p.strip()]
return sections
while i < len(self.lines):
line = self.lines[i]

View File

@ -397,7 +397,7 @@ class Markdown(MarkdownParser):
return images if images else None
def __call__(self, filename, binary=None, separate_tables=True):
def __call__(self, filename, binary=None, separate_tables=True,delimiter=None):
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
@ -408,7 +408,7 @@ class Markdown(MarkdownParser):
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
extractor = MarkdownElementExtractor(txt)
element_sections = extractor.extract_elements()
element_sections = extractor.extract_elements(delimiter)
sections = [(element, "") for element in element_sections]
tbls = []
@ -600,7 +600,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
sections, tables = markdown_parser(filename, binary, separate_tables=False)
sections, tables = markdown_parser(filename, binary, separate_tables=False,delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)