mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Fix: Unnecessary truncation in markdown parser (#7972)
### What problem does this PR solve?

Fix unnecessary truncation in the markdown parser so that markdown parsing works as shown in [this comment](https://github.com/infiniflow/ragflow/issues/7824#issuecomment-2921312576) on #7824, supporting multiple special delimiters.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@@ -545,7 +545,7 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
||||
add_chunk(sub_sec, pos)
|
||||
|
||||
return cks
|
||||
|
||||
|
||||
|
||||
def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
|
||||
if not texts or len(texts) != len(images):
|
||||
@@ -676,6 +676,8 @@ def get_delimiters(delimiters: str):
|
||||
s = t
|
||||
if s < len(delimiters):
|
||||
dels.extend(list(delimiters[s:]))
|
||||
|
||||
dels.sort(key=lambda x: -len(x))
|
||||
dels = [re.escape(d) for d in dels if d]
|
||||
dels = [d for d in dels if d]
|
||||
dels_pattern = "|".join(dels)
|
||||
|
||||
Reference in New Issue
Block a user