diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index 8d0e67022..2f8d6ab97 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -117,11 +117,30 @@ class MarkdownElementExtractor:
         self.markdown_content = markdown_content
         self.lines = markdown_content.split("\n")
 
-    def extract_elements(self):
+    def get_delimiters(self, delimiters):
+        """Build a regex alternation from the backtick-quoted tokens in *delimiters*.
+
+        Only substrings wrapped in backticks are treated as delimiters. Tokens are
+        de-duplicated, ordered longest-first so a longer token is tried before any
+        shorter token that is its prefix, and regex-escaped. Returns "" when no
+        backtick-quoted token is present.
+        """
+        toks = set(re.findall(r"`([^`]+)`", delimiters))
+        # Ties on length are broken lexicographically so the pattern is stable across runs.
+        ordered = sorted(toks, key=lambda t: (-len(t), t))
+        return "|".join(re.escape(t) for t in ordered)
+
+    def extract_elements(self, delimiter=None):
         """Extract individual elements (headers, code blocks, lists, etc.)"""
         sections = []
 
         i = 0
+        # When backtick-quoted custom delimiters are configured, split the raw text on
+        # them and skip the element-by-element extraction below.
+        dels = self.get_delimiters(delimiter) if delimiter else ""
+        if dels:
+            text = "\n".join(self.lines)
+            return [s for s in (p.strip() for p in re.split(dels, text)) if s]
         while i < len(self.lines):
             line = self.lines[i]
 
diff --git a/rag/app/naive.py b/rag/app/naive.py
index f4c523c99..614b32ad3 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -397,7 +397,7 @@ class Markdown(MarkdownParser):
 
         return images if images else None
 
-    def __call__(self, filename, binary=None, separate_tables=True):
+    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
         if binary:
             encoding = find_codec(binary)
             txt = binary.decode(encoding, errors="ignore")
@@ -408,7 +408,7 @@ class Markdown(MarkdownParser):
         remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
 
         extractor = MarkdownElementExtractor(txt)
-        element_sections = extractor.extract_elements()
+        element_sections = extractor.extract_elements(delimiter)
         sections = [(element, "") for element in element_sections]
 
         tbls = []
@@ -600,7 +600,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
-        sections, tables = markdown_parser(filename, binary, separate_tables=False)
+        sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
 
         try:
             vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)