def __call__(self, filename, binary=None, from_page=0, to_page=100000):
    """Parse a .docx file into heading-scoped text chunks.

    Each non-empty paragraph is assigned a level by ``docx_question_level``
    and collected as a ``(level, text)`` pair. The pairs are then arranged
    into a ``Node`` tree and flattened into one string per chunk.

    Args:
        filename: Path handed to ``Document`` when ``binary`` is falsy.
        binary: Raw file content; when truthy it is wrapped in ``BytesIO``
            and used instead of ``filename``.
        from_page: Accepted for signature compatibility; not read here.
        to_page: Paragraph collection stops once the running page-break
            count exceeds this value.

    Returns:
        A list of chunk strings, or an empty list when the document
        contains no non-empty paragraph.
    """
    self.doc = Document(BytesIO(binary)) if binary else Document(filename)

    pn = 0  # number of page breaks seen so far
    lines = []
    level_set = set()
    bull = bullets_category([p.text for p in self.doc.paragraphs])
    for p in self.doc.paragraphs:
        if pn > to_page:
            break
        question_level, p_text = docx_question_level(p, bull)
        if not p_text.strip("\n"):
            continue
        lines.append((question_level, p_text))
        level_set.add(question_level)

        for run in p.runs:
            # Read the run's XML once; it is inspected up to three times.
            run_xml = run._element.xml
            if 'lastRenderedPageBreak' in run_xml:
                pn += 1
                continue
            if 'w:br' in run_xml and 'type="page"' in run_xml:
                pn += 1

    # Nothing was collected: indexing the (empty) level list below would
    # raise IndexError, so report "no chunks" instead.
    if not lines:
        return []

    sorted_levels = sorted(level_set)

    # Use the second-smallest level present as the tree depth, or 1 when
    # only one level was seen.
    h2_level = sorted_levels[1] if len(sorted_levels) > 1 else 1
    # NOTE(review): with distinct sorted levels this condition cannot hold
    # when len(sorted_levels) > 2, so the step-back never fires. It was
    # presumably meant to avoid choosing the body-text level — confirm.
    if h2_level == sorted_levels[-1] and len(sorted_levels) > 2:
        h2_level = sorted_levels[-2]

    root = Node(level=0, depth=h2_level, texts=[])
    root.build_tree(lines)

    return ["\n".join(element) for element in root.get_tree() if element]
def tree_merge(bull, sections, depth):
    """Merge flat sections into chunks that follow the heading hierarchy.

    Every section is given a level: the 1-based index of the first pattern
    in ``BULLET_PATTERN[bull]`` that matches it, or one of two fallback
    levels placed after all patterns. The ``depth``-th smallest level that
    occurs becomes the deepest level that opens a new tree node; deeper
    lines are appended to the node they follow.

    Args:
        bull: Index into ``BULLET_PATTERN``. A negative value disables
            merging.
        sections: A list of strings, or of ``(text, layout)`` tuples.
        depth: 1-based rank, among the levels that occur, of the deepest
            level allowed to start a node.

    Returns:
        A list of chunk strings. ``sections`` is returned unchanged when it
        is empty or ``bull`` is negative. An empty list is returned when
        filtering leaves nothing to merge.
    """
    if not sections or bull < 0:
        return sections
    if isinstance(sections[0], str):
        sections = [(s, "") for s in sections]

    # Filter out position information in pdf sections: drop entries whose
    # text before the first "@" is at most one character or only digits.
    kept = []
    for text, layout in sections:
        if not text:
            continue
        head = text.split("@")[0].strip()
        if len(head) > 1 and not re.match(r"[0-9]+$", head):
            kept.append((text, layout))

    # Everything was filtered out; without this guard the level selection
    # below would index an empty list.
    if not kept:
        return []

    patterns = BULLET_PATTERN[bull]
    heading_level = len(patterns) + 1  # layout says title/head, no bullet
    body_level = len(patterns) + 2  # everything else

    def get_level(section):
        """Return ``(level, cleaned_text)`` for one ``(text, layout)``."""
        text, layout = section
        text = re.sub(r"\u3000", " ", text).strip()
        for i, title in enumerate(patterns):
            if re.match(title, text):
                return i + 1, text
        if re.search(r"(title|head)", layout) and not not_title(text):
            return heading_level, text
        return body_level, text

    level_set = set()
    lines = []
    for section in kept:
        level, text = get_level(section)
        if not text.strip("\n"):
            continue
        lines.append((level, text))
        level_set.add(level)

    if not lines:
        return []

    sorted_levels = sorted(level_set)

    if depth <= len(sorted_levels):
        target_level = sorted_levels[depth - 1]
    else:
        target_level = sorted_levels[-1]

    # Do not split on the body-text level if a shallower level exists.
    if target_level == body_level:
        target_level = sorted_levels[-2] if len(sorted_levels) > 1 else sorted_levels[0]

    root = Node(level=0, depth=target_level, texts=[])
    root.build_tree(lines)

    return ["\n".join(element) for element in root.get_tree() if element]


class Node:
    """A node of the heading tree built from ``(level, text)`` lines.

    The root is created with ``level=0`` and a ``depth``. Lines whose level
    is at most ``depth`` become nodes; deeper lines are appended to the
    texts of the most recently created node. A ``depth`` of ``-1`` turns
    every line into a node.
    """

    def __init__(self, level, depth=-1, texts=None):
        self.level = level
        self.depth = depth
        self.texts = texts if texts is not None else []  # text content of this node
        self.children = []  # child nodes

    def add_child(self, child_node):
        """Append ``child_node`` to this node's children."""
        self.children.append(child_node)

    def get_children(self):
        """Return the list of child nodes."""
        return self.children

    def get_level(self):
        """Return this node's level."""
        return self.level

    def get_texts(self):
        """Return the list of texts held by this node."""
        return self.texts

    def set_texts(self, texts):
        """Replace this node's texts with ``texts``."""
        self.texts = texts

    def add_text(self, text):
        """Append ``text`` to this node's texts."""
        self.texts.append(text)

    def clear_text(self):
        """Remove all texts from this node."""
        self.texts = []

    def __repr__(self):
        return f"Node(level={self.level}, texts={self.texts}, children={len(self.children)})"

    def build_tree(self, lines):
        """Attach ``lines`` below this node and return ``self``.

        Args:
            lines: Iterable of ``(level, text)`` pairs in document order.
        """
        stack = [self]
        for level, text in lines:
            if level <= self.depth or self.depth == -1:
                # Close every open node that is not shallower than the new
                # one. The root always stays on the stack so that a line
                # with level <= 0 cannot empty it.
                while len(stack) > 1 and level <= stack[-1].get_level():
                    stack.pop()

                node = Node(level=level, texts=[text])
                stack[-1].add_child(node)
                stack.append(node)
            else:
                # Deeper than the split depth: belongs to the open node.
                stack[-1].add_text(text)
        return self

    def get_tree(self):
        """Flatten the tree into chunks.

        Returns:
            A list of one-element lists, each holding one chunk string, in
            depth-first order.
        """
        tree_list = []
        self._dfs(self, tree_list, 0, [])
        return tree_list

    def _dfs(self, node, tree_list, current_depth, titles):
        """Collect the chunks of ``node`` and its descendants.

        Args:
            node: Node being visited.
            tree_list: Output list that chunks are appended to.
            current_depth: Recursion depth; passed along but not otherwise
                used.
            titles: Texts of the title nodes on the path to ``node``.
        """
        texts = node.get_texts()
        children = node.get_children()
        path = titles

        if texts:
            if 0 < node.get_level() < self.depth:
                # A title above the split depth prefixes its descendants.
                path = titles + texts
                # With no descendants nothing would carry it, so emit it
                # here to avoid losing the text.
                if not children:
                    tree_list.append(["\n".join(path)])
            else:
                tree_list.append(["\n".join(titles + texts)])

        for child in children:
            self._dfs(child, tree_list, current_depth + 1, path)