Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
2025-12-30 00:32:30 +08:00 · 2025-12-29 12:01:18 +08:00
parent 647fb115a0
commit 01f0ced1e6
43 changed files with 817 additions and 637 deletions
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@ -273,7 +273,7 @@ def tokenize(d, txt, eng):
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])


-def split_with_pattern(d, pattern:str, content:str, eng) -> list:
+def split_with_pattern(d, pattern: str, content: str, eng) -> list:
    docs = []
    txts = [txt for txt in re.split(r"(%s)" % pattern, content, flags=re.DOTALL)]
    for j in range(0, len(txts), 2):
@ -281,7 +281,7 @@ def split_with_pattern(d, pattern:str, content:str, eng) -> list:
        if not txt:
            continue
        if j + 1 < len(txts):
-            txt += txts[j+1]
+            txt += txts[j + 1]
        dd = copy.deepcopy(d)
        tokenize(dd, txt, eng)
        docs.append(dd)
@ -304,7 +304,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=
            except NotImplementedError:
                pass
        else:
-            add_positions(d, [[ii]*5])
+            add_positions(d, [[ii] * 5])

        if child_delimiters_pattern:
            d["mom_with_weight"] = ck
@ -325,7 +325,7 @@ def tokenize_chunks_with_images(chunks, doc, eng, images, child_delimiters_patte
        logging.debug("-- {}".format(ck))
        d = copy.deepcopy(doc)
        d["image"] = image
-        add_positions(d, [[ii]*5])
+        add_positions(d, [[ii] * 5])
        if child_delimiters_pattern:
            d["mom_with_weight"] = ck
            res.extend(split_with_pattern(d, child_delimiters_pattern, ck, eng))
@ -658,7 +658,8 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
            if "content_ltks" in ck:
                ck["content_ltks"] = rag_tokenizer.tokenize(combined)
            if "content_sm_ltks" in ck:
-                ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck.get("content_ltks", rag_tokenizer.tokenize(combined)))
+                ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(
+                    ck.get("content_ltks", rag_tokenizer.tokenize(combined)))

    if positioned_indices:
        chunks[:] = [chunks[i] for i in ordered_indices]
@ -764,8 +765,8 @@ def not_title(txt):
        return True
    return re.search(r"[,;，。；！!]", txt)

-def tree_merge(bull, sections, depth):

+def tree_merge(bull, sections, depth):
    if not sections or bull < 0:
        return sections
    if isinstance(sections[0], type("")):
@ -777,16 +778,17 @@ def tree_merge(bull, sections, depth):

    def get_level(bull, section):
        text, layout = section
-        text = re.sub(r"\u3000", " ",   text).strip()
+        text = re.sub(r"\u3000", " ", text).strip()

        for i, title in enumerate(BULLET_PATTERN[bull]):
            if re.match(title, text.strip()):
-                return i+1, text
+                return i + 1, text
        else:
            if re.search(r"(title|head)", layout) and not not_title(text):
-                return len(BULLET_PATTERN[bull])+1, text
+                return len(BULLET_PATTERN[bull]) + 1, text
            else:
-                return len(BULLET_PATTERN[bull])+2, text
+                return len(BULLET_PATTERN[bull]) + 2, text
+
    level_set = set()
    lines = []
    for section in sections:
@ -812,8 +814,8 @@ def tree_merge(bull, sections, depth):

    return [element for element in root.get_tree() if element]

-def hierarchical_merge(bull, sections, depth):

+def hierarchical_merge(bull, sections, depth):
    if not sections or bull < 0:
        return []
    if isinstance(sections[0], type("")):
@ -922,10 +924,10 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。；
        if tnum < 8:
            pos = ""
        # Ensure that the length of the merged chunk does not exceed chunk_token_num
-        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
+        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent) / 100.:
            if cks:
                overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
-                t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
+                t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
            if t.find(pos) < 0:
                t += pos
            cks.append(t)
@ -957,7 +959,7 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。；
        return cks

    for sec, pos in sections:
-        add_chunk("\n"+sec, pos)
+        add_chunk("\n" + sec, pos)

    return cks

@ -978,10 +980,10 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
        if tnum < 8:
            pos = ""
        # Ensure that the length of the merged chunk does not exceed chunk_token_num
-        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
+        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent) / 100.:
            if cks:
                overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
-                t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
+                t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
            if t.find(pos) < 0:
                t += pos
            cks.append(t)
@ -1025,9 +1027,9 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
        if isinstance(text, tuple):
            text_str = text[0]
            text_pos = text[1] if len(text) > 1 else ""
-            add_chunk("\n"+text_str, image, text_pos)
+            add_chunk("\n" + text_str, image, text_pos)
        else:
-            add_chunk("\n"+text, image)
+            add_chunk("\n" + text, image)

    return cks, result_images

@ -1042,7 +1044,7 @@ def docx_question_level(p, bull=-1):
        for j, title in enumerate(BULLET_PATTERN[bull]):
            if re.match(title, txt):
                return j + 1, txt
-    return len(BULLET_PATTERN[bull])+1, txt
+    return len(BULLET_PATTERN[bull]) + 1, txt


 def concat_img(img1, img2):
@ -1211,7 +1213,7 @@ class Node:
        child = node.get_children()

        if level == 0 and texts:
-            tree_list.append("\n".join(titles+texts))
+            tree_list.append("\n".join(titles + texts))

        # Titles within configured depth are accumulated into the current path
        if 1 <= level <= self.depth: