Fix IDE warnings (#12281)

### What problem does this PR solve?

As the title says: this PR fixes warnings reported by the IDE.

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2025-12-29 12:01:18 +08:00
committed by GitHub
parent 647fb115a0
commit 01f0ced1e6
43 changed files with 817 additions and 637 deletions

View File

@@ -273,7 +273,7 @@ def tokenize(d, txt, eng):
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
def split_with_pattern(d, pattern:str, content:str, eng) -> list:
def split_with_pattern(d, pattern: str, content: str, eng) -> list:
docs = []
txts = [txt for txt in re.split(r"(%s)" % pattern, content, flags=re.DOTALL)]
for j in range(0, len(txts), 2):
@@ -281,7 +281,7 @@ def split_with_pattern(d, pattern:str, content:str, eng) -> list:
if not txt:
continue
if j + 1 < len(txts):
txt += txts[j+1]
txt += txts[j + 1]
dd = copy.deepcopy(d)
tokenize(dd, txt, eng)
docs.append(dd)
@@ -304,7 +304,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=
except NotImplementedError:
pass
else:
add_positions(d, [[ii]*5])
add_positions(d, [[ii] * 5])
if child_delimiters_pattern:
d["mom_with_weight"] = ck
@@ -325,7 +325,7 @@ def tokenize_chunks_with_images(chunks, doc, eng, images, child_delimiters_patte
logging.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
d["image"] = image
add_positions(d, [[ii]*5])
add_positions(d, [[ii] * 5])
if child_delimiters_pattern:
d["mom_with_weight"] = ck
res.extend(split_with_pattern(d, child_delimiters_pattern, ck, eng))
@@ -658,7 +658,8 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
if "content_ltks" in ck:
ck["content_ltks"] = rag_tokenizer.tokenize(combined)
if "content_sm_ltks" in ck:
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck.get("content_ltks", rag_tokenizer.tokenize(combined)))
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(
ck.get("content_ltks", rag_tokenizer.tokenize(combined)))
if positioned_indices:
chunks[:] = [chunks[i] for i in ordered_indices]
@@ -764,8 +765,8 @@ def not_title(txt):
return True
return re.search(r"[,;,。;!!]", txt)
def tree_merge(bull, sections, depth):
def tree_merge(bull, sections, depth):
if not sections or bull < 0:
return sections
if isinstance(sections[0], type("")):
@@ -777,16 +778,17 @@ def tree_merge(bull, sections, depth):
def get_level(bull, section):
text, layout = section
text = re.sub(r"\u3000", " ", text).strip()
text = re.sub(r"\u3000", " ", text).strip()
for i, title in enumerate(BULLET_PATTERN[bull]):
if re.match(title, text.strip()):
return i+1, text
return i + 1, text
else:
if re.search(r"(title|head)", layout) and not not_title(text):
return len(BULLET_PATTERN[bull])+1, text
return len(BULLET_PATTERN[bull]) + 1, text
else:
return len(BULLET_PATTERN[bull])+2, text
return len(BULLET_PATTERN[bull]) + 2, text
level_set = set()
lines = []
for section in sections:
@@ -812,8 +814,8 @@ def tree_merge(bull, sections, depth):
return [element for element in root.get_tree() if element]
def hierarchical_merge(bull, sections, depth):
def hierarchical_merge(bull, sections, depth):
if not sections or bull < 0:
return []
if isinstance(sections[0], type("")):
@@ -922,10 +924,10 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。
if tnum < 8:
pos = ""
# Ensure that the length of the merged chunk does not exceed chunk_token_num
if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent) / 100.:
if cks:
overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
if t.find(pos) < 0:
t += pos
cks.append(t)
@@ -957,7 +959,7 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。
return cks
for sec, pos in sections:
add_chunk("\n"+sec, pos)
add_chunk("\n" + sec, pos)
return cks
@@ -978,10 +980,10 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
if tnum < 8:
pos = ""
# Ensure that the length of the merged chunk does not exceed chunk_token_num
if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent) / 100.:
if cks:
overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
if t.find(pos) < 0:
t += pos
cks.append(t)
@@ -1025,9 +1027,9 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
if isinstance(text, tuple):
text_str = text[0]
text_pos = text[1] if len(text) > 1 else ""
add_chunk("\n"+text_str, image, text_pos)
add_chunk("\n" + text_str, image, text_pos)
else:
add_chunk("\n"+text, image)
add_chunk("\n" + text, image)
return cks, result_images
@@ -1042,7 +1044,7 @@ def docx_question_level(p, bull=-1):
for j, title in enumerate(BULLET_PATTERN[bull]):
if re.match(title, txt):
return j + 1, txt
return len(BULLET_PATTERN[bull])+1, txt
return len(BULLET_PATTERN[bull]) + 1, txt
def concat_img(img1, img2):
@@ -1211,7 +1213,7 @@ class Node:
child = node.get_children()
if level == 0 and texts:
tree_list.append("\n".join(titles+texts))
tree_list.append("\n".join(titles + texts))
# Titles within configured depth are accumulated into the current path
if 1 <= level <= self.depth: