Fix typo in code (#8327)

### What problem does this PR solve?

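Fix spelling mistakes in identifiers and comments (e.g. `threashold` → `threshold`, `wraper` → `wrapper`, `overlaped` → `overlapped`, `nessesarily` → `necessarily`).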

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2025-06-18 09:41:09 +08:00
committed by GitHub
parent 09b7ac26ad
commit 4a2ff633e0
15 changed files with 45 additions and 45 deletions

View File

@ -69,7 +69,7 @@ class RAGFlowDocxParser:
max_type = max(max_type.items(), key=lambda x: x[1])[0]
colnm = len(df.iloc[0, :])
hdrows = [0] # header is not nessesarily appear in the first line
hdrows = [0] # the header does not necessarily appear in the first line
if max_type == "Nu":
for r in range(1, len(df)):
tys = Counter([blockType(str(df.iloc[r, j]))

View File

@ -21,7 +21,7 @@ from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
from rag.prompts import vision_llm_figure_describe_prompt
def vision_figure_parser_figure_data_wraper(figures_data_without_positions):
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
return [
(
(figure_data[1], [figure_data[0]]),

View File

@ -180,13 +180,13 @@ class RAGFlowPdfParser:
return fea
@staticmethod
def sort_X_by_page(arr, threashold):
def sort_X_by_page(arr, threshold):
# sort by page number first, then x0, then top
arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
for i in range(len(arr) - 1):
for j in range(i, -1, -1):
# restore the top-to-bottom order for boxes whose x0 differs by less than the threshold
if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \
and arr[j + 1]["top"] < arr[j]["top"] \
and arr[j + 1]["page_number"] == arr[j]["page_number"]:
tmp = arr[j]
@ -264,13 +264,13 @@ class RAGFlowPdfParser:
for b in self.boxes:
if b.get("layout_type", "") != "table":
continue
ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
ii = Recognizer.find_overlapped_with_threshold(b, rows, thr=0.3)
if ii is not None:
b["R"] = ii
b["R_top"] = rows[ii]["top"]
b["R_bott"] = rows[ii]["bottom"]
ii = Recognizer.find_overlapped_with_threashold(
ii = Recognizer.find_overlapped_with_threshold(
b, headers, thr=0.3)
if ii is not None:
b["H_top"] = headers[ii]["top"]
@ -285,7 +285,7 @@ class RAGFlowPdfParser:
b["C_left"] = clmns[ii]["x0"]
b["C_right"] = clmns[ii]["x1"]
ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
ii = Recognizer.find_overlapped_with_threshold(b, spans, thr=0.3)
if ii is not None:
b["H_top"] = spans[ii]["top"]
b["H_bott"] = spans[ii]["bottom"]

View File

@ -106,7 +106,7 @@ class LayoutRecognizer(Recognizer):
bxs.pop(i)
continue
ii = self.find_overlapped_with_threashold(bxs[i], lts_,
ii = self.find_overlapped_with_threshold(bxs[i], lts_,
thr=0.4)
if ii is None: # belong to nothing
bxs[i]["layout_type"] = ""

View File

@ -52,20 +52,20 @@ class Recognizer:
self.label_list = label_list
@staticmethod
def sort_Y_firstly(arr, threashold):
def sort_Y_firstly(arr, threshold):
def cmp(c1, c2):
diff = c1["top"] - c2["top"]
if abs(diff) < threashold:
if abs(diff) < threshold:
diff = c1["x0"] - c2["x0"]
return diff
arr = sorted(arr, key=cmp_to_key(cmp))
return arr
@staticmethod
def sort_X_firstly(arr, threashold):
def sort_X_firstly(arr, threshold):
def cmp(c1, c2):
diff = c1["x0"] - c2["x0"]
if abs(diff) < threashold:
if abs(diff) < threshold:
diff = c1["top"] - c2["top"]
return diff
arr = sorted(arr, key=cmp_to_key(cmp))
@ -239,15 +239,15 @@ class Recognizer:
e -= 1
break
max_overlaped_i, max_overlaped = None, 0
max_overlapped_i, max_overlapped = None, 0
for i in range(s, e):
ov = Recognizer.overlapped_area(bxs[i], box)
if ov <= max_overlaped:
if ov <= max_overlapped:
continue
max_overlaped_i = i
max_overlaped = ov
max_overlapped_i = i
max_overlapped = ov
return max_overlaped_i
return max_overlapped_i
@staticmethod
def find_horizontally_tightest_fit(box, boxes):
@ -264,7 +264,7 @@ class Recognizer:
return min_i
@staticmethod
def find_overlapped_with_threashold(box, boxes, thr=0.3):
def find_overlapped_with_threshold(box, boxes, thr=0.3):
if not boxes:
return
max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0

View File

@ -84,13 +84,13 @@ def get_table_html(img, tb_cpns, ocr):
clmns = LayoutRecognizer.layouts_cleanup(boxes, clmns, 5, 0.5)
for b in boxes:
ii = LayoutRecognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
ii = LayoutRecognizer.find_overlapped_with_threshold(b, rows, thr=0.3)
if ii is not None:
b["R"] = ii
b["R_top"] = rows[ii]["top"]
b["R_bott"] = rows[ii]["bottom"]
ii = LayoutRecognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
ii = LayoutRecognizer.find_overlapped_with_threshold(b, headers, thr=0.3)
if ii is not None:
b["H_top"] = headers[ii]["top"]
b["H_bott"] = headers[ii]["bottom"]
@ -104,7 +104,7 @@ def get_table_html(img, tb_cpns, ocr):
b["C_left"] = clmns[ii]["x0"]
b["C_right"] = clmns[ii]["x1"]
ii = LayoutRecognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
ii = LayoutRecognizer.find_overlapped_with_threshold(b, spans, thr=0.3)
if ii is not None:
b["H_top"] = spans[ii]["top"]
b["H_bott"] = spans[ii]["bottom"]