mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Fix typo in code (#8327)
### What problem does this PR solve?

Fix typo in code

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
@ -69,7 +69,7 @@ class RAGFlowDocxParser:
|
||||
max_type = max(max_type.items(), key=lambda x: x[1])[0]
|
||||
|
||||
colnm = len(df.iloc[0, :])
|
||||
hdrows = [0] # header is not nessesarily appear in the first line
|
||||
hdrows = [0] # header does not necessarily appear in the first line
|
||||
if max_type == "Nu":
|
||||
for r in range(1, len(df)):
|
||||
tys = Counter([blockType(str(df.iloc[r, j]))
|
||||
|
||||
@ -21,7 +21,7 @@ from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
|
||||
from rag.prompts import vision_llm_figure_describe_prompt
|
||||
|
||||
|
||||
def vision_figure_parser_figure_data_wraper(figures_data_without_positions):
|
||||
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
|
||||
return [
|
||||
(
|
||||
(figure_data[1], [figure_data[0]]),
|
||||
|
||||
@ -180,13 +180,13 @@ class RAGFlowPdfParser:
|
||||
return fea
|
||||
|
||||
@staticmethod
|
||||
def sort_X_by_page(arr, threashold):
|
||||
def sort_X_by_page(arr, threshold):
|
||||
# sort by page number first, then x0, then top
|
||||
arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
|
||||
for i in range(len(arr) - 1):
|
||||
for j in range(i, -1, -1):
|
||||
# restore the order using the threshold
|
||||
if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
|
||||
if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \
|
||||
and arr[j + 1]["top"] < arr[j]["top"] \
|
||||
and arr[j + 1]["page_number"] == arr[j]["page_number"]:
|
||||
tmp = arr[j]
|
||||
@ -264,13 +264,13 @@ class RAGFlowPdfParser:
|
||||
for b in self.boxes:
|
||||
if b.get("layout_type", "") != "table":
|
||||
continue
|
||||
ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
|
||||
ii = Recognizer.find_overlapped_with_threshold(b, rows, thr=0.3)
|
||||
if ii is not None:
|
||||
b["R"] = ii
|
||||
b["R_top"] = rows[ii]["top"]
|
||||
b["R_bott"] = rows[ii]["bottom"]
|
||||
|
||||
ii = Recognizer.find_overlapped_with_threashold(
|
||||
ii = Recognizer.find_overlapped_with_threshold(
|
||||
b, headers, thr=0.3)
|
||||
if ii is not None:
|
||||
b["H_top"] = headers[ii]["top"]
|
||||
@ -285,7 +285,7 @@ class RAGFlowPdfParser:
|
||||
b["C_left"] = clmns[ii]["x0"]
|
||||
b["C_right"] = clmns[ii]["x1"]
|
||||
|
||||
ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
|
||||
ii = Recognizer.find_overlapped_with_threshold(b, spans, thr=0.3)
|
||||
if ii is not None:
|
||||
b["H_top"] = spans[ii]["top"]
|
||||
b["H_bott"] = spans[ii]["bottom"]
|
||||
|
||||
@ -106,7 +106,7 @@ class LayoutRecognizer(Recognizer):
|
||||
bxs.pop(i)
|
||||
continue
|
||||
|
||||
ii = self.find_overlapped_with_threashold(bxs[i], lts_,
|
||||
ii = self.find_overlapped_with_threshold(bxs[i], lts_,
|
||||
thr=0.4)
|
||||
if ii is None: # belong to nothing
|
||||
bxs[i]["layout_type"] = ""
|
||||
|
||||
@ -52,20 +52,20 @@ class Recognizer:
|
||||
self.label_list = label_list
|
||||
|
||||
@staticmethod
|
||||
def sort_Y_firstly(arr, threashold):
|
||||
def sort_Y_firstly(arr, threshold):
|
||||
def cmp(c1, c2):
|
||||
diff = c1["top"] - c2["top"]
|
||||
if abs(diff) < threashold:
|
||||
if abs(diff) < threshold:
|
||||
diff = c1["x0"] - c2["x0"]
|
||||
return diff
|
||||
arr = sorted(arr, key=cmp_to_key(cmp))
|
||||
return arr
|
||||
|
||||
@staticmethod
|
||||
def sort_X_firstly(arr, threashold):
|
||||
def sort_X_firstly(arr, threshold):
|
||||
def cmp(c1, c2):
|
||||
diff = c1["x0"] - c2["x0"]
|
||||
if abs(diff) < threashold:
|
||||
if abs(diff) < threshold:
|
||||
diff = c1["top"] - c2["top"]
|
||||
return diff
|
||||
arr = sorted(arr, key=cmp_to_key(cmp))
|
||||
@ -239,15 +239,15 @@ class Recognizer:
|
||||
e -= 1
|
||||
break
|
||||
|
||||
max_overlaped_i, max_overlaped = None, 0
|
||||
max_overlapped_i, max_overlapped = None, 0
|
||||
for i in range(s, e):
|
||||
ov = Recognizer.overlapped_area(bxs[i], box)
|
||||
if ov <= max_overlaped:
|
||||
if ov <= max_overlapped:
|
||||
continue
|
||||
max_overlaped_i = i
|
||||
max_overlaped = ov
|
||||
max_overlapped_i = i
|
||||
max_overlapped = ov
|
||||
|
||||
return max_overlaped_i
|
||||
return max_overlapped_i
|
||||
|
||||
@staticmethod
|
||||
def find_horizontally_tightest_fit(box, boxes):
|
||||
@ -264,7 +264,7 @@ class Recognizer:
|
||||
return min_i
|
||||
|
||||
@staticmethod
|
||||
def find_overlapped_with_threashold(box, boxes, thr=0.3):
|
||||
def find_overlapped_with_threshold(box, boxes, thr=0.3):
|
||||
if not boxes:
|
||||
return
|
||||
max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0
|
||||
|
||||
@ -84,13 +84,13 @@ def get_table_html(img, tb_cpns, ocr):
|
||||
clmns = LayoutRecognizer.layouts_cleanup(boxes, clmns, 5, 0.5)
|
||||
|
||||
for b in boxes:
|
||||
ii = LayoutRecognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
|
||||
ii = LayoutRecognizer.find_overlapped_with_threshold(b, rows, thr=0.3)
|
||||
if ii is not None:
|
||||
b["R"] = ii
|
||||
b["R_top"] = rows[ii]["top"]
|
||||
b["R_bott"] = rows[ii]["bottom"]
|
||||
|
||||
ii = LayoutRecognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
|
||||
ii = LayoutRecognizer.find_overlapped_with_threshold(b, headers, thr=0.3)
|
||||
if ii is not None:
|
||||
b["H_top"] = headers[ii]["top"]
|
||||
b["H_bott"] = headers[ii]["bottom"]
|
||||
@ -104,7 +104,7 @@ def get_table_html(img, tb_cpns, ocr):
|
||||
b["C_left"] = clmns[ii]["x0"]
|
||||
b["C_right"] = clmns[ii]["x1"]
|
||||
|
||||
ii = LayoutRecognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
|
||||
ii = LayoutRecognizer.find_overlapped_with_threshold(b, spans, thr=0.3)
|
||||
if ii is not None:
|
||||
b["H_top"] = spans[ii]["top"]
|
||||
b["H_bott"] = spans[ii]["bottom"]
|
||||
|
||||
Reference in New Issue
Block a user