mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Refactor (#537)
### What problem does this PR solve? ### Type of change - [x] Refactoring
This commit is contained in:
@ -37,8 +37,8 @@ class HuParser:
|
||||
self.updown_cnt_mdl.set_param({"device": "cuda"})
|
||||
try:
|
||||
model_dir = os.path.join(
|
||||
get_project_base_directory(),
|
||||
"rag/res/deepdoc")
|
||||
get_project_base_directory(),
|
||||
"rag/res/deepdoc")
|
||||
self.updown_cnt_mdl.load_model(os.path.join(
|
||||
model_dir, "updown_concat_xgb.model"))
|
||||
except Exception as e:
|
||||
@ -49,7 +49,6 @@ class HuParser:
|
||||
self.updown_cnt_mdl.load_model(os.path.join(
|
||||
model_dir, "updown_concat_xgb.model"))
|
||||
|
||||
|
||||
self.page_from = 0
|
||||
"""
|
||||
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
||||
@ -76,7 +75,7 @@ class HuParser:
|
||||
def _y_dis(
|
||||
self, a, b):
|
||||
return (
|
||||
b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
|
||||
b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
|
||||
|
||||
def _match_proj(self, b):
|
||||
proj_patt = [
|
||||
@ -99,9 +98,9 @@ class HuParser:
|
||||
tks_down = huqie.qie(down["text"][:LEN]).split(" ")
|
||||
tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
|
||||
tks_all = up["text"][-LEN:].strip() \
|
||||
+ (" " if re.match(r"[a-zA-Z0-9]+",
|
||||
up["text"][-1] + down["text"][0]) else "") \
|
||||
+ down["text"][:LEN].strip()
|
||||
+ (" " if re.match(r"[a-zA-Z0-9]+",
|
||||
up["text"][-1] + down["text"][0]) else "") \
|
||||
+ down["text"][:LEN].strip()
|
||||
tks_all = huqie.qie(tks_all).split(" ")
|
||||
fea = [
|
||||
up.get("R", -1) == down.get("R", -1),
|
||||
@ -123,7 +122,7 @@ class HuParser:
|
||||
True if re.search(r"[,,][^。.]+$", up["text"]) else False,
|
||||
True if re.search(r"[,,][^。.]+$", up["text"]) else False,
|
||||
True if re.search(r"[\((][^\))]+$", up["text"])
|
||||
and re.search(r"[\))]", down["text"]) else False,
|
||||
and re.search(r"[\))]", down["text"]) else False,
|
||||
self._match_proj(down),
|
||||
True if re.match(r"[A-Z]", down["text"]) else False,
|
||||
True if re.match(r"[A-Z]", up["text"][-1]) else False,
|
||||
@ -185,7 +184,7 @@ class HuParser:
|
||||
continue
|
||||
for tb in tbls: # for table
|
||||
left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
|
||||
tb["x1"] + MARGIN, tb["bottom"] + MARGIN
|
||||
tb["x1"] + MARGIN, tb["bottom"] + MARGIN
|
||||
left *= ZM
|
||||
top *= ZM
|
||||
right *= ZM
|
||||
@ -297,7 +296,7 @@ class HuParser:
|
||||
for b in bxs:
|
||||
if not b["text"]:
|
||||
left, right, top, bott = b["x0"] * ZM, b["x1"] * \
|
||||
ZM, b["top"] * ZM, b["bottom"] * ZM
|
||||
ZM, b["top"] * ZM, b["bottom"] * ZM
|
||||
b["text"] = self.ocr.recognize(np.array(img),
|
||||
np.array([[left, top], [right, top], [right, bott], [left, bott]],
|
||||
dtype=np.float32))
|
||||
@ -622,7 +621,7 @@ class HuParser:
|
||||
i += 1
|
||||
continue
|
||||
lout_no = str(self.boxes[i]["page_number"]) + \
|
||||
"-" + str(self.boxes[i]["layoutno"])
|
||||
"-" + str(self.boxes[i]["layoutno"])
|
||||
if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
|
||||
"title",
|
||||
"figure caption",
|
||||
@ -975,6 +974,7 @@ class HuParser:
|
||||
self.outlines.append((a["/Title"], depth))
|
||||
continue
|
||||
dfs(a, depth + 1)
|
||||
|
||||
dfs(outlines, 0)
|
||||
except Exception as e:
|
||||
logging.warning(f"Outlines exception: {e}")
|
||||
@ -984,7 +984,7 @@ class HuParser:
|
||||
logging.info("Images converted.")
|
||||
self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
|
||||
random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
|
||||
range(len(self.page_chars))]
|
||||
range(len(self.page_chars))]
|
||||
if sum([1 if e else 0 for e in self.is_english]) > len(
|
||||
self.page_images) / 2:
|
||||
self.is_english = True
|
||||
@ -1012,9 +1012,9 @@ class HuParser:
|
||||
j += 1
|
||||
|
||||
self.__ocr(i + 1, img, chars, zoomin)
|
||||
#if callback:
|
||||
# callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
|
||||
#print("OCR:", timer()-st)
|
||||
if callback and i % 6 == 5:
|
||||
callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
|
||||
# print("OCR:", timer()-st)
|
||||
|
||||
if not self.is_english and not any(
|
||||
[c for c in self.page_chars]) and self.boxes:
|
||||
@ -1050,7 +1050,7 @@ class HuParser:
|
||||
left, right, top, bottom = float(left), float(
|
||||
right), float(top), float(bottom)
|
||||
poss.append(([int(p) - 1 for p in pn.split("-")],
|
||||
left, right, top, bottom))
|
||||
left, right, top, bottom))
|
||||
if not poss:
|
||||
if need_position:
|
||||
return None, None
|
||||
@ -1076,7 +1076,7 @@ class HuParser:
|
||||
self.page_images[pns[0]].crop((left * ZM, top * ZM,
|
||||
right *
|
||||
ZM, min(
|
||||
bottom, self.page_images[pns[0]].size[1])
|
||||
bottom, self.page_images[pns[0]].size[1])
|
||||
))
|
||||
)
|
||||
if 0 < ii < len(poss) - 1:
|
||||
|
||||
Reference in New Issue
Block a user