### What problem does this PR solve?

### Type of change

- [x] Refactoring
This commit is contained in:
KevinHuSh
2024-04-25 14:14:28 +08:00
committed by GitHub
parent cf9b554c3a
commit 66f8d35632
14 changed files with 124 additions and 34 deletions

View File

@ -37,8 +37,8 @@ class HuParser:
self.updown_cnt_mdl.set_param({"device": "cuda"})
try:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
get_project_base_directory(),
"rag/res/deepdoc")
self.updown_cnt_mdl.load_model(os.path.join(
model_dir, "updown_concat_xgb.model"))
except Exception as e:
@ -49,7 +49,6 @@ class HuParser:
self.updown_cnt_mdl.load_model(os.path.join(
model_dir, "updown_concat_xgb.model"))
self.page_from = 0
"""
If you have trouble downloading HuggingFace models, -_^ this might help!!
@ -76,7 +75,7 @@ class HuParser:
def _y_dis(
self, a, b):
return (
b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
def _match_proj(self, b):
proj_patt = [
@ -99,9 +98,9 @@ class HuParser:
tks_down = huqie.qie(down["text"][:LEN]).split(" ")
tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
tks_all = up["text"][-LEN:].strip() \
+ (" " if re.match(r"[a-zA-Z0-9]+",
up["text"][-1] + down["text"][0]) else "") \
+ down["text"][:LEN].strip()
+ (" " if re.match(r"[a-zA-Z0-9]+",
up["text"][-1] + down["text"][0]) else "") \
+ down["text"][:LEN].strip()
tks_all = huqie.qie(tks_all).split(" ")
fea = [
up.get("R", -1) == down.get("R", -1),
@ -123,7 +122,7 @@ class HuParser:
True if re.search(r"[,][^。.]+$", up["text"]) else False,
True if re.search(r"[,][^。.]+$", up["text"]) else False,
True if re.search(r"[\(][^\)]+$", up["text"])
and re.search(r"[\)]", down["text"]) else False,
and re.search(r"[\)]", down["text"]) else False,
self._match_proj(down),
True if re.match(r"[A-Z]", down["text"]) else False,
True if re.match(r"[A-Z]", up["text"][-1]) else False,
@ -185,7 +184,7 @@ class HuParser:
continue
for tb in tbls: # for table
left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
tb["x1"] + MARGIN, tb["bottom"] + MARGIN
tb["x1"] + MARGIN, tb["bottom"] + MARGIN
left *= ZM
top *= ZM
right *= ZM
@ -297,7 +296,7 @@ class HuParser:
for b in bxs:
if not b["text"]:
left, right, top, bott = b["x0"] * ZM, b["x1"] * \
ZM, b["top"] * ZM, b["bottom"] * ZM
ZM, b["top"] * ZM, b["bottom"] * ZM
b["text"] = self.ocr.recognize(np.array(img),
np.array([[left, top], [right, top], [right, bott], [left, bott]],
dtype=np.float32))
@ -622,7 +621,7 @@ class HuParser:
i += 1
continue
lout_no = str(self.boxes[i]["page_number"]) + \
"-" + str(self.boxes[i]["layoutno"])
"-" + str(self.boxes[i]["layoutno"])
if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
"title",
"figure caption",
@ -975,6 +974,7 @@ class HuParser:
self.outlines.append((a["/Title"], depth))
continue
dfs(a, depth + 1)
dfs(outlines, 0)
except Exception as e:
logging.warning(f"Outlines exception: {e}")
@ -984,7 +984,7 @@ class HuParser:
logging.info("Images converted.")
self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
range(len(self.page_chars))]
range(len(self.page_chars))]
if sum([1 if e else 0 for e in self.is_english]) > len(
self.page_images) / 2:
self.is_english = True
@ -1012,9 +1012,9 @@ class HuParser:
j += 1
self.__ocr(i + 1, img, chars, zoomin)
#if callback:
# callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
#print("OCR:", timer()-st)
if callback and i % 6 == 5:
callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
# print("OCR:", timer()-st)
if not self.is_english and not any(
[c for c in self.page_chars]) and self.boxes:
@ -1050,7 +1050,7 @@ class HuParser:
left, right, top, bottom = float(left), float(
right), float(top), float(bottom)
poss.append(([int(p) - 1 for p in pn.split("-")],
left, right, top, bottom))
left, right, top, bottom))
if not poss:
if need_position:
return None, None
@ -1076,7 +1076,7 @@ class HuParser:
self.page_images[pns[0]].crop((left * ZM, top * ZM,
right *
ZM, min(
bottom, self.page_images[pns[0]].size[1])
bottom, self.page_images[pns[0]].size[1])
))
)
if 0 < ii < len(poss) - 1: