Refactor (#537)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
2025-12-08 20:42:30 +08:00 · 2024-04-25 14:14:28 +08:00
parent cf9b554c3a
commit 66f8d35632
14 changed files with 124 additions and 34 deletions
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -37,8 +37,8 @@ class HuParser:
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        try:
            model_dir = os.path.join(
-                    get_project_base_directory(),
-                    "rag/res/deepdoc")
+                get_project_base_directory(),
+                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception as e:
@@ -49,7 +49,6 @@ class HuParser:
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

-
        self.page_from = 0
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
@@ -76,7 +75,7 @@ class HuParser:
    def _y_dis(
            self, a, b):
        return (
-            b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
+                       b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        proj_patt = [
@@ -99,9 +98,9 @@ class HuParser:
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        tks_all = up["text"][-LEN:].strip() \
-            + (" " if re.match(r"[a-zA-Z0-9]+",
-                               up["text"][-1] + down["text"][0]) else "") \
-            + down["text"][:LEN].strip()
+                  + (" " if re.match(r"[a-zA-Z0-9]+",
+                                     up["text"][-1] + down["text"][0]) else "") \
+                  + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),
@@ -123,7 +122,7 @@ class HuParser:
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\(（][^\)）]+$", up["text"])
-            and re.search(r"[\)）]", down["text"]) else False,
+                    and re.search(r"[\)）]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
@@ -185,7 +184,7 @@ class HuParser:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
-                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
+                                         tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
@@ -297,7 +296,7 @@ class HuParser:
        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
-                    ZM, b["top"] * ZM, b["bottom"] * ZM
+                                         ZM, b["top"] * ZM, b["bottom"] * ZM
                b["text"] = self.ocr.recognize(np.array(img),
                                               np.array([[left, top], [right, top], [right, bott], [left, bott]],
                                                        dtype=np.float32))
@@ -622,7 +621,7 @@ class HuParser:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
-                "-" + str(self.boxes[i]["layoutno"])
+                      "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
                                                                                                      "title",
                                                                                                      "figure caption",
@@ -975,6 +974,7 @@ class HuParser:
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)
+
            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
@@ -984,7 +984,7 @@ class HuParser:
        logging.info("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
-            range(len(self.page_chars))]
+                           range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
@@ -1012,9 +1012,9 @@ class HuParser:
                j += 1

            self.__ocr(i + 1, img, chars, zoomin)
-            #if callback:
-            #    callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
-        #print("OCR:", timer()-st)
+            if callback and i % 6 == 5:
+                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
+        # print("OCR:", timer()-st)

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
@@ -1050,7 +1050,7 @@ class HuParser:
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
-                        left, right, top, bottom))
+                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
@@ -1076,7 +1076,7 @@ class HuParser:
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right *
                                               ZM, min(
-                                                   bottom, self.page_images[pns[0]].size[1])
+                    bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1: