Feat: add splitter (#10161)

### What problem does this PR solve? ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com>
2025-12-26 17:16:52 +08:00 · 2025-09-19 10:15:19 +08:00
parent f9c7404bee
commit a1b947ffd6
81 changed files with 3083 additions and 799 deletions
--- a/deepdoc/vision/init.py
+++ b/deepdoc/vision/init.py
@ -16,24 +16,28 @@
 import io
 import sys
 import threading
+
 import pdfplumber

 from .ocr import OCR
 from .recognizer import Recognizer
+from .layout_recognizer import AscendLayoutRecognizer
 from .layout_recognizer import LayoutRecognizer4YOLOv10 as LayoutRecognizer
 from .table_structure_recognizer import TableStructureRecognizer

-
 LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
 if LOCK_KEY_pdfplumber not in sys.modules:
    sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()


 def init_in_out(args):
-    from PIL import Image
    import os
    import traceback
+
+    from PIL import Image
+
    from api.utils.file_utils import traversal_files
+
    images = []
    outputs = []

@ -44,8 +48,7 @@ def init_in_out(args):
        nonlocal outputs, images
        with sys.modules[LOCK_KEY_pdfplumber]:
            pdf = pdfplumber.open(fnm)
-            images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
-                                enumerate(pdf.pages)]
+            images = [p.to_image(resolution=72 * zoomin).annotated for i, p in enumerate(pdf.pages)]

        for i, page in enumerate(images):
            outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
@ -57,10 +60,10 @@ def init_in_out(args):
            pdf_pages(fnm)
            return
        try:
-            fp = open(fnm, 'rb')
+            fp = open(fnm, "rb")
            binary = fp.read()
            fp.close()
-            images.append(Image.open(io.BytesIO(binary)).convert('RGB'))
+            images.append(Image.open(io.BytesIO(binary)).convert("RGB"))
            outputs.append(os.path.split(fnm)[-1])
        except Exception:
            traceback.print_exc()
@ -81,6 +84,7 @@ __all__ = [
    "OCR",
    "Recognizer",
    "LayoutRecognizer",
+    "AscendLayoutRecognizer",
    "TableStructureRecognizer",
    "init_in_out",
 ]
--- a/deepdoc/vision/layout_recognizer.py
+++ b/deepdoc/vision/layout_recognizer.py
@ -14,6 +14,8 @@
 #  limitations under the License.
 #

+import logging
+import math
 import os
 import re
 from collections import Counter
@ -45,28 +47,22 @@ class LayoutRecognizer(Recognizer):

    def __init__(self, domain):
        try:
-            model_dir = os.path.join(
-                get_project_base_directory(),
-                "rag/res/deepdoc")
+            model_dir = os.path.join(get_project_base_directory(), "rag/res/deepdoc")
            super().__init__(self.labels, domain, model_dir)
        except Exception:
-            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
-                                          local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
-                                          local_dir_use_symlinks=False)
+            model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc", local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"), local_dir_use_symlinks=False)
            super().__init__(self.labels, domain, model_dir)

        self.garbage_layouts = ["footer", "header", "reference"]
        self.client = None
        if os.environ.get("TENSORRT_DLA_SVR"):
            from deepdoc.vision.dla_cli import DLAClient
+
            self.client = DLAClient(os.environ["TENSORRT_DLA_SVR"])

    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
        def __is_garbage(b):
-            patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$",
-                    r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
-                    "\\(cid *: *[0-9]+ *\\)"
-                    ]
+            patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}", "\\(cid *: *[0-9]+ *\\)"]
            return any([re.search(p, b["text"]) for p in patt])

        if self.client:
@ -82,18 +78,23 @@ class LayoutRecognizer(Recognizer):
        page_layout = []
        for pn, lts in enumerate(layouts):
            bxs = ocr_res[pn]
-            lts = [{"type": b["type"],
+            lts = [
+                {
+                    "type": b["type"],
                    "score": float(b["score"]),
-                    "x0": b["bbox"][0] / scale_factor, "x1": b["bbox"][2] / scale_factor,
-                    "top": b["bbox"][1] / scale_factor, "bottom": b["bbox"][-1] / scale_factor,
+                    "x0": b["bbox"][0] / scale_factor,
+                    "x1": b["bbox"][2] / scale_factor,
+                    "top": b["bbox"][1] / scale_factor,
+                    "bottom": b["bbox"][-1] / scale_factor,
                    "page_number": pn,
-                    } for b in lts if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts]
-            lts = self.sort_Y_firstly(lts, np.mean(
-                [lt["bottom"] - lt["top"] for lt in lts]) / 2)
+                }
+                for b in lts
+                if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts
+            ]
+            lts = self.sort_Y_firstly(lts, np.mean([lt["bottom"] - lt["top"] for lt in lts]) / 2)
            lts = self.layouts_cleanup(bxs, lts)
            page_layout.append(lts)

-            # Tag layout type, layouts are ready
            def findLayout(ty):
                nonlocal bxs, lts, self
                lts_ = [lt for lt in lts if lt["type"] == ty]
@ -106,21 +107,17 @@ class LayoutRecognizer(Recognizer):
                        bxs.pop(i)
                        continue

-                    ii = self.find_overlapped_with_threshold(bxs[i], lts_,
-                                                              thr=0.4)
-                    if ii is None:  # belong to nothing
+                    ii = self.find_overlapped_with_threshold(bxs[i], lts_, thr=0.4)
+                    if ii is None:
                        bxs[i]["layout_type"] = ""
                        i += 1
                        continue
                    lts_[ii]["visited"] = True
                    keep_feats = [
-                        lts_[
-                            ii]["type"] == "footer" and bxs[i]["bottom"] < image_list[pn].size[1] * 0.9 / scale_factor,
-                        lts_[
-                            ii]["type"] == "header" and bxs[i]["top"] > image_list[pn].size[1] * 0.1 / scale_factor,
+                        lts_[ii]["type"] == "footer" and bxs[i]["bottom"] < image_list[pn].size[1] * 0.9 / scale_factor,
+                        lts_[ii]["type"] == "header" and bxs[i]["top"] > image_list[pn].size[1] * 0.1 / scale_factor,
                    ]
-                    if drop and lts_[
-                            ii]["type"] in self.garbage_layouts and not any(keep_feats):
+                    if drop and lts_[ii]["type"] in self.garbage_layouts and not any(keep_feats):
                        if lts_[ii]["type"] not in garbages:
                            garbages[lts_[ii]["type"]] = []
                        garbages[lts_[ii]["type"]].append(bxs[i]["text"])
@ -128,17 +125,14 @@ class LayoutRecognizer(Recognizer):
                        continue

                    bxs[i]["layoutno"] = f"{ty}-{ii}"
-                    bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[
-                        ii]["type"] != "equation" else "figure"
+                    bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"] != "equation" else "figure"
                    i += 1

-            for lt in ["footer", "header", "reference", "figure caption",
-                       "table caption", "title", "table", "text", "figure", "equation"]:
+            for lt in ["footer", "header", "reference", "figure caption", "table caption", "title", "table", "text", "figure", "equation"]:
                findLayout(lt)

            # add box to figure layouts which has not text box
-            for i, lt in enumerate(
-                    [lt for lt in lts if lt["type"] in ["figure", "equation"]]):
+            for i, lt in enumerate([lt for lt in lts if lt["type"] in ["figure", "equation"]]):
                if lt.get("visited"):
                    continue
                lt = deepcopy(lt)
@ -206,13 +200,11 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
            top, bottom = int(round(dh - 0.1)) if self.center else 0, int(round(dh + 0.1))
            left, right = int(round(dw - 0.1)) if self.center else 0, int(round(dw + 0.1))
-            img = cv2.copyMakeBorder(
-                img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
-            )  # add border
+            img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))  # add border
            img /= 255.0
            img = img.transpose(2, 0, 1)
            img = img[np.newaxis, :, :, :].astype(np.float32)
-            inputs.append({self.input_names[0]: img, "scale_factor": [shape[1]/ww, shape[0]/hh, dw, dh]})
+            inputs.append({self.input_names[0]: img, "scale_factor": [shape[1] / ww, shape[0] / hh, dw, dh]})

        return inputs

@ -230,8 +222,7 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
        boxes[:, 2] -= inputs["scale_factor"][2]
        boxes[:, 1] -= inputs["scale_factor"][3]
        boxes[:, 3] -= inputs["scale_factor"][3]
-        input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0],
-                                inputs["scale_factor"][1]])
+        input_shape = np.array([inputs["scale_factor"][0], inputs["scale_factor"][1], inputs["scale_factor"][0], inputs["scale_factor"][1]])
        boxes = np.multiply(boxes, input_shape, dtype=np.float32)

        unique_class_ids = np.unique(class_ids)
@ -243,8 +234,223 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
            class_keep_boxes = nms(class_boxes, class_scores, 0.45)
            indices.extend(class_indices[class_keep_boxes])

-        return [{
-            "type": self.label_list[class_ids[i]].lower(),
-            "bbox": [float(t) for t in boxes[i].tolist()],
-            "score": float(scores[i])
-        } for i in indices]
+        return [{"type": self.label_list[class_ids[i]].lower(), "bbox": [float(t) for t in boxes[i].tolist()], "score": float(scores[i])} for i in indices]
+
+
+class AscendLayoutRecognizer(Recognizer):
+    """Layout recognizer that runs a ``<domain>.om`` model through ais_bench's InferSession.
+
+    Instances are callable: given page images and the OCR boxes of each page,
+    they detect layout regions, tag every OCR box with the region it overlaps,
+    and optionally drop boxes that fall into header/footer/reference regions.
+    """
+
+    # Class-id -> label table used by postprocess(); labels are lower-cased on output.
+    # NOTE(review): "Table caption" and "Figure caption" each appear twice and there is
+    # no "Header"/"Footer" entry although __call__ looks for those types — confirm the
+    # order against the class map of the deployed .om model.
+    labels = [
+        "title",
+        "Text",
+        "Reference",
+        "Figure",
+        "Figure caption",
+        "Table",
+        "Table caption",
+        "Table caption",
+        "Equation",
+        "Figure caption",
+    ]
+
+    def __init__(self, domain):
+        """Open an inference session for ``rag/res/deepdoc/<domain>.om``.
+
+        The device is taken from the ``ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID``
+        environment variable (default 0).
+
+        Raises:
+            ValueError: if the model file does not exist.
+        """
+        # Imported lazily so the module can be loaded without ais_bench installed.
+        from ais_bench.infer.interface import InferSession
+
+        # NOTE(review): Recognizer.__init__ is intentionally(?) not called here — confirm
+        # that no inherited method used below relies on state it would set up.
+        model_dir = os.path.join(get_project_base_directory(), "rag/res/deepdoc")
+        model_file_path = os.path.join(model_dir, domain + ".om")
+
+        if not os.path.exists(model_file_path):
+            raise ValueError(f"Model file not found: {model_file_path}")
+
+        device_id = int(os.getenv("ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID", 0))
+        self.session = InferSession(device_id=device_id, model_path=model_file_path)
+        self.input_shape = self.session.get_inputs()[0].shape[2:4]  # H,W
+        self.garbage_layouts = ["footer", "header", "reference"]
+
+    def preprocess(self, image_list):
+        """Letterbox every image to the model input size and convert it to a 1xCxHxW float32 tensor.
+
+        Args:
+            image_list: images as numpy arrays (``img.shape[:2]`` is read as height, width).
+
+        Returns:
+            One dict per image with the tensor (``"image"``), the per-axis factors that
+            map the resized image back to the original (``"scale_factor"`` as [x, y]),
+            the padding added on each side (``"pad"`` as [dw, dh]) and ``"orig_shape"``.
+        """
+        inputs = []
+        H, W = self.input_shape
+        for img in image_list:
+            h, w = img.shape[:2]
+            # NOTE(review): the channel swap assumes BGR input; arrays built from PIL
+            # images in __call__ would already be RGB — confirm what the model expects.
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
+
+            # Uniform scale that makes the image fit inside the model input.
+            r = min(H / h, W / w)
+            new_unpad = (int(round(w * r)), int(round(h * r)))
+            dw, dh = (W - new_unpad[0]) / 2.0, (H - new_unpad[1]) / 2.0
+
+            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
+            # The -0.1/+0.1 offsets split an odd amount of padding between the two sides.
+            top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+            left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+            img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))
+
+            img /= 255.0
+            img = img.transpose(2, 0, 1)[np.newaxis, :, :, :].astype(np.float32)
+
+            inputs.append(
+                {
+                    "image": img,
+                    "scale_factor": [w / new_unpad[0], h / new_unpad[1]],
+                    "pad": [dw, dh],
+                    "orig_shape": [h, w],
+                }
+            )
+        return inputs
+
+    def postprocess(self, boxes, inputs, thr=0.25):
+        """Turn one raw model output into detections in original-image coordinates.
+
+        Args:
+            boxes: model output; after squeezing it must have 6 columns,
+                ``[x1, y1, x2, y2, score, cls]``.
+            inputs: the dict produced by preprocess() for the same image.
+            thr: minimum score a detection needs to be kept.
+
+        Returns:
+            A list of ``{"type", "bbox", "score"}`` dicts; empty when nothing passes ``thr``.
+
+        Raises:
+            ValueError: if the squeezed output does not have 6 columns.
+        """
+        arr = np.squeeze(boxes)
+        if arr.ndim == 1:
+            # A single detection squeezes down to a vector; restore the row axis.
+            arr = arr.reshape(1, -1)
+
+        if arr.shape[1] != 6:
+            raise ValueError(f"Unexpected output shape: {arr.shape}")
+
+        arr = arr[arr[:, 4] >= thr]
+        if arr.size == 0:
+            return []
+        xyxy = arr[:, :4].astype(np.float32)
+        scores = arr[:, 4].astype(np.float32)
+        cls_ids = arr[:, 5].astype(np.int32)
+
+        # Undo the letterbox: remove the padding (when recorded), then rescale.
+        if "pad" in inputs:
+            dw, dh = inputs["pad"]
+            xyxy[:, [0, 2]] -= dw
+            xyxy[:, [1, 3]] -= dh
+        sx, sy = inputs["scale_factor"]
+        xyxy *= np.array([sx, sy, sx, sy], dtype=np.float32)
+
+        # Non-maximum suppression is applied independently within each class.
+        keep_indices = []
+        for c in np.unique(cls_ids):
+            idx = np.where(cls_ids == c)[0]
+            k = nms(xyxy[idx], scores[idx], 0.45)
+            keep_indices.extend(idx[k])
+
+        results = []
+        for i in keep_indices:
+            cid = int(cls_ids[i])
+            # Class ids outside the label table are silently ignored.
+            if 0 <= cid < len(self.labels):
+                results.append({"type": self.labels[cid].lower(), "bbox": [float(t) for t in xyxy[i].tolist()], "score": float(scores[i])})
+        return results
+
+    def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
+        """Detect page layouts and tag the OCR boxes of every page with them.
+
+        Args:
+            image_list: one image per page (numpy arrays, or objects ``np.array`` can convert).
+            ocr_res: one list of OCR box dicts per page; the lists are modified in place.
+            scale_factor: divisor applied to detected coordinates before they are
+                compared with the OCR boxes.
+            thr: detection score threshold (a floor of 0.08 is applied).
+            batch_size: number of pages preprocessed together.
+            drop: when true, OCR boxes inside header/footer/reference regions are removed.
+
+        Returns:
+            ``(boxes, page_layout)``: the tagged OCR boxes of all pages in one flat
+            list, and the cleaned-up layout regions per page.
+        """
+        assert len(image_list) == len(ocr_res)
+
+        images = [np.array(im) if not isinstance(im, np.ndarray) else im for im in image_list]
+        layouts_all_pages = []  # list of list[{"type","score","bbox":[x1,y1,x2,y2]}]
+
+        conf_thr = max(thr, 0.08)
+
+        batch_loop_cnt = math.ceil(float(len(images)) / batch_size)
+        for bi in range(batch_loop_cnt):
+            s = bi * batch_size
+            e = min((bi + 1) * batch_size, len(images))
+            batch_images = images[s:e]
+
+            inputs_list = self.preprocess(batch_images)
+            logging.debug("preprocess done")
+
+            for ins in inputs_list:
+                feeds = [ins["image"]]
+                out_list = self.session.infer(feeds=feeds, mode="static")
+
+                # NOTE(review): one page entry is appended per model output, so this
+                # presumably relies on the model having exactly one output — confirm.
+                for out in out_list:
+                    lts = self.postprocess(out, ins, conf_thr)
+
+                    page_lts = []
+                    for b in lts:
+                        # Low-score detections are only discarded for garbage layout types.
+                        if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts:
+                            x0, y0, x1, y1 = b["bbox"]
+                            page_lts.append(
+                                {
+                                    "type": b["type"],
+                                    "score": float(b["score"]),
+                                    "x0": float(x0) / scale_factor,
+                                    "x1": float(x1) / scale_factor,
+                                    "top": float(y0) / scale_factor,
+                                    "bottom": float(y1) / scale_factor,
+                                    "page_number": len(layouts_all_pages),
+                                }
+                            )
+                    layouts_all_pages.append(page_lts)
+
+        # Compiled once per call instead of on every box.
+        garbage_patterns = [
+            re.compile(p)
+            for p in (r"^•+$", r"^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", r"^http://[^ ]{12,}", r"\(cid *: *[0-9]+ *\)")
+        ]
+
+        def _is_garbage_text(box):
+            text = box.get("text", "")
+            return any(p.search(text) for p in garbage_patterns)
+
+        boxes_out = []
+        page_layout = []
+        garbages = {}
+
+        for pn, lts in enumerate(layouts_all_pages):
+            if lts:
+                avg_h = np.mean([lt["bottom"] - lt["top"] for lt in lts])
+                lts = self.sort_Y_firstly(lts, avg_h / 2 if avg_h > 0 else 0)
+
+            bxs = ocr_res[pn]
+            lts = self.layouts_cleanup(bxs, lts)
+            page_layout.append(lts)
+
+            # Read the height from the ndarray copy: entries of image_list may be
+            # non-ndarray images (converted above), which have no ``.shape``.
+            page_height = images[pn].shape[0]
+
+            def _tag_layout(ty):
+                nonlocal bxs, lts
+                lts_of_ty = [lt for lt in lts if lt["type"] == ty]
+                i = 0
+                while i < len(bxs):
+                    if bxs[i].get("layout_type"):
+                        i += 1
+                        continue
+                    if _is_garbage_text(bxs[i]):
+                        bxs.pop(i)
+                        continue
+
+                    ii = self.find_overlapped_with_threshold(bxs[i], lts_of_ty, thr=0.4)
+                    if ii is None:
+                        bxs[i]["layout_type"] = ""
+                        i += 1
+                        continue
+
+                    lts_of_ty[ii]["visited"] = True
+
+                    # A "footer" that ends above 90% of the page height, or a "header"
+                    # that starts below 10% of it, is kept rather than dropped.
+                    keep_feats = [
+                        lts_of_ty[ii]["type"] == "footer" and bxs[i]["bottom"] < page_height * 0.9 / scale_factor,
+                        lts_of_ty[ii]["type"] == "header" and bxs[i]["top"] > page_height * 0.1 / scale_factor,
+                    ]
+                    if drop and lts_of_ty[ii]["type"] in self.garbage_layouts and not any(keep_feats):
+                        garbages.setdefault(lts_of_ty[ii]["type"], []).append(bxs[i].get("text", ""))
+                        bxs.pop(i)
+                        continue
+
+                    bxs[i]["layoutno"] = f"{ty}-{ii}"
+                    bxs[i]["layout_type"] = lts_of_ty[ii]["type"] if lts_of_ty[ii]["type"] != "equation" else "figure"
+                    i += 1
+
+            for ty in ["footer", "header", "reference", "figure caption", "table caption", "title", "table", "text", "figure", "equation"]:
+                _tag_layout(ty)
+
+            # Figure/equation regions that matched no OCR box get an empty box of their own.
+            figs = [lt for lt in lts if lt["type"] in ["figure", "equation"]]
+            for i, lt in enumerate(figs):
+                if lt.get("visited"):
+                    continue
+                lt = deepcopy(lt)
+                lt.pop("type", None)
+                lt["text"] = ""
+                lt["layout_type"] = "figure"
+                lt["layoutno"] = f"figure-{i}"
+                bxs.append(lt)
+
+            boxes_out.extend(bxs)
+
+        # Text dropped as garbage more than once (within one layout type) is treated
+        # as a repeated header/footer and filtered out everywhere.
+        garbag_set = {g for lst in garbages.values() for g, c in Counter(lst).items() if c > 1}
+
+        ocr_res_new = [b for b in boxes_out if b["text"].strip() not in garbag_set]
+        return ocr_res_new, page_layout
--- a/deepdoc/vision/ocr.py
+++ b/deepdoc/vision/ocr.py
@ -13,7 +13,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-
+import gc
 import logging
 import copy
 import time
@ -348,6 +348,13 @@ class TextRecognizer:

        return img

+    def close(self):
+        """Drop this object's ``predictor`` reference, if it has one, then run a garbage-collection pass.
+
+        Calling it again is harmless: a missing ``predictor`` attribute is skipped.
+        """
+        logging.info("Close TextRecognizer.")
+        predictor_present = hasattr(self, "predictor")
+        if predictor_present:
+            delattr(self, "predictor")
+        gc.collect()
+
    def __call__(self, img_list):
        img_num = len(img_list)
        # Calculate the aspect ratio of all text bars
@ -395,6 +402,9 @@ class TextRecognizer:

        return rec_res, time.time() - st

+    def __del__(self):
+        """Finalizer: call close() when the object is being destroyed."""
+        self.close()
+

 class TextDetector:
    def __init__(self, model_dir, device_id: int | None = None):
@ -479,6 +489,12 @@ class TextDetector:
        dt_boxes = np.array(dt_boxes_new)
        return dt_boxes

+    def close(self):
+        """Drop this object's ``predictor`` reference, if it has one, then run a garbage-collection pass.
+
+        Calling it again is harmless: a missing ``predictor`` attribute is skipped.
+        """
+        logging.info("Close TextDetector.")
+        predictor_present = hasattr(self, "predictor")
+        if predictor_present:
+            delattr(self, "predictor")
+        gc.collect()
+
    def __call__(self, img):
        ori_im = img.copy()
        data = {'image': img}
@ -508,6 +524,9 @@ class TextDetector:

        return dt_boxes, time.time() - st

+    def __del__(self):
+        """Finalizer: call close() when the object is being destroyed."""
+        self.close()
+

 class OCR:
    def __init__(self, model_dir=None):
--- a/deepdoc/vision/recognizer.py
+++ b/deepdoc/vision/recognizer.py
@ -13,7 +13,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-
+import gc
 import logging
 import os
 import math
@ -406,6 +406,12 @@ class Recognizer:
            "score": float(scores[i])
        } for i in indices]

+    def close(self):
+        """Drop this object's ``ort_sess`` reference, if it has one, then run a garbage-collection pass.
+
+        Calling it again is harmless: a missing ``ort_sess`` attribute is skipped.
+        """
+        logging.info("Close recognizer.")
+        session_present = hasattr(self, "ort_sess")
+        if session_present:
+            delattr(self, "ort_sess")
+        gc.collect()
+
    def __call__(self, image_list, thr=0.7, batch_size=16):
        res = []
        images = []
@ -430,5 +436,7 @@ class Recognizer:

        return res

+    def __del__(self):
+        """Finalizer: call close() when the object is being destroyed."""
+        self.close()


--- a/deepdoc/vision/table_structure_recognizer.py
+++ b/deepdoc/vision/table_structure_recognizer.py
@ -23,6 +23,7 @@ from huggingface_hub import snapshot_download

 from api.utils.file_utils import get_project_base_directory
 from rag.nlp import rag_tokenizer
+
 from .recognizer import Recognizer


@ -38,31 +39,49 @@ class TableStructureRecognizer(Recognizer):

    def __init__(self):
        try:
-            super().__init__(self.labels, "tsr", os.path.join(
-                    get_project_base_directory(),
-                    "rag/res/deepdoc"))
+            super().__init__(self.labels, "tsr", os.path.join(get_project_base_directory(), "rag/res/deepdoc"))
        except Exception:
-            super().__init__(self.labels, "tsr", snapshot_download(repo_id="InfiniFlow/deepdoc",
-                                              local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
-                                              local_dir_use_symlinks=False))
+            super().__init__(
+                self.labels,
+                "tsr",
+                snapshot_download(
+                    repo_id="InfiniFlow/deepdoc",
+                    local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
+                    local_dir_use_symlinks=False,
+                ),
+            )

    def __call__(self, images, thr=0.2):
-        tbls = super().__call__(images, thr)
+        table_structure_recognizer_type = os.getenv("TABLE_STRUCTURE_RECOGNIZER_TYPE", "onnx").lower()
+        if table_structure_recognizer_type not in ["onnx", "ascend"]:
+            raise RuntimeError("Unsupported table structure recognizer type.")
+
+        if table_structure_recognizer_type == "onnx":
+            logging.debug("Using Onnx table structure recognizer", flush=True)
+            tbls = super().__call__(images, thr)
+        else:  # ascend
+            logging.debug("Using Ascend table structure recognizer", flush=True)
+            tbls = self._run_ascend_tsr(images, thr)
+
        res = []
        # align left&right for rows, align top&bottom for columns
        for tbl in tbls:
-            lts = [{"label": b["type"],
+            lts = [
+                {
+                    "label": b["type"],
                    "score": b["score"],
-                    "x0": b["bbox"][0], "x1": b["bbox"][2],
-                    "top": b["bbox"][1], "bottom": b["bbox"][-1]
-                    } for b in tbl]
+                    "x0": b["bbox"][0],
+                    "x1": b["bbox"][2],
+                    "top": b["bbox"][1],
+                    "bottom": b["bbox"][-1],
+                }
+                for b in tbl
+            ]
            if not lts:
                continue

-            left = [b["x0"] for b in lts if b["label"].find(
-                "row") > 0 or b["label"].find("header") > 0]
-            right = [b["x1"] for b in lts if b["label"].find(
-                "row") > 0 or b["label"].find("header") > 0]
+            left = [b["x0"] for b in lts if b["label"].find("row") > 0 or b["label"].find("header") > 0]
+            right = [b["x1"] for b in lts if b["label"].find("row") > 0 or b["label"].find("header") > 0]
            if not left:
                continue
            left = np.mean(left) if len(left) > 4 else np.min(left)
@ -93,11 +112,8 @@ class TableStructureRecognizer(Recognizer):

    @staticmethod
    def is_caption(bx):
-        patt = [
-            r"[图表]+[ 0-9:：]{2,}"
-        ]
-        if any([re.match(p, bx["text"].strip()) for p in patt]) \
-                or bx.get("layout_type", "").find("caption") >= 0:
+        patt = [r"[图表]+[ 0-9:：]{2,}"]
+        if any([re.match(p, bx["text"].strip()) for p in patt]) or bx.get("layout_type", "").find("caption") >= 0:
            return True
        return False

@ -115,7 +131,7 @@ class TableStructureRecognizer(Recognizer):
            (r"^[0-9A-Z/\._~-]+$", "Ca"),
            (r"^[A-Z]*[a-z' -]+$", "En"),
            (r"^[0-9.,+-]+[0-9A-Za-z/$￥%<>（）()' -]+$", "NE"),
-            (r"^.{1}$", "Sg")
+            (r"^.{1}$", "Sg"),
        ]
        for p, n in patt:
            if re.search(p, b["text"].strip()):
@ -156,21 +172,19 @@ class TableStructureRecognizer(Recognizer):
        rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
        rowh = np.min(rowh) if rowh else 0
        boxes = Recognizer.sort_R_firstly(boxes, rowh / 2)
-        #for b in boxes:print(b)
+        # for b in boxes:print(b)
        boxes[0]["rn"] = 0
        rows = [[boxes[0]]]
        btm = boxes[0]["bottom"]
        for b in boxes[1:]:
            b["rn"] = len(rows) - 1
            lst_r = rows[-1]
-            if lst_r[-1].get("R", "") != b.get("R", "") \
-                    or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
-                        ):  # new row
+            if lst_r[-1].get("R", "") != b.get("R", "") or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")):  # new row
                btm = b["bottom"]
                b["rn"] += 1
                rows.append([b])
                continue
-            btm = (btm + b["bottom"]) / 2.
+            btm = (btm + b["bottom"]) / 2.0
            rows[-1].append(b)

        colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
@ -186,14 +200,14 @@ class TableStructureRecognizer(Recognizer):
        for b in boxes[1:]:
            b["cn"] = len(cols) - 1
            lst_c = cols[-1]
-            if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1][
-                "page_number"]) \
-                    or (b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")):  # new col
+            if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1]["page_number"]) or (
+                b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")
+            ):  # new col
                right = b["x1"]
                b["cn"] += 1
                cols.append([b])
                continue
-            right = (right + b["x1"]) / 2.
+            right = (right + b["x1"]) / 2.0
            cols[-1].append(b)

        tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
@ -214,10 +228,8 @@ class TableStructureRecognizer(Recognizer):
                if e > 1:
                    j += 1
                    continue
-                f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
-                     [j - 1][0].get("text")) or j == 0
-                ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
-                      [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
+                f = (j > 0 and tbl[ii][j - 1] and tbl[ii][j - 1][0].get("text")) or j == 0
+                ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii][j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
                if f and ff:
                    j += 1
                    continue
@ -228,13 +240,11 @@ class TableStructureRecognizer(Recognizer):
                if j > 0 and not f:
                    for i in range(len(tbl)):
                        if tbl[i][j - 1]:
-                            left = min(left, np.min(
-                                [bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
+                            left = min(left, np.min([bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
                if j + 1 < len(tbl[0]) and not ff:
                    for i in range(len(tbl)):
                        if tbl[i][j + 1]:
-                            right = min(right, np.min(
-                                [a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
+                            right = min(right, np.min([a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
                assert left < 100000 or right < 100000
                if left < right:
                    for jj in range(j, len(tbl[0])):
@ -260,8 +270,7 @@ class TableStructureRecognizer(Recognizer):
                    for i in range(len(tbl)):
                        tbl[i].pop(j)
                cols.pop(j)
-        assert len(cols) == len(tbl[0]), "Column NO. miss matched: %d vs %d" % (
-            len(cols), len(tbl[0]))
+        assert len(cols) == len(tbl[0]), "Column NO. miss matched: %d vs %d" % (len(cols), len(tbl[0]))

        if len(cols) >= 4:
            # remove single in row
@ -277,10 +286,8 @@ class TableStructureRecognizer(Recognizer):
                if e > 1:
                    i += 1
                    continue
-                f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
-                     [jj][0].get("text")) or i == 0
-                ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
-                      [jj][0].get("text")) or i + 1 >= len(tbl)
+                f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1][jj][0].get("text")) or i == 0
+                ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1][jj][0].get("text")) or i + 1 >= len(tbl)
                if f and ff:
                    i += 1
                    continue
@ -292,13 +299,11 @@ class TableStructureRecognizer(Recognizer):
                if i > 0 and not f:
                    for j in range(len(tbl[i - 1])):
                        if tbl[i - 1][j]:
-                            up = min(up, np.min(
-                                [bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
+                            up = min(up, np.min([bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
                if i + 1 < len(tbl) and not ff:
                    for j in range(len(tbl[i + 1])):
                        if tbl[i + 1][j]:
-                            down = min(down, np.min(
-                                [a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
+                            down = min(down, np.min([a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
                assert up < 100000 or down < 100000
                if up < down:
                    for ii in range(i, len(tbl)):
@ -333,22 +338,15 @@ class TableStructureRecognizer(Recognizer):
                cnt += 1
                if max_type == "Nu" and arr[0]["btype"] == "Nu":
                    continue
-                if any([a.get("H") for a in arr]) \
-                        or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
+                if any([a.get("H") for a in arr]) or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
                    h += 1
            if h / cnt > 0.5:
                hdset.add(i)

        if html:
-            return TableStructureRecognizer.__html_table(cap, hdset,
-                                                         TableStructureRecognizer.__cal_spans(boxes, rows,
-                                                                                              cols, tbl, True)
-                                                         )
+            return TableStructureRecognizer.__html_table(cap, hdset, TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl, True))

-        return TableStructureRecognizer.__desc_table(cap, hdset,
-                                                     TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl,
-                                                                                          False),
-                                                     is_english)
+        return TableStructureRecognizer.__desc_table(cap, hdset, TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl, False), is_english)

    @staticmethod
    def __html_table(cap, hdset, tbl):
@ -367,10 +365,8 @@ class TableStructureRecognizer(Recognizer):
                    continue
                txt = ""
                if arr:
-                    h = min(np.min([c["bottom"] - c["top"]
-                            for c in arr]) / 2, 10)
-                    txt = " ".join([c["text"]
-                                   for c in Recognizer.sort_Y_firstly(arr, h)])
+                    h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2, 10)
+                    txt = " ".join([c["text"] for c in Recognizer.sort_Y_firstly(arr, h)])
                txts.append(txt)
                sp = ""
                if arr[0].get("colspan"):
@ -436,15 +432,11 @@ class TableStructureRecognizer(Recognizer):
                    if headers[j][k].find(headers[j - 1][k]) >= 0:
                        continue
                    if len(headers[j][k]) > len(headers[j - 1][k]):
-                        headers[j][k] += (de if headers[j][k]
-                                          else "") + headers[j - 1][k]
+                        headers[j][k] += (de if headers[j][k] else "") + headers[j - 1][k]
                    else:
-                        headers[j][k] = headers[j - 1][k] \
-                            + (de if headers[j - 1][k] else "") \
-                            + headers[j][k]
+                        headers[j][k] = headers[j - 1][k] + (de if headers[j - 1][k] else "") + headers[j][k]

-        logging.debug(
-            f">>>>>>>>>>>>>>>>>{cap}：SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
+        logging.debug(f">>>>>>>>>>>>>>>>>{cap}：SIZE:{rowno}X{clmno} Header: {hdr_rowno}")
        row_txt = []
        for i in range(rowno):
            if i in hdr_rowno:
@ -503,14 +495,10 @@ class TableStructureRecognizer(Recognizer):
    @staticmethod
    def __cal_spans(boxes, rows, cols, tbl, html=True):
        # calculate span
-        clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
-                for cln in cols]
-        crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln])
-                for cln in cols]
-        rtop = [np.mean([c.get("R_top", c["top"]) for c in row])
-                for row in rows]
-        rbtm = [np.mean([c.get("R_btm", c["bottom"])
-                         for c in row]) for row in rows]
+        clft = [np.mean([c.get("C_left", c["x0"]) for c in cln]) for cln in cols]
+        crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln]) for cln in cols]
+        rtop = [np.mean([c.get("R_top", c["top"]) for c in row]) for row in rows]
+        rbtm = [np.mean([c.get("R_btm", c["bottom"]) for c in row]) for row in rows]
        for b in boxes:
            if "SP" not in b:
                continue
@ -585,3 +573,40 @@ class TableStructureRecognizer(Recognizer):
                tbl[rowspan[0]][colspan[0]] = arr

        return tbl
+
+    def _run_ascend_tsr(self, image_list, thr=0.2, batch_size=16):
+        """Run table-structure recognition through the Ascend ``tsr.om`` model.
+
+        Args:
+            image_list: Images to analyse; entries that are not already
+                ``np.ndarray`` are converted with ``np.array``.
+            thr: Requested confidence threshold. Values below 0.08 are raised
+                to 0.08 before being passed to ``self.postprocess``.
+            batch_size: Number of images handed to ``self.preprocess`` per
+                call. Must be a positive integer.
+
+        Returns:
+            A list holding one ``self.postprocess`` result for every item
+            returned by ``self.preprocess``, in processing order.
+
+        Raises:
+            ValueError: If ``batch_size`` is not positive, or if
+                ``rag/res/deepdoc/tsr.om`` does not exist under the project
+                base directory.
+        """
+        # A zero batch size previously surfaced as ZeroDivisionError and a
+        # negative one silently produced no results; reject both up front.
+        if batch_size <= 0:
+            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
+
+        # Function-scope import: ais_bench is only needed when this method runs.
+        from ais_bench.infer.interface import InferSession
+
+        model_dir = os.path.join(get_project_base_directory(), "rag/res/deepdoc")
+        model_file_path = os.path.join(model_dir, "tsr.om")
+        if not os.path.exists(model_file_path):
+            raise ValueError(f"Model file not found: {model_file_path}")
+
+        # NOTE(review): the device id comes from the *layout recognizer* env var;
+        # presumably intentional so both models share a device - confirm.
+        device_id = int(os.getenv("ASCEND_LAYOUT_RECOGNIZER_DEVICE_ID", 0))
+        session = InferSession(device_id=device_id, model_path=model_file_path)
+
+        images = [im if isinstance(im, np.ndarray) else np.array(im) for im in image_list]
+        conf_thr = max(thr, 0.08)
+
+        results = []
+        # Walk the images in consecutive slices of at most ``batch_size``.
+        for start in range(0, len(images), batch_size):
+            batch_images = images[start : start + batch_size]
+            for ins in self.preprocess(batch_images):
+                # Use the "image" entry when present, otherwise the entry keyed
+                # by the first name in ``self.input_names``.
+                feed = ins["image"] if "image" in ins else ins[self.input_names[0]]
+                output_list = session.infer(feeds=[feed], mode="static")
+                results.append(self.postprocess(output_list, ins, conf_thr))
+        return results