Add dockerfile for CUDA environment. Refine table search strategy. (#123)

This commit is contained in:
KevinHuSh
2024-03-14 19:45:29 +08:00
committed by GitHub
parent 937048e5fb
commit 675a9f8d9a
18 changed files with 259 additions and 84 deletions

View File

@@ -34,7 +34,7 @@ class HuExcelParser:
total = 0
for sheetname in wb.sheetnames:
ws = wb[sheetname]
total += len(ws.rows)
total += len(list(ws.rows))
return total
if fnm.split(".")[-1].lower() in ["csv", "txt"]:

View File

@@ -655,14 +655,14 @@ class HuParser:
#if min(tv, fv) > 2000:
# i += 1
# continue
if tv < fv:
if tv < fv and tk:
tables[tk].insert(0, c)
logging.debug(
"TABLE:" +
self.boxes[i]["text"] +
"; Cap: " +
tk)
else:
elif fk:
figures[fk].insert(0, c)
logging.debug(
"FIGURE:" +

View File

@@ -31,7 +31,7 @@ class HuPptParser(object):
if shape.shape_type == 6:
texts = []
for p in shape.shapes:
for p in sorted(shape.shapes, key=lambda x: (x.top//10, x.left)):
t = self.__extract(p)
if t: texts.append(t)
return "\n".join(texts)
@@ -46,7 +46,7 @@ class HuPptParser(object):
if i < from_page: continue
if i >= to_page:break
texts = []
for shape in slide.shapes:
for shape in sorted(slide.shapes, key=lambda x: (x.top//10, x.left)):
txt = self.__extract(shape)
if txt: texts.append(txt)
txts.append("\n".join(texts))

View File

@@ -64,10 +64,15 @@ def load_model(model_dir, nm):
raise ValueError("not find model file path {}".format(
model_file_path))
options = ort.SessionOptions()
options.enable_cpu_mem_arena = False
options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
options.intra_op_num_threads = 2
options.inter_op_num_threads = 2
if ort.get_device() == "GPU":
sess = ort.InferenceSession(model_file_path, providers=['CUDAExecutionProvider'])
sess = ort.InferenceSession(model_file_path, options=options, providers=['CUDAExecutionProvider'])
else:
sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider'])
sess = ort.InferenceSession(model_file_path, options=options, providers=['CPUExecutionProvider'])
return sess, sess.get_inputs()[0]
@@ -325,7 +330,13 @@ class TextRecognizer(object):
input_dict = {}
input_dict[self.input_tensor.name] = norm_img_batch
outputs = self.predictor.run(None, input_dict)
for i in range(100000):
try:
outputs = self.predictor.run(None, input_dict)
break
except Exception as e:
if i >= 3: raise e
time.sleep(5)
preds = outputs[0]
rec_result = self.postprocess_op(preds)
for rno in range(len(rec_result)):
@@ -430,7 +441,13 @@ class TextDetector(object):
img = img.copy()
input_dict = {}
input_dict[self.input_tensor.name] = img
outputs = self.predictor.run(None, input_dict)
for i in range(100000):
try:
outputs = self.predictor.run(None, input_dict)
break
except Exception as e:
if i >= 3: raise e
time.sleep(5)
post_result = self.postprocess_op({"maps": outputs[0]}, shape_list)
dt_boxes = post_result[0]['points']

View File

@@ -42,7 +42,9 @@ class Recognizer(object):
raise ValueError("not find model file path {}".format(
model_file_path))
if ort.get_device() == "GPU":
self.ort_sess = ort.InferenceSession(model_file_path, providers=['CUDAExecutionProvider'])
options = ort.SessionOptions()
options.enable_cpu_mem_arena = False
self.ort_sess = ort.InferenceSession(model_file_path, options=options, providers=[('CUDAExecutionProvider')])
else:
self.ort_sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider'])
self.input_names = [node.name for node in self.ort_sess.get_inputs()]