refine manual parser (#140)

This commit is contained in:
KevinHuSh
2024-03-21 18:17:32 +08:00
committed by GitHub
parent f4ec7cfa76
commit 6c6b144de2
11 changed files with 77 additions and 47 deletions

View File

@ -11,7 +11,7 @@ import logging
from PIL import Image, ImageDraw
import numpy as np
from api.db import ParserType
from PyPDF2 import PdfReader as pdf2_read
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import huqie
from copy import deepcopy
@ -288,9 +288,9 @@ class HuParser:
for b in bxs])
self.boxes.append(bxs)
def _layouts_rec(self, ZM):
def _layouts_rec(self, ZM, drop=True):
assert len(self.page_images) == len(self.boxes)
self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM)
self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM, drop=drop)
# cumulative Y
for i in range(len(self.boxes)):
self.boxes[i]["top"] += \
@ -908,6 +908,23 @@ class HuParser:
self.page_images.append(img)
self.page_chars.append([])
self.outlines = []
try:
self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
outlines = self.pdf.outline
# Flatten the (possibly nested) outline structure into self.outlines as
# (title, depth) tuples, appended in traversal order.
#   arr:   iterable of outline entries to walk.
#   depth: nesting level recorded for dict entries found directly in arr.
def dfs(arr, depth):
for a in arr:
if isinstance(a, dict):
# A dict entry is recorded by its "/Title" value at the current depth.
self.outlines.append((a["/Title"], depth))
continue
# Any non-dict entry is recursed into one level deeper
# (presumably a sub-list of child bookmarks -- TODO confirm against PyPDF2).
dfs(a, depth+1)
dfs(outlines, 0)
except Exception as e:
logging.warning(f"Outlines exception: {e}")
if not self.outlines:
logging.warning(f"Miss outlines")
logging.info("Images converted.")
self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in