apply PEP 8 formatting (#155)

KevinHuSh
2024-03-27 11:33:46 +08:00
committed by GitHub
parent a02e836790
commit fd7fcb5baf
55 changed files with 1568 additions and 753 deletions
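
The hunks below are a mechanical restyle: overlong lines are wrapped (E501), compound one-liners such as "if pn > to_page: break" are split onto separate lines (E701), and whitespace around commas and operators is normalized (E231, E226). The commit message does not name the tool, but the wrapping style matches autopep8 in aggressive mode; a minimal sketch of reproducing one of the hunks below through its library API, assuming autopep8 really is the formatter used:

import autopep8  # assumption: the commit does not name its formatter

# One of the one-line bodies from the docx parser below; aggressive level 1
# enables the E701 fix that splits compound statements onto their own lines.
src = "if pn > to_page: break\n"
print(autopep8.fix_code(src, options={"aggressive": 1}), end="")
# if pn > to_page:
#     break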

deepdoc/parser/__init__.py

@@ -4,5 +4,3 @@ from .pdf_parser import HuParser as PdfParser, PlainParser
from .docx_parser import HuDocxParser as DocxParser
from .excel_parser import HuExcelParser as ExcelParser
from .ppt_parser import HuPptParser as PptParser

deepdoc/parser/docx_parser.py

@@ -99,12 +99,15 @@ class HuDocxParser:
return ["\n".join(lines)]
def __call__(self, fnm, from_page=0, to_page=100000):
self.doc = Document(fnm) if isinstance(fnm, str) else Document(BytesIO(fnm))
self.doc = Document(fnm) if isinstance(
fnm, str) else Document(BytesIO(fnm))
pn = 0
secs = []
for p in self.doc.paragraphs:
if pn > to_page: break
if from_page <= pn < to_page and p.text.strip(): secs.append((p.text, p.style.name))
if pn > to_page:
break
if from_page <= pn < to_page and p.text.strip():
secs.append((p.text, p.style.name))
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
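
The paging logic in this hunk counts w:lastRenderedPageBreak markers, which Word caches inside a run's XML when it last repaginated the document, so the from_page/to_page window follows the pagination of whatever application saved the file. A self-contained sketch of the same check (sample.docx is a hypothetical input):

from docx import Document  # python-docx

doc = Document("sample.docx")  # hypothetical input file
page = 0
for para in doc.paragraphs:
    for run in para.runs:
        # Word caches pagination as <w:lastRenderedPageBreak/> elements;
        # counting them approximates which page a paragraph starts on.
        if "lastRenderedPageBreak" in run._element.xml:
            page += 1
print("approximate page count:", page + 1)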

deepdoc/parser/excel_parser.py

@@ -15,13 +15,16 @@ class HuExcelParser:
ws = wb[sheetname]
rows = list(ws.rows)
tb += f"<table><caption>{sheetname}</caption><tr>"
for t in list(rows[0]): tb += f"<th>{t.value}</th>"
for t in list(rows[0]):
tb += f"<th>{t.value}</th>"
tb += "</tr>"
for r in list(rows[1:]):
tb += "<tr>"
for i,c in enumerate(r):
if c.value is None: tb += "<td></td>"
else: tb += f"<td>{c.value}</td>"
for i, c in enumerate(r):
if c.value is None:
tb += "<td></td>"
else:
tb += f"<td>{c.value}</td>"
tb += "</tr>"
tb += "</table>\n"
return tb
@@ -38,13 +41,15 @@ class HuExcelParser:
ti = list(rows[0])
for r in list(rows[1:]):
l = []
for i,c in enumerate(r):
if not c.value:continue
for i, c in enumerate(r):
if not c.value:
continue
t = str(ti[i].value) if i < len(ti) else ""
t += ("" if t else "") + str(c.value)
l.append(t)
l = "; ".join(l)
if sheetname.lower().find("sheet") <0: l += " ——"+sheetname
if sheetname.lower().find("sheet") < 0:
l += " ——" + sheetname
res.append(l)
return res

deepdoc/parser/pdf_parser.py

@@ -43,9 +43,11 @@ class HuParser:
"rag/res/deepdoc"),
local_files_only=True)
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/text_concat_xgb_v1.0")
model_dir = snapshot_download(
repo_id="InfiniFlow/text_concat_xgb_v1.0")
self.updown_cnt_mdl.load_model(os.path.join(model_dir, "updown_concat_xgb.model"))
self.updown_cnt_mdl.load_model(os.path.join(
model_dir, "updown_concat_xgb.model"))
self.page_from = 0
"""
If you have trouble downloading HuggingFace models, -_^ this might help!!
@@ -72,7 +74,7 @@ class HuParser:
def _y_dis(
self, a, b):
return (
b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
def _match_proj(self, b):
proj_patt = [
@@ -95,9 +97,9 @@ class HuParser:
tks_down = huqie.qie(down["text"][:LEN]).split(" ")
tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
tks_all = up["text"][-LEN:].strip() \
+ (" " if re.match(r"[a-zA-Z0-9]+",
up["text"][-1] + down["text"][0]) else "") \
+ down["text"][:LEN].strip()
+ (" " if re.match(r"[a-zA-Z0-9]+",
up["text"][-1] + down["text"][0]) else "") \
+ down["text"][:LEN].strip()
tks_all = huqie.qie(tks_all).split(" ")
fea = [
up.get("R", -1) == down.get("R", -1),
@@ -119,7 +121,7 @@ class HuParser:
True if re.search(r"[,][^。.]+$", up["text"]) else False,
True if re.search(r"[,][^。.]+$", up["text"]) else False,
True if re.search(r"[\(][^\)]+$", up["text"])
and re.search(r"[\)]", down["text"]) else False,
and re.search(r"[\)]", down["text"]) else False,
self._match_proj(down),
True if re.match(r"[A-Z]", down["text"]) else False,
True if re.match(r"[A-Z]", up["text"][-1]) else False,
@@ -181,7 +183,7 @@ class HuParser:
continue
for tb in tbls: # for table
left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
tb["x1"] + MARGIN, tb["bottom"] + MARGIN
tb["x1"] + MARGIN, tb["bottom"] + MARGIN
left *= ZM
top *= ZM
right *= ZM
@@ -235,7 +237,8 @@ class HuParser:
b["R_top"] = rows[ii]["top"]
b["R_bott"] = rows[ii]["bottom"]
ii = Recognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
ii = Recognizer.find_overlapped_with_threashold(
b, headers, thr=0.3)
if ii is not None:
b["H_top"] = headers[ii]["top"]
b["H_bott"] = headers[ii]["bottom"]
@@ -272,7 +275,8 @@ class HuParser:
)
# merge chars in the same rect
for c in Recognizer.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
for c in Recognizer.sort_X_firstly(
chars, self.mean_width[pagenum - 1] // 4):
ii = Recognizer.find_overlapped(c, bxs)
if ii is None:
self.lefted_chars.append(c)
@@ -283,13 +287,15 @@ class HuParser:
self.lefted_chars.append(c)
continue
if c["text"] == " " and bxs[ii]["text"]:
if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]): bxs[ii]["text"] += " "
if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]):
bxs[ii]["text"] += " "
else:
bxs[ii]["text"] += c["text"]
for b in bxs:
if not b["text"]:
left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, b["top"] * ZM, b["bottom"] * ZM
left, right, top, bott = b["x0"] * ZM, b["x1"] * \
ZM, b["top"] * ZM, b["bottom"] * ZM
b["text"] = self.ocr.recognize(np.array(img),
np.array([[left, top], [right, top], [right, bott], [left, bott]],
dtype=np.float32))
@@ -302,7 +308,8 @@ class HuParser:
def _layouts_rec(self, ZM, drop=True):
assert len(self.page_images) == len(self.boxes)
self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM, drop=drop)
self.boxes, self.page_layout = self.layouter(
self.page_images, self.boxes, ZM, drop=drop)
# cumulative Y
for i in range(len(self.boxes)):
self.boxes[i]["top"] += \
@@ -332,7 +339,8 @@ class HuParser:
"equation"]:
i += 1
continue
if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
if abs(self._y_dis(b, b_)
) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
# merge
bxs[i]["x1"] = b_["x1"]
bxs[i]["top"] = (b["top"] + b_["top"]) / 2
@@ -366,12 +374,15 @@ class HuParser:
self.boxes = bxs
def _naive_vertical_merge(self):
bxs = Recognizer.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
bxs = Recognizer.sort_Y_firstly(
self.boxes, np.median(
self.mean_height) / 3)
i = 0
while i + 1 < len(bxs):
b = bxs[i]
b_ = bxs[i + 1]
if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
if b["page_number"] < b_["page_number"] and re.match(
r"[0-9 •一—-]+$", b["text"]):
bxs.pop(i)
continue
if not b["text"].strip():
@@ -379,7 +390,8 @@ class HuParser:
continue
concatting_feats = [
b["text"].strip()[-1] in ",;:'\",、‘“;:-",
len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
len(b["text"].strip()) > 1 and b["text"].strip(
)[-2] in ",;:'\",‘“、;:",
b["text"].strip()[0] in "。;?!?”)),,、:",
]
# features for not concating
@@ -387,7 +399,7 @@ class HuParser:
b.get("layoutno", 0) != b.get("layoutno", 0),
b["text"].strip()[-1] in "。?!?",
self.is_english and b["text"].strip()[-1] in ".!?",
b["page_number"] == b_["page_number"] and b_["top"] - \
b["page_number"] == b_["page_number"] and b_["top"] -
b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
b["page_number"] < b_["page_number"] and abs(
b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
@@ -396,7 +408,12 @@ class HuParser:
detach_feats = [b["x1"] < b_["x0"],
b["x0"] > b_["x1"]]
if (any(feats) and not any(concatting_feats)) or any(detach_feats):
print(b["text"], b_["text"], any(feats), any(concatting_feats), any(detach_feats))
print(
b["text"],
b_["text"],
any(feats),
any(concatting_feats),
any(detach_feats))
i += 1
continue
# merge up and down
@@ -526,31 +543,39 @@ class HuParser:
i += 1
continue
findit = True
eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
eng = re.match(
r"[0-9a-zA-Z :'.-]{5,}",
self.boxes[i]["text"].strip())
self.boxes.pop(i)
if i >= len(self.boxes): break
if i >= len(self.boxes):
break
prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
self.boxes[i]["text"].strip().split(" ")[:2])
while not prefix:
self.boxes.pop(i)
if i >= len(self.boxes): break
if i >= len(self.boxes):
break
prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
self.boxes[i]["text"].strip().split(" ")[:2])
self.boxes.pop(i)
if i >= len(self.boxes) or not prefix: break
if i >= len(self.boxes) or not prefix:
break
for j in range(i, min(i + 128, len(self.boxes))):
if not re.match(prefix, self.boxes[j]["text"]):
continue
for k in range(i, j): self.boxes.pop(i)
for k in range(i, j):
self.boxes.pop(i)
break
if findit: return
if findit:
return
page_dirty = [0] * len(self.page_images)
for b in self.boxes:
if re.search(r"(··|··|··)", b["text"]):
page_dirty[b["page_number"] - 1] += 1
page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
if not page_dirty: return
if not page_dirty:
return
i = 0
while i < len(self.boxes):
if self.boxes[i]["page_number"] in page_dirty:
@@ -582,7 +607,8 @@ class HuParser:
b_["top"] = b["top"]
self.boxes.pop(i)
def _extract_table_figure(self, need_image, ZM, return_html, need_position):
def _extract_table_figure(self, need_image, ZM,
return_html, need_position):
tables = {}
figures = {}
# extract figure and table boxes
@@ -594,7 +620,7 @@ class HuParser:
i += 1
continue
lout_no = str(self.boxes[i]["page_number"]) + \
"-" + str(self.boxes[i]["layoutno"])
"-" + str(self.boxes[i]["layoutno"])
if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
"title",
"figure caption",
@@ -761,7 +787,8 @@ class HuParser:
for k, bxs in tables.items():
if not bxs:
continue
bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"] - b["top"]) / 2 for b in bxs]))
bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
[(b["bottom"] - b["top"]) / 2 for b in bxs]))
poss = []
res.append((cropout(bxs, "table", poss),
self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
@@ -769,7 +796,8 @@ class HuParser:
assert len(positions) == len(res)
if need_position: return list(zip(res, positions))
if need_position:
return list(zip(res, positions))
return res
def proj_match(self, line):
@@ -873,7 +901,8 @@ class HuParser:
boxes.pop(0)
mw = np.mean(widths)
if mj or mw / pw >= 0.35 or mw > 200:
res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
res.append(
"\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
else:
logging.debug("REMOVED: " +
"<<".join([c["text"] for c in lines]))
@@ -883,13 +912,16 @@ class HuParser:
@staticmethod
def total_page_number(fnm, binary=None):
try:
pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))
pdf = pdfplumber.open(
fnm) if not binary else pdfplumber.open(BytesIO(binary))
return len(pdf.pages)
except Exception as e:
pdf = fitz.open(fnm) if not binary else fitz.open(stream=fnm, filetype="pdf")
pdf = fitz.open(fnm) if not binary else fitz.open(
stream=fnm, filetype="pdf")
return len(pdf)
def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
def __images__(self, fnm, zoomin=3, page_from=0,
page_to=299, callback=None):
self.lefted_chars = []
self.mean_height = []
self.mean_width = []
@@ -899,21 +931,26 @@ class HuParser:
self.page_layout = []
self.page_from = page_from
try:
self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
self.pdf = pdfplumber.open(fnm) if isinstance(
fnm, str) else pdfplumber.open(BytesIO(fnm))
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
enumerate(self.pdf.pages[page_from:page_to])]
self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in
self.pdf.pages[page_from:page_to]]
self.total_page = len(self.pdf.pages)
except Exception as e:
self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
self.pdf = fitz.open(fnm) if isinstance(
fnm, str) else fitz.open(
stream=fnm, filetype="pdf")
self.page_images = []
self.page_chars = []
mat = fitz.Matrix(zoomin, zoomin)
self.total_page = len(self.pdf)
for i, page in enumerate(self.pdf):
if i < page_from: continue
if i >= page_to: break
if i < page_from:
continue
if i >= page_to:
break
pix = page.get_pixmap(matrix=mat)
img = Image.frombytes("RGB", [pix.width, pix.height],
pix.samples)
@@ -930,7 +967,7 @@ class HuParser:
if isinstance(a, dict):
self.outlines.append((a["/Title"], depth))
continue
dfs(a, depth+1)
dfs(a, depth + 1)
dfs(outlines, 0)
except Exception as e:
logging.warning(f"Outlines exception: {e}")
@@ -940,8 +977,9 @@ class HuParser:
logging.info("Images converted.")
self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
range(len(self.page_chars))]
if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
range(len(self.page_chars))]
if sum([1 if e else 0 for e in self.is_english]) > len(
self.page_images) / 2:
self.is_english = True
else:
self.is_english = False
@@ -970,9 +1008,11 @@ class HuParser:
# self.page_cum_height.append(
# np.max([c["bottom"] for c in chars]))
self.__ocr(i + 1, img, chars, zoomin)
if callback: callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
if callback:
callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
if not self.is_english and not any(
[c for c in self.page_chars]) and self.boxes:
bxes = [b for bxs in self.boxes for b in bxs]
self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
"".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
@@ -989,7 +1029,8 @@ class HuParser:
self._text_merge()
self._concat_downward()
self._filter_forpages()
tbls = self._extract_table_figure(need_image, zoomin, return_html, False)
tbls = self._extract_table_figure(
need_image, zoomin, return_html, False)
return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
def remove_tag(self, txt):
@@ -1003,15 +1044,19 @@ class HuParser:
"#").strip("@").split("\t")
left, right, top, bottom = float(left), float(
right), float(top), float(bottom)
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
poss.append(([int(p) - 1 for p in pn.split("-")],
left, right, top, bottom))
if not poss:
if need_position: return None, None
if need_position:
return None, None
return
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
max_width = max(
np.max([right - left for (_, left, right, _, _) in poss]), 6)
GAP = 6
pos = poss[0]
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
0, pos[3] - 120), max(pos[3] - GAP, 0)))
pos = poss[-1]
poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))
@@ -1026,7 +1071,7 @@ class HuParser:
self.page_images[pns[0]].crop((left * ZM, top * ZM,
right *
ZM, min(
bottom, self.page_images[pns[0]].size[1])
bottom, self.page_images[pns[0]].size[1])
))
)
if 0 < ii < len(poss) - 1:
@@ -1047,7 +1092,8 @@ class HuParser:
bottom -= self.page_images[pn].size[1]
if not imgs:
if need_position: return None, None
if need_position:
return None, None
return
height = 0
for img in imgs:
@@ -1076,12 +1122,14 @@ class HuParser:
pn = bx["page_number"]
top = bx["top"] - self.page_cum_height[pn - 1]
bott = bx["bottom"] - self.page_cum_height[pn - 1]
poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM)))
poss.append((pn, bx["x0"], bx["x1"], top, min(
bott, self.page_images[pn - 1].size[1] / ZM)))
while bott * ZM > self.page_images[pn - 1].size[1]:
bott -= self.page_images[pn - 1].size[1] / ZM
top = 0
pn += 1
poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM)))
poss.append((pn, bx["x0"], bx["x1"], top, min(
bott, self.page_images[pn - 1].size[1] / ZM)))
return poss
@@ -1090,11 +1138,14 @@ class PlainParser(object):
self.outlines = []
lines = []
try:
self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
self.pdf = pdf2_read(
filename if isinstance(
filename, str) else BytesIO(filename))
for page in self.pdf.pages[from_page:to_page]:
lines.extend([t for t in page.extract_text().split("\n")])
outlines = self.pdf.outline
def dfs(arr, depth):
for a in arr:
if isinstance(a, dict):
@@ -1117,5 +1168,6 @@ class PlainParser(object):
def remove_tag(txt):
raise NotImplementedError
if __name__ == "__main__":
pass
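
Several hunks in this file repeat one pattern: open the PDF with pdfplumber and fall back to PyMuPDF (fitz) when pdfplumber raises. Consolidated from total_page_number and __images__ above, the fallback looks roughly like this (simplified; the original also branches on whether the input is a path or raw bytes):

from io import BytesIO

import fitz  # PyMuPDF
import pdfplumber


def count_pages(fnm, binary=None):
    # pdfplumber exposes richer char/layout data but chokes on some files;
    # PyMuPDF is the more tolerant fallback, used here only to count pages.
    try:
        pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))
        return len(pdf.pages)
    except Exception:
        pdf = fitz.open(fnm) if not binary else fitz.open(stream=binary, filetype="pdf")
        return len(pdf)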

deepdoc/parser/ppt_parser.py

@@ -23,7 +23,8 @@ class HuPptParser(object):
tb = shape.table
rows = []
for i in range(1, len(tb.rows)):
rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
rows.append("; ".join([tb.cell(
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
return "\n".join(rows)
if shape.has_text_frame:
@@ -31,9 +32,10 @@ class HuPptParser(object):
if shape.shape_type == 6:
texts = []
for p in sorted(shape.shapes, key=lambda x: (x.top//10, x.left)):
for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
t = self.__extract(p)
if t: texts.append(t)
if t:
texts.append(t)
return "\n".join(texts)
def __call__(self, fnm, from_page, to_page, callback=None):
@@ -43,12 +45,16 @@ class HuPptParser(object):
txts = []
self.total_page = len(ppt.slides)
for i, slide in enumerate(ppt.slides):
if i < from_page: continue
if i >= to_page:break
if i < from_page:
continue
if i >= to_page:
break
texts = []
for shape in sorted(slide.shapes, key=lambda x: (x.top//10, x.left)):
for shape in sorted(
slide.shapes, key=lambda x: (x.top // 10, x.left)):
txt = self.__extract(shape)
if txt: texts.append(txt)
if txt:
texts.append(txt)
txts.append("\n".join(texts))
return txts

deepdoc/vision/layout_recognizer.py

@@ -24,18 +24,19 @@ from deepdoc.vision import Recognizer
class LayoutRecognizer(Recognizer):
labels = [
"_background_",
"Text",
"Title",
"Figure",
"Figure caption",
"Table",
"Table caption",
"Header",
"Footer",
"Reference",
"Equation",
]
"_background_",
"Text",
"Title",
"Figure",
"Figure caption",
"Table",
"Table caption",
"Header",
"Footer",
"Reference",
"Equation",
]
def __init__(self, domain):
try:
model_dir = snapshot_download(
@@ -47,10 +48,12 @@ class LayoutRecognizer(Recognizer):
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
super().__init__(self.labels, domain, model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
# os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
super().__init__(self.labels, domain, model_dir)
self.garbage_layouts = ["footer", "header", "reference"]
def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
def __call__(self, image_list, ocr_res, scale_factor=3,
thr=0.2, batch_size=16, drop=True):
def __is_garbage(b):
patt = [r"^•+$", r"(版权归©|免责条款|地址[:])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
@@ -75,7 +78,8 @@ class LayoutRecognizer(Recognizer):
"top": b["bbox"][1] / scale_factor, "bottom": b["bbox"][-1] / scale_factor,
"page_number": pn,
} for b in lts]
lts = self.sort_Y_firstly(lts, np.mean([l["bottom"]-l["top"] for l in lts]) / 2)
lts = self.sort_Y_firstly(lts, np.mean(
[l["bottom"] - l["top"] for l in lts]) / 2)
lts = self.layouts_cleanup(bxs, lts)
page_layout.append(lts)
@@ -93,17 +97,20 @@ class LayoutRecognizer(Recognizer):
continue
ii = self.find_overlapped_with_threashold(bxs[i], lts_,
thr=0.4)
thr=0.4)
if ii is None: # belong to nothing
bxs[i]["layout_type"] = ""
i += 1
continue
lts_[ii]["visited"] = True
keep_feats = [
lts_[ii]["type"] == "footer" and bxs[i]["bottom"] < image_list[pn].size[1]*0.9/scale_factor,
lts_[ii]["type"] == "header" and bxs[i]["top"] > image_list[pn].size[1]*0.1/scale_factor,
lts_[
ii]["type"] == "footer" and bxs[i]["bottom"] < image_list[pn].size[1] * 0.9 / scale_factor,
lts_[
ii]["type"] == "header" and bxs[i]["top"] > image_list[pn].size[1] * 0.1 / scale_factor,
]
if drop and lts_[ii]["type"] in self.garbage_layouts and not any(keep_feats):
if drop and lts_[
ii]["type"] in self.garbage_layouts and not any(keep_feats):
if lts_[ii]["type"] not in garbages:
garbages[lts_[ii]["type"]] = []
garbages[lts_[ii]["type"]].append(bxs[i]["text"])
@@ -111,7 +118,8 @@ class LayoutRecognizer(Recognizer):
continue
bxs[i]["layoutno"] = f"{ty}-{ii}"
bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"]!="equation" else "figure"
bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[
ii]["type"] != "equation" else "figure"
i += 1
for lt in ["footer", "header", "reference", "figure caption",
@@ -120,7 +128,7 @@ class LayoutRecognizer(Recognizer):
# add box to figure layouts which has not text box
for i, lt in enumerate(
[lt for lt in lts if lt["type"] in ["figure","equation"]]):
[lt for lt in lts if lt["type"] in ["figure", "equation"]]):
if lt.get("visited"):
continue
lt = deepcopy(lt)
@@ -143,6 +151,3 @@ class LayoutRecognizer(Recognizer):
ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
return ocr_res, page_layout
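
sort_Y_firstly, used throughout these hunks, orders boxes top-to-bottom but treats tops that differ by less than a threshold (half the mean box height, in the call above) as the same visual row, falling back to left-to-right order. Its implementation is not part of this diff; a sketch under that stated behavior:

def sort_y_firstly(boxes, threshold):
    # Bubble sort with a tolerance: a box precedes its neighbour when it is
    # clearly higher, or level with it (within threshold) but further left.
    boxes = list(boxes)
    for i in range(len(boxes)):
        for j in range(len(boxes) - 1 - i):
            a, b = boxes[j], boxes[j + 1]
            level = abs(a["top"] - b["top"]) < threshold
            if (not level and a["top"] > b["top"]) or (level and a["x0"] > b["x0"]):
                boxes[j], boxes[j + 1] = b, a
    return boxes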

deepdoc/vision/operators.py

@@ -63,6 +63,7 @@ class DecodeImage(object):
data['image'] = img
return data
class StandardizeImage(object):
"""normalize image
Args:
@@ -707,4 +708,4 @@ def preprocess(im, preprocess_ops):
im, im_info = decode_image(im, im_info)
for operator in preprocess_ops:
im, im_info = operator(im, im_info)
return im, im_info
return im, im_info
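
preprocess above threads an (im, im_info) pair through a list of operator objects, each a callable that returns the transformed pair. A runnable sketch of that pattern with a stub operator (StubResize is illustrative, not part of the module):

import numpy as np


class StubResize(object):
    """Illustrative operator: halves the image and records the scale."""

    def __call__(self, im, im_info):
        im = im[::2, ::2]
        im_info["scale_factor"] = 0.5
        return im, im_info


def run_ops(im, preprocess_ops):
    im_info = {}
    for operator in preprocess_ops:  # same loop shape as preprocess above
        im, im_info = operator(im, im_info)
    return im, im_info


im, info = run_ops(np.zeros((64, 64, 3), dtype=np.uint8), [StubResize()])
print(im.shape, info)  # (32, 32, 3) {'scale_factor': 0.5}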

deepdoc/vision/t_ocr.py

@@ -11,12 +11,20 @@
# limitations under the License.
#
import os, sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../')))
import numpy as np
import argparse
from deepdoc.vision import OCR, init_in_out
from deepdoc.vision.seeit import draw_box
from deepdoc.vision import OCR, init_in_out
import argparse
import numpy as np
import os
import sys
sys.path.insert(
0,
os.path.abspath(
os.path.join(
os.path.dirname(
os.path.abspath(__file__)),
'../../')))
def main(args):
ocr = OCR()
@@ -26,14 +34,14 @@ def main(args):
bxs = ocr(np.array(img))
bxs = [(line[0], line[1][0]) for line in bxs]
bxs = [{
"text": t,
"bbox": [b[0][0], b[0][1], b[1][0], b[-1][1]],
"type": "ocr",
"score": 1} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]]
"text": t,
"bbox": [b[0][0], b[0][1], b[1][0], b[-1][1]],
"type": "ocr",
"score": 1} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]]
img = draw_box(images[i], bxs, ["ocr"], 1.)
img.save(outputs[i], quality=95)
with open(outputs[i] + ".txt", "w+") as f: f.write("\n".join([o["text"] for o in bxs]))
with open(outputs[i] + ".txt", "w+") as f:
f.write("\n".join([o["text"] for o in bxs]))
if __name__ == "__main__":
@@ -42,6 +50,6 @@ if __name__ == "__main__":
help="Directory where to store images or PDFs, or a file path to a single image or PDF",
required=True)
parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './ocr_outputs'",
default="./ocr_outputs")
default="./ocr_outputs")
args = parser.parse_args()
main(args)
main(args)
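
One side effect of the import reshuffle in this file (and in the next one) is that "from deepdoc.vision import ..." now executes before the sys.path.insert that makes the package importable when the script is run directly from its own directory. If that matters, the usual compromise is to keep the path fix first and mark the late imports for the linter, e.g.:

import os
import sys

# The path fix must run before any deepdoc import when the script is executed
# directly; "noqa: E402" tells the linter the late imports are deliberate.
sys.path.insert(
    0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))

import numpy as np  # noqa: E402
from deepdoc.vision import OCR, init_in_out  # noqa: E402
from deepdoc.vision.seeit import draw_box  # noqa: E402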

deepdoc/vision/t_recognizer.py

@@ -11,24 +11,35 @@
# limitations under the License.
#
import os, sys
from deepdoc.vision.seeit import draw_box
from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
from api.utils.file_utils import get_project_base_directory
import argparse
import os
import sys
import re
import numpy as np
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../')))
import argparse
from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import Recognizer, LayoutRecognizer, TableStructureRecognizer, OCR, init_in_out
from deepdoc.vision.seeit import draw_box
sys.path.insert(
0,
os.path.abspath(
os.path.join(
os.path.dirname(
os.path.abspath(__file__)),
'../../')))
def main(args):
images, outputs = init_in_out(args)
if args.mode.lower() == "layout":
labels = LayoutRecognizer.labels
detr = Recognizer(labels, "layout", os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
detr = Recognizer(
labels,
"layout",
os.path.join(
get_project_base_directory(),
"rag/res/deepdoc/"))
if args.mode.lower() == "tsr":
labels = TableStructureRecognizer.labels
detr = TableStructureRecognizer()
@@ -39,7 +50,8 @@ def main(args):
if args.mode.lower() == "tsr":
#lyt = [t for t in lyt if t["type"] == "table column"]
html = get_table_html(images[i], lyt, ocr)
with open(outputs[i]+".html", "w+") as f: f.write(html)
with open(outputs[i] + ".html", "w+") as f:
f.write(html)
lyt = [{
"type": t["label"],
"bbox": [t["x0"], t["top"], t["x1"], t["bottom"]],
@@ -58,7 +70,7 @@ def get_table_html(img, tb_cpns, ocr):
"bottom": b[-1][1],
"layout_type": "table",
"page_number": 0} for b, t in boxes if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
np.mean([b[-1][1]-b[0][1] for b,_ in boxes]) / 3
np.mean([b[-1][1] - b[0][1] for b, _ in boxes]) / 3
)
def gather(kwd, fzy=10, ption=0.6):
@@ -117,7 +129,7 @@ def get_table_html(img, tb_cpns, ocr):
margin-bottom: 50px;
border: 1px solid #e1e1e1;
}
caption {
color: #6ac1ca;
font-size: 20px;
@@ -126,25 +138,25 @@ def get_table_html(img, tb_cpns, ocr):
font-weight: 600;
margin-bottom: 10px;
}
._table_1nkzy_11 table {
width: 100%%;
border-collapse: collapse;
}
th {
color: #fff;
background-color: #6ac1ca;
}
td:hover {
background: #c1e8e8;
}
tr:nth-child(even) {
background-color: #f2f2f2;
}
._table_1nkzy_11 th,
._table_1nkzy_11 td {
text-align: center;
@@ -157,7 +169,7 @@ def get_table_html(img, tb_cpns, ocr):
%s
</body>
</html>
"""% TableStructureRecognizer.construct_table(boxes, html=True)
""" % TableStructureRecognizer.construct_table(boxes, html=True)
return html
@@ -168,7 +180,10 @@ if __name__ == "__main__":
required=True)
parser.add_argument('--output_dir', help="Directory where to store the output images. Default: './layouts_outputs'",
default="./layouts_outputs")
parser.add_argument('--threshold', help="A threshold to filter out detections. Default: 0.5", default=0.5)
parser.add_argument(
'--threshold',
help="A threshold to filter out detections. Default: 0.5",
default=0.5)
parser.add_argument('--mode', help="Task mode: layout recognition or table structure recognition", choices=["layout", "tsr"],
default="layout")
args = parser.parse_args()

deepdoc/vision/table_structure_recognizer.py

@@ -44,7 +44,8 @@ class TableStructureRecognizer(Recognizer):
except Exception as e:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
# os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
super().__init__(self.labels, "tsr", model_dir)
def __call__(self, images, thr=0.2):
tbls = super().__call__(images, thr)
@@ -138,7 +139,8 @@ class TableStructureRecognizer(Recognizer):
i = 0
while i < len(boxes):
if TableStructureRecognizer.is_caption(boxes[i]):
if is_english: cap + " "
if is_english:
cap + " "
cap += boxes[i]["text"]
boxes.pop(i)
i -= 1
@@ -164,7 +166,7 @@ class TableStructureRecognizer(Recognizer):
lst_r = rows[-1]
if lst_r[-1].get("R", "") != b.get("R", "") \
or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
): # new row
): # new row
btm = b["bottom"]
b["rn"] += 1
rows.append([b])
@@ -214,9 +216,9 @@ class TableStructureRecognizer(Recognizer):
j += 1
continue
f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
[j - 1][0].get("text")) or j == 0
[j - 1][0].get("text")) or j == 0
ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
[j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
[j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
if f and ff:
j += 1
continue
@@ -277,9 +279,9 @@ class TableStructureRecognizer(Recognizer):
i += 1
continue
f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
[jj][0].get("text")) or i == 0
[jj][0].get("text")) or i == 0
ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
[jj][0].get("text")) or i + 1 >= len(tbl)
[jj][0].get("text")) or i + 1 >= len(tbl)
if f and ff:
i += 1
continue
@@ -366,7 +368,8 @@ class TableStructureRecognizer(Recognizer):
continue
txt = ""
if arr:
h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2, 10)
h = min(np.min([c["bottom"] - c["top"]
for c in arr]) / 2, 10)
txt = " ".join([c["text"]
for c in Recognizer.sort_Y_firstly(arr, h)])
txts.append(txt)
@@ -438,8 +441,8 @@ class TableStructureRecognizer(Recognizer):
else "") + headers[j - 1][k]
else:
headers[j][k] = headers[j - 1][k] \
+ (de if headers[j - 1][k] else "") \
+ headers[j][k]
+ (de if headers[j - 1][k] else "") \
+ headers[j][k]
logging.debug(
f">>>>>>>>>>>>>>>>>{cap}SIZE:{rowno}X{clmno} Header: {hdr_rowno}")