mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
add ocr and recognizer demo, update README (#74)
This commit is contained in:
@ -1,3 +1,15 @@
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
@ -12,15 +24,16 @@ from .recognizer import Recognizer
|
||||
|
||||
|
||||
class TableStructureRecognizer(Recognizer):
|
||||
labels = [
|
||||
"table",
|
||||
"table column",
|
||||
"table row",
|
||||
"table column header",
|
||||
"table projected row header",
|
||||
"table spanning cell",
|
||||
]
|
||||
|
||||
def __init__(self):
|
||||
self.labels = [
|
||||
"table",
|
||||
"table column",
|
||||
"table row",
|
||||
"table column header",
|
||||
"table projected row header",
|
||||
"table spanning cell",
|
||||
]
|
||||
super().__init__(self.labels, "tsr",
|
||||
os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
|
||||
|
||||
@ -79,7 +92,8 @@ class TableStructureRecognizer(Recognizer):
|
||||
return True
|
||||
return False
|
||||
|
||||
def __blockType(self, b):
|
||||
@staticmethod
|
||||
def blockType(b):
|
||||
patt = [
|
||||
("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
|
||||
(r"^(20|19)[0-9]{2}年$", "Dt"),
|
||||
@ -109,11 +123,12 @@ class TableStructureRecognizer(Recognizer):
|
||||
|
||||
return "Ot"
|
||||
|
||||
def construct_table(self, boxes, is_english=False, html=False):
|
||||
@staticmethod
|
||||
def construct_table(boxes, is_english=False, html=False):
|
||||
cap = ""
|
||||
i = 0
|
||||
while i < len(boxes):
|
||||
if self.is_caption(boxes[i]):
|
||||
if TableStructureRecognizer.is_caption(boxes[i]):
|
||||
cap += boxes[i]["text"]
|
||||
boxes.pop(i)
|
||||
i -= 1
|
||||
@ -122,14 +137,15 @@ class TableStructureRecognizer(Recognizer):
|
||||
if not boxes:
|
||||
return []
|
||||
for b in boxes:
|
||||
b["btype"] = self.__blockType(b)
|
||||
b["btype"] = TableStructureRecognizer.blockType(b)
|
||||
max_type = Counter([b["btype"] for b in boxes]).items()
|
||||
max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
|
||||
logging.debug("MAXTYPE: " + max_type)
|
||||
|
||||
rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
|
||||
rowh = np.min(rowh) if rowh else 0
|
||||
boxes = self.sort_R_firstly(boxes, rowh / 2)
|
||||
boxes = Recognizer.sort_R_firstly(boxes, rowh / 2)
|
||||
#for b in boxes:print(b)
|
||||
boxes[0]["rn"] = 0
|
||||
rows = [[boxes[0]]]
|
||||
btm = boxes[0]["bottom"]
|
||||
@ -150,9 +166,9 @@ class TableStructureRecognizer(Recognizer):
|
||||
colwm = np.min(colwm) if colwm else 0
|
||||
crosspage = len(set([b["page_number"] for b in boxes])) > 1
|
||||
if crosspage:
|
||||
boxes = self.sort_X_firstly(boxes, colwm / 2, False)
|
||||
boxes = Recognizer.sort_X_firstly(boxes, colwm / 2, False)
|
||||
else:
|
||||
boxes = self.sort_C_firstly(boxes, colwm / 2)
|
||||
boxes = Recognizer.sort_C_firstly(boxes, colwm / 2)
|
||||
boxes[0]["cn"] = 0
|
||||
cols = [[boxes[0]]]
|
||||
right = boxes[0]["x1"]
|
||||
@ -313,16 +329,18 @@ class TableStructureRecognizer(Recognizer):
|
||||
hdset.add(i)
|
||||
|
||||
if html:
|
||||
return [self.__html_table(cap, hdset,
|
||||
self.__cal_spans(boxes, rows,
|
||||
cols, tbl, True)
|
||||
)]
|
||||
return TableStructureRecognizer.__html_table(cap, hdset,
|
||||
TableStructureRecognizer.__cal_spans(boxes, rows,
|
||||
cols, tbl, True)
|
||||
)
|
||||
|
||||
return self.__desc_table(cap, hdset,
|
||||
self.__cal_spans(boxes, rows, cols, tbl, False),
|
||||
is_english)
|
||||
return TableStructureRecognizer.__desc_table(cap, hdset,
|
||||
TableStructureRecognizer.__cal_spans(boxes, rows, cols, tbl,
|
||||
False),
|
||||
is_english)
|
||||
|
||||
def __html_table(self, cap, hdset, tbl):
|
||||
@staticmethod
|
||||
def __html_table(cap, hdset, tbl):
|
||||
# construct HTML
|
||||
html = "<table>"
|
||||
if cap:
|
||||
@ -339,8 +357,8 @@ class TableStructureRecognizer(Recognizer):
|
||||
txt = ""
|
||||
if arr:
|
||||
h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2, 10)
|
||||
txt = "".join([c["text"]
|
||||
for c in self.sort_Y_firstly(arr, h)])
|
||||
txt = " ".join([c["text"]
|
||||
for c in Recognizer.sort_Y_firstly(arr, h)])
|
||||
txts.append(txt)
|
||||
sp = ""
|
||||
if arr[0].get("colspan"):
|
||||
@ -366,7 +384,8 @@ class TableStructureRecognizer(Recognizer):
|
||||
html += "\n</table>"
|
||||
return html
|
||||
|
||||
def __desc_table(self, cap, hdr_rowno, tbl, is_english):
|
||||
@staticmethod
|
||||
def __desc_table(cap, hdr_rowno, tbl, is_english):
|
||||
# get the text of every column in the header row to use as the header text
|
||||
clmno = len(tbl[0])
|
||||
rowno = len(tbl)
|
||||
@ -469,7 +488,8 @@ class TableStructureRecognizer(Recognizer):
|
||||
row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
|
||||
return row_txt
|
||||
|
||||
def __cal_spans(self, boxes, rows, cols, tbl, html=True):
|
||||
@staticmethod
|
||||
def __cal_spans(boxes, rows, cols, tbl, html=True):
|
||||
# calculate span
|
||||
clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
|
||||
for cln in cols]
|
||||
@ -553,4 +573,3 @@ class TableStructureRecognizer(Recognizer):
|
||||
tbl[rowspan[0]][colspan[0]] = arr
|
||||
|
||||
return tbl
|
||||
|
||||
|
||||
Reference in New Issue
Block a user