feat: improve presentation PdfParser (#11639)

The old presentation PdfParser lost table formatting after parsing: it only joined the OCR text boxes on each page and discarded table structure. This change runs layout analysis and the table transformer on each page, extracts tables as HTML, and re-assembles each page's text and tables in reading order, so table structure survives in the chunk text.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: rommy2017
Date: 2025-12-02 17:35:14 +08:00
Committed by: GitHub
Parent: c946858328
Commit: 4ba17361e9


```diff
@@ -16,15 +16,17 @@
 import copy
 import re
+from collections import defaultdict
 from io import BytesIO

 from PIL import Image
-from rag.nlp import tokenize, is_english
-from rag.nlp import rag_tokenizer
-from deepdoc.parser import PdfParser, PptParser, PlainParser
 from PyPDF2 import PdfReader as pdf2_read
+from deepdoc.parser import PdfParser, PptParser, PlainParser
 from rag.app.naive import by_plaintext, PARSERS
+from rag.nlp import rag_tokenizer
+from rag.nlp import tokenize, is_english


 class Ppt(PptParser):
     def __call__(self, fnm, from_page, to_page, callback=None):
```
```diff
@@ -44,42 +46,106 @@ class Ppt(PptParser):
                 buffered.seek(0)
                 imgs.append(Image.open(buffered).copy())
             except RuntimeError as e:
-                raise RuntimeError(f'ppt parse error at page {i+1}, original error: {str(e)}') from e
+                raise RuntimeError(
+                    f'ppt parse error at page {i + 1}, original error: {str(e)}') from e
         assert len(imgs) == len(
-            txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
+            txts), "Slides text and image do not match: {} vs. {}".format(
+            len(imgs), len(txts))
         callback(0.9, "Image extraction finished")
         self.is_english = is_english(txts)
         return [(txts[i], imgs[i]) for i in range(len(txts))]


 class Pdf(PdfParser):
     def __init__(self):
         super().__init__()

-    def __garbage(self, txt):
-        txt = txt.lower().strip()
-        if re.match(r"[0-9\.,%/-]+$", txt):
-            return True
-        if len(txt) < 3:
-            return True
-        return False
-
     def __call__(self, filename, binary=None, from_page=0,
-                 to_page=100000, zoomin=3, callback=None):
-        from timeit import default_timer as timer
-        start = timer()
+                 to_page=100000, zoomin=3, callback=None, **kwargs):
+        # 1. OCR
         callback(msg="OCR started")
-        self.__images__(filename if not binary else binary,
-                        zoomin, from_page, to_page, callback)
-        callback(msg="Page {}~{}: OCR finished ({:.2f}s)".format(from_page, min(to_page, self.total_page), timer() - start))
-        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
-            len(self.boxes), len(self.page_images))
+        self.__images__(filename if not binary else binary, zoomin, from_page,
+                        to_page, callback)
+
+        # 2. Layout Analysis
+        callback(msg="Layout Analysis")
+        self._layouts_rec(zoomin)
+
+        # 3. Table Analysis
+        callback(msg="Table Analysis")
+        self._table_transformer_job(zoomin)
+
+        # 4. Text Merge
+        self._text_merge()
+
+        # 5. Extract Tables (Force HTML)
+        tbls = self._extract_table_figure(True, zoomin, True, True)
+
+        # 6. Re-assemble Page Content
+        page_items = defaultdict(list)
+
+        # (A) Add text
+        for b in self.boxes:
+            if not (from_page < b["page_number"] <= to_page + from_page):
+                continue
+            page_items[b["page_number"]].append({
+                "top": b["top"],
+                "x0": b["x0"],
+                "text": b["text"],
+                "type": "text"
+            })
+
+        # (B) Add table and figure
+        for (img, content), positions in tbls:
+            if not positions:
+                continue
+            # Handle content type (list vs str)
+            if isinstance(content, list):
+                final_text = "\n".join(content)
+            elif isinstance(content, str):
+                final_text = content
+            else:
+                final_text = str(content)
+            try:
+                # Parse positions
+                pn_index = positions[0][0]
+                if isinstance(pn_index, list):
+                    pn_index = pn_index[0]
+                current_page_num = int(pn_index) + 1
+            except Exception as e:
+                print(f"Error parsing position: {e}")
+                continue
+            if not (from_page < current_page_num <= to_page + from_page):
+                continue
+            top = positions[0][3]
+            left = positions[0][1]
+            page_items[current_page_num].append({
+                "top": top,
+                "x0": left,
+                "text": final_text,
+                "type": "table_or_figure"
+            })
+
+        # 7. Generate result
         res = []
-        for i in range(len(self.boxes)):
-            lines = "\n".join([b["text"] for b in self.boxes[i]
-                               if not self.__garbage(b["text"])])
-            res.append((lines, self.page_images[i]))
-        callback(0.9, "Page {}~{}: Parsing finished".format(
-            from_page, min(to_page, self.total_page)))
+        for i in range(len(self.page_images)):
+            current_pn = from_page + i + 1
+            items = page_items.get(current_pn, [])
+            # Sort by vertical position
+            items.sort(key=lambda x: (x["top"], x["x0"]))
+            full_page_text = "\n\n".join([item["text"] for item in items])
+            if not full_page_text.strip():
+                full_page_text = f"[No text or data found in Page {current_pn}]"
+            page_img = self.page_images[i]
+            res.append((full_page_text, page_img))
+        callback(0.9, "Parsing finished")
         return res, []
```
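The heart of the change is steps 6 and 7: instead of joining raw OCR lines per page (filtered through the now-removed `__garbage` helper), the parser pools text boxes and extracted tables per page and sorts them by `(top, x0)`, so tables re-enter the page text at their visual position. A minimal, self-contained sketch of that idea; `reassemble_pages` and its toy inputs are illustrative stand-ins, not the parser's real data structures:

```python
from collections import defaultdict

def reassemble_pages(text_boxes, tables, num_pages):
    """Merge OCR text boxes and table blocks into per-page text,
    ordered by (top, x0) so content follows visual reading order."""
    page_items = defaultdict(list)
    # Text boxes carry a 1-based page number and their own coordinates.
    for b in text_boxes:
        page_items[b["page_number"]].append((b["top"], b["x0"], b["text"]))
    # Table positions start with a 0-based page index, like the diff's pn_index.
    for content, (pn_index, left, _right, top, _bottom) in tables:
        page_items[pn_index + 1].append((top, left, content))
    pages = []
    for pn in range(1, num_pages + 1):
        items = sorted(page_items.get(pn, []))  # tuple sort: top, then x0
        text = "\n\n".join(t for _, _, t in items)
        pages.append(text or f"[No text or data found in Page {pn}]")
    return pages

# A table sitting between two paragraphs on page 1:
boxes = [
    {"page_number": 1, "top": 10, "x0": 5, "text": "Intro paragraph"},
    {"page_number": 1, "top": 300, "x0": 5, "text": "Closing paragraph"},
]
tables = [("<table><tr><td>Q1</td><td>42</td></tr></table>",
           (0, 5, 400, 120, 200))]
print(reassemble_pages(boxes, tables, num_pages=1))
```

Because the sort key is position rather than source type (text vs. table), a table that sits between two paragraphs lands between them in the output instead of being dropped or tacked onto the end.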
```diff
@@ -106,14 +172,16 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     eng = lang.lower() == "english"
     doc = {
         "docnm_kwd": filename,
-        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+        "title_tks": rag_tokenizer.tokenize(
+            re.sub(r"\.[a-zA-Z]+$", "", filename))
     }
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
         ppt_parser = Ppt()
         for pn, (txt, img) in enumerate(ppt_parser(
-                filename if not binary else binary, from_page, 1000000, callback)):
+                filename if not binary else binary, from_page, 1000000,
+                callback)):
             d = copy.deepcopy(doc)
             pn += from_page
             d["image"] = img
```
```diff
@@ -135,14 +203,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         callback(0.1, "Start to parse.")
         sections, _, _ = parser(
-            filename = filename,
-            binary = binary,
-            from_page = from_page,
-            to_page = to_page,
-            lang = lang,
-            callback = callback,
-            pdf_cls = Pdf,
-            layout_recognizer = layout_recognizer,
+            filename=filename,
+            binary=binary,
+            from_page=from_page,
+            to_page=to_page,
+            lang=lang,
+            callback=callback,
+            pdf_cls=Pdf,
+            layout_recognizer=layout_recognizer,
             **kwargs
         )
```
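Passing `pdf_cls=Pdf` hands this module's presentation-aware parser to the shared pipeline from `rag.app.naive`, which instantiates whatever class it is given. A rough sketch of that inversion-of-control pattern, using hypothetical names (`parse_document`, `BasePdf`, `SlidePdf`) rather than `naive`'s actual internals:

```python
from typing import Type

class BasePdf:
    def __call__(self, filename, **kwargs):
        return [(f"default parse of {filename}", None)]

class SlidePdf(BasePdf):
    def __call__(self, filename, **kwargs):
        return [(f"slide-aware parse of {filename}", None)]

def parse_document(filename, pdf_cls: Type[BasePdf] = BasePdf, **kwargs):
    # The pipeline only needs something it can instantiate and call;
    # the caller picks the concrete parser class.
    return pdf_cls()(filename, **kwargs)

print(parse_document("deck.pdf", pdf_cls=SlidePdf))
```

This is why the table-aware `Pdf.__call__` above can replace the default behavior without the shared pipeline changing at all.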
```diff
@@ -151,7 +219,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if name in ["tcadp", "docling", "mineru"]:
             parser_config["chunk_token_num"] = 0
         callback(0.8, "Finish parsing.")

     for pn, (txt, img) in enumerate(sections):
```
```diff
@@ -161,7 +229,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         d["image"] = img
         d["page_num_int"] = [pn + 1]
         d["top_int"] = [0]
-        d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
+        d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0,
+                              img.size[1] if img else 0)]
         tokenize(d, txt, eng)
         res.append(d)
     return res
```
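For reference on the `position_int` line being re-wrapped above: as assembled here, each chunk carries one five-tuple that reads as `(page, left, right, top, bottom)`, with the slide image's full width and height standing in for the bounding box since the whole slide is treated as one region (Pillow's `Image.size` is `(width, height)`). A small sketch of the value that comes out, assuming Pillow is installed:

```python
from PIL import Image

img = Image.new("RGB", (960, 540))  # a rendered slide, width x height
pn = 0  # first slide
position_int = [(pn + 1, 0, img.size[0] if img else 0, 0,
                 img.size[1] if img else 0)]
print(position_int)  # [(1, 0, 960, 0, 540)]
```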
```diff
@@ -175,4 +244,5 @@ if __name__ == "__main__":
     def dummy(a, b):
         pass

     chunk(sys.argv[1], callback=dummy)
```
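The `__main__` block drives the whole pipeline from the command line with a no-op callback. A slightly chattier harness (hypothetical, meant to replace that block inside this module) prints progress and a peek at each chunk; note the callback is invoked both positionally, as `callback(0.9, "...")`, and by keyword, as `callback(msg="...")`, and the `content_with_weight` key is an assumption about what `tokenize` populates:

```python
import sys

def progress(prog=None, msg=""):
    # Accepts both callback(0.9, "...") and callback(msg="...") styles.
    print(f"[{prog}] {msg}")

if __name__ == "__main__":
    for d in chunk(sys.argv[1], callback=progress):
        print(d.get("page_num_int"), d.get("position_int"),
              str(d.get("content_with_weight", ""))[:80])
```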