feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.
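For context, the chunkers touched below all route PDF parsing through the `PARSERS` registry in `rag/app/naive.py`, keyed by the normalized `layout_recognize` value; this PR registers a `paddleocr` entry backed by `by_paddleocr`. A minimal, self-contained sketch of that dispatch pattern (the parser bodies are stand-ins, not the real implementations):

```python
from typing import Any, Callable

# Stand-in backends; the real by_paddleocr / by_plaintext live in rag/app/naive.py.
def by_paddleocr(filename: str, binary: bytes | None = None, **kwargs: Any):
    return [("page text", "")], [], "paddleocr-pdf-parser"

def by_plaintext(filename: str, binary: bytes | None = None, **kwargs: Any):
    return [("page text", "")], [], None

PARSERS: dict[str, Callable[..., Any]] = {
    "paddleocr": by_paddleocr,
    "plaintext": by_plaintext,  # default
}

def dispatch(layout_recognizer: str, filename: str, **kwargs: Any):
    # Unknown recognizer names fall back to the plaintext default.
    name = layout_recognizer.strip().lower()
    parser = PARSERS.get(name, PARSERS["plaintext"])
    sections, tables, pdf_parser = parser(filename, **kwargs)
    return name, sections, tables, pdf_parser

print(dispatch("PaddleOCR", "demo.pdf")[0])  # -> "paddleocr"
```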

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Lin Manhui
Date: 2026-01-09 17:48:45 +08:00
Committed by: GitHub
Parent: 6abf55c048
Commit: 2e09db02f3
34 changed files with 1510 additions and 453 deletions


@ -22,9 +22,7 @@ from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import bullets_category, is_english, remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks, attach_media_context
from rag.nlp import bullets_category, is_english, remove_contents_table, hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, tokenize_chunks, attach_media_context
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
@ -32,17 +30,12 @@ from PIL import Image
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
@ -62,24 +55,17 @@ class Pdf(PdfParser):
self._merge_with_same_bullet()
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
for b in self.boxes], tbls
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
please set up the page ranges for every book in order eliminate negative effects and save elapsed computing time.
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
please set up the page ranges for every book in order eliminate negative effects and save elapsed computing time.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections, tbls = [], []
@ -87,28 +73,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.1, "Start to parse.")
doc_parser = naive.Docx()
# TODO: table of contents needs to be removed
main_sections = doc_parser(
filename, binary=binary, from_page=from_page, to_page=to_page)
main_sections = doc_parser(filename, binary=binary, from_page=from_page, to_page=to_page)
sections = []
tbls = []
for text, image, html in main_sections:
sections.append((text, image))
tbls.append(((None, html), ""))
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs)
# tbls = [((None, lns), None) for lns in tbls]
sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if
not isinstance(item[1], Image.Image)]
sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if not isinstance(item[1], Image.Image)]
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -127,13 +108,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)
if not sections and not tables:
return []
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
@ -142,16 +124,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
@ -165,31 +145,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary = BytesIO(binary)
doc_parsed = tika_parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
if doc_parsed.get("content", None) is not None:
sections = doc_parsed["content"].split("\n")
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")
make_colon_as_title(sections)
bull = bullets_category(
[t for t in random_choices([t for t, _ in sections], k=100)])
bull = bullets_category([t for t in random_choices([t for t, _ in sections], k=100)])
if bull >= 0:
chunks = ["\n".join(ck)
for ck in hierarchical_merge(bull, sections, 5)]
chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 5)]
else:
sections = [s.split("@") for s, _ in sections]
sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections]
chunks = naive_merge(
sections,
parser_config.get("chunk_token_num", 256),
parser_config.get("delimiter", "\n。;!?")
)
sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], "") for pr in sections]
chunks = naive_merge(sections, parser_config.get("chunk_token_num", 256), parser_config.get("delimiter", "\n。;!?"))
# is it English
# is_english(random_choices([t for t, _ in sections], k=218))
@ -208,9 +180,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
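
The `__main__` stub above hints at the intended entry point; a slightly fuller driver, assuming `chunk` is the function defined in this file (the callback contract mirrors the `callback(0.8, "Finish parsing.")` calls above, with `-1` signalling failure):

import sys

def progress(prog=None, msg=""):
    # prog is a fraction in [0, 1] (or -1 on failure); msg is a status string.
    print(f"[{prog}] {msg}" if prog is not None else msg)

if __name__ == "__main__":
    # Restricting the page range keeps OCR/layout work off the useless parts of a long book.
    chunks = chunk(sys.argv[1], from_page=1, to_page=10, callback=progress)
    print(f"{len(chunks)} chunks produced")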


@ -21,8 +21,7 @@ from docx import Document
from common.constants import ParserType
from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, remove_contents_table, \
make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
from rag.nlp import bullets_category, remove_contents_table, make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
from rag.nlp import rag_tokenizer, Node
from deepdoc.parser import PdfParser, DocxParser, HtmlParser
from rag.app.naive import by_plaintext, PARSERS
@ -38,8 +37,7 @@ class Docx(DocxParser):
return line
def old_call(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
for p in self.doc.paragraphs:
@ -48,16 +46,15 @@ class Docx(DocxParser):
if from_page <= pn < to_page and p.text.strip():
lines.append(self.__clean(p.text))
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
if "lastRenderedPageBreak" in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
return [line for line in lines if line]
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
level_set = set()
@ -71,10 +68,10 @@ class Docx(DocxParser):
lines.append((question_level, p_text))
level_set.add(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
if "lastRenderedPageBreak" in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
sorted_levels = sorted(level_set)
@ -88,12 +85,12 @@ class Docx(DocxParser):
return [element for element in root.get_tree() if element]
def __str__(self) -> str:
return f'''
return f"""
question:{self.question},
answer:{self.answer},
level:{self.level},
childs:{self.childs}
'''
"""
class Pdf(PdfParser):
@ -101,18 +98,12 @@ class Pdf(PdfParser):
self.model_speciess = ParserType.LAWS.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
@ -123,22 +114,15 @@ class Pdf(PdfParser):
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], None
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Supported file formats are docx, pdf, txt.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections = []
@ -152,9 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return tokenize_chunks(chunks, doc, eng, None)
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -173,13 +155,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)
if not raw_sections and not tables:
return []
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
for txt, poss in raw_sections:
@ -210,8 +193,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary = BytesIO(binary)
doc_parsed = tika_parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
if doc_parsed.get("content", None) is not None:
sections = doc_parsed["content"].split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
@ -219,8 +202,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
logging.warning(f"tika.parser got empty content from {filename}.")
return []
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")
# Remove 'Contents' part
remove_contents_table(sections, eng)
@ -241,9 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)


@ -20,8 +20,7 @@ import re
from common.constants import ParserType
from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, \
docx_question_level, attach_media_context
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
from common.token_utils import num_tokens_from_string
from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
@ -36,18 +35,12 @@ class Pdf(PdfParser):
self.model_speciess = ParserType.MANUAL.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.debug("OCR: {}".format(timer() - start))
@ -71,8 +64,7 @@ class Pdf(PdfParser):
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)], tbls
return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)], tbls
class Docx(DocxParser):
@ -80,12 +72,12 @@ class Docx(DocxParser):
pass
def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
img = paragraph._element.xpath(".//pic:pic")
if not img:
return None
try:
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
embed = img.xpath(".//a:blip/@r:embed")[0]
related_part = document.part.related_parts[embed]
image = related_part.image
if image is not None:
@ -111,7 +103,7 @@ class Docx(DocxParser):
new_width = max(width1, width2)
new_height = height1 + height2
new_image = Image.new('RGB', (new_width, new_height))
new_image = Image.new("RGB", (new_width, new_height))
new_image.paste(img1, (0, 0))
new_image.paste(img2, (0, height1))
@ -119,8 +111,7 @@ class Docx(DocxParser):
return new_image
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
last_answer, last_image = "", None
question_stack, level_stack = [], []
@ -128,19 +119,19 @@ class Docx(DocxParser):
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = 0, ''
question_level, p_text = 0, ""
if from_page <= pn < to_page and p.text.strip():
question_level, p_text = docx_question_level(p)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{p_text}'
last_answer = f"{last_answer}\n{p_text}"
current_image = self.get_picture(self.doc, p)
last_image = self.concat_img(last_image, current_image)
else: # is a question
if last_answer or last_image:
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
last_answer, last_image = '', None
ti_list.append((f"{sum_question}\n{last_answer}", last_image))
last_answer, last_image = "", None
i = question_level
while question_stack and i <= level_stack[-1]:
@ -149,15 +140,15 @@ class Docx(DocxParser):
question_stack.append(p_text)
level_stack.append(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
if "lastRenderedPageBreak" in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
if last_answer:
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
ti_list.append((f"{sum_question}\n{last_answer}", last_image))
tbls = []
for tb in self.doc.tables:
@ -182,26 +173,19 @@ class Docx(DocxParser):
return ti_list, tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
Only pdf is supported.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
pdf_parser = None
doc = {
"docnm_kwd": filename
}
doc = {"docnm_kwd": filename}
doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
if re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -222,8 +206,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
paddleocr_llm_name=parser_model_name,
parse_method="manual",
**kwargs
**kwargs,
)
def _normalize_section(section):
@ -252,7 +237,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if not sections and not tbls:
return []
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
@ -264,8 +249,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
for txt, _, _ in sections:
for t, lvl in pdf_parser.outlines:
tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
tks_ = set([txt[i] + txt[i + 1]
for i in range(min(len(t), len(txt) - 1))])
tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
levels.append(lvl)
break
@ -274,8 +258,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
else:
bull = bullets_category([txt for txt, _, _ in sections])
most_level, levels = title_frequency(
bull, [(txt, lvl) for txt, lvl, _ in sections])
most_level, levels = title_frequency(bull, [(txt, lvl) for txt, lvl, _ in sections])
assert len(sections) == len(levels)
sec_ids = []
@ -285,25 +268,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sid += 1
sec_ids.append(sid)
sections = [(txt, sec_ids[i], poss)
for i, (txt, _, poss) in enumerate(sections)]
sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0], -1,
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
def tag(pn, left, right, top, bottom):
if pn + left + right + top + bottom == 0:
return ""
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(pn, left, right, top, bottom)
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(pn, left, right, top, bottom)
chunks = []
last_sid = -2
tk_cnt = 0
for txt, sec_id, poss in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1])):
for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
poss = "\t".join([tag(*pos) for pos in poss])
if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
if chunks:
@ -330,14 +309,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
ti_list, tbls = docx_parser(filename, binary, from_page=0, to_page=10000, callback=callback)
tbls = vision_figure_parser_docx_wrapper(sections=ti_list, tbls=tbls, callback=callback, **kwargs)
res = tokenize_table(tbls, doc, eng)
for text, image in ti_list:
d = copy.deepcopy(doc)
if image:
d['image'] = image
d["image"] = image
d["doc_type_kwd"] = "image"
tokenize(d, text, eng)
res.append(d)
@ -353,9 +331,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)


@ -33,29 +33,32 @@ from common.token_utils import num_tokens_from_string
from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, \
PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, \
vision_figure_parser_pdf_wrapper
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, \
tokenize_chunks, doc_tokenize_chunks_with_images, tokenize_table, append_context2table_image4pdf, tokenize_chunks_with_images, \
attach_media_context # noqa: F401
from rag.nlp import (
concat_img,
find_codec,
naive_merge,
naive_merge_with_images,
naive_merge_docx,
rag_tokenizer,
tokenize_chunks,
doc_tokenize_chunks_with_images,
tokenize_table,
append_context2table_image4pdf,
tokenize_chunks_with_images,
) # noqa: F401
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
**kwargs):
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
callback = callback
binary = binary
pdf_parser = pdf_cls() if pdf_cls else Pdf()
sections, tables = pdf_parser(
filename if not binary else binary,
from_page=from_page,
to_page=to_page,
callback=callback
)
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
tables = vision_figure_parser_pdf_wrapper(
tbls=tables,
@ -67,17 +70,17 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
def by_mineru(
filename,
binary=None,
from_page=0,
to_page=100000,
lang="Chinese",
callback=None,
pdf_cls=None,
parse_method: str = "raw",
mineru_llm_name: str | None = None,
tenant_id: str | None = None,
**kwargs,
):
pdf_parser = None
if tenant_id:
@ -115,8 +118,7 @@ def by_mineru(
return None, None, None
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
**kwargs):
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
pdf_parser = DoclingParser()
parse_method = kwargs.get("parse_method", "raw")
@ -130,7 +132,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
parse_method=parse_method
parse_method=parse_method,
)
return sections, tables, pdf_parser
@ -142,16 +144,60 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese",
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
return None, None, tcadp_parser
sections, tables = tcadp_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
file_type="PDF"
)
sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type="PDF")
return sections, tables, tcadp_parser
def by_paddleocr(
filename,
binary=None,
from_page=0,
to_page=100000,
lang="Chinese",
callback=None,
pdf_cls=None,
parse_method: str = "raw",
paddleocr_llm_name: str | None = None,
tenant_id: str | None = None,
**kwargs,
):
pdf_parser = None
if tenant_id:
if not paddleocr_llm_name:
try:
from api.db.services.tenant_llm_service import TenantLLMService
env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR)
if candidates:
paddleocr_llm_name = candidates[0].llm_name
elif env_name:
paddleocr_llm_name = env_name
except Exception as e: # best-effort fallback
logging.warning(f"fallback to env paddleocr: {e}")
if paddleocr_llm_name:
try:
ocr_model = LLMBundle(tenant_id=tenant_id, llm_type=LLMType.OCR, llm_name=paddleocr_llm_name, lang=lang)
pdf_parser = ocr_model.mdl
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
parse_method=parse_method,
**kwargs,
)
return sections, tables, pdf_parser
except Exception as e:
logging.error(f"Failed to parse pdf via LLMBundle PaddleOCR ({paddleocr_llm_name}): {e}")
return None, None, None
if callback:
callback(-1, "PaddleOCR not found.")
return None, None, None
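
The model-name resolution above amounts to a three-step fallback; a sketch as a pure function, where `tenant_models` and `env_default` stand in for the `TenantLLMService.query(...)` and `ensure_paddleocr_from_env(...)` results:

def resolve_paddleocr_name(explicit: str | None,
                           tenant_models: list[str],
                           env_default: str | None) -> str | None:
    if explicit:            # 1. an explicitly configured model name wins
        return explicit
    if tenant_models:       # 2. else the tenant's first registered PaddleOCR model
        return tenant_models[0]
    return env_default      # 3. else whatever the PADDLEOCR_* env vars provide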
def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
layout_recognizer = (kwargs.get("layout_recognizer") or "").strip()
if (not layout_recognizer) or (layout_recognizer == "Plain Text"):
@ -168,12 +214,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No
)
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
sections, tables = pdf_parser(
filename if not binary else binary,
from_page=from_page,
to_page=to_page,
callback=callback
)
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
return sections, tables, pdf_parser
@ -182,6 +223,7 @@ PARSERS = {
"mineru": by_mineru,
"docling": by_docling,
"tcadp": by_tcadp,
"paddleocr": by_paddleocr,
"plaintext": by_plaintext, # default
}
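# Illustrative: each by_* entry shares the same shape, roughly
#   (filename, binary=None, ..., **kwargs) -> (sections, tables, pdf_parser),
# and "plaintext" is the default when no dedicated backend matches.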
@ -191,12 +233,12 @@ class Docx(DocxParser):
pass
def get_picture(self, document, paragraph):
imgs = paragraph._element.xpath('.//pic:pic')
imgs = paragraph._element.xpath(".//pic:pic")
if not imgs:
return None
res_img = None
for img in imgs:
embed = img.xpath('.//a:blip/@r:embed')
embed = img.xpath(".//a:blip/@r:embed")
if not embed:
continue
embed = embed[0]
@ -219,7 +261,7 @@ class Docx(DocxParser):
logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}")
continue
try:
image = Image.open(BytesIO(image_blob)).convert('RGB')
image = Image.open(BytesIO(image_blob)).convert("RGB")
if res_img is None:
res_img = image
else:
@ -251,11 +293,11 @@ class Docx(DocxParser):
try:
# Iterate through all paragraphs and tables in document order
for i, block in enumerate(self.doc._element.body):
if block.tag.endswith('p'): # Paragraph
if block.tag.endswith("p"): # Paragraph
p = Paragraph(block, self.doc)
blocks.append(('p', i, p))
elif block.tag.endswith('tbl'): # Table
blocks.append(('t', i, None)) # Table object will be retrieved later
blocks.append(("p", i, p))
elif block.tag.endswith("tbl"): # Table
blocks.append(("t", i, None)) # Table object will be retrieved later
except Exception as e:
logging.error(f"Error collecting blocks: {e}")
return ""
@ -264,7 +306,7 @@ class Docx(DocxParser):
target_table_pos = -1
table_count = 0
for i, (block_type, pos, _) in enumerate(blocks):
if block_type == 't':
if block_type == "t":
if table_count == table_index:
target_table_pos = pos
break
@ -280,7 +322,7 @@ class Docx(DocxParser):
if pos >= target_table_pos: # Skip blocks after the table
continue
if block_type != 'p':
if block_type != "p":
continue
if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
@ -309,7 +351,7 @@ class Docx(DocxParser):
if pos >= target_table_pos: # Skip blocks after the table
continue
if block_type != 'p':
if block_type != "p":
continue
if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
@ -340,8 +382,7 @@ class Docx(DocxParser):
return ""
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
last_image = None
@ -357,7 +398,7 @@ class Docx(DocxParser):
if pn > to_page:
break
if block.tag.endswith('p'):
if block.tag.endswith("p"):
p = Paragraph(block, self.doc)
if from_page <= pn < to_page:
@ -417,7 +458,7 @@ class Docx(DocxParser):
if "w:br" in xml and 'type="page"' in xml:
pn += 1
elif block.tag.endswith('tbl'):
elif block.tag.endswith("tbl"):
if pn < from_page or pn > to_page:
table_idx += 1
continue
@ -455,7 +496,6 @@ class Docx(DocxParser):
return new_line
def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
"""
This function uses mammoth, licensed under the BSD 2-Clause License.
@ -486,8 +526,7 @@ class Docx(DocxParser):
try:
if inline_images:
result = mammoth.convert_to_html(docx_file,
convert_image=mammoth.images.img_element(_convert_image_to_base64))
result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
else:
result = mammoth.convert_to_html(docx_file)
@ -505,18 +544,11 @@ class Pdf(PdfParser):
def __init__(self):
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
start = timer()
first_start = start
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
@ -559,13 +591,14 @@ class Markdown(MarkdownParser):
return []
from bs4 import BeautifulSoup
html_content = markdown(text)
soup = BeautifulSoup(html_content, 'html.parser')
soup = BeautifulSoup(html_content, "html.parser")
return soup
def get_hyperlink_urls(self, soup):
if soup:
return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
return set([a.get("href") for a in soup.find_all("a") if a.get("href")])
return []
def extract_image_urls_with_lines(self, text):
@ -588,10 +621,10 @@ class Markdown(MarkdownParser):
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(text, 'html.parser')
soup = BeautifulSoup(text, "html.parser")
newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
for img_tag in soup.find_all('img'):
src = img_tag.get('src')
for img_tag in soup.find_all("img"):
src = img_tag.get("src")
if not src:
continue
@ -627,14 +660,14 @@ class Markdown(MarkdownParser):
continue
img_obj = None
try:
if url.startswith(('http://', 'https://')):
if url.startswith(("http://", "https://")):
response = requests.get(url, stream=True, timeout=30)
if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
img_obj = Image.open(BytesIO(response.content)).convert('RGB')
if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
img_obj = Image.open(BytesIO(response.content)).convert("RGB")
else:
local_path = Path(url)
if local_path.exists():
img_obj = Image.open(url).convert('RGB')
img_obj = Image.open(url).convert("RGB")
else:
logging.warning(f"Local image file not found: {url}")
except Exception as e:
@ -652,7 +685,7 @@ class Markdown(MarkdownParser):
with open(filename, "r") as f:
txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
remainder, tables = self.extract_tables_and_remainder(f"{txt}\n", separate_tables=separate_tables)
# To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
# extractor = MarkdownElementExtractor(remainder)
extractor = MarkdownElementExtractor(txt)
@ -678,7 +711,7 @@ class Markdown(MarkdownParser):
tbls = []
for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
tbls.append(((None, markdown(table, extensions=["markdown.extensions.tables"])), ""))
if return_section_images:
return sections, tbls, section_images
return sections, tbls
@ -694,7 +727,7 @@ def load_from_xml_v2(baseURI, rels_item_xml):
if rels_item_xml is not None:
rels_elm = parse_xml(rels_item_xml)
for rel_elm in rels_elm.Relationship_lst:
if rel_elm.target_ref in ('../NULL', 'NULL'):
if rel_elm.target_ref in ("../NULL", "NULL"):
continue
srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
return srels
@ -702,21 +735,18 @@ def load_from_xml_v2(baseURI, rels_item_xml):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt.
This method apply the naive ways to chunk files.
Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
Supported file formats are docx, pdf, excel, txt.
This method apply the naive ways to chunk files.
Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
"""
urls = set()
url_res = []
is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
child_deli = (parser_config.get("children_delimiter") or "").encode('utf-8').decode('unicode_escape').encode(
'latin1').decode('utf-8')
child_deli = (parser_config.get("children_delimiter") or "").encode("utf-8").decode("unicode_escape").encode("latin1").decode("utf-8")
cust_child_deli = re.findall(r"`([^`]+)`", child_deli)
child_deli = "|".join(re.sub(r"`([^`]+)`", "", child_deli))
if cust_child_deli:
@ -728,10 +758,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
pdf_parser = None
@ -750,8 +777,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
# Recursively chunk each embedded file and collect results
for embed_filename, embed_bytes in embeds:
try:
sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False,
**kwargs) or []
sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or []
embed_res.extend(sub_res)
except Exception as e:
error_msg = f"Failed to chunk embed {embed_filename}: {e}"
@ -772,8 +798,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
except Exception as e:
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False,
**kwargs)
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
url_res.extend(sub_url_res)
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
@ -784,11 +809,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
# chunks list[dict]
# images list - index of image chunk in chunks
chunks, images = naive_merge_docx(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"), table_context_size, image_context_size)
chunks, images = naive_merge_docx(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), table_context_size, image_context_size)
vision_figure_parser_docx_wrapper_naive(chunks=chunks, idx_lst=images, callback=callback, **kwargs)
callback(0.8, "Finish parsing.")
@ -801,9 +823,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if parser_config.get("analyze_hyperlink", False) and is_root:
urls = extract_links_from_pdf(binary)
@ -824,7 +844,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
callback=callback,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)
if not sections and not tables:
@ -833,7 +854,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
if table_context_size or image_context_size:
tables = append_context2table_image4pdf(sections, tables, image_context_size)
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
res = tokenize_table(tables, doc, is_english)
@ -847,10 +868,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
if layout_recognizer == "TCADP Parser":
table_result_type = parser_config.get("table_result_type", "1")
markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(
table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type
)
tcadp_parser = TCADPParser(table_result_type=table_result_type, markdown_image_response_type=markdown_image_response_type)
if not tcadp_parser.check_installation():
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
return res
@ -858,13 +876,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
# Determine file type based on extension
file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
sections, tables = tcadp_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
file_type=file_type
)
sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type=file_type)
parser_config["chunk_token_num"] = 0
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
@ -879,9 +891,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = TxtParser()(filename, binary,
parser_config.get("chunk_token_num", 128),
parser_config.get("delimiter", "\n!?;。;!?"))
sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?"))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):
@ -919,11 +929,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
else:
section_images = [None] * len(sections)
section_images[idx] = combined_image
markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[
((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
boosted_figures = markdown_vision_parser(callback=callback)
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]),
sections[idx][1])
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
else:
logging.warning("No visual model detected. Skipping figure parsing enhancement.")
@ -962,8 +970,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
binary = BytesIO(binary)
doc_parsed = tika_parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
if doc_parsed.get("content", None) is not None:
sections = doc_parsed["content"].split("\n")
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
else:
@ -972,8 +980,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
logging.warning(error_msg)
return []
else:
raise NotImplementedError(
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
st = timer()
if is_markdown:
@ -1021,8 +1028,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
has_images = merged_images and any(img is not None for img in merged_images)
if has_images:
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images,
child_delimiters_pattern=child_deli))
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli))
else:
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
else:
@ -1031,17 +1037,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
section_images = None
if section_images:
chunks, images = naive_merge_with_images(sections, section_images,
int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
res.extend(
tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
else:
chunks = naive_merge(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
@ -1071,9 +1070,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
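
A hypothetical invocation that exercises the new path, overriding the defaults shown in this file; the exact recognizer string accepted by `normalize_layout_recognizer` is an assumption here:

def progress(prog=None, msg=""):
    print(prog, msg)

cks = chunk(
    "example.pdf",  # hypothetical input
    parser_config={
        "chunk_token_num": 512,
        "delimiter": "\n!?。;!?",
        "layout_recognize": "PaddleOCR",  # assumed value; normalized before the PARSERS lookup
        "analyze_hyperlink": True,
    },
    from_page=0,
    to_page=10,
    callback=progress,
    tenant_id="tenant-0",  # by_paddleocr needs a tenant to resolve the OCR model
)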


@ -28,18 +28,12 @@ from common.parser_config_utils import normalize_layout_recognizer
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
@ -57,21 +51,16 @@ class Pdf(PdfParser):
tbls = self._extract_table_figure(True, zoomin, True, True)
self._concat_downward()
sections = [(b["text"], self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)]
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls
sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt.
One file forms a chunk which maintains original text order.
Supported file formats are docx, pdf, excel, txt.
One file forms a chunk which maintains original text order.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
eng = lang.lower() == "english" # is_english(cks)
if re.search(r"\.docx$", filename, re.IGNORECASE):
@ -99,9 +88,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -120,13 +107,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)
if not sections and not tbls:
return []
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
@ -134,8 +122,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
sections.append((rows if isinstance(rows, str) else rows[0], [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
sections = [s for s, _ in sections if s]
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
@ -167,19 +154,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary = BytesIO(binary)
doc_parsed = tika_parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
if doc_parsed.get("content", None) is not None:
sections = doc_parsed["content"].split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
tokenize(doc, "\n".join(sections), eng)
return [doc]
@ -188,9 +171,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)


@ -36,22 +36,18 @@ class Ppt(PptParser):
callback(0.5, "Text extraction finished.")
import aspose.slides as slides
import aspose.pydrawing as drawing
imgs = []
with slides.Presentation(BytesIO(fnm)) as presentation:
for i, slide in enumerate(presentation.slides[from_page: to_page]):
for i, slide in enumerate(presentation.slides[from_page:to_page]):
try:
with BytesIO() as buffered:
slide.get_thumbnail(
0.1, 0.1).save(
buffered, drawing.imaging.ImageFormat.jpeg)
slide.get_thumbnail(0.1, 0.1).save(buffered, drawing.imaging.ImageFormat.jpeg)
buffered.seek(0)
imgs.append(Image.open(buffered).copy())
except RuntimeError as e:
raise RuntimeError(
f'ppt parse error at page {i + 1}, original error: {str(e)}') from e
assert len(imgs) == len(
txts), "Slides text and image do not match: {} vs. {}".format(
len(imgs), len(txts))
raise RuntimeError(f"ppt parse error at page {i + 1}, original error: {str(e)}") from e
assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
callback(0.9, "Image extraction finished")
self.is_english = is_english(txts)
return [(txts[i], imgs[i]) for i in range(len(txts))]
@ -61,12 +57,10 @@ class Pdf(PdfParser):
def __init__(self):
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None, **kwargs):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, **kwargs):
# 1. OCR
callback(msg="OCR started")
self.__images__(filename if not binary else binary, zoomin, from_page,
to_page, callback)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
# 2. Layout Analysis
callback(msg="Layout Analysis")
@ -91,12 +85,7 @@ class Pdf(PdfParser):
global_page_num = b["page_number"] + from_page
if not (from_page < global_page_num <= to_page + from_page):
continue
page_items[global_page_num].append({
"top": b["top"],
"x0": b["x0"],
"text": b["text"],
"type": "text"
})
page_items[global_page_num].append({"top": b["top"], "x0": b["x0"], "text": b["text"], "type": "text"})
# (B) Add table and figure
for (img, content), positions in tbls:
@ -127,12 +116,7 @@ class Pdf(PdfParser):
top = positions[0][3]
left = positions[0][1]
page_items[current_page_num].append({
"top": top,
"x0": left,
"text": final_text,
"type": "table_or_figure"
})
page_items[current_page_num].append({"top": top, "x0": left, "text": final_text, "type": "table_or_figure"})
# 7. Generate result
res = []
@ -153,18 +137,16 @@ class Pdf(PdfParser):
class PlainPdf(PlainParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, callback=None, **kwargs):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
page_txt = []
for page in self.pdf.pages[from_page: to_page]:
for page in self.pdf.pages[from_page:to_page]:
page_txt.append(page.extract_text())
callback(0.9, "Parsing finished")
return [(txt, None) for txt in page_txt], []
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, parser_config=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs):
"""
The supported file formats are pdf, pptx.
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
@ -173,18 +155,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if parser_config is None:
parser_config = {}
eng = lang.lower() == "english"
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(
re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
ppt_parser = Ppt()
for pn, (txt, img) in enumerate(ppt_parser(
filename if not binary else binary, from_page, 1000000,
callback)):
for pn, (txt, img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
d = copy.deepcopy(doc)
pn += from_page
d["image"] = img
@ -196,9 +172,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.append(d)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -217,13 +191,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)
if not sections:
return []
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
@ -236,22 +211,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
d["image"] = img
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0,
img.size[1] if img else 0)]
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
tokenize(d, txt, eng)
res.append(d)
return res
raise NotImplementedError(
"file type not supported yet(pptx, pdf supported)")
raise NotImplementedError("file type not supported yet (pptx, pdf supported)")
if __name__ == "__main__":
import sys
def dummy(a, b):
pass
chunk(sys.argv[1], callback=dummy)

View File

@ -166,7 +166,7 @@ class ParserParam(ProcessParamBase):
pdf_parse_method = pdf_config.get("parse_method", "")
self.check_empty(pdf_parse_method, "Parse method abnormal.")
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser"]:
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser", "paddleocr"]:
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
pdf_output_format = pdf_config.get("output_format", "")
@ -232,6 +232,9 @@ class Parser(ProcessBase):
if lowered.endswith("@mineru"):
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
parse_method = "MinerU"
elif lowered.endswith("@paddleocr"):
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
parse_method = "PaddleOCR"
if parse_method.lower() == "deepdoc":
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
@ -239,6 +242,7 @@ class Parser(ProcessBase):
lines, _ = PlainParser()(blob)
bboxes = [{"text": t} for t, _ in lines]
elif parse_method.lower() == "mineru":
def resolve_mineru_llm_name():
configured = parser_model_name or conf.get("mineru_llm_name")
if configured:
@ -320,6 +324,84 @@ class Parser(ProcessBase):
bboxes.append({"text": section})
else:
bboxes.append({"text": section})
elif parse_method.lower() == "paddleocr":
def resolve_paddleocr_llm_name():
configured = parser_model_name or conf.get("paddleocr_llm_name")
if configured:
return configured
tenant_id = self._canvas._tenant_id
if not tenant_id:
return None
from api.db.services.tenant_llm_service import TenantLLMService
env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR.value)
if candidates:
return candidates[0].llm_name
return env_name
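# Resolution order: an explicitly configured name first, then the tenant's first
# registered PaddleOCR OCR model (ensure_paddleocr_from_env may auto-provision one
# from PADDLEOCR_* env vars), and the env-derived name as the last fallback.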
parser_model_name = resolve_paddleocr_llm_name()
if not parser_model_name:
raise RuntimeError("PaddleOCR model not configured. Please add PaddleOCR in Model Providers or set the PADDLEOCR_* environment variables.")
tenant_id = self._canvas._tenant_id
ocr_model = LLMBundle(tenant_id, LLMType.OCR, llm_name=parser_model_name)
pdf_parser = ocr_model.mdl
lines, _ = pdf_parser.parse_pdf(
filepath=name,
binary=blob,
callback=self.callback,
parse_method=conf.get("paddleocr_parse_method", "raw"),
)
import re  # used for position-tag parsing below

bboxes = []
for section in lines:
    # PaddleOCRParser returns each section as a tuple; the shape depends on parse_method:
    # - "raw":    (text, position_tag)
    # - "manual": (text, label, position_tag)
    # - "paper":  (text_with_tag, label), with the position tag embedded in the text
    text = section[0]
    position_tag = ""
    # Check "paper" first: its tuples are also length 2, so the generic length
    # checks below would otherwise misread the label as a position tag.
    if "paper" in conf.get("paddleocr_parse_method", ""):
        tag_match = re.search(r"(@@[0-9-]+\t[0-9.\t]+##)", text)
        if tag_match:
            position_tag = tag_match.group(1)
            text = text.replace(position_tag, "").strip()
    elif len(section) == 2:  # raw format: (text, tag)
        position_tag = section[1]
    elif len(section) == 3:  # manual format: (text, label, tag)
        position_tag = section[2]
    # Extract coordinate information from the position tag
    page_number, x0, x1, top, bottom = 1, 0, 0, 0, 0
    if position_tag:
        tag_match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
        if tag_match:
            pn, x0_str, x1_str, top_str, bottom_str = tag_match.groups()
            page_number = int(pn.split("-")[0])  # a multi-page span such as "3-4" keeps its first page
            x0, x1, top, bottom = float(x0_str), float(x1_str), float(top_str), float(bottom_str)
    box = {
        "text": text,
        "page_number": page_number,
        "x0": x0,
        "x1": x1,
        "top": top,
        "bottom": bottom,
    }
    bboxes.append(box)
else:
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
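# A minimal standalone sketch of the position-tag convention parsed in the
# "paddleocr" branch above; the format "@@<pages>\t<x0>\t<x1>\t<top>\t<bottom>##"
# and all values are illustrative, inferred from the regexes in that branch
# rather than from a published spec.

import re

def parse_position_tag(tag: str) -> tuple[int, float, float, float, float]:
    """Return (page, x0, x1, top, bottom) parsed from a position tag."""
    m = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", tag)
    if not m:
        return 1, 0.0, 0.0, 0.0, 0.0
    pn, x0, x1, top, bottom = m.groups()
    return int(pn.split("-")[0]), float(x0), float(x1), float(top), float(bottom)

# e.g. parse_position_tag("@@3-4\t10.5\t200.0\t30.0\t42.5##") == (3, 10.5, 200.0, 30.0, 42.5)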
@ -802,7 +884,7 @@ class Parser(ProcessBase):
outs = self.output()
tasks = []
for d in outs.get("json", []):
tasks.append(asyncio.create_task(image2id(d,partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id),get_uuid())))
tasks.append(asyncio.create_task(image2id(d, partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())))
try:
await asyncio.gather(*tasks, return_exceptions=False)

View File

@ -19,6 +19,7 @@ import os
from typing import Any, Optional
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.paddleocr_parser import PaddleOCRParser
class Base:
@ -60,16 +61,11 @@ class MinerUOcrModel(Base, MinerUParser):
# Redact sensitive config keys before logging
redacted_config = {}
for k, v in config.items():
if any(
sensitive_word in k.lower()
for sensitive_word in ("key", "password", "token", "secret")
):
if any(sensitive_word in k.lower() for sensitive_word in ("key", "password", "token", "secret")):
redacted_config[k] = "[REDACTED]"
else:
redacted_config[k] = v
logging.info(
f"Parsed MinerU config (sensitive fields redacted): {redacted_config}"
)
logging.info(f"Parsed MinerU config (sensitive fields redacted): {redacted_config}")
MinerUParser.__init__(self, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
@ -93,6 +89,60 @@ class MinerUOcrModel(Base, MinerUParser):
server_url=self.mineru_server_url,
delete_output=self.mineru_delete_output,
parse_method=parse_method,
**kwargs
**kwargs,
)
return sections, tables
class PaddleOCROcrModel(Base, PaddleOCRParser):
_FACTORY_NAME = "PaddleOCR"
def __init__(self, key: str | dict, model_name: str, **kwargs):
Base.__init__(self, key, model_name, **kwargs)
raw_config = {}
if isinstance(key, dict):
    # the caller may pass an already-parsed dict
    raw_config = key
elif key:
    try:
        raw_config = json.loads(key)
    except Exception:
        raw_config = {}
# Accept either a nested {"api_key": {...}} payload from the UI
# or a flat {"PADDLEOCR_*": "..."} payload auto-provisioned from env vars.
config = raw_config.get("api_key", raw_config)
if not isinstance(config, dict):
config = {}
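# Two illustrative `key` payloads this constructor accepts (hypothetical values):
#   nested, from the UI:              '{"api_key": {"paddleocr_api_url": "http://paddleocr:8080"}}'
#   flat, auto-provisioned from env:  '{"PADDLEOCR_API_URL": "http://paddleocr:8080"}'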
def _resolve_config(key: str, env_key: str, default=""):
# Precedence: lower-case key (UI) > upper-case PADDLEOCR_* key (env auto-provision) > process env var
return config.get(key, config.get(env_key, os.environ.get(env_key, default)))
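# e.g. with config = {"paddleocr_api_url": "http://paddleocr:8080"} (illustrative),
# the lower-case UI value wins even if PADDLEOCR_API_URL is also set in the environment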
self.paddleocr_api_url = _resolve_config("paddleocr_api_url", "PADDLEOCR_API_URL", "")
self.paddleocr_algorithm = _resolve_config("paddleocr_algorithm", "PADDLEOCR_ALGORITHM", "PaddleOCR-VL")
self.paddleocr_access_token = _resolve_config("paddleocr_access_token", "PADDLEOCR_ACCESS_TOKEN", None)
# Redact sensitive config keys before logging
redacted_config = {}
for k, v in config.items():
if any(sensitive_word in k.lower() for sensitive_word in ("key", "password", "token", "secret")):
redacted_config[k] = "[REDACTED]"
else:
redacted_config[k] = v
logging.info(f"Parsed PaddleOCR config (sensitive fields redacted): {redacted_config}")
PaddleOCRParser.__init__(
self,
api_url=self.paddleocr_api_url,
access_token=self.paddleocr_access_token,
algorithm=self.paddleocr_algorithm,
)
def check_available(self) -> tuple[bool, str]:
return self.check_installation()
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
ok, reason = self.check_available()
if not ok:
raise RuntimeError(f"PaddleOCR server not accessible: {reason}")
sections, tables = PaddleOCRParser.parse_pdf(self, filepath=filepath, binary=binary, callback=callback, parse_method=parse_method, **kwargs)
return sections, tables
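# A minimal usage sketch (hypothetical endpoint, model name, and file; assumes a
# reachable PaddleOCR server and that this module's dependencies are importable):
if __name__ == "__main__":
    model = PaddleOCROcrModel(
        key='{"PADDLEOCR_API_URL": "http://paddleocr:8080"}',  # illustrative endpoint
        model_name="PaddleOCR-VL",  # illustrative model name
    )
    secs, tbls = model.parse_pdf(filepath="sample.pdf", callback=lambda *a, **kw: None, parse_method="raw")
    print(f"{len(secs)} sections, {len(tbls)} tables")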