Mirror of https://github.com/infiniflow/ragflow.git, synced 2026-02-02 00:25:06 +08:00
feat: add paddleocr parser (#12513)
### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
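For orientation, here is a minimal sketch of how the new route is exercised, based on the `PARSERS` registry and call sites visible in the diff below. The callback shape mirrors the `__main__` examples in the diff; the file name and tenant id are hypothetical, and whether this would succeed depends on a tenant having a PaddleOCR OCR model configured:

```python
# Hypothetical usage sketch (not part of the PR). The names PARSERS,
# by_paddleocr, paddleocr_llm_name and tenant_id all come from the diff.
from rag.app.naive import PARSERS


def dummy(prog=None, msg=""):
    # Progress callback, same shape as the __main__ examples in the diff.
    pass


# chunk() normally resolves parser_config["layout_recognize"] through
# normalize_layout_recognizer() and dispatches via PARSERS[name]; this
# calls the new entry directly for illustration.
sections, tables, pdf_parser = PARSERS["paddleocr"](
    "sample.pdf",             # hypothetical input PDF
    callback=dummy,
    paddleocr_llm_name=None,  # resolved from the tenant's PaddleOCR models
    tenant_id="tenant-0",     # hypothetical tenant id
)
```

As with `tcadp`, `docling`, and `mineru`, the chunkers set `chunk_token_num` to 0 when `name == "paddleocr"`, so the sections returned by the parser are kept as-is rather than re-merged.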
@@ -22,9 +22,7 @@ from deepdoc.parser.utils import get_text
 from rag.app import naive
 from rag.app.naive import by_plaintext, PARSERS
 from common.parser_config_utils import normalize_layout_recognizer
-from rag.nlp import bullets_category, is_english, remove_contents_table, \
-    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
-    tokenize_chunks, attach_media_context
+from rag.nlp import bullets_category, is_english, remove_contents_table, hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, tokenize_chunks, attach_media_context
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, HtmlParser
 from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
@@ -32,17 +30,12 @@ from PIL import Image


 class Pdf(PdfParser):
-    def __call__(self, filename, binary=None, from_page=0,
-                 to_page=100000, zoomin=3, callback=None):
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
         from timeit import default_timer as timer

         start = timer()
         callback(msg="OCR started")
-        self.__images__(
-            filename if not binary else binary,
-            zoomin,
-            from_page,
-            to_page,
-            callback)
+        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
         callback(msg="OCR finished ({:.2f}s)".format(timer() - start))

         start = timer()
@@ -62,24 +55,17 @@ class Pdf(PdfParser):
         self._merge_with_same_bullet()
         callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))

-        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
-                for b in self.boxes], tbls
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls


-def chunk(filename, binary=None, from_page=0, to_page=100000,
-          lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
-        Supported file formats are docx, pdf, txt.
-        Since a book is long and not all the parts are useful, if it's a PDF,
-        please set up the page ranges for every book in order eliminate negative effects and save elapsed computing time.
+    Supported file formats are docx, pdf, txt.
+    Since a book is long and not all the parts are useful, if it's a PDF,
+    please set up the page ranges for every book in order eliminate negative effects and save elapsed computing time.
     """
-    parser_config = kwargs.get(
-        "parser_config", {
-            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
-    doc = {
-        "docnm_kwd": filename,
-        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
-    }
+    parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
+    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     pdf_parser = None
     sections, tbls = [], []
@@ -87,28 +73,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         callback(0.1, "Start to parse.")
         doc_parser = naive.Docx()
         # TODO: table of contents need to be removed
-        main_sections = doc_parser(
-            filename, binary=binary, from_page=from_page, to_page=to_page)
+        main_sections = doc_parser(filename, binary=binary, from_page=from_page, to_page=to_page)

         sections = []
         tbls = []
         for text, image, html in main_sections:
             sections.append((text, image))
             tbls.append(((None, html), ""))

-        remove_contents_table(sections, eng=is_english(
-            random_choices([t for t, _ in sections], k=200)))
+        remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))

         tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs)
         # tbls = [((None, lns), None) for lns in tbls]
-        sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if
-                    not isinstance(item[1], Image.Image)]
+        sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if not isinstance(item[1], Image.Image)]
         callback(0.8, "Finish parsing.")

     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        layout_recognizer, parser_model_name = normalize_layout_recognizer(
-            parser_config.get("layout_recognize", "DeepDOC")
-        )
+        layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

         if isinstance(layout_recognizer, bool):
             layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -127,13 +108,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             pdf_cls=Pdf,
             layout_recognizer=layout_recognizer,
             mineru_llm_name=parser_model_name,
-            **kwargs
+            paddleocr_llm_name=parser_model_name,
+            **kwargs,
         )

         if not sections and not tables:
             return []

-        if name in ["tcadp", "docling", "mineru"]:
+        if name in ["tcadp", "docling", "mineru", "paddleocr"]:
             parser_config["chunk_token_num"] = 0

         callback(0.8, "Finish parsing.")
@@ -142,16 +124,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         txt = get_text(filename, binary)
         sections = txt.split("\n")
         sections = [(line, "") for line in sections if line]
-        remove_contents_table(sections, eng=is_english(
-            random_choices([t for t, _ in sections], k=200)))
+        remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
         callback(0.8, "Finish parsing.")

     elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections = HtmlParser()(filename, binary)
         sections = [(line, "") for line in sections if line]
-        remove_contents_table(sections, eng=is_english(
-            random_choices([t for t, _ in sections], k=200)))
+        remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
         callback(0.8, "Finish parsing.")

     elif re.search(r"\.doc$", filename, re.IGNORECASE):
@@ -165,31 +145,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

         binary = BytesIO(binary)
         doc_parsed = tika_parser.from_buffer(binary)
-        if doc_parsed.get('content', None) is not None:
-            sections = doc_parsed['content'].split('\n')
+        if doc_parsed.get("content", None) is not None:
+            sections = doc_parsed["content"].split("\n")
             sections = [(line, "") for line in sections if line]
-            remove_contents_table(sections, eng=is_english(
-                random_choices([t for t, _ in sections], k=200)))
+            remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
             callback(0.8, "Finish parsing.")

     else:
-        raise NotImplementedError(
-            "file type not supported yet(doc, docx, pdf, txt supported)")
+        raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")

     make_colon_as_title(sections)
-    bull = bullets_category(
-        [t for t in random_choices([t for t, _ in sections], k=100)])
+    bull = bullets_category([t for t in random_choices([t for t, _ in sections], k=100)])
     if bull >= 0:
-        chunks = ["\n".join(ck)
-                  for ck in hierarchical_merge(bull, sections, 5)]
+        chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 5)]
     else:
         sections = [s.split("@") for s, _ in sections]
-        sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections]
-        chunks = naive_merge(
-            sections,
-            parser_config.get("chunk_token_num", 256),
-            parser_config.get("delimiter", "\n。;!?")
-        )
+        sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], "") for pr in sections]
+        chunks = naive_merge(sections, parser_config.get("chunk_token_num", 256), parser_config.get("delimiter", "\n。;!?"))

     # is it English
     # is_english(random_choices([t for t, _ in sections], k=218))
@@ -208,9 +180,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 if __name__ == "__main__":
     import sys

-
     def dummy(prog=None, msg=""):
         pass

-
     chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
@@ -21,8 +21,7 @@ from docx import Document

 from common.constants import ParserType
 from deepdoc.parser.utils import get_text
-from rag.nlp import bullets_category, remove_contents_table, \
-    make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
+from rag.nlp import bullets_category, remove_contents_table, make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
 from rag.nlp import rag_tokenizer, Node
 from deepdoc.parser import PdfParser, DocxParser, HtmlParser
 from rag.app.naive import by_plaintext, PARSERS
@@ -38,8 +37,7 @@ class Docx(DocxParser):
         return line

     def old_call(self, filename, binary=None, from_page=0, to_page=100000):
-        self.doc = Document(
-            filename) if not binary else Document(BytesIO(binary))
+        self.doc = Document(filename) if not binary else Document(BytesIO(binary))
         pn = 0
         lines = []
         for p in self.doc.paragraphs:
@@ -48,16 +46,15 @@ class Docx(DocxParser):
             if from_page <= pn < to_page and p.text.strip():
                 lines.append(self.__clean(p.text))
             for run in p.runs:
-                if 'lastRenderedPageBreak' in run._element.xml:
+                if "lastRenderedPageBreak" in run._element.xml:
                     pn += 1
                     continue
-                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
                     pn += 1
         return [line for line in lines if line]

     def __call__(self, filename, binary=None, from_page=0, to_page=100000):
-        self.doc = Document(
-            filename) if not binary else Document(BytesIO(binary))
+        self.doc = Document(filename) if not binary else Document(BytesIO(binary))
         pn = 0
         lines = []
         level_set = set()
@@ -71,10 +68,10 @@ class Docx(DocxParser):
                 lines.append((question_level, p_text))
                 level_set.add(question_level)
             for run in p.runs:
-                if 'lastRenderedPageBreak' in run._element.xml:
+                if "lastRenderedPageBreak" in run._element.xml:
                     pn += 1
                     continue
-                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
                     pn += 1

         sorted_levels = sorted(level_set)
@@ -88,12 +85,12 @@ class Docx(DocxParser):
         return [element for element in root.get_tree() if element]

     def __str__(self) -> str:
-        return f'''
+        return f"""
            question:{self.question},
            answer:{self.answer},
            level:{self.level},
            childs:{self.childs}
-        '''
+        """


 class Pdf(PdfParser):
@@ -101,18 +98,12 @@ class Pdf(PdfParser):
         self.model_speciess = ParserType.LAWS.value
         super().__init__()

-    def __call__(self, filename, binary=None, from_page=0,
-                 to_page=100000, zoomin=3, callback=None):
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
         from timeit import default_timer as timer

         start = timer()
         callback(msg="OCR started")
-        self.__images__(
-            filename if not binary else binary,
-            zoomin,
-            from_page,
-            to_page,
-            callback
-        )
+        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
         callback(msg="OCR finished ({:.2f}s)".format(timer() - start))

         start = timer()
@@ -123,22 +114,15 @@ class Pdf(PdfParser):

         callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))

-        return [(b["text"], self._line_tag(b, zoomin))
-                for b in self.boxes], None
+        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None


-def chunk(filename, binary=None, from_page=0, to_page=100000,
-          lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
-        Supported file formats are docx, pdf, txt.
+    Supported file formats are docx, pdf, txt.
     """
-    parser_config = kwargs.get(
-        "parser_config", {
-            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
-    doc = {
-        "docnm_kwd": filename,
-        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
-    }
+    parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
+    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     pdf_parser = None
     sections = []
@@ -152,9 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         return tokenize_chunks(chunks, doc, eng, None)

     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        layout_recognizer, parser_model_name = normalize_layout_recognizer(
-            parser_config.get("layout_recognize", "DeepDOC")
-        )
+        layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

         if isinstance(layout_recognizer, bool):
             layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -173,13 +155,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             pdf_cls=Pdf,
             layout_recognizer=layout_recognizer,
             mineru_llm_name=parser_model_name,
-            **kwargs
+            paddleocr_llm_name=parser_model_name,
+            **kwargs,
         )

         if not raw_sections and not tables:
             return []

-        if name in ["tcadp", "docling", "mineru"]:
+        if name in ["tcadp", "docling", "mineru", "paddleocr"]:
             parser_config["chunk_token_num"] = 0

         for txt, poss in raw_sections:
@@ -210,8 +193,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

         binary = BytesIO(binary)
         doc_parsed = tika_parser.from_buffer(binary)
-        if doc_parsed.get('content', None) is not None:
-            sections = doc_parsed['content'].split('\n')
+        if doc_parsed.get("content", None) is not None:
+            sections = doc_parsed["content"].split("\n")
             sections = [s for s in sections if s]
             callback(0.8, "Finish parsing.")
         else:
@@ -219,8 +202,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             logging.warning(f"tika.parser got empty content from {filename}.")
             return []
     else:
-        raise NotImplementedError(
-            "file type not supported yet(doc, docx, pdf, txt supported)")
+        raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")

     # Remove 'Contents' part
     remove_contents_table(sections, eng)
@@ -241,9 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 if __name__ == "__main__":
     import sys

-
     def dummy(prog=None, msg=""):
         pass

-
     chunk(sys.argv[1], callback=dummy)
@@ -20,8 +20,7 @@ import re

 from common.constants import ParserType
 from io import BytesIO
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, \
-    docx_question_level, attach_media_context
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
 from common.token_utils import num_tokens_from_string
 from deepdoc.parser import PdfParser, DocxParser
 from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
@@ -36,18 +35,12 @@ class Pdf(PdfParser):
         self.model_speciess = ParserType.MANUAL.value
         super().__init__()

-    def __call__(self, filename, binary=None, from_page=0,
-                 to_page=100000, zoomin=3, callback=None):
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
         from timeit import default_timer as timer

         start = timer()
         callback(msg="OCR started")
-        self.__images__(
-            filename if not binary else binary,
-            zoomin,
-            from_page,
-            to_page,
-            callback
-        )
+        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
         callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
         logging.debug("OCR: {}".format(timer() - start))

@@ -71,8 +64,7 @@ class Pdf(PdfParser):
         for b in self.boxes:
             b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())

-        return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin))
-                for i, b in enumerate(self.boxes)], tbls
+        return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)], tbls


 class Docx(DocxParser):
@@ -80,12 +72,12 @@ class Docx(DocxParser):
         pass

     def get_picture(self, document, paragraph):
-        img = paragraph._element.xpath('.//pic:pic')
+        img = paragraph._element.xpath(".//pic:pic")
         if not img:
             return None
         try:
             img = img[0]
-            embed = img.xpath('.//a:blip/@r:embed')[0]
+            embed = img.xpath(".//a:blip/@r:embed")[0]
             related_part = document.part.related_parts[embed]
             image = related_part.image
             if image is not None:
@@ -111,7 +103,7 @@ class Docx(DocxParser):

         new_width = max(width1, width2)
         new_height = height1 + height2
-        new_image = Image.new('RGB', (new_width, new_height))
+        new_image = Image.new("RGB", (new_width, new_height))

         new_image.paste(img1, (0, 0))
         new_image.paste(img2, (0, height1))
@@ -119,8 +111,7 @@ class Docx(DocxParser):
         return new_image

     def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
-        self.doc = Document(
-            filename) if not binary else Document(BytesIO(binary))
+        self.doc = Document(filename) if not binary else Document(BytesIO(binary))
         pn = 0
         last_answer, last_image = "", None
         question_stack, level_stack = [], []
@@ -128,19 +119,19 @@ class Docx(DocxParser):
         for p in self.doc.paragraphs:
             if pn > to_page:
                 break
-            question_level, p_text = 0, ''
+            question_level, p_text = 0, ""
             if from_page <= pn < to_page and p.text.strip():
                 question_level, p_text = docx_question_level(p)
             if not question_level or question_level > 6:  # not a question
-                last_answer = f'{last_answer}\n{p_text}'
+                last_answer = f"{last_answer}\n{p_text}"
                 current_image = self.get_picture(self.doc, p)
                 last_image = self.concat_img(last_image, current_image)
             else:  # is a question
                 if last_answer or last_image:
-                    sum_question = '\n'.join(question_stack)
+                    sum_question = "\n".join(question_stack)
                     if sum_question:
-                        ti_list.append((f'{sum_question}\n{last_answer}', last_image))
-                    last_answer, last_image = '', None
+                        ti_list.append((f"{sum_question}\n{last_answer}", last_image))
+                    last_answer, last_image = "", None

                 i = question_level
                 while question_stack and i <= level_stack[-1]:
@@ -149,15 +140,15 @@ class Docx(DocxParser):
                 question_stack.append(p_text)
                 level_stack.append(question_level)
             for run in p.runs:
-                if 'lastRenderedPageBreak' in run._element.xml:
+                if "lastRenderedPageBreak" in run._element.xml:
                     pn += 1
                     continue
-                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
                     pn += 1
         if last_answer:
-            sum_question = '\n'.join(question_stack)
+            sum_question = "\n".join(question_stack)
             if sum_question:
-                ti_list.append((f'{sum_question}\n{last_answer}', last_image))
+                ti_list.append((f"{sum_question}\n{last_answer}", last_image))

         tbls = []
         for tb in self.doc.tables:
@@ -182,26 +173,19 @@ class Docx(DocxParser):
         return ti_list, tbls


-def chunk(filename, binary=None, from_page=0, to_page=100000,
-          lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
-        Only pdf is supported.
+    Only pdf is supported.
     """
-    parser_config = kwargs.get(
-        "parser_config", {
-            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
+    parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
     pdf_parser = None
-    doc = {
-        "docnm_kwd": filename
-    }
+    doc = {"docnm_kwd": filename}
     doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     # is it English
     eng = lang.lower() == "english"  # pdf_parser.is_english
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        layout_recognizer, parser_model_name = normalize_layout_recognizer(
-            parser_config.get("layout_recognize", "DeepDOC")
-        )
+        layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

         if isinstance(layout_recognizer, bool):
             layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -222,8 +206,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             pdf_cls=Pdf,
             layout_recognizer=layout_recognizer,
             mineru_llm_name=parser_model_name,
+            paddleocr_llm_name=parser_model_name,
             parse_method="manual",
-            **kwargs
+            **kwargs,
         )

         def _normalize_section(section):
@@ -252,7 +237,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if not sections and not tbls:
             return []

-        if name in ["tcadp", "docling", "mineru"]:
+        if name in ["tcadp", "docling", "mineru", "paddleocr"]:
             parser_config["chunk_token_num"] = 0

         callback(0.8, "Finish parsing.")
@@ -264,8 +249,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             for txt, _, _ in sections:
                 for t, lvl in pdf_parser.outlines:
                     tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
-                    tks_ = set([txt[i] + txt[i + 1]
-                                for i in range(min(len(t), len(txt) - 1))])
+                    tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
                     if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
                         levels.append(lvl)
                         break
@@ -274,8 +258,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

         else:
             bull = bullets_category([txt for txt, _, _ in sections])
-            most_level, levels = title_frequency(
-                bull, [(txt, lvl) for txt, lvl, _ in sections])
+            most_level, levels = title_frequency(bull, [(txt, lvl) for txt, lvl, _ in sections])

         assert len(sections) == len(levels)
         sec_ids = []
@@ -285,25 +268,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                 sid += 1
             sec_ids.append(sid)

-        sections = [(txt, sec_ids[i], poss)
-                    for i, (txt, _, poss) in enumerate(sections)]
+        sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)]
         for (img, rows), poss in tbls:
             if not rows:
                 continue
-            sections.append((rows if isinstance(rows, str) else rows[0], -1,
-                             [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
+            sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))

         def tag(pn, left, right, top, bottom):
             if pn + left + right + top + bottom == 0:
                 return ""
-            return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
-                .format(pn, left, right, top, bottom)
+            return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(pn, left, right, top, bottom)

         chunks = []
         last_sid = -2
         tk_cnt = 0
-        for txt, sec_id, poss in sorted(sections, key=lambda x: (
-                x[-1][0][0], x[-1][0][3], x[-1][0][1])):
+        for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
             poss = "\t".join([tag(*pos) for pos in poss])
             if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
                 if chunks:
@@ -330,14 +309,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

     elif re.search(r"\.docx?$", filename, re.IGNORECASE):
         docx_parser = Docx()
-        ti_list, tbls = docx_parser(filename, binary,
-                                    from_page=0, to_page=10000, callback=callback)
+        ti_list, tbls = docx_parser(filename, binary, from_page=0, to_page=10000, callback=callback)
         tbls = vision_figure_parser_docx_wrapper(sections=ti_list, tbls=tbls, callback=callback, **kwargs)
         res = tokenize_table(tbls, doc, eng)
         for text, image in ti_list:
             d = copy.deepcopy(doc)
             if image:
-                d['image'] = image
+                d["image"] = image
                 d["doc_type_kwd"] = "image"
             tokenize(d, text, eng)
             res.append(d)
@@ -353,9 +331,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 if __name__ == "__main__":
     import sys

-
     def dummy(prog=None, msg=""):
         pass

-
     chunk(sys.argv[1], callback=dummy)
rag/app/naive.py
@@ -33,29 +33,32 @@ from common.token_utils import num_tokens_from_string
 from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
 from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
-from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, \
-    PdfParser, TxtParser
-from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, \
-    vision_figure_parser_pdf_wrapper
+from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
+from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
 from deepdoc.parser.docling_parser import DoclingParser
 from deepdoc.parser.tcadp_parser import TCADPParser
 from common.parser_config_utils import normalize_layout_recognizer
-from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, \
-    tokenize_chunks, doc_tokenize_chunks_with_images, tokenize_table, append_context2table_image4pdf, tokenize_chunks_with_images, \
-    attach_media_context  # noqa: F401
+from rag.nlp import (
+    concat_img,
+    find_codec,
+    naive_merge,
+    naive_merge_with_images,
+    naive_merge_docx,
+    rag_tokenizer,
+    tokenize_chunks,
+    doc_tokenize_chunks_with_images,
+    tokenize_table,
+    append_context2table_image4pdf,
+    tokenize_chunks_with_images,
+)  # noqa: F401


-def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
-               **kwargs):
+def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
     callback = callback
     binary = binary
     pdf_parser = pdf_cls() if pdf_cls else Pdf()
-    sections, tables = pdf_parser(
-        filename if not binary else binary,
-        from_page=from_page,
-        to_page=to_page,
-        callback=callback
-    )
+    sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)

     tables = vision_figure_parser_pdf_wrapper(
         tbls=tables,
@@ -67,17 +70,17 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese


 def by_mineru(
-        filename,
-        binary=None,
-        from_page=0,
-        to_page=100000,
-        lang="Chinese",
-        callback=None,
-        pdf_cls=None,
-        parse_method: str = "raw",
-        mineru_llm_name: str | None = None,
-        tenant_id: str | None = None,
-        **kwargs,
+    filename,
+    binary=None,
+    from_page=0,
+    to_page=100000,
+    lang="Chinese",
+    callback=None,
+    pdf_cls=None,
+    parse_method: str = "raw",
+    mineru_llm_name: str | None = None,
+    tenant_id: str | None = None,
+    **kwargs,
 ):
     pdf_parser = None
     if tenant_id:
@@ -115,8 +118,7 @@ def by_mineru(
     return None, None, None


-def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
-               **kwargs):
+def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
     pdf_parser = DoclingParser()
     parse_method = kwargs.get("parse_method", "raw")

@@ -130,7 +132,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
         callback=callback,
         output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
         delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
-        parse_method=parse_method
+        parse_method=parse_method,
     )
     return sections, tables, pdf_parser

@@ -142,16 +144,60 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese",
         callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
         return None, None, tcadp_parser

-    sections, tables = tcadp_parser.parse_pdf(
-        filepath=filename,
-        binary=binary,
-        callback=callback,
-        output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
-        file_type="PDF"
-    )
+    sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type="PDF")
     return sections, tables, tcadp_parser


+def by_paddleocr(
+    filename,
+    binary=None,
+    from_page=0,
+    to_page=100000,
+    lang="Chinese",
+    callback=None,
+    pdf_cls=None,
+    parse_method: str = "raw",
+    paddleocr_llm_name: str | None = None,
+    tenant_id: str | None = None,
+    **kwargs,
+):
+    pdf_parser = None
+    if tenant_id:
+        if not paddleocr_llm_name:
+            try:
+                from api.db.services.tenant_llm_service import TenantLLMService
+
+                env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
+                candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR)
+                if candidates:
+                    paddleocr_llm_name = candidates[0].llm_name
+                elif env_name:
+                    paddleocr_llm_name = env_name
+            except Exception as e:  # best-effort fallback
+                logging.warning(f"fallback to env paddleocr: {e}")
+
+        if paddleocr_llm_name:
+            try:
+                ocr_model = LLMBundle(tenant_id=tenant_id, llm_type=LLMType.OCR, llm_name=paddleocr_llm_name, lang=lang)
+                pdf_parser = ocr_model.mdl
+                sections, tables = pdf_parser.parse_pdf(
+                    filepath=filename,
+                    binary=binary,
+                    callback=callback,
+                    parse_method=parse_method,
+                    **kwargs,
+                )
+                return sections, tables, pdf_parser
+            except Exception as e:
+                logging.error(f"Failed to parse pdf via LLMBundle PaddleOCR ({paddleocr_llm_name}): {e}")
+
+        return None, None, None
+
+    if callback:
+        callback(-1, "PaddleOCR not found.")
+    return None, None, None
+
+
 def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
     layout_recognizer = (kwargs.get("layout_recognizer") or "").strip()
     if (not layout_recognizer) or (layout_recognizer == "Plain Text"):
@@ -168,12 +214,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No
         )
         pdf_parser = VisionParser(vision_model=vision_model, **kwargs)

-        sections, tables = pdf_parser(
-            filename if not binary else binary,
-            from_page=from_page,
-            to_page=to_page,
-            callback=callback
-        )
+        sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
         return sections, tables, pdf_parser


@@ -182,6 +223,7 @@ PARSERS = {
     "mineru": by_mineru,
     "docling": by_docling,
     "tcadp": by_tcadp,
+    "paddleocr": by_paddleocr,
     "plaintext": by_plaintext,  # default
 }

@@ -191,12 +233,12 @@ class Docx(DocxParser):
         pass

     def get_picture(self, document, paragraph):
-        imgs = paragraph._element.xpath('.//pic:pic')
+        imgs = paragraph._element.xpath(".//pic:pic")
         if not imgs:
             return None
         res_img = None
         for img in imgs:
-            embed = img.xpath('.//a:blip/@r:embed')
+            embed = img.xpath(".//a:blip/@r:embed")
             if not embed:
                 continue
             embed = embed[0]
@@ -219,7 +261,7 @@ class Docx(DocxParser):
                 logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}")
                 continue
             try:
-                image = Image.open(BytesIO(image_blob)).convert('RGB')
+                image = Image.open(BytesIO(image_blob)).convert("RGB")
                 if res_img is None:
                     res_img = image
                 else:
@@ -251,11 +293,11 @@ class Docx(DocxParser):
         try:
             # Iterate through all paragraphs and tables in document order
             for i, block in enumerate(self.doc._element.body):
-                if block.tag.endswith('p'):  # Paragraph
+                if block.tag.endswith("p"):  # Paragraph
                     p = Paragraph(block, self.doc)
-                    blocks.append(('p', i, p))
-                elif block.tag.endswith('tbl'):  # Table
-                    blocks.append(('t', i, None))  # Table object will be retrieved later
+                    blocks.append(("p", i, p))
+                elif block.tag.endswith("tbl"):  # Table
+                    blocks.append(("t", i, None))  # Table object will be retrieved later
         except Exception as e:
             logging.error(f"Error collecting blocks: {e}")
             return ""
@@ -264,7 +306,7 @@ class Docx(DocxParser):
         target_table_pos = -1
         table_count = 0
         for i, (block_type, pos, _) in enumerate(blocks):
-            if block_type == 't':
+            if block_type == "t":
                 if table_count == table_index:
                     target_table_pos = pos
                     break
@@ -280,7 +322,7 @@ class Docx(DocxParser):
             if pos >= target_table_pos:  # Skip blocks after the table
                 continue

-            if block_type != 'p':
+            if block_type != "p":
                 continue

             if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
@@ -309,7 +351,7 @@ class Docx(DocxParser):
             if pos >= target_table_pos:  # Skip blocks after the table
                 continue

-            if block_type != 'p':
+            if block_type != "p":
                 continue

             if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
@@ -340,8 +382,7 @@ class Docx(DocxParser):
         return ""

     def __call__(self, filename, binary=None, from_page=0, to_page=100000):
-        self.doc = Document(
-            filename) if not binary else Document(BytesIO(binary))
+        self.doc = Document(filename) if not binary else Document(BytesIO(binary))
         pn = 0
         lines = []
         last_image = None
@@ -357,7 +398,7 @@ class Docx(DocxParser):
             if pn > to_page:
                 break

-            if block.tag.endswith('p'):
+            if block.tag.endswith("p"):
                 p = Paragraph(block, self.doc)

                 if from_page <= pn < to_page:
@@ -417,7 +458,7 @@ class Docx(DocxParser):
                     if "w:br" in xml and 'type="page"' in xml:
                         pn += 1

-            elif block.tag.endswith('tbl'):
+            elif block.tag.endswith("tbl"):
                 if pn < from_page or pn > to_page:
                     table_idx += 1
                     continue
@@ -455,7 +496,6 @@ class Docx(DocxParser):

         return new_line

-
     def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
         """
         This function uses mammoth, licensed under the BSD 2-Clause License.
@@ -486,8 +526,7 @@ class Docx(DocxParser):

         try:
             if inline_images:
-                result = mammoth.convert_to_html(docx_file,
-                                                 convert_image=mammoth.images.img_element(_convert_image_to_base64))
+                result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
             else:
                 result = mammoth.convert_to_html(docx_file)

@@ -505,18 +544,11 @@ class Pdf(PdfParser):
     def __init__(self):
         super().__init__()

-    def __call__(self, filename, binary=None, from_page=0,
-                 to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
         start = timer()
         first_start = start
         callback(msg="OCR started")
-        self.__images__(
-            filename if not binary else binary,
-            zoomin,
-            from_page,
-            to_page,
-            callback
-        )
+        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
         callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
         logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))

@@ -559,13 +591,14 @@ class Markdown(MarkdownParser):
             return []

         from bs4 import BeautifulSoup

         html_content = markdown(text)
-        soup = BeautifulSoup(html_content, 'html.parser')
+        soup = BeautifulSoup(html_content, "html.parser")
         return soup

     def get_hyperlink_urls(self, soup):
         if soup:
-            return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
+            return set([a.get("href") for a in soup.find_all("a") if a.get("href")])
         return []

     def extract_image_urls_with_lines(self, text):
@@ -588,10 +621,10 @@ class Markdown(MarkdownParser):
         try:
             from bs4 import BeautifulSoup

-            soup = BeautifulSoup(text, 'html.parser')
+            soup = BeautifulSoup(text, "html.parser")
             newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
-            for img_tag in soup.find_all('img'):
-                src = img_tag.get('src')
+            for img_tag in soup.find_all("img"):
+                src = img_tag.get("src")
                 if not src:
                     continue

@@ -627,14 +660,14 @@ class Markdown(MarkdownParser):
                 continue
             img_obj = None
             try:
-                if url.startswith(('http://', 'https://')):
+                if url.startswith(("http://", "https://")):
                     response = requests.get(url, stream=True, timeout=30)
-                    if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
-                        img_obj = Image.open(BytesIO(response.content)).convert('RGB')
+                    if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
+                        img_obj = Image.open(BytesIO(response.content)).convert("RGB")
                 else:
                     local_path = Path(url)
                     if local_path.exists():
-                        img_obj = Image.open(url).convert('RGB')
+                        img_obj = Image.open(url).convert("RGB")
                     else:
                         logging.warning(f"Local image file not found: {url}")
             except Exception as e:
@@ -652,7 +685,7 @@ class Markdown(MarkdownParser):
         with open(filename, "r") as f:
             txt = f.read()

-        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
+        remainder, tables = self.extract_tables_and_remainder(f"{txt}\n", separate_tables=separate_tables)
         # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
         # extractor = MarkdownElementExtractor(remainder)
         extractor = MarkdownElementExtractor(txt)
@@ -678,7 +711,7 @@ class Markdown(MarkdownParser):

         tbls = []
         for table in tables:
-            tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
+            tbls.append(((None, markdown(table, extensions=["markdown.extensions.tables"])), ""))
         if return_section_images:
             return sections, tbls, section_images
         return sections, tbls
@@ -694,7 +727,7 @@ def load_from_xml_v2(baseURI, rels_item_xml):
     if rels_item_xml is not None:
         rels_elm = parse_xml(rels_item_xml)
         for rel_elm in rels_elm.Relationship_lst:
-            if rel_elm.target_ref in ('../NULL', 'NULL'):
+            if rel_elm.target_ref in ("../NULL", "NULL"):
                 continue
             srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
     return srels
@@ -702,21 +735,18 @@ def load_from_xml_v2(baseURI, rels_item_xml):

 def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
     """
-        Supported file formats are docx, pdf, excel, txt.
-        This method apply the naive ways to chunk files.
-        Successive text will be sliced into pieces using 'delimiter'.
-        Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
+    Supported file formats are docx, pdf, excel, txt.
+    This method apply the naive ways to chunk files.
+    Successive text will be sliced into pieces using 'delimiter'.
+    Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
     """
     urls = set()
     url_res = []

     is_english = lang.lower() == "english"  # is_english(cks)
-    parser_config = kwargs.get(
-        "parser_config", {
-            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+    parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})

-    child_deli = (parser_config.get("children_delimiter") or "").encode('utf-8').decode('unicode_escape').encode(
-        'latin1').decode('utf-8')
+    child_deli = (parser_config.get("children_delimiter") or "").encode("utf-8").decode("unicode_escape").encode("latin1").decode("utf-8")
     cust_child_deli = re.findall(r"`([^`]+)`", child_deli)
     child_deli = "|".join(re.sub(r"`([^`]+)`", "", child_deli))
     if cust_child_deli:
@@ -728,10 +758,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
     image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))

-    doc = {
-        "docnm_kwd": filename,
-        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
-    }
+    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
@@ -750,8 +777,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         # Recursively chunk each embedded file and collect results
         for embed_filename, embed_bytes in embeds:
             try:
-                sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False,
-                                **kwargs) or []
+                sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or []
                 embed_res.extend(sub_res)
             except Exception as e:
                 error_msg = f"Failed to chunk embed {embed_filename}: {e}"
@@ -772,8 +798,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
                 sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
             except Exception as e:
                 logging.info(f"Failed to chunk url in registered file type {url}: {e}")
-                sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False,
-                                    **kwargs)
+                sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
             url_res.extend(sub_url_res)

     # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
@@ -784,11 +809,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca

         # chunks list[dict]
         # images list - index of image chunk in chunks
-        chunks, images = naive_merge_docx(
-            sections, int(parser_config.get(
-                "chunk_token_num", 128)), parser_config.get(
-                "delimiter", "\n!?。;!?"), table_context_size, image_context_size)
+        chunks, images = naive_merge_docx(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), table_context_size, image_context_size)

         vision_figure_parser_docx_wrapper_naive(chunks=chunks, idx_lst=images, callback=callback, **kwargs)

         callback(0.8, "Finish parsing.")
@@ -801,9 +823,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         return res

     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        layout_recognizer, parser_model_name = normalize_layout_recognizer(
-            parser_config.get("layout_recognize", "DeepDOC")
-        )
+        layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

         if parser_config.get("analyze_hyperlink", False) and is_root:
             urls = extract_links_from_pdf(binary)
@@ -824,7 +844,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             callback=callback,
             layout_recognizer=layout_recognizer,
             mineru_llm_name=parser_model_name,
-            **kwargs
+            paddleocr_llm_name=parser_model_name,
+            **kwargs,
         )

         if not sections and not tables:
@@ -833,7 +854,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         if table_context_size or image_context_size:
             tables = append_context2table_image4pdf(sections, tables, image_context_size)

-        if name in ["tcadp", "docling", "mineru"]:
+        if name in ["tcadp", "docling", "mineru", "paddleocr"]:
             parser_config["chunk_token_num"] = 0

         res = tokenize_table(tables, doc, is_english)
@@ -847,10 +868,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         if layout_recognizer == "TCADP Parser":
             table_result_type = parser_config.get("table_result_type", "1")
             markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
-            tcadp_parser = TCADPParser(
-                table_result_type=table_result_type,
-                markdown_image_response_type=markdown_image_response_type
-            )
+            tcadp_parser = TCADPParser(table_result_type=table_result_type, markdown_image_response_type=markdown_image_response_type)
             if not tcadp_parser.check_installation():
                 callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
                 return res
@@ -858,13 +876,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             # Determine file type based on extension
             file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"

-            sections, tables = tcadp_parser.parse_pdf(
-                filepath=filename,
-                binary=binary,
-                callback=callback,
-                output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
-                file_type=file_type
-            )
+            sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type=file_type)
             parser_config["chunk_token_num"] = 0
             res = tokenize_table(tables, doc, is_english)
             callback(0.8, "Finish parsing.")
@@ -879,9 +891,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca

     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections = TxtParser()(filename, binary,
-                               parser_config.get("chunk_token_num", 128),
-                               parser_config.get("delimiter", "\n!?;。;!?"))
+        sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?"))
         callback(0.8, "Finish parsing.")

     elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):
@@ -919,11 +929,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
                     else:
                         section_images = [None] * len(sections)
                     section_images[idx] = combined_image
-                    markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[
-                        ((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
+                    markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
                     boosted_figures = markdown_vision_parser(callback=callback)
-                    sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]),
-                                     sections[idx][1])
+                    sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])

                 else:
                     logging.warning("No visual model detected. Skipping figure parsing enhancement.")
@@ -962,8 +970,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca

         binary = BytesIO(binary)
         doc_parsed = tika_parser.from_buffer(binary)
-        if doc_parsed.get('content', None) is not None:
-            sections = doc_parsed['content'].split('\n')
+        if doc_parsed.get("content", None) is not None:
+            sections = doc_parsed["content"].split("\n")
             sections = [(_, "") for _ in sections if _]
             callback(0.8, "Finish parsing.")
         else:
@@ -972,8 +980,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             logging.warning(error_msg)
             return []
     else:
-        raise NotImplementedError(
-            "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
+        raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

     st = timer()
     if is_markdown:
@@ -1021,8 +1028,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             has_images = merged_images and any(img is not None for img in merged_images)

             if has_images:
-                res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images,
-                                                       child_delimiters_pattern=child_deli))
+                res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli))
             else:
                 res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
         else:
@@ -1031,17 +1037,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
                 section_images = None

             if section_images:
-                chunks, images = naive_merge_with_images(sections, section_images,
-                                                         int(parser_config.get(
-                                                             "chunk_token_num", 128)), parser_config.get(
-                                                             "delimiter", "\n!?。;!?"))
-                res.extend(
-                    tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
+                chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
+                res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
             else:
-                chunks = naive_merge(
-                    sections, int(parser_config.get(
-                        "chunk_token_num", 128)), parser_config.get(
-                        "delimiter", "\n!?。;!?"))
+                chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))

                 res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))

@@ -1071,9 +1070,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 if __name__ == "__main__":
     import sys

-
     def dummy(prog=None, msg=""):
         pass

-
     chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
@ -28,18 +28,12 @@ from common.parser_config_utils import normalize_layout_recognizer
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, zoomin=3, callback=None):
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
|
||||
from timeit import default_timer as timer
|
||||
|
||||
start = timer()
|
||||
callback(msg="OCR started")
|
||||
self.__images__(
|
||||
filename if not binary else binary,
|
||||
zoomin,
|
||||
from_page,
|
||||
to_page,
|
||||
callback
|
||||
)
|
||||
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
|
||||
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
|
||||
|
||||
start = timer()
|
||||
@ -57,21 +51,16 @@ class Pdf(PdfParser):
        tbls = self._extract_table_figure(True, zoomin, True, True)
        self._concat_downward()

        sections = [(b["text"], self.get_position(b, zoomin))
                    for i, b in enumerate(self.boxes)]
        return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
            x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls
        sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
        return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls


def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    """
        Supported file formats are docx, pdf, excel, txt.
        One file forms a chunk which maintains original text order.
    Supported file formats are docx, pdf, excel, txt.
    One file forms a chunk which maintains original text order.
    """
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
    parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
    eng = lang.lower() == "english"  # is_english(cks)

    if re.search(r"\.docx$", filename, re.IGNORECASE):
@ -99,9 +88,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognizer, parser_model_name = normalize_layout_recognizer(
            parser_config.get("layout_recognize", "DeepDOC")
        )
        layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

        if isinstance(layout_recognizer, bool):
            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
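normalize_layout_recognizer returns both the recognizer name and an optional model name (consumed below as mineru_llm_name / paddleocr_llm_name). A rough sketch of that contract, with the "name@model" form purely assumed — the actual logic lives in common/parser_config_utils.py and may differ:

def normalize_layout_recognizer_sketch(value):
    # Assumed contract: raw config value -> (recognizer, model_name_or_None).
    if value is None or isinstance(value, bool):
        return value, None
    if "@" in value:  # hypothetical "Recognizer@model" config form
        recognizer, model = value.split("@", 1)
        return recognizer, model
    return value, None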
@ -120,13 +107,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            pdf_cls=Pdf,
            layout_recognizer=layout_recognizer,
            mineru_llm_name=parser_model_name,
            **kwargs
            paddleocr_llm_name=parser_model_name,
            **kwargs,
        )

        if not sections and not tbls:
            return []

        if name in ["tcadp", "docling", "mineru"]:
        if name in ["tcadp", "docling", "mineru", "paddleocr"]:
            parser_config["chunk_token_num"] = 0

        callback(0.8, "Finish parsing.")
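Forcing chunk_token_num to 0 here matters: tcadp, docling, mineru, and now paddleocr already emit layout-aware sections, so zeroing the budget presumably lets each parser-provided section stand as its own chunk instead of being re-merged. In sketch form:

# Parsers that emit layout-aware sections skip token re-merging.
structured_parsers = {"tcadp", "docling", "mineru", "paddleocr"}
name = "paddleocr"  # illustrative value
parser_config = {"chunk_token_num": 512}
if name in structured_parsers:
    parser_config["chunk_token_num"] = 0  # keep parser-provided sections intact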
@ -134,8 +122,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        for (img, rows), poss in tbls:
            if not rows:
                continue
            sections.append((rows if isinstance(rows, str) else rows[0],
                             [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
            sections.append((rows if isinstance(rows, str) else rows[0], [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
        sections = [s for s, _ in sections if s]

    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
@ -167,19 +154,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

        binary = BytesIO(binary)
        doc_parsed = tika_parser.from_buffer(binary)
        if doc_parsed.get('content', None) is not None:
            sections = doc_parsed['content'].split('\n')
        if doc_parsed.get("content", None) is not None:
            sections = doc_parsed["content"].split("\n")
            sections = [s for s in sections if s]
        callback(0.8, "Finish parsing.")

    else:
        raise NotImplementedError(
            "file type not supported yet(doc, docx, pdf, txt supported)")
        raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")

    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    tokenize(doc, "\n".join(sections), eng)
    return [doc]
@ -188,9 +171,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if __name__ == "__main__":
    import sys


    def dummy(prog=None, msg=""):
        pass


    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
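A quick way to smoke-test the new path, mirroring the __main__ block above (the file path is a placeholder, and the exact "PaddleOCR" casing for the config value is assumed from the option naming):

def dummy(prog=None, msg=""):
    pass

chunk(
    "sample.pdf",  # placeholder path
    from_page=0,
    to_page=10,
    parser_config={"layout_recognize": "PaddleOCR"},
    callback=dummy,
)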
@ -36,22 +36,18 @@ class Ppt(PptParser):
        callback(0.5, "Text extraction finished.")
        import aspose.slides as slides
        import aspose.pydrawing as drawing

        imgs = []
        with slides.Presentation(BytesIO(fnm)) as presentation:
            for i, slide in enumerate(presentation.slides[from_page: to_page]):
            for i, slide in enumerate(presentation.slides[from_page:to_page]):
                try:
                    with BytesIO() as buffered:
                        slide.get_thumbnail(
                            0.1, 0.1).save(
                            buffered, drawing.imaging.ImageFormat.jpeg)
                        slide.get_thumbnail(0.1, 0.1).save(buffered, drawing.imaging.ImageFormat.jpeg)
                        buffered.seek(0)
                        imgs.append(Image.open(buffered).copy())
                except RuntimeError as e:
                    raise RuntimeError(
                        f'ppt parse error at page {i + 1}, original error: {str(e)}') from e
        assert len(imgs) == len(
            txts), "Slides text and image do not match: {} vs. {}".format(
            len(imgs), len(txts))
                    raise RuntimeError(f"ppt parse error at page {i + 1}, original error: {str(e)}") from e
        assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
        callback(0.9, "Image extraction finished")
        self.is_english = is_english(txts)
        return [(txts[i], imgs[i]) for i in range(len(txts))]
@ -61,12 +57,10 @@ class Pdf(PdfParser):
    def __init__(self):
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None, **kwargs):
    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, **kwargs):
        # 1. OCR
        callback(msg="OCR started")
        self.__images__(filename if not binary else binary, zoomin, from_page,
                        to_page, callback)
        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)

        # 2. Layout Analysis
        callback(msg="Layout Analysis")
@ -91,12 +85,7 @@ class Pdf(PdfParser):
            global_page_num = b["page_number"] + from_page
            if not (from_page < global_page_num <= to_page + from_page):
                continue
            page_items[global_page_num].append({
                "top": b["top"],
                "x0": b["x0"],
                "text": b["text"],
                "type": "text"
            })
            page_items[global_page_num].append({"top": b["top"], "x0": b["x0"], "text": b["text"], "type": "text"})

        # (B) Add table and figure
        for (img, content), positions in tbls:
@ -127,12 +116,7 @@ class Pdf(PdfParser):
            top = positions[0][3]
            left = positions[0][1]

            page_items[current_page_num].append({
                "top": top,
                "x0": left,
                "text": final_text,
                "type": "table_or_figure"
            })
            page_items[current_page_num].append({"top": top, "x0": left, "text": final_text, "type": "table_or_figure"})

        # 7. Generate result
        res = []
@ -153,18 +137,16 @@ class Pdf(PdfParser):


class PlainPdf(PlainParser):
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, callback=None, **kwargs):
    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
        self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
        page_txt = []
        for page in self.pdf.pages[from_page: to_page]:
        for page in self.pdf.pages[from_page:to_page]:
            page_txt.append(page.extract_text())
        callback(0.9, "Parsing finished")
        return [(txt, None) for txt in page_txt], []


def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, parser_config=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs):
    """
    The supported file formats are pdf, pptx.
    Every page will be treated as a chunk. And the thumbnail of every page will be stored.
@ -173,18 +155,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    if parser_config is None:
        parser_config = {}
    eng = lang.lower() == "english"
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(
            re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    res = []
    if re.search(r"\.pptx?$", filename, re.IGNORECASE):
        ppt_parser = Ppt()
        for pn, (txt, img) in enumerate(ppt_parser(
                filename if not binary else binary, from_page, 1000000,
                callback)):
        for pn, (txt, img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
            d = copy.deepcopy(doc)
            pn += from_page
            d["image"] = img
@ -196,9 +172,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            res.append(d)
        return res
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognizer, parser_model_name = normalize_layout_recognizer(
            parser_config.get("layout_recognize", "DeepDOC")
        )
        layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

        if isinstance(layout_recognizer, bool):
            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -217,13 +191,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            pdf_cls=Pdf,
            layout_recognizer=layout_recognizer,
            mineru_llm_name=parser_model_name,
            **kwargs
            paddleocr_llm_name=parser_model_name,
            **kwargs,
        )

        if not sections:
            return []

        if name in ["tcadp", "docling", "mineru"]:
        if name in ["tcadp", "docling", "mineru", "paddleocr"]:
            parser_config["chunk_token_num"] = 0

        callback(0.8, "Finish parsing.")
@ -236,22 +211,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            d["image"] = img
            d["page_num_int"] = [pn + 1]
            d["top_int"] = [0]
            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0,
                                  img.size[1] if img else 0)]
            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
            tokenize(d, txt, eng)
            res.append(d)
        return res

    raise NotImplementedError(
        "file type not supported yet(pptx, pdf supported)")
    raise NotImplementedError("file type not supported yet(pptx, pdf supported)")
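On the position_int entry above: judging by (pn + 1, 0, width, 0, height), the tuple appears to encode (page_number, left, right, top, bottom), so a full-page thumbnail spans the whole image. A small worked example (PIL's Image.size is (width, height)):

from PIL import Image

img = Image.new("RGB", (800, 600))  # stand-in slide thumbnail
pn = 0
position_int = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
print(position_int)  # [(1, 0, 800, 0, 600)]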


if __name__ == "__main__":
    import sys


    def dummy(a, b):
        pass


    chunk(sys.argv[1], callback=dummy)