diff --git a/rag/app/book.py b/rag/app/book.py index b6615ead9..5ea28d40d 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -20,7 +20,7 @@ from io import BytesIO from deepdoc.parser.utils import get_text from rag.app import naive -from rag.app.naive import plaintext_parser, PARSERS +from rag.app.naive import by_plaintext, PARSERS from rag.nlp import bullets_category, is_english,remove_contents_table, \ hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ tokenize_chunks @@ -102,10 +102,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" name = layout_recognizer.strip().lower() - parser = PARSERS.get(name, plaintext_parser) + parser = PARSERS.get(name, by_plaintext) callback(0.1, "Start to parse.") - sections, tables, _ = parser( + sections, tables, pdf_parser = parser( filename = filename, binary = binary, from_page = from_page, diff --git a/rag/app/laws.py b/rag/app/laws.py index 5da63d354..dd97e4e3a 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -25,7 +25,7 @@ from rag.nlp import bullets_category, remove_contents_table, \ make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge from rag.nlp import rag_tokenizer, Node from deepdoc.parser import PdfParser, DocxParser, HtmlParser -from rag.app.naive import plaintext_parser, PARSERS +from rag.app.naive import by_plaintext, PARSERS @@ -161,10 +161,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" name = layout_recognizer.strip().lower() - parser = PARSERS.get(name, plaintext_parser) + parser = PARSERS.get(name, by_plaintext) callback(0.1, "Start to parse.") - raw_sections, tables, _ = parser( + raw_sections, tables, pdf_parser = parser( filename = filename, binary = binary, from_page = from_page, diff --git a/rag/app/manual.py b/rag/app/manual.py index b1e66e7b8..81402d1bd 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -26,7 +26,7 @@ from deepdoc.parser import PdfParser, DocxParser from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper from docx import Document from PIL import Image -from rag.app.naive import plaintext_parser, PARSERS +from rag.app.naive import by_plaintext, PARSERS class Pdf(PdfParser): def __init__(self): @@ -202,7 +202,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" name = layout_recognizer.strip().lower() - pdf_parser = PARSERS.get(name, plaintext_parser) + pdf_parser = PARSERS.get(name, by_plaintext) callback(0.1, "Start to parse.") sections, tbls, pdf_parser = pdf_parser( diff --git a/rag/app/naive.py b/rag/app/naive.py index a96d947d8..631050a49 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -38,7 +38,7 @@ from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.tcadp_parser import TCADPParser from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table -def DeepDOC_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, pdf_cls = None ,**kwargs): +def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): callback = callback binary = binary pdf_parser = pdf_cls() if pdf_cls else Pdf() @@ -48,13 +48,14 @@ def DeepDOC_parser(filename, binary=None, from_page=0, to_page=100000, callback= to_page=to_page, callback=callback ) + tables = vision_figure_parser_pdf_wrapper(tbls=tables, callback=callback, **kwargs) return sections, tables, pdf_parser -def MinerU_parser(filename, binary=None, callback=None, **kwargs): +def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987") pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api) @@ -74,7 +75,7 @@ def MinerU_parser(filename, binary=None, callback=None, **kwargs): return sections, tables, pdf_parser -def Docling_parser(filename, binary=None, callback=None, **kwargs): +def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): pdf_parser = DoclingParser() if not pdf_parser.check_installation(): @@ -91,7 +92,7 @@ def Docling_parser(filename, binary=None, callback=None, **kwargs): return sections, tables, pdf_parser -def TCADP_parser(filename, binary=None, callback=None, **kwargs): +def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): tcadp_parser = TCADPParser() if not tcadp_parser.check_installation(): @@ -108,7 +109,7 @@ def TCADP_parser(filename, binary=None, callback=None, **kwargs): return sections, tables, tcadp_parser -def plaintext_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): +def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): if kwargs.get("layout_recognizer", "") == "Plain Text": pdf_parser = PlainParser() else: @@ -125,11 +126,11 @@ def plaintext_parser(filename, binary=None, from_page=0, to_page=100000, callbac PARSERS = { - "deepdoc": DeepDOC_parser, - "mineru": MinerU_parser, - "docling": Docling_parser, - "tcadp": TCADP_parser, - "plaintext": plaintext_parser, # default + "deepdoc": by_deepdoc, + "mineru": by_mineru, + "docling": by_docling, + "tcadp": by_tcadp, + "plaintext": by_plaintext, # default } @@ -630,10 +631,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" name = layout_recognizer.strip().lower() - parser = PARSERS.get(name, plaintext_parser) + parser = PARSERS.get(name, by_plaintext) callback(0.1, "Start to parse.") - sections, tables, _ = parser( + sections, tables, pdf_parser = parser( filename = filename, binary = binary, from_page = from_page, diff --git a/rag/app/one.py b/rag/app/one.py index e4016118d..5574aaa51 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -23,7 +23,7 @@ from rag.app import naive from rag.nlp import rag_tokenizer, tokenize from deepdoc.parser import PdfParser, ExcelParser, HtmlParser from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper -from rag.app.naive import plaintext_parser, PARSERS +from rag.app.naive import by_plaintext, PARSERS class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, @@ -88,10 +88,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" name = layout_recognizer.strip().lower() - parser = PARSERS.get(name, plaintext_parser) + parser = PARSERS.get(name, by_plaintext) callback(0.1, "Start to parse.") - sections, tbls, _ = parser( + sections, tbls, pdf_parser = parser( filename = filename, binary = binary, from_page = from_page, diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 497a7b1cb..cd1d308ec 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -24,7 +24,7 @@ from rag.nlp import tokenize, is_english from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, PptParser, PlainParser from PyPDF2 import PdfReader as pdf2_read -from rag.app.naive import plaintext_parser, PARSERS +from rag.app.naive import by_plaintext, PARSERS class Ppt(PptParser): def __call__(self, fnm, from_page, to_page, callback=None): @@ -131,7 +131,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" name = layout_recognizer.strip().lower() - parser = PARSERS.get(name, plaintext_parser) + parser = PARSERS.get(name, by_plaintext) callback(0.1, "Start to parse.") sections, _, _ = parser(