Fix: pdf_parser ignored in rag/app/naive.py (#11065)

### What problem does this PR solve?

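Fix: the pdf_parser returned by the selected parser function was ignored in rag/app/naive.py (issue #11000). Call sites that previously bound the third return value to `_` now bind it to `pdf_parser`, and the parser functions are renamed to by_deepdoc, by_mineru, by_docling, by_tcadp and by_plaintext.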

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Billy Bao
2025-11-06 15:20:35 +08:00
committed by GitHub
parent ca30ef83bf
commit 4b8ce08050
6 changed files with 26 additions and 25 deletions

View File

@ -20,7 +20,7 @@ from io import BytesIO
from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.app.naive import plaintext_parser, PARSERS
from rag.app.naive import by_plaintext, PARSERS
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks
@ -102,10 +102,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, plaintext_parser)
parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
sections, tables, _ = parser(
sections, tables, pdf_parser = parser(
filename = filename,
binary = binary,
from_page = from_page,

View File

@ -25,7 +25,7 @@ from rag.nlp import bullets_category, remove_contents_table, \
make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
from rag.nlp import rag_tokenizer, Node
from deepdoc.parser import PdfParser, DocxParser, HtmlParser
from rag.app.naive import plaintext_parser, PARSERS
from rag.app.naive import by_plaintext, PARSERS
@ -161,10 +161,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, plaintext_parser)
parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
raw_sections, tables, _ = parser(
raw_sections, tables, pdf_parser = parser(
filename = filename,
binary = binary,
from_page = from_page,

View File

@ -26,7 +26,7 @@ from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
from docx import Document
from PIL import Image
from rag.app.naive import plaintext_parser, PARSERS
from rag.app.naive import by_plaintext, PARSERS
class Pdf(PdfParser):
def __init__(self):
@ -202,7 +202,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
pdf_parser = PARSERS.get(name, plaintext_parser)
pdf_parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
sections, tbls, pdf_parser = pdf_parser(

View File

@ -38,7 +38,7 @@ from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
def DeepDOC_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, pdf_cls = None ,**kwargs):
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
callback = callback
binary = binary
pdf_parser = pdf_cls() if pdf_cls else Pdf()
@ -48,13 +48,14 @@ def DeepDOC_parser(filename, binary=None, from_page=0, to_page=100000, callback=
to_page=to_page,
callback=callback
)
tables = vision_figure_parser_pdf_wrapper(tbls=tables,
callback=callback,
**kwargs)
return sections, tables, pdf_parser
def MinerU_parser(filename, binary=None, callback=None, **kwargs):
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
@ -74,7 +75,7 @@ def MinerU_parser(filename, binary=None, callback=None, **kwargs):
return sections, tables, pdf_parser
def Docling_parser(filename, binary=None, callback=None, **kwargs):
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
pdf_parser = DoclingParser()
if not pdf_parser.check_installation():
@ -91,7 +92,7 @@ def Docling_parser(filename, binary=None, callback=None, **kwargs):
return sections, tables, pdf_parser
def TCADP_parser(filename, binary=None, callback=None, **kwargs):
def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
tcadp_parser = TCADPParser()
if not tcadp_parser.check_installation():
@ -108,7 +109,7 @@ def TCADP_parser(filename, binary=None, callback=None, **kwargs):
return sections, tables, tcadp_parser
def plaintext_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
if kwargs.get("layout_recognizer", "") == "Plain Text":
pdf_parser = PlainParser()
else:
@ -125,11 +126,11 @@ def plaintext_parser(filename, binary=None, from_page=0, to_page=100000, callbac
PARSERS = {
"deepdoc": DeepDOC_parser,
"mineru": MinerU_parser,
"docling": Docling_parser,
"tcadp": TCADP_parser,
"plaintext": plaintext_parser, # default
"deepdoc": by_deepdoc,
"mineru": by_mineru,
"docling": by_docling,
"tcadp": by_tcadp,
"plaintext": by_plaintext, # default
}
@ -630,10 +631,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, plaintext_parser)
parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
sections, tables, _ = parser(
sections, tables, pdf_parser = parser(
filename = filename,
binary = binary,
from_page = from_page,

View File

@ -23,7 +23,7 @@ from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
from rag.app.naive import plaintext_parser, PARSERS
from rag.app.naive import by_plaintext, PARSERS
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
@ -88,10 +88,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, plaintext_parser)
parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
sections, tbls, _ = parser(
sections, tbls, pdf_parser = parser(
filename = filename,
binary = binary,
from_page = from_page,

View File

@ -24,7 +24,7 @@ from rag.nlp import tokenize, is_english
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, PptParser, PlainParser
from PyPDF2 import PdfReader as pdf2_read
from rag.app.naive import plaintext_parser, PARSERS
from rag.app.naive import by_plaintext, PARSERS
class Ppt(PptParser):
def __call__(self, fnm, from_page, to_page, callback=None):
@ -131,7 +131,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, plaintext_parser)
parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
sections, _, _ = parser(