mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Fix: fix pdf_parser ignored in rag/app/naive.py (#11065)
### What problem does this PR solve? Fix: fix pdf_parser ignored in rag/app/naive.py #11000 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -20,7 +20,7 @@ from io import BytesIO
|
||||
|
||||
from deepdoc.parser.utils import get_text
|
||||
from rag.app import naive
|
||||
from rag.app.naive import plaintext_parser, PARSERS
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
from rag.nlp import bullets_category, is_english,remove_contents_table, \
|
||||
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
|
||||
tokenize_chunks
|
||||
@ -102,10 +102,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
|
||||
name = layout_recognizer.strip().lower()
|
||||
parser = PARSERS.get(name, plaintext_parser)
|
||||
parser = PARSERS.get(name, by_plaintext)
|
||||
callback(0.1, "Start to parse.")
|
||||
|
||||
sections, tables, _ = parser(
|
||||
sections, tables, pdf_parser = parser(
|
||||
filename = filename,
|
||||
binary = binary,
|
||||
from_page = from_page,
|
||||
|
||||
@ -25,7 +25,7 @@ from rag.nlp import bullets_category, remove_contents_table, \
|
||||
make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
|
||||
from rag.nlp import rag_tokenizer, Node
|
||||
from deepdoc.parser import PdfParser, DocxParser, HtmlParser
|
||||
from rag.app.naive import plaintext_parser, PARSERS
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
|
||||
|
||||
|
||||
@ -161,10 +161,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
|
||||
name = layout_recognizer.strip().lower()
|
||||
parser = PARSERS.get(name, plaintext_parser)
|
||||
parser = PARSERS.get(name, by_plaintext)
|
||||
callback(0.1, "Start to parse.")
|
||||
|
||||
raw_sections, tables, _ = parser(
|
||||
raw_sections, tables, pdf_parser = parser(
|
||||
filename = filename,
|
||||
binary = binary,
|
||||
from_page = from_page,
|
||||
|
||||
@ -26,7 +26,7 @@ from deepdoc.parser import PdfParser, DocxParser
|
||||
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
|
||||
from docx import Document
|
||||
from PIL import Image
|
||||
from rag.app.naive import plaintext_parser, PARSERS
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
|
||||
class Pdf(PdfParser):
|
||||
def __init__(self):
|
||||
@ -202,7 +202,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
|
||||
name = layout_recognizer.strip().lower()
|
||||
pdf_parser = PARSERS.get(name, plaintext_parser)
|
||||
pdf_parser = PARSERS.get(name, by_plaintext)
|
||||
callback(0.1, "Start to parse.")
|
||||
|
||||
sections, tbls, pdf_parser = pdf_parser(
|
||||
|
||||
@ -38,7 +38,7 @@ from deepdoc.parser.docling_parser import DoclingParser
|
||||
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
|
||||
|
||||
def DeepDOC_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, pdf_cls = None ,**kwargs):
|
||||
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
||||
callback = callback
|
||||
binary = binary
|
||||
pdf_parser = pdf_cls() if pdf_cls else Pdf()
|
||||
@ -48,13 +48,14 @@ def DeepDOC_parser(filename, binary=None, from_page=0, to_page=100000, callback=
|
||||
to_page=to_page,
|
||||
callback=callback
|
||||
)
|
||||
|
||||
tables = vision_figure_parser_pdf_wrapper(tbls=tables,
|
||||
callback=callback,
|
||||
**kwargs)
|
||||
return sections, tables, pdf_parser
|
||||
|
||||
|
||||
def MinerU_parser(filename, binary=None, callback=None, **kwargs):
|
||||
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
||||
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
||||
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
|
||||
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
|
||||
@ -74,7 +75,7 @@ def MinerU_parser(filename, binary=None, callback=None, **kwargs):
|
||||
return sections, tables, pdf_parser
|
||||
|
||||
|
||||
def Docling_parser(filename, binary=None, callback=None, **kwargs):
|
||||
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
||||
pdf_parser = DoclingParser()
|
||||
|
||||
if not pdf_parser.check_installation():
|
||||
@ -91,7 +92,7 @@ def Docling_parser(filename, binary=None, callback=None, **kwargs):
|
||||
return sections, tables, pdf_parser
|
||||
|
||||
|
||||
def TCADP_parser(filename, binary=None, callback=None, **kwargs):
|
||||
def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
||||
tcadp_parser = TCADPParser()
|
||||
|
||||
if not tcadp_parser.check_installation():
|
||||
@ -108,7 +109,7 @@ def TCADP_parser(filename, binary=None, callback=None, **kwargs):
|
||||
return sections, tables, tcadp_parser
|
||||
|
||||
|
||||
def plaintext_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||
def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||
if kwargs.get("layout_recognizer", "") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
else:
|
||||
@ -125,11 +126,11 @@ def plaintext_parser(filename, binary=None, from_page=0, to_page=100000, callbac
|
||||
|
||||
|
||||
PARSERS = {
|
||||
"deepdoc": DeepDOC_parser,
|
||||
"mineru": MinerU_parser,
|
||||
"docling": Docling_parser,
|
||||
"tcadp": TCADP_parser,
|
||||
"plaintext": plaintext_parser, # default
|
||||
"deepdoc": by_deepdoc,
|
||||
"mineru": by_mineru,
|
||||
"docling": by_docling,
|
||||
"tcadp": by_tcadp,
|
||||
"plaintext": by_plaintext, # default
|
||||
}
|
||||
|
||||
|
||||
@ -630,10 +631,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
|
||||
name = layout_recognizer.strip().lower()
|
||||
parser = PARSERS.get(name, plaintext_parser)
|
||||
parser = PARSERS.get(name, by_plaintext)
|
||||
callback(0.1, "Start to parse.")
|
||||
|
||||
sections, tables, _ = parser(
|
||||
sections, tables, pdf_parser = parser(
|
||||
filename = filename,
|
||||
binary = binary,
|
||||
from_page = from_page,
|
||||
|
||||
@ -23,7 +23,7 @@ from rag.app import naive
|
||||
from rag.nlp import rag_tokenizer, tokenize
|
||||
from deepdoc.parser import PdfParser, ExcelParser, HtmlParser
|
||||
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
|
||||
from rag.app.naive import plaintext_parser, PARSERS
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
|
||||
class Pdf(PdfParser):
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
@ -88,10 +88,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
|
||||
name = layout_recognizer.strip().lower()
|
||||
parser = PARSERS.get(name, plaintext_parser)
|
||||
parser = PARSERS.get(name, by_plaintext)
|
||||
callback(0.1, "Start to parse.")
|
||||
|
||||
sections, tbls, _ = parser(
|
||||
sections, tbls, pdf_parser = parser(
|
||||
filename = filename,
|
||||
binary = binary,
|
||||
from_page = from_page,
|
||||
|
||||
@ -24,7 +24,7 @@ from rag.nlp import tokenize, is_english
|
||||
from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
||||
from PyPDF2 import PdfReader as pdf2_read
|
||||
from rag.app.naive import plaintext_parser, PARSERS
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
|
||||
class Ppt(PptParser):
|
||||
def __call__(self, fnm, from_page, to_page, callback=None):
|
||||
@ -131,7 +131,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
|
||||
name = layout_recognizer.strip().lower()
|
||||
parser = PARSERS.get(name, plaintext_parser)
|
||||
parser = PARSERS.get(name, by_plaintext)
|
||||
callback(0.1, "Start to parse.")
|
||||
|
||||
sections, _, _ = parser(
|
||||
|
||||
Reference in New Issue
Block a user