mirror of https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: Support more chunking methods (#11000)
### What problem does this PR solve?

Feat: Support more chunking methods #10772

This PR enables multiple chunking methods, including books, laws, naive, one, and presentation, to be used with all existing PDF parsers (DeepDOC, MinerU, Docling, TCADP, Plain Text, and Vision modes).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
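At its core, each chunker now shares the same three-step dispatch: normalize the `layout_recognize` setting (legacy configurations stored a boolean), resolve it against the new `PARSERS` registry in `rag/app/naive.py`, and call the selected parser through a single keyword interface. Below is a condensed, runnable sketch of that pattern; the stub parsers are placeholders standing in for the real DeepDOC/MinerU/Docling/TCADP/vision implementations shown in the diff, and `dispatch` is an invented name for the inline logic each `chunk` function repeats.

```python
# Condensed sketch of the dispatch pattern this PR repeats in each chunker.
# The stub parser functions are placeholders; in the real code they live in
# rag/app/naive.py and wrap DeepDOC, MinerU, Docling, TCADP, etc.

def deepdoc_parser(filename, binary=None, **kwargs):
    # Real version runs the DeepDOC PDF pipeline plus figure parsing.
    return [("section text", "position")], [], None  # (sections, tables, parser)

def plaintext_parser(filename, binary=None, **kwargs):
    # Real version falls back to PlainParser or a VisionParser LLM.
    return [("plain text", None)], [], None

PARSERS = {
    "deepdoc": deepdoc_parser,
    "plaintext": plaintext_parser,  # default
}

def dispatch(parser_config, filename, **kwargs):
    layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
    if isinstance(layout_recognizer, bool):  # legacy configs stored a bool
        layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"

    name = layout_recognizer.strip().lower()
    parser = PARSERS.get(name, plaintext_parser)  # unknown names fall back
    sections, tables, _ = parser(filename=filename, **kwargs)

    if name in ["tcadp", "docling", "mineru"]:
        # These backends return pre-chunked output, so token-based
        # re-chunking is disabled downstream.
        parser_config["chunk_token_num"] = 0
    return sections, tables

print(dispatch({"layout_recognize": "DeepDOC"}, "demo.pdf"))
```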
rag/app/book.py

```diff
@@ -15,18 +15,18 @@
 #
 
 import logging
-from tika import parser
 import re
 from io import BytesIO
 
 from deepdoc.parser.utils import get_text
 from rag.app import naive
+from rag.app.naive import plaintext_parser, PARSERS
 from rag.nlp import bullets_category, is_english, remove_contents_table, \
     hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
     tokenize_chunks
 from rag.nlp import rag_tokenizer
-from deepdoc.parser import PdfParser, PlainParser, HtmlParser
-from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
+from deepdoc.parser import PdfParser, HtmlParser
+from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
 from PIL import Image
 
 
@@ -96,13 +96,33 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
-            pdf_parser = PlainParser()
-        sections, tbls = pdf_parser(filename if not binary else binary,
-                                    from_page=from_page, to_page=to_page, callback=callback)
-        tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        parser = PARSERS.get(name, plaintext_parser)
+        callback(0.1, "Start to parse.")
+
+        sections, tables, _ = parser(
+            filename=filename,
+            binary=binary,
+            from_page=from_page,
+            to_page=to_page,
+            lang=lang,
+            callback=callback,
+            pdf_cls=Pdf,
+            **kwargs
+        )
+
+        if not sections and not tables:
+            return []
+
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
+
+        callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = get_text(filename, binary)
```
rag/app/laws.py

```diff
@@ -15,7 +15,6 @@
 #
 
 import logging
-from tika import parser
 import re
 from io import BytesIO
 from docx import Document
@@ -25,8 +24,8 @@ from deepdoc.parser.utils import get_text
 from rag.nlp import bullets_category, remove_contents_table, \
     make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
 from rag.nlp import rag_tokenizer, Node
-from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser
+from deepdoc.parser import PdfParser, DocxParser, HtmlParser
+from rag.app.naive import plaintext_parser, PARSERS
 
 
 
@@ -156,13 +155,36 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         return tokenize_chunks(chunks, doc, eng, None)
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
-            pdf_parser = PlainParser()
-        for txt, poss in pdf_parser(filename if not binary else binary,
-                                    from_page=from_page, to_page=to_page, callback=callback)[0]:
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        parser = PARSERS.get(name, plaintext_parser)
+        callback(0.1, "Start to parse.")
+
+        raw_sections, tables, _ = parser(
+            filename=filename,
+            binary=binary,
+            from_page=from_page,
+            to_page=to_page,
+            lang=lang,
+            callback=callback,
+            pdf_cls=Pdf,
+            **kwargs
+        )
+
+        if not raw_sections and not tables:
+            return []
+
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
+
+        for txt, poss in raw_sections:
             sections.append(txt + poss)
+
+        callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.(txt|md|markdown|mdx)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = get_text(filename, binary)
```
rag/app/manual.py

```diff
@@ -22,11 +22,11 @@ from common.constants import ParserType
 from io import BytesIO
 from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
 from common.token_utils import num_tokens_from_string
-from deepdoc.parser import PdfParser, PlainParser, DocxParser
+from deepdoc.parser import PdfParser, DocxParser
 from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
 from docx import Document
 from PIL import Image
+from rag.app.naive import plaintext_parser, PARSERS
 
 
 class Pdf(PdfParser):
     def __init__(self):
@@ -196,15 +196,34 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     # is it English
     eng = lang.lower() == "english"  # pdf_parser.is_english
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
-            pdf_parser = PlainParser()
-        sections, tbls = pdf_parser(filename if not binary else binary,
-                                    from_page=from_page, to_page=to_page, callback=callback)
-        if sections and len(sections[0]) < 3:
-            sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
-        # set pivot using the most frequent type of title,
-        # then merge between 2 pivot
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        pdf_parser = PARSERS.get(name, plaintext_parser)
+        callback(0.1, "Start to parse.")
+
+        sections, tbls, pdf_parser = pdf_parser(
+            filename=filename,
+            binary=binary,
+            from_page=from_page,
+            to_page=to_page,
+            lang=lang,
+            callback=callback,
+            pdf_cls=Pdf,
+            **kwargs
+        )
+
+        if not sections and not tbls:
+            return []
+
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
+
+        callback(0.8, "Finish parsing.")
 
         if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
             max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
             most_level = max(0, max_lvl - 1)
```
rag/app/naive.py (192 changed lines)

```diff
@@ -26,7 +26,6 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
 from docx.opc.oxml import parse_xml
 from markdown import markdown
 from PIL import Image
-from tika import parser
 
 from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
@@ -39,6 +38,100 @@ from deepdoc.parser.docling_parser import DoclingParser
 from deepdoc.parser.tcadp_parser import TCADPParser
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
 
+
+def DeepDOC_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, pdf_cls=None, **kwargs):
+    pdf_parser = pdf_cls() if pdf_cls else Pdf()
+    sections, tables = pdf_parser(
+        filename if not binary else binary,
+        from_page=from_page,
+        to_page=to_page,
+        callback=callback
+    )
+    tables = vision_figure_parser_pdf_wrapper(tbls=tables,
+                                              callback=callback,
+                                              **kwargs)
+    return sections, tables, pdf_parser
+
+
+def MinerU_parser(filename, binary=None, callback=None, **kwargs):
+    mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
+    mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
+    pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
+
+    if not pdf_parser.check_installation():
+        callback(-1, "MinerU not found.")
+        return None, None
+
+    sections, tables = pdf_parser.parse_pdf(
+        filepath=filename,
+        binary=binary,
+        callback=callback,
+        output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
+        backend=os.environ.get("MINERU_BACKEND", "pipeline"),
+        delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+    )
+    return sections, tables, pdf_parser
+
+
+def Docling_parser(filename, binary=None, callback=None, **kwargs):
+    pdf_parser = DoclingParser()
+
+    if not pdf_parser.check_installation():
+        callback(-1, "Docling not found.")
+        return None, None
+
+    sections, tables = pdf_parser.parse_pdf(
+        filepath=filename,
+        binary=binary,
+        callback=callback,
+        output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
+        delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+    )
+    return sections, tables, pdf_parser
+
+
+def TCADP_parser(filename, binary=None, callback=None, **kwargs):
+    tcadp_parser = TCADPParser()
+
+    if not tcadp_parser.check_installation():
+        callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
+        return None, None
+
+    sections, tables = tcadp_parser.parse_pdf(
+        filepath=filename,
+        binary=binary,
+        callback=callback,
+        output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
+        file_type="PDF"
+    )
+    return sections, tables, tcadp_parser
+
+
+def plaintext_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    if kwargs.get("layout_recognizer", "") == "Plain Text":
+        pdf_parser = PlainParser()
+    else:
+        vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
+        pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
+
+    sections, tables = pdf_parser(
+        filename if not binary else binary,
+        from_page=from_page,
+        to_page=to_page,
+        callback=callback
+    )
+    return sections, tables, pdf_parser
+
+
+PARSERS = {
+    "deepdoc": DeepDOC_parser,
+    "mineru": MinerU_parser,
+    "docling": Docling_parser,
+    "tcadp": TCADP_parser,
+    "plaintext": plaintext_parser,  # default
+}
+
+
 class Docx(DocxParser):
     def __init__(self):
@@ -365,7 +458,7 @@ class Markdown(MarkdownParser):
         html_content = markdown(text)
         soup = BeautifulSoup(html_content, 'html.parser')
         return soup
 
     def get_picture_urls(self, soup):
         if soup:
             return [img.get('src') for img in soup.find_all('img') if img.get('src')]
@@ -375,7 +468,7 @@ class Markdown(MarkdownParser):
         if soup:
             return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
         return []
 
     def get_pictures(self, text):
         """Download and open all images from markdown text."""
         import requests
@@ -535,82 +628,29 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
         if isinstance(layout_recognizer, bool):
             layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        parser = PARSERS.get(name, plaintext_parser)
         callback(0.1, "Start to parse.")
-        if layout_recognizer == "DeepDOC":
-            pdf_parser = Pdf()
-            sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
-            tables = vision_figure_parser_pdf_wrapper(tbls=tables, callback=callback, **kwargs)
-
-            res = tokenize_table(tables, doc, is_english)
-            callback(0.8, "Finish parsing.")
-
-        elif layout_recognizer == "MinerU":
-            mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
-            mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
-            mineru_server_url = os.environ.get("MINERU_SERVER_URL", "")
-            mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")
-            pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api, mineru_server_url=mineru_server_url)
-            ok, reason = pdf_parser.check_installation(backend=mineru_backend)
-            if not ok:
-                callback(-1, f"MinerU not found or server not accessible: {reason}")
-                return res
-
-            sections, tables = pdf_parser.parse_pdf(
-                filepath=filename,
-                binary=binary,
-                callback=callback,
-                output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
-                backend=mineru_backend,
-                server_url=mineru_server_url,
-                delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
-            )
-            parser_config["chunk_token_num"] = 0
-            callback(0.8, "Finish parsing.")
-
-        elif layout_recognizer == "Docling":
-            pdf_parser = DoclingParser()
-            if not pdf_parser.check_installation():
-                callback(-1, "Docling not found.")
-                return res
-
-            sections, tables = pdf_parser.parse_pdf(
-                filepath=filename,
-                binary=binary,
-                callback=callback,
-                output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
-                delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
-            )
-            parser_config["chunk_token_num"] = 0
-            res = tokenize_table(tables, doc, is_english)
-            callback(0.8, "Finish parsing.")
-
-        elif layout_recognizer == "TCADP Parser":
-            tcadp_parser = TCADPParser()
-            if not tcadp_parser.check_installation():
-                callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
-                return res
-
-            sections, tables = tcadp_parser.parse_pdf(
-                filepath=filename,
-                binary=binary,
-                callback=callback,
-                output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
-                file_type="PDF"
-            )
-            parser_config["chunk_token_num"] = 0
-            callback(0.8, "Finish parsing.")
-        else:
-            if layout_recognizer == "Plain Text":
-                pdf_parser = PlainParser()
-            else:
-                vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
-                pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
-
-            sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
-                                          callback=callback)
-            res = tokenize_table(tables, doc, is_english)
-            callback(0.8, "Finish parsing.")
+        sections, tables, _ = parser(
+            filename=filename,
+            binary=binary,
+            from_page=from_page,
+            to_page=to_page,
+            lang=lang,
+            callback=callback,
+            **kwargs
+        )
+
+        if not sections and not tables:
+            return []
+
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
+
+        res = tokenize_table(tables, doc, is_english)
+        callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
@@ -735,9 +775,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                 logging.info(f"Failed to chunk url in registered file type {url}: {e}")
             sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
             url_res.extend(sub_url_res)
 
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
 
     if embed_res:
         res.extend(embed_res)
     if url_res:
```
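Since every entry in `PARSERS` shares one call signature and one `(sections, tables, parser)` return contract, adding a backend only takes a conforming function and a registry key. The sketch below is illustrative, not part of this PR: `my_ocr_parser` is invented, and only the `PARSERS` import is real. (Note that the failure branches in `MinerU_parser`, `Docling_parser`, and `TCADP_parser` return two values while callers unpack three; a conforming parser should return a 3-tuple on every path.)

```python
from rag.app.naive import PARSERS

def my_ocr_parser(filename, binary=None, from_page=0, to_page=100000,
                  callback=None, **kwargs):
    """Hypothetical backend following the registry's contract."""
    if callback:
        callback(0.5, "OCR running.")
    sections = [("recognized text", "")]  # (text, position) pairs
    tables = []
    return sections, tables, None  # third slot: the parser instance, if any

# Registering it makes it selectable via layout_recognize="My OCR",
# since chunkers look parsers up by layout_recognizer.strip().lower().
PARSERS["my ocr"] = my_ocr_parser
```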
rag/app/one.py

```diff
@@ -15,16 +15,15 @@
 #
 
 import logging
-from tika import parser
 from io import BytesIO
 import re
 
 from deepdoc.parser.utils import get_text
 from rag.app import naive
 from rag.nlp import rag_tokenizer, tokenize
-from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
-from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
+from deepdoc.parser import PdfParser, ExcelParser, HtmlParser
+from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
+from rag.app.naive import plaintext_parser, PARSERS
 
 class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
@@ -83,12 +82,34 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
-        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
-            pdf_parser = PlainParser()
-        sections, tbls = pdf_parser(
-            filename if not binary else binary, to_page=to_page, callback=callback)
-        tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        parser = PARSERS.get(name, plaintext_parser)
+        callback(0.1, "Start to parse.")
+
+        sections, tbls, _ = parser(
+            filename=filename,
+            binary=binary,
+            from_page=from_page,
+            to_page=to_page,
+            lang=lang,
+            callback=callback,
+            pdf_cls=Pdf,
+            **kwargs
+        )
+
+        if not sections and not tbls:
+            return []
+
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
+
+        callback(0.8, "Finish parsing.")
 
         for (img, rows), poss in tbls:
             if not rows:
                 continue
```
rag/app/presentation.py

```diff
@@ -20,14 +20,11 @@ from io import BytesIO
 
 from PIL import Image
 
-from common.constants import LLMType
-from api.db.services.llm_service import LLMBundle
-from deepdoc.parser.pdf_parser import VisionParser
 from rag.nlp import tokenize, is_english
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, PptParser, PlainParser
 from PyPDF2 import PdfReader as pdf2_read
+from rag.app.naive import plaintext_parser, PARSERS
 
 class Ppt(PptParser):
     def __call__(self, fnm, from_page, to_page, callback=None):
@@ -54,7 +51,6 @@ class Ppt(PptParser):
         self.is_english = is_english(txts)
         return [(txts[i], imgs[i]) for i in range(len(txts))]
 
-
 class Pdf(PdfParser):
     def __init__(self):
         super().__init__()
@@ -84,7 +80,7 @@ class Pdf(PdfParser):
             res.append((lines, self.page_images[i]))
         callback(0.9, "Page {}~{}: Parsing finished".format(
             from_page, min(to_page, self.total_page)))
-        return res
+        return res, []
 
 
 class PlainPdf(PlainParser):
@@ -95,7 +91,7 @@ class PlainPdf(PlainParser):
         for page in self.pdf.pages[from_page: to_page]:
            page_txt.append(page.extract_text())
        callback(0.9, "Parsing finished")
-       return [(txt, None) for txt in page_txt]
+       return [(txt, None) for txt in page_txt], []
 
 
 def chunk(filename, binary=None, from_page=0, to_page=100000,
@@ -130,20 +126,33 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         return res
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
-        if layout_recognizer == "DeepDOC":
-            pdf_parser = Pdf()
-            sections = pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
-        elif layout_recognizer == "Plain Text":
-            pdf_parser = PlainParser()
-            sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
-                                     callback=callback)
-        else:
-            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
-            pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
-            sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
-                                     callback=callback)
+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        parser = PARSERS.get(name, plaintext_parser)
+        callback(0.1, "Start to parse.")
+
+        sections, _, _ = parser(
+            filename=filename,
+            binary=binary,
+            from_page=from_page,
+            to_page=to_page,
+            lang=lang,
+            callback=callback,
+            pdf_cls=Pdf,
+            **kwargs
+        )
+
+        if not sections:
+            return []
+
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
 
         callback(0.8, "Finish parsing.")
 
         for pn, (txt, img) in enumerate(sections):
             d = copy.deepcopy(doc)
             pn += from_page
```
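The one-line `return res, []` changes in presentation's `Pdf.__call__` and `PlainPdf.__call__` bring those parsers onto the same `(sections, tables)` shape the registry wrappers expect, so callers can unpack every backend uniformly even for formats that never produce tables. A minimal standalone sketch of that contract; both classes here are dummies invented for illustration:

```python
# Why the parsers now return (res, []): callers can unpack every backend
# the same way, whether or not the format produces tables.

class SlidesPdf:
    def __call__(self, filename, **kwargs):
        res = [("slide 1 text", None), ("slide 2 text", None)]
        return res, []  # slide decks have no tables, but keep the shape

class TextPdf:
    def __call__(self, filename, **kwargs):
        return [("page text", None)], [("table html", None)]

for parser in (SlidesPdf(), TextPdf()):
    sections, tables = parser("demo.pdf")  # uniform unpacking
    print(len(sections), len(tables))
```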