mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: Support more chunking methods (#11000)
### What problem does this PR solve? Feat: Support more chunking methods #10772 This PR enables multiple chunking methods — including books, laws, naive, one, and presentation — to be used with all existing PDF parsers (DeepDOC, MinerU, Docling, TCADP, Plain Text, and Vision modes). ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -22,11 +22,11 @@ from common.constants import ParserType
|
||||
from io import BytesIO
|
||||
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
|
||||
from common.token_utils import num_tokens_from_string
|
||||
from deepdoc.parser import PdfParser, PlainParser, DocxParser
|
||||
from deepdoc.parser import PdfParser, DocxParser
|
||||
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
|
||||
from docx import Document
|
||||
from PIL import Image
|
||||
|
||||
from rag.app.naive import plaintext_parser, PARSERS
|
||||
|
||||
class Pdf(PdfParser):
|
||||
def __init__(self):
|
||||
@ -196,15 +196,34 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
# is it English
|
||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf()
|
||||
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
sections, tbls = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
if sections and len(sections[0]) < 3:
|
||||
sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
|
||||
# set pivot using the most frequent type of title,
|
||||
# then merge between 2 pivot
|
||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||
|
||||
if isinstance(layout_recognizer, bool):
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
|
||||
name = layout_recognizer.strip().lower()
|
||||
pdf_parser = PARSERS.get(name, plaintext_parser)
|
||||
callback(0.1, "Start to parse.")
|
||||
|
||||
sections, tbls, pdf_parser = pdf_parser(
|
||||
filename = filename,
|
||||
binary = binary,
|
||||
from_page = from_page,
|
||||
to_page = to_page,
|
||||
lang = lang,
|
||||
callback = callback,
|
||||
pdf_cls = Pdf,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
if not sections and not tbls:
|
||||
return []
|
||||
|
||||
if name in ["tcadp", "docling", "mineru"]:
|
||||
parser_config["chunk_token_num"] = 0
|
||||
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
|
||||
max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
|
||||
most_level = max(0, max_lvl - 1)
|
||||
|
||||
Reference in New Issue
Block a user