Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 20:42:30 +08:00)
Feat: add more chunking method (#11413)
### What problem does this PR solve?

Feat: add more chunking method #11311

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
```diff
@@ -187,7 +187,7 @@ class DoclingParser(RAGFlowPdfParser):
                 bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3])
                 yield (DoclingContentType.EQUATION.value, text, bbox)
 
-    def _transfer_to_sections(self, doc) -> list[tuple[str, str]]:
+    def _transfer_to_sections(self, doc, parse_method: str) -> list[tuple[str, str]]:
         sections: list[tuple[str, str]] = []
         for typ, payload, bbox in self._iter_doc_items(doc):
             if typ == DoclingContentType.TEXT.value:
```
```diff
@@ -200,7 +200,12 @@ class DoclingParser(RAGFlowPdfParser):
                 continue
 
             tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else ""
-            sections.append((section, tag))
+            if parse_method == "manual":
+                sections.append((section, typ, tag))
+            elif parse_method == "paper":
+                sections.append((section + tag, typ))
+            else:
+                sections.append((section, tag))
         return sections
 
     def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1):
```
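Depending on `parse_method`, `_transfer_to_sections` now emits differently shaped tuples: `"manual"` keeps the position tag as a separate third element, `"paper"` folds the tag into the text, and any other value (including the default `"raw"`) preserves the original two-tuple. Below is a minimal, self-contained sketch of that branching; the sample section text and tag are illustrative, not taken from a real Docling document.

```python
# Sketch of the three section shapes produced per parse_method; sample values are made up.
def pack_section(section: str, typ: str, tag: str, parse_method: str) -> tuple:
    if parse_method == "manual":
        return (section, typ, tag)       # (text, content type, position tag)
    if parse_method == "paper":
        return (section + tag, typ)      # (text with tag folded in, content type)
    return (section, tag)                # default ("raw"): original (text, position tag)

for method in ("manual", "paper", "raw"):
    print(method, pack_section("Some paragraph.", "text", "<position tag>", method))
```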
```diff
@@ -283,6 +288,7 @@ class DoclingParser(RAGFlowPdfParser):
         lang: Optional[str] = None,
         method: str = "auto",
         delete_output: bool = True,
+        parse_method: str = "raw"
     ):
 
         if not self.check_installation():
```
```diff
@@ -318,7 +324,7 @@ class DoclingParser(RAGFlowPdfParser):
         if callback:
             callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages")
 
-        sections = self._transfer_to_sections(doc)
+        sections = self._transfer_to_sections(doc, parse_method=parse_method)
         tables = self._transfer_to_tables(doc)
 
         if callback:
```
```diff
@@ -476,7 +476,7 @@ class MinerUParser(RAGFlowPdfParser):
                 item[key] = str((subdir / item[key]).resolve())
         return data
 
-    def _transfer_to_sections(self, outputs: list[dict[str, Any]]):
+    def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
        sections = []
        for output in outputs:
            match output["type"]:
```
```diff
@@ -497,7 +497,11 @@ class MinerUParser(RAGFlowPdfParser):
                case MinerUContentType.DISCARDED:
                    pass
 
-           if section:
+           if section and parse_method == "manual":
+               sections.append((section, output["type"], self._line_tag(output)))
+           elif section and parse_method == "paper":
+               sections.append((section + self._line_tag(output), output["type"]))
+           else:
                sections.append((section, self._line_tag(output)))
        return sections
 
```
```diff
@@ -516,6 +520,7 @@ class MinerUParser(RAGFlowPdfParser):
         method: str = "auto",
         server_url: Optional[str] = None,
         delete_output: bool = True,
+        parse_method: str = "raw"
     ) -> tuple:
         import shutil
 
```
```diff
@@ -565,7 +570,8 @@ class MinerUParser(RAGFlowPdfParser):
            self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
            if callback:
                callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
-           return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs)
+
+           return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
        finally:
            if temp_pdf and temp_pdf.exists():
                try:
```
```diff
@@ -230,9 +230,16 @@ REGISTER_ENABLED=1
 # SANDBOX_MAX_MEMORY=256m # b, k, m, g
 # SANDBOX_TIMEOUT=10s # s, m, 1m30s
 
-# Enable DocLing and Mineru
+# Enable DocLing
 USE_DOCLING=false
+
+# Enable Mineru
 USE_MINERU=false
+MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
+MINERU_DELETE_OUTPUT=0 # keep output directory
+MINERU_BACKEND=pipeline # or another backend you prefer
+
+
 
 # pptx support
 DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1
```
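These new `.env` entries are consumed through plain environment lookups. The sketch below mirrors the `os.environ.get` calls that appear in the `by_mineru` helper later in this diff, including their defaults; it is only an illustration of how the settings are picked up, not additional RAGFlow code.

```python
import os

# Mirrors the environment lookups used by the MinerU integration in this PR;
# defaults follow the by_mineru helper further down in the diff.
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
# MINERU_DELETE_OUTPUT=0 keeps the intermediate output directory on disk.
delete_output = bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1)))

print(mineru_executable, mineru_backend, mineru_api, delete_output)
```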
```diff
@@ -213,6 +213,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            lang = lang,
            callback = callback,
            pdf_cls = Pdf,
+           parse_method = "manual",
            **kwargs
        )
 
```
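This call site (the manual chunker) forwards `parse_method="manual"` to whichever backend parser is selected; the `by_mineru` and `by_docling` helpers later in the diff recover it from `**kwargs` with a `"raw"` default. A minimal sketch of that forwarding pattern, using an illustrative helper name rather than a real RAGFlow function:

```python
# Illustrative sketch of the **kwargs forwarding used in this PR;
# `backend_parser` is a stand-in name, not a RAGFlow function.
def backend_parser(filename, binary=None, callback=None, **kwargs):
    # Same recovery pattern as by_mineru/by_docling in this diff.
    parse_method = kwargs.get("parse_method", "raw")
    return parse_method

assert backend_parser("doc.pdf", parse_method="manual") == "manual"
assert backend_parser("doc.pdf") == "raw"  # default when the caller does not set it
```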
```diff
@@ -225,7 +226,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        elif len(section) != 3:
            raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
 
-       txt, sec_id, poss = section
+       txt, layoutno, poss = section
        if isinstance(poss, str):
            poss = pdf_parser.extract_positions(poss)
        first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
```
```diff
@@ -235,7 +236,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            pn = pn[0]  # [pn] -> pn
            poss[0] = (pn, *first[1:])
 
-       return (txt, sec_id, poss)
+       return (txt, layoutno, poss)
 
 
    sections = [_normalize_section(sec) for sec in sections]
```
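The helper renamed here normalizes each section to `(text, layoutno, positions)` and unwraps the page number of the first position, which can arrive as a one-element list, as the inline comments indicate. The lines between the two hunks are not shown, so the extraction of `pn` and the list check in this sketch are assumptions consistent with those comments; the sample position values are made up.

```python
# Sketch of the page-number unwrapping shown above; sample values are illustrative.
def unwrap_first_position(poss: list) -> list:
    first = poss[0]            # tuple: ([pn], x1, x2, y1, y2)
    pn = first[0]
    if isinstance(pn, list):   # assumed guard for the elided lines
        pn = pn[0]             # [pn] -> pn
    poss[0] = (pn, *first[1:])
    return poss

print(unwrap_first_position([([3], 10.0, 120.0, 40.0, 60.0)]))
# -> [(3, 10.0, 120.0, 40.0, 60.0)]
```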
```diff
@@ -59,6 +59,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
    mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
    mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
    pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
+   parse_method = kwargs.get("parse_method", "raw")
 
    if not pdf_parser.check_installation():
        callback(-1, "MinerU not found.")
```
```diff
@@ -72,12 +73,14 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
        backend=os.environ.get("MINERU_BACKEND", "pipeline"),
        server_url=os.environ.get("MINERU_SERVER_URL", ""),
        delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+       parse_method=parse_method
    )
    return sections, tables, pdf_parser
 
 
 def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
    pdf_parser = DoclingParser()
+   parse_method = kwargs.get("parse_method", "raw")
 
    if not pdf_parser.check_installation():
        callback(-1, "Docling not found.")
```
```diff
@@ -89,6 +92,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
        callback=callback,
        output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
        delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+       parse_method=parse_method
    )
    return sections, tables, pdf_parser
 
```
```diff
@@ -21,8 +21,10 @@ import re
 from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
 from common.constants import ParserType
 from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
-from deepdoc.parser import PdfParser, PlainParser
+from deepdoc.parser import PdfParser
 import numpy as np
+from rag.app.naive import by_plaintext, PARSERS
+
 
 class Pdf(PdfParser):
     def __init__(self):
```
```diff
@@ -147,19 +149,40 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
-       if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
-           pdf_parser = PlainParser()
+       layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+
+       if isinstance(layout_recognizer, bool):
+           layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+       name = layout_recognizer.strip().lower()
+       pdf_parser = PARSERS.get(name, by_plaintext)
+       callback(0.1, "Start to parse.")
+
+       if name == "deepdoc":
+           pdf_parser = Pdf()
+           paper = pdf_parser(filename if not binary else binary,
+                              from_page=from_page, to_page=to_page, callback=callback)
+       else:
+           sections, tables, pdf_parser = pdf_parser(
+               filename=filename,
+               binary=binary,
+               from_page=from_page,
+               to_page=to_page,
+               lang=lang,
+               callback=callback,
+               pdf_cls=Pdf,
+               parse_method="paper",
+               **kwargs
+           )
+
            paper = {
                "title": filename,
                "authors": " ",
                "abstract": "",
-               "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
-               "tables": []
+               "sections": sections,
+               "tables": tables
            }
-       else:
-           pdf_parser = Pdf()
-           paper = pdf_parser(filename if not binary else binary,
-                              from_page=from_page, to_page=to_page, callback=callback)
    tbls=paper["tables"]
    tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
    paper["tables"] = tbls
```
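The rewritten paper chunker resolves the configured `layout_recognize` value to a parser the same way the naive chunker does: booleans map to `"DeepDOC"`/`"Plain Text"`, the name is lower-cased, `"deepdoc"` keeps the built-in `Pdf` parser, and anything not in the `PARSERS` registry falls back to plaintext. A minimal sketch of that resolution with a stand-in registry; the real `PARSERS` mapping lives in `rag.app.naive` and its exact keys are not shown in this diff.

```python
# Stand-in registry; the real PARSERS dict is defined in rag.app.naive and may differ.
PARSERS = {"mineru": "by_mineru", "docling": "by_docling", "plain text": "by_plaintext"}

def resolve_parser(layout_recognizer):
    # Mirrors the normalization added to the paper chunker in this PR.
    if isinstance(layout_recognizer, bool):
        layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
    name = layout_recognizer.strip().lower()
    if name == "deepdoc":
        return "builtin DeepDOC Pdf parser"
    return PARSERS.get(name, "by_plaintext")

print(resolve_parser(True))        # -> builtin DeepDOC Pdf parser
print(resolve_parser("MinerU"))    # -> by_mineru
print(resolve_parser("unknown"))   # -> by_plaintext (fallback)
```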