mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: add more chunking method (#11413)
### What problem does this PR solve? Feat: add more chunking method #11311 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -187,7 +187,7 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3])
|
||||
yield (DoclingContentType.EQUATION.value, text, bbox)
|
||||
|
||||
def _transfer_to_sections(self, doc) -> list[tuple[str, str]]:
|
||||
def _transfer_to_sections(self, doc, parse_method: str) -> list[tuple[str, str]]:
|
||||
sections: list[tuple[str, str]] = []
|
||||
for typ, payload, bbox in self._iter_doc_items(doc):
|
||||
if typ == DoclingContentType.TEXT.value:
|
||||
@ -200,7 +200,12 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
continue
|
||||
|
||||
tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else ""
|
||||
sections.append((section, tag))
|
||||
if parse_method == "manual":
|
||||
sections.append((section, typ, tag))
|
||||
elif parse_method == "paper":
|
||||
sections.append((section + tag, typ))
|
||||
else:
|
||||
sections.append((section, tag))
|
||||
return sections
|
||||
|
||||
def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1):
|
||||
@ -282,7 +287,8 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
output_dir: Optional[str] = None,
|
||||
lang: Optional[str] = None,
|
||||
method: str = "auto",
|
||||
delete_output: bool = True,
|
||||
delete_output: bool = True,
|
||||
parse_method: str = "raw"
|
||||
):
|
||||
|
||||
if not self.check_installation():
|
||||
@ -318,7 +324,7 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
if callback:
|
||||
callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages")
|
||||
|
||||
sections = self._transfer_to_sections(doc)
|
||||
sections = self._transfer_to_sections(doc, parse_method=parse_method)
|
||||
tables = self._transfer_to_tables(doc)
|
||||
|
||||
if callback:
|
||||
|
||||
Reference in New Issue
Block a user