Feat: add more chunking methods (#11413)

### What problem does this PR solve?

Feat: add more chunking methods #11311

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Billy Bao
2025-11-20 19:07:17 +08:00
committed by GitHub
parent c8ab9079b3
commit d3d2ccc76c
6 changed files with 66 additions and 19 deletions

View File

@ -476,7 +476,7 @@ class MinerUParser(RAGFlowPdfParser):
item[key] = str((subdir / item[key]).resolve())
return data
def _transfer_to_sections(self, outputs: list[dict[str, Any]]):
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
sections = []
for output in outputs:
match output["type"]:
@ -497,7 +497,11 @@ class MinerUParser(RAGFlowPdfParser):
case MinerUContentType.DISCARDED:
pass
if section:
if section and parse_method == "manual":
sections.append((section, output["type"], self._line_tag(output)))
elif section and parse_method == "paper":
sections.append((section + self._line_tag(output), output["type"]))
else:
sections.append((section, self._line_tag(output)))
return sections
@ -516,6 +520,7 @@ class MinerUParser(RAGFlowPdfParser):
method: str = "auto",
server_url: Optional[str] = None,
delete_output: bool = True,
parse_method: str = "raw"
) -> tuple:
import shutil
@ -565,7 +570,8 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs)
return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
finally:
if temp_pdf and temp_pdf.exists():
try: