Fix: manual parser with mineru (#11336)

### What problem does this PR solve?

Fix: manual parser with mineru #11320
Fix: missing parameter in mineru #11334
Fix: add outlines parameter for pdf parsers

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Billy Bao
2025-11-18 15:22:52 +08:00
committed by GitHub
parent 0db00f70b2
commit fea157ba08
5 changed files with 30 additions and 1 deletions

View File

@ -216,6 +216,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
**kwargs
)
def _normalize_section(section):
# pad section to length 3: (txt, sec_id, poss)
if len(section) == 1:
section = (section[0], "", [])
elif len(section) == 2:
section = (section[0], "", section[1])
elif len(section) != 3:
raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
txt, sec_id, poss = section
if isinstance(poss, str):
poss = pdf_parser.extract_positions(poss)
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
pn = first[0]
if isinstance(pn, list):
pn = pn[0] # [pn] -> pn
poss[0] = (pn, *first[1:])
return (txt, sec_id, poss)
sections = [_normalize_section(sec) for sec in sections]
if not sections and not tbls:
return []

View File

@ -70,6 +70,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
server_url=os.environ.get("MINERU_SERVER_URL", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
return sections, tables, pdf_parser