Fix: manual parser with mineru (#11336)

### What problem does this PR solve?

- Fix: manual parser with mineru (#11320)
- Fix: missing parameter in mineru (#11334)
- Fix: add outlines parameter for pdf parsers

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-02-04 09:35:06 +08:00 · 2025-11-18 15:22:52 +08:00
parent 0db00f70b2
commit fea157ba08
5 changed files with 30 additions and 1 deletion
--- a/deepdoc/parser/docling_parser.py
+++ b/deepdoc/parser/docling_parser.py
@ -61,7 +61,9 @@ class DoclingParser(RAGFlowPdfParser):
        self.page_images: list[Image.Image] = []
        self.page_from = 0
        self.page_to = 10_000
-
+        self.outlines = []
    def check_installation(self) -> bool:
        if DocumentConverter is None:
            self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@ -59,6 +59,7 @@ class MinerUParser(RAGFlowPdfParser):
        self.mineru_api = mineru_api.rstrip("/")
        self.mineru_server_url = mineru_server_url.rstrip("/")
        self.using_api = False
        self.outlines = []
        self.logger = logging.getLogger(self.__class__.__name__)
    def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
--- a/deepdoc/parser/tcadp_parser.py
+++ b/deepdoc/parser/tcadp_parser.py
@ -47,6 +47,7 @@ class TencentCloudAPIClient:
        self.secret_id = secret_id
        self.secret_key = secret_key
        self.region = region
        self.outlines = []
        # Create credentials
        self.cred = credential.Credential(secret_id, secret_key)
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@ -216,6 +216,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            **kwargs
        )
        def _normalize_section(section):
            # pad section to length 3: (txt, sec_id, poss)
            if len(section) == 1:
                section = (section[0], "", [])
            elif len(section) == 2:
                section = (section[0], "", section[1])
            elif len(section) != 3:
                raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
            txt, sec_id, poss = section
            if isinstance(poss, str):
                poss = pdf_parser.extract_positions(poss)
                first = poss[0]          # tuple: ([pn], x1, x2, y1, y2)
                pn = first[0]           
                if isinstance(pn, list):
                    pn = pn[0]           # [pn] -> pn
                    poss[0] = (pn, *first[1:])
            return (txt, sec_id, poss)
        sections = [_normalize_section(sec) for sec in sections]
        if not sections and not tbls:
            return []
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -70,6 +70,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
        callback=callback,
        output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
        backend=os.environ.get("MINERU_BACKEND", "pipeline"),
        server_url=os.environ.get("MINERU_SERVER_URL", ""),
        delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
    )
    return sections, tables, pdf_parser