def _normalize_section(section):
    """Pad a parsed section to the canonical ``(txt, sec_id, poss)`` triple.

    Accepted input shapes:

    * ``(txt,)``               -> ``(txt, "", [])``
    * ``(txt, poss)``          -> ``(txt, "", poss)``
    * ``(txt, sec_id, poss)``  -> returned with ``poss`` normalized

    When ``poss`` is a string it is converted with
    ``pdf_parser.extract_positions``. ``pdf_parser`` is not a parameter; it
    is resolved from the enclosing scope. If the first extracted position
    carries its page number wrapped in a list (``([pn], x1, x2, y1, y2)``),
    that page number is unwrapped so the entry becomes
    ``(pn, x1, x2, y1, y2)``.

    Args:
        section: Sequence of length 1, 2 or 3 as described above.

    Returns:
        A ``(txt, sec_id, poss)`` tuple.

    Raises:
        ValueError: If ``section`` does not have 1, 2 or 3 elements.
    """
    size = len(section)
    if size == 1:
        section = (section[0], "", [])
    elif size == 2:
        section = (section[0], "", section[1])
    elif size != 3:
        raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")

    txt, sec_id, poss = section
    if isinstance(poss, str):
        poss = pdf_parser.extract_positions(poss)
        # A tag string may yield no positions at all; indexing poss[0]
        # unconditionally would raise IndexError and abort chunking.
        if poss:
            first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
            pn = first[0]
            # Only unwrap a non-empty page list; an empty one is left as is
            # rather than failing on pn[0].
            if isinstance(pn, list) and pn:
                poss[0] = (pn[0], *first[1:])  # [pn] -> pn

    return (txt, sec_id, poss)