mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Fix: manual parser with mineru (#11336)
### What problem does this PR solve?

- Fix: manual parser with mineru #11320
- Fix: missing parameter in mineru #11334
- Fix: add outlines parameter for pdf parsers

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -61,7 +61,9 @@ class DoclingParser(RAGFlowPdfParser):
|
|||||||
self.page_images: list[Image.Image] = []
|
self.page_images: list[Image.Image] = []
|
||||||
self.page_from = 0
|
self.page_from = 0
|
||||||
self.page_to = 10_000
|
self.page_to = 10_000
|
||||||
|
self.outlines = []
|
||||||
|
|
||||||
|
|
||||||
def check_installation(self) -> bool:
|
def check_installation(self) -> bool:
|
||||||
if DocumentConverter is None:
|
if DocumentConverter is None:
|
||||||
self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
|
self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
|
||||||
|
|||||||
@ -59,6 +59,7 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
self.mineru_api = mineru_api.rstrip("/")
|
self.mineru_api = mineru_api.rstrip("/")
|
||||||
self.mineru_server_url = mineru_server_url.rstrip("/")
|
self.mineru_server_url = mineru_server_url.rstrip("/")
|
||||||
self.using_api = False
|
self.using_api = False
|
||||||
|
self.outlines = []
|
||||||
self.logger = logging.getLogger(self.__class__.__name__)
|
self.logger = logging.getLogger(self.__class__.__name__)
|
||||||
|
|
||||||
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
||||||
|
|||||||
@ -47,6 +47,7 @@ class TencentCloudAPIClient:
|
|||||||
self.secret_id = secret_id
|
self.secret_id = secret_id
|
||||||
self.secret_key = secret_key
|
self.secret_key = secret_key
|
||||||
self.region = region
|
self.region = region
|
||||||
|
self.outlines = []
|
||||||
|
|
||||||
# Create credentials
|
# Create credentials
|
||||||
self.cred = credential.Credential(secret_id, secret_key)
|
self.cred = credential.Credential(secret_id, secret_key)
|
||||||
|
|||||||
@ -216,6 +216,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _normalize_section(section):
    """Normalize a parsed section to a ``(txt, sec_id, poss)`` 3-tuple.

    Accepted input shapes:
      * ``(txt,)``               -> ``(txt, "", [])``
      * ``(txt, poss)``          -> ``(txt, "", poss)``
      * ``(txt, sec_id, poss)``  -> returned with the same three fields

    When ``poss`` is a string it is converted with
    ``pdf_parser.extract_positions`` (``pdf_parser`` comes from the
    enclosing scope). If the first extracted position carries its page
    number wrapped in a list (``([pn], x1, x2, y1, y2)``), that first entry
    is rewritten to ``(pn, x1, x2, y1, y2)``. Only the first entry is
    unwrapped, matching the original behavior.

    Raises:
        ValueError: if ``section`` does not have 1, 2 or 3 elements.
    """
    # pad section to length 3: (txt, sec_id, poss)
    if len(section) == 1:
        section = (section[0], "", [])
    elif len(section) == 2:
        section = (section[0], "", section[1])
    elif len(section) != 3:
        raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")

    txt, sec_id, poss = section
    if isinstance(poss, str):
        poss = pdf_parser.extract_positions(poss)
        # The extracted list may be empty; indexing poss[0] unconditionally
        # would raise IndexError, so only unwrap when there is a first entry.
        if poss:
            first = poss[0]  # tuple: ([pn], x1, x2, y1, y2)
            pn = first[0]
            if isinstance(pn, list) and pn:
                # [pn] -> pn; build a new list rather than assigning into
                # the helper's return value, so any sequence type works.
                poss = [(pn[0], *first[1:]), *poss[1:]]

    return (txt, sec_id, poss)
|
||||||
|
|
||||||
|
|
||||||
|
sections = [_normalize_section(sec) for sec in sections]
|
||||||
|
|
||||||
if not sections and not tbls:
|
if not sections and not tbls:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|||||||
@ -70,6 +70,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
|||||||
callback=callback,
|
callback=callback,
|
||||||
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
|
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
|
||||||
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
|
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
|
||||||
|
server_url=os.environ.get("MINERU_SERVER_URL", ""),
|
||||||
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
|
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
|
||||||
)
|
)
|
||||||
return sections, tables, pdf_parser
|
return sections, tables, pdf_parser
|
||||||
|
|||||||
Reference in New Issue
Block a user