From 7719fd6350b66ff6cee81a287b2981a81ac45c9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=91=E5=8D=BF?= <121151546+shaoqing404@users.noreply.github.com> Date: Fri, 5 Dec 2025 19:25:45 +0800 Subject: [PATCH] Fix MinerU API sanitized-output lookup and manual chunk tuple handling (#11702) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? This PR addresses **two independent issues** encountered when using the MinerU engine in RAGFlow: 1. **MinerU API output path mismatch for non-ASCII filenames** MinerU sanitizes the root directory name inside the returned ZIP when the original filename contains non-ASCII characters (e.g., Chinese). RAGFlow's client-side unzip logic assumed the original filename stem and therefore failed to locate `_content_list.json`. This PR adds: * root-directory detection * fallback lookup using sanitized names * a broadened `_read_output` search with a glob fallback, ensuring that output files are consistently located regardless of filename encoding. 2. **Chunker crash due to tuple-structure mismatch in manual mode** Some parsers (e.g., MinerU / Docling) return **2-tuple sections**, but RAGFlow's chunker expects **3-tuple sections**, leading to: `ValueError: not enough values to unpack (expected 3, got 2)` This PR normalizes all sections to a uniform structure `(text, layout, positions)`: * parse position tags when present * default to empty positions when missing, preserving backward compatibility and preventing crashes. 
### Type of change * [x] Bug Fix (non-breaking change which fixes an issue) [#11136](https://github.com/infiniflow/ragflow/issues/11136) [#11700](https://github.com/infiniflow/ragflow/issues/11700) [#11620](https://github.com/infiniflow/ragflow/issues/11620) [#11701](https://github.com/infiniflow/ragflow/pull/11701) We need your help, [yongtenglei](https://github.com/yongtenglei). --------- Co-authored-by: Kevin Hu --- rag/app/manual.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/rag/app/manual.py b/rag/app/manual.py index 1eb86a043..363c6e9e7 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -219,23 +219,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, ) def _normalize_section(section): - # pad section to length 3: (txt, sec_id, poss) - if len(section) == 1: + # Pad/normalize to (txt, layout, positions) + if not isinstance(section, (list, tuple)): + section = (section, "", []) + elif len(section) == 1: section = (section[0], "", []) elif len(section) == 2: section = (section[0], "", section[1]) - elif len(section) != 3: - raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})") + else: + section = (section[0], section[1], section[2]) txt, layoutno, poss = section if isinstance(poss, str): poss = pdf_parser.extract_positions(poss) - first = poss[0] # tuple: ([pn], x1, x2, y1, y2) - pn = first[0] - - if isinstance(pn, list): - pn = pn[0] # [pn] -> pn + if poss: + first = poss[0] # tuple: ([pn], x1, x2, y1, y2) + pn = first[0] + if isinstance(pn, list) and pn: + pn = pn[0] # [pn] -> pn poss[0] = (pn, *first[1:]) + if not poss: + poss = [] return (txt, layoutno, poss)