# Post-patch form of `_normalize_section` from rag/app/manual.py, reconstructed
# from the diff hunk and hardened against empty or immutable inputs.
def _normalize_section(section):
    """Normalize a parsed section to a ``(txt, layoutno, positions)`` triple.

    Accepted shapes of ``section``:

    * a bare value (not a list/tuple)  -> ``(value, "", [])``
    * an empty list/tuple              -> ``("", "", [])``
    * ``(txt,)``                       -> ``(txt, "", [])``
    * ``(txt, positions)``             -> ``(txt, "", positions)``
    * ``(txt, layoutno, positions, ...)`` -> first three items; extras ignored

    ``positions`` given as a string is decoded with
    ``pdf_parser.extract_positions``.  ``pdf_parser`` is not defined in this
    block and must be supplied by the enclosing scope.

    The first position entry is described by the original code as
    ``([pn], x1, x2, y1, y2)``; a list-wrapped page number in that first
    entry is unwrapped to its first element.  Only the first entry is
    rewritten; the remaining entries are passed through untouched.

    Returns:
        tuple: ``(txt, layoutno, positions)`` where ``positions`` is always a
        list (empty when no positions are available).  The caller's own
        positions container is never modified.
    """
    # Treat a bare value as a one-element section so the padding below applies.
    if not isinstance(section, (list, tuple)):
        section = (section,)

    # An empty sequence has no text at all; indexing it would raise IndexError.
    if not section:
        return ("", "", [])

    if len(section) == 1:
        txt, layoutno, poss = section[0], "", []
    elif len(section) == 2:
        txt, layoutno, poss = section[0], "", section[1]
    else:
        txt, layoutno, poss = section[0], section[1], section[2]

    if isinstance(poss, str):
        poss = pdf_parser.extract_positions(poss)

    # None / empty positions collapse to a fresh empty list.
    if not poss:
        return (txt, layoutno, [])

    # Work on a copy: avoids mutating the caller's list and lets tuples
    # (which do not support item assignment) be handled as well.
    poss = list(poss)

    first = poss[0]
    if isinstance(first, (list, tuple)) and first:
        pn = first[0]
        if isinstance(pn, list) and pn:
            pn = pn[0]  # [pn] -> pn
        poss[0] = (pn, *first[1:])

    return (txt, layoutno, poss)