Code refactor. (#4291)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
2026-02-02 08:35:08 +08:00 · 2024-12-30 18:38:51 +08:00
parent f619d5a9b6
commit 8fb18f37f6
10 changed files with 33 additions and 18 deletions
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
        # set pivot using the most frequent type of title,
        # then merge between 2 pivot
-        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
+        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
            max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
            most_level = max(0, max_lvl - 1)
            levels = []
@@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
        return res

-    if re.search(r"\.docx$", filename, re.IGNORECASE):
+    elif re.search(r"\.docx$", filename, re.IGNORECASE):
        docx_parser = Docx()
        ti_list, tbls = docx_parser(filename, binary,
                                    from_page=0, to_page=10000, callback=callback)