Code refactor. (#4291)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
This commit is contained in:
Kevin Hu
2024-12-30 18:38:51 +08:00
committed by GitHub
parent f619d5a9b6
commit 8fb18f37f6
10 changed files with 33 additions and 18 deletions

View File

@@ -153,11 +153,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
for txt in Docx()(filename, binary):
sections.append(txt)
callback(0.8, "Finish parsing.")
chunks = sections
return tokenize_chunks(chunks, doc, eng, pdf_parser)
chunks = Docx()(filename, binary)
callback(0.7, "Finish parsing.")
return tokenize_chunks(chunks, doc, eng, None)
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() if kwargs.get(

View File

@@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
# set pivot using the most frequent type of title,
# then merge between 2 pivot
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
most_level = max(0, max_lvl - 1)
levels = []
@@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
if re.search(r"\.docx$", filename, re.IGNORECASE):
elif re.search(r"\.docx$", filename, re.IGNORECASE):
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)

View File

@@ -185,7 +185,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
"datetime": "_dt",
"bool": "_kwd"}
for df in dfs:
for n in ["id", "index", "idx"]:
for n in ["id", "_id", "index", "idx"]:
if n in df.columns:
del df[n]
clmns = df.columns.values