Optimize docx handle method in laws parser (#1302)

### What problem does this PR solve?

Optimize the docx handling method in the laws parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-02 16:45:08 +08:00 · 2024-06-28 17:42:59 +08:00
parent 751447bd4f
commit fc7cc1d36c
2 changed files with 83 additions and 6 deletions
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -514,11 +514,16 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。；！？"):

    return cks

-def docx_question_level(p):
+def docx_question_level(p, bull = -1):
    if p.style.name.startswith('Heading'):
        return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
    else:
-        return 0, re.sub(r"\u3000", " ", p.text).strip()
+        if bull < 0:
+            return 0, re.sub(r"\u3000", " ", p.text).strip()
+        for j, title in enumerate(BULLET_PATTERN[bull]):
+            if re.match(title, re.sub(r"\u3000", " ", p.text).strip()):
+                return j+1, re.sub(r"\u3000", " ", p.text).strip()
+    return 0, re.sub(r"\u3000", " ", p.text).strip()
    
 def concat_img(img1, img2):
    if img1 and not img2: