mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Optimize docx handle method in laws parser (#1302)
### What problem does this PR solve?

Optimize the docx handling method in the laws parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -514,11 +514,16 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
||||
|
||||
return cks
|
||||
|
||||
def docx_question_level(p):
|
||||
def docx_question_level(p, bull = -1):
|
||||
if p.style.name.startswith('Heading'):
|
||||
return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
|
||||
else:
|
||||
return 0, re.sub(r"\u3000", " ", p.text).strip()
|
||||
if bull < 0:
|
||||
return 0, re.sub(r"\u3000", " ", p.text).strip()
|
||||
for j, title in enumerate(BULLET_PATTERN[bull]):
|
||||
if re.match(title, re.sub(r"\u3000", " ", p.text).strip()):
|
||||
return j+1, re.sub(r"\u3000", " ", p.text).strip()
|
||||
return 0, re.sub(r"\u3000", " ", p.text).strip()
|
||||
|
||||
def concat_img(img1, img2):
|
||||
if img1 and not img2:
|
||||
|
||||
Reference in New Issue
Block a user