Optimize docx handle method in laws parser (#1302)

### What problem does this PR solve?

Optimize the DOCX handling method in the laws parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Zhedong Cen
2024-06-28 17:42:59 +08:00
committed by GitHub
parent 751447bd4f
commit fc7cc1d36c
2 changed files with 83 additions and 6 deletions

View File

@ -514,11 +514,16 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。"):
return cks
def docx_question_level(p, bull=-1):
    """Return ``(level, text)`` for a DOCX paragraph.

    The text is ``p.text`` with every ideographic space (U+3000) replaced by
    an ASCII space and surrounding whitespace stripped.

    The level is chosen as follows:

    * If ``p.style.name`` starts with ``'Heading'``, the level is the integer
      parsed from the last space-separated token of the style name
      (e.g. ``'Heading 2'`` -> ``2``).
    * Otherwise, if ``bull`` is negative, the level is ``0``.
    * Otherwise each pattern in ``BULLET_PATTERN[bull]`` is tried in order
      with ``re.match`` against the normalized text; the first match at
      index ``j`` yields level ``j + 1``. No match yields ``0``.

    Args:
        p: Paragraph-like object exposing ``text`` and ``style.name``
            (presumably a python-docx paragraph -- confirm against callers).
        bull: Index into the module-level ``BULLET_PATTERN`` (not defined in
            this block); a negative value disables bullet matching.

    Returns:
        A ``(level, text)`` tuple.

    Raises:
        ValueError: If the style name starts with ``'Heading'`` but its last
            space-separated token is not an integer.
    """
    # Normalize once; the result is needed on every return path.
    text = re.sub(r"\u3000", " ", p.text).strip()

    if p.style.name.startswith('Heading'):
        return int(p.style.name.split(' ')[-1]), text

    if bull < 0:
        return 0, text

    # Pattern position determines the level: earlier patterns are shallower.
    for j, title in enumerate(BULLET_PATTERN[bull]):
        if re.match(title, text):
            return j + 1, text
    return 0, text
def concat_img(img1, img2):
if img1 and not img2: