Code refactor. (#4291)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
This commit is contained in:
Kevin Hu
2024-12-30 18:38:51 +08:00
committed by GitHub
parent f619d5a9b6
commit 8fb18f37f6
10 changed files with 33 additions and 18 deletions

View File

@ -153,11 +153,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
for txt in Docx()(filename, binary):
sections.append(txt)
callback(0.8, "Finish parsing.")
chunks = sections
return tokenize_chunks(chunks, doc, eng, pdf_parser)
chunks = Docx()(filename, binary)
callback(0.7, "Finish parsing.")
return tokenize_chunks(chunks, doc, eng, None)
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() if kwargs.get(