Code refactor. (#4291)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
This commit is contained in:
Kevin Hu
2024-12-30 18:38:51 +08:00
committed by GitHub
parent f619d5a9b6
commit 8fb18f37f6
10 changed files with 33 additions and 18 deletions

View File

@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
# set pivot using the most frequent type of title,
# then merge between 2 pivot
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
most_level = max(0, max_lvl - 1)
levels = []
@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
if re.search(r"\.docx$", filename, re.IGNORECASE):
elif re.search(r"\.docx$", filename, re.IGNORECASE):
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)