Code refactor. (#4291)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
This commit is contained in:
Kevin Hu
2024-12-30 18:38:51 +08:00
committed by GitHub
parent f619d5a9b6
commit 8fb18f37f6
10 changed files with 33 additions and 18 deletions

View File

@@ -153,11 +153,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
for txt in Docx()(filename, binary):
sections.append(txt)
callback(0.8, "Finish parsing.")
chunks = sections
return tokenize_chunks(chunks, doc, eng, pdf_parser)
chunks = Docx()(filename, binary)
callback(0.7, "Finish parsing.")
return tokenize_chunks(chunks, doc, eng, None)
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() if kwargs.get(

View File

@@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
# set pivot using the most frequent type of title,
# then merge between 2 pivot
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
most_level = max(0, max_lvl - 1)
levels = []
@@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
if re.search(r"\.docx$", filename, re.IGNORECASE):
elif re.search(r"\.docx$", filename, re.IGNORECASE):
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)

View File

@@ -185,7 +185,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
"datetime": "_dt",
"bool": "_kwd"}
for df in dfs:
for n in ["id", "index", "idx"]:
for n in ["id", "_id", "index", "idx"]:
if n in df.columns:
del df[n]
clmns = df.columns.values