mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Code refactor. (#4291)
### What problem does this PR solve?

### Type of change

- [x] Refactoring
This commit is contained in:
@ -153,11 +153,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
|
||||
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
for txt in Docx()(filename, binary):
|
||||
sections.append(txt)
|
||||
callback(0.8, "Finish parsing.")
|
||||
chunks = sections
|
||||
return tokenize_chunks(chunks, doc, eng, pdf_parser)
|
||||
chunks = Docx()(filename, binary)
|
||||
callback(0.7, "Finish parsing.")
|
||||
return tokenize_chunks(chunks, doc, eng, None)
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get(
|
||||
|
||||
@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
|
||||
# set pivot using the most frequent type of title,
|
||||
# then merge between 2 pivot
|
||||
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
|
||||
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
|
||||
max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
|
||||
most_level = max(0, max_lvl - 1)
|
||||
levels = []
|
||||
@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
||||
return res
|
||||
|
||||
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||
elif re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||
docx_parser = Docx()
|
||||
ti_list, tbls = docx_parser(filename, binary,
|
||||
from_page=0, to_page=10000, callback=callback)
|
||||
|
||||
@ -185,7 +185,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
|
||||
"datetime": "_dt",
|
||||
"bool": "_kwd"}
|
||||
for df in dfs:
|
||||
for n in ["id", "index", "idx"]:
|
||||
for n in ["id", "_id", "index", "idx"]:
|
||||
if n in df.columns:
|
||||
del df[n]
|
||||
clmns = df.columns.values
|
||||
|
||||
Reference in New Issue
Block a user