Refactor parse progress (#3781)

### What problem does this PR solve?

Refactor file-parse progress handling in the naive `chunk()` parser: rename the ambiguous locals `eng` → `is_english` and `tbls` → `tables`, and collapse the multi-line PDF parser setup and call onto single lines.
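
For context, the parsers on this path report progress through a caller-supplied callback invoked with a completion fraction and a message, as the diff below shows (`callback(0.1, "Start to parse.")`, `callback(0.8, "Finish parsing.")`). Below is a minimal sketch of such a callback; the positional `(prog, msg)` signature is assumed from those call sites, not taken from the project's documented API.

```python
# Minimal progress-callback sketch. The positional (prog, msg) signature is an
# assumption based on the call sites visible in the diff below, not a documented API.
def log_progress(prog: float, msg: str = "") -> None:
    # prog: completion fraction in [0, 1]; msg: human-readable status text.
    print(f"[{prog:.0%}] {msg}")
```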

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
Author: Jin Hai (committed by GitHub)
Date: 2024-12-01 22:28:00 +08:00
Parent: ea84cc2e33
Commit: 08c1a5e1e8
2 changed files with 73 additions and 74 deletions


@@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
     """
-    eng = lang.lower() == "english" # is_english(cks)
+    is_english = lang.lower() == "english" # is_english(cks)
     parser_config = kwargs.get(
         "parser_config", {
             "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
@@ -206,8 +206,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     pdf_parser = None
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections, tbls = Docx()(filename, binary)
-        res = tokenize_table(tbls, doc, eng) # just for table
+        sections, tables = Docx()(filename, binary)
+        res = tokenize_table(tables, doc, is_english) # just for table
         callback(0.8, "Finish parsing.")
         st = timer()
@@ -220,16 +220,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if kwargs.get("section_only", False):
             return chunks
-        res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
+        res.extend(tokenize_chunks_docx(chunks, doc, is_english, images))
         logging.info("naive_merge({}): {}".format(filename, timer() - st))
         return res
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf(
-        ) if parser_config.get("layout_recognize", True) else PlainParser()
-        sections, tbls = pdf_parser(filename if not binary else binary,
-                                    from_page=from_page, to_page=to_page, callback=callback)
-        res = tokenize_table(tbls, doc, eng)
+        pdf_parser = Pdf() if parser_config.get("layout_recognize", True) else PlainParser()
+        sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
+        res = tokenize_table(tables, doc, is_english)
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
@@ -248,8 +246,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections, tbls = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
-        res = tokenize_table(tbls, doc, eng)
+        sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
+        res = tokenize_table(tables, doc, is_english)
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
@@ -289,7 +287,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     if kwargs.get("section_only", False):
         return chunks
-    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+    res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
     return res
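
For reference, a hedged usage sketch of the refactored `chunk()` entry point. The module path `rag.app.naive`, the `lang` and `callback` parameter names (inferred from their use inside the function body, since the hunk header truncates the signature), and the list return shape are assumptions; the `parser_config` defaults are the ones visible in the diff.

```python
# Hypothetical caller of the refactored chunk() function. The module path, the
# lang/callback parameters, and the return shape are assumptions from this diff alone.
from rag.app.naive import chunk

def on_progress(prog, msg=""):
    # Mirrors the callback(0.1, "Start to parse.") / callback(0.8, ...) convention above.
    print(f"parse progress {prog:.0%}: {msg}")

chunks = chunk(
    "example.pdf",
    from_page=0,
    to_page=100000,
    lang="English",            # feeds the is_english flag introduced by this refactor
    callback=on_progress,
    parser_config={            # defaults shown in the diff
        "chunk_token_num": 128,
        "delimiter": "\n!?。;!?",
        "layout_recognize": True,
    },
)
print(f"parsed into {len(chunks)} chunks")
```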