Refactor parse progress (#3781)
### What problem does this PR solve?

Refactor parse file progress

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
```diff
@@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
     """
 
-    eng = lang.lower() == "english" # is_english(cks)
+    is_english = lang.lower() == "english" # is_english(cks)
     parser_config = kwargs.get(
         "parser_config", {
             "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
@@ -206,8 +206,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     pdf_parser = None
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections, tbls = Docx()(filename, binary)
-        res = tokenize_table(tbls, doc, eng) # just for table
+        sections, tables = Docx()(filename, binary)
+        res = tokenize_table(tables, doc, is_english) # just for table
 
         callback(0.8, "Finish parsing.")
         st = timer()
@@ -220,16 +220,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if kwargs.get("section_only", False):
             return chunks
 
-        res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
+        res.extend(tokenize_chunks_docx(chunks, doc, is_english, images))
         logging.info("naive_merge({}): {}".format(filename, timer() - st))
         return res
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf(
-        ) if parser_config.get("layout_recognize", True) else PlainParser()
-        sections, tbls = pdf_parser(filename if not binary else binary,
-                                    from_page=from_page, to_page=to_page, callback=callback)
-        res = tokenize_table(tbls, doc, eng)
+        pdf_parser = Pdf() if parser_config.get("layout_recognize", True) else PlainParser()
+        sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
+        res = tokenize_table(tables, doc, is_english)
 
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
@@ -248,8 +246,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections, tbls = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
-        res = tokenize_table(tbls, doc, eng)
+        sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
+        res = tokenize_table(tables, doc, is_english)
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
@@ -289,7 +287,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     if kwargs.get("section_only", False):
         return chunks
 
-    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+    res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
     return res
 
```
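For reviewers who want to exercise the change locally, here is a minimal sketch of how the chunker's progress callback is driven, based only on the `chunk(...)` signature and the `callback(0.1, "Start to parse.")` / `callback(0.8, "Finish parsing.")` calls visible in the diff above. The module path, the sample file name, and the print-based callback are illustrative assumptions, not part of this PR.

```python
# Minimal sketch. Assumptions: module path rag.app.naive and a local sample .docx file;
# the print-based callback is illustrative, not RAGFlow's real progress reporter.
from rag.app.naive import chunk


def progress(prog=None, msg=""):
    # chunk() reports progress as a float in [0, 1] plus a message, e.g.
    # callback(0.1, "Start to parse.") ... callback(0.8, "Finish parsing.").
    # Some internal parsers may pass only a message, so prog is optional here.
    if prog is not None:
        print(f"[{prog:.0%}] {msg}")
    else:
        print(msg)


# "sample.docx" is a placeholder; lang="English" drives the flag this PR
# renames from eng to is_english.
chunks = chunk("sample.docx", callback=progress, lang="English")
print(f"{len(chunks)} chunks produced")
```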