Synchronize with enterprise version (#4325)

### Type of change

- [x] Refactoring

This commit is contained in:
Yingfeng
2025-01-02 13:44:44 +08:00
committed by GitHub
parent 564277736a
commit 50f209204e
6 changed files with 94 additions and 69 deletions

View File

@ -9,7 +9,7 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": True})
"chunk_token_num": 512, "delimiter": "\n!?;。;!?", "layout_recognize": True})
eng = lang.lower() == "english"
parser_config["layout_recognize"] = True
@ -29,4 +29,4 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
chunks.extend(tokenize_chunks(sections, doc, eng))
return chunks
return chunks

View File

@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
elif re.search(r"\.docx$", filename, re.IGNORECASE):
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
@ -279,4 +279,4 @@ if __name__ == "__main__":
pass
chunk(sys.argv[1], callback=dummy)
chunk(sys.argv[1], callback=dummy)