Synchronize with enterprise version (#4325)

### Type of change

- [x] Refactoring
This commit is contained in:
Yingfeng
2025-01-02 13:44:44 +08:00
committed by GitHub
parent 564277736a
commit 50f209204e
6 changed files with 94 additions and 69 deletions

View File

@@ -9,7 +9,7 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
parser_config = kwargs.get(
"parser_config", {
-"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": True})
+"chunk_token_num": 512, "delimiter": "\n!?;。;!?", "layout_recognize": True})
eng = lang.lower() == "english"
parser_config["layout_recognize"] = True
@@ -29,4 +29,4 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
chunks.extend(tokenize_chunks(sections, doc, eng))
-return chunks
+return chunks