Resolve the issue with the naive parser (#87)

This commit is contained in:
KevinHuSh
2024-02-29 18:53:02 +08:00
committed by GitHub
parent 28531fc73d
commit 3d4315c42a
7 changed files with 41 additions and 22 deletions

View File

@ -30,7 +30,6 @@ class Pdf(PdfParser):
from timeit import default_timer as timer
start = timer()
start = timer()
self._layouts_rec(zoomin)
callback(0.5, "Layout analysis finished.")
print("paddle layouts:", timer() - start)
@ -102,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
cks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。;!?"))
# wrap up to es documents
for ck in cks: