refine table parser (#120)

This commit is contained in:
KevinHuSh
2024-03-12 18:56:04 +08:00
committed by GitHub
parent f1f09df901
commit 0feb085c88
6 changed files with 20 additions and 15 deletions

View File

@@ -91,10 +91,10 @@ def dispatch():
tsks.append(task)
elif r["parser_id"] == "table":
rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
for i in range(0, rn, 1000):
for i in range(0, rn, 3000):
task = new_task()
task["from_page"] = i
task["to_page"] = min(i + 1000, rn)
task["to_page"] = min(i + 3000, rn)
tsks.append(task)
else:
tsks.append(new_task())

View File

@@ -128,8 +128,6 @@ def build(row):
return
callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
docs = []
doc = {
"doc_id": row["doc_id"],
@@ -179,8 +177,8 @@ def embedding(docs, mdl, parser_config={}, callback=None):
tk_count += c
cnts_ = np.array([])
for i in range(0, len(cnts), 32):
vts, c = mdl.encode(cnts[i: i+32])
for i in range(0, len(cnts), 8):
vts, c = mdl.encode(cnts[i: i+8])
if len(cnts_) == 0: cnts_ = vts
else: cnts_ = np.concatenate((cnts_, vts), axis=0)
tk_count += c
@@ -226,6 +224,7 @@ def main(comm, mod):
continue
# TODO: exception handler
## set_progress(r["did"], -1, "ERROR: ")
callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
try:
tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
except Exception as e: