Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 12:32:30 +08:00)
refine table parser (#120)
This commit is contained in:
@ -91,10 +91,10 @@ def dispatch():
|
||||
tsks.append(task)
|
||||
elif r["parser_id"] == "table":
|
||||
rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
|
||||
- for i in range(0, rn, 1000):
|
||||
+ for i in range(0, rn, 3000):
|
||||
task = new_task()
|
||||
task["from_page"] = i
|
||||
- task["to_page"] = min(i + 1000, rn)
|
||||
+ task["to_page"] = min(i + 3000, rn)
|
||||
tsks.append(task)
|
||||
else:
|
||||
tsks.append(new_task())
|
||||
|
||||
@ -128,8 +128,6 @@ def build(row):
|
||||
|
||||
return
|
||||
|
||||
- callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
|
||||
|
||||
docs = []
|
||||
doc = {
|
||||
"doc_id": row["doc_id"],
|
||||
@ -179,8 +177,8 @@ def embedding(docs, mdl, parser_config={}, callback=None):
|
||||
tk_count += c
|
||||
|
||||
cnts_ = np.array([])
|
||||
- for i in range(0, len(cnts), 32):
|
||||
- vts, c = mdl.encode(cnts[i: i+32])
|
||||
+ for i in range(0, len(cnts), 8):
|
||||
+ vts, c = mdl.encode(cnts[i: i+8])
|
||||
if len(cnts_) == 0: cnts_ = vts
|
||||
else: cnts_ = np.concatenate((cnts_, vts), axis=0)
|
||||
tk_count += c
|
||||
@ -226,6 +224,7 @@ def main(comm, mod):
|
||||
continue
|
||||
# TODO: exception handler
|
||||
## set_progress(r["did"], -1, "ERROR: ")
|
||||
+ callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
|
||||
try:
|
||||
tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user