change callback strategy, add timezone to docker (#96)

This commit is contained in:
KevinHuSh
2024-03-05 12:08:41 +08:00
committed by GitHub
parent 59d8442d0d
commit 8a57f2afd5
15 changed files with 101 additions and 53 deletions

View File

@ -46,7 +46,7 @@ def collect(tm):
def set_dispatching(docid):
try:
DocumentService.update_by_id(
docid, {"progress": random.randint(0, 3) / 100.,
docid, {"progress": random.random()*1 / 100.,
"progress_msg": "Task dispatched...",
"process_begin_at": get_format_time()
})

View File

@ -72,7 +72,8 @@ def set_progress(task_id, from_page=0, to_page=-1,
prog = -1
if to_page > 0:
msg = f"Page({from_page}~{to_page}): " + msg
if msg:
msg = f"Page({from_page}~{to_page}): " + msg
d = {"progress_msg": msg}
if prog is not None:
d["progress"] = prog
@ -168,7 +169,7 @@ def init_kb(row):
open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
def embedding(docs, mdl, parser_config={}):
def embedding(docs, mdl, parser_config={}, callback=None):
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
d["content_with_weight"] for d in docs]
tk_count = 0
@ -176,8 +177,14 @@ def embedding(docs, mdl, parser_config={}):
tts, c = mdl.encode(tts)
tk_count += c
cnts, c = mdl.encode(cnts)
tk_count += c
cnts_ = []
for i in range(0, len(cnts), 32):
vts, c = mdl.encode(cnts[i: i+32])
cnts_.extend(vts)
tk_count += c
callback(msg="")
cnts = cnts_
title_w = float(parser_config.get("filename_embd_weight", 0.1))
vects = (title_w * tts + (1 - title_w) *
cnts) if len(tts) == len(cnts) else cnts
@ -218,10 +225,11 @@ def main(comm, mod):
# TODO: exception handler
## set_progress(r["did"], -1, "ERROR: ")
try:
tk_count = embedding(cks, embd_mdl, r["parser_config"])
tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
except Exception as e:
callback(-1, "Embedding error:{}".format(str(e)))
cron_logger.error(str(e))
tk_count = 0
callback(msg="Finished embedding! Start to build index!")
init_kb(r)