add stream chat (#811)

### What problem does this PR solve?

#709 
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: KevinHuSh
Committed: 2024-05-16 20:14:53 +08:00 (via GitHub)
Commit: 95f809187e (parent: d6772f5dd7)
12 changed files with 353 additions and 102 deletions

@@ -80,7 +80,7 @@ def set_progress(task_id, from_page=0, to_page=-1,
if to_page > 0:
if msg:
msg = f"Page({from_page+1}~{to_page+1}): " + msg
msg = f"Page({from_page + 1}~{to_page + 1}): " + msg
d = {"progress_msg": msg}
if prog is not None:
d["progress"] = prog
@@ -124,7 +124,7 @@ def get_minio_binary(bucket, name):
def build(row):
if row["size"] > DOC_MAXIMUM_SIZE:
set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" %
- (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
+ (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
return []
callback = partial(
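
In the hunk above, `build()` rejects oversized files via `set_progress` and then creates a `callback` with `partial(...)` (the bound arguments are cut off in this view). A minimal sketch of that callback pattern, assuming `set_progress` takes the task id plus optional `prog`/`msg` keywords as in the first hunk; the default values and the `row` fields used here are illustrative, not the project's exact API:

```python
from functools import partial

def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
    # Stand-in for the real implementation, which persists the progress value
    # and message for the task record identified by task_id.
    print(f"[{task_id}] Page({from_page + 1}~{to_page + 1}) prog={prog} msg={msg}")

def make_callback(row):
    # Bind the task identity once, so downstream code only reports prog/msg.
    return partial(set_progress, row["id"], row["from_page"], row["to_page"])

callback = make_callback({"id": "task-1", "from_page": 0, "to_page": 11})
callback(prog=-1, msg="File size exceeds( <= %dMb )" % 128)
callback(prog=0.5, msg="halfway through parsing")
```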
@@ -138,12 +138,12 @@ def build(row):
bucket, name = File2DocumentService.get_minio_address(doc_id=row["doc_id"])
binary = get_minio_binary(bucket, name)
cron_logger.info(
"From minio({}) {}/{}".format(timer()-st, row["location"], row["name"]))
"From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
to_page=row["to_page"], lang=row["language"], callback=callback,
kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
cron_logger.info(
"Chunkking({}) {}/{}".format(timer()-st, row["location"], row["name"]))
"Chunkking({}) {}/{}".format(timer() - st, row["location"], row["name"]))
except TimeoutError as e:
callback(-1, f"Internal server error: Fetch file timeout. Could you try it again.")
cron_logger.error(
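
This hunk fetches the file binary from MinIO, runs the parser's `chunk()` on it, and turns a fetch timeout into a progress-callback error instead of an unhandled exception. A rough sketch of that flow under assumed signatures; `get_binary`, the `chunker` object, the `logger`, and `timer` as `timeit.default_timer` are stand-ins, not the project's exact API:

```python
from timeit import default_timer as timer  # assumed; the diff only shows timer()

def fetch_and_chunk(row, chunker, get_binary, callback, logger):
    # Fetch the stored binary, chunk it, and report failures through callback.
    st = timer()
    try:
        binary = get_binary(row["doc_id"])
        logger.info("From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
        return chunker.chunk(row["name"], binary=binary,
                             from_page=row["from_page"], to_page=row["to_page"],
                             callback=callback)
    except TimeoutError:
        # Surface the timeout to the user via the callback; return nothing so
        # the caller can skip this task (the main() hunks below check for None).
        callback(-1, "Internal server error: Fetch file timeout. Could you try it again.")
        return None
```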
@@ -173,7 +173,7 @@ def build(row):
d.update(ck)
md5 = hashlib.md5()
md5.update((ck["content_with_weight"] +
str(d["doc_id"])).encode("utf-8"))
str(d["doc_id"])).encode("utf-8"))
d["_id"] = md5.hexdigest()
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
@@ -261,7 +261,7 @@ def main():
st = timer()
cks = build(r)
cron_logger.info("Build chunks({}): {:.2f}".format(r["name"], timer()-st))
cron_logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
if cks is None:
continue
if not cks:
@@ -271,7 +271,7 @@ def main():
## set_progress(r["did"], -1, "ERROR: ")
callback(
msg="Finished slicing files(%d). Start to embedding the content." %
- len(cks))
+ len(cks))
st = timer()
try:
tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
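
Here the slicing-finished message is reported and the embedding step runs inside a try block; as the next hunk shows, an embedding failure is reported through the callback and the token count falls back to zero instead of aborting the task. A compact sketch of that guard, with a hypothetical `embed_fn` parameter standing in for the project's `embedding` function:

```python
def embed_with_fallback(embed_fn, cks, embd_mdl, parser_config, callback, logger):
    """Run the embedding step; on failure, report it and fall back to 0 tokens."""
    try:
        return embed_fn(cks, embd_mdl, parser_config, callback)
    except Exception as e:
        # Same shape as the diff: the error reaches the user via the callback
        # and the pipeline continues with a zero token count.
        callback(-1, "Embedding error:{}".format(str(e)))
        logger.error(str(e))
        return 0
```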
@@ -279,19 +279,19 @@ def main():
callback(-1, "Embedding error:{}".format(str(e)))
cron_logger.error(str(e))
tk_count = 0
cron_logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer()-st))
cron_logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
callback(msg="Finished embedding({:.2f})! Start to build index!".format(timer()-st))
callback(msg="Finished embedding({:.2f})! Start to build index!".format(timer() - st))
init_kb(r)
chunk_count = len(set([c["_id"] for c in cks]))
st = timer()
es_r = ""
for b in range(0, len(cks), 32):
- es_r = ELASTICSEARCH.bulk(cks[b:b+32], search.index_name(r["tenant_id"]))
+ es_r = ELASTICSEARCH.bulk(cks[b:b + 32], search.index_name(r["tenant_id"]))
if b % 128 == 0:
callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
cron_logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer()-st))
cron_logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
if es_r:
callback(-1, "Index failure!")
ELASTICSEARCH.deleteByQuery(
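
The indexing loop above writes chunks to Elasticsearch in slices of 32 and bumps the progress from 0.8 toward 0.9 as batches complete; a non-empty bulk result is treated as a failure and the partially built index is cleaned up. A hedged sketch of the batching pattern, with a generic `bulk` callable in place of `ELASTICSEARCH.bulk` (batch size and progress arithmetic copied from the diff, everything else illustrative):

```python
def index_in_batches(cks, index_name, bulk, callback, batch_size=32):
    # Push chunks to the search index in fixed-size slices, reporting progress
    # as batches land; any error string returned by bulk() aborts with prog=-1.
    es_r = ""
    for b in range(0, len(cks), batch_size):
        es_r = bulk(cks[b:b + batch_size], index_name)
        if b % 128 == 0:
            # Progress moves from 0.8 toward 0.9 across all batches.
            callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
    if es_r:
        callback(-1, "Index failure!")
        return False
    return True

# Illustrative use with an in-memory "index" and a print-only callback:
store = []
ok = index_in_batches(
    [{"_id": str(i)} for i in range(100)],
    "ragflow_demo",
    lambda batch, name: store.extend(batch) or "",  # returns "" on success
    lambda prog=None, msg="": print(f"prog={prog} msg={msg}"),
)
print(ok, len(store))
```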
@@ -307,8 +307,7 @@ def main():
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
cron_logger.info(
"Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
r["id"], tk_count, len(cks), timer()-st))
r["id"], tk_count, len(cks), timer() - st))
if __name__ == "__main__":