fix task canceling bug (#98)

2026-01-31 23:55:06 +08:00 · 2024-03-05 16:33:47 +08:00
parent 07d76ea18d
commit 602038ac49
11 changed files with 24 additions and 15 deletions
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -28,7 +28,7 @@ class Pdf(PdfParser):
            from_page,
            to_page,
            callback)
-        callback("OCR finished")
+        callback(msg="OCR finished")

        from timeit import default_timer as timer
        start = timer()
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -57,7 +57,7 @@ class Pdf(PdfParser):
            to_page,
            callback
        )
-        callback("OCR finished")
+        callback(msg="OCR finished")

        from timeit import default_timer as timer
        start = timer()
@@ -135,6 +135,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca

 if __name__ == "__main__":
    import sys
-    def dummy(a, b):
+    def dummy(prog=None, msg=""):
        pass
    chunk(sys.argv[1], callback=dummy)
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -22,7 +22,7 @@ class Pdf(PdfParser):
            to_page,
            callback
        )
-        callback("OCR finished.")
+        callback(msg="OCR finished.")

        from timeit import default_timer as timer
        start = timer()
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -29,7 +29,7 @@ class Pdf(PdfParser):
            to_page,
            callback
        )
-        callback("OCR finished")
+        callback(msg="OCR finished")

        from timeit import default_timer as timer
        start = timer()
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@@ -36,7 +36,7 @@ class Pdf(PdfParser):
            to_page,
            callback
        )
-        callback("OCR finished.")
+        callback(msg="OCR finished.")

        from timeit import default_timer as timer
        start = timer()
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -305,8 +305,15 @@ class Dealer:
                "similarity": sim[i],
                "vector_similarity": vsim[i],
                "term_similarity": tsim[i],
-                "vector": self.trans2floats(sres.field[id].get("q_%d_vec" % dim, "\t".join(["0"] * dim)))
+                "vector": self.trans2floats(sres.field[id].get("q_%d_vec" % dim, "\t".join(["0"] * dim))),
+                "positions": sres.field[id].get("position_int", "").split("\t")
            }
+            if len(d["positions"]) % 5 == 0:
+                poss = []
+                for i in range(0, len(d["positions"]), 5):
+                    poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
+                                 float(d["positions"][i + 3]), float(d["positions"][i + 4])])
+                d["positions"] = poss
            ranks["chunks"].append(d)
            if dnm not in ranks["doc_aggs"]:
                ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -25,6 +25,7 @@ import traceback
 from functools import partial
 from timeit import default_timer as timer

+import numpy as np
 from elasticsearch_dsl import Q

 from api.db.services.task_service import TaskService
@@ -177,10 +178,11 @@ def embedding(docs, mdl, parser_config={}, callback=None):
        tts, c = mdl.encode(tts)
        tk_count += c

-    cnts_ = []
+    cnts_ = np.array([])
    for i in range(0, len(cnts), 32):
        vts, c = mdl.encode(cnts[i: i+32])
-        cnts_.extend(vts)
+        if len(cnts_) == 0: cnts_ = vts
+        else: cnts_ = np.concatenate((cnts_, vts), axis=0)
        tk_count += c
        callback(msg="")
    cnts = cnts_