optimize srv broker and executor logic (#630)

### What problem does this PR solve? Optimize task broker and executor for reduce memory usage and deployment complexity. ### Type of change - [x] Performance Improvement - [x] Refactoring ### Change Log - Enhance redis utils for message queue(use stream) - Modify task broker logic via message queue (1.get parse event from message queue 2.use ThreadPoolExecutor async executor ) - Modify the table column name of document and task (process_duation -> process_duration maybe just a spelling mistake) - Reformat some code style(just what i see) - Add requirement_dev.txt for developer - Add redis container on docker compose --------- Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2026-01-31 07:36:46 +08:00 · 2024-05-07 11:43:33 +08:00
parent c6b6c748ae
commit de839fc3f0
20 changed files with 414 additions and 300 deletions
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@ -15,21 +15,24 @@
 #
 import random

-from peewee import Expression, JOIN
+from api.db.db_utils import bulk_insert_into_db
+from deepdoc.parser import PdfParser
+from peewee import JOIN
 from api.db.db_models import DB, File2Document, File
 from api.db import StatusEnum, FileType, TaskStatus
 from api.db.db_models import Task, Document, Knowledgebase, Tenant
 from api.db.services.common_service import CommonService
 from api.db.services.document_service import DocumentService
-from api.utils import current_timestamp
+from api.utils import current_timestamp, get_uuid
+from deepdoc.parser.excel_parser import RAGFlowExcelParser
+from rag.settings import MINIO, SVR_QUEUE_NAME
+from rag.utils.redis_conn import REDIS_CONN


 class TaskService(CommonService):
    model = Task

-    @classmethod
-    @DB.connection_context()
-    def get_tasks(cls, tm, mod=0, comm=1, items_per_page=1, takeit=True):
+    def get_tasks(cls, task_id):
        fields = [
            cls.model.id,
            cls.model.doc_id,
@ -48,28 +51,18 @@ class TaskService(CommonService):
            Tenant.img2txt_id,
            Tenant.asr_id,
            cls.model.update_time]
-        with DB.lock("get_task", -1):
-            docs = cls.model.select(*fields) \
-                .join(Document, on=(cls.model.doc_id == Document.id)) \
-                .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
-                .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
-                .where(
-                    Document.status == StatusEnum.VALID.value,
-                    Document.run == TaskStatus.RUNNING.value,
-                    ~(Document.type == FileType.VIRTUAL.value),
-                    cls.model.progress == 0,
-                    #cls.model.update_time >= tm,
-                    #(Expression(cls.model.create_time, "%%", comm) == mod)
-                )\
-                .order_by(cls.model.update_time.asc())\
-                .paginate(0, items_per_page)
-            docs = list(docs.dicts())
-            if not docs: return []
-            if not takeit: return docs
+        docs = cls.model.select(*fields) \
+            .join(Document, on=(cls.model.doc_id == Document.id)) \
+            .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
+            .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id)) \
+            .where(cls.model.id == task_id)
+        docs = list(docs.dicts())
+        if not docs: return []

-            cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.", progress=random.random()/10.).where(
-                cls.model.id == docs[0]["id"]).execute()
-            return docs
+        cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.",
+                         progress=random.random() / 10.).where(
+            cls.model.id == docs[0]["id"]).execute()
+        return docs

    @classmethod
    @DB.connection_context()
@ -112,3 +105,55 @@ class TaskService(CommonService):
            if "progress" in info:
                cls.model.update(progress=info["progress"]).where(
                    cls.model.id == id).execute()
+
+
+def queue_tasks(doc, bucket, name):
+    def new_task():
+        nonlocal doc
+        return {
+            "id": get_uuid(),
+            "doc_id": doc["id"]
+        }
+    tsks = []
+
+    if doc["type"] == FileType.PDF.value:
+        file_bin = MINIO.get(bucket, name)
+        do_layout = doc["parser_config"].get("layout_recognize", True)
+        pages = PdfParser.total_page_number(doc["name"], file_bin)
+        page_size = doc["parser_config"].get("task_page_size", 12)
+        if doc["parser_id"] == "paper":
+            page_size = doc["parser_config"].get("task_page_size", 22)
+        if doc["parser_id"] == "one":
+            page_size = 1000000000
+        if not do_layout:
+            page_size = 1000000000
+        page_ranges = doc["parser_config"].get("pages")
+        if not page_ranges:
+            page_ranges = [(1, 100000)]
+        for s, e in page_ranges:
+            s -= 1
+            s = max(0, s)
+            e = min(e - 1, pages)
+            for p in range(s, e, page_size):
+                task = new_task()
+                task["from_page"] = p
+                task["to_page"] = min(p + page_size, e)
+                tsks.append(task)
+
+    elif doc["parser_id"] == "table":
+        file_bin = MINIO.get(bucket, name)
+        rn = RAGFlowExcelParser.row_number(
+            doc["name"], file_bin)
+        for i in range(0, rn, 3000):
+            task = new_task()
+            task["from_page"] = i
+            task["to_page"] = min(i + 3000, rn)
+            tsks.append(task)
+    else:
+        tsks.append(new_task())
+
+    for t in tsks:
+        REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=t)
+
+    bulk_insert_into_db(Task, tsks, True)
+    DocumentService.begin2parse(doc["id"])