optimize srv broker and executor logic (#630)

### What problem does this PR solve?

Optimize the task broker and executor to reduce memory usage and deployment
complexity.

### Type of change
- [x] Performance Improvement
- [x] Refactoring

### Change Log
- Enhance the Redis utils with message-queue support (backed by Redis Streams)
- Rework the task broker around the message queue (1. pull parse events from
the queue; 2. run them asynchronously via a ThreadPoolExecutor); see the sketch after this list
- Rename a table column in document and task (process_duation ->
process_duration, most likely just a spelling mistake)
- Reformat some code style (just what I noticed)
- Add requirement_dev.txt for developers
- Add a Redis container to docker compose
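
For context, the Redis Streams pattern behind the first two items looks roughly like this. A minimal sketch using redis-py; `queue_product`/`queue_consumer` here are illustrative stand-ins for the helpers added to `rag/utils/redis_conn.py`, and the real signatures may differ:

```python
import json
import redis

r = redis.Redis(host="localhost", port=6379, decode_responses=True)

def queue_product(queue_name: str, message: dict) -> None:
    # XADD appends the payload as a stream entry; the stream is created lazily.
    r.xadd(queue_name, {"message": json.dumps(message)})

def queue_consumer(queue_name: str, group: str, consumer: str):
    # Consumer groups let several executors share one stream without
    # processing the same entry twice.
    try:
        r.xgroup_create(queue_name, group, id="0", mkstream=True)
    except redis.exceptions.ResponseError:
        pass  # group already exists
    # Block up to 10s waiting for one entry not yet delivered to this group.
    entries = r.xreadgroup(group, consumer, {queue_name: ">"},
                           count=1, block=10000)
    if not entries:
        return None
    _, messages = entries[0]
    msg_id, fields = messages[0]
    r.xack(queue_name, group, msg_id)  # acknowledge so it is not redelivered
    return json.loads(fields["message"])
```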

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
Authored by Fakai Zhao on 2024-05-07 11:43:33 +08:00; committed by GitHub
parent c6b6c748ae
commit de839fc3f0
20 changed files with 414 additions and 300 deletions

View File

@@ -14,7 +14,6 @@
# limitations under the License
#
import base64
import os
import pathlib
import re
@@ -24,8 +23,10 @@ from elasticsearch_dsl import Q
from flask import request
from flask_login import login_required, current_user
from api.db.db_models import Task
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.task_service import TaskService, queue_tasks
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH
from api.db.services import duplicate_name
@@ -37,7 +38,9 @@ from api.db.services.document_service import DocumentService
from api.settings import RetCode
from api.utils.api_utils import get_json_result
from rag.utils.minio_conn import MINIO
from rag.utils.redis_conn import REDIS_CONN
from api.utils.file_utils import filename_type, thumbnail
from rag.settings import SVR_QUEUE_NAME
@manager.route('/upload', methods=['POST'])
@@ -277,6 +280,14 @@ def run():
return get_data_error_result(retmsg="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
if str(req["run"]) == TaskStatus.RUNNING.value:
TaskService.filter_delete([Task.doc_id == id])
e, doc = DocumentService.get_by_id(id)
doc = doc.to_dict()
doc["tenant_id"] = tenant_id
bucket, name = File2DocumentService.get_minio_address(doc_id=doc["id"])
queue_tasks(doc, bucket, name)
return get_json_result(data=True)
except Exception as e:

View File

@@ -13,17 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from peewee import Expression
import random
from datetime import datetime
from elasticsearch_dsl import Q
from api.utils import current_timestamp
from api.settings import stat_logger
from api.utils import current_timestamp, get_format_time
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.minio_conn import MINIO
from rag.nlp import search
from api.db import FileType, TaskStatus
from api.db.db_models import DB, Knowledgebase, Tenant
from api.db.db_models import DB, Knowledgebase, Tenant, Task
from api.db.db_models import Document
from api.db.services.common_service import CommonService
from api.db.services.knowledgebase_service import KnowledgebaseService
@@ -92,7 +93,7 @@ class DocumentService(CommonService):
@classmethod
@DB.connection_context()
def get_newly_uploaded(cls, tm):
def get_newly_uploaded(cls):
fields = [
cls.model.id,
cls.model.kb_id,
@@ -196,3 +197,55 @@ class DocumentService(CommonService):
on=(Knowledgebase.id == cls.model.kb_id)).where(
Knowledgebase.tenant_id == tenant_id)
return len(docs)
@classmethod
@DB.connection_context()
def begin2parse(cls, docid):
cls.update_by_id(
docid, {"progress": random.random() * 1 / 100.,
"progress_msg": "Task dispatched...",
"process_begin_at": get_format_time()
})
@classmethod
@DB.connection_context()
def update_progress(cls):
docs = cls.get_unfinished_docs()
for d in docs:
try:
tsks = Task.query(doc_id=d["id"], order_by=Task.create_time)
if not tsks:
continue
msg = []
prg = 0
finished = True
bad = 0
status = TaskStatus.RUNNING.value
for t in tsks:
if 0 <= t.progress < 1:
finished = False
prg += t.progress if t.progress >= 0 else 0
msg.append(t.progress_msg)
if t.progress == -1:
bad += 1
prg /= len(tsks)
if finished and bad:
prg = -1
status = TaskStatus.FAIL.value
elif finished:
status = TaskStatus.DONE.value
msg = "\n".join(msg)
info = {
"process_duation": datetime.timestamp(
datetime.now()) -
d["process_begin_at"].timestamp(),
"run": status}
if prg != 0:
info["progress"] = prg
if msg:
info["progress_msg"] = msg
cls.update_by_id(d["id"], info)
except Exception as e:
stat_logger.error("fetch task exception:" + str(e))
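
The new `update_progress` above rolls per-task progress up to the document level: a document's progress is the mean of its tasks' progress values, and once no task is still in flight, a single failed task (progress == -1) marks the whole document as failed. A standalone illustration of that rule (a hypothetical helper, not code from this commit):

```python
def aggregate(task_progresses: list[float]) -> tuple[float, str]:
    """Reduce per-task progress values to document-level (progress, status)."""
    finished = all(not (0 <= p < 1) for p in task_progresses)  # none in flight
    bad = sum(1 for p in task_progresses if p == -1)           # failed tasks
    prg = sum(p for p in task_progresses if p >= 0) / len(task_progresses)
    if finished and bad:
        return -1, "FAIL"
    return (prg, "DONE") if finished else (prg, "RUNNING")

assert aggregate([1.0, 0.5]) == (0.75, "RUNNING")  # still parsing
assert aggregate([1.0, -1.0]) == (-1, "FAIL")      # done, one task failed
```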

View File

@@ -15,21 +15,24 @@
#
import random
from peewee import Expression, JOIN
from api.db.db_utils import bulk_insert_into_db
from deepdoc.parser import PdfParser
from peewee import JOIN
from api.db.db_models import DB, File2Document, File
from api.db import StatusEnum, FileType, TaskStatus
from api.db.db_models import Task, Document, Knowledgebase, Tenant
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.utils import current_timestamp
from api.utils import current_timestamp, get_uuid
from deepdoc.parser.excel_parser import RAGFlowExcelParser
from rag.settings import MINIO, SVR_QUEUE_NAME
from rag.utils.redis_conn import REDIS_CONN
class TaskService(CommonService):
model = Task
@classmethod
@DB.connection_context()
def get_tasks(cls, tm, mod=0, comm=1, items_per_page=1, takeit=True):
def get_tasks(cls, task_id):
fields = [
cls.model.id,
cls.model.doc_id,
@@ -48,28 +51,18 @@ class TaskService(CommonService):
Tenant.img2txt_id,
Tenant.asr_id,
cls.model.update_time]
with DB.lock("get_task", -1):
docs = cls.model.select(*fields) \
.join(Document, on=(cls.model.doc_id == Document.id)) \
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\
.where(
Document.status == StatusEnum.VALID.value,
Document.run == TaskStatus.RUNNING.value,
~(Document.type == FileType.VIRTUAL.value),
cls.model.progress == 0,
#cls.model.update_time >= tm,
#(Expression(cls.model.create_time, "%%", comm) == mod)
)\
.order_by(cls.model.update_time.asc())\
.paginate(0, items_per_page)
docs = list(docs.dicts())
if not docs: return []
if not takeit: return docs
docs = cls.model.select(*fields) \
.join(Document, on=(cls.model.doc_id == Document.id)) \
.join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
.join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id)) \
.where(cls.model.id == task_id)
docs = list(docs.dicts())
if not docs: return []
cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.", progress=random.random()/10.).where(
cls.model.id == docs[0]["id"]).execute()
return docs
cls.model.update(progress_msg=cls.model.progress_msg + "\n" + "Task has been received.",
progress=random.random() / 10.).where(
cls.model.id == docs[0]["id"]).execute()
return docs
@classmethod
@DB.connection_context()
@@ -112,3 +105,55 @@ class TaskService(CommonService):
if "progress" in info:
cls.model.update(progress=info["progress"]).where(
cls.model.id == id).execute()
def queue_tasks(doc, bucket, name):
def new_task():
nonlocal doc
return {
"id": get_uuid(),
"doc_id": doc["id"]
}
tsks = []
if doc["type"] == FileType.PDF.value:
file_bin = MINIO.get(bucket, name)
do_layout = doc["parser_config"].get("layout_recognize", True)
pages = PdfParser.total_page_number(doc["name"], file_bin)
page_size = doc["parser_config"].get("task_page_size", 12)
if doc["parser_id"] == "paper":
page_size = doc["parser_config"].get("task_page_size", 22)
if doc["parser_id"] == "one":
page_size = 1000000000
if not do_layout:
page_size = 1000000000
page_ranges = doc["parser_config"].get("pages")
if not page_ranges:
page_ranges = [(1, 100000)]
for s, e in page_ranges:
s -= 1
s = max(0, s)
e = min(e - 1, pages)
for p in range(s, e, page_size):
task = new_task()
task["from_page"] = p
task["to_page"] = min(p + page_size, e)
tsks.append(task)
elif doc["parser_id"] == "table":
file_bin = MINIO.get(bucket, name)
rn = RAGFlowExcelParser.row_number(
doc["name"], file_bin)
for i in range(0, rn, 3000):
task = new_task()
task["from_page"] = i
task["to_page"] = min(i + 3000, rn)
tsks.append(task)
else:
tsks.append(new_task())
for t in tsks:
REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=t)
bulk_insert_into_db(Task, tsks, True)
DocumentService.begin2parse(doc["id"])
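
`queue_tasks` is the broker's fan-out step: a PDF becomes one task per page window (default `task_page_size` of 12, 22 for the `paper` parser, effectively unbounded for `one` or when layout recognition is off), a `table` document becomes one task per 3,000 rows, and anything else a single task; each task is then pushed onto the stream and bulk-inserted as a `Task` row. Tracing the page math with hypothetical numbers, a 30-page PDF under the defaults yields three tasks:

```python
# 30-page PDF, default task_page_size of 12, default page range (1, 100000)
pages, page_size = 30, 12
s, e = 0, min(100000 - 1, pages)  # the loop normalizes ranges to 0-based
chunks = [(p, min(p + page_size, e)) for p in range(s, e, page_size)]
print(chunks)  # [(0, 12), (12, 24), (24, 30)] -> three parse tasks
```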

View File

@@ -18,10 +18,14 @@ import logging
import os
import signal
import sys
import time
import traceback
from concurrent.futures import ThreadPoolExecutor
from werkzeug.serving import run_simple
from api.apps import app
from api.db.runtime_config import RuntimeConfig
from api.db.services.document_service import DocumentService
from api.settings import (
HOST, HTTP_PORT, access_logger, database_logger, stat_logger,
)
@@ -31,6 +35,16 @@ from api.db.db_models import init_database_tables as init_web_db
from api.db.init_data import init_web_data
from api.versions import get_versions
def update_progress():
while True:
time.sleep(1)
try:
DocumentService.update_progress()
except Exception as e:
stat_logger.error("update_progress exception:" + str(e))
if __name__ == '__main__':
print("""
____ ______ __
@@ -71,6 +85,9 @@
peewee_logger.addHandler(database_logger.handlers[0])
peewee_logger.setLevel(database_logger.level)
thr = ThreadPoolExecutor(max_workers=1)
thr.submit(update_progress)
# start http server
try:
stat_logger.info("RAG Flow http server start...")
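
Putting the pieces together, the executor side implied by these changes blocks on the stream and hands each task to a worker pool, mirroring the background `update_progress` thread above. A sketch under the assumptions already stated (the real consumer lives in the task executor service; queue, group, and consumer names here are illustrative):

```python
from concurrent.futures import ThreadPoolExecutor

def handle(task: dict) -> None:
    # Parse the page range named by the task, then report progress
    # back through TaskService (elided here).
    ...

pool = ThreadPoolExecutor(max_workers=4)
while True:
    # queue_consumer is the Redis Streams helper sketched earlier.
    task = queue_consumer("svr_queue", "svr_task_broker", "executor_0")
    if task:
        pool.submit(handle, task)
```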