Feat: add splitter (#10161)

### What problem does this PR solve?

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
2026-02-06 18:45:08 +08:00 · 2025-09-19 10:15:19 +08:00
parent f9c7404bee
commit a1b947ffd6
81 changed files with 3083 additions and 799 deletions
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@ -646,6 +646,7 @@ class Knowledgebase(DataBaseModel):
    vector_similarity_weight = FloatField(default=0.3, index=True)

    parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True)
+    pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
    parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
    pagerank = IntegerField(default=0, index=False)
    status = CharField(max_length=1, null=True, help_text="is it validate(0: wasted, 1: validate)", default="1", index=True)
@ -662,6 +663,7 @@ class Document(DataBaseModel):
    thumbnail = TextField(null=True, help_text="thumbnail base64 string")
    kb_id = CharField(max_length=256, null=False, index=True)
    parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True)
+    pipeline_id = CharField(max_length=32, null=True, help_text="pipleline ID", index=True)
    parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
    source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True)
    type = CharField(max_length=32, null=False, help_text="file extension", index=True)
@ -1020,7 +1022,6 @@ def migrate_db():
        migrate(migrator.add_column("dialog", "meta_data_filter", JSONField(null=True, default={})))
    except Exception:
        pass
-
    try:
        migrate(migrator.alter_column_type("canvas_template", "title", JSONField(null=True, default=dict, help_text="Canvas title")))
    except Exception:
@ -1037,4 +1038,12 @@ def migrate_db():
        migrate(migrator.add_column("canvas_template", "canvas_category", CharField(max_length=32, null=False, default="agent_canvas", help_text="agent_canvas|dataflow_canvas", index=True)))
    except Exception:
        pass
+    try:
+        migrate(migrator.add_column("knowledgebase", "pipeline_id", CharField(max_length=32, null=True, help_text="default parser ID", index=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("document", "pipeline_id", CharField(max_length=32, null=True, help_text="default parser ID", index=True)))
+    except Exception:
+        pass
    logging.disable(logging.NOTSET)
--- a/api/db/services/canvas_service.py
+++ b/api/db/services/canvas_service.py
@ -95,7 +95,7 @@ class UserCanvasService(CommonService):
    @DB.connection_context()
    def get_by_tenant_ids(cls, joined_tenant_ids, user_id,
                          page_number, items_per_page,
-                          orderby, desc, keywords, canvas_category=CanvasCategory.Agent,
+                          orderby, desc, keywords, canvas_category=None
                          ):
        fields = [
            cls.model.id,
@ -122,7 +122,8 @@ class UserCanvasService(CommonService):
                                                                TenantPermission.TEAM.value)) | (
                    cls.model.user_id == user_id))
            )
-        agents = agents.where(cls.model.canvas_category == canvas_category)
+        if canvas_category:
+            agents = agents.where(cls.model.canvas_category == canvas_category)
        if desc:
            agents = agents.order_by(cls.model.getter_by(orderby).desc())
        else:
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@ -24,7 +24,7 @@ from io import BytesIO

 import trio
 import xxhash
-from peewee import fn
+from peewee import fn, Case

 from api import settings
 from api.constants import IMG_BASE64_PREFIX, FILE_NAME_LEN_LIMIT
@ -674,6 +674,53 @@ class DocumentService(CommonService):
        return False


+    @classmethod
+    @DB.connection_context()
+    def knowledgebase_basic_info(cls, kb_id: str) -> dict[str, int]:
+        """Count the documents of one knowledge base by parsing state.
+
+        Documents whose ``run`` equals ``TaskStatus.CANCEL`` are counted on
+        their own, whatever their progress. All remaining documents are
+        bucketed by ``progress``: 1 is finished, -1 is failed, and
+        0 <= progress < 1 is processing.
+
+        Args:
+            kb_id: ID of the knowledge base whose documents are counted.
+
+        Returns:
+            A dict with integer counts under the keys "processing",
+            "finished", "failed" and "cancelled".
+        """
+        doc = cls.model
+        in_kb = doc.kb_id == kb_id
+
+        def tally(condition):
+            # SUM(CASE WHEN condition THEN 1 ELSE 0 END); COALESCE maps the
+            # NULL that SUM yields over zero rows to 0.
+            return fn.COALESCE(fn.SUM(Case(None, [(condition, 1)], 0)), 0)
+
+        # cancelled: selected by run status alone, progress can vary
+        cancelled = (
+            doc.select(fn.COUNT(1))
+            .where(in_kb & (doc.run == TaskStatus.CANCEL))
+            .scalar()
+        )
+
+        # processing: 0 <= progress < 1
+        in_progress = (doc.progress == 0) | ((doc.progress > 0) & (doc.progress < 1))
+        # A NULL run does not satisfy `run != CANCEL` in SQL, so it is admitted explicitly.
+        not_cancelled = (doc.run.is_null(True)) | (doc.run != TaskStatus.CANCEL)
+
+        query = doc.select(
+            tally(doc.progress == 1).alias("finished"),
+            tally(doc.progress == -1).alias("failed"),
+            tally(in_progress).alias("processing"),
+        ).where(in_kb & not_cancelled)
+        counts = query.dicts().get()
+
+        summary = {name: int(counts[name]) for name in ("processing", "finished", "failed")}
+        summary["cancelled"] = int(cancelled)
+        return summary
+
+
 def queue_raptor_o_graphrag_tasks(doc, ty, priority):
    chunking_config = DocumentService.get_chunking_config(doc["id"])
    hasher = xxhash.xxh64()
@ -702,6 +749,8 @@ def queue_raptor_o_graphrag_tasks(doc, ty, priority):

 def get_queue_length(priority):
+    """Return the "lag" value reported for the server consumer group on the queue for ``priority``.
+
+    The queue name comes from ``get_svr_queue_name(priority)`` and the group
+    is ``SVR_CONSUMER_GROUP_NAME``. Returns 0 when no group info is available
+    or when "lag" is missing or falsy.
+    NOTE(review): "lag" is presumably the Redis stream consumer-group backlog
+    (entries not yet delivered) -- confirm against ``REDIS_CONN.queue_info``.
+    """
    group_info = REDIS_CONN.queue_info(get_svr_queue_name(priority), SVR_CONSUMER_GROUP_NAME)
+    # queue_info can yield nothing usable; treat that as an empty queue
+    # instead of failing on ``.get`` below.
+    if not group_info:
+        return 0
    return int(group_info.get("lag", 0) or 0)


@ -847,3 +896,4 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
            doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)

    return [d["id"] for d, _ in files]
+
--- a/api/db/services/file_service.py
+++ b/api/db/services/file_service.py
@ -440,6 +440,7 @@ class FileService(CommonService):
                    "id": doc_id,
                    "kb_id": kb.id,
                    "parser_id": self.get_parser(filetype, filename, kb.parser_id),
+                    "pipeline_id": kb.pipeline_id,
                    "parser_config": kb.parser_config,
                    "created_by": user_id,
                    "type": filetype,
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@ -472,19 +472,19 @@ def has_canceled(task_id):
    return False


-def queue_dataflow(dsl:str, tenant_id:str, doc_id:str, task_id:str, flow_id:str, priority: int, callback=None) -> tuple[bool, str]:
+def queue_dataflow(dsl:str, tenant_id:str, task_id:str, flow_id:str=None, doc_id:str=None, file:dict=None, priority: int=0, callback=None) -> tuple[bool, str]:
    """
    Returns a tuple (success: bool, error_message: str).
    """
    _ = callback

    task = dict(
-    id=get_uuid() if not task_id else task_id,
-    doc_id=doc_id,
-    from_page=0,
-    to_page=100000000,
-    task_type="dataflow",
-    priority=priority,
+        id=get_uuid() if not task_id else task_id,
+        doc_id=doc_id,
+        from_page=0,
+        to_page=100000000,
+        task_type="dataflow",
+        priority=priority,
    )

    TaskService.model.delete().where(TaskService.model.id == task["id"]).execute()
@ -499,6 +499,7 @@ def queue_dataflow(dsl:str, tenant_id:str, doc_id:str, task_id:str, flow_id:str,
    task["task_type"] = "dataflow"
    task["dsl"] = dsl
    task["dataflow_id"] = get_uuid() if not flow_id else flow_id
+    task["file"] = file

    if not REDIS_CONN.queue_product(
        get_svr_queue_name(priority), message=task