Feat: add auto parse to connector. (#11099)

### What problem does this PR solve?

#10953

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Kevin Hu
Date: 2025-11-07 16:49:29 +08:00 (committed by GitHub)
Parent: 526ba3388f
Commit: dd1c8c5779
5 changed files with 18 additions and 11 deletions


```diff
@@ -122,7 +122,7 @@ def update():
         if not e:
             return get_data_error_result(
                 message="Database error (Knowledgebase rename)!")
-        errors = Connector2KbService.link_connectors(kb.id, [conn["id"] for conn in connectors], current_user.id)
+        errors = Connector2KbService.link_connectors(kb.id, [conn for conn in connectors], current_user.id)
         if errors:
             logging.error("Link KB errors: ", errors)
         kb = kb.to_dict()
```
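
Note: `update()` now forwards the full connector dicts rather than just their ids, so per-connector settings such as `auto_parse` reach `link_connectors`. A minimal sketch of the connector list shape this implies — only `id` and `auto_parse` are confirmed by the diff, everything else is illustrative:

```python
# Hypothetical connector entries sent to the KB update endpoint.
# Only "id" and "auto_parse" appear in this PR; the values are placeholders.
connectors = [
    {"id": "conn_abc", "auto_parse": "1"},  # parse documents automatically after sync
    {"id": "conn_def", "auto_parse": "0"},  # sync only; parse manually later
]
```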


```diff
@@ -73,7 +73,9 @@ class ConnectorService(CommonService):
             return
         SyncLogsService.filter_delete([SyncLogs.connector_id==connector_id, SyncLogs.kb_id==kb_id])
         docs = DocumentService.query(source_type=f"{conn.source}/{conn.id}")
-        return FileService.delete_docs([d.id for d in docs], tenant_id)
+        err = FileService.delete_docs([d.id for d in docs], tenant_id)
+        SyncLogsService.schedule(connector_id, kb_id, reindex=True)
+        return err


 class SyncLogsService(CommonService):
@@ -226,16 +228,20 @@ class Connector2KbService(CommonService):
     model = Connector2Kb

     @classmethod
-    def link_connectors(cls, kb_id:str, connector_ids: list[str], tenant_id:str):
+    def link_connectors(cls, kb_id:str, connectors: list[dict], tenant_id:str):
         arr = cls.query(kb_id=kb_id)
         old_conn_ids = [a.connector_id for a in arr]
-        for conn_id in connector_ids:
+        connector_ids = []
+        for conn in connectors:
+            conn_id = conn["id"]
+            connector_ids.append(conn_id)
             if conn_id in old_conn_ids:
                 continue
             cls.save(**{
                 "id": get_uuid(),
                 "connector_id": conn_id,
-                "kb_id": kb_id
+                "kb_id": kb_id,
+                "auto_parse": conn.get("auto_parse", "1")
             })
             SyncLogsService.schedule(conn_id, kb_id, reindex=True)
```
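
`link_connectors` now accepts connector dicts, collects the ids itself, and stores `auto_parse` on each new `Connector2Kb` link, defaulting to `"1"` when the caller omits it; the `ConnectorService` unlink path additionally schedules a reindex after deleting the connector's documents. A small usage sketch of the new call shape — the kb, connector, and tenant ids below are placeholders:

```python
# Sketch of calling the new signature; ids are illustrative only.
Connector2KbService.link_connectors(
    kb_id="kb_123",
    connectors=[
        {"id": "conn_abc"},                     # auto_parse defaults to "1"
        {"id": "conn_def", "auto_parse": "0"},  # explicit opt-out
    ],
    tenant_id="tenant_xyz",
)
```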


```diff
@@ -63,7 +63,7 @@ def _convert_message_to_document(
         semantic_identifier=semantic_identifier,
         doc_updated_at=doc_updated_at,
         blob=message.content.encode("utf-8"),
-        extension="txt",
+        extension=".txt",
         size_bytes=len(message.content.encode("utf-8")),
     )
@@ -275,7 +275,7 @@ class DiscordConnector(LoadConnector, PollConnector):
             semantic_identifier=f"{min_updated_at} -> {max_updated_at}",
             doc_updated_at=max_updated_at,
             blob=blob,
-            extension="txt",
+            extension=".txt",
             size_bytes=size_bytes,
         )
```
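
The leading dot presumably brings the Discord connector's `extension` in line with `os.path.splitext`, which always returns extensions with the dot, so any downstream comparison against that form now matches. For reference:

```python
import os

# os.path.splitext keeps the leading dot, so code comparing against its
# output expects ".txt" rather than "txt". The filename here is made up.
_, ext = os.path.splitext("discord_transcript.txt")
assert ext == ".txt"
```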


```diff
@@ -1,6 +1,5 @@
 import logging
 from collections.abc import Generator
-from datetime import datetime, timezone
 from typing import Any, Optional

 from retry import retry
@@ -33,7 +32,7 @@ from common.data_source.utils import (
     batch_generator,
     fetch_notion_data,
     properties_to_str,
-    filter_pages_by_time
+    filter_pages_by_time, datetime_from_string
 )
@@ -293,9 +292,9 @@ class NotionConnector(LoadConnector, PollConnector):
                 blob=blob,
                 source=DocumentSource.NOTION,
                 semantic_identifier=page_title,
-                extension="txt",
+                extension=".txt",
                 size_bytes=len(blob),
-                doc_updated_at=datetime.fromisoformat(page.last_edited_time).astimezone(timezone.utc)
+                doc_updated_at=datetime_from_string(page.last_edited_time)
             )
             if self.recursive_index_enabled and all_child_page_ids:
```
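
The Notion connector now parses `last_edited_time` with `datetime_from_string` from `common.data_source.utils` instead of calling `datetime.fromisoformat` directly. Its implementation is not part of this diff; presumably it behaves roughly like the sketch below, returning a timezone-aware UTC datetime and tolerating the trailing `Z` that Notion timestamps carry (which `fromisoformat` rejects on Python < 3.11):

```python
from datetime import datetime, timezone

def datetime_from_string(value: str) -> datetime:
    """Assumed behaviour only: parse an ISO-8601 string into a UTC-aware datetime."""
    if value.endswith("Z"):
        # fromisoformat() on Python < 3.11 does not accept a trailing "Z".
        value = value[:-1] + "+00:00"
    dt = datetime.fromisoformat(value)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)
```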


```diff
@@ -63,6 +63,8 @@ class SyncBase:
         if task["poll_range_start"]:
             next_update = task["poll_range_start"]
         for document_batch in document_batch_generator:
+            if not document_batch:
+                continue
             min_update = min([doc.doc_updated_at for doc in document_batch])
             max_update = max([doc.doc_updated_at for doc in document_batch])
             next_update = max([next_update, max_update])
```
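
The new guard matters because a connector can yield an empty batch, and `min()`/`max()` on an empty sequence raise `ValueError`, which would abort the whole sync loop:

```python
# Why empty batches must be skipped before computing the update window:
try:
    min([])
except ValueError as e:
    print(e)  # "min() arg is an empty sequence"
```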