diff --git a/api/apps/kb_app.py b/api/apps/kb_app.py
index 99b014ea0..f7b385c11 100644
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@@ -122,7 +122,7 @@ def update():
         if not e:
             return get_data_error_result(
                 message="Database error (Knowledgebase rename)!")
-        errors = Connector2KbService.link_connectors(kb.id, [conn["id"] for conn in connectors], current_user.id)
+        errors = Connector2KbService.link_connectors(kb.id, [conn for conn in connectors], current_user.id)
         if errors:
             logging.error("Link KB errors: ", errors)
         kb = kb.to_dict()
diff --git a/api/db/services/connector_service.py b/api/db/services/connector_service.py
index daca5dd7a..5d02f058f 100644
--- a/api/db/services/connector_service.py
+++ b/api/db/services/connector_service.py
@@ -73,7 +73,9 @@ class ConnectorService(CommonService):
             return SyncLogsService.filter_delete([SyncLogs.connector_id==connector_id, SyncLogs.kb_id==kb_id])

         docs = DocumentService.query(source_type=f"{conn.source}/{conn.id}")
-        return FileService.delete_docs([d.id for d in docs], tenant_id)
+        err = FileService.delete_docs([d.id for d in docs], tenant_id)
+        SyncLogsService.schedule(connector_id, kb_id, reindex=True)
+        return err


 class SyncLogsService(CommonService):
@@ -226,16 +228,20 @@ class Connector2KbService(CommonService):
     model = Connector2Kb

     @classmethod
-    def link_connectors(cls, kb_id:str, connector_ids: list[str], tenant_id:str):
+    def link_connectors(cls, kb_id:str, connectors: list[dict], tenant_id:str):
         arr = cls.query(kb_id=kb_id)
         old_conn_ids = [a.connector_id for a in arr]
-        for conn_id in connector_ids:
+        connector_ids = []
+        for conn in connectors:
+            conn_id = conn["id"]
+            connector_ids.append(conn_id)
             if conn_id in old_conn_ids:
                 continue
             cls.save(**{
                 "id": get_uuid(),
                 "connector_id": conn_id,
-                "kb_id": kb_id
+                "kb_id": kb_id,
+                "auto_parse": conn.get("auto_parse", "1")
             })
             SyncLogsService.schedule(conn_id, kb_id, reindex=True)

diff --git a/common/data_source/discord_connector.py b/common/data_source/discord_connector.py
index bd64fc680..93a0477b0 100644
--- a/common/data_source/discord_connector.py
+++ b/common/data_source/discord_connector.py
@@ -63,7 +63,7 @@ def _convert_message_to_document(
         semantic_identifier=semantic_identifier,
         doc_updated_at=doc_updated_at,
         blob=message.content.encode("utf-8"),
-        extension="txt",
+        extension=".txt",
         size_bytes=len(message.content.encode("utf-8")),
     )

@@ -275,7 +275,7 @@ class DiscordConnector(LoadConnector, PollConnector):
                 semantic_identifier=f"{min_updated_at} -> {max_updated_at}",
                 doc_updated_at=max_updated_at,
                 blob=blob,
-                extension="txt",
+                extension=".txt",
                 size_bytes=size_bytes,
             )

diff --git a/common/data_source/notion_connector.py b/common/data_source/notion_connector.py
index f77184e31..8c6a522ad 100644
--- a/common/data_source/notion_connector.py
+++ b/common/data_source/notion_connector.py
@@ -1,6 +1,5 @@
 import logging
 from collections.abc import Generator
-from datetime import datetime, timezone
 from typing import Any, Optional

 from retry import retry
@@ -33,7 +32,7 @@ from common.data_source.utils import (
     batch_generator,
     fetch_notion_data,
     properties_to_str,
-    filter_pages_by_time
+    filter_pages_by_time, datetime_from_string
 )


@@ -293,9 +292,9 @@ class NotionConnector(LoadConnector, PollConnector):
                 blob=blob,
                 source=DocumentSource.NOTION,
                 semantic_identifier=page_title,
-                extension="txt",
+                extension=".txt",
                 size_bytes=len(blob),
-                doc_updated_at=datetime.fromisoformat(page.last_edited_time).astimezone(timezone.utc)
+                doc_updated_at=datetime_from_string(page.last_edited_time)
             )

             if self.recursive_index_enabled and all_child_page_ids:
diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py
index ba63da82a..fd97a068c 100644
--- a/rag/svr/sync_data_source.py
+++ b/rag/svr/sync_data_source.py
@@ -63,6 +63,8 @@ class SyncBase:
         if task["poll_range_start"]:
            next_update = task["poll_range_start"]
         for document_batch in document_batch_generator:
+            if not document_batch:
+                continue
             min_update = min([doc.doc_updated_at for doc in document_batch])
             max_update = max([doc.doc_updated_at for doc in document_batch])
             next_update = max([next_update, max_update])