Feat: Support multiple data sources synchronizations (#10954)

### What problem does this PR solve?
#10953

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2025-11-03 19:59:18 +08:00
committed by GitHub
parent 9a486e0f51
commit 3e5a39482e
33 changed files with 11444 additions and 3645 deletions

View File

@ -21,6 +21,7 @@ import os
import sys
import time
import typing
from datetime import datetime, timezone
from enum import Enum
from functools import wraps
@ -702,6 +703,7 @@ class TenantLLM(DataBaseModel):
api_base = CharField(max_length=255, null=True, help_text="API Base")
max_tokens = IntegerField(default=8192, index=True)
used_tokens = IntegerField(default=0, index=True)
status = CharField(max_length=1, null=False, help_text="is it validate(0: wasted, 1: validate)", default="1", index=True)
def __str__(self):
return self.llm_name
@ -1035,6 +1037,76 @@ class PipelineOperationLog(DataBaseModel):
db_table = "pipeline_operation_log"
class Connector(DataBaseModel):
id = CharField(max_length=32, primary_key=True)
tenant_id = CharField(max_length=32, null=False, index=True)
name = CharField(max_length=128, null=False, help_text="Search name", index=False)
source = CharField(max_length=128, null=False, help_text="Data source", index=True)
input_type = CharField(max_length=128, null=False, help_text="poll/event/..", index=True)
config = JSONField(null=False, default={})
refresh_freq = IntegerField(default=0, index=False)
prune_freq = IntegerField(default=0, index=False)
timeout_secs = IntegerField(default=3600, index=False)
indexing_start = DateTimeField(null=True, index=True)
status = CharField(max_length=16, null=True, help_text="schedule", default="schedule", index=True)
def __str__(self):
return self.name
class Meta:
db_table = "connector"
class Connector2Kb(DataBaseModel):
id = CharField(max_length=32, primary_key=True)
connector_id = CharField(max_length=32, null=False, index=True)
kb_id = CharField(max_length=32, null=False, index=True)
class Meta:
db_table = "connector2kb"
class DateTimeTzField(CharField):
field_type = 'VARCHAR'
def db_value(self, value: datetime|None) -> str|None:
if value is not None:
if value.tzinfo is not None:
return value.isoformat()
else:
return value.replace(tzinfo=timezone.utc).isoformat()
return value
def python_value(self, value: str|None) -> datetime|None:
if value is not None:
dt = datetime.fromisoformat(value)
if dt.tzinfo is None:
import pytz
return dt.replace(tzinfo=pytz.UTC)
return dt
return value
class SyncLogs(DataBaseModel):
id = CharField(max_length=32, primary_key=True)
connector_id = CharField(max_length=32, index=True)
status = CharField(max_length=128, null=False, help_text="Processing status", index=True)
from_beginning = CharField(max_length=1, null=True, help_text="", default="0", index=False)
new_docs_indexed = IntegerField(default=0, index=False)
total_docs_indexed = IntegerField(default=0, index=False)
docs_removed_from_index = IntegerField(default=0, index=False)
error_msg = TextField(null=False, help_text="process message", default="")
error_count = IntegerField(default=0, index=False)
full_exception_trace = TextField(null=True, help_text="process message", default="")
time_started = DateTimeField(null=True, index=True)
poll_range_start = DateTimeTzField(max_length=255, null=True, index=True)
poll_range_end = DateTimeTzField(max_length=255, null=True, index=True)
kb_id = CharField(max_length=32, null=False, index=True)
class Meta:
db_table = "sync_logs"
def migrate_db():
logging.disable(logging.ERROR)
migrator = DatabaseMigrator[settings.DATABASE_TYPE.upper()].value(DB)
@ -1203,4 +1275,8 @@ def migrate_db():
migrate(migrator.alter_column_type("tenant_llm", "api_key", TextField(null=True, help_text="API KEY")))
except Exception:
pass
try:
migrate(migrator.add_column("tenant_llm", "status", CharField(max_length=1, null=False, help_text="is it validate(0: wasted, 1: validate)", default="1", index=True)))
except Exception:
pass
logging.disable(logging.NOTSET)