Feat: Support multiple data sources synchronizations (#10954)

### What problem does this PR solve?
#10953

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2025-11-03 19:59:18 +08:00
committed by GitHub
parent 9a486e0f51
commit 3e5a39482e
33 changed files with 11444 additions and 3645 deletions

View File

@ -21,13 +21,15 @@ from pathlib import Path
from flask_login import current_user
from peewee import fn
from api.db import KNOWLEDGEBASE_FOLDER_NAME, FileSource, FileType, ParserType
from api.db.db_models import DB, Document, File, File2Document, Knowledgebase
from api.db import KNOWLEDGEBASE_FOLDER_NAME, FileSource, FileType, ParserType, TaskStatus
from api.db.db_models import DB, Document, File, File2Document, Knowledgebase, Task
from api.db.services import duplicate_name
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from common.misc_utils import get_uuid
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.task_service import TaskService
from api.utils.file_utils import filename_type, read_potential_broken_pdf, thumbnail_img
from rag.llm.cv_model import GptV4
from rag.utils.storage_factory import STORAGE_IMPL
@ -420,7 +422,7 @@ class FileService(CommonService):
@classmethod
@DB.connection_context()
def upload_document(self, kb, file_objs, user_id):
def upload_document(self, kb, file_objs, user_id, src="local"):
root_folder = self.get_root_folder(user_id)
pf_id = root_folder["id"]
self.init_knowledgebase_docs(pf_id, user_id)
@ -462,6 +464,7 @@ class FileService(CommonService):
"created_by": user_id,
"type": filetype,
"name": filename,
"source_type": src,
"suffix": Path(filename).suffix.lstrip("."),
"location": location,
"size": len(blob),
@ -536,3 +539,48 @@ class FileService(CommonService):
def put_blob(user_id, location, blob):
bname = f"{user_id}-downloads"
return STORAGE_IMPL.put(bname, location, blob)
@classmethod
@DB.connection_context()
def delete_docs(cls, doc_ids, tenant_id):
root_folder = FileService.get_root_folder(tenant_id)
pf_id = root_folder["id"]
FileService.init_knowledgebase_docs(pf_id, tenant_id)
errors = ""
kb_table_num_map = {}
for doc_id in doc_ids:
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
raise Exception("Document not found!")
tenant_id = DocumentService.get_tenant_id(doc_id)
if not tenant_id:
raise Exception("Tenant not found!")
b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
TaskService.filter_delete([Task.doc_id == doc_id])
if not DocumentService.remove_document(doc, tenant_id):
raise Exception("Database error (Document removal)!")
f2d = File2DocumentService.get_by_document_id(doc_id)
deleted_file_count = 0
if f2d:
deleted_file_count = FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
File2DocumentService.delete_by_document_id(doc_id)
if deleted_file_count > 0:
STORAGE_IMPL.rm(b, n)
doc_parser = doc.parser_id
if doc_parser == ParserType.TABLE:
kb_id = doc.kb_id
if kb_id not in kb_table_num_map:
counts = DocumentService.count_by_kb_id(kb_id=kb_id, keywords="", run_status=[TaskStatus.DONE], types=[])
kb_table_num_map[kb_id] = counts
kb_table_num_map[kb_id] -= 1
if kb_table_num_map[kb_id] <= 0:
KnowledgebaseService.delete_field_map(kb_id)
except Exception as e:
errors += str(e)
return errors