From 8d3f9d61daa290dddfe8a215ec8736a9b2b10d30 Mon Sep 17 00:00:00 2001 From: lys1313013 <120407035@qq.com> Date: Mon, 29 Dec 2025 12:54:11 +0800 Subject: [PATCH] Fix: Delete chunk images on document parser config change. (#12262) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? Modifying a document’s parser config previously left behind obsolete chunk images. If the dataset isn’t manually deleted, these images accumulate and waste storage. This PR fixes the issue by automatically removing associated images when the parser config changes. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/document_app.py | 1 + api/db/services/document_service.py | 33 ++++++++++++++++------------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 8cdb06640..4fcc07e65 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -746,6 +746,7 @@ async def change_parser(): tenant_id = DocumentService.get_tenant_id(req["doc_id"]) if not tenant_id: return get_data_error_result(message="Tenant not found!") + DocumentService.delete_chunk_images(doc, tenant_id) if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id): settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id) return None diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 90cdbf1e4..a05d1783d 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -342,21 +342,7 @@ class DocumentService(CommonService): cls.clear_chunk_num(doc.id) try: TaskService.filter_delete([Task.doc_id == doc.id]) - page = 0 - page_size = 1000 - all_chunk_ids = [] - while True: - chunks = settings.docStoreConn.search(["img_id"], [], {"doc_id": doc.id}, [], OrderByExpr(), - page * page_size, page_size, search.index_name(tenant_id), - [doc.kb_id]) - chunk_ids = settings.docStoreConn.get_doc_ids(chunks) - if not chunk_ids: - break - all_chunk_ids.extend(chunk_ids) - page += 1 - for cid in all_chunk_ids: - if settings.STORAGE_IMPL.obj_exist(doc.kb_id, cid): - settings.STORAGE_IMPL.rm(doc.kb_id, cid) + cls.delete_chunk_images(doc, tenant_id) if doc.thumbnail and not doc.thumbnail.startswith(IMG_BASE64_PREFIX): if settings.STORAGE_IMPL.obj_exist(doc.kb_id, doc.thumbnail): settings.STORAGE_IMPL.rm(doc.kb_id, doc.thumbnail) @@ -378,6 +364,23 @@ class DocumentService(CommonService): pass return cls.delete_by_id(doc.id) + @classmethod + @DB.connection_context() + def delete_chunk_images(cls, doc, tenant_id): + page = 0 + page_size = 1000 + while True: + chunks = settings.docStoreConn.search(["img_id"], [], {"doc_id": doc.id}, [], OrderByExpr(), + page * page_size, page_size, search.index_name(tenant_id), + [doc.kb_id]) + chunk_ids = settings.docStoreConn.get_doc_ids(chunks) + if not chunk_ids: + break + for cid in chunk_ids: + if settings.STORAGE_IMPL.obj_exist(doc.kb_id, cid): + settings.STORAGE_IMPL.rm(doc.kb_id, cid) + page += 1 + @classmethod @DB.connection_context() def get_newly_uploaded(cls):