Fix: Delete chunk images on document parser config change. (#12262)

### What problem does this PR solve?

Modifying a document’s parser config previously left behind obsolete chunk images. Unless the dataset was manually deleted, these images accumulated and wasted storage. This PR fixes the issue by automatically removing the document’s associated chunk images when its parser config changes.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-12-30 00:32:30 +08:00 · 2025-12-29 12:54:11 +08:00
parent 27c55f6514
commit 8d3f9d61da
2 changed files with 19 additions and 15 deletions
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -746,6 +746,7 @@ async def change_parser():
            tenant_id = DocumentService.get_tenant_id(req["doc_id"])
            if not tenant_id:
                return get_data_error_result(message="Tenant not found!")
+            DocumentService.delete_chunk_images(doc, tenant_id)
            if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id):
                settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
        return None
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -342,21 +342,7 @@ class DocumentService(CommonService):
        cls.clear_chunk_num(doc.id)
        try:
            TaskService.filter_delete([Task.doc_id == doc.id])
-            page = 0
-            page_size = 1000
-            all_chunk_ids = []
-            while True:
-                chunks = settings.docStoreConn.search(["img_id"], [], {"doc_id": doc.id}, [], OrderByExpr(),
-                                                      page * page_size, page_size, search.index_name(tenant_id),
-                                                      [doc.kb_id])
-                chunk_ids = settings.docStoreConn.get_doc_ids(chunks)
-                if not chunk_ids:
-                    break
-                all_chunk_ids.extend(chunk_ids)
-                page += 1
-            for cid in all_chunk_ids:
-                if settings.STORAGE_IMPL.obj_exist(doc.kb_id, cid):
-                    settings.STORAGE_IMPL.rm(doc.kb_id, cid)
+            cls.delete_chunk_images(doc, tenant_id)
            if doc.thumbnail and not doc.thumbnail.startswith(IMG_BASE64_PREFIX):
                if settings.STORAGE_IMPL.obj_exist(doc.kb_id, doc.thumbnail):
                    settings.STORAGE_IMPL.rm(doc.kb_id, doc.thumbnail)
@@ -378,6 +364,23 @@ class DocumentService(CommonService):
            pass
        return cls.delete_by_id(doc.id)

+    @classmethod
+    @DB.connection_context()
+    def delete_chunk_images(cls, doc, tenant_id):
+        page = 0
+        page_size = 1000
+        while True:
+            chunks = settings.docStoreConn.search(["img_id"], [], {"doc_id": doc.id}, [], OrderByExpr(),
+                                                  page * page_size, page_size, search.index_name(tenant_id),
+                                                  [doc.kb_id])
+            chunk_ids = settings.docStoreConn.get_doc_ids(chunks)
+            if not chunk_ids:
+                break
+            for cid in chunk_ids:
+                if settings.STORAGE_IMPL.obj_exist(doc.kb_id, cid):
+                    settings.STORAGE_IMPL.rm(doc.kb_id, cid)
+            page += 1
+
    @classmethod
    @DB.connection_context()
    def get_newly_uploaded(cls):