fix(raptor): handle missing vector fields gracefully (#12713)

## Summary This PR fixes a `KeyError` crash when running RAPTOR tasks on documents that don't have the expected vector field. ## Related Issue Fixes https://github.com/infiniflow/ragflow/issues/12675 ## Problem When running RAPTOR tasks, the code assumes all chunks have the vector field `q_<size>_vec` (e.g., `q_1024_vec`). However, chunks may not have this field if: 1. They were indexed with a **different embedding model** (different vector size) 2. The embedding step **failed silently** during initial parsing 3. The document was parsed before the current embedding model was configured This caused a crash: ``` KeyError: 'q_1024_vec' ``` ## Solution Added defensive validation in `run_raptor_for_kb()`: 1. **Check for vector field existence** before accessing it 2. **Skip chunks** that don't have the required vector field instead of crashing 3. **Log warnings** for skipped chunks with actionable guidance 4. **Provide informative error messages** suggesting users re-parse documents with the current embedding model 5. **Handle both scopes** (`file` and `kb` modes) ## Changes - `rag/svr/task_executor.py`: Added validation and error handling in `run_raptor_for_kb()` ## Testing 1. Create a knowledge base with an embedding model 2. Parse documents 3. Change the embedding model to one with a different vector size 4. Run RAPTOR task 5. **Before**: Crashes with `KeyError` 6. **After**: Gracefully skips incompatible chunks with informative warnings ---  Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
2026-01-23 03:26:53 +08:00 · 2026-01-20 15:24:20 +11:00
parent 1b1554c563
commit f367189703
1 changed files with 29 additions and 0 deletions
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@ -806,20 +806,49 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
    if raptor_config.get("scope", "file") == "file":
        for x, doc_id in enumerate(doc_ids):
            chunks = []
+            skipped_chunks = 0
            for d in settings.retriever.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])],
                                                   fields=["content_with_weight", vctr_nm],
                                                   sort_by_position=True):
+                # Skip chunks that don't have the required vector field (may have been indexed with different embedding model)
+                if vctr_nm not in d or d[vctr_nm] is None:
+                    skipped_chunks += 1
+                    logging.warning(f"RAPTOR: Chunk missing vector field '{vctr_nm}' in doc {doc_id}, skipping")
+                    continue
                chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
+            
+            if skipped_chunks > 0:
+                callback(msg=f"[WARN] Skipped {skipped_chunks} chunks without vector field '{vctr_nm}' for doc {doc_id}. Consider re-parsing the document with the current embedding model.")
+            
+            if not chunks:
+                logging.warning(f"RAPTOR: No valid chunks with vectors found for doc {doc_id}")
+                callback(msg=f"[WARN] No valid chunks with vectors found for doc {doc_id}, skipping")
+                continue
+                
            await generate(chunks, doc_id)
            callback(prog=(x + 1.) / len(doc_ids))
    else:
        chunks = []
+        skipped_chunks = 0
        for doc_id in doc_ids:
            for d in settings.retriever.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])],
                                                   fields=["content_with_weight", vctr_nm],
                                                   sort_by_position=True):
+                # Skip chunks that don't have the required vector field
+                if vctr_nm not in d or d[vctr_nm] is None:
+                    skipped_chunks += 1
+                    logging.warning(f"RAPTOR: Chunk missing vector field '{vctr_nm}' in doc {doc_id}, skipping")
+                    continue
                chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))

+        if skipped_chunks > 0:
+            callback(msg=f"[WARN] Skipped {skipped_chunks} chunks without vector field '{vctr_nm}'. Consider re-parsing documents with the current embedding model.")
+
+        if not chunks:
+            logging.error(f"RAPTOR: No valid chunks with vectors found in any document for kb {row['kb_id']}")
+            callback(msg=f"[ERROR] No valid chunks with vectors found. Please ensure documents are parsed with the current embedding model (vector size: {vector_size}).")
+            return res, tk_count
+
        await generate(chunks, fake_doc_id)

    return res, tk_count