mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-23 03:26:53 +08:00
fix(raptor): handle missing vector fields gracefully (#12713)
## Summary This PR fixes a `KeyError` crash when running RAPTOR tasks on documents that don't have the expected vector field. ## Related Issue Fixes https://github.com/infiniflow/ragflow/issues/12675 ## Problem When running RAPTOR tasks, the code assumes all chunks have the vector field `q_<size>_vec` (e.g., `q_1024_vec`). However, chunks may not have this field if: 1. They were indexed with a **different embedding model** (different vector size) 2. The embedding step **failed silently** during initial parsing 3. The document was parsed before the current embedding model was configured This caused a crash: ``` KeyError: 'q_1024_vec' ``` ## Solution Added defensive validation in `run_raptor_for_kb()`: 1. **Check for vector field existence** before accessing it 2. **Skip chunks** that don't have the required vector field instead of crashing 3. **Log warnings** for skipped chunks with actionable guidance 4. **Provide informative error messages** suggesting users re-parse documents with the current embedding model 5. **Handle both scopes** (`file` and `kb` modes) ## Changes - `rag/svr/task_executor.py`: Added validation and error handling in `run_raptor_for_kb()` ## Testing 1. Create a knowledge base with an embedding model 2. Parse documents 3. Change the embedding model to one with a different vector size 4. Run RAPTOR task 5. **Before**: Crashes with `KeyError` 6. **After**: Gracefully skips incompatible chunks with informative warnings --- <!-- Gittensor Contribution Tag: @GlobalStar117 --> Co-authored-by: GlobalStar117 <GlobalStar117@users.noreply.github.com>
This commit is contained in:
@ -806,20 +806,49 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
|
||||
if raptor_config.get("scope", "file") == "file":
|
||||
for x, doc_id in enumerate(doc_ids):
|
||||
chunks = []
|
||||
skipped_chunks = 0
|
||||
for d in settings.retriever.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])],
|
||||
fields=["content_with_weight", vctr_nm],
|
||||
sort_by_position=True):
|
||||
# Skip chunks that don't have the required vector field (may have been indexed with different embedding model)
|
||||
if vctr_nm not in d or d[vctr_nm] is None:
|
||||
skipped_chunks += 1
|
||||
logging.warning(f"RAPTOR: Chunk missing vector field '{vctr_nm}' in doc {doc_id}, skipping")
|
||||
continue
|
||||
chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
|
||||
|
||||
if skipped_chunks > 0:
|
||||
callback(msg=f"[WARN] Skipped {skipped_chunks} chunks without vector field '{vctr_nm}' for doc {doc_id}. Consider re-parsing the document with the current embedding model.")
|
||||
|
||||
if not chunks:
|
||||
logging.warning(f"RAPTOR: No valid chunks with vectors found for doc {doc_id}")
|
||||
callback(msg=f"[WARN] No valid chunks with vectors found for doc {doc_id}, skipping")
|
||||
continue
|
||||
|
||||
await generate(chunks, doc_id)
|
||||
callback(prog=(x + 1.) / len(doc_ids))
|
||||
else:
|
||||
chunks = []
|
||||
skipped_chunks = 0
|
||||
for doc_id in doc_ids:
|
||||
for d in settings.retriever.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])],
|
||||
fields=["content_with_weight", vctr_nm],
|
||||
sort_by_position=True):
|
||||
# Skip chunks that don't have the required vector field
|
||||
if vctr_nm not in d or d[vctr_nm] is None:
|
||||
skipped_chunks += 1
|
||||
logging.warning(f"RAPTOR: Chunk missing vector field '{vctr_nm}' in doc {doc_id}, skipping")
|
||||
continue
|
||||
chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
|
||||
|
||||
if skipped_chunks > 0:
|
||||
callback(msg=f"[WARN] Skipped {skipped_chunks} chunks without vector field '{vctr_nm}'. Consider re-parsing documents with the current embedding model.")
|
||||
|
||||
if not chunks:
|
||||
logging.error(f"RAPTOR: No valid chunks with vectors found in any document for kb {row['kb_id']}")
|
||||
callback(msg=f"[ERROR] No valid chunks with vectors found. Please ensure documents are parsed with the current embedding model (vector size: {vector_size}).")
|
||||
return res, tk_count
|
||||
|
||||
await generate(chunks, fake_doc_id)
|
||||
|
||||
return res, tk_count
|
||||
|
||||
Reference in New Issue
Block a user