Feat: support context window for docx (#12455)

### What problem does this PR solve?

Feat: support context window for docx. The text immediately above and below each figure (`context_above`/`context_below`) is now passed to the vision model when it describes the figure (see the sketch below).

#12303

Done:
- [x] naive.py
- [x] one.py

TODO:
- [ ] book.py
- [ ] manual.py

Fix: incorrect image position
Fix: incorrect chunk type tag
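
For context, a minimal sketch of what "context window" means here, assuming figure chunks carry `context_above`/`context_below` keys as consumed by the new `vision_figure_parser_docx_wrapper_naive` path. The helper name, window size, and paragraph handling below are illustrative, not the actual implementation:

```python
def collect_context(paragraph_texts, figure_pos, window=3):
    """Take up to `window` non-empty paragraphs on each side of a figure."""
    above = [p for p in paragraph_texts[:figure_pos] if p.strip()][-window:]
    below = [p for p in paragraph_texts[figure_pos + 1:] if p.strip()][:window]
    return "\n".join(above), "\n".join(below)

# The figure chunk then carries both windows alongside its caption:
paragraphs = ["Results are plotted below.", "<figure placeholder>",
              "The curve flattens after epoch 10."]
context_above, context_below = collect_context(paragraphs, figure_pos=1)
chunk = {
    "text": "Figure 1: accuracy per epoch.",  # caption
    "context_above": context_above,           # "Results are plotted below."
    "context_below": context_below,           # "The curve flattens after epoch 10."
}
```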

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
Commit 011bbe9556 (parent a442c9cac6) by Magicbook1108, committed via GitHub on 2026-01-07 15:08:17 +08:00. 7 changed files with 397 additions and 120 deletions.

@@ -25,7 +25,7 @@ from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
from rag.prompts.generator import vision_llm_figure_describe_prompt, vision_llm_figure_describe_prompt_with_context
from rag.nlp import append_context2table_image4pdf
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
    if not figures_data_without_positions:
        return []
@@ -38,7 +38,6 @@ def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
        if isinstance(figure_data[1], Image.Image)
    ]

def vision_figure_parser_docx_wrapper(sections, tbls, callback=None, **kwargs):
    if not sections:
        return tbls
@@ -124,8 +123,56 @@ def vision_figure_parser_pdf_wrapper(tbls, callback=None, **kwargs):
    return tbls

shared_executor = ThreadPoolExecutor(max_workers=10)
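# Describe figure chunks with the tenant's IMAGE2TEXT model, feeding the text
# around each figure into the prompt when available; descriptions are appended
# to each chunk's "text" in place.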
def vision_figure_parser_docx_wrapper_naive(chunks, idx_lst, callback=None, **kwargs):
    if not chunks:
        return []
    try:
        vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
        if callback:
            callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
    except Exception:
        # No usable vision model for this tenant; skip figure description.
        vision_model = None
    if vision_model:
        @timeout(30, 3)
        def worker(idx, ck):
            context_above = ck.get("context_above", "")
            context_below = ck.get("context_below", "")
            if context_above or context_below:
                prompt = vision_llm_figure_describe_prompt_with_context(
                    # context_above plus the figure caption, if any
                    context_above=context_above + ck.get("text", ""),
                    context_below=context_below,
                )
                logging.info(f"[VisionFigureParser] figure={idx} context_above_len={len(context_above)} context_below_len={len(context_below)} prompt=with_context")
                logging.info(f"[VisionFigureParser] figure={idx} context_above_snippet={context_above[:512]}")
                logging.info(f"[VisionFigureParser] figure={idx} context_below_snippet={context_below[:512]}")
            else:
                prompt = vision_llm_figure_describe_prompt()
                logging.info(f"[VisionFigureParser] figure={idx} context_len=0 prompt=default")
            description_text = picture_vision_llm_chunk(
                binary=ck.get("image"),
                vision_model=vision_model,
                prompt=prompt,
                callback=callback,
            )
            return idx, description_text

        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [
                executor.submit(worker, idx, chunks[idx])
                for idx in idx_lst
            ]
            for future in as_completed(futures):
                idx, description = future.result()
                if description:
                    chunks[idx]["text"] += description
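# Usage sketch with hypothetical values: each index in idx_lst points at a figure
# chunk shaped like {"image": <PIL.Image>, "text": caption,
# "context_above": "...", "context_below": "..."}, e.g.
#   vision_figure_parser_docx_wrapper_naive(chunks, [2, 5], callback=cb, tenant_id=tid)
# The generated description is appended to each matching chunk's "text" in place.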
class VisionFigureParser:
    def __init__(self, vision_model, figures_data, *args, **kwargs):