def vision_figure_parser_figure_data_wraper(figures_data_without_positions):
    """Wrap position-less figure entries into the positioned figure format.

    Each input entry is read as ``(description, image, ...)``: element 0 is
    used as the description and element 1 as the figure. Only entries whose
    element 1 is a ``PIL.Image.Image`` are kept.

    Args:
        figures_data_without_positions: Iterable of sequences as described
            above. ``None`` or an empty iterable yields an empty list.

    Returns:
        A list of ``((image, [description]), [(0, 0, 0, 0, 0)])`` tuples.
        The single all-zero 5-tuple is a placeholder position.
    """
    if not figures_data_without_positions:
        return []

    wrapped = []
    for figure_data in figures_data_without_positions:
        # Check the length first: entries without an image slot are skipped
        # instead of raising IndexError on figure_data[1].
        if len(figure_data) < 2 or not isinstance(figure_data[1], Image.Image):
            continue
        description, figure = figure_data[0], figure_data[1]
        wrapped.append(((figure, [description]), [(0, 0, 0, 0, 0)]))
    return wrapped
TxtParser -from deepdoc.parser.figure_parser import VisionFigureParser +from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper from deepdoc.parser.pdf_parser import PlainParser, VisionParser from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table from rag.utils import num_tokens_from_string @@ -226,10 +226,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pdf_parser = None if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - sections, tables = Docx()(filename, binary) - res = tokenize_table(tables, doc, is_english) # just for table + try: + vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) + callback(0.15, "Visual model detected. Attempting to enhance figure extraction...") + except Exception: + vision_model = None + + sections, tables = Docx()(filename, binary) + + if vision_model: + figures_data = vision_figure_parser_figure_data_wraper(sections) + try: + docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs) + boosted_figures = docx_vision_parser(callback=callback) + tables.extend(boosted_figures) + except Exception as e: + callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.") + + res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") + st = timer() chunks, images = naive_merge_docx(