From 6ab96287c9fe2b64c0adcb0e25d0c9085fdf89e0 Mon Sep 17 00:00:00 2001 From: buua436 <66937541+buua436@users.noreply.github.com> Date: Tue, 21 Oct 2025 09:36:27 +0800 Subject: [PATCH] Feat:Vision Model Image Enhancement in Manual/Paper/Book/One chunker (#10640) ### What problem does this PR solve? issue: [#7472](https://github.com/infiniflow/ragflow/issues/7472) change: Vision Model Image Enhancement in Manual chunker ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- deepdoc/parser/figure_parser.py | 39 +++++++++++++++++++++++++++++++++ rag/app/book.py | 14 ++++++++---- rag/app/manual.py | 4 +++- rag/app/naive.py | 38 +++++--------------------------- rag/app/one.py | 17 ++++++++------ rag/app/paper.py | 5 ++++- 6 files changed, 71 insertions(+), 46 deletions(-) diff --git a/deepdoc/parser/figure_parser.py b/deepdoc/parser/figure_parser.py index 0274f549d..3f6b40b4f 100644 --- a/deepdoc/parser/figure_parser.py +++ b/deepdoc/parser/figure_parser.py @@ -17,6 +17,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from PIL import Image +from api.db import LLMType +from api.db.services.llm_service import LLMBundle from api.utils.api_utils import timeout from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk from rag.prompts.generator import vision_llm_figure_describe_prompt @@ -32,6 +34,43 @@ def vision_figure_parser_figure_data_wrapper(figures_data_without_positions): if isinstance(figure_data[1], Image.Image) ] +def vision_figure_parser_docx_wrapper(sections,tbls,callback=None,**kwargs): + try: + vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) + callback(0.7, "Visual model detected. Attempting to enhance figure extraction...") + except Exception: + vision_model = None + if vision_model: + figures_data = vision_figure_parser_figure_data_wrapper(sections) + try: + docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs) + boosted_figures = docx_vision_parser(callback=callback) + tbls.extend(boosted_figures) + except Exception as e: + callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.") + return tbls + +def vision_figure_parser_pdf_wrapper(tbls,callback=None,**kwargs): + try: + vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) + callback(0.7, "Visual model detected. Attempting to enhance figure extraction...") + except Exception: + vision_model = None + if vision_model: + def is_figure_item(item): + return ( + isinstance(item[0][0], Image.Image) and + isinstance(item[0][1], list) + ) + figures_data = [item for item in tbls if is_figure_item(item)] + try: + docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs) + boosted_figures = docx_vision_parser(callback=callback) + tbls = [item for item in tbls if not is_figure_item(item)] + tbls.extend(boosted_figures) + except Exception as e: + callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.") + return tbls shared_executor = ThreadPoolExecutor(max_workers=10) diff --git a/rag/app/book.py b/rag/app/book.py index e3954341d..319e7a2b9 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -20,11 +20,14 @@ import re from io import BytesIO from deepdoc.parser.utils import get_text +from rag.app import naive from rag.nlp import bullets_category, is_english,remove_contents_table, \ hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ tokenize_chunks from rag.nlp import rag_tokenizer -from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser +from deepdoc.parser import PdfParser, PlainParser, HtmlParser +from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper +from PIL import Image class Pdf(PdfParser): @@ -81,13 +84,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections, tbls = [], [] if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - doc_parser = DocxParser() + doc_parser = naive.Docx() # TODO: table of contents need to be removed sections, tbls = doc_parser( - binary if binary else filename, from_page=from_page, to_page=to_page) + filename, binary=binary, from_page=from_page, to_page=to_page) remove_contents_table(sections, eng=is_english( random_choices([t for t, _ in sections], k=200))) - tbls = [((None, lns), None) for lns in tbls] + tbls=vision_figure_parser_docx_wrapper(sections=sections,tbls=tbls,callback=callback,**kwargs) + # tbls = [((None, lns), None) for lns in tbls] + sections=[(item[0],item[1] if item[1] is not None else "") for item in sections if not isinstance(item[1], Image.Image)] callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE): @@ -96,6 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pdf_parser = PlainParser() sections, tbls = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) + tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) elif re.search(r"\.txt$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") diff --git a/rag/app/manual.py b/rag/app/manual.py index 7fa395fe1..a970cac06 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -23,6 +23,7 @@ from io import BytesIO from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level from rag.utils import num_tokens_from_string from deepdoc.parser import PdfParser, PlainParser, DocxParser +from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper from docx import Document from PIL import Image @@ -252,7 +253,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, tk_cnt = num_tokens_from_string(txt) if sec_id > -1: last_sid = sec_id - + tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) res = tokenize_table(tbls, doc, eng) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) return res @@ -261,6 +262,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, docx_parser = Docx() ti_list, tbls = docx_parser(filename, binary, from_page=0, to_page=10000, callback=callback) + tbls=vision_figure_parser_docx_wrapper(sections=sections,tbls=tbls,callback=callback,**kwargs) res = tokenize_table(tbls, doc, eng) for text, image in ti_list: d = copy.deepcopy(doc) diff --git a/rag/app/naive.py b/rag/app/naive.py index d91018ead..a6c67b6b6 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -32,7 +32,7 @@ from api.db import LLMType from api.db.services.llm_service import LLMBundle from api.utils.file_utils import extract_embed_file from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser -from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper +from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.mineru_parser import MinerUParser from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table @@ -475,24 +475,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - try: - vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) - callback(0.15, "Visual model detected. Attempting to enhance figure extraction...") - except Exception: - vision_model = None + # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 _SerializedRelationships.load_from_xml = load_from_xml_v2 sections, tables = Docx()(filename, binary) - if vision_model: - figures_data = vision_figure_parser_figure_data_wrapper(sections) - try: - docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs) - boosted_figures = docx_vision_parser(callback=callback) - tables.extend(boosted_figures) - except Exception as e: - callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.") + tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback,**kwargs) res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") @@ -521,25 +510,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if layout_recognizer == "DeepDOC": pdf_parser = Pdf() - - try: - vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) - callback(0.15, "Visual model detected. Attempting to enhance figure extraction...") - except Exception: - vision_model = None - - if vision_model: - sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True) - callback(0.5, "Basic parsing complete. Proceeding with figure enhancement...") - try: - pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs) - boosted_figures = pdf_vision_parser(callback=callback) - tables.extend(boosted_figures) - except Exception as e: - callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.") - tables.extend(figures) - else: - sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) + sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) + tables=vision_figure_parser_pdf_wrapper(tbls=tables,callback=callback,**kwargs) res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") diff --git a/rag/app/one.py b/rag/app/one.py index 77c9645c7..bb86b80fc 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -23,6 +23,7 @@ from deepdoc.parser.utils import get_text from rag.app import naive from rag.nlp import rag_tokenizer, tokenize from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser +from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper class Pdf(PdfParser): @@ -57,13 +58,8 @@ class Pdf(PdfParser): sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)] - for (img, rows), poss in tbls: - if not rows: - continue - sections.append((rows if isinstance(rows, str) else rows[0], - [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) return [(txt, "") for txt, _ in sorted(sections, key=lambda x: ( - x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None + x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls def chunk(filename, binary=None, from_page=0, to_page=100000, @@ -80,6 +76,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections, tbls = naive.Docx()(filename, binary) + tbls=vision_figure_parser_docx_wrapper(sections=sections,tbls=tbls,callback=callback,**kwargs) sections = [s for s, _ in sections if s] for (_, html), _ in tbls: sections.append(html) @@ -89,8 +86,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pdf_parser = Pdf() if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text": pdf_parser = PlainParser() - sections, _ = pdf_parser( + sections, tbls = pdf_parser( filename if not binary else binary, to_page=to_page, callback=callback) + tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) + for (img, rows), poss in tbls: + if not rows: + continue + sections.append((rows if isinstance(rows, str) else rows[0], + [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) sections = [s for s, _ in sections if s] elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): diff --git a/rag/app/paper.py b/rag/app/paper.py index c46f417a8..a8f29c82e 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -18,12 +18,12 @@ import logging import copy import re +from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper from api.db import ParserType from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks from deepdoc.parser import PdfParser, PlainParser import numpy as np - class Pdf(PdfParser): def __init__(self): self.model_speciess = ParserType.PAPER.value @@ -160,6 +160,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pdf_parser = Pdf() paper = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) + tbls=paper["tables"] + tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) + paper["tables"] = tbls else: raise NotImplementedError("file type not supported yet(pdf supported)")