Feat:Vision Model Image Enhancement in Manual/Paper/Book/One chunker (#10640)

### What problem does this PR solve?
issue:
[#7472](https://github.com/infiniflow/ragflow/issues/7472)
change:
Vision Model Image Enhancement in Manual chunker
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
buua436
2025-10-21 09:36:27 +08:00
committed by GitHub
parent aaa4776657
commit 6ab96287c9
6 changed files with 71 additions and 46 deletions

View File

@ -17,6 +17,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image from PIL import Image
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from api.utils.api_utils import timeout from api.utils.api_utils import timeout
from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
from rag.prompts.generator import vision_llm_figure_describe_prompt from rag.prompts.generator import vision_llm_figure_describe_prompt
@ -32,6 +34,43 @@ def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
if isinstance(figure_data[1], Image.Image) if isinstance(figure_data[1], Image.Image)
] ]
def vision_figure_parser_docx_wrapper(sections,tbls,callback=None,**kwargs):
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
figures_data = vision_figure_parser_figure_data_wrapper(sections)
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback)
tbls.extend(boosted_figures)
except Exception as e:
callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
return tbls
def vision_figure_parser_pdf_wrapper(tbls,callback=None,**kwargs):
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
def is_figure_item(item):
return (
isinstance(item[0][0], Image.Image) and
isinstance(item[0][1], list)
)
figures_data = [item for item in tbls if is_figure_item(item)]
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback)
tbls = [item for item in tbls if not is_figure_item(item)]
tbls.extend(boosted_figures)
except Exception as e:
callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
return tbls
shared_executor = ThreadPoolExecutor(max_workers=10) shared_executor = ThreadPoolExecutor(max_workers=10)

View File

@ -20,11 +20,14 @@ import re
from io import BytesIO from io import BytesIO
from deepdoc.parser.utils import get_text from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.nlp import bullets_category, is_english,remove_contents_table, \ from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks tokenize_chunks
from rag.nlp import rag_tokenizer from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser from deepdoc.parser import PdfParser, PlainParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
from PIL import Image
class Pdf(PdfParser): class Pdf(PdfParser):
@ -81,13 +84,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sections, tbls = [], [] sections, tbls = [], []
if re.search(r"\.docx$", filename, re.IGNORECASE): if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
doc_parser = DocxParser() doc_parser = naive.Docx()
# TODO: table of contents need to be removed # TODO: table of contents need to be removed
sections, tbls = doc_parser( sections, tbls = doc_parser(
binary if binary else filename, from_page=from_page, to_page=to_page) filename, binary=binary, from_page=from_page, to_page=to_page)
remove_contents_table(sections, eng=is_english( remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200))) random_choices([t for t, _ in sections], k=200)))
tbls = [((None, lns), None) for lns in tbls] tbls=vision_figure_parser_docx_wrapper(sections=sections,tbls=tbls,callback=callback,**kwargs)
# tbls = [((None, lns), None) for lns in tbls]
sections=[(item[0],item[1] if item[1] is not None else "") for item in sections if not isinstance(item[1], Image.Image)]
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE): elif re.search(r"\.pdf$", filename, re.IGNORECASE):
@ -96,6 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = PlainParser() pdf_parser = PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary, sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback) from_page=from_page, to_page=to_page, callback=callback)
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
elif re.search(r"\.txt$", filename, re.IGNORECASE): elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")

View File

@ -23,6 +23,7 @@ from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from rag.utils import num_tokens_from_string from rag.utils import num_tokens_from_string
from deepdoc.parser import PdfParser, PlainParser, DocxParser from deepdoc.parser import PdfParser, PlainParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
from docx import Document from docx import Document
from PIL import Image from PIL import Image
@ -252,7 +253,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
tk_cnt = num_tokens_from_string(txt) tk_cnt = num_tokens_from_string(txt)
if sec_id > -1: if sec_id > -1:
last_sid = sec_id last_sid = sec_id
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
res = tokenize_table(tbls, doc, eng) res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res return res
@ -261,6 +262,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
docx_parser = Docx() docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary, ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback) from_page=0, to_page=10000, callback=callback)
tbls=vision_figure_parser_docx_wrapper(sections=sections,tbls=tbls,callback=callback,**kwargs)
res = tokenize_table(tbls, doc, eng) res = tokenize_table(tbls, doc, eng)
for text, image in ti_list: for text, image in ti_list:
d = copy.deepcopy(doc) d = copy.deepcopy(doc)

View File

@ -32,7 +32,7 @@ from api.db import LLMType
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from api.utils.file_utils import extract_embed_file from api.utils.file_utils import extract_embed_file
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.mineru_parser import MinerUParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
@ -475,24 +475,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE): if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
_SerializedRelationships.load_from_xml = load_from_xml_v2 _SerializedRelationships.load_from_xml = load_from_xml_v2
sections, tables = Docx()(filename, binary) sections, tables = Docx()(filename, binary)
if vision_model: tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback,**kwargs)
figures_data = vision_figure_parser_figure_data_wrapper(sections)
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback)
tables.extend(boosted_figures)
except Exception as e:
callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
res = tokenize_table(tables, doc, is_english) res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
@ -521,25 +510,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if layout_recognizer == "DeepDOC": if layout_recognizer == "DeepDOC":
pdf_parser = Pdf() pdf_parser = Pdf()
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True)
callback(0.5, "Basic parsing complete. Proceeding with figure enhancement...")
try:
pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs)
boosted_figures = pdf_vision_parser(callback=callback)
tables.extend(boosted_figures)
except Exception as e:
callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
tables.extend(figures)
else:
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
tables=vision_figure_parser_pdf_wrapper(tbls=tables,callback=callback,**kwargs)
res = tokenize_table(tables, doc, is_english) res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")

View File

@ -23,6 +23,7 @@ from deepdoc.parser.utils import get_text
from rag.app import naive from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
class Pdf(PdfParser): class Pdf(PdfParser):
@ -57,13 +58,8 @@ class Pdf(PdfParser):
sections = [(b["text"], self.get_position(b, zoomin)) sections = [(b["text"], self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)] for i, b in enumerate(self.boxes)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: ( return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000, def chunk(filename, binary=None, from_page=0, to_page=100000,
@ -80,6 +76,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE): if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
sections, tbls = naive.Docx()(filename, binary) sections, tbls = naive.Docx()(filename, binary)
tbls=vision_figure_parser_docx_wrapper(sections=sections,tbls=tbls,callback=callback,**kwargs)
sections = [s for s, _ in sections if s] sections = [s for s, _ in sections if s]
for (_, html), _ in tbls: for (_, html), _ in tbls:
sections.append(html) sections.append(html)
@ -89,8 +86,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = Pdf() pdf_parser = Pdf()
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text": if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser() pdf_parser = PlainParser()
sections, _ = pdf_parser( sections, tbls = pdf_parser(
filename if not binary else binary, to_page=to_page, callback=callback) filename if not binary else binary, to_page=to_page, callback=callback)
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
sections = [s for s, _ in sections if s] sections = [s for s, _ in sections if s]
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):

View File

@ -18,12 +18,12 @@ import logging
import copy import copy
import re import re
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
from api.db import ParserType from api.db import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser from deepdoc.parser import PdfParser, PlainParser
import numpy as np import numpy as np
class Pdf(PdfParser): class Pdf(PdfParser):
def __init__(self): def __init__(self):
self.model_speciess = ParserType.PAPER.value self.model_speciess = ParserType.PAPER.value
@ -160,6 +160,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = Pdf() pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary, paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback) from_page=from_page, to_page=to_page, callback=callback)
tbls=paper["tables"]
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
paper["tables"] = tbls
else: else:
raise NotImplementedError("file type not supported yet(pdf supported)") raise NotImplementedError("file type not supported yet(pdf supported)")