Feat: add VLM-boosted PDF parser (#6278)

### What problem does this PR solve?

Add VLM-boosted PDF parser if VLM is set.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-01-31 07:36:46 +08:00 · 2025-03-20 09:39:32 +08:00
parent 344727f9ba
commit 1d6760dd84
5 changed files with 181 additions and 33 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -30,6 +30,7 @@ from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
+from deepdoc.parser.figure_parser import VisionFigureParser
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
 from rag.utils import num_tokens_from_string

@ -134,7 +135,7 @@ class Pdf(PdfParser):
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
-                 to_page=100000, zoomin=3, callback=None):
+                 to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
        start = timer()
        first_start = start
        callback(msg="OCR started")
@ -159,14 +160,19 @@ class Pdf(PdfParser):
        start = timer()
        self._text_merge()
        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))
-        tbls = self._extract_table_figure(True, zoomin, True, True)
-        # self._naive_vertical_merge()
-        self._concat_downward()
-        # self._filter_forpages()

-        logging.info("layouts cost: {}s".format(timer() - first_start))
-        return [(b["text"], self._line_tag(b, zoomin))
-                for b in self.boxes], tbls
+        if separate_tables_figures:
+            tbls, figures = self._extract_table_figure(True, zoomin, True, True, True)
+            self._concat_downward()
+            logging.info("layouts cost: {}s".format(timer() - first_start))
+            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures
+        else:
+            tbls = self._extract_table_figure(True, zoomin, True, True)
+            # self._naive_vertical_merge()
+            self._concat_downward()
+            # self._filter_forpages()
+            logging.info("layouts cost: {}s".format(timer() - first_start))
+            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls


 class Markdown(MarkdownParser):
@ -243,15 +249,32 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

        if layout_recognizer == "DeepDOC":
            pdf_parser = Pdf()
-        elif layout_recognizer == "Plain Text":
-            pdf_parser = PlainParser()
-        else:
-            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
-            pdf_parser = VisionParser(vision_model=vision_model, **kwargs)

-        sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
-                                      callback=callback)
-        res = tokenize_table(tables, doc, is_english)
+            try:
+                vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
+            except Exception:
+                vision_model = None
+
+            if vision_model:
+                sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True)
+                pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs)
+                boosted_figures = pdf_vision_parser(callback=callback)
+                tables.extend(boosted_figures)
+            else:
+                sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
+
+            res = tokenize_table(tables, doc, is_english)
+
+        else:
+            if layout_recognizer == "Plain Text":
+                pdf_parser = PlainParser()
+            else:
+                vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
+                pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
+
+            sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
+                                          callback=callback)
+            res = tokenize_table(tables, doc, is_english)

    elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
--- a/rag/app/picture.py
+++ b/rag/app/picture.py
@ -86,4 +86,4 @@ def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
    except Exception as e:
        callback(-1, str(e))

-    return []
+    return ""
--- a/rag/prompts.py
+++ b/rag/prompts.py
@ -393,3 +393,28 @@ FAILURE HANDLING:
 - If you do not detect valid content in the image, return an empty string.
 """
    return prompt_en
+
+
+def vision_llm_figure_describe_prompt() -> str:
+    prompt = """
+You are an expert visual data analyst. Analyze the image and provide a comprehensive description of its content. Focus on identifying the type of visual data representation (e.g., bar chart, pie chart, line graph, table, flowchart), its structure, and any text captions or labels included in the image.
+
+Tasks:
+1. Describe the overall structure of the visual representation. Specify if it is a chart, graph, table, or diagram.
+2. Identify and extract any axes, legends, titles, or labels present in the image. Provide the exact text where available.
+3. Extract the data points from the visual elements (e.g., bar heights, line graph coordinates, pie chart segments, table rows and columns).
+4. Analyze and explain any trends, comparisons, or patterns shown in the data.
+5. Capture any annotations, captions, or footnotes, and explain their relevance to the image.
+6. Only include details that are explicitly present in the image. If an element (e.g., axis, legend, or caption) does not exist or is not visible, do not mention it.
+
+Output format (include only sections relevant to the image content):
+- Visual Type: [Type]
+- Title: [Title text, if available]
+- Axes / Legends / Labels: [Details, if available]
+- Data Points: [Extracted data]
+- Trends / Insights: [Analysis and interpretation]
+- Captions / Annotations: [Text and relevance, if available]
+
+Ensure high accuracy, clarity, and completeness in your analysis, and includes only the information present in the image. Avoid unnecessary statements about missing elements.
+"""
+    return prompt