Feat: add vision LLM PDF parser (#6173)

### What problem does this PR solve?

Add vision LLM PDF parser

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2026-02-03 09:05:07 +08:00 · 2025-03-18 14:52:20 +08:00
parent 897fe85b5c
commit 5cf610af40
7 changed files with 413 additions and 102 deletions
--- a/rag/app/picture.py
+++ b/rag/app/picture.py
@@ -21,8 +21,9 @@ from PIL import Image

 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
-from rag.nlp import tokenize
 from deepdoc.vision import OCR
+from rag.nlp import tokenize
+from rag.utils import clean_markdown_block

 ocr = OCR()

@@ -57,3 +58,32 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
        callback(prog=-1, msg=str(e))

    return []
+
+
+def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
+    """
+    Describe a single image as markdown text via a vision LLM (VLM).
+
+    Args:
+        binary: Image object exposing ``save(fp, format=...)``; despite the
+            name it is not raw bytes (presumably a PIL image - confirm).
+        vision_model: Object exposing ``describe_with_prompt(bytes, prompt)``.
+        prompt: Prompt forwarded unchanged to the vision model.
+        callback: Optional ``callback(prog, msg)``; on failure it is called
+            with ``-1`` and the error text.
+
+    Returns:
+        Cleaned model output prefixed with a newline, or "" on failure.
+    """
+    callback = callback or (lambda prog, msg: None)
+    try:
+        img_binary = io.BytesIO()
+        binary.save(img_binary, format='JPEG')
+        # getvalue() returns the whole buffer, so no seek(0)/read() is needed.
+        ans = clean_markdown_block(vision_model.describe_with_prompt(img_binary.getvalue(), prompt))
+        return "\n" + ans
+    except Exception as e:
+        # Best-effort: report the failure and fall through to an empty result.
+        callback(-1, str(e))
+    # An empty string keeps the return type consistent and is still falsy.
+    return ""