Feat: add vision LLM PDF parser (#6173)

### What problem does this PR solve?

Add vision LLM PDF parser

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
Yongteng Lei
2025-03-18 14:52:20 +08:00
committed by GitHub
parent 897fe85b5c
commit 5cf610af40
7 changed files with 413 additions and 102 deletions

View File

@ -21,8 +21,9 @@ from PIL import Image
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from rag.nlp import tokenize
from deepdoc.vision import OCR
from rag.nlp import tokenize
from rag.utils import clean_markdown_block
ocr = OCR()
@ -57,3 +58,32 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
callback(prog=-1, msg=str(e))
return []
def vision_llm_chunk(binary, vision_model, prompt=None, callback=None):
    """
    Convert a single image to markdown text via a vision LLM (VLM).

    Args:
        binary: Image object exposing ``save(fp, format=...)``. Despite the
            name this is not raw bytes; presumably a ``PIL.Image.Image``
            (confirm against callers).
        vision_model: Object exposing
            ``describe_with_prompt(image_bytes, prompt)``.
        prompt: Prompt forwarded unchanged to the model; may be None.
        callback: Optional ``callback(prog, msg)``. On failure it is invoked
            with ``prog=-1`` and the error text.

    Returns:
        The model output passed through ``clean_markdown_block`` and prefixed
        with a newline, or an empty string if any step fails. The failure
        value is a ``str`` (not a list) so the return type is the same on
        every path.
    """
    callback = callback or (lambda prog, msg: None)

    try:
        # The model consumes encoded bytes, so serialize the image to JPEG
        # in memory rather than passing the image object itself.
        buffer = io.BytesIO()
        binary.save(buffer, format='JPEG')

        answer = clean_markdown_block(
            vision_model.describe_with_prompt(buffer.getvalue(), prompt)
        )
        return "\n" + answer
    except Exception as e:
        # Deliberate best-effort boundary: report the error through the
        # callback and hand back an empty (falsy) string instead of raising.
        callback(-1, str(e))
        return ""