Feat: add vision LLM PDF parser (#6173)

### What problem does this PR solve?

Add a PDF parser backed by a vision LLM.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
Yongteng Lei
2025-03-18 14:52:20 +08:00
committed by GitHub
parent 897fe85b5c
commit 5cf610af40
7 changed files with 413 additions and 102 deletions

View File

@ -16,7 +16,9 @@
import os
import re
import tiktoken
from api.utils.file_utils import get_project_base_directory
@ -54,7 +56,7 @@ def findMaxDt(fnm):
pass
return m
def findMaxTm(fnm):
m = 0
try:
@ -91,11 +93,18 @@ def truncate(string: str, max_len: int) -> str:
"""Returns truncated text if the length of text exceed max_len."""
return encoder.decode(encoder.encode(string)[:max_len])
def clean_markdown_block(text):
    """Remove a surrounding ```markdown code fence from *text*.

    An opening ```markdown marker (with any whitespace around it) is
    removed only when it sits at the very start of the string, and a
    closing ``` marker only when it sits at the very end. The remaining
    text is returned with leading and trailing whitespace stripped.
    Text without either marker is simply stripped.
    """
    # Drop the opening fence; "^" anchors to the start of the whole string.
    without_opening = re.sub(r'^\s*```markdown\s*\n?', '', text)
    # Drop the closing fence; "$" anchors to the end of the whole string.
    without_fences = re.sub(r'\n?\s*```\s*$', '', without_opening)
    return without_fences.strip()
def get_float(v: str | None):
if v is None:
return float('-inf')
try:
return float(v)
except Exception:
return float('-inf')
return float('-inf')