Feat: add vision LLM PDF parser (#6173)

### What problem does this PR solve?

Add vision LLM PDF parser

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2026-02-01 08:05:07 +08:00 · 2025-03-18 14:52:20 +08:00
parent 897fe85b5c
commit 5cf610af40
7 changed files with 413 additions and 102 deletions
--- a/rag/utils/__init__.py
+++ b/rag/utils/__init__.py
@@ -16,7 +16,9 @@

 import os
 import re
+
 import tiktoken
+
 from api.utils.file_utils import get_project_base_directory


@@ -54,7 +56,7 @@ def findMaxDt(fnm):
        pass
    return m

-  
+
 def findMaxTm(fnm):
    m = 0
    try:
@@ -91,11 +93,18 @@ def truncate(string: str, max_len: int) -> str:
    """Returns truncated text if the length of text exceed max_len."""
    return encoder.decode(encoder.encode(string)[:max_len])

+  
+def clean_markdown_block(text):
+    text = re.sub(r'^\s*```markdown\s*\n?', '', text)
+    text = re.sub(r'\n?\s*```\s*$', '', text)
+    return text.strip()

+  
 def get_float(v: str | None):
    if v is None:
        return float('-inf')
    try:
        return float(v)
    except Exception:
-        return float('-inf')
+        return float('-inf')
+