feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Lin Manhui
2026-01-09 17:48:45 +08:00
committed by GitHub
parent 6abf55c048
commit 2e09db02f3
34 changed files with 1510 additions and 453 deletions

View File

@ -20,6 +20,7 @@ from strenum import StrEnum
SERVICE_CONF = "service_conf.yaml"
RAG_FLOW_SERVICE_NAME = "ragflow"
class CustomEnum(Enum):
@classmethod
def valid(cls, value):
@ -68,13 +69,13 @@ class ActiveEnum(Enum):
class LLMType(StrEnum):
CHAT = 'chat'
EMBEDDING = 'embedding'
SPEECH2TEXT = 'speech2text'
IMAGE2TEXT = 'image2text'
RERANK = 'rerank'
TTS = 'tts'
OCR = 'ocr'
CHAT = "chat"
EMBEDDING = "embedding"
SPEECH2TEXT = "speech2text"
IMAGE2TEXT = "image2text"
RERANK = "rerank"
TTS = "tts"
OCR = "ocr"
class TaskStatus(StrEnum):
@ -86,8 +87,7 @@ class TaskStatus(StrEnum):
SCHEDULE = "5"
VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL,
TaskStatus.SCHEDULE}
VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL, TaskStatus.SCHEDULE}
class ParserType(StrEnum):
@ -136,6 +136,7 @@ class FileSource(StrEnum):
BITBUCKET = "bitbucket"
ZENDESK = "zendesk"
class PipelineTaskType(StrEnum):
PARSE = "Parse"
DOWNLOAD = "Download"
@ -145,15 +146,17 @@ class PipelineTaskType(StrEnum):
MEMORY = "Memory"
VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR,
PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}
VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR, PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}
class MCPServerType(StrEnum):
SSE = "sse"
STREAMABLE_HTTP = "streamable-http"
VALID_MCP_SERVER_TYPES = {MCPServerType.SSE, MCPServerType.STREAMABLE_HTTP}
class Storage(Enum):
MINIO = 1
AZURE_SPN = 2
@ -165,10 +168,10 @@ class Storage(Enum):
class MemoryType(Enum):
RAW = 0b0001 # 1 << 0 = 1 (0b00000001)
SEMANTIC = 0b0010 # 1 << 1 = 2 (0b00000010)
EPISODIC = 0b0100 # 1 << 2 = 4 (0b00000100)
PROCEDURAL = 0b1000 # 1 << 3 = 8 (0b00001000)
RAW = 0b0001 # 1 << 0 = 1 (0b00000001)
SEMANTIC = 0b0010 # 1 << 1 = 2 (0b00000010)
EPISODIC = 0b0100 # 1 << 2 = 4 (0b00000100)
PROCEDURAL = 0b1000 # 1 << 3 = 8 (0b00001000)
class MemoryStorageType(StrEnum):
@ -239,3 +242,10 @@ MINERU_DEFAULT_CONFIG = {
"MINERU_SERVER_URL": "",
"MINERU_DELETE_OUTPUT": 1,
}
PADDLEOCR_ENV_KEYS = ["PADDLEOCR_API_URL", "PADDLEOCR_ACCESS_TOKEN", "PADDLEOCR_ALGORITHM"]
PADDLEOCR_DEFAULT_CONFIG = {
"PADDLEOCR_API_URL": "",
"PADDLEOCR_ACCESS_TOKEN": None,
"PADDLEOCR_ALGORITHM": "PaddleOCR-VL",
}

View File

@ -26,5 +26,8 @@ def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str |
if lowered.endswith("@mineru"):
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
layout_recognizer = "MinerU"
elif lowered.endswith("@paddleocr"):
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
layout_recognizer = "PaddleOCR"
return layout_recognizer, parser_model_name