mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-31 23:55:06 +08:00
feat: add paddleocr parser (#12513)
### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -20,6 +20,7 @@ from strenum import StrEnum
|
||||
SERVICE_CONF = "service_conf.yaml"
|
||||
RAG_FLOW_SERVICE_NAME = "ragflow"
|
||||
|
||||
|
||||
class CustomEnum(Enum):
|
||||
@classmethod
|
||||
def valid(cls, value):
|
||||
@ -68,13 +69,13 @@ class ActiveEnum(Enum):
|
||||
|
||||
|
||||
class LLMType(StrEnum):
|
||||
CHAT = 'chat'
|
||||
EMBEDDING = 'embedding'
|
||||
SPEECH2TEXT = 'speech2text'
|
||||
IMAGE2TEXT = 'image2text'
|
||||
RERANK = 'rerank'
|
||||
TTS = 'tts'
|
||||
OCR = 'ocr'
|
||||
CHAT = "chat"
|
||||
EMBEDDING = "embedding"
|
||||
SPEECH2TEXT = "speech2text"
|
||||
IMAGE2TEXT = "image2text"
|
||||
RERANK = "rerank"
|
||||
TTS = "tts"
|
||||
OCR = "ocr"
|
||||
|
||||
|
||||
class TaskStatus(StrEnum):
|
||||
@ -86,8 +87,7 @@ class TaskStatus(StrEnum):
|
||||
SCHEDULE = "5"
|
||||
|
||||
|
||||
VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL,
|
||||
TaskStatus.SCHEDULE}
|
||||
VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL, TaskStatus.SCHEDULE}
|
||||
|
||||
|
||||
class ParserType(StrEnum):
|
||||
@ -136,6 +136,7 @@ class FileSource(StrEnum):
|
||||
BITBUCKET = "bitbucket"
|
||||
ZENDESK = "zendesk"
|
||||
|
||||
|
||||
class PipelineTaskType(StrEnum):
|
||||
PARSE = "Parse"
|
||||
DOWNLOAD = "Download"
|
||||
@ -145,15 +146,17 @@ class PipelineTaskType(StrEnum):
|
||||
MEMORY = "Memory"
|
||||
|
||||
|
||||
VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR,
|
||||
PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}
|
||||
VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR, PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}
|
||||
|
||||
|
||||
class MCPServerType(StrEnum):
|
||||
SSE = "sse"
|
||||
STREAMABLE_HTTP = "streamable-http"
|
||||
|
||||
|
||||
VALID_MCP_SERVER_TYPES = {MCPServerType.SSE, MCPServerType.STREAMABLE_HTTP}
|
||||
|
||||
|
||||
class Storage(Enum):
|
||||
MINIO = 1
|
||||
AZURE_SPN = 2
|
||||
@ -165,10 +168,10 @@ class Storage(Enum):
|
||||
|
||||
|
||||
class MemoryType(Enum):
|
||||
RAW = 0b0001 # 1 << 0 = 1 (0b00000001)
|
||||
SEMANTIC = 0b0010 # 1 << 1 = 2 (0b00000010)
|
||||
EPISODIC = 0b0100 # 1 << 2 = 4 (0b00000100)
|
||||
PROCEDURAL = 0b1000 # 1 << 3 = 8 (0b00001000)
|
||||
RAW = 0b0001 # 1 << 0 = 1 (0b00000001)
|
||||
SEMANTIC = 0b0010 # 1 << 1 = 2 (0b00000010)
|
||||
EPISODIC = 0b0100 # 1 << 2 = 4 (0b00000100)
|
||||
PROCEDURAL = 0b1000 # 1 << 3 = 8 (0b00001000)
|
||||
|
||||
|
||||
class MemoryStorageType(StrEnum):
|
||||
@ -239,3 +242,10 @@ MINERU_DEFAULT_CONFIG = {
|
||||
"MINERU_SERVER_URL": "",
|
||||
"MINERU_DELETE_OUTPUT": 1,
|
||||
}
|
||||
|
||||
PADDLEOCR_ENV_KEYS = ["PADDLEOCR_API_URL", "PADDLEOCR_ACCESS_TOKEN", "PADDLEOCR_ALGORITHM"]
|
||||
PADDLEOCR_DEFAULT_CONFIG = {
|
||||
"PADDLEOCR_API_URL": "",
|
||||
"PADDLEOCR_ACCESS_TOKEN": None,
|
||||
"PADDLEOCR_ALGORITHM": "PaddleOCR-VL",
|
||||
}
|
||||
|
||||
@ -26,5 +26,8 @@ def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str |
|
||||
if lowered.endswith("@mineru"):
|
||||
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
|
||||
layout_recognizer = "MinerU"
|
||||
elif lowered.endswith("@paddleocr"):
|
||||
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
|
||||
layout_recognizer = "PaddleOCR"
|
||||
|
||||
return layout_recognizer, parser_model_name
|
||||
|
||||
Reference in New Issue
Block a user