feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Lin Manhui
2026-01-09 17:48:45 +08:00
committed by GitHub
parent 6abf55c048
commit 2e09db02f3
34 changed files with 1510 additions and 453 deletions

View File

@@ -148,7 +148,7 @@ Procedural Memory: Learned skills, habits, and automated procedures.`,
action: 'Action',
},
config: {
memorySizeTooltip: `Accounts for each message's content + its embedding vector (≈ Content + Dimensions × 8 Bytes).
memorySizeTooltip: `Accounts for each message's content + its embedding vector (≈ Content + Dimensions × 8 Bytes).
Example: A 1 KB message with 1024-dim embedding uses ~9 KB. The 5 MB default limit holds ~500 such messages.`,
avatar: 'Avatar',
description: 'Description',
@@ -424,6 +424,17 @@ Example: A 1 KB message with 1024-dim embedding uses ~9 KB. The 5 MB default lim
'Enable formula recognition. Note: This may not work correctly for Cyrillic documents.',
mineruTableEnable: 'Table recognition',
mineruTableEnableTip: 'Enable table recognition and extraction.',
// PaddleOCR parser strings: section title, then label / tip / placeholder
// text for the API URL, access token, algorithm, and model name fields.
paddleocrOptions: 'PaddleOCR Options',
paddleocrApiUrl: 'PaddleOCR API URL',
paddleocrApiUrlTip: 'The API endpoint URL for PaddleOCR service',
paddleocrApiUrlPlaceholder: 'e.g. https://paddleocr-server.com/layout-parsing',
paddleocrAccessToken: 'AI Studio Access Token',
paddleocrAccessTokenTip: 'Access token for PaddleOCR API (optional)',
paddleocrAccessTokenPlaceholder: 'Your AI Studio token (optional)',
paddleocrAlgorithm: 'PaddleOCR Algorithm',
paddleocrAlgorithmTip: 'Algorithm to use for PaddleOCR parsing',
paddleocrSelectAlgorithm: 'Select Algorithm',
paddleocrModelNamePlaceholder: 'e.g. paddleocr-from-env-1',
overlappedPercent: 'Overlapped percent(%)',
generationScopeTip:
'Determines whether RAPTOR is generated for the entire dataset or for a single file.',
@@ -1094,6 +1105,17 @@ Example: Virtual Hosted Style`,
modelTypeMessage: 'Please input your model type!',
addLlmBaseUrl: 'Base url',
baseUrlNameMessage: 'Please input your base url!',
paddleocr: {
apiUrl: 'PaddleOCR API URL',
apiUrlPlaceholder: 'For example: https://paddleocr-server.com/layout-parsing',
accessToken: 'AI Studio Access Token',
accessTokenPlaceholder: 'Your AI Studio token (optional)',
algorithm: 'PaddleOCR Algorithm',
selectAlgorithm: 'Select Algorithm',
modelNamePlaceholder: 'For example: paddleocr-from-env-1',
modelNameRequired: 'Model name is required',
apiUrlRequired: 'PaddleOCR API URL is required'
},
vision: 'Does it support Vision?',
ollamaLink: 'How to integrate {{name}}',
FishAudioLink: 'How to use FishAudio',