feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-03 00:55:10 +08:00 · 2026-01-09 17:48:45 +08:00
parent 6abf55c048
commit 2e09db02f3
34 changed files with 1510 additions and 453 deletions
--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@@ -148,7 +148,7 @@ Procedural Memory: Learned skills, habits, and automated procedures.`,
        action: 'Action',
      },
      config: {
-        memorySizeTooltip: `Accounts for each message's content + its embedding vector (≈ Content + Dimensions × 8 Bytes). 
+        memorySizeTooltip: `Accounts for each message's content + its embedding vector (≈ Content + Dimensions × 8 Bytes).
 Example: A 1 KB message with 1024-dim embedding uses ~9 KB. The 5 MB default limit holds ~500 such messages.`,
        avatar: 'Avatar',
        description: 'Description',
@@ -424,6 +424,17 @@ Example: A 1 KB message with 1024-dim embedding uses ~9 KB. The 5 MB default lim
        'Enable formula recognition. Note: This may not work correctly for Cyrillic documents.',
      mineruTableEnable: 'Table recognition',
      mineruTableEnableTip: 'Enable table recognition and extraction.',
+      paddleocrOptions: 'PaddleOCR Options',
+      paddleocrApiUrl: 'PaddleOCR API URL',
+      paddleocrApiUrlTip: 'The API endpoint URL for PaddleOCR service',
+      paddleocrApiUrlPlaceholder: 'e.g. https://paddleocr-server.com/layout-parsing',
+      paddleocrAccessToken: 'AI Studio Access Token',
+      paddleocrAccessTokenTip: 'Access token for PaddleOCR API (optional)',
+      paddleocrAccessTokenPlaceholder: 'Your AI Studio token (optional)',
+      paddleocrAlgorithm: 'PaddleOCR Algorithm',
+      paddleocrAlgorithmTip: 'Algorithm to use for PaddleOCR parsing',
+      paddleocrSelectAlgorithm: 'Select Algorithm',
+      paddleocrModelNamePlaceholder: 'e.g. paddleocr-from-env-1',
      overlappedPercent: 'Overlapped percent(%)',
      generationScopeTip:
        'Determines whether RAPTOR is generated for the entire dataset or for a single file.',
@@ -1094,6 +1105,17 @@ Example: Virtual Hosted Style`,
      modelTypeMessage: 'Please input your model type!',
      addLlmBaseUrl: 'Base url',
      baseUrlNameMessage: 'Please input your base url!',
+      paddleocr: {
+        apiUrl: 'PaddleOCR API URL',
+        apiUrlPlaceholder: 'For example: https://paddleocr-server.com/layout-parsing',
+        accessToken: 'AI Studio Access Token',
+        accessTokenPlaceholder: 'Your AI Studio token (optional)',
+        algorithm: 'PaddleOCR Algorithm',
+        selectAlgorithm: 'Select Algorithm',
+        modelNamePlaceholder: 'For example: paddleocr-from-env-1',
+        modelNameRequired: 'Model name is required',
+        apiUrlRequired: 'PaddleOCR API URL is required'
+      },
      vision: 'Does it support Vision?',
      ollamaLink: 'How to integrate {{name}}',
      FishAudioLink: 'How to use FishAudio',