Feat: add multi-GPU and parallel processing support for OCR (#5972)

### What problem does this PR solve?

Add multi-GPU and parallel processing support for OCR.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

@yuzhichang I've tried to address the comments in #5697. OCR jobs can now run on both CPU and GPU. (By the way, I've encountered a "Generate embedding error" issue, #5954, which might be due to my outdated GPUs; I'm not sure.) Please review it and give me suggestions.

GPU:

![gpu_ocr](https://github.com/user-attachments/assets/0ee2ecfb-a665-4e50-8bc7-15941b9cd80e)
![smi](https://github.com/user-attachments/assets/a2312f8c-cf24-443d-bf89-bec50503546d)

CPU:

![cpu_ocr](https://github.com/user-attachments/assets/1ba6bb0b-94df-41ea-be79-790096da4bf1)
2026-02-01 16:15:07 +08:00 · 2025-03-17 11:58:40 +08:00
parent 8495036ff9
commit 3e19044dee
5 changed files with 157 additions and 48 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -128,6 +128,9 @@ class Docx(DocxParser):


 class Pdf(PdfParser):
+    def __init__(self, parallel_devices = None):
+        super().__init__(parallel_devices)
+
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        start = timer()
@ -194,7 +197,7 @@ class Markdown(MarkdownParser):


 def chunk(filename, binary=None, from_page=0, to_page=100000,
-          lang="Chinese", callback=None, **kwargs):
+          lang="Chinese", parallel_devices=None, callback=None, **kwargs):
    """
        Supported file formats are docx, pdf, excel, txt.
        This method apply the naive ways to chunk files.
@ -234,7 +237,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        return res

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf()
+        pdf_parser = Pdf(parallel_devices)
        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
            pdf_parser = PlainParser()
        sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,