mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: add OCR's muti-gpus and parallel processing support (#5972)
### What problem does this PR solve? Add OCR's muti-gpus and parallel processing support ### Type of change - [x] New Feature (non-breaking change which adds functionality) @yuzhichang I've tried to resolve the comments in #5697. OCR jobs can now be done on both CPU and GPU. ( By the way, I've encountered a “Generate embedding error” issue #5954 that might be due to my outdated GPUs? idk. ) Please review it and give me suggestions. GPU:   CPU: 
This commit is contained in:
@ -128,6 +128,9 @@ class Docx(DocxParser):
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
def __init__(self, parallel_devices = None):
|
||||
super().__init__(parallel_devices)
|
||||
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, zoomin=3, callback=None):
|
||||
start = timer()
|
||||
@ -194,7 +197,7 @@ class Markdown(MarkdownParser):
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang="Chinese", callback=None, **kwargs):
|
||||
lang="Chinese", parallel_devices=None, callback=None, **kwargs):
|
||||
"""
|
||||
Supported file formats are docx, pdf, excel, txt.
|
||||
This method apply the naive ways to chunk files.
|
||||
@ -234,7 +237,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
return res
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf()
|
||||
pdf_parser = Pdf(parallel_devices)
|
||||
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
|
||||
pdf_parser = PlainParser()
|
||||
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
|
||||
|
||||
Reference in New Issue
Block a user