Refa: replace trio with asyncio (#11831)

### What problem does this PR solve?

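Change:
Replace trio with asyncio: `trio.CapacityLimiter` becomes `asyncio.Semaphore`, `trio.to_thread.run_sync` becomes `asyncio.to_thread`, nurseries become `asyncio.create_task` + `asyncio.gather`, and `trio.run` becomes `asyncio.run`.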

### Type of change
- [x] Refactoring
This commit is contained in:
buua436
2025-12-09 19:23:14 +08:00
committed by GitHub
parent ca2d6f3301
commit 65a5a56d95
31 changed files with 821 additions and 429 deletions

View File

@ -14,6 +14,7 @@
# limitations under the License.
#
import asyncio
import logging
import math
import os
@ -28,7 +29,6 @@ from timeit import default_timer as timer
import numpy as np
import pdfplumber
import trio
import xgboost as xgb
from huggingface_hub import snapshot_download
from PIL import Image
@ -65,7 +65,7 @@ class RAGFlowPdfParser:
self.ocr = OCR()
self.parallel_limiter = None
if settings.PARALLEL_DEVICES > 1:
self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(settings.PARALLEL_DEVICES)]
self.parallel_limiter = [asyncio.Semaphore(1) for _ in range(settings.PARALLEL_DEVICES)]
layout_recognizer_type = os.getenv("LAYOUT_RECOGNIZER_TYPE", "onnx").lower()
if layout_recognizer_type not in ["onnx", "ascend"]:
@ -382,7 +382,7 @@ class RAGFlowPdfParser:
else:
x0s.append([x])
x0s = np.array(x0s, dtype=float)
max_try = min(4, len(bxs))
if max_try < 2:
max_try = 1
@ -416,7 +416,7 @@ class RAGFlowPdfParser:
for pg, bxs in by_page.items():
if not bxs:
continue
k = page_cols[pg]
k = page_cols[pg]
if len(bxs) < k:
k = 1
x0s = np.array([[b["x0"]] for b in bxs], dtype=float)
@ -430,7 +430,7 @@ class RAGFlowPdfParser:
for b, lb in zip(bxs, labels):
b["col_id"] = remap[lb]
grouped = defaultdict(list)
for b in bxs:
grouped[b["col_id"]].append(b)
@ -1111,7 +1111,7 @@ class RAGFlowPdfParser:
if limiter:
async with limiter:
await trio.to_thread.run_sync(lambda: self.__ocr(i + 1, img, chars, zoomin, id))
await asyncio.to_thread(self.__ocr, i + 1, img, chars, zoomin, id)
else:
self.__ocr(i + 1, img, chars, zoomin, id)
@ -1127,12 +1127,34 @@ class RAGFlowPdfParser:
return chars
if self.parallel_limiter:
async with trio.open_nursery() as nursery:
for i, img in enumerate(self.page_images):
chars = __ocr_preprocess()
tasks = []
for i, img in enumerate(self.page_images):
chars = __ocr_preprocess()
semaphore = self.parallel_limiter[i % settings.PARALLEL_DEVICES]
async def wrapper(i=i, img=img, chars=chars, semaphore=semaphore):
await __img_ocr(
i,
i % settings.PARALLEL_DEVICES,
img,
chars,
semaphore,
)
tasks.append(asyncio.create_task(wrapper()))
await asyncio.sleep(0)
try:
await asyncio.gather(*tasks, return_exceptions=False)
except Exception as e:
logging.error(f"Error in OCR: {e}")
for t in tasks:
t.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
raise
nursery.start_soon(__img_ocr, i, i % settings.PARALLEL_DEVICES, img, chars, self.parallel_limiter[i % settings.PARALLEL_DEVICES])
await trio.sleep(0.1)
else:
for i, img in enumerate(self.page_images):
chars = __ocr_preprocess()
@ -1140,7 +1162,7 @@ class RAGFlowPdfParser:
start = timer()
trio.run(__img_ocr_launcher)
asyncio.run(__img_ocr_launcher())
logging.info(f"__images__ {len(self.page_images)} pages cost {timer() - start}s")

View File

@ -14,6 +14,8 @@
# limitations under the License.
#
import asyncio
import logging
import os
import sys
sys.path.insert(
@ -28,7 +30,6 @@ from deepdoc.vision.seeit import draw_box
from deepdoc.vision import OCR, init_in_out
import argparse
import numpy as np
import trio
# os.environ['CUDA_VISIBLE_DEVICES'] = '0,2' #2 gpus, uncontinuous
os.environ['CUDA_VISIBLE_DEVICES'] = '0' #1 gpu
@ -39,7 +40,7 @@ def main(args):
import torch.cuda
cuda_devices = torch.cuda.device_count()
limiter = [trio.CapacityLimiter(1) for _ in range(cuda_devices)] if cuda_devices > 1 else None
limiter = [asyncio.Semaphore(1) for _ in range(cuda_devices)] if cuda_devices > 1 else None
ocr = OCR()
images, outputs = init_in_out(args)
@ -62,22 +63,29 @@ def main(args):
async def __ocr_thread(i, id, img, limiter = None):
if limiter:
async with limiter:
print("Task {} use device {}".format(i, id))
await trio.to_thread.run_sync(lambda: __ocr(i, id, img))
print(f"Task {i} use device {id}")
await asyncio.to_thread(__ocr, i, id, img)
else:
__ocr(i, id, img)
await asyncio.to_thread(__ocr, i, id, img)
async def __ocr_launcher():
if cuda_devices > 1:
async with trio.open_nursery() as nursery:
for i, img in enumerate(images):
nursery.start_soon(__ocr_thread, i, i % cuda_devices, img, limiter[i % cuda_devices])
await trio.sleep(0.1)
else:
for i, img in enumerate(images):
await __ocr_thread(i, 0, img)
tasks = []
for i, img in enumerate(images):
dev_id = i % cuda_devices if cuda_devices > 1 else 0
semaphore = limiter[dev_id] if limiter else None
tasks.append(asyncio.create_task(__ocr_thread(i, dev_id, img, semaphore)))
trio.run(__ocr_launcher)
try:
await asyncio.gather(*tasks, return_exceptions=False)
except Exception as e:
logging.error("OCR tasks failed: {}".format(e))
for t in tasks:
t.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
raise
asyncio.run(__ocr_launcher())
print("OCR tasks are all done")