mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Optimize ocr (#5297)
### What problem does this PR solve? Introduced OCR.recognize_batch ### Type of change - [x] Performance Improvement
This commit is contained in:
@ -18,7 +18,7 @@
|
||||
# beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code
|
||||
import random
|
||||
import sys
|
||||
from api.utils.log_utils import initRootLogger
|
||||
from api.utils.log_utils import initRootLogger, get_project_base_directory
|
||||
from graphrag.general.index import WithCommunity, WithResolution, Dealer
|
||||
from graphrag.light.graph_extractor import GraphExtractor as LightKGExt
|
||||
from graphrag.general.graph_extractor import GraphExtractor as GeneralKGExt
|
||||
@ -42,6 +42,7 @@ from io import BytesIO
|
||||
from multiprocessing.context import TimeoutError
|
||||
from timeit import default_timer as timer
|
||||
import tracemalloc
|
||||
import signal
|
||||
|
||||
import numpy as np
|
||||
from peewee import DoesNotExist
|
||||
@ -96,6 +97,35 @@ DONE_TASKS = 0
|
||||
FAILED_TASKS = 0
|
||||
CURRENT_TASK = None
|
||||
|
||||
tracemalloc_started = False
|
||||
|
||||
# SIGUSR1 handler: start tracemalloc and take snapshot
|
||||
def start_tracemalloc_and_snapshot(signum, frame):
|
||||
global tracemalloc_started
|
||||
if not tracemalloc_started:
|
||||
logging.info("got SIGUSR1, start tracemalloc")
|
||||
tracemalloc.start()
|
||||
tracemalloc_started = True
|
||||
else:
|
||||
logging.info("got SIGUSR1, tracemalloc is already running")
|
||||
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
snapshot_file = f"snapshot_{timestamp}.trace"
|
||||
snapshot_file = os.path.abspath(os.path.join(get_project_base_directory(), "logs", f"{os.getpid()}_snapshot_{timestamp}.trace"))
|
||||
|
||||
snapshot = tracemalloc.take_snapshot()
|
||||
snapshot.dump(snapshot_file)
|
||||
logging.info(f"taken snapshot {snapshot_file}")
|
||||
|
||||
# SIGUSR2 handler: stop tracemalloc
|
||||
def stop_tracemalloc(signum, frame):
|
||||
global tracemalloc_started
|
||||
if tracemalloc_started:
|
||||
logging.info("go SIGUSR2, stop tracemalloc")
|
||||
tracemalloc.stop()
|
||||
tracemalloc_started = False
|
||||
else:
|
||||
logging.info("got SIGUSR2, tracemalloc not running")
|
||||
|
||||
class TaskCanceledException(Exception):
|
||||
def __init__(self, msg):
|
||||
@ -712,26 +742,18 @@ def main():
|
||||
logging.info(f'TaskExecutor: RAGFlow version: {get_ragflow_version()}')
|
||||
settings.init_settings()
|
||||
print_rag_settings()
|
||||
signal.signal(signal.SIGUSR1, start_tracemalloc_and_snapshot)
|
||||
signal.signal(signal.SIGUSR2, stop_tracemalloc)
|
||||
TRACE_MALLOC_ENABLED = int(os.environ.get('TRACE_MALLOC_ENABLED', "0"))
|
||||
if TRACE_MALLOC_ENABLED:
|
||||
start_tracemalloc_and_snapshot(None, None)
|
||||
|
||||
background_thread = threading.Thread(target=report_status)
|
||||
background_thread.daemon = True
|
||||
background_thread.start()
|
||||
|
||||
TRACE_MALLOC_DELTA = int(os.environ.get('TRACE_MALLOC_DELTA', "0"))
|
||||
TRACE_MALLOC_FULL = int(os.environ.get('TRACE_MALLOC_FULL', "0"))
|
||||
if TRACE_MALLOC_DELTA > 0:
|
||||
if TRACE_MALLOC_FULL < TRACE_MALLOC_DELTA:
|
||||
TRACE_MALLOC_FULL = TRACE_MALLOC_DELTA
|
||||
tracemalloc.start()
|
||||
snapshot1 = tracemalloc.take_snapshot()
|
||||
while True:
|
||||
handle_task()
|
||||
num_tasks = DONE_TASKS + FAILED_TASKS
|
||||
if TRACE_MALLOC_DELTA > 0 and num_tasks > 0 and num_tasks % TRACE_MALLOC_DELTA == 0:
|
||||
snapshot2 = tracemalloc.take_snapshot()
|
||||
analyze_heap(snapshot1, snapshot2, int(num_tasks / TRACE_MALLOC_DELTA), num_tasks % TRACE_MALLOC_FULL == 0)
|
||||
snapshot1 = snapshot2
|
||||
snapshot2 = None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user