mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Made task_executor async to speedup parsing (#5530)
### What problem does this PR solve? Made task_executor async to speedup parsing ### Type of change - [x] Performance Improvement
This commit is contained in:
@ -17,6 +17,7 @@ import json
|
||||
import re
|
||||
import traceback
|
||||
from copy import deepcopy
|
||||
import trio
|
||||
from api.db.db_models import APIToken
|
||||
|
||||
from api.db.services.conversation_service import ConversationService, structure_answer
|
||||
@ -386,7 +387,8 @@ def mindmap():
|
||||
rank_feature=label_question(question, [kb])
|
||||
)
|
||||
mindmap = MindMapExtractor(chat_mdl)
|
||||
mind_map = mindmap([c["content_with_weight"] for c in ranks["chunks"]]).output
|
||||
mind_map = trio.run(mindmap, [c["content_with_weight"] for c in ranks["chunks"]])
|
||||
mind_map = mind_map.output
|
||||
if "error" in mind_map:
|
||||
return server_error_response(Exception(mind_map["error"]))
|
||||
return get_json_result(data=mind_map)
|
||||
|
||||
@ -22,6 +22,7 @@ from concurrent.futures import ThreadPoolExecutor
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
import trio
|
||||
|
||||
from peewee import fn
|
||||
|
||||
@ -597,8 +598,8 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
|
||||
if parser_ids[doc_id] != ParserType.PICTURE.value:
|
||||
mindmap = MindMapExtractor(llm_bdl)
|
||||
try:
|
||||
mind_map = json.dumps(mindmap([c["content_with_weight"] for c in docs if c["doc_id"] == doc_id]).output,
|
||||
ensure_ascii=False, indent=2)
|
||||
mind_map = trio.run(mindmap, [c["content_with_weight"] for c in docs if c["doc_id"] == doc_id])
|
||||
mind_map = json.dumps(mind_map.output, ensure_ascii=False, indent=2)
|
||||
if len(mind_map) < 32:
|
||||
raise Exception("Few content: " + mind_map)
|
||||
cks.append({
|
||||
|
||||
@ -17,6 +17,8 @@ import base64
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import threading
|
||||
from io import BytesIO
|
||||
|
||||
import pdfplumber
|
||||
@ -30,6 +32,10 @@ from api.constants import IMG_BASE64_PREFIX
|
||||
PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
|
||||
RAG_BASE = os.getenv("RAG_BASE")
|
||||
|
||||
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
|
||||
if LOCK_KEY_pdfplumber not in sys.modules:
|
||||
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
|
||||
|
||||
|
||||
def get_project_base_directory(*args):
|
||||
global PROJECT_BASE
|
||||
@ -175,19 +181,20 @@ def thumbnail_img(filename, blob):
|
||||
"""
|
||||
filename = filename.lower()
|
||||
if re.match(r".*\.pdf$", filename):
|
||||
pdf = pdfplumber.open(BytesIO(blob))
|
||||
buffered = BytesIO()
|
||||
resolution = 32
|
||||
img = None
|
||||
for _ in range(10):
|
||||
# https://github.com/jsvine/pdfplumber?tab=readme-ov-file#creating-a-pageimage-with-to_image
|
||||
pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
|
||||
img = buffered.getvalue()
|
||||
if len(img) >= 64000 and resolution >= 2:
|
||||
resolution = resolution / 2
|
||||
buffered = BytesIO()
|
||||
else:
|
||||
break
|
||||
with sys.modules[LOCK_KEY_pdfplumber]:
|
||||
pdf = pdfplumber.open(BytesIO(blob))
|
||||
buffered = BytesIO()
|
||||
resolution = 32
|
||||
img = None
|
||||
for _ in range(10):
|
||||
# https://github.com/jsvine/pdfplumber?tab=readme-ov-file#creating-a-pageimage-with-to_image
|
||||
pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
|
||||
img = buffered.getvalue()
|
||||
if len(img) >= 64000 and resolution >= 2:
|
||||
resolution = resolution / 2
|
||||
buffered = BytesIO()
|
||||
else:
|
||||
break
|
||||
pdf.close()
|
||||
return img
|
||||
|
||||
|
||||
@ -18,6 +18,8 @@ import os.path
|
||||
import logging
|
||||
from logging.handlers import RotatingFileHandler
|
||||
|
||||
initialized_root_logger = False
|
||||
|
||||
def get_project_base_directory():
|
||||
PROJECT_BASE = os.path.abspath(
|
||||
os.path.join(
|
||||
@ -29,10 +31,13 @@ def get_project_base_directory():
|
||||
return PROJECT_BASE
|
||||
|
||||
def initRootLogger(logfile_basename: str, log_format: str = "%(asctime)-15s %(levelname)-8s %(process)d %(message)s"):
|
||||
logger = logging.getLogger()
|
||||
if logger.hasHandlers():
|
||||
global initialized_root_logger
|
||||
if initialized_root_logger:
|
||||
return
|
||||
initialized_root_logger = True
|
||||
|
||||
logger = logging.getLogger()
|
||||
logger.handlers.clear()
|
||||
log_path = os.path.abspath(os.path.join(get_project_base_directory(), "logs", f"{logfile_basename}.log"))
|
||||
|
||||
os.makedirs(os.path.dirname(log_path), exist_ok=True)
|
||||
|
||||
Reference in New Issue
Block a user