mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: Redesign and refactor agent module (#9113)
### What problem does this PR solve? #9082 #6365 <u> **WARNING: it's not compatible with the older version of `Agent` module, which means that `Agent` from older versions can not work anymore.**</u> ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
@ -22,7 +21,6 @@ from pathlib import Path
|
||||
from flask_login import current_user
|
||||
from peewee import fn
|
||||
|
||||
from api.constants import FILE_NAME_LEN_LIMIT
|
||||
from api.db import KNOWLEDGEBASE_FOLDER_NAME, FileSource, FileType, ParserType
|
||||
from api.db.db_models import DB, Document, File, File2Document, Knowledgebase
|
||||
from api.db.services import duplicate_name
|
||||
@ -31,6 +29,7 @@ from api.db.services.document_service import DocumentService
|
||||
from api.db.services.file2document_service import File2DocumentService
|
||||
from api.utils import get_uuid
|
||||
from api.utils.file_utils import filename_type, read_potential_broken_pdf, thumbnail_img
|
||||
from rag.llm.cv_model import GptV4
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
|
||||
|
||||
@ -411,12 +410,7 @@ class FileService(CommonService):
|
||||
err, files = [], []
|
||||
for file in file_objs:
|
||||
try:
|
||||
MAX_FILE_NUM_PER_USER = int(os.environ.get("MAX_FILE_NUM_PER_USER", 0))
|
||||
if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(kb.tenant_id) >= MAX_FILE_NUM_PER_USER:
|
||||
raise RuntimeError("Exceed the maximum file number of a free user!")
|
||||
if len(file.filename.encode("utf-8")) > FILE_NAME_LEN_LIMIT:
|
||||
raise RuntimeError(f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.")
|
||||
|
||||
DocumentService.check_doc_health(kb.tenant_id, file.filename)
|
||||
filename = duplicate_name(DocumentService.query, name=file.filename, kb_id=kb.id)
|
||||
filetype = filename_type(filename)
|
||||
if filetype == FileType.OTHER.value:
|
||||
@ -463,6 +457,19 @@ class FileService(CommonService):
|
||||
|
||||
@staticmethod
|
||||
def parse_docs(file_objs, user_id):
|
||||
exe = ThreadPoolExecutor(max_workers=12)
|
||||
threads = []
|
||||
for file in file_objs:
|
||||
threads.append(exe.submit(FileService.parse, file.filename, file.read(), False))
|
||||
|
||||
res = []
|
||||
for th in threads:
|
||||
res.append(th.result())
|
||||
|
||||
return "\n\n".join(res)
|
||||
|
||||
@staticmethod
|
||||
def parse(filename, blob, img_base64=True, tenant_id=None):
|
||||
from rag.app import audio, email, naive, picture, presentation
|
||||
|
||||
def dummy(prog=None, msg=""):
|
||||
@ -470,19 +477,12 @@ class FileService(CommonService):
|
||||
|
||||
FACTORY = {ParserType.PRESENTATION.value: presentation, ParserType.PICTURE.value: picture, ParserType.AUDIO.value: audio, ParserType.EMAIL.value: email}
|
||||
parser_config = {"chunk_token_num": 16096, "delimiter": "\n!?;。;!?", "layout_recognize": "Plain Text"}
|
||||
exe = ThreadPoolExecutor(max_workers=12)
|
||||
threads = []
|
||||
for file in file_objs:
|
||||
kwargs = {"lang": "English", "callback": dummy, "parser_config": parser_config, "from_page": 0, "to_page": 100000, "tenant_id": user_id}
|
||||
filetype = filename_type(file.filename)
|
||||
blob = file.read()
|
||||
threads.append(exe.submit(FACTORY.get(FileService.get_parser(filetype, file.filename, ""), naive).chunk, file.filename, blob, **kwargs))
|
||||
|
||||
res = []
|
||||
for th in threads:
|
||||
res.append("\n".join([ck["content_with_weight"] for ck in th.result()]))
|
||||
|
||||
return "\n\n".join(res)
|
||||
kwargs = {"lang": "English", "callback": dummy, "parser_config": parser_config, "from_page": 0, "to_page": 100000, "tenant_id": current_user.id if current_user else tenant_id}
|
||||
file_type = filename_type(filename)
|
||||
if img_base64 and file_type == FileType.VISUAL.value:
|
||||
return GptV4.image2base64(blob)
|
||||
cks = FACTORY.get(FileService.get_parser(filename_type(filename), filename, ""), naive).chunk(filename, blob, **kwargs)
|
||||
return "\n".join([ck["content_with_weight"] for ck in cks])
|
||||
|
||||
@staticmethod
|
||||
def get_parser(doc_type, filename, default):
|
||||
@ -495,3 +495,14 @@ class FileService(CommonService):
|
||||
if re.search(r"\.(eml)$", filename):
|
||||
return ParserType.EMAIL.value
|
||||
return default
|
||||
|
||||
@staticmethod
|
||||
def get_blob(user_id, location):
|
||||
bname = f"{user_id}-downloads"
|
||||
return STORAGE_IMPL.get(bname, location)
|
||||
|
||||
@staticmethod
|
||||
def put_blob(user_id, location, blob):
|
||||
bname = f"{user_id}-downloads"
|
||||
return STORAGE_IMPL.put(bname, location, blob)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user