add function: upload and parse (#1889)

### What problem does this PR solve?

#1880
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2024-08-09 16:20:02 +08:00
committed by GitHub
parent 6529c764c9
commit e3cf14a3c9
8 changed files with 255 additions and 89 deletions

View File

@ -104,7 +104,11 @@ def chat(dialog, messages, stream=True, **kwargs):
is_kg = all([kb.parser_id == ParserType.KG for kb in kbs])
retr = retrievaler if not is_kg else kg_retrievaler
questions = [m["content"] for m in messages if m["role"] == "user"]
questions = [m["content"] for m in messages if m["role"] == "user"][-3:]
attachments = kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None
if "doc_ids" in messages[-1]:
attachments = messages[-1]["doc_ids"]
embd_mdl = LLMBundle(dialog.tenant_id, LLMType.EMBEDDING, embd_nms[0])
if llm_id2llm_type(dialog.llm_id) == "image2text":
chat_mdl = LLMBundle(dialog.tenant_id, LLMType.IMAGE2TEXT, dialog.llm_id)
@ -144,7 +148,7 @@ def chat(dialog, messages, stream=True, **kwargs):
kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
dialog.similarity_threshold,
dialog.vector_similarity_weight,
doc_ids=kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None,
doc_ids=attachments,
top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
#self-rag
@ -153,7 +157,7 @@ def chat(dialog, messages, stream=True, **kwargs):
kbinfos = retr.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
dialog.similarity_threshold,
dialog.vector_similarity_weight,
doc_ids=kwargs["doc_ids"].split(",") if "doc_ids" in kwargs else None,
doc_ids=attachments,
top=dialog.top_k, aggs=False, rerank_mdl=rerank_mdl)
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]

View File

@ -26,7 +26,7 @@ from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.minio_conn import MINIO
from rag.nlp import search
from api.db import FileType, TaskStatus
from api.db import FileType, TaskStatus, ParserType
from api.db.db_models import DB, Knowledgebase, Tenant, Task
from api.db.db_models import Document
from api.db.services.common_service import CommonService

View File

@ -13,16 +13,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
from flask_login import current_user
from peewee import fn
from api.db import FileType, KNOWLEDGEBASE_FOLDER_NAME, FileSource
from api.db import FileType, KNOWLEDGEBASE_FOLDER_NAME, FileSource, ParserType
from api.db.db_models import DB, File2Document, Knowledgebase
from api.db.db_models import File, Document
from api.db.services import duplicate_name
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.utils import get_uuid
from api.utils.file_utils import filename_type, thumbnail
from rag.utils.minio_conn import MINIO
class FileService(CommonService):
@ -318,4 +323,60 @@ class FileService(CommonService):
cls.filter_update((cls.model.id << file_ids, ), { 'parent_id': folder_id })
except Exception as e:
print(e)
raise RuntimeError("Database error (File move)!")
raise RuntimeError("Database error (File move)!")
@classmethod
@DB.connection_context()
def upload_document(cls, kb, file_objs):
    """Store uploaded files in MinIO and register them as documents of ``kb``.

    For each uploaded file the method picks a non-clashing document name
    and storage location, writes the bytes to MinIO under the knowledge
    base id, inserts a document record and links it into the knowledge
    base's folder.

    Args:
        kb: Knowledge base record; ``id``, ``name``, ``tenant_id``,
            ``parser_id`` and ``parser_config`` are read from it.
        file_objs: Iterable of uploaded file objects exposing ``filename``
            and ``read()``.

    Returns:
        A tuple ``(err, files)``. ``err`` is a list of
        ``"<filename>: <message>"`` strings, one per file that failed.
        ``files`` is a list of ``(doc, blob)`` tuples for the files that
        were stored, where ``doc`` is the inserted document dict and
        ``blob`` the raw content returned by ``file.read()``.

    Failures while handling an individual file are collected in ``err``
    and do not abort the remaining uploads. Exceptions raised while
    preparing the folders (before the loop) propagate to the caller.
    """
    # Imported locally: the quota check below reads an environment
    # variable, and the module-level imports do not bring ``os`` into scope.
    import os

    root_folder = cls.get_root_folder(current_user.id)
    pf_id = root_folder["id"]
    cls.init_knowledgebase_docs(pf_id, current_user.id)
    kb_root_folder = cls.get_kb_folder(current_user.id)
    kb_folder = cls.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])

    err, files = [], []
    for file in file_objs:
        try:
            # A limit of 0 (the default when the variable is unset)
            # disables the per-tenant document quota.
            max_file_num = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
            if max_file_num > 0 and DocumentService.get_doc_count(kb.tenant_id) >= max_file_num:
                raise RuntimeError("Exceed the maximum file number of a free user!")

            filename = duplicate_name(
                DocumentService.query,
                name=file.filename,
                kb_id=kb.id)
            filetype = filename_type(filename)
            if filetype == FileType.OTHER.value:
                raise RuntimeError("This type of file has not been supported yet!")

            # Append underscores until the object name is unused in the
            # knowledge base's bucket.
            location = filename
            while MINIO.obj_exist(kb.id, location):
                location += "_"
            blob = file.read()
            MINIO.put(kb.id, location, blob)

            doc = {
                "id": get_uuid(),
                "kb_id": kb.id,
                "parser_id": kb.parser_id,
                "parser_config": kb.parser_config,
                "created_by": current_user.id,
                "type": filetype,
                "name": filename,
                "location": location,
                "size": len(blob),
                "thumbnail": thumbnail(filename, blob)
            }
            # Override the knowledge base's default parser for file kinds
            # that need a dedicated one. The checks run in order, so a
            # later match wins. The type is compared against ``.value``,
            # matching the ``FileType.OTHER.value`` check above.
            if doc["type"] == FileType.VISUAL.value:
                doc["parser_id"] = ParserType.PICTURE.value
            if doc["type"] == FileType.AURAL.value:
                doc["parser_id"] = ParserType.AUDIO.value
            if re.search(r"\.(ppt|pptx|pages)$", filename):
                doc["parser_id"] = ParserType.PRESENTATION.value

            DocumentService.insert(doc)
            cls.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
            files.append((doc, blob))
        except Exception as e:
            # Best effort per file: record the failure and continue with
            # the next upload. The f-string keeps the handler itself from
            # raising when ``file.filename`` is not a string.
            err.append(f"{file.filename}: {e}")
    return err, files