Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with docStoreConn (an interface sketch follows below)
- Renamed deleteByQuery to delete
- Renamed bulk to upsertBulk
- Added getHighlight and getAggregation
- Fixed KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py
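
The diff swaps every direct `ELASTICSEARCH` call for a backend-neutral `docStoreConn` object. Below is a rough sketch of the interface implied by the call sites in this commit; the method names come from the diff, while the exact signatures and the `DocStoreConnection` base-class name are assumptions, not the committed code:

```python
from abc import ABC, abstractmethod


class DocStoreConnection(ABC):
    """Hypothetical base class inferred from call sites in this commit;
    the concrete implementations are ESConnection and InfinityConnection."""

    @abstractmethod
    def health(self) -> dict:
        """Backend health info ("type", "status", ...)."""

    @abstractmethod
    def indexExist(self, indexName: str, knowledgebaseId: str) -> bool: ...

    @abstractmethod
    def createIdx(self, indexName: str, knowledgebaseId: str, vectorSize: int): ...

    @abstractmethod
    def deleteIdx(self, indexName: str, knowledgebaseId: str): ...

    @abstractmethod
    def get(self, chunkId: str, indexName: str, knowledgebaseIds: list[str]):
        """Return one chunk as a dict, or None when it does not exist."""

    @abstractmethod
    def insert(self, documents: list[dict], indexName: str, knowledgebaseId: str):
        """Insert (or overwrite) a batch of chunks."""

    @abstractmethod
    def update(self, condition: dict, newValue: dict,
               indexName: str, knowledgebaseId: str) -> bool:
        """Update all chunks matching condition, e.g. {"id": [...]}."""

    @abstractmethod
    def delete(self, condition: dict, indexName: str, knowledgebaseId: str) -> int:
        """Delete all chunks matching condition; returns the number deleted."""
```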


### Type of change

- [x] Refactoring
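
One behavioral change worth flagging before the diff: chunk positions move from a flat, tab-separated string in the `position_int` field (floats in groups of five) to a JSON-encoded list of five-element lists in `position_list`. A minimal illustrative converter between the two encodings (this function is not part of the commit):

```python
import json


def position_int_to_list(position_int: str) -> str:
    """Old encoding: tab-separated floats in groups of five coordinates.
    New encoding: JSON string holding a list of five-element lists."""
    parts = position_int.split("\t") if position_int else []
    assert len(parts) % 5 == 0
    quintuples = [[float(v) for v in parts[i:i + 5]]
                  for i in range(0, len(parts), 5)]
    return json.dumps(quintuples)


# "1\t10\t20\t30\t40" -> "[[1.0, 10.0, 20.0, 30.0, 40.0]]"
```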
Zhichang Yu
2024-11-12 14:59:41 +08:00
committed by GitHub
parent 00b6000b76
commit f4c52371ab
42 changed files with 2647 additions and 1878 deletions

View File

@ -529,13 +529,14 @@ def list_chunks():
return get_json_result(
data=False, message="Can't find doc_name or doc_id"
)
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
res = retrievaler.chunk_list(doc_id=doc_id, tenant_id=tenant_id)
res = retrievaler.chunk_list(doc_id, tenant_id, kb_ids)
res = [
{
"content": res_item["content_with_weight"],
"doc_name": res_item["docnm_kwd"],
"img_id": res_item["img_id"]
"image_id": res_item["img_id"]
} for res_item in res
]

View File

@ -18,12 +18,10 @@ import json
from flask import request
from flask_login import login_required, current_user
from elasticsearch_dsl import Q
from api.db.services.dialog_service import keyword_extraction
from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import search, rag_tokenizer
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils import rmSpace
from api.db import LLMType, ParserType
from api.db.services.knowledgebase_service import KnowledgebaseService
@ -31,12 +29,11 @@ from api.db.services.llm_service import LLMBundle
from api.db.services.user_service import UserTenantService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.db.services.document_service import DocumentService
from api.settings import RetCode, retrievaler, kg_retrievaler
from api.settings import RetCode, retrievaler, kg_retrievaler, docStoreConn
from api.utils.api_utils import get_json_result
import hashlib
import re
@manager.route('/list', methods=['POST'])
@login_required
@validate_request("doc_id")
@ -53,12 +50,13 @@ def list_chunk():
e, doc = DocumentService.get_by_id(doc_id)
if not e:
return get_data_error_result(message="Document not found!")
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
query = {
"doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
}
if "available_int" in req:
query["available_int"] = int(req["available_int"])
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
sres = retrievaler.search(query, search.index_name(tenant_id), kb_ids, highlight=True)
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
for id in sres.ids:
d = {
@ -69,16 +67,12 @@ def list_chunk():
"doc_id": sres.field[id]["doc_id"],
"docnm_kwd": sres.field[id]["docnm_kwd"],
"important_kwd": sres.field[id].get("important_kwd", []),
"img_id": sres.field[id].get("img_id", ""),
"image_id": sres.field[id].get("img_id", ""),
"available_int": sres.field[id].get("available_int", 1),
"positions": sres.field[id].get("position_int", "").split("\t")
"positions": json.loads(sres.field[id].get("position_list", "[]")),
}
if len(d["positions"]) % 5 == 0:
poss = []
for i in range(0, len(d["positions"]), 5):
poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
float(d["positions"][i + 3]), float(d["positions"][i + 4])])
d["positions"] = poss
assert isinstance(d["positions"], list)
assert len(d["positions"])==0 or (isinstance(d["positions"][0], list) and len(d["positions"][0]) == 5)
res["chunks"].append(d)
return get_json_result(data=res)
except Exception as e:
@ -96,22 +90,20 @@ def get():
tenants = UserTenantService.query(user_id=current_user.id)
if not tenants:
return get_data_error_result(message="Tenant not found!")
res = ELASTICSEARCH.get(
chunk_id, search.index_name(
tenants[0].tenant_id))
if not res.get("found"):
tenant_id = tenants[0].tenant_id
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
chunk = docStoreConn.get(chunk_id, search.index_name(tenant_id), kb_ids)
if chunk is None:
return server_error_response("Chunk not found")
id = res["_id"]
res = res["_source"]
res["chunk_id"] = id
k = []
for n in res.keys():
for n in chunk.keys():
if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
k.append(n)
for n in k:
del res[n]
del chunk[n]
return get_json_result(data=res)
return get_json_result(data=chunk)
except Exception as e:
if str(e).find("NotFoundError") >= 0:
return get_json_result(data=False, message='Chunk not found!',
@ -162,7 +154,7 @@ def set():
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
@ -174,11 +166,11 @@ def set():
def switch():
req = request.json
try:
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
if not ELASTICSEARCH.upsert([{"id": i, "available_int": int(req["available_int"])} for i in req["chunk_ids"]],
search.index_name(tenant_id)):
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if not docStoreConn.update({"id": req["chunk_ids"]}, {"available_int": int(req["available_int"])},
search.index_name(doc.tenant_id), doc.kb_id):
return get_data_error_result(message="Index updating failure")
return get_json_result(data=True)
except Exception as e:
@ -191,12 +183,11 @@ def switch():
def rm():
req = request.json
try:
if not ELASTICSEARCH.deleteByQuery(
Q("ids", values=req["chunk_ids"]), search.index_name(current_user.id)):
return get_data_error_result(message="Index updating failure")
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
if not docStoreConn.delete({"id": req["chunk_ids"]}, search.index_name(current_user.id), doc.kb_id):
return get_data_error_result(message="Index updating failure")
deleted_chunk_ids = req["chunk_ids"]
chunk_number = len(deleted_chunk_ids)
DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
@ -239,7 +230,7 @@ def create():
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
DocumentService.increment_chunk_num(
doc.id, doc.kb_id, c, 1, 0)
@ -256,8 +247,9 @@ def retrieval_test():
page = int(req.get("page", 1))
size = int(req.get("size", 30))
question = req["question"]
kb_id = req["kb_id"]
if isinstance(kb_id, str): kb_id = [kb_id]
kb_ids = req["kb_id"]
if isinstance(kb_ids, str):
kb_ids = [kb_ids]
doc_ids = req.get("doc_ids", [])
similarity_threshold = float(req.get("similarity_threshold", 0.0))
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
@ -265,17 +257,17 @@ def retrieval_test():
try:
tenants = UserTenantService.query(user_id=current_user.id)
for kid in kb_id:
for kb_id in kb_ids:
for tenant in tenants:
if KnowledgebaseService.query(
tenant_id=tenant.tenant_id, id=kid):
tenant_id=tenant.tenant_id, id=kb_id):
break
else:
return get_json_result(
data=False, message='Only owner of knowledgebase authorized for this operation.',
code=RetCode.OPERATING_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id[0])
e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
if not e:
return get_data_error_result(message="Knowledgebase not found!")
@ -290,7 +282,7 @@ def retrieval_test():
question += keyword_extraction(chat_mdl, question)
retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_id, page, size,
ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_ids, page, size,
similarity_threshold, vector_similarity_weight, top,
doc_ids, rerank_mdl=rerank_mdl, highlight=req.get("highlight"))
for c in ranks["chunks"]:
@ -309,12 +301,16 @@ def retrieval_test():
@login_required
def knowledge_graph():
doc_id = request.args["doc_id"]
e, doc = DocumentService.get_by_id(doc_id)
if not e:
return get_data_error_result(message="Document not found!")
tenant_id = DocumentService.get_tenant_id(doc_id)
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
req = {
"doc_ids":[doc_id],
"knowledge_graph_kwd": ["graph", "mind_map"]
}
tenant_id = DocumentService.get_tenant_id(doc_id)
sres = retrievaler.search(req, search.index_name(tenant_id))
sres = retrievaler.search(req, search.index_name(tenant_id), kb_ids)
obj = {"graph": {}, "mind_map": {}}
for id in sres.ids[:2]:
ty = sres.field[id]["knowledge_graph_kwd"]

View File

@ -17,7 +17,6 @@ import pathlib
import re
import flask
from elasticsearch_dsl import Q
from flask import request
from flask_login import login_required, current_user
@ -27,14 +26,13 @@ from api.db.services.file_service import FileService
from api.db.services.task_service import TaskService, queue_tasks
from api.db.services.user_service import UserTenantService
from rag.nlp import search
from rag.utils.es_conn import ELASTICSEARCH
from api.db.services import duplicate_name
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from api.db import FileType, TaskStatus, ParserType, FileSource
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.settings import RetCode
from api.settings import RetCode, docStoreConn
from api.utils.api_utils import get_json_result
from rag.utils.storage_factory import STORAGE_IMPL
from api.utils.file_utils import filename_type, thumbnail
@ -275,18 +273,8 @@ def change_status():
return get_data_error_result(
message="Database error (Document update)!")
if str(req["status"]) == "0":
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
scripts="ctx._source.available_int=0;",
idxnm=search.index_name(
kb.tenant_id)
)
else:
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
scripts="ctx._source.available_int=1;",
idxnm=search.index_name(
kb.tenant_id)
)
status = int(req["status"])
docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
@ -365,8 +353,11 @@ def run():
tenant_id = DocumentService.get_tenant_id(id)
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
e, doc = DocumentService.get_by_id(id)
if not e:
return get_data_error_result(message="Document not found!")
if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)
if str(req["run"]) == TaskStatus.RUNNING.value:
TaskService.filter_delete([Task.doc_id == id])
@ -490,8 +481,8 @@ def change_parser():
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
return get_json_result(data=True)
except Exception as e:

View File

@ -28,6 +28,8 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.db_models import File
from api.settings import RetCode
from api.utils.api_utils import get_json_result
from api.settings import docStoreConn
from rag.nlp import search
@manager.route('/create', methods=['post'])
@ -166,6 +168,9 @@ def rm():
if not KnowledgebaseService.delete_by_id(req["kb_id"]):
return get_data_error_result(
message="Database error (Knowledgebase removal)!")
tenants = UserTenantService.query(user_id=current_user.id)
for tenant in tenants:
docStoreConn.deleteIdx(search.index_name(tenant.tenant_id), req["kb_id"])
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)

View File

@ -30,7 +30,6 @@ from api.db.services.task_service import TaskService, queue_tasks
from api.utils.api_utils import server_error_response
from api.utils.api_utils import get_result, get_error_data_result
from io import BytesIO
from elasticsearch_dsl import Q
from flask import request, send_file
from api.db import FileSource, TaskStatus, FileType
from api.db.db_models import File
@ -42,7 +41,7 @@ from api.settings import RetCode, retrievaler
from api.utils.api_utils import construct_json_result, get_parser_config
from rag.nlp import search
from rag.utils import rmSpace
from rag.utils.es_conn import ELASTICSEARCH
from api.settings import docStoreConn
from rag.utils.storage_factory import STORAGE_IMPL
import os
@ -293,9 +292,7 @@ def update_doc(tenant_id, dataset_id, document_id):
)
if not e:
return get_error_data_result(message="Document not found!")
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)
)
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), dataset_id)
return get_result()
@ -647,9 +644,7 @@ def parse(tenant_id, dataset_id):
info["chunk_num"] = 0
info["token_num"] = 0
DocumentService.update_by_id(id, info)
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
)
docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), dataset_id)
TaskService.filter_delete([Task.doc_id == id])
e, doc = DocumentService.get_by_id(id)
doc = doc.to_dict()
@ -713,9 +708,7 @@ def stop_parsing(tenant_id, dataset_id):
)
info = {"run": "2", "progress": 0, "chunk_num": 0}
DocumentService.update_by_id(id, info)
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
)
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), dataset_id)
return get_result()
@ -812,7 +805,6 @@ def list_chunks(tenant_id, dataset_id, document_id):
"question": question,
"sort": True,
}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
key_mapping = {
"chunk_num": "chunk_count",
"kb_id": "dataset_id",
@ -833,51 +825,56 @@ def list_chunks(tenant_id, dataset_id, document_id):
renamed_doc[new_key] = value
if key == "run":
renamed_doc["run"] = run_mapping.get(str(value))
res = {"total": sres.total, "chunks": [], "doc": renamed_doc}
origin_chunks = []
sign = 0
for id in sres.ids:
d = {
"chunk_id": id,
"content_with_weight": (
rmSpace(sres.highlight[id])
if question and id in sres.highlight
else sres.field[id].get("content_with_weight", "")
),
"doc_id": sres.field[id]["doc_id"],
"docnm_kwd": sres.field[id]["docnm_kwd"],
"important_kwd": sres.field[id].get("important_kwd", []),
"img_id": sres.field[id].get("img_id", ""),
"available_int": sres.field[id].get("available_int", 1),
"positions": sres.field[id].get("position_int", "").split("\t"),
}
if len(d["positions"]) % 5 == 0:
poss = []
for i in range(0, len(d["positions"]), 5):
poss.append(
[
float(d["positions"][i]),
float(d["positions"][i + 1]),
float(d["positions"][i + 2]),
float(d["positions"][i + 3]),
float(d["positions"][i + 4]),
]
)
d["positions"] = poss
origin_chunks.append(d)
res = {"total": 0, "chunks": [], "doc": renamed_doc}
origin_chunks = []
if docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
sres = retrievaler.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None, highlight=True)
res["total"] = sres.total
sign = 0
for id in sres.ids:
d = {
"id": id,
"content_with_weight": (
rmSpace(sres.highlight[id])
if question and id in sres.highlight
else sres.field[id].get("content_with_weight", "")
),
"doc_id": sres.field[id]["doc_id"],
"docnm_kwd": sres.field[id]["docnm_kwd"],
"important_kwd": sres.field[id].get("important_kwd", []),
"img_id": sres.field[id].get("img_id", ""),
"available_int": sres.field[id].get("available_int", 1),
"positions": sres.field[id].get("position_int", "").split("\t"),
}
if len(d["positions"]) % 5 == 0:
poss = []
for i in range(0, len(d["positions"]), 5):
poss.append(
[
float(d["positions"][i]),
float(d["positions"][i + 1]),
float(d["positions"][i + 2]),
float(d["positions"][i + 3]),
float(d["positions"][i + 4]),
]
)
d["positions"] = poss
origin_chunks.append(d)
if req.get("id"):
if req.get("id") == id:
origin_chunks.clear()
origin_chunks.append(d)
sign = 1
break
if req.get("id"):
if req.get("id") == id:
origin_chunks.clear()
origin_chunks.append(d)
sign = 1
break
if req.get("id"):
if sign == 0:
return get_error_data_result(f"Can't find this chunk {req.get('id')}")
if sign == 0:
return get_error_data_result(f"Can't find this chunk {req.get('id')}")
for chunk in origin_chunks:
key_mapping = {
"chunk_id": "id",
"id": "id",
"content_with_weight": "content",
"doc_id": "document_id",
"important_kwd": "important_keywords",
@ -996,9 +993,9 @@ def add_chunk(tenant_id, dataset_id, document_id):
)
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
d["kb_id"] = [doc.kb_id]
d["kb_id"] = dataset_id
d["docnm_kwd"] = doc.name
d["doc_id"] = doc.id
d["doc_id"] = document_id
embd_id = DocumentService.get_embd_id(document_id)
embd_mdl = TenantLLMService.model_instance(
tenant_id, LLMType.EMBEDDING.value, embd_id
@ -1006,14 +1003,12 @@ def add_chunk(tenant_id, dataset_id, document_id):
v, c = embd_mdl.encode([doc.name, req["content"]])
v = 0.1 * v[0] + 0.9 * v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.insert([d], search.index_name(tenant_id), dataset_id)
DocumentService.increment_chunk_num(doc.id, doc.kb_id, c, 1, 0)
d["chunk_id"] = chunk_id
d["kb_id"] = doc.kb_id
# rename keys
key_mapping = {
"chunk_id": "id",
"id": "id",
"content_with_weight": "content",
"doc_id": "document_id",
"important_kwd": "important_keywords",
@ -1079,36 +1074,16 @@ def rm_chunk(tenant_id, dataset_id, document_id):
"""
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
if not doc:
return get_error_data_result(
message=f"You don't own the document {document_id}."
)
doc = doc[0]
req = request.json
if not req.get("chunk_ids"):
return get_error_data_result("`chunk_ids` is required")
query = {"doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
if not req:
chunk_ids = None
else:
chunk_ids = req.get("chunk_ids")
if not chunk_ids:
chunk_list = sres.ids
else:
chunk_list = chunk_ids
for chunk_id in chunk_list:
if chunk_id not in sres.ids:
return get_error_data_result(f"Chunk {chunk_id} not found")
if not ELASTICSEARCH.deleteByQuery(
Q("ids", values=chunk_list), search.index_name(tenant_id)
):
return get_error_data_result(message="Index updating failure")
deleted_chunk_ids = chunk_list
chunk_number = len(deleted_chunk_ids)
DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
return get_result()
condition = {"doc_id": document_id}
if "chunk_ids" in req:
condition["id"] = req["chunk_ids"]
chunk_number = docStoreConn.delete(condition, search.index_name(tenant_id), dataset_id)
if chunk_number != 0:
DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0)
if "chunk_ids" in req and chunk_number != len(req["chunk_ids"]):
return get_error_data_result(message=f"rm_chunk deleted chunks {chunk_number}, expect {len(req["chunk_ids"])}")
return get_result(message=f"deleted {chunk_number} chunks")
@manager.route(
@ -1168,9 +1143,8 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
schema:
type: object
"""
try:
res = ELASTICSEARCH.get(chunk_id, search.index_name(tenant_id))
except Exception:
chunk = docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id])
if chunk is None:
return get_error_data_result(f"Can't find this chunk {chunk_id}")
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
@ -1180,19 +1154,12 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
message=f"You don't own the document {document_id}."
)
doc = doc[0]
query = {
"doc_ids": [document_id],
"page": 1,
"size": 1024,
"question": "",
"sort": True,
}
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
if chunk_id not in sres.ids:
return get_error_data_result(f"You don't own the chunk {chunk_id}")
req = request.json
content = res["_source"].get("content_with_weight")
d = {"id": chunk_id, "content_with_weight": req.get("content", content)}
if "content" in req:
content = req["content"]
else:
content = chunk.get("content_with_weight", "")
d = {"id": chunk_id, "content_with_weight": content}
d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
if "important_keywords" in req:
@ -1220,7 +1187,7 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
d["q_%d_vec" % len(v)] = v.tolist()
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id)
return get_result()

View File

@ -31,7 +31,7 @@ from api.utils.api_utils import (
generate_confirmation_token,
)
from api.versions import get_rag_version
from rag.utils.es_conn import ELASTICSEARCH
from api.settings import docStoreConn
from rag.utils.storage_factory import STORAGE_IMPL, STORAGE_IMPL_TYPE
from timeit import default_timer as timer
@ -98,10 +98,11 @@ def status():
res = {}
st = timer()
try:
res["es"] = ELASTICSEARCH.health()
res["es"]["elapsed"] = "{:.1f}".format((timer() - st) * 1000.0)
res["doc_store"] = docStoreConn.health()
res["doc_store"]["elapsed"] = "{:.1f}".format((timer() - st) * 1000.0)
except Exception as e:
res["es"] = {
res["doc_store"] = {
"type": "unknown",
"status": "red",
"elapsed": "{:.1f}".format((timer() - st) * 1000.0),
"error": str(e),

View File

@ -470,7 +470,7 @@ class User(DataBaseModel, UserMixin):
status = CharField(
max_length=1,
null=True,
help_text="is it validate(0: wasted1: validate)",
help_text="is it validate(0: wasted, 1: validate)",
default="1",
index=True)
is_superuser = BooleanField(null=True, help_text="is root", default=False, index=True)
@ -525,7 +525,7 @@ class Tenant(DataBaseModel):
status = CharField(
max_length=1,
null=True,
help_text="is it validate(0: wasted1: validate)",
help_text="is it validate(0: wasted, 1: validate)",
default="1",
index=True)
@ -542,7 +542,7 @@ class UserTenant(DataBaseModel):
status = CharField(
max_length=1,
null=True,
help_text="is it validate(0: wasted1: validate)",
help_text="is it validate(0: wasted, 1: validate)",
default="1",
index=True)
@ -559,7 +559,7 @@ class InvitationCode(DataBaseModel):
status = CharField(
max_length=1,
null=True,
help_text="is it validate(0: wasted1: validate)",
help_text="is it validate(0: wasted, 1: validate)",
default="1",
index=True)
@ -582,7 +582,7 @@ class LLMFactories(DataBaseModel):
status = CharField(
max_length=1,
null=True,
help_text="is it validate(0: wasted1: validate)",
help_text="is it validate(0: wasted, 1: validate)",
default="1",
index=True)
@ -616,7 +616,7 @@ class LLM(DataBaseModel):
status = CharField(
max_length=1,
null=True,
help_text="is it validate(0: wasted1: validate)",
help_text="is it validate(0: wasted, 1: validate)",
default="1",
index=True)
@ -703,7 +703,7 @@ class Knowledgebase(DataBaseModel):
status = CharField(
max_length=1,
null=True,
help_text="is it validate(0: wasted1: validate)",
help_text="is it validate(0: wasted, 1: validate)",
default="1",
index=True)
@ -767,7 +767,7 @@ class Document(DataBaseModel):
status = CharField(
max_length=1,
null=True,
help_text="is it validate(0: wasted1: validate)",
help_text="is it validate(0: wasted, 1: validate)",
default="1",
index=True)
@ -904,7 +904,7 @@ class Dialog(DataBaseModel):
status = CharField(
max_length=1,
null=True,
help_text="is it validate(0: wasted1: validate)",
help_text="is it validate(0: wasted, 1: validate)",
default="1",
index=True)
@ -987,7 +987,7 @@ def migrate_db():
help_text="where dose this document come from",
index=True))
)
except Exception as e:
except Exception:
pass
try:
migrate(
@ -996,7 +996,7 @@ def migrate_db():
help_text="default rerank model ID"))
)
except Exception as e:
except Exception:
pass
try:
migrate(
@ -1004,59 +1004,59 @@ def migrate_db():
help_text="default rerank model ID"))
)
except Exception as e:
except Exception:
pass
try:
migrate(
migrator.add_column('dialog', 'top_k', IntegerField(default=1024))
)
except Exception as e:
except Exception:
pass
try:
migrate(
migrator.alter_column_type('tenant_llm', 'api_key',
CharField(max_length=1024, null=True, help_text="API KEY", index=True))
)
except Exception as e:
except Exception:
pass
try:
migrate(
migrator.add_column('api_token', 'source',
CharField(max_length=16, null=True, help_text="none|agent|dialog", index=True))
)
except Exception as e:
except Exception:
pass
try:
migrate(
migrator.add_column("tenant","tts_id",
CharField(max_length=256,null=True,help_text="default tts model ID",index=True))
)
except Exception as e:
except Exception:
pass
try:
migrate(
migrator.add_column('api_4_conversation', 'source',
CharField(max_length=16, null=True, help_text="none|agent|dialog", index=True))
)
except Exception as e:
except Exception:
pass
try:
DB.execute_sql('ALTER TABLE llm DROP PRIMARY KEY;')
DB.execute_sql('ALTER TABLE llm ADD PRIMARY KEY (llm_name,fid);')
except Exception as e:
except Exception:
pass
try:
migrate(
migrator.add_column('task', 'retry_count', IntegerField(default=0))
)
except Exception as e:
except Exception:
pass
try:
migrate(
migrator.alter_column_type('api_token', 'dialog_id',
CharField(max_length=32, null=True, index=True))
)
except Exception as e:
except Exception:
pass

View File

@ -15,7 +15,6 @@
#
import hashlib
import json
import os
import random
import re
import traceback
@ -24,16 +23,13 @@ from copy import deepcopy
from datetime import datetime
from io import BytesIO
from elasticsearch_dsl import Q
from peewee import fn
from api.db.db_utils import bulk_insert_into_db
from api.settings import stat_logger
from api.settings import stat_logger, docStoreConn
from api.utils import current_timestamp, get_format_time, get_uuid
from api.utils.file_utils import get_project_base_directory
from graphrag.mind_map_extractor import MindMapExtractor
from rag.settings import SVR_QUEUE_NAME
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.storage_factory import STORAGE_IMPL
from rag.nlp import search, rag_tokenizer
@ -112,8 +108,7 @@ class DocumentService(CommonService):
@classmethod
@DB.connection_context()
def remove_document(cls, doc, tenant_id):
ELASTICSEARCH.deleteByQuery(
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
cls.clear_chunk_num(doc.id)
return cls.delete_by_id(doc.id)
@ -225,6 +220,15 @@ class DocumentService(CommonService):
return
return docs[0]["tenant_id"]
@classmethod
@DB.connection_context()
def get_knowledgebase_id(cls, doc_id):
docs = cls.model.select(cls.model.kb_id).where(cls.model.id == doc_id)
docs = docs.dicts()
if not docs:
return
return docs[0]["kb_id"]
@classmethod
@DB.connection_context()
def get_tenant_id_by_name(cls, name):
@ -438,11 +442,6 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
if not e:
raise LookupError("Can't find this knowledgebase!")
idxnm = search.index_name(kb.tenant_id)
if not ELASTICSEARCH.indexExist(idxnm):
ELASTICSEARCH.createIdx(idxnm, json.load(
open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING, llm_name=kb.embd_id, lang=kb.language)
err, files = FileService.upload_document(kb, file_objs, user_id)
@ -486,7 +485,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
md5 = hashlib.md5()
md5.update((ck["content_with_weight"] +
str(d["doc_id"])).encode("utf-8"))
d["_id"] = md5.hexdigest()
d["id"] = md5.hexdigest()
d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.now().timestamp()
if not d.get("image"):
@ -499,8 +498,8 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
else:
d["image"].save(output_buffer, format='JPEG')
STORAGE_IMPL.put(kb.id, d["_id"], output_buffer.getvalue())
d["img_id"] = "{}-{}".format(kb.id, d["_id"])
STORAGE_IMPL.put(kb.id, d["id"], output_buffer.getvalue())
d["img_id"] = "{}-{}".format(kb.id, d["id"])
del d["image"]
docs.append(d)
@ -520,6 +519,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
token_counts[doc_id] += c
return vects
idxnm = search.index_name(kb.tenant_id)
try_create_idx = True
_, tenant = TenantService.get_by_id(kb.tenant_id)
llm_bdl = LLMBundle(kb.tenant_id, LLMType.CHAT, tenant.llm_id)
for doc_id in docids:
@ -550,7 +552,11 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
v = vects[i]
d["q_%d_vec" % len(v)] = v
for b in range(0, len(cks), es_bulk_size):
ELASTICSEARCH.bulk(cks[b:b + es_bulk_size], idxnm)
if try_create_idx:
if not docStoreConn.indexExist(idxnm, kb_id):
docStoreConn.createIdx(idxnm, kb_id, len(vects[0]))
try_create_idx = False
docStoreConn.insert(cks[b:b + es_bulk_size], idxnm, kb_id)
DocumentService.increment_chunk_num(
doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)
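
A review note on the hunk above: `createIdx` now takes the embedding dimension, presumably because Infinity tables use fixed-width vector columns rather than ES-style dynamic mappings, so index creation has to wait until the first batch of vectors is known. Condensed, the pattern looks like this (a sketch using names from the hunk, not committed code):

```python
def insert_with_lazy_index(docStoreConn, cks, idxnm, kb_id, vector_size, bulk_size=64):
    """Create the index on the first batch, once the embedding
    dimension (vector_size) is known; then bulk-insert."""
    created = False
    for b in range(0, len(cks), bulk_size):
        if not created:
            if not docStoreConn.indexExist(idxnm, kb_id):
                docStoreConn.createIdx(idxnm, kb_id, vector_size)
            created = True
        docStoreConn.insert(cks[b:b + bulk_size], idxnm, kb_id)
```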

View File

@ -66,6 +66,16 @@ class KnowledgebaseService(CommonService):
return list(kbs.dicts())
@classmethod
@DB.connection_context()
def get_kb_ids(cls, tenant_id):
fields = [
cls.model.id,
]
kbs = cls.model.select(*fields).where(cls.model.tenant_id == tenant_id)
kb_ids = [kb["id"] for kb in kbs]
return kb_ids
@classmethod
@DB.connection_context()
def get_detail(cls, kb_id):

View File

@ -18,6 +18,8 @@ from datetime import date
from enum import IntEnum, Enum
from api.utils.file_utils import get_project_base_directory
from api.utils.log_utils import LoggerFactory, getLogger
import rag.utils.es_conn
import rag.utils.infinity_conn
# Logger
LoggerFactory.set_directory(
@ -33,7 +35,7 @@ access_logger = getLogger("access")
database_logger = getLogger("database")
chat_logger = getLogger("chat")
from rag.utils.es_conn import ELASTICSEARCH
import rag.utils
from rag.nlp import search
from graphrag import search as kg_search
from api.utils import get_base_config, decrypt_database_config
@ -206,8 +208,12 @@ AUTHENTICATION_DEFAULT_TIMEOUT = 7 * 24 * 60 * 60 # s
PRIVILEGE_COMMAND_WHITELIST = []
CHECK_NODES_IDENTITY = False
retrievaler = search.Dealer(ELASTICSEARCH)
kg_retrievaler = kg_search.KGSearch(ELASTICSEARCH)
if 'username' in get_base_config("es", {}):
docStoreConn = rag.utils.es_conn.ESConnection()
else:
docStoreConn = rag.utils.infinity_conn.InfinityConnection()
retrievaler = search.Dealer(docStoreConn)
kg_retrievaler = kg_search.KGSearch(docStoreConn)
class CustomEnum(Enum):
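
Deployment-wise, the hunk above is the switch point: the Elasticsearch backend is kept whenever the `es` section of the base config carries a `username`, otherwise the new Infinity backend is used. The equivalent logic as a standalone sketch (the config key is taken from the hunk; everything else about service_conf.yaml is an assumption):

```python
import rag.utils.es_conn
import rag.utils.infinity_conn


def pick_doc_store(es_conf: dict):
    """es_conf is what get_base_config("es", {}) returns."""
    if "username" in es_conf:  # credentials configured -> keep Elasticsearch
        return rag.utils.es_conn.ESConnection()
    return rag.utils.infinity_conn.InfinityConnection()
```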

View File

@ -126,10 +126,6 @@ def server_error_response(e):
if len(e.args) > 1:
return get_json_result(
code=RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
if repr(e).find("index_not_found_exception") >= 0:
return get_json_result(code=RetCode.EXCEPTION_ERROR,
message="No chunk found, please upload file and parse it.")
return get_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e))
@ -270,10 +266,6 @@ def construct_error_response(e):
pass
if len(e.args) > 1:
return construct_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
if repr(e).find("index_not_found_exception") >= 0:
return construct_json_result(code=RetCode.EXCEPTION_ERROR,
message="No chunk found, please upload file and parse it.")
return construct_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e))
@ -295,7 +287,7 @@ def token_required(func):
return decorated_function
def get_result(code=RetCode.SUCCESS, message='error', data=None):
def get_result(code=RetCode.SUCCESS, message="", data=None):
if code == 0:
if data is not None:
response = {"code": code, "data": data}