mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Integration with Infinity (#2894)
### What problem does this PR solve? Integration with Infinity - Replaced ELASTICSEARCH with dataStoreConn - Renamed deleteByQuery with delete - Renamed bulk to upsertBulk - getHighlight, getAggregation - Fix KGSearch.search - Moved Dealer.sql_retrieval to es_conn.py ### Type of change - [x] Refactoring
This commit is contained in:
@ -529,13 +529,14 @@ def list_chunks():
|
||||
return get_json_result(
|
||||
data=False, message="Can't find doc_name or doc_id"
|
||||
)
|
||||
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
|
||||
|
||||
res = retrievaler.chunk_list(doc_id=doc_id, tenant_id=tenant_id)
|
||||
res = retrievaler.chunk_list(doc_id, tenant_id, kb_ids)
|
||||
res = [
|
||||
{
|
||||
"content": res_item["content_with_weight"],
|
||||
"doc_name": res_item["docnm_kwd"],
|
||||
"img_id": res_item["img_id"]
|
||||
"image_id": res_item["img_id"]
|
||||
} for res_item in res
|
||||
]
|
||||
|
||||
|
||||
@ -18,12 +18,10 @@ import json
|
||||
|
||||
from flask import request
|
||||
from flask_login import login_required, current_user
|
||||
from elasticsearch_dsl import Q
|
||||
|
||||
from api.db.services.dialog_service import keyword_extraction
|
||||
from rag.app.qa import rmPrefix, beAdoc
|
||||
from rag.nlp import search, rag_tokenizer
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from rag.utils import rmSpace
|
||||
from api.db import LLMType, ParserType
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
@ -31,12 +29,11 @@ from api.db.services.llm_service import LLMBundle
|
||||
from api.db.services.user_service import UserTenantService
|
||||
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
|
||||
from api.db.services.document_service import DocumentService
|
||||
from api.settings import RetCode, retrievaler, kg_retrievaler
|
||||
from api.settings import RetCode, retrievaler, kg_retrievaler, docStoreConn
|
||||
from api.utils.api_utils import get_json_result
|
||||
import hashlib
|
||||
import re
|
||||
|
||||
|
||||
@manager.route('/list', methods=['POST'])
|
||||
@login_required
|
||||
@validate_request("doc_id")
|
||||
@ -53,12 +50,13 @@ def list_chunk():
|
||||
e, doc = DocumentService.get_by_id(doc_id)
|
||||
if not e:
|
||||
return get_data_error_result(message="Document not found!")
|
||||
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
|
||||
query = {
|
||||
"doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
|
||||
}
|
||||
if "available_int" in req:
|
||||
query["available_int"] = int(req["available_int"])
|
||||
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
|
||||
sres = retrievaler.search(query, search.index_name(tenant_id), kb_ids, highlight=True)
|
||||
res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
|
||||
for id in sres.ids:
|
||||
d = {
|
||||
@ -69,16 +67,12 @@ def list_chunk():
|
||||
"doc_id": sres.field[id]["doc_id"],
|
||||
"docnm_kwd": sres.field[id]["docnm_kwd"],
|
||||
"important_kwd": sres.field[id].get("important_kwd", []),
|
||||
"img_id": sres.field[id].get("img_id", ""),
|
||||
"image_id": sres.field[id].get("img_id", ""),
|
||||
"available_int": sres.field[id].get("available_int", 1),
|
||||
"positions": sres.field[id].get("position_int", "").split("\t")
|
||||
"positions": json.loads(sres.field[id].get("position_list", "[]")),
|
||||
}
|
||||
if len(d["positions"]) % 5 == 0:
|
||||
poss = []
|
||||
for i in range(0, len(d["positions"]), 5):
|
||||
poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
|
||||
float(d["positions"][i + 3]), float(d["positions"][i + 4])])
|
||||
d["positions"] = poss
|
||||
assert isinstance(d["positions"], list)
|
||||
assert len(d["positions"])==0 or (isinstance(d["positions"][0], list) and len(d["positions"][0]) == 5)
|
||||
res["chunks"].append(d)
|
||||
return get_json_result(data=res)
|
||||
except Exception as e:
|
||||
@ -96,22 +90,20 @@ def get():
|
||||
tenants = UserTenantService.query(user_id=current_user.id)
|
||||
if not tenants:
|
||||
return get_data_error_result(message="Tenant not found!")
|
||||
res = ELASTICSEARCH.get(
|
||||
chunk_id, search.index_name(
|
||||
tenants[0].tenant_id))
|
||||
if not res.get("found"):
|
||||
tenant_id = tenants[0].tenant_id
|
||||
|
||||
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
|
||||
chunk = docStoreConn.get(chunk_id, search.index_name(tenant_id), kb_ids)
|
||||
if chunk is None:
|
||||
return server_error_response("Chunk not found")
|
||||
id = res["_id"]
|
||||
res = res["_source"]
|
||||
res["chunk_id"] = id
|
||||
k = []
|
||||
for n in res.keys():
|
||||
for n in chunk.keys():
|
||||
if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
|
||||
k.append(n)
|
||||
for n in k:
|
||||
del res[n]
|
||||
del chunk[n]
|
||||
|
||||
return get_json_result(data=res)
|
||||
return get_json_result(data=chunk)
|
||||
except Exception as e:
|
||||
if str(e).find("NotFoundError") >= 0:
|
||||
return get_json_result(data=False, message='Chunk not found!',
|
||||
@ -162,7 +154,7 @@ def set():
|
||||
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
|
||||
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
||||
d["q_%d_vec" % len(v)] = v.tolist()
|
||||
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
|
||||
docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
|
||||
return get_json_result(data=True)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
@ -174,11 +166,11 @@ def set():
|
||||
def switch():
|
||||
req = request.json
|
||||
try:
|
||||
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
|
||||
if not tenant_id:
|
||||
return get_data_error_result(message="Tenant not found!")
|
||||
if not ELASTICSEARCH.upsert([{"id": i, "available_int": int(req["available_int"])} for i in req["chunk_ids"]],
|
||||
search.index_name(tenant_id)):
|
||||
e, doc = DocumentService.get_by_id(req["doc_id"])
|
||||
if not e:
|
||||
return get_data_error_result(message="Document not found!")
|
||||
if not docStoreConn.update({"id": req["chunk_ids"]}, {"available_int": int(req["available_int"])},
|
||||
search.index_name(doc.tenant_id), doc.kb_id):
|
||||
return get_data_error_result(message="Index updating failure")
|
||||
return get_json_result(data=True)
|
||||
except Exception as e:
|
||||
@ -191,12 +183,11 @@ def switch():
|
||||
def rm():
|
||||
req = request.json
|
||||
try:
|
||||
if not ELASTICSEARCH.deleteByQuery(
|
||||
Q("ids", values=req["chunk_ids"]), search.index_name(current_user.id)):
|
||||
return get_data_error_result(message="Index updating failure")
|
||||
e, doc = DocumentService.get_by_id(req["doc_id"])
|
||||
if not e:
|
||||
return get_data_error_result(message="Document not found!")
|
||||
if not docStoreConn.delete({"id": req["chunk_ids"]}, search.index_name(current_user.id), doc.kb_id):
|
||||
return get_data_error_result(message="Index updating failure")
|
||||
deleted_chunk_ids = req["chunk_ids"]
|
||||
chunk_number = len(deleted_chunk_ids)
|
||||
DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
|
||||
@ -239,7 +230,7 @@ def create():
|
||||
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
|
||||
v = 0.1 * v[0] + 0.9 * v[1]
|
||||
d["q_%d_vec" % len(v)] = v.tolist()
|
||||
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
|
||||
docStoreConn.insert([d], search.index_name(tenant_id), doc.kb_id)
|
||||
|
||||
DocumentService.increment_chunk_num(
|
||||
doc.id, doc.kb_id, c, 1, 0)
|
||||
@ -256,8 +247,9 @@ def retrieval_test():
|
||||
page = int(req.get("page", 1))
|
||||
size = int(req.get("size", 30))
|
||||
question = req["question"]
|
||||
kb_id = req["kb_id"]
|
||||
if isinstance(kb_id, str): kb_id = [kb_id]
|
||||
kb_ids = req["kb_id"]
|
||||
if isinstance(kb_ids, str):
|
||||
kb_ids = [kb_ids]
|
||||
doc_ids = req.get("doc_ids", [])
|
||||
similarity_threshold = float(req.get("similarity_threshold", 0.0))
|
||||
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
|
||||
@ -265,17 +257,17 @@ def retrieval_test():
|
||||
|
||||
try:
|
||||
tenants = UserTenantService.query(user_id=current_user.id)
|
||||
for kid in kb_id:
|
||||
for kb_id in kb_ids:
|
||||
for tenant in tenants:
|
||||
if KnowledgebaseService.query(
|
||||
tenant_id=tenant.tenant_id, id=kid):
|
||||
tenant_id=tenant.tenant_id, id=kb_id):
|
||||
break
|
||||
else:
|
||||
return get_json_result(
|
||||
data=False, message='Only owner of knowledgebase authorized for this operation.',
|
||||
code=RetCode.OPERATING_ERROR)
|
||||
|
||||
e, kb = KnowledgebaseService.get_by_id(kb_id[0])
|
||||
e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
|
||||
if not e:
|
||||
return get_data_error_result(message="Knowledgebase not found!")
|
||||
|
||||
@ -290,7 +282,7 @@ def retrieval_test():
|
||||
question += keyword_extraction(chat_mdl, question)
|
||||
|
||||
retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
|
||||
ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_id, page, size,
|
||||
ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_ids, page, size,
|
||||
similarity_threshold, vector_similarity_weight, top,
|
||||
doc_ids, rerank_mdl=rerank_mdl, highlight=req.get("highlight"))
|
||||
for c in ranks["chunks"]:
|
||||
@ -309,12 +301,16 @@ def retrieval_test():
|
||||
@login_required
|
||||
def knowledge_graph():
|
||||
doc_id = request.args["doc_id"]
|
||||
e, doc = DocumentService.get_by_id(doc_id)
|
||||
if not e:
|
||||
return get_data_error_result(message="Document not found!")
|
||||
tenant_id = DocumentService.get_tenant_id(doc_id)
|
||||
kb_ids = KnowledgebaseService.get_kb_ids(tenant_id)
|
||||
req = {
|
||||
"doc_ids":[doc_id],
|
||||
"knowledge_graph_kwd": ["graph", "mind_map"]
|
||||
}
|
||||
tenant_id = DocumentService.get_tenant_id(doc_id)
|
||||
sres = retrievaler.search(req, search.index_name(tenant_id))
|
||||
sres = retrievaler.search(req, search.index_name(tenant_id), kb_ids, doc.kb_id)
|
||||
obj = {"graph": {}, "mind_map": {}}
|
||||
for id in sres.ids[:2]:
|
||||
ty = sres.field[id]["knowledge_graph_kwd"]
|
||||
|
||||
@ -17,7 +17,6 @@ import pathlib
|
||||
import re
|
||||
|
||||
import flask
|
||||
from elasticsearch_dsl import Q
|
||||
from flask import request
|
||||
from flask_login import login_required, current_user
|
||||
|
||||
@ -27,14 +26,13 @@ from api.db.services.file_service import FileService
|
||||
from api.db.services.task_service import TaskService, queue_tasks
|
||||
from api.db.services.user_service import UserTenantService
|
||||
from rag.nlp import search
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from api.db.services import duplicate_name
|
||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
|
||||
from api.utils import get_uuid
|
||||
from api.db import FileType, TaskStatus, ParserType, FileSource
|
||||
from api.db.services.document_service import DocumentService, doc_upload_and_parse
|
||||
from api.settings import RetCode
|
||||
from api.settings import RetCode, docStoreConn
|
||||
from api.utils.api_utils import get_json_result
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
from api.utils.file_utils import filename_type, thumbnail
|
||||
@ -275,18 +273,8 @@ def change_status():
|
||||
return get_data_error_result(
|
||||
message="Database error (Document update)!")
|
||||
|
||||
if str(req["status"]) == "0":
|
||||
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
|
||||
scripts="ctx._source.available_int=0;",
|
||||
idxnm=search.index_name(
|
||||
kb.tenant_id)
|
||||
)
|
||||
else:
|
||||
ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
|
||||
scripts="ctx._source.available_int=1;",
|
||||
idxnm=search.index_name(
|
||||
kb.tenant_id)
|
||||
)
|
||||
status = int(req["status"])
|
||||
docStoreConn.update({"doc_id": req["doc_id"]}, {"available_int": status}, search.index_name(kb.tenant_id), doc.kb_id)
|
||||
return get_json_result(data=True)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
@ -365,8 +353,11 @@ def run():
|
||||
tenant_id = DocumentService.get_tenant_id(id)
|
||||
if not tenant_id:
|
||||
return get_data_error_result(message="Tenant not found!")
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
|
||||
e, doc = DocumentService.get_by_id(id)
|
||||
if not e:
|
||||
return get_data_error_result(message="Document not found!")
|
||||
if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
|
||||
docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)
|
||||
|
||||
if str(req["run"]) == TaskStatus.RUNNING.value:
|
||||
TaskService.filter_delete([Task.doc_id == id])
|
||||
@ -490,8 +481,8 @@ def change_parser():
|
||||
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
|
||||
if not tenant_id:
|
||||
return get_data_error_result(message="Tenant not found!")
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
|
||||
if docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
|
||||
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
|
||||
|
||||
return get_json_result(data=True)
|
||||
except Exception as e:
|
||||
|
||||
@ -28,6 +28,8 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||
from api.db.db_models import File
|
||||
from api.settings import RetCode
|
||||
from api.utils.api_utils import get_json_result
|
||||
from api.settings import docStoreConn
|
||||
from rag.nlp import search
|
||||
|
||||
|
||||
@manager.route('/create', methods=['post'])
|
||||
@ -166,6 +168,9 @@ def rm():
|
||||
if not KnowledgebaseService.delete_by_id(req["kb_id"]):
|
||||
return get_data_error_result(
|
||||
message="Database error (Knowledgebase removal)!")
|
||||
tenants = UserTenantService.query(user_id=current_user.id)
|
||||
for tenant in tenants:
|
||||
docStoreConn.deleteIdx(search.index_name(tenant.tenant_id), req["kb_id"])
|
||||
return get_json_result(data=True)
|
||||
except Exception as e:
|
||||
return server_error_response(e)
|
||||
|
||||
@ -30,7 +30,6 @@ from api.db.services.task_service import TaskService, queue_tasks
|
||||
from api.utils.api_utils import server_error_response
|
||||
from api.utils.api_utils import get_result, get_error_data_result
|
||||
from io import BytesIO
|
||||
from elasticsearch_dsl import Q
|
||||
from flask import request, send_file
|
||||
from api.db import FileSource, TaskStatus, FileType
|
||||
from api.db.db_models import File
|
||||
@ -42,7 +41,7 @@ from api.settings import RetCode, retrievaler
|
||||
from api.utils.api_utils import construct_json_result, get_parser_config
|
||||
from rag.nlp import search
|
||||
from rag.utils import rmSpace
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from api.settings import docStoreConn
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
import os
|
||||
|
||||
@ -293,9 +292,7 @@ def update_doc(tenant_id, dataset_id, document_id):
|
||||
)
|
||||
if not e:
|
||||
return get_error_data_result(message="Document not found!")
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)
|
||||
)
|
||||
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), dataset_id)
|
||||
|
||||
return get_result()
|
||||
|
||||
@ -647,9 +644,7 @@ def parse(tenant_id, dataset_id):
|
||||
info["chunk_num"] = 0
|
||||
info["token_num"] = 0
|
||||
DocumentService.update_by_id(id, info)
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
|
||||
)
|
||||
docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), dataset_id)
|
||||
TaskService.filter_delete([Task.doc_id == id])
|
||||
e, doc = DocumentService.get_by_id(id)
|
||||
doc = doc.to_dict()
|
||||
@ -713,9 +708,7 @@ def stop_parsing(tenant_id, dataset_id):
|
||||
)
|
||||
info = {"run": "2", "progress": 0, "chunk_num": 0}
|
||||
DocumentService.update_by_id(id, info)
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
|
||||
)
|
||||
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), dataset_id)
|
||||
return get_result()
|
||||
|
||||
|
||||
@ -812,7 +805,6 @@ def list_chunks(tenant_id, dataset_id, document_id):
|
||||
"question": question,
|
||||
"sort": True,
|
||||
}
|
||||
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
|
||||
key_mapping = {
|
||||
"chunk_num": "chunk_count",
|
||||
"kb_id": "dataset_id",
|
||||
@ -833,51 +825,56 @@ def list_chunks(tenant_id, dataset_id, document_id):
|
||||
renamed_doc[new_key] = value
|
||||
if key == "run":
|
||||
renamed_doc["run"] = run_mapping.get(str(value))
|
||||
res = {"total": sres.total, "chunks": [], "doc": renamed_doc}
|
||||
origin_chunks = []
|
||||
sign = 0
|
||||
for id in sres.ids:
|
||||
d = {
|
||||
"chunk_id": id,
|
||||
"content_with_weight": (
|
||||
rmSpace(sres.highlight[id])
|
||||
if question and id in sres.highlight
|
||||
else sres.field[id].get("content_with_weight", "")
|
||||
),
|
||||
"doc_id": sres.field[id]["doc_id"],
|
||||
"docnm_kwd": sres.field[id]["docnm_kwd"],
|
||||
"important_kwd": sres.field[id].get("important_kwd", []),
|
||||
"img_id": sres.field[id].get("img_id", ""),
|
||||
"available_int": sres.field[id].get("available_int", 1),
|
||||
"positions": sres.field[id].get("position_int", "").split("\t"),
|
||||
}
|
||||
if len(d["positions"]) % 5 == 0:
|
||||
poss = []
|
||||
for i in range(0, len(d["positions"]), 5):
|
||||
poss.append(
|
||||
[
|
||||
float(d["positions"][i]),
|
||||
float(d["positions"][i + 1]),
|
||||
float(d["positions"][i + 2]),
|
||||
float(d["positions"][i + 3]),
|
||||
float(d["positions"][i + 4]),
|
||||
]
|
||||
)
|
||||
d["positions"] = poss
|
||||
|
||||
origin_chunks.append(d)
|
||||
res = {"total": 0, "chunks": [], "doc": renamed_doc}
|
||||
origin_chunks = []
|
||||
if docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
|
||||
sres = retrievaler.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None, highlight=True)
|
||||
res["total"] = sres.total
|
||||
sign = 0
|
||||
for id in sres.ids:
|
||||
d = {
|
||||
"id": id,
|
||||
"content_with_weight": (
|
||||
rmSpace(sres.highlight[id])
|
||||
if question and id in sres.highlight
|
||||
else sres.field[id].get("content_with_weight", "")
|
||||
),
|
||||
"doc_id": sres.field[id]["doc_id"],
|
||||
"docnm_kwd": sres.field[id]["docnm_kwd"],
|
||||
"important_kwd": sres.field[id].get("important_kwd", []),
|
||||
"img_id": sres.field[id].get("img_id", ""),
|
||||
"available_int": sres.field[id].get("available_int", 1),
|
||||
"positions": sres.field[id].get("position_int", "").split("\t"),
|
||||
}
|
||||
if len(d["positions"]) % 5 == 0:
|
||||
poss = []
|
||||
for i in range(0, len(d["positions"]), 5):
|
||||
poss.append(
|
||||
[
|
||||
float(d["positions"][i]),
|
||||
float(d["positions"][i + 1]),
|
||||
float(d["positions"][i + 2]),
|
||||
float(d["positions"][i + 3]),
|
||||
float(d["positions"][i + 4]),
|
||||
]
|
||||
)
|
||||
d["positions"] = poss
|
||||
|
||||
origin_chunks.append(d)
|
||||
if req.get("id"):
|
||||
if req.get("id") == id:
|
||||
origin_chunks.clear()
|
||||
origin_chunks.append(d)
|
||||
sign = 1
|
||||
break
|
||||
if req.get("id"):
|
||||
if req.get("id") == id:
|
||||
origin_chunks.clear()
|
||||
origin_chunks.append(d)
|
||||
sign = 1
|
||||
break
|
||||
if req.get("id"):
|
||||
if sign == 0:
|
||||
return get_error_data_result(f"Can't find this chunk {req.get('id')}")
|
||||
if sign == 0:
|
||||
return get_error_data_result(f"Can't find this chunk {req.get('id')}")
|
||||
|
||||
for chunk in origin_chunks:
|
||||
key_mapping = {
|
||||
"chunk_id": "id",
|
||||
"id": "id",
|
||||
"content_with_weight": "content",
|
||||
"doc_id": "document_id",
|
||||
"important_kwd": "important_keywords",
|
||||
@ -996,9 +993,9 @@ def add_chunk(tenant_id, dataset_id, document_id):
|
||||
)
|
||||
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
|
||||
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
|
||||
d["kb_id"] = [doc.kb_id]
|
||||
d["kb_id"] = dataset_id
|
||||
d["docnm_kwd"] = doc.name
|
||||
d["doc_id"] = doc.id
|
||||
d["doc_id"] = document_id
|
||||
embd_id = DocumentService.get_embd_id(document_id)
|
||||
embd_mdl = TenantLLMService.model_instance(
|
||||
tenant_id, LLMType.EMBEDDING.value, embd_id
|
||||
@ -1006,14 +1003,12 @@ def add_chunk(tenant_id, dataset_id, document_id):
|
||||
v, c = embd_mdl.encode([doc.name, req["content"]])
|
||||
v = 0.1 * v[0] + 0.9 * v[1]
|
||||
d["q_%d_vec" % len(v)] = v.tolist()
|
||||
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
|
||||
docStoreConn.insert([d], search.index_name(tenant_id), dataset_id)
|
||||
|
||||
DocumentService.increment_chunk_num(doc.id, doc.kb_id, c, 1, 0)
|
||||
d["chunk_id"] = chunk_id
|
||||
d["kb_id"] = doc.kb_id
|
||||
# rename keys
|
||||
key_mapping = {
|
||||
"chunk_id": "id",
|
||||
"id": "id",
|
||||
"content_with_weight": "content",
|
||||
"doc_id": "document_id",
|
||||
"important_kwd": "important_keywords",
|
||||
@ -1079,36 +1074,16 @@ def rm_chunk(tenant_id, dataset_id, document_id):
|
||||
"""
|
||||
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
doc = DocumentService.query(id=document_id, kb_id=dataset_id)
|
||||
if not doc:
|
||||
return get_error_data_result(
|
||||
message=f"You don't own the document {document_id}."
|
||||
)
|
||||
doc = doc[0]
|
||||
req = request.json
|
||||
if not req.get("chunk_ids"):
|
||||
return get_error_data_result("`chunk_ids` is required")
|
||||
query = {"doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
|
||||
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
|
||||
if not req:
|
||||
chunk_ids = None
|
||||
else:
|
||||
chunk_ids = req.get("chunk_ids")
|
||||
if not chunk_ids:
|
||||
chunk_list = sres.ids
|
||||
else:
|
||||
chunk_list = chunk_ids
|
||||
for chunk_id in chunk_list:
|
||||
if chunk_id not in sres.ids:
|
||||
return get_error_data_result(f"Chunk {chunk_id} not found")
|
||||
if not ELASTICSEARCH.deleteByQuery(
|
||||
Q("ids", values=chunk_list), search.index_name(tenant_id)
|
||||
):
|
||||
return get_error_data_result(message="Index updating failure")
|
||||
deleted_chunk_ids = chunk_list
|
||||
chunk_number = len(deleted_chunk_ids)
|
||||
DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
|
||||
return get_result()
|
||||
condition = {"doc_id": document_id}
|
||||
if "chunk_ids" in req:
|
||||
condition["id"] = req["chunk_ids"]
|
||||
chunk_number = docStoreConn.delete(condition, search.index_name(tenant_id), dataset_id)
|
||||
if chunk_number != 0:
|
||||
DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0)
|
||||
if "chunk_ids" in req and chunk_number != len(req["chunk_ids"]):
|
||||
return get_error_data_result(message=f"rm_chunk deleted chunks {chunk_number}, expect {len(req["chunk_ids"])}")
|
||||
return get_result(message=f"deleted {chunk_number} chunks")
|
||||
|
||||
|
||||
@manager.route(
|
||||
@ -1168,9 +1143,8 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
||||
schema:
|
||||
type: object
|
||||
"""
|
||||
try:
|
||||
res = ELASTICSEARCH.get(chunk_id, search.index_name(tenant_id))
|
||||
except Exception:
|
||||
chunk = docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id])
|
||||
if chunk is None:
|
||||
return get_error_data_result(f"Can't find this chunk {chunk_id}")
|
||||
if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
|
||||
return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
|
||||
@ -1180,19 +1154,12 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
||||
message=f"You don't own the document {document_id}."
|
||||
)
|
||||
doc = doc[0]
|
||||
query = {
|
||||
"doc_ids": [document_id],
|
||||
"page": 1,
|
||||
"size": 1024,
|
||||
"question": "",
|
||||
"sort": True,
|
||||
}
|
||||
sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
|
||||
if chunk_id not in sres.ids:
|
||||
return get_error_data_result(f"You don't own the chunk {chunk_id}")
|
||||
req = request.json
|
||||
content = res["_source"].get("content_with_weight")
|
||||
d = {"id": chunk_id, "content_with_weight": req.get("content", content)}
|
||||
if "content" in req:
|
||||
content = req["content"]
|
||||
else:
|
||||
content = chunk.get("content_with_weight", "")
|
||||
d = {"id": chunk_id, "content_with_weight": content}
|
||||
d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
|
||||
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
|
||||
if "important_keywords" in req:
|
||||
@ -1220,7 +1187,7 @@ def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
|
||||
v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
|
||||
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
||||
d["q_%d_vec" % len(v)] = v.tolist()
|
||||
ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
|
||||
docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id)
|
||||
return get_result()
|
||||
|
||||
|
||||
|
||||
@ -31,7 +31,7 @@ from api.utils.api_utils import (
|
||||
generate_confirmation_token,
|
||||
)
|
||||
from api.versions import get_rag_version
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from api.settings import docStoreConn
|
||||
from rag.utils.storage_factory import STORAGE_IMPL, STORAGE_IMPL_TYPE
|
||||
from timeit import default_timer as timer
|
||||
|
||||
@ -98,10 +98,11 @@ def status():
|
||||
res = {}
|
||||
st = timer()
|
||||
try:
|
||||
res["es"] = ELASTICSEARCH.health()
|
||||
res["es"]["elapsed"] = "{:.1f}".format((timer() - st) * 1000.0)
|
||||
res["doc_store"] = docStoreConn.health()
|
||||
res["doc_store"]["elapsed"] = "{:.1f}".format((timer() - st) * 1000.0)
|
||||
except Exception as e:
|
||||
res["es"] = {
|
||||
res["doc_store"] = {
|
||||
"type": "unknown",
|
||||
"status": "red",
|
||||
"elapsed": "{:.1f}".format((timer() - st) * 1000.0),
|
||||
"error": str(e),
|
||||
|
||||
@ -470,7 +470,7 @@ class User(DataBaseModel, UserMixin):
|
||||
status = CharField(
|
||||
max_length=1,
|
||||
null=True,
|
||||
help_text="is it validate(0: wasted,1: validate)",
|
||||
help_text="is it validate(0: wasted, 1: validate)",
|
||||
default="1",
|
||||
index=True)
|
||||
is_superuser = BooleanField(null=True, help_text="is root", default=False, index=True)
|
||||
@ -525,7 +525,7 @@ class Tenant(DataBaseModel):
|
||||
status = CharField(
|
||||
max_length=1,
|
||||
null=True,
|
||||
help_text="is it validate(0: wasted,1: validate)",
|
||||
help_text="is it validate(0: wasted, 1: validate)",
|
||||
default="1",
|
||||
index=True)
|
||||
|
||||
@ -542,7 +542,7 @@ class UserTenant(DataBaseModel):
|
||||
status = CharField(
|
||||
max_length=1,
|
||||
null=True,
|
||||
help_text="is it validate(0: wasted,1: validate)",
|
||||
help_text="is it validate(0: wasted, 1: validate)",
|
||||
default="1",
|
||||
index=True)
|
||||
|
||||
@ -559,7 +559,7 @@ class InvitationCode(DataBaseModel):
|
||||
status = CharField(
|
||||
max_length=1,
|
||||
null=True,
|
||||
help_text="is it validate(0: wasted,1: validate)",
|
||||
help_text="is it validate(0: wasted, 1: validate)",
|
||||
default="1",
|
||||
index=True)
|
||||
|
||||
@ -582,7 +582,7 @@ class LLMFactories(DataBaseModel):
|
||||
status = CharField(
|
||||
max_length=1,
|
||||
null=True,
|
||||
help_text="is it validate(0: wasted,1: validate)",
|
||||
help_text="is it validate(0: wasted, 1: validate)",
|
||||
default="1",
|
||||
index=True)
|
||||
|
||||
@ -616,7 +616,7 @@ class LLM(DataBaseModel):
|
||||
status = CharField(
|
||||
max_length=1,
|
||||
null=True,
|
||||
help_text="is it validate(0: wasted,1: validate)",
|
||||
help_text="is it validate(0: wasted, 1: validate)",
|
||||
default="1",
|
||||
index=True)
|
||||
|
||||
@ -703,7 +703,7 @@ class Knowledgebase(DataBaseModel):
|
||||
status = CharField(
|
||||
max_length=1,
|
||||
null=True,
|
||||
help_text="is it validate(0: wasted,1: validate)",
|
||||
help_text="is it validate(0: wasted, 1: validate)",
|
||||
default="1",
|
||||
index=True)
|
||||
|
||||
@ -767,7 +767,7 @@ class Document(DataBaseModel):
|
||||
status = CharField(
|
||||
max_length=1,
|
||||
null=True,
|
||||
help_text="is it validate(0: wasted,1: validate)",
|
||||
help_text="is it validate(0: wasted, 1: validate)",
|
||||
default="1",
|
||||
index=True)
|
||||
|
||||
@ -904,7 +904,7 @@ class Dialog(DataBaseModel):
|
||||
status = CharField(
|
||||
max_length=1,
|
||||
null=True,
|
||||
help_text="is it validate(0: wasted,1: validate)",
|
||||
help_text="is it validate(0: wasted, 1: validate)",
|
||||
default="1",
|
||||
index=True)
|
||||
|
||||
@ -987,7 +987,7 @@ def migrate_db():
|
||||
help_text="where dose this document come from",
|
||||
index=True))
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(
|
||||
@ -996,7 +996,7 @@ def migrate_db():
|
||||
help_text="default rerank model ID"))
|
||||
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(
|
||||
@ -1004,59 +1004,59 @@ def migrate_db():
|
||||
help_text="default rerank model ID"))
|
||||
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(
|
||||
migrator.add_column('dialog', 'top_k', IntegerField(default=1024))
|
||||
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(
|
||||
migrator.alter_column_type('tenant_llm', 'api_key',
|
||||
CharField(max_length=1024, null=True, help_text="API KEY", index=True))
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(
|
||||
migrator.add_column('api_token', 'source',
|
||||
CharField(max_length=16, null=True, help_text="none|agent|dialog", index=True))
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(
|
||||
migrator.add_column("tenant","tts_id",
|
||||
CharField(max_length=256,null=True,help_text="default tts model ID",index=True))
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(
|
||||
migrator.add_column('api_4_conversation', 'source',
|
||||
CharField(max_length=16, null=True, help_text="none|agent|dialog", index=True))
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
DB.execute_sql('ALTER TABLE llm DROP PRIMARY KEY;')
|
||||
DB.execute_sql('ALTER TABLE llm ADD PRIMARY KEY (llm_name,fid);')
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(
|
||||
migrator.add_column('task', 'retry_count', IntegerField(default=0))
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(
|
||||
migrator.alter_column_type('api_token', 'dialog_id',
|
||||
CharField(max_length=32, null=True, index=True))
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
#
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import traceback
|
||||
@ -24,16 +23,13 @@ from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
|
||||
from elasticsearch_dsl import Q
|
||||
from peewee import fn
|
||||
|
||||
from api.db.db_utils import bulk_insert_into_db
|
||||
from api.settings import stat_logger
|
||||
from api.settings import stat_logger, docStoreConn
|
||||
from api.utils import current_timestamp, get_format_time, get_uuid
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from graphrag.mind_map_extractor import MindMapExtractor
|
||||
from rag.settings import SVR_QUEUE_NAME
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
from rag.nlp import search, rag_tokenizer
|
||||
|
||||
@ -112,8 +108,7 @@ class DocumentService(CommonService):
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def remove_document(cls, doc, tenant_id):
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
|
||||
docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
|
||||
cls.clear_chunk_num(doc.id)
|
||||
return cls.delete_by_id(doc.id)
|
||||
|
||||
@ -225,6 +220,15 @@ class DocumentService(CommonService):
|
||||
return
|
||||
return docs[0]["tenant_id"]
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_knowledgebase_id(cls, doc_id):
|
||||
docs = cls.model.select(cls.model.kb_id).where(cls.model.id == doc_id)
|
||||
docs = docs.dicts()
|
||||
if not docs:
|
||||
return
|
||||
return docs[0]["kb_id"]
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_tenant_id_by_name(cls, name):
|
||||
@ -438,11 +442,6 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
|
||||
if not e:
|
||||
raise LookupError("Can't find this knowledgebase!")
|
||||
|
||||
idxnm = search.index_name(kb.tenant_id)
|
||||
if not ELASTICSEARCH.indexExist(idxnm):
|
||||
ELASTICSEARCH.createIdx(idxnm, json.load(
|
||||
open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
|
||||
|
||||
embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING, llm_name=kb.embd_id, lang=kb.language)
|
||||
|
||||
err, files = FileService.upload_document(kb, file_objs, user_id)
|
||||
@ -486,7 +485,7 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
|
||||
md5 = hashlib.md5()
|
||||
md5.update((ck["content_with_weight"] +
|
||||
str(d["doc_id"])).encode("utf-8"))
|
||||
d["_id"] = md5.hexdigest()
|
||||
d["id"] = md5.hexdigest()
|
||||
d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
|
||||
d["create_timestamp_flt"] = datetime.now().timestamp()
|
||||
if not d.get("image"):
|
||||
@ -499,8 +498,8 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
|
||||
else:
|
||||
d["image"].save(output_buffer, format='JPEG')
|
||||
|
||||
STORAGE_IMPL.put(kb.id, d["_id"], output_buffer.getvalue())
|
||||
d["img_id"] = "{}-{}".format(kb.id, d["_id"])
|
||||
STORAGE_IMPL.put(kb.id, d["id"], output_buffer.getvalue())
|
||||
d["img_id"] = "{}-{}".format(kb.id, d["id"])
|
||||
del d["image"]
|
||||
docs.append(d)
|
||||
|
||||
@ -520,6 +519,9 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
|
||||
token_counts[doc_id] += c
|
||||
return vects
|
||||
|
||||
idxnm = search.index_name(kb.tenant_id)
|
||||
try_create_idx = True
|
||||
|
||||
_, tenant = TenantService.get_by_id(kb.tenant_id)
|
||||
llm_bdl = LLMBundle(kb.tenant_id, LLMType.CHAT, tenant.llm_id)
|
||||
for doc_id in docids:
|
||||
@ -550,7 +552,11 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
|
||||
v = vects[i]
|
||||
d["q_%d_vec" % len(v)] = v
|
||||
for b in range(0, len(cks), es_bulk_size):
|
||||
ELASTICSEARCH.bulk(cks[b:b + es_bulk_size], idxnm)
|
||||
if try_create_idx:
|
||||
if not docStoreConn.indexExist(idxnm, kb_id):
|
||||
docStoreConn.createIdx(idxnm, kb_id, len(vects[0]))
|
||||
try_create_idx = False
|
||||
docStoreConn.insert(cks[b:b + es_bulk_size], idxnm, kb_id)
|
||||
|
||||
DocumentService.increment_chunk_num(
|
||||
doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)
|
||||
|
||||
@ -66,6 +66,16 @@ class KnowledgebaseService(CommonService):
|
||||
|
||||
return list(kbs.dicts())
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_kb_ids(cls, tenant_id):
|
||||
fields = [
|
||||
cls.model.id,
|
||||
]
|
||||
kbs = cls.model.select(*fields).where(cls.model.tenant_id == tenant_id)
|
||||
kb_ids = [kb["id"] for kb in kbs]
|
||||
return kb_ids
|
||||
|
||||
@classmethod
|
||||
@DB.connection_context()
|
||||
def get_detail(cls, kb_id):
|
||||
|
||||
@ -18,6 +18,8 @@ from datetime import date
|
||||
from enum import IntEnum, Enum
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.log_utils import LoggerFactory, getLogger
|
||||
import rag.utils.es_conn
|
||||
import rag.utils.infinity_conn
|
||||
|
||||
# Logger
|
||||
LoggerFactory.set_directory(
|
||||
@ -33,7 +35,7 @@ access_logger = getLogger("access")
|
||||
database_logger = getLogger("database")
|
||||
chat_logger = getLogger("chat")
|
||||
|
||||
from rag.utils.es_conn import ELASTICSEARCH
|
||||
import rag.utils
|
||||
from rag.nlp import search
|
||||
from graphrag import search as kg_search
|
||||
from api.utils import get_base_config, decrypt_database_config
|
||||
@ -206,8 +208,12 @@ AUTHENTICATION_DEFAULT_TIMEOUT = 7 * 24 * 60 * 60 # s
|
||||
PRIVILEGE_COMMAND_WHITELIST = []
|
||||
CHECK_NODES_IDENTITY = False
|
||||
|
||||
retrievaler = search.Dealer(ELASTICSEARCH)
|
||||
kg_retrievaler = kg_search.KGSearch(ELASTICSEARCH)
|
||||
if 'username' in get_base_config("es", {}):
|
||||
docStoreConn = rag.utils.es_conn.ESConnection()
|
||||
else:
|
||||
docStoreConn = rag.utils.infinity_conn.InfinityConnection()
|
||||
retrievaler = search.Dealer(docStoreConn)
|
||||
kg_retrievaler = kg_search.KGSearch(docStoreConn)
|
||||
|
||||
|
||||
class CustomEnum(Enum):
|
||||
|
||||
@ -126,10 +126,6 @@ def server_error_response(e):
|
||||
if len(e.args) > 1:
|
||||
return get_json_result(
|
||||
code=RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
|
||||
if repr(e).find("index_not_found_exception") >= 0:
|
||||
return get_json_result(code=RetCode.EXCEPTION_ERROR,
|
||||
message="No chunk found, please upload file and parse it.")
|
||||
|
||||
return get_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e))
|
||||
|
||||
|
||||
@ -270,10 +266,6 @@ def construct_error_response(e):
|
||||
pass
|
||||
if len(e.args) > 1:
|
||||
return construct_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
|
||||
if repr(e).find("index_not_found_exception") >= 0:
|
||||
return construct_json_result(code=RetCode.EXCEPTION_ERROR,
|
||||
message="No chunk found, please upload file and parse it.")
|
||||
|
||||
return construct_json_result(code=RetCode.EXCEPTION_ERROR, message=repr(e))
|
||||
|
||||
|
||||
@ -295,7 +287,7 @@ def token_required(func):
|
||||
return decorated_function
|
||||
|
||||
|
||||
def get_result(code=RetCode.SUCCESS, message='error', data=None):
|
||||
def get_result(code=RetCode.SUCCESS, message="", data=None):
|
||||
if code == 0:
|
||||
if data is not None:
|
||||
response = {"code": code, "data": data}
|
||||
|
||||
Reference in New Issue
Block a user