build dialog server; add thumbnail to docinfo; (#17)

Commit: 3fc700a1d4 (parent 3245107dc7)
Author: KevinHuSh, committed by GitHub
Date: 2023-12-26 19:32:06 +08:00
12 changed files with 94 additions and 42 deletions

View File

@@ -1,10 +1,10 @@
 [infiniflow]
-es=http://127.0.0.1:9200
+es=http://es01:9200
 pgdb_usr=root
 pgdb_pwd=infiniflow_docgpt
-pgdb_host=127.0.0.1
-pgdb_port=5455
-minio_host=127.0.0.1:9000
+pgdb_host=postgres
+pgdb_port=5432
+minio_host=minio:9000
 minio_usr=infiniflow
 minio_pwd=infiniflow_docgpt
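
Note on this hunk: the hosts move from 127.0.0.1 to Docker Compose service names (es01, postgres, minio), so the servers reach their dependencies over the compose network instead of the loopback interface. A minimal sketch of reading such an INI section with Python's configparser; the conf/sys.cnf path is an assumption, not something this diff confirms:

    import configparser

    cfg = configparser.ConfigParser()
    cfg.read("conf/sys.cnf")              # hypothetical config path
    section = cfg["infiniflow"]
    es_url = section["es"]                # e.g. "http://es01:9200"
    minio_host = section["minio_host"]    # e.g. "minio:9000"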

View File

@@ -24,6 +24,7 @@ class QWen(Base):
         from http import HTTPStatus
         from dashscope import Generation
         from dashscope.api_entities.dashscope_response import Role
+        # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
         response = Generation.call(
             Generation.Models.qwen_turbo,
             messages=messages,
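
Note: the added comment records that dashscope reads its credential from the DASHSCOPE_API_KEY environment variable rather than taking it as an argument. A self-contained sketch of the same call pattern, assuming result_format="message" (the tail of the call is outside this hunk):

    import os
    from http import HTTPStatus
    from dashscope import Generation

    # export DASHSCOPE_API_KEY=YOUR_DASHSCOPE_API_KEY
    assert os.environ.get("DASHSCOPE_API_KEY"), "set DASHSCOPE_API_KEY first"

    response = Generation.call(
        Generation.Models.qwen_turbo,
        messages=[{"role": "user", "content": "Hi!"}],
        result_format="message",
    )
    if response.status_code == HTTPStatus.OK:
        print(response.output.choices[0]["message"]["content"])
    else:
        print(response.code, response.message)  # error code and detail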

View File

@@ -9,6 +9,8 @@ from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
 import numpy as np
 from copy import deepcopy
+def index_name(uid):return f"docgpt_{uid}"
+
 class Dealer:
     def __init__(self, es, emb_mdl):
         self.qryr = query.EsQueryer(es)
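
Note: hoisting index_name into nlp/search gives the per-user index naming scheme a single shared definition, so every caller derives the Elasticsearch index from the uid the same way:

    from nlp import search

    search.index_name("u42")   # -> "docgpt_u42"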

View File

@@ -6,11 +6,10 @@ from tornado.ioloop import IOLoop
 from tornado.httpserver import HTTPServer
 from tornado.options import define,options
 from util import es_conn, setup_logging
-from svr import sec_search as search
 from svr.rpc_proxy import RPCProxy
-from sklearn.metrics.pairwise import cosine_similarity as CosineSimilarity
 from nlp import huqie
 from nlp import query as Query
+from nlp import search
 from llm import HuEmbedding, GptTurbo
 import numpy as np
 from io import BytesIO
@@ -38,7 +37,7 @@ def get_QA_pairs(hists):
-def get_instruction(sres, top_i, max_len=8096 fld="content_ltks"):
+def get_instruction(sres, top_i, max_len=8096, fld="content_ltks"):
     max_len //= len(top_i)
     # add instruction to prompt
     instructions = [re.sub(r"[\r\n]+", " ", sres.field[sres.ids[i]][fld]) for i in top_i]
@@ -96,10 +95,11 @@ class Handler(RequestHandler):
         try:
             question = param.get("history",[{"user": "Hi!"}])[-1]["user"]
             res = SE.search({
-                "question": question,
-                "kb_ids": param.get("kb_ids", []),
-                "size": param.get("topn", 15)
-            })
+                "question": question,
+                "kb_ids": param.get("kb_ids", []),
+                "size": param.get("topn", 15)},
+                search.index_name(param["uid"])
+            )
             sim = SE.rerank(res, question)
             rk_idx = np.argsort(sim*-1)
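
Note: np.argsort sorts ascending, so negating the similarity vector yields chunk indices ordered from most to least similar. A tiny worked example:

    import numpy as np

    sim = np.array([0.2, 0.9, 0.5])
    rk_idx = np.argsort(sim * -1)   # array([1, 2, 0]): best chunk first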
@@ -112,12 +112,12 @@ class Handler(RequestHandler):
             refer = OrderedDict()
             docnms = {}
             for i in rk_idx:
-                did = res.field[res.ids[i]]["doc_id"])
-                if did not in docnms: docnms[did] = res.field[res.ids[i]]["docnm_kwd"])
+                did = res.field[res.ids[i]]["doc_id"]
+                if did not in docnms: docnms[did] = res.field[res.ids[i]]["docnm_kwd"]
                 if did not in refer: refer[did] = []
                 refer[did].append({
                     "chunk_id": res.ids[i],
-                    "content": res.field[res.ids[i]]["content_ltks"]),
+                    "content": res.field[res.ids[i]]["content_ltks"],
                     "image": ""
                 })
@@ -128,7 +128,7 @@
             "data":{
                 "uid": param["uid"],
                 "dialog_id": param["dialog_id"],
-                "assistant": ans
+                "assistant": ans,
                 "refer": [{
                     "did": did,
                     "doc_name": docnms[did],
@@ -153,7 +153,7 @@ if __name__ == '__main__':
     parser.add_argument("--port", default=4455, type=int, help="Port used for service")
     ARGS = parser.parse_args()
-    SE = search.ResearchReportSearch(es_conn.HuEs("infiniflow"), EMBEDDING)
+    SE = search.Dealer(es_conn.HuEs("infiniflow"), EMBEDDING)
     app = Application([(r'/v1/chat/completions', Handler)],debug=False)
     http_server = HTTPServer(app)
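
Note: for readers new to Tornado's bootstrapping, a self-contained sketch of the pattern this main block uses; the handler body is a placeholder, not the one in this file:

    from tornado.web import Application, RequestHandler
    from tornado.httpserver import HTTPServer
    from tornado.ioloop import IOLoop

    class Handler(RequestHandler):
        def post(self):
            self.write({"retmsg": "ok"})   # placeholder JSON response

    app = Application([(r'/v1/chat/completions', Handler)], debug=False)
    http_server = HTTPServer(app)
    http_server.listen(4455)   # matches the --port default above
    IOLoop.current().start()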

View File

@@ -6,7 +6,7 @@ from util.db_conn import Postgres
 from util.minio_conn import HuMinio
 from util import rmSpace, findMaxDt
 from FlagEmbedding import FlagModel
-from nlp import huchunk, huqie
+from nlp import huchunk, huqie, search
 import base64, hashlib
 from io import BytesIO
 import pandas as pd
@@ -103,7 +103,7 @@ def build(row):
             if(!ctx._source.kb_id.contains('%s'))
                 ctx._source.kb_id.add('%s');
         """%(str(row["kb_id"]), str(row["kb_id"])),
-        idxnm = index_name(row["uid"])
+        idxnm = search.index_name(row["uid"])
     )
     set_progress(row["kb2doc_id"], 1, "Done")
     return []
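
Note: the Painless script above appends a kb_id to a chunk's kb_id list only when it is not already present. The wrapper that ships the script to Elasticsearch sits outside this hunk, so here is a hedged, self-contained sketch of the same update using the stock elasticsearch-py client rather than the repo's HuEs wrapper, with the query selector invented for illustration:

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://es01:9200")
    es.update_by_query(
        index="docgpt_u42",   # i.e. search.index_name(uid)
        body={
            "query": {"term": {"doc_id": "d1"}},   # hypothetical selector
            "script": {
                "lang": "painless",
                "source": "if(!ctx._source.kb_id.contains(params.k)) "
                          "ctx._source.kb_id.add(params.k);",
                "params": {"k": "kb7"},   # sample kb_id
            },
        },
    )

Passing the id through script params, rather than the %-interpolation used in the file, sidesteps quoting issues when kb_id values contain special characters.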
@@ -171,10 +171,8 @@ def build(row):
     return docs

-def index_name(uid):return f"docgpt_{uid}"
-
 def init_kb(row):
-    idxnm = index_name(row["uid"])
+    idxnm = search.index_name(row["uid"])
     if ES.indexExist(idxnm): return
     return ES.createIdx(idxnm, json.load(open("conf/mapping.json", "r")))
@@ -199,7 +197,7 @@ def rm_doc_from_kb(df):
                 ctx._source.kb_id.indexOf('%s')
             );
         """%(str(r["kb_id"]),str(r["kb_id"])),
-        idxnm = index_name(r["uid"])
+        idxnm = search.index_name(r["uid"])
     )
     if len(df) == 0:return
     sql = """
@@ -233,7 +231,7 @@ def main(comm, mod):
         set_progress(r["kb2doc_id"], random.randint(70, 95)/100.,
                      "Finished embedding! Start to build index!")
         init_kb(r)
-        es_r = ES.bulk(cks, index_name(r["uid"]))
+        es_r = ES.bulk(cks, search.index_name(r["uid"]))
         if es_r:
             set_progress(r["kb2doc_id"], -1, "Index failure!")
             print(es_r)
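
Note: ES.bulk writes the embedded chunks into the per-uid index, and a truthy return value is treated as a bulk failure. HuEs.bulk's signature is not visible in this diff, so here is a hedged sketch of the equivalent step with elasticsearch-py's bulk helper, with the chunk layout invented for illustration:

    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    es = Elasticsearch("http://es01:9200")
    cks = [{"id": "c1", "content_ltks": "..."}]   # stand-in chunk dicts
    actions = [
        {"_index": "docgpt_u42", "_id": ck["id"], "_source": ck}
        for ck in cks                              # "docgpt_u42" = search.index_name(uid)
    ]
    ok, errors = bulk(es, actions, raise_on_error=False)
    if errors:
        print("Index failure!", errors)   # mirrors the set_progress(-1, ...) path above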