Refine resume parts and fix bugs in retrieval using SQL (#66)

KevinHuSh
2024-02-19 19:22:17 +08:00
committed by GitHub
parent 452020d33a
commit a8294f2168
29 changed files with 302 additions and 158 deletions


@@ -21,20 +21,21 @@ from api.db.services.dialog_service import DialogService, ConversationService
 from api.db import LLMType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMService, LLMBundle
-from api.settings import access_logger
+from api.settings import access_logger, stat_logger
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid
 from api.utils.api_utils import get_json_result
+from rag.app.resume import forbidden_select_fields4resume
 from rag.llm import ChatModel
 from rag.nlp import retrievaler
 from rag.nlp.search import index_name
-from rag.utils import num_tokens_from_string, encoder
+from rag.utils import num_tokens_from_string, encoder, rmSpace


 @manager.route('/set', methods=['POST'])
 @login_required
 @validate_request("dialog_id")
-def set():
+def set_conversation():
     req = request.json
     conv_id = req.get("conversation_id")
     if conv_id:
@@ -96,9 +97,10 @@ def rm():
     except Exception as e:
         return server_error_response(e)


 @manager.route('/list', methods=['GET'])
 @login_required
-def list():
+def list_convsersation():
     dialog_id = request.args["dialog_id"]
     try:
         convs = ConversationService.query(dialog_id=dialog_id)
@@ -175,6 +177,7 @@ def chat(dialog, messages, **kwargs):
     field_map = KnowledgebaseService.get_field_map(dialog.kb_ids)
     ## try to use sql if field mapping is good to go
     if field_map:
+        stat_logger.info("Use SQL to retrieval.")
         markdown_tbl, chunks = use_sql(question, field_map, dialog.tenant_id, chat_mdl)
         if markdown_tbl:
             return {"answer": markdown_tbl, "retrieval": {"chunks": chunks}}
@@ -186,7 +189,8 @@ def chat(dialog, messages, **kwargs):
         if p["key"] not in kwargs:
             prompt_config["system"] = prompt_config["system"].replace("{%s}" % p["key"], " ")

-    kbinfos = retrievaler.retrieval(question, embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n, dialog.similarity_threshold,
+    kbinfos = retrievaler.retrieval(question, embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
+                                    dialog.similarity_threshold,
                                     dialog.vector_similarity_weight, top=1024, aggs=False)
     knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
@@ -220,32 +224,42 @@ def use_sql(question,field_map, tenant_id, chat_mdl):
 {}

 问题:{}
-请写出SQL。
+请写出SQL且只要SQL不要有其他说明及文字。
 """.format(
         index_name(tenant_id),
         "\n".join([f"{k}: {v}" for k, v in field_map.items()]),
         question
     )
-    sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_promt}], {"temperature": 0.1})
-    sql = re.sub(r".*?select ", "select ", sql, flags=re.IGNORECASE)
+    sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_promt}], {"temperature": 0.06})
+    stat_logger.info(f"“{question}” get SQL: {sql}")
+    sql = re.sub(r"[\r\n]+", " ", sql.lower())
+    sql = re.sub(r".*?select ", "select ", sql.lower())
     sql = re.sub(r" +", " ", sql)
-    sql = re.sub(r"[;].*", "", sql)
-    if sql[:len("select ")].lower() != "select ":
+    sql = re.sub(r"([;]|```).*", "", sql)
+    if sql[:len("select ")] != "select ":
         return None, None
-    if sql[:len("select *")].lower() != "select *":
+    if sql[:len("select *")] != "select *":
         sql = "select doc_id,docnm_kwd," + sql[6:]
+    else:
+        flds = []
+        for k in field_map.keys():
+            if k in forbidden_select_fields4resume:continue
+            if len(flds) > 11:break
+            flds.append(k)
+        sql = "select doc_id,docnm_kwd," + ",".join(flds) + sql[8:]

-    tbl = retrievaler.sql_retrieval(sql)
-    if not tbl: return None, None
+    stat_logger.info(f"“{question}” get SQL(refined): {sql}")
+    tbl = retrievaler.sql_retrieval(sql, format="json")
+    if not tbl or len(tbl["rows"]) == 0: return None, None

     docid_idx = set([ii for ii, c in enumerate(tbl["columns"]) if c["name"] == "doc_id"])
     docnm_idx = set([ii for ii, c in enumerate(tbl["columns"]) if c["name"] == "docnm_kwd"])
     clmn_idx = [ii for ii in range(len(tbl["columns"])) if ii not in (docid_idx | docnm_idx)]

     # compose markdown table
-    clmns = "|".join([re.sub(r"/.*", "", field_map.get(tbl["columns"][i]["name"], f"C{i}")) for i in clmn_idx]) + "|原文"
+    clmns = "|".join([re.sub(r"(/.*|（[^（）]+）)", "", field_map.get(tbl["columns"][i]["name"], f"C{i}")) for i in clmn_idx]) + "|原文"
     line = "|".join(["------" for _ in range(len(clmn_idx))]) + "|------"
-    rows = ["|".join([str(r[i]) for i in clmn_idx])+"|" for r in tbl["rows"]]
+    rows = ["|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
     if not docid_idx or not docnm_idx:
         access_logger.error("SQL missing field: " + sql)
         return "\n".join([clmns, line, "\n".join(rows)]), []
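
Note on the new SQL cleanup in use_sql: the model reply is flattened to one line and lowercased, everything before the first select is dropped, and the tail is truncated at a semicolon or a markdown fence. A minimal standalone sketch of that pipeline; the sample reply and table name below are invented for illustration:

    import re

    def clean_llm_sql(ans: str) -> str:
        # flatten and lowercase, then drop any chatter before the first SELECT
        sql = re.sub(r"[\r\n]+", " ", ans.lower())
        sql = re.sub(r".*?select ", "select ", sql)
        sql = re.sub(r" +", " ", sql)
        # cut the tail at a trailing semicolon or a closing markdown fence
        return re.sub(r"([;]|```).*", "", sql)

    ans = "```sql\nSELECT name_kwd, age_int FROM ragflow_xyz WHERE age_int > 30;\n```"
    print(clean_llm_sql(ans))
    # -> select name_kwd, age_int from ragflow_xyz where age_int > 30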


@@ -27,7 +27,7 @@ from api.utils.api_utils import get_json_result
 @manager.route('/set', methods=['POST'])
 @login_required
-def set():
+def set_dialog():
     req = request.json
     dialog_id = req.get("dialog_id")
     name = req.get("name", "New Dialog")


@@ -262,17 +262,18 @@ def rename():
         return server_error_response(e)


-@manager.route('/get', methods=['GET'])
-@login_required
-def get():
-    doc_id = request.args["doc_id"]
+@manager.route('/get/<doc_id>', methods=['GET'])
+def get(doc_id):
     try:
         e, doc = DocumentService.get_by_id(doc_id)
         if not e:
             return get_data_error_result(retmsg="Document not found!")
-        blob = MINIO.get(doc.kb_id, doc.location)
-        return get_json_result(data={"base64": base64.b64decode(blob)})
+        response = flask.make_response(MINIO.get(doc.kb_id, doc.location))
+        ext = re.search(r"\.([^.]+)$", doc.name)
+        if ext:
+            response.headers.set('Content-Type', 'application/%s'%ext.group(1))
+        return response
     except Exception as e:
         return server_error_response(e)
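
Note: the document route now returns the raw blob with a guessed Content-Type instead of base64 inside JSON. A hedged usage sketch; the host and the /v1/document prefix are assumptions about how this blueprint is mounted:

    import requests

    BASE = "http://127.0.0.1:9380/v1/document"  # hypothetical mount point
    doc_id = "<some-doc-id>"

    r = requests.get(f"{BASE}/get/{doc_id}")
    # Content-Type is derived from the stored file name's extension,
    # e.g. "application/pdf" for "report.pdf".
    print(r.status_code, r.headers.get("Content-Type"))
    with open("downloaded.bin", "wb") as f:
        f.write(r.content)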


@@ -38,6 +38,9 @@ def create():
         req["id"] = get_uuid()
         req["tenant_id"] = current_user.id
         req["created_by"] = current_user.id
+        e, t = TenantService.get_by_id(current_user.id)
+        if not e: return get_data_error_result(retmsg="Tenant not found.")
+        req["embd_id"] = t.embd_id
         if not KnowledgebaseService.save(**req): return get_data_error_result()
         return get_json_result(data={"kb_id": req["id"]})
     except Exception as e:


@@ -21,11 +21,12 @@ from api.db.services.llm_service import LLMFactoriesService, TenantLLMService, L
 from api.db.services.user_service import TenantService, UserTenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
 from api.utils import get_uuid, get_format_time
-from api.db import StatusEnum, UserTenantRole
+from api.db import StatusEnum, UserTenantRole, LLMType
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.db_models import Knowledgebase, TenantLLM
 from api.settings import stat_logger, RetCode
 from api.utils.api_utils import get_json_result
+from rag.llm import EmbeddingModel, CvModel, ChatModel


 @manager.route('/factories', methods=['GET'])
@@ -43,16 +44,37 @@ def factories():
 @validate_request("llm_factory", "api_key")
 def set_api_key():
     req = request.json
+    # test if api key works
+    msg = ""
+    for llm in LLMService.query(fid=req["llm_factory"]):
+        if llm.model_type == LLMType.EMBEDDING.value:
+            mdl = EmbeddingModel[req["llm_factory"]](
+                req["api_key"], llm.llm_name)
+            try:
+                arr, tc = mdl.encode(["Test if the api key is available"])
+                if len(arr[0]) == 0 or tc ==0: raise Exception("Fail")
+            except Exception as e:
+                msg += f"\nFail to access embedding model({llm.llm_name}) using this api key."
+        elif llm.model_type == LLMType.CHAT.value:
+            mdl = ChatModel[req["llm_factory"]](
+                req["api_key"], llm.llm_name)
+            try:
+                m, tc = mdl.chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], {"temperature": 0.9})
+                if not tc: raise Exception(m)
+            except Exception as e:
+                msg += f"\nFail to access model({llm.llm_name}) using this api key." + str(e)
+
+    if msg: return get_data_error_result(retmsg=msg)
+
     llm = {
         "tenant_id": current_user.id,
         "llm_factory": req["llm_factory"],
         "api_key": req["api_key"]
     }
-    # TODO: Test api_key
     for n in ["model_type", "llm_name"]:
         if n in req: llm[n] = req[n]

-    TenantLLM.insert(**llm).on_conflict("replace").execute()
+    TenantLLMService.filter_update([TenantLLM.tenant_id==llm["tenant_id"], TenantLLM.llm_factory==llm["llm_factory"]], llm)
     return get_json_result(data=True)
@@ -69,6 +91,7 @@ def my_llms():
 @manager.route('/list', methods=['GET'])
 @login_required
 def list():
+    model_type = request.args.get("model_type")
     try:
         objs = TenantLLMService.query(tenant_id=current_user.id)
         mdlnms = set([o.to_dict()["llm_name"] for o in objs if o.api_key])
@@ -79,6 +102,7 @@ def list():
         res = {}
         for m in llms:
+            if model_type and m["model_type"] != model_type: continue
             if m["fid"] not in res: res[m["fid"]] = []
             res[m["fid"]].append(m)


@@ -24,7 +24,8 @@ from api.db.services.llm_service import TenantLLMService, LLMService
 from api.utils.api_utils import server_error_response, validate_request
 from api.utils import get_uuid, get_format_time, decrypt, download_img
 from api.db import UserTenantRole, LLMType
-from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS
+from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
+    LLM_FACTORY
 from api.db.services.user_service import UserService, TenantService, UserTenantService
 from api.settings import stat_logger
 from api.utils.api_utils import get_json_result, cors_reponse
@@ -204,8 +205,8 @@ def user_register(user_id, user):
         "role": UserTenantRole.OWNER
     }
     tenant_llm = []
-    for llm in LLMService.query(fid="Infiniflow"):
-        tenant_llm.append({"tenant_id": user_id, "llm_factory": "Infiniflow", "llm_name": llm.llm_name, "model_type":llm.model_type, "api_key": "infiniflow API Key"})
+    for llm in LLMService.query(fid=LLM_FACTORY):
+        tenant_llm.append({"tenant_id": user_id, "llm_factory": LLM_FACTORY, "llm_name": llm.llm_name, "model_type":llm.model_type, "api_key": API_KEY})

     if not UserService.save(**user):return
     TenantService.save(**tenant)


@@ -465,7 +465,8 @@ class Knowledgebase(DataBaseModel):
     tenant_id = CharField(max_length=32, null=False)
     name = CharField(max_length=128, null=False, help_text="KB name", index=True)
     description = TextField(null=True, help_text="KB description")
-    permission = CharField(max_length=16, null=False, help_text="me|team")
+    embd_id = CharField(max_length=128, null=False, help_text="default embedding model ID")
+    permission = CharField(max_length=16, null=False, help_text="me|team", default="me")
     created_by = CharField(max_length=32, null=False)
     doc_num = IntegerField(default=0)
     token_num = IntegerField(default=0)


@@ -46,11 +46,6 @@ def init_llm_factory():
         "logo": "",
         "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
         "status": "1",
-    },{
-        "name": "Infiniflow",
-        "logo": "",
-        "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
-        "status": "1",
     },{
         "name": "智普AI",
         "logo": "",
@@ -135,59 +130,33 @@ def init_llm_factory():
         "model_type": LLMType.SPEECH2TEXT.value
     },{
         "fid": factory_infos[1]["name"],
-        "llm_name": "qwen_vl_chat_v1",
-        "tags": "LLM,CHAT,IMAGE2TEXT",
-        "max_tokens": 765,
-        "model_type": LLMType.IMAGE2TEXT.value
-    },
-    # ----------------------- Infiniflow -----------------------
-    {
-        "fid": factory_infos[2]["name"],
-        "llm_name": "gpt-3.5-turbo",
-        "tags": "LLM,CHAT,4K",
-        "max_tokens": 4096,
-        "model_type": LLMType.CHAT.value
-    },{
-        "fid": factory_infos[2]["name"],
-        "llm_name": "text-embedding-ada-002",
-        "tags": "TEXT EMBEDDING,8K",
-        "max_tokens": 8191,
-        "model_type": LLMType.EMBEDDING.value
-    },{
-        "fid": factory_infos[2]["name"],
-        "llm_name": "whisper-1",
-        "tags": "SPEECH2TEXT",
-        "max_tokens": 25*1024*1024,
-        "model_type": LLMType.SPEECH2TEXT.value
-    },{
-        "fid": factory_infos[2]["name"],
-        "llm_name": "gpt-4-vision-preview",
+        "llm_name": "qwen-vl-max",
         "tags": "LLM,CHAT,IMAGE2TEXT",
         "max_tokens": 765,
         "model_type": LLMType.IMAGE2TEXT.value
     },
     # ---------------------- ZhipuAI ----------------------
     {
-        "fid": factory_infos[3]["name"],
+        "fid": factory_infos[2]["name"],
         "llm_name": "glm-3-turbo",
         "tags": "LLM,CHAT,",
         "max_tokens": 128 * 1000,
         "model_type": LLMType.CHAT.value
     }, {
-        "fid": factory_infos[3]["name"],
+        "fid": factory_infos[2]["name"],
         "llm_name": "glm-4",
         "tags": "LLM,CHAT,",
         "max_tokens": 128 * 1000,
         "model_type": LLMType.CHAT.value
     }, {
-        "fid": factory_infos[3]["name"],
+        "fid": factory_infos[2]["name"],
         "llm_name": "glm-4v",
         "tags": "LLM,CHAT,IMAGE2TEXT",
         "max_tokens": 2000,
         "model_type": LLMType.IMAGE2TEXT.value
     },
     {
-        "fid": factory_infos[3]["name"],
+        "fid": factory_infos[2]["name"],
         "llm_name": "embedding-2",
         "tags": "TEXT EMBEDDING",
         "max_tokens": 512,


@@ -77,9 +77,12 @@ class KnowledgebaseService(CommonService):
                 if isinstance(v, dict):
                     assert isinstance(old[k], dict)
                     dfs_update(old[k], v)
+                if isinstance(v, list):
+                    assert isinstance(old[k], list)
+                    old[k] = list(set(old[k]+v))
                 else: old[k] = v
         dfs_update(m.parser_config, config)
-        cls.update_by_id(id, m.parser_config)
+        cls.update_by_id(id, {"parser_config": m.parser_config})

     @classmethod
@@ -88,6 +91,6 @@ class KnowledgebaseService(CommonService):
         conf = {}
         for k in cls.get_by_ids(ids):
             if k.parser_config and "field_map" in k.parser_config:
-                conf.update(k.parser_config)
+                conf.update(k.parser_config["field_map"])
         return conf
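
Note: the get_field_map fix matters for the SQL retrieval path above. The old code merged the whole parser_config into the returned mapping, so parser options leaked into the field list that use_sql puts into its prompt. A small illustration with made-up values:

    parser_config = {"chunk_token_num": 128,
                     "field_map": {"name_kwd": "姓名/名字"}}

    old, new = {}, {}
    old.update(parser_config)               # old behavior: "chunk_token_num" leaks in
    new.update(parser_config["field_map"])  # new behavior: only real field descriptions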


@@ -43,12 +43,14 @@ REQUEST_MAX_WAIT_SEC = 300
 USE_REGISTRY = get_base_config("use_registry")

-LLM = get_base_config("llm", {})
-CHAT_MDL = LLM.get("chat_model", "gpt-3.5-turbo")
-EMBEDDING_MDL = LLM.get("embedding_model", "text-embedding-ada-002")
-ASR_MDL = LLM.get("asr_model", "whisper-1")
+LLM = get_base_config("user_default_llm", {})
+LLM_FACTORY=LLM.get("factory", "通义千问")
+CHAT_MDL = LLM.get("chat_model", "qwen-plus")
+EMBEDDING_MDL = LLM.get("embedding_model", "text-embedding-v2")
+ASR_MDL = LLM.get("asr_model", "paraformer-realtime-8k-v1")
+IMAGE2TEXT_MDL = LLM.get("image2text_model", "qwen-vl-max")
+API_KEY = LLM.get("api_key", "infiniflow API Key")
 PARSERS = LLM.get("parsers", "general:General,qa:Q&A,resume:Resume,naive:Naive,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture")
-IMAGE2TEXT_MDL = LLM.get("image2text_model", "gpt-4-vision-preview")

 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)


@@ -164,10 +164,10 @@ def thumbnail(filename, blob):
         buffered = BytesIO()
         Image.frombytes("RGB", [pix.width, pix.height],
                         pix.samples).save(buffered, format="png")
-        return "data:image/png;base64," + base64.b64encode(buffered.getvalue())
+        return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")

     if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
-        return ("data:image/%s;base64,"%filename.split(".")[-1]) + base64.b64encode(Image.open(BytesIO(blob)).thumbnail((30, 30)).tobytes())
+        return ("data:image/%s;base64,"%filename.split(".")[-1]) + base64.b64encode(Image.open(BytesIO(blob)).thumbnail((30, 30)).tobytes()).decode("utf-8")

     if re.match(r".*\.(ppt|pptx)$", filename):
         import aspose.slides as slides
@@ -176,7 +176,7 @@ def thumbnail(filename, blob):
             with slides.Presentation(BytesIO(blob)) as presentation:
                 buffered = BytesIO()
                 presentation.slides[0].get_thumbnail(0.03, 0.03).save(buffered, drawing.imaging.ImageFormat.png)
-                return "data:image/png;base64," + base64.b64encode(buffered.getvalue())
+                return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
         except Exception as e:
             pass
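
Note: all three changes in this file fix the same bug. base64.b64encode returns bytes, so concatenating the result onto a "data:image/..." string prefix raises a TypeError; decoding to str first is required:

    import base64

    raw = b"\x89PNG\r\n..."                    # image bytes
    b64 = base64.b64encode(raw)                # bytes, e.g. b'iVBOR...'
    # "data:image/png;base64," + b64           # TypeError: can only concatenate str (not "bytes") to str
    uri = "data:image/png;base64," + b64.decode("utf-8")  # OK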


@@ -118,11 +118,45 @@
     },
     {
       "dense_vector": {
-        "match": "*_vec",
+        "match": "*_512_vec",
         "mapping": {
           "type": "dense_vector",
           "index": true,
-          "similarity": "cosine"
+          "similarity": "cosine",
+          "dims": 512
+        }
+      }
+    },
+    {
+      "dense_vector": {
+        "match": "*_768_vec",
+        "mapping": {
+          "type": "dense_vector",
+          "index": true,
+          "similarity": "cosine",
+          "dims": 768
+        }
+      }
+    },
+    {
+      "dense_vector": {
+        "match": "*_1024_vec",
+        "mapping": {
+          "type": "dense_vector",
+          "index": true,
+          "similarity": "cosine",
+          "dims": 1024
+        }
+      }
+    },
+    {
+      "dense_vector": {
+        "match": "*_1536_vec",
+        "mapping": {
+          "type": "dense_vector",
+          "index": true,
+          "similarity": "cosine",
+          "dims": 1536
         }
       }
     },
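
Note: dense_vector fields need an explicit dims, so the single *_vec template is split into one template per supported embedding width; vectors from differently sized models can then live in one index as long as the field name encodes the width. A sketch of the naming rule this implies; the q_..._vec prefix is an assumption read off the match patterns:

    # pick the ES field name so it hits the matching dynamic template
    def vec_field_name(embedding, prefix="q"):
        dims = len(embedding)
        assert dims in (512, 768, 1024, 1536), "no dynamic template for this width"
        return f"{prefix}_{dims}_vec"

    print(vec_field_name([0.0] * 768))  # -> q_768_vec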


@@ -11,7 +11,7 @@ permission:
   dataset: false
 ragflow:
   # you must set real ip address, 127.0.0.1 and 0.0.0.0 is not supported
-  host: 127.0.0.1
+  host: 0.0.0.0
   http_port: 9380
 database:
   name: 'rag_flow'
@@ -21,6 +21,19 @@ database:
   port: 5455
   max_connections: 100
   stale_timeout: 30
+minio:
+  user: 'rag_flow'
+  passwd: 'infini_rag_flow'
+  host: '123.60.95.134:9000'
+es:
+  hosts: 'http://123.60.95.134:9200'
+user_default_llm:
+  factory: '通义千问'
+  chat_model: 'qwen-plus'
+  embedding_model: 'text-embedding-v2'
+  asr_model: 'paraformer-realtime-8k-v1'
+  image2text_model: 'qwen-vl-max'
+  api_key: 'sk-xxxxxxxxxxxxx'
 oauth:
   github:
     client_id: 302129228f0d96055bee


@@ -39,6 +39,11 @@ class Pdf(HuParser):

 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, txt.
+        Since a book is long and not all the parts are useful, if it's a PDF,
+        please setup the page ranges for every book in order eliminate negative effects and save elapsed computing time.
+    """
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))


@@ -2,7 +2,6 @@ import copy
 import re
 from io import BytesIO
 from docx import Document
-import numpy as np
 from rag.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title
 from rag.nlp import huqie
@@ -59,6 +58,9 @@ class Pdf(HuParser):

 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, txt.
+    """
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))


@@ -58,8 +58,10 @@ class Pdf(HuParser):

 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+        Only pdf is supported.
+    """
     pdf_parser = None
+    paper = {}

     if re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()


@@ -6,6 +6,7 @@ from rag.nlp import huqie
 from rag.parser.pdf_parser import HuParser
 from rag.settings import cron_logger
+

 class Pdf(HuParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
@@ -26,6 +27,12 @@ class Pdf(HuParser):

 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+        Supported file formats are docx, pdf, txt.
+        This method apply the naive ways to chunk files.
+        Successive text will be sliced into pieces using 'delimiter'.
+        Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
+    """
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -45,7 +52,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
-        if binary:txt = binary.decode("utf-8")
+        if binary:
+            txt = binary.decode("utf-8")
         else:
             with open(filename, "r") as f:
                 while True:
@@ -55,10 +63,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
         sections = txt.split("\n")
         sections = [(l, "") for l in sections if l]
         callback(0.8, "Finish parsing.")
-    else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+    else:
+        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

-    parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimer": "\n。;!?"})
-    cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimer"])
+    parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
+    cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
     eng = is_english(cks)
     res = []
     # wrap up to es documents
@@ -75,6 +84,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k

 if __name__ == "__main__":
     import sys
+
     def dummy(a, b):
         pass
+
     chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)


@@ -129,6 +129,10 @@ class Pdf(HuParser):

 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+        Only pdf is supported.
+        The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
+    """
     pdf_parser = None
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()


@@ -94,6 +94,11 @@ class Pdf(HuParser):

 def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    """
+        The supported file formats are pdf, pptx.
+        Every page will be treated as a chunk. And the thumbnail of every page will be stored.
+        PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary.
+    """
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))


@@ -70,7 +70,17 @@ def beAdoc(d, q, a, eng):

 def chunk(filename, binary=None, callback=None, **kwargs):
+    """
+        Excel and csv(txt) format files are supported.
+        If the file is in excel format, there should be 2 column question and answer without header.
+        And question column is ahead of answer column.
+        And it's O.K if it has multiple sheets as long as the columns are rightly composed.
+
+        If it's in csv format, it should be UTF-8 encoded. Use TAB as delimiter to separate question and answer.
+
+        All the deformed lines will be ignored.
+        Every pair of Q&A will be treated as a chunk.
+    """
     res = []
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")


@@ -4,24 +4,34 @@ import os
 import re
 import requests
 from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.settings import stat_logger
 from rag.nlp import huqie
 from rag.settings import cron_logger
 from rag.utils import rmSpace

+forbidden_select_fields4resume = [
+    "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
+]
+

 def chunk(filename, binary=None, callback=None, **kwargs):
+    """
+        The supported file formats are pdf, docx and txt.
+        To maximize the effectiveness, parse the resume correctly,
+        please visit https://github.com/infiniflow/ragflow, and sign in the our demo web-site
+        to get token. It's FREE!
+        Set INFINIFLOW_SERVER and INFINIFLOW_TOKEN in '.env' file or
+        using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN in docker container.
+    """
     if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE):
         raise NotImplementedError("file type not supported yet(pdf supported)")

     url = os.environ.get("INFINIFLOW_SERVER")
-    if not url:
-        raise EnvironmentError(
-            "Please set environment variable: 'INFINIFLOW_SERVER'")
     token = os.environ.get("INFINIFLOW_TOKEN")
-    if not token:
-        raise EnvironmentError(
-            "Please set environment variable: 'INFINIFLOW_TOKEN'")
+    if not url or not token:
+        stat_logger.warning(
+            "INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
+        return []

     if not binary:
         with open(filename, "rb") as f:
@@ -44,22 +54,28 @@ def chunk(filename, binary=None, callback=None, **kwargs):
     callback(0.2, "Resume parsing is going on...")
     resume = remote_call()
+    if len(resume.keys()) < 7:
+        callback(-1, "Resume is not successfully parsed.")
+        return []
     callback(0.6, "Done parsing. Chunking...")
     print(json.dumps(resume, ensure_ascii=False, indent=2))

     field_map = {
         "name_kwd": "姓名/名字",
+        "name_pinyin_kwd": "姓名拼音/名字拼音",
         "gender_kwd": "性别(男,女)",
         "age_int": "年龄/岁/年纪",
         "phone_kwd": "电话/手机/微信",
         "email_tks": "email/e-mail/邮箱",
         "position_name_tks": "职位/职能/岗位/职责",
-        "expect_position_name_tks": "期望职位/期望职能/期望岗位",
+        "expect_city_names_tks": "期望城市",
+        "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
+        "corporation_name_tks": "最近就职(上班)的公司/上一家公司",

-        "hightest_degree_kwd": "最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
-        "first_degree_kwd": "第一学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
-        "first_major_tks": "第一学历专业",
         "first_school_name_tks": "第一学历毕业学校",
+        "first_degree_kwd": "第一学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
+        "highest_degree_kwd": "最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
+        "first_major_tks": "第一学历专业",
         "edu_first_fea_kwd": "第一学历标签（211，留学，双一流，985，海外知名，重点大学，中专，专升本，专科，本科，大专）",

         "degree_kwd": "过往学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
@@ -68,14 +84,14 @@ def chunk(filename, binary=None, callback=None, **kwargs):
         "sch_rank_kwd": "学校标签(顶尖学校,精英学校,优质学校,一般学校)",
         "edu_fea_kwd": "教育标签（211，留学，双一流，985，海外知名，重点大学，中专，专升本，专科，本科，大专）",

-        "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
-        "birth_dt": "生日/出生年份",
         "corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
-        "corporation_name_tks": "最近就职(上班)的公司/上一家公司",
         "edu_end_int": "毕业年份",
-        "expect_city_names_tks": "期望城市",
-        "industry_name_tks": "所在行业"
+        "industry_name_tks": "所在行业",
+
+        "birth_dt": "生日/出生年份",
+        "expect_position_name_tks": "期望职位/期望职能/期望岗位",
     }
     titles = []
     for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]:
         v = resume.get(n, "")
@@ -105,6 +121,10 @@ def chunk(filename, binary=None, callback=None, **kwargs):
     doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
     doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
     for n, _ in field_map.items():
+        if n not in resume:continue
+        if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
+            resume[n] = resume[n][0]
+        if n.find("_tks")>0: resume[n] = huqie.qieqie(resume[n])
         doc[n] = resume[n]

     print(doc)


@@ -100,7 +100,20 @@ def column_data_type(arr):

 def chunk(filename, binary=None, callback=None, **kwargs):
-    dfs = []
+    """
+        Excel and csv(txt) format files are supported.
+        For csv or txt file, the delimiter between columns is TAB.
+        The first line must be column headers.
+        Column headers must be meaningful terms inorder to make our NLP model understanding.
+        It's good to enumerate some synonyms using slash '/' to separate, and even better to
+        enumerate values using brackets like 'gender/sex(male, female)'.
+        Here are some examples for headers:
+            1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
+            2. 姓名/名字\t电话/手机/微信\t最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）
+        Every row in table will be treated as a chunk.
+    """
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = Excel()
@@ -155,7 +168,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
             del df[n]
         clmns = df.columns.values
         txts = list(copy.deepcopy(clmns))
-        py_clmns = [PY.get_pinyins(n)[0].replace("-", "_") for n in clmns]
+        py_clmns = [PY.get_pinyins(re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", n), '_')[0] for n in clmns]
         clmn_tys = []
         for j in range(len(clmns)):
             cln, ty = column_data_type(df[clmns[j]])


@@ -21,7 +21,7 @@ from .cv_model import *
 EmbeddingModel = {
     "Infiniflow": HuEmbedding,
     "OpenAI": OpenAIEmbed,
-    "通义千问": QWenEmbed,
+    "通义千问": HuEmbedding, #QWenEmbed,
 }


@@ -32,7 +32,7 @@ class GptTurbo(Base):
         self.model_name = model_name

     def chat(self, system, history, gen_conf):
-        history.insert(0, {"role": "system", "content": system})
+        if system: history.insert(0, {"role": "system", "content": system})
         res = self.client.chat.completions.create(
             model=self.model_name,
             messages=history,
@@ -49,11 +49,12 @@ class QWenChat(Base):
     def chat(self, system, history, gen_conf):
         from http import HTTPStatus
-        history.insert(0, {"role": "system", "content": system})
+        if system: history.insert(0, {"role": "system", "content": system})
         response = Generation.call(
             self.model_name,
             messages=history,
-            result_format='message'
+            result_format='message',
+            **gen_conf
         )
         if response.status_code == HTTPStatus.OK:
             return response.output.choices[0]['message']['content'], response.usage.output_tokens
@@ -68,10 +69,11 @@ class ZhipuChat(Base):
     def chat(self, system, history, gen_conf):
         from http import HTTPStatus
-        history.insert(0, {"role": "system", "content": system})
+        if system: history.insert(0, {"role": "system", "content": system})
         response = self.client.chat.completions.create(
             self.model_name,
-            messages=history
+            messages=history,
+            **gen_conf
         )
         if response.status_code == HTTPStatus.OK:
             return response.output.choices[0]['message']['content'], response.usage.completion_tokens
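
Note: two behavioural changes run through these wrappers. An empty system prompt is no longer inserted as a message (the SQL path above calls chat(None, ...)), and the Qwen and Zhipu backends now receive the generation options instead of silently ignoring them. A hedged usage sketch, assuming chat_mdl is one of these wrappers:

    answer, used_tokens = chat_mdl.chat(
        None,                                  # no system message gets inserted
        [{"role": "user", "content": "ping"}],
        {"temperature": 0.06, "top_p": 0.9},   # forwarded to the backend via **gen_conf
    )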


@@ -100,11 +100,11 @@ class QWenEmbed(Base):
                 input=texts[i:i+batch_size],
                 text_type="document"
             )
-            embds = [[]] * len(resp["output"]["embeddings"])
+            embds = [[] for _ in range(len(resp["output"]["embeddings"]))]
             for e in resp["output"]["embeddings"]:
                 embds[e["text_index"]] = e["embedding"]
             res.extend(embds)
-            token_count += resp["usage"]["input_tokens"]
+            token_count += resp["usage"]["total_tokens"]
         return np.array(res), token_count

     def encode_queries(self, text):
@@ -113,7 +113,7 @@ class QWenEmbed(Base):
             input=text[:2048],
             text_type="query"
         )
-        return np.array(resp["output"]["embeddings"][0]["embedding"]), resp["usage"]["input_tokens"]
+        return np.array(resp["output"]["embeddings"][0]["embedding"]), resp["usage"]["total_tokens"]

 from zhipuai import ZhipuAI
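
Note: [[]] * n builds n references to one shared list, which is only safe while every slot is overwritten by plain assignment; the comprehension creates n independent lists. The aliasing pitfall in isolation:

    shared = [[]] * 3
    shared[0].append(1)
    print(shared)       # [[1], [1], [1]]: one list, three references

    safe = [[] for _ in range(3)]
    safe[0].append(1)
    print(safe)         # [[1], [], []]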


@@ -92,7 +92,7 @@ class Dealer:
             assert emb_mdl, "No embedding model selected"
             s["knn"] = self._vector(
                 qst, emb_mdl, req.get(
-                    "similarity", 0.4), ps)
+                    "similarity", 0.1), ps)
             s["knn"]["filter"] = bqry.to_dict()
             if "highlight" in s:
                 del s["highlight"]
@@ -106,7 +106,7 @@ class Dealer:
             bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
             s["query"] = bqry.to_dict()
             s["knn"]["filter"] = bqry.to_dict()
-            s["knn"]["similarity"] = 0.7
+            s["knn"]["similarity"] = 0.17
             res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src)

         kwds = set([])
@@ -171,7 +171,7 @@ class Dealer:
                 continue
             if not isinstance(v, type("")):
                 m[n] = str(m[n])
-            m[n] = rmSpace(m[n])
+            if n.find("tks")>0: m[n] = rmSpace(m[n])

         if m:
             res[d["id"]] = m
@@ -303,21 +303,22 @@ class Dealer:
         return ranks

-    def sql_retrieval(self, sql, fetch_size=128):
+    def sql_retrieval(self, sql, fetch_size=128, format="json"):
         sql = re.sub(r"[ ]+", " ", sql)
+        sql = sql.replace("%", "")
+        es_logger.info(f"Get es sql: {sql}")
         replaces = []
-        for r in re.finditer(r" ([a-z_]+_l?tks like |[a-z_]+_l?tks ?= ?)'([^']+)'", sql):
-            fld, v = r.group(1), r.group(2)
-            fld = re.sub(r" ?(like|=)$", "", fld).lower()
-            if v[0] == "%%": v = v[1:-1]
-            match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(fld, huqie.qie(v))
-            replaces.append((r.group(1)+r.group(2), match))
+        for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
+            fld, v = r.group(1), r.group(3)
+            match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(fld, huqie.qieqie(huqie.qie(v)))
+            replaces.append(("{}{}'{}'".format(r.group(1), r.group(2), r.group(3)), match))

-        for p, r in replaces: sql.replace(p, r)
+        for p, r in replaces: sql = sql.replace(p, r, 1)
+        es_logger.info(f"To es: {sql}")

         try:
-            tbl = self.es.sql(sql, fetch_size)
+            tbl = self.es.sql(sql, fetch_size, format)
             return tbl
         except Exception as e:
-            es_logger(f"SQL failure: {sql} =>" + str(e))
+            es_logger.error(f"SQL failure: {sql} =>" + str(e))
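
Note: ES SQL cannot apply equality or LIKE predicates to the tokenized *_tks fields directly, so sql_retrieval rewrites each such predicate into a full-text MATCH call before submitting (the old version also dropped the replacement result, since str.replace does not mutate in place). A standalone sketch of the rewrite; a trivial per-character tokenizer stands in for huqie.qieqie(huqie.qie(v)):

    import re

    def rewrite_tks_predicates(sql, tokenize=lambda v: " ".join(v)):
        replaces = []
        for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
            fld, v = r.group(1), r.group(3)
            match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(fld, tokenize(v))
            replaces.append(("{}{}'{}'".format(r.group(1), r.group(2), r.group(3)), match))
        for p, r in replaces:
            sql = sql.replace(p, r, 1)
        return sql

    sql = "select name_kwd from ragflow_xyz where position_name_tks = '后端开发'"
    print(rewrite_tks_predicates(sql))
    # -> ... where MATCH(position_name_tks, '后 端 开 发', 'operator=OR;...')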


@@ -53,9 +53,10 @@ class HuParser:
     def __remote_call(self, species, images, thr=0.7):
         url = os.environ.get("INFINIFLOW_SERVER")
-        if not url:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_SERVER'")
         token = os.environ.get("INFINIFLOW_TOKEN")
-        if not token:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_TOKEN'")
+        if not url or not token:
+            logging.warning("INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
+            return []

         def convert_image_to_bytes(PILimage):
             image = BytesIO()


@@ -47,7 +47,7 @@ from api.utils.file_utils import get_project_base_directory
 BATCH_SIZE = 64

 FACTORY = {
-    ParserType.GENERAL.value: laws,
+    ParserType.GENERAL.value: manual,
     ParserType.PAPER.value: paper,
     ParserType.BOOK.value: book,
     ParserType.PRESENTATION.value: presentation,
@@ -119,8 +119,8 @@ def build(row, cvmdl):
     chunker = FACTORY[row["parser_id"].lower()]
     try:
         cron_logger.info("Chunkking {}/{}".format(row["location"], row["name"]))
-        cks = chunker.chunk(row["name"], MINIO.get(row["kb_id"], row["location"]), row["from_page"], row["to_page"],
-                            callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
+        cks = chunker.chunk(row["name"], binary = MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"], to_page=row["to_page"],
+                            callback = callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
     except Exception as e:
         if re.search("(No such file|not found)", str(e)):
             callback(-1, "Can not find file <%s>" % row["doc_name"])
@@ -129,7 +129,7 @@ def build(row, cvmdl):
         cron_logger.warn("Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
-        return []
+        return

     callback(msg="Finished slicing files. Start to embedding the content.")
@@ -211,6 +211,7 @@ def main(comm, mod):
         st_tm = timer()
         cks = build(r, cv_mdl)
+        if cks is None:continue
         if not cks:
             tmf.write(str(r["update_time"]) + "\n")
             callback(1., "No chunk! Done!")


@@ -241,7 +241,7 @@ class HuEs:
         es_logger.error("ES search timeout for 3 times!")
         raise Exception("ES search timeout.")

-    def sql(self, sql, fetch_size=128, format="json", timeout=2):
+    def sql(self, sql, fetch_size=128, format="json", timeout="2s"):
         for i in range(3):
             try:
                 res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format, request_timeout=timeout)