Refine resume parts and fix bugs in retrieval using SQL (#66)

This commit is contained in:
KevinHuSh
2024-02-19 19:22:17 +08:00
committed by GitHub
parent 452020d33a
commit a8294f2168
29 changed files with 302 additions and 158 deletions

View File

@ -21,20 +21,21 @@ from api.db.services.dialog_service import DialogService, ConversationService
from api.db import LLMType
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMService, LLMBundle
from api.settings import access_logger
from api.settings import access_logger, stat_logger
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid
from api.utils.api_utils import get_json_result
from rag.app.resume import forbidden_select_fields4resume
from rag.llm import ChatModel
from rag.nlp import retrievaler
from rag.nlp.search import index_name
from rag.utils import num_tokens_from_string, encoder
from rag.utils import num_tokens_from_string, encoder, rmSpace
@manager.route('/set', methods=['POST'])
@login_required
@validate_request("dialog_id")
def set():
def set_conversation():
req = request.json
conv_id = req.get("conversation_id")
if conv_id:
@ -96,9 +97,10 @@ def rm():
except Exception as e:
return server_error_response(e)
@manager.route('/list', methods=['GET'])
@login_required
def list():
def list_conversation():
dialog_id = request.args["dialog_id"]
try:
convs = ConversationService.query(dialog_id=dialog_id)
@ -175,6 +177,7 @@ def chat(dialog, messages, **kwargs):
field_map = KnowledgebaseService.get_field_map(dialog.kb_ids)
## try to use sql if field mapping is good to go
if field_map:
stat_logger.info("Use SQL to retrieval.")
markdown_tbl, chunks = use_sql(question, field_map, dialog.tenant_id, chat_mdl)
if markdown_tbl:
return {"answer": markdown_tbl, "retrieval": {"chunks": chunks}}
@ -186,7 +189,8 @@ def chat(dialog, messages, **kwargs):
if p["key"] not in kwargs:
prompt_config["system"] = prompt_config["system"].replace("{%s}" % p["key"], " ")
kbinfos = retrievaler.retrieval(question, embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n, dialog.similarity_threshold,
kbinfos = retrievaler.retrieval(question, embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
dialog.similarity_threshold,
dialog.vector_similarity_weight, top=1024, aggs=False)
knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
@ -220,32 +224,42 @@ def use_sql(question,field_map, tenant_id, chat_mdl):
{}
问题:{}
请写出SQL。
请写出SQL,且只要SQL,不要有其他说明及文字。
""".format(
index_name(tenant_id),
"\n".join([f"{k}: {v}" for k, v in field_map.items()]),
question
)
sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_promt}], {"temperature": 0.1})
sql = re.sub(r".*?select ", "select ", sql, flags=re.IGNORECASE)
sql = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_promt}], {"temperature": 0.06})
stat_logger.info(f"{question}” get SQL: {sql}")
sql = re.sub(r"[\r\n]+", " ", sql.lower())
sql = re.sub(r".*?select ", "select ", sql.lower())
sql = re.sub(r" +", " ", sql)
sql = re.sub(r"[;].*", "", sql)
if sql[:len("select ")].lower() != "select ":
sql = re.sub(r"([;]|```).*", "", sql)
if sql[:len("select ")] != "select ":
return None, None
if sql[:len("select *")].lower() != "select *":
if sql[:len("select *")] != "select *":
sql = "select doc_id,docnm_kwd," + sql[6:]
else:
flds = []
for k in field_map.keys():
if k in forbidden_select_fields4resume:continue
if len(flds) > 11:break
flds.append(k)
sql = "select doc_id,docnm_kwd," + ",".join(flds) + sql[8:]
tbl = retrievaler.sql_retrieval(sql)
if not tbl: return None, None
stat_logger.info(f"{question}” get SQL(refined): {sql}")
tbl = retrievaler.sql_retrieval(sql, format="json")
if not tbl or len(tbl["rows"]) == 0: return None, None
docid_idx = set([ii for ii, c in enumerate(tbl["columns"]) if c["name"] == "doc_id"])
docnm_idx = set([ii for ii, c in enumerate(tbl["columns"]) if c["name"] == "docnm_kwd"])
clmn_idx = [ii for ii in range(len(tbl["columns"])) if ii not in (docid_idx | docnm_idx)]
# compose markdown table
clmns = "|".join([re.sub(r"/.*", "", field_map.get(tbl["columns"][i]["name"], f"C{i}")) for i in clmn_idx]) + "|原文"
clmns = "|".join([re.sub(r"(/.*|[^]+)", "", field_map.get(tbl["columns"][i]["name"], f"C{i}")) for i in clmn_idx]) + "|原文"
line = "|".join(["------" for _ in range(len(clmn_idx))]) + "|------"
rows = ["|".join([str(r[i]) for i in clmn_idx])+"|" for r in tbl["rows"]]
rows = ["|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]]
if not docid_idx or not docnm_idx:
access_logger.error("SQL missing field: " + sql)
return "\n".join([clmns, line, "\n".join(rows)]), []

View File

@ -27,7 +27,7 @@ from api.utils.api_utils import get_json_result
@manager.route('/set', methods=['POST'])
@login_required
def set():
def set_dialog():
req = request.json
dialog_id = req.get("dialog_id")
name = req.get("name", "New Dialog")

View File

@ -262,17 +262,18 @@ def rename():
return server_error_response(e)
@manager.route('/get', methods=['GET'])
@login_required
def get():
doc_id = request.args["doc_id"]
@manager.route('/get/<doc_id>', methods=['GET'])
def get(doc_id):
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
return get_data_error_result(retmsg="Document not found!")
blob = MINIO.get(doc.kb_id, doc.location)
return get_json_result(data={"base64": base64.b64decode(blob)})
response = flask.make_response(MINIO.get(doc.kb_id, doc.location))
ext = re.search(r"\.([^.]+)$", doc.name)
if ext:
response.headers.set('Content-Type', 'application/%s'%ext.group(1))
return response
except Exception as e:
return server_error_response(e)
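A hedged usage sketch of the reworked endpoint: it now streams the raw file with a guessed Content-Type instead of returning base64 JSON. The base URL and route prefix below are assumptions:

    import requests

    doc_id = "..."  # a real document id, e.g. from /document/list
    resp = requests.get(f"http://127.0.0.1:9380/v1/document/get/{doc_id}")
    with open("downloaded.bin", "wb") as f:  # pick an extension matching doc.name
        f.write(resp.content)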

View File

@ -38,6 +38,9 @@ def create():
req["id"] = get_uuid()
req["tenant_id"] = current_user.id
req["created_by"] = current_user.id
e, t = TenantService.get_by_id(current_user.id)
if not e: return get_data_error_result(retmsg="Tenant not found.")
req["embd_id"] = t.embd_id
if not KnowledgebaseService.save(**req): return get_data_error_result()
return get_json_result(data={"kb_id": req["id"]})
except Exception as e:

View File

@ -21,11 +21,12 @@ from api.db.services.llm_service import LLMFactoriesService, TenantLLMService, L
from api.db.services.user_service import TenantService, UserTenantService
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from api.utils import get_uuid, get_format_time
from api.db import StatusEnum, UserTenantRole
from api.db import StatusEnum, UserTenantRole, LLMType
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.db_models import Knowledgebase, TenantLLM
from api.settings import stat_logger, RetCode
from api.utils.api_utils import get_json_result
from rag.llm import EmbeddingModel, CvModel, ChatModel
@manager.route('/factories', methods=['GET'])
@ -43,16 +44,37 @@ def factories():
@validate_request("llm_factory", "api_key")
def set_api_key():
req = request.json
# test if api key works
msg = ""
for llm in LLMService.query(fid=req["llm_factory"]):
if llm.model_type == LLMType.EMBEDDING.value:
mdl = EmbeddingModel[req["llm_factory"]](
req["api_key"], llm.llm_name)
try:
arr, tc = mdl.encode(["Test if the api key is available"])
if len(arr[0]) == 0 or tc == 0: raise Exception("Fail")
except Exception as e:
msg += f"\nFail to access embedding model({llm.llm_name}) using this api key."
elif llm.model_type == LLMType.CHAT.value:
mdl = ChatModel[req["llm_factory"]](
req["api_key"], llm.llm_name)
try:
m, tc = mdl.chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], {"temperature": 0.9})
if not tc: raise Exception(m)
except Exception as e:
msg += f"\nFail to access model({llm.llm_name}) using this api key." + str(e)
if msg: return get_data_error_result(retmsg=msg)
llm = {
"tenant_id": current_user.id,
"llm_factory": req["llm_factory"],
"api_key": req["api_key"]
}
# TODO: Test api_key
for n in ["model_type", "llm_name"]:
if n in req: llm[n] = req[n]
TenantLLM.insert(**llm).on_conflict("replace").execute()
TenantLLMService.filter_update([TenantLLM.tenant_id==llm["tenant_id"], TenantLLM.llm_factory==llm["llm_factory"]], llm)
return get_json_result(data=True)
@ -69,6 +91,7 @@ def my_llms():
@manager.route('/list', methods=['GET'])
@login_required
def list():
model_type = request.args.get("model_type")
try:
objs = TenantLLMService.query(tenant_id=current_user.id)
mdlnms = set([o.to_dict()["llm_name"] for o in objs if o.api_key])
@ -79,6 +102,7 @@ def list():
res = {}
for m in llms:
if model_type and m["model_type"] != model_type: continue
if m["fid"] not in res: res[m["fid"]] = []
res[m["fid"]].append(m)

View File

@ -24,7 +24,8 @@ from api.db.services.llm_service import TenantLLMService, LLMService
from api.utils.api_utils import server_error_response, validate_request
from api.utils import get_uuid, get_format_time, decrypt, download_img
from api.db import UserTenantRole, LLMType
from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS
from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
LLM_FACTORY
from api.db.services.user_service import UserService, TenantService, UserTenantService
from api.settings import stat_logger
from api.utils.api_utils import get_json_result, cors_reponse
@ -204,8 +205,8 @@ def user_register(user_id, user):
"role": UserTenantRole.OWNER
}
tenant_llm = []
for llm in LLMService.query(fid="Infiniflow"):
tenant_llm.append({"tenant_id": user_id, "llm_factory": "Infiniflow", "llm_name": llm.llm_name, "model_type":llm.model_type, "api_key": "infiniflow API Key"})
for llm in LLMService.query(fid=LLM_FACTORY):
tenant_llm.append({"tenant_id": user_id, "llm_factory": LLM_FACTORY, "llm_name": llm.llm_name, "model_type":llm.model_type, "api_key": API_KEY})
if not UserService.save(**user):return
TenantService.save(**tenant)

View File

@ -465,7 +465,8 @@ class Knowledgebase(DataBaseModel):
tenant_id = CharField(max_length=32, null=False)
name = CharField(max_length=128, null=False, help_text="KB name", index=True)
description = TextField(null=True, help_text="KB description")
permission = CharField(max_length=16, null=False, help_text="me|team")
embd_id = CharField(max_length=128, null=False, help_text="default embedding model ID")
permission = CharField(max_length=16, null=False, help_text="me|team", default="me")
created_by = CharField(max_length=32, null=False)
doc_num = IntegerField(default=0)
token_num = IntegerField(default=0)

View File

@ -46,11 +46,6 @@ def init_llm_factory():
"logo": "",
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
"status": "1",
},{
"name": "Infiniflow",
"logo": "",
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
"status": "1",
},{
"name": "智普AI",
"logo": "",
@ -135,59 +130,33 @@ def init_llm_factory():
"model_type": LLMType.SPEECH2TEXT.value
},{
"fid": factory_infos[1]["name"],
"llm_name": "qwen_vl_chat_v1",
"tags": "LLM,CHAT,IMAGE2TEXT",
"max_tokens": 765,
"model_type": LLMType.IMAGE2TEXT.value
},
# ----------------------- Infiniflow -----------------------
{
"fid": factory_infos[2]["name"],
"llm_name": "gpt-3.5-turbo",
"tags": "LLM,CHAT,4K",
"max_tokens": 4096,
"model_type": LLMType.CHAT.value
},{
"fid": factory_infos[2]["name"],
"llm_name": "text-embedding-ada-002",
"tags": "TEXT EMBEDDING,8K",
"max_tokens": 8191,
"model_type": LLMType.EMBEDDING.value
},{
"fid": factory_infos[2]["name"],
"llm_name": "whisper-1",
"tags": "SPEECH2TEXT",
"max_tokens": 25*1024*1024,
"model_type": LLMType.SPEECH2TEXT.value
},{
"fid": factory_infos[2]["name"],
"llm_name": "gpt-4-vision-preview",
"llm_name": "qwen-vl-max",
"tags": "LLM,CHAT,IMAGE2TEXT",
"max_tokens": 765,
"model_type": LLMType.IMAGE2TEXT.value
},
# ---------------------- ZhipuAI ----------------------
{
"fid": factory_infos[3]["name"],
"fid": factory_infos[2]["name"],
"llm_name": "glm-3-turbo",
"tags": "LLM,CHAT,",
"max_tokens": 128 * 1000,
"model_type": LLMType.CHAT.value
}, {
"fid": factory_infos[3]["name"],
"fid": factory_infos[2]["name"],
"llm_name": "glm-4",
"tags": "LLM,CHAT,",
"max_tokens": 128 * 1000,
"model_type": LLMType.CHAT.value
}, {
"fid": factory_infos[3]["name"],
"fid": factory_infos[2]["name"],
"llm_name": "glm-4v",
"tags": "LLM,CHAT,IMAGE2TEXT",
"max_tokens": 2000,
"model_type": LLMType.IMAGE2TEXT.value
},
{
"fid": factory_infos[3]["name"],
"fid": factory_infos[2]["name"],
"llm_name": "embedding-2",
"tags": "TEXT EMBEDDING",
"max_tokens": 512,

View File

@ -77,9 +77,12 @@ class KnowledgebaseService(CommonService):
if isinstance(v, dict):
assert isinstance(old[k], dict)
dfs_update(old[k], v)
elif isinstance(v, list):
assert isinstance(old[k], list)
old[k] = list(set(old[k]+v))
else: old[k] = v
dfs_update(m.parser_config, config)
cls.update_by_id(id, m.parser_config)
cls.update_by_id(id, {"parser_config": m.parser_config})
@classmethod
@ -88,6 +91,6 @@ class KnowledgebaseService(CommonService):
conf = {}
for k in cls.get_by_ids(ids):
if k.parser_config and "field_map" in k.parser_config:
conf.update(k.parser_config)
conf.update(k.parser_config["field_map"])
return conf
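A small sketch of the dfs_update merge semantics added above, using made-up parser configs. It assumes every key already exists in the old config and that list items are hashable, since they are unioned through set():

    old = {"field_map": {"name_kwd": "姓名/名字"}, "tags": ["resume"], "pages": 8}
    new = {"field_map": {"age_int": "年龄"}, "tags": ["cv"], "pages": 16}
    dfs_update(old, new)
    # old is now:
    # {"field_map": {"name_kwd": "姓名/名字", "age_int": "年龄"},  # dicts merge recursively
    #  "tags": ["resume", "cv"],  # lists are unioned (order not guaranteed)
    #  "pages": 16}               # scalars are overwritten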

View File

@ -43,12 +43,14 @@ REQUEST_MAX_WAIT_SEC = 300
USE_REGISTRY = get_base_config("use_registry")
LLM = get_base_config("llm", {})
CHAT_MDL = LLM.get("chat_model", "gpt-3.5-turbo")
EMBEDDING_MDL = LLM.get("embedding_model", "text-embedding-ada-002")
ASR_MDL = LLM.get("asr_model", "whisper-1")
LLM = get_base_config("user_default_llm", {})
LLM_FACTORY=LLM.get("factory", "通义千问")
CHAT_MDL = LLM.get("chat_model", "qwen-plus")
EMBEDDING_MDL = LLM.get("embedding_model", "text-embedding-v2")
ASR_MDL = LLM.get("asr_model", "paraformer-realtime-8k-v1")
IMAGE2TEXT_MDL = LLM.get("image2text_model", "qwen-vl-max")
API_KEY = LLM.get("api_key", "infiniflow API Key")
PARSERS = LLM.get("parsers", "general:General,qa:Q&A,resume:Resume,naive:Naive,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture")
IMAGE2TEXT_MDL = LLM.get("image2text_model", "gpt-4-vision-preview")
# distribution
DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)

View File

@ -164,10 +164,10 @@ def thumbnail(filename, blob):
buffered = BytesIO()
Image.frombytes("RGB", [pix.width, pix.height],
pix.samples).save(buffered, format="png")
return "data:image/png;base64," + base64.b64encode(buffered.getvalue())
return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
return ("data:image/%s;base64,"%filename.split(".")[-1]) + base64.b64encode(Image.open(BytesIO(blob)).thumbnail((30, 30)).tobytes())
return ("data:image/%s;base64,"%filename.split(".")[-1]) + base64.b64encode(Image.open(BytesIO(blob)).thumbnail((30, 30)).tobytes()).decode("utf-8")
if re.match(r".*\.(ppt|pptx)$", filename):
import aspose.slides as slides
@ -176,7 +176,7 @@ def thumbnail(filename, blob):
with slides.Presentation(BytesIO(blob)) as presentation:
buffered = BytesIO()
presentation.slides[0].get_thumbnail(0.03, 0.03).save(buffered, drawing.imaging.ImageFormat.png)
return "data:image/png;base64," + base64.b64encode(buffered.getvalue())
return "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode("utf-8")
except Exception as e:
pass

View File

@ -118,11 +118,45 @@
},
{
"dense_vector": {
"match": "*_vec",
"match": "*_512_vec",
"mapping": {
"type": "dense_vector",
"index": true,
"similarity": "cosine"
"similarity": "cosine",
"dims": 512
}
}
},
{
"dense_vector": {
"match": "*_768_vec",
"mapping": {
"type": "dense_vector",
"index": true,
"similarity": "cosine",
"dims": 768
}
}
},
{
"dense_vector": {
"match": "*_1024_vec",
"mapping": {
"type": "dense_vector",
"index": true,
"similarity": "cosine",
"dims": 1024
}
}
},
{
"dense_vector": {
"match": "*_1536_vec",
"mapping": {
"type": "dense_vector",
"index": true,
"similarity": "cosine",
"dims": 1536
}
}
},
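Each dense_vector template above pins a fixed dims value, so the indexed field name has to carry the embedding dimension to pick the right template. A hedged sketch of how a document might do that; the q_%d_vec naming convention is an assumption:

    vec = emb_mdl.encode(["some chunk text"])[0]  # hypothetical embedding call
    doc = {
        "content_with_weight": "some chunk text",
        "q_%d_vec" % len(vec): list(vec),  # e.g. q_768_vec matches the *_768_vec template
    }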

View File

@ -11,7 +11,7 @@ permission:
dataset: false
ragflow:
# you must set a real ip address; 127.0.0.1 and 0.0.0.0 are not supported
host: 127.0.0.1
host: 0.0.0.0
http_port: 9380
database:
name: 'rag_flow'
@ -21,6 +21,19 @@ database:
port: 5455
max_connections: 100
stale_timeout: 30
minio:
user: 'rag_flow'
passwd: 'infini_rag_flow'
host: '123.60.95.134:9000'
es:
hosts: 'http://123.60.95.134:9200'
user_default_llm:
factory: '通义千问'
chat_model: 'qwen-plus'
embedding_model: 'text-embedding-v2'
asr_model: 'paraformer-realtime-8k-v1'
image2text_model: 'qwen-vl-max'
api_key: 'sk-xxxxxxxxxxxxx'
oauth:
github:
client_id: 302129228f0d96055bee

View File

@ -39,6 +39,11 @@ class Pdf(HuParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
please set up the page ranges for every book in order to eliminate negative effects and save computing time.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

View File

@ -2,7 +2,6 @@ import copy
import re
from io import BytesIO
from docx import Document
import numpy as np
from rag.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
make_colon_as_title
from rag.nlp import huqie
@ -59,6 +58,9 @@ class Pdf(HuParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

View File

@ -58,8 +58,10 @@ class Pdf(HuParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Only pdf is supported.
"""
pdf_parser = None
paper = {}
if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()

View File

@ -6,6 +6,7 @@ from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser
from rag.settings import cron_logger
class Pdf(HuParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
@ -26,6 +27,12 @@ class Pdf(HuParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
This method applies the naive way to chunk files.
Successive text will be sliced into pieces using the 'delimiter'.
Next, these pieces are merged into chunks whose token number is no more than 'Max token number'.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@ -45,7 +52,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = ""
if binary:txt = binary.decode("utf-8")
if binary:
txt = binary.decode("utf-8")
else:
with open(filename, "r") as f:
while True:
@ -55,10 +63,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
sections = txt.split("\n")
sections = [(l, "") for l in sections if l]
callback(0.8, "Finish parsing.")
else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
else:
raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimer": "\n。;!?"})
cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimer"])
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
eng = is_english(cks)
res = []
# wrap up to es documents
@ -75,6 +84,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
if __name__ == "__main__":
import sys
def dummy(a, b):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
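A rough standalone sketch of the slice-then-merge behavior the docstring above describes. It is not the real naive_merge; token counting is approximated by character length:

    import re

    def naive_merge_sketch(text, chunk_token_num=128, delimiter="\n!?。;!?"):
        pieces = [p for p in re.split(r"[%s]+" % re.escape(delimiter), text) if p.strip()]
        chunks, buf = [], ""
        for p in pieces:
            if buf and len(buf) + len(p) > chunk_token_num:  # crude token budget
                chunks.append(buf)
                buf = ""
            buf += p
        if buf:
            chunks.append(buf)
        return chunks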

View File

@ -129,6 +129,10 @@ class Pdf(HuParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Only pdf is supported.
The abstract of the paper will be sliced as a whole chunk, and will not be split.
"""
pdf_parser = None
if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()

View File

@ -94,6 +94,11 @@ class Pdf(HuParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
The supported file formats are pdf, pptx.
Every page will be treated as a chunk, and the thumbnail of every page will be stored.
PPT files will be parsed by this method automatically; setting it up for every PPT file is not necessary.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

View File

@ -70,7 +70,17 @@ def beAdoc(d, q, a, eng):
def chunk(filename, binary=None, callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
If the file is in excel format, there should be 2 columns, question and answer, without headers.
And the question column is ahead of the answer column.
It's O.K. if it has multiple sheets as long as the columns are rightly composed.
If it's in csv format, it should be UTF-8 encoded, with TAB as the delimiter to separate question and answer.
All the deformed lines will be ignored.
Every pair of Q&A will be treated as a chunk.
"""
res = []
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
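A hedged sketch of the csv/txt branch the docstring describes: TAB-separated question/answer pairs, with deformed lines dropped. The helper name is made up:

    def parse_qa_lines_sketch(txt):
        pairs = []
        for line in txt.split("\n"):
            cells = line.split("\t")
            if len(cells) < 2 or not cells[0].strip() or not cells[1].strip():
                continue  # deformed lines are ignored
            pairs.append((cells[0], cells[1]))  # each Q&A pair becomes one chunk
        return pairs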

View File

@ -4,24 +4,34 @@ import os
import re
import requests
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.settings import stat_logger
from rag.nlp import huqie
from rag.settings import cron_logger
from rag.utils import rmSpace
forbidden_select_fields4resume = [
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
]
def chunk(filename, binary=None, callback=None, **kwargs):
"""
The supported file formats are pdf, docx and txt.
To maximize the effectiveness and parse the resume correctly,
please visit https://github.com/infiniflow/ragflow, and sign in to our demo web site
to get a token. It's FREE!
Set INFINIFLOW_SERVER and INFINIFLOW_TOKEN in the '.env' file, or
use 'export' to set both environment variables, INFINIFLOW_SERVER and INFINIFLOW_TOKEN, in the docker container.
if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE):
raise NotImplementedError("file type not supported yet(pdf supported)")
url = os.environ.get("INFINIFLOW_SERVER")
if not url:
raise EnvironmentError(
"Please set environment variable: 'INFINIFLOW_SERVER'")
token = os.environ.get("INFINIFLOW_TOKEN")
if not token:
raise EnvironmentError(
"Please set environment variable: 'INFINIFLOW_TOKEN'")
if not url or not token:
stat_logger.warning(
"INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow and sign in to our demo web site to get a token. It's FREE! Use 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
return []
if not binary:
with open(filename, "rb") as f:
@ -44,22 +54,28 @@ def chunk(filename, binary=None, callback=None, **kwargs):
callback(0.2, "Resume parsing is going on...")
resume = remote_call()
if len(resume.keys()) < 7:
callback(-1, "The resume was not parsed successfully.")
return []
callback(0.6, "Done parsing. Chunking...")
print(json.dumps(resume, ensure_ascii=False, indent=2))
field_map = {
"name_kwd": "姓名/名字",
"name_pinyin_kwd": "姓名拼音/名字拼音",
"gender_kwd": "性别(男,女)",
"age_int": "年龄/岁/年纪",
"phone_kwd": "电话/手机/微信",
"email_tks": "email/e-mail/邮箱",
"position_name_tks": "职位/职能/岗位/职责",
"expect_position_name_tks": "期望职位/期望职能/期望岗位",
"expect_city_names_tks": "期望城市",
"work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
"corporation_name_tks": "最近就职(上班)的公司/上一家公司",
"hightest_degree_kwd": "最高学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"first_degree_kwd": "第一学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"first_major_tks": "第一学历专业",
"first_school_name_tks": "第一学历毕业学校",
"first_degree_kwd": "第一学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"highest_degree_kwd": "最高学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"first_major_tks": "第一学历专业",
"edu_first_fea_kwd": "第一学历标签211留学双一流985海外知名重点大学中专专升本专科本科大专",
"degree_kwd": "过往学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
@ -68,14 +84,14 @@ def chunk(filename, binary=None, callback=None, **kwargs):
"sch_rank_kwd": "学校标签(顶尖学校,精英学校,优质学校,一般学校)",
"edu_fea_kwd": "教育标签211留学双一流985海外知名重点大学中专专升本专科本科大专",
"work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
"birth_dt": "生日/出生年份",
"corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
"corporation_name_tks": "最近就职(上班)的公司/上一家公司",
"edu_end_int": "毕业年份",
"expect_city_names_tks": "期望城市",
"industry_name_tks": "所在行业"
"industry_name_tks": "所在行业",
"birth_dt": "生日/出生年份",
"expect_position_name_tks": "期望职位/期望职能/期望岗位",
}
titles = []
for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]:
v = resume.get(n, "")
@ -105,6 +121,10 @@ def chunk(filename, binary=None, callback=None, **kwargs):
doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
for n, _ in field_map.items():
if n not in resume:continue
if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
resume[n] = resume[n][0]
if n.find("_tks")>0: resume[n] = huqie.qieqie(resume[n])
doc[n] = resume[n]
print(doc)

View File

@ -100,7 +100,20 @@ def column_data_type(arr):
def chunk(filename, binary=None, callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
For csv or txt files, the delimiter between columns is TAB.
The first line must be column headers.
Column headers must be meaningful terms in order for our NLP model to understand them.
It's good to enumerate some synonyms using a slash '/' as separator, and even better to
enumerate values using brackets like 'gender/sex(male, female)'.
Here are some examples for headers:
1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
2. 姓名/名字\t电话/手机/微信\t最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)
Every row in the table will be treated as a chunk.
"""
dfs = []
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
@ -155,7 +168,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
del df[n]
clmns = df.columns.values
txts = list(copy.deepcopy(clmns))
py_clmns = [PY.get_pinyins(n)[0].replace("-", "_") for n in clmns]
py_clmns = [PY.get_pinyins(re.sub(r"(/.*|([^()]+?)|\([^()]+?\))", "", n), '_')[0] for n in clmns]
clmn_tys = []
for j in range(len(clmns)):
cln, ty = column_data_type(df[clmns[j]])
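For illustration, a hedged sketch of the header normalization above: synonyms after '/' and bracketed enumerations are stripped before the pinyin conversion. pypinyin stands in for the PY helper used in the diff:

    import re
    from pypinyin import lazy_pinyin  # stand-in for the PY helper used above

    def header_to_field_sketch(h):
        h = re.sub(r"(/.*|([^()]+?)|\([^()]+?\))", "", h)  # drop synonyms and enums
        return "_".join(lazy_pinyin(h))

    print(header_to_field_sketch("性别/sex(男,女)"))  # roughly: xing_bie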

View File

@ -21,7 +21,7 @@ from .cv_model import *
EmbeddingModel = {
"Infiniflow": HuEmbedding,
"OpenAI": OpenAIEmbed,
"通义千问": QWenEmbed,
"通义千问": HuEmbedding, #QWenEmbed,
}

View File

@ -32,7 +32,7 @@ class GptTurbo(Base):
self.model_name = model_name
def chat(self, system, history, gen_conf):
history.insert(0, {"role": "system", "content": system})
if system: history.insert(0, {"role": "system", "content": system})
res = self.client.chat.completions.create(
model=self.model_name,
messages=history,
@ -49,11 +49,12 @@ class QWenChat(Base):
def chat(self, system, history, gen_conf):
from http import HTTPStatus
history.insert(0, {"role": "system", "content": system})
if system: history.insert(0, {"role": "system", "content": system})
response = Generation.call(
self.model_name,
messages=history,
result_format='message'
result_format='message',
**gen_conf
)
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content'], response.usage.output_tokens
@ -68,10 +69,11 @@ class ZhipuChat(Base):
def chat(self, system, history, gen_conf):
from http import HTTPStatus
history.insert(0, {"role": "system", "content": system})
if system: history.insert(0, {"role": "system", "content": system})
response = self.client.chat.completions.create(
self.model_name,
messages=history
messages=history,
**gen_conf
)
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content'], response.usage.completion_tokens

View File

@ -100,11 +100,11 @@ class QWenEmbed(Base):
input=texts[i:i+batch_size],
text_type="document"
)
embds = [[]] * len(resp["output"]["embeddings"])
embds = [[] for _ in range(len(resp["output"]["embeddings"]))]
for e in resp["output"]["embeddings"]:
embds[e["text_index"]] = e["embedding"]
res.extend(embds)
token_count += resp["usage"]["input_tokens"]
token_count += resp["usage"]["total_tokens"]
return np.array(res), token_count
def encode_queries(self, text):
@ -113,7 +113,7 @@ class QWenEmbed(Base):
input=text[:2048],
text_type="query"
)
return np.array(resp["output"]["embeddings"][0]["embedding"]), resp["usage"]["input_tokens"]
return np.array(resp["output"]["embeddings"][0]["embedding"]), resp["usage"]["total_tokens"]
from zhipuai import ZhipuAI
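The embds change above fixes a classic aliasing pitfall; a quick demonstration of why [[]] * n is wrong:

    rows = [[]] * 3                # three references to the SAME list
    rows[0].append("x")
    print(rows)                    # [['x'], ['x'], ['x']] -- all slots changed

    rows = [[] for _ in range(3)]  # three independent lists
    rows[0].append("x")
    print(rows)                    # [['x'], [], []]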

View File

@ -92,7 +92,7 @@ class Dealer:
assert emb_mdl, "No embedding model selected"
s["knn"] = self._vector(
qst, emb_mdl, req.get(
"similarity", 0.4), ps)
"similarity", 0.1), ps)
s["knn"]["filter"] = bqry.to_dict()
if "highlight" in s:
del s["highlight"]
@ -106,7 +106,7 @@ class Dealer:
bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
s["query"] = bqry.to_dict()
s["knn"]["filter"] = bqry.to_dict()
s["knn"]["similarity"] = 0.7
s["knn"]["similarity"] = 0.17
res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src)
kwds = set([])
@ -171,7 +171,7 @@ class Dealer:
continue
if not isinstance(v, type("")):
m[n] = str(m[n])
m[n] = rmSpace(m[n])
if n.find("tks")>0: m[n] = rmSpace(m[n])
if m:
res[d["id"]] = m
@ -303,21 +303,22 @@ class Dealer:
return ranks
def sql_retrieval(self, sql, fetch_size=128):
def sql_retrieval(self, sql, fetch_size=128, format="json"):
sql = re.sub(r"[ ]+", " ", sql)
sql = sql.replace("%", "")
es_logger.info(f"Get es sql: {sql}")
replaces = []
for r in re.finditer(r" ([a-z_]+_l?tks like |[a-z_]+_l?tks ?= ?)'([^']+)'", sql):
fld, v = r.group(1), r.group(2)
fld = re.sub(r" ?(like|=)$", "", fld).lower()
if v[0] == "%%": v = v[1:-1]
match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(fld, huqie.qie(v))
replaces.append((r.group(1)+r.group(2), match))
for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
fld, v = r.group(1), r.group(3)
match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(fld, huqie.qieqie(huqie.qie(v)))
replaces.append(("{}{}'{}'".format(r.group(1), r.group(2), r.group(3)), match))
for p, r in replaces: sql.replace(p, r)
for p, r in replaces: sql = sql.replace(p, r, 1)
es_logger.info(f"To es: {sql}")
try:
tbl = self.es.sql(sql, fetch_size)
tbl = self.es.sql(sql, fetch_size, format)
return tbl
except Exception as e:
es_logger(f"SQL failure: {sql} =>" + str(e))
es_logger.error(f"SQL failure: {sql} =>" + str(e))

View File

@ -53,9 +53,10 @@ class HuParser:
def __remote_call(self, species, images, thr=0.7):
url = os.environ.get("INFINIFLOW_SERVER")
if not url:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_SERVER'")
token = os.environ.get("INFINIFLOW_TOKEN")
if not token:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_TOKEN'")
if not url or not token:
logging.warning("INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
return []
def convert_image_to_bytes(PILimage):
image = BytesIO()

View File

@ -47,7 +47,7 @@ from api.utils.file_utils import get_project_base_directory
BATCH_SIZE = 64
FACTORY = {
ParserType.GENERAL.value: laws,
ParserType.GENERAL.value: manual,
ParserType.PAPER.value: paper,
ParserType.BOOK.value: book,
ParserType.PRESENTATION.value: presentation,
@ -119,8 +119,8 @@ def build(row, cvmdl):
chunker = FACTORY[row["parser_id"].lower()]
try:
cron_logger.info("Chunkking {}/{}".format(row["location"], row["name"]))
cks = chunker.chunk(row["name"], MINIO.get(row["kb_id"], row["location"]), row["from_page"], row["to_page"],
callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
cks = chunker.chunk(row["name"], binary = MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"], to_page=row["to_page"],
callback = callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
except Exception as e:
if re.search("(No such file|not found)", str(e)):
callback(-1, "Can not find file <%s>" % row["doc_name"])
@ -129,7 +129,7 @@ def build(row, cvmdl):
cron_logger.warn("Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
return []
return
callback(msg="Finished slicing files. Start to embedding the content.")
@ -211,6 +211,7 @@ def main(comm, mod):
st_tm = timer()
cks = build(r, cv_mdl)
if cks is None:continue
if not cks:
tmf.write(str(r["update_time"]) + "\n")
callback(1., "No chunk! Done!")

View File

@ -241,7 +241,7 @@ class HuEs:
es_logger.error("ES search timeout for 3 times!")
raise Exception("ES search timeout.")
def sql(self, sql, fetch_size=128, format="json", timeout=2):
def sql(self, sql, fetch_size=128, format="json", timeout="2s"):
for i in range(3):
try:
res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format, request_timeout=timeout)