Refine resume parts and fix bugs in retrieval using SQL (#66)

This commit is contained in:
KevinHuSh
2024-02-19 19:22:17 +08:00
committed by GitHub
parent 452020d33a
commit a8294f2168
29 changed files with 302 additions and 158 deletions

View File

@ -39,6 +39,11 @@ class Pdf(HuParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all of its parts are useful, if it's a PDF,
please set up the page ranges for every book in order to eliminate negative effects and save computing time.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

View File

@ -2,7 +2,6 @@ import copy
import re
from io import BytesIO
from docx import Document
import numpy as np
from rag.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
make_colon_as_title
from rag.nlp import huqie
@ -59,6 +58,9 @@ class Pdf(HuParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

View File

@ -58,8 +58,10 @@ class Pdf(HuParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Only pdf is supported.
"""
pdf_parser = None
paper = {}
if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()

View File

@ -6,6 +6,7 @@ from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser
from rag.settings import cron_logger
class Pdf(HuParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
@ -20,12 +21,18 @@ class Pdf(HuParser):
start = timer()
self._layouts_paddle(zoomin)
callback(0.77, "Layout analysis finished")
cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
self._naive_vertical_merge()
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
This method applies the naive way to chunk files.
Successive text will be sliced into pieces using the 'delimiter'.
Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@ -41,24 +48,26 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
sections = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
from_page=from_page, to_page=to_page, callback=callback)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = ""
if binary:txt = binary.decode("utf-8")
if binary:
txt = binary.decode("utf-8")
else:
with open(filename, "r") as f:
while True:
l = f.readline()
if not l:break
if not l: break
txt += l
sections = txt.split("\n")
sections = [(l,"") for l in sections if l]
sections = [(l, "") for l in sections if l]
callback(0.8, "Finish parsing.")
else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
else:
raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimer": "\n。;!?"})
cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimer"])
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
eng = is_english(cks)
res = []
# wrap up to es documents
@ -75,6 +84,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **k
if __name__ == "__main__":
import sys
def dummy(a, b):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
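As a rough, self-contained sketch of the merge strategy the docstring above describes (slice on the configured delimiters, then greedily pack pieces until the token budget is reached); this is an illustration, not the actual naive_merge from rag.parser, and the whitespace split stands in for the huqie tokenizer:
import re

def naive_merge_sketch(sections, chunk_token_num=128, delimiter="\n。;!?"):
    # sections: list of (text, position_tag) pairs, as produced by the parsers above.
    pieces = []
    for text, _ in sections:
        # Slice successive text on any of the configured delimiter characters.
        pieces.extend(p for p in re.split("[%s]" % re.escape(delimiter), text) if p.strip())

    chunks, current, budget = [], "", 0
    for p in pieces:
        tokens = len(p.split())  # crude token count; the real code counts huqie tokens
        if budget + tokens > chunk_token_num and current:
            chunks.append(current)
            current, budget = "", 0
        current += p
        budget += tokens
    if current:
        chunks.append(current)
    return chunks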

View File

@ -129,6 +129,10 @@ class Pdf(HuParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Only pdf is supported.
The abstract of the paper will be kept as an entire chunk and will not be split further.
"""
pdf_parser = None
if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()

View File

@ -94,6 +94,11 @@ class Pdf(HuParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
The supported file formats are pdf, pptx.
Every page will be treated as a chunk, and the thumbnail of every page will be stored.
PPT files are parsed by this method automatically; setting it up for every PPT file is not necessary.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

View File

@ -70,7 +70,17 @@ def beAdoc(d, q, a, eng):
def chunk(filename, binary=None, callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
If the file is in Excel format, there should be 2 columns, question and answer, without a header row,
and the question column must come before the answer column.
It's O.K. if it has multiple sheets, as long as the columns are composed correctly.
If it's in csv format, it should be UTF-8 encoded, with TAB as the delimiter separating question and answer.
All deformed lines will be ignored.
Every pair of Q&A will be treated as a chunk.
"""
res = []
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")

View File

@ -4,24 +4,34 @@ import os
import re
import requests
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.settings import stat_logger
from rag.nlp import huqie
from rag.settings import cron_logger
from rag.utils import rmSpace
forbidden_select_fields4resume = [
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
]
def chunk(filename, binary=None, callback=None, **kwargs):
"""
The supported file formats are pdf, docx and txt.
To maximize the effectiveness and parse the resume correctly,
please visit https://github.com/infiniflow/ragflow and sign in to our demo web site
to get a token. It's FREE!
Set INFINIFLOW_SERVER and INFINIFLOW_TOKEN in the '.env' file, or
use 'export' to set both environment variables, INFINIFLOW_SERVER and INFINIFLOW_TOKEN, in the docker container.
"""
if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE):
raise NotImplementedError("file type not supported yet(pdf supported)")
url = os.environ.get("INFINIFLOW_SERVER")
if not url:
raise EnvironmentError(
"Please set environment variable: 'INFINIFLOW_SERVER'")
token = os.environ.get("INFINIFLOW_TOKEN")
if not token:
raise EnvironmentError(
"Please set environment variable: 'INFINIFLOW_TOKEN'")
if not url or not token:
stat_logger.warning(
"INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
return []
if not binary:
with open(filename, "rb") as f:
@ -44,22 +54,28 @@ def chunk(filename, binary=None, callback=None, **kwargs):
callback(0.2, "Resume parsing is going on...")
resume = remote_call()
if len(resume.keys()) < 7:
callback(-1, "Resume is not successfully parsed.")
return []
callback(0.6, "Done parsing. Chunking...")
print(json.dumps(resume, ensure_ascii=False, indent=2))
field_map = {
"name_kwd": "姓名/名字",
"name_pinyin_kwd": "姓名拼音/名字拼音",
"gender_kwd": "性别(男,女)",
"age_int": "年龄/岁/年纪",
"phone_kwd": "电话/手机/微信",
"email_tks": "email/e-mail/邮箱",
"position_name_tks": "职位/职能/岗位/职责",
"expect_position_name_tks": "期望职位/期望职能/期望岗位",
"expect_city_names_tks": "期望城市",
"work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
"corporation_name_tks": "最近就职(上班)的公司/上一家公司",
"hightest_degree_kwd": "最高学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"first_degree_kwd": "第一学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"first_major_tks": "第一学历专业",
"first_school_name_tks": "第一学历毕业学校",
"first_degree_kwd": "第一学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"highest_degree_kwd": "最高学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
"first_major_tks": "第一学历专业",
"edu_first_fea_kwd": "第一学历标签211留学双一流985海外知名重点大学中专专升本专科本科大专",
"degree_kwd": "过往学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA",
@ -68,14 +84,14 @@ def chunk(filename, binary=None, callback=None, **kwargs):
"sch_rank_kwd": "学校标签(顶尖学校,精英学校,优质学校,一般学校)",
"edu_fea_kwd": "教育标签211留学双一流985海外知名重点大学中专专升本专科本科大专",
"work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
"birth_dt": "生日/出生年份",
"corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
"corporation_name_tks": "最近就职(上班)的公司/上一家公司",
"edu_end_int": "毕业年份",
"expect_city_names_tks": "期望城市",
"industry_name_tks": "所在行业"
"industry_name_tks": "所在行业",
"birth_dt": "生日/出生年份",
"expect_position_name_tks": "期望职位/期望职能/期望岗位",
}
titles = []
for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]:
v = resume.get(n, "")
@ -105,6 +121,10 @@ def chunk(filename, binary=None, callback=None, **kwargs):
doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
for n, _ in field_map.items():
if n not in resume:continue
if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
resume[n] = resume[n][0]
if n.find("_tks")>0: resume[n] = huqie.qieqie(resume[n])
doc[n] = resume[n]
print(doc)
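The docstring above depends on two environment variables; a minimal, hedged pre-flight check before calling chunk() (only the variable names come from the diff, the rest is illustrative):
import os

url = os.environ.get("INFINIFLOW_SERVER")
token = os.environ.get("INFINIFLOW_TOKEN")
if not url or not token:
    # Mirrors the chunker's behaviour: without both variables it logs a warning
    # and returns an empty chunk list instead of raising.
    print("Set INFINIFLOW_SERVER and INFINIFLOW_TOKEN before parsing resumes.")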

View File

@ -100,7 +100,20 @@ def column_data_type(arr):
def chunk(filename, binary=None, callback=None, **kwargs):
dfs = []
"""
Excel and csv(txt) format files are supported.
For csv or txt files, the delimiter between columns is TAB.
The first line must be column headers.
Column headers must be meaningful terms in order for our NLP model to understand them.
It's good to enumerate some synonyms using a slash '/' to separate them, and even better to
enumerate values using brackets like 'gender/sex(male, female)'.
Here are some examples for headers:
1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
2. 姓名/名字\t电话/手机/微信\t最高学历高中职高硕士本科博士初中中技中专专科专升本MPAMBAEMBA
Every row in the table will be treated as a chunk.
"""
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
@ -155,7 +168,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
del df[n]
clmns = df.columns.values
txts = list(copy.deepcopy(clmns))
py_clmns = [PY.get_pinyins(n)[0].replace("-", "_") for n in clmns]
py_clmns = [PY.get_pinyins(re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", n), '_')[0] for n in clmns]
clmn_tys = []
for j in range(len(clmns)):
cln, ty = column_data_type(df[clmns[j]])
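To show how the header convention above feeds the pinyin column naming, here is an illustrative reproduction of the cleanup step that strips '/'-separated synonyms and bracketed value enumerations before the remaining term is converted; the full-width （…） branch is an assumption about the intended pattern, and the pinyin call itself is left out:
import re

def strip_header_decorations(header):
    # Drop everything after the first '/', plus bracketed enumerations,
    # whether full-width （…） or ASCII (...), keeping only the core term.
    return re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", header)

print(strip_header_decorations("gender/sex(male, female)"))  # -> "gender"
print(strip_header_decorations("size(M,L,XL,XXL)"))          # -> "size"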

View File

@ -21,7 +21,7 @@ from .cv_model import *
EmbeddingModel = {
"Infiniflow": HuEmbedding,
"OpenAI": OpenAIEmbed,
"通义千问": QWenEmbed,
"通义千问": HuEmbedding, #QWenEmbed,
}

View File

@ -32,7 +32,7 @@ class GptTurbo(Base):
self.model_name = model_name
def chat(self, system, history, gen_conf):
history.insert(0, {"role": "system", "content": system})
if system: history.insert(0, {"role": "system", "content": system})
res = self.client.chat.completions.create(
model=self.model_name,
messages=history,
@ -49,11 +49,12 @@ class QWenChat(Base):
def chat(self, system, history, gen_conf):
from http import HTTPStatus
history.insert(0, {"role": "system", "content": system})
if system: history.insert(0, {"role": "system", "content": system})
response = Generation.call(
self.model_name,
messages=history,
result_format='message'
result_format='message',
**gen_conf
)
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content'], response.usage.output_tokens
@ -68,10 +69,11 @@ class ZhipuChat(Base):
def chat(self, system, history, gen_conf):
from http import HTTPStatus
history.insert(0, {"role": "system", "content": system})
if system: history.insert(0, {"role": "system", "content": system})
response = self.client.chat.completions.create(
self.model_name,
messages=history
messages=history,
**gen_conf
)
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content'], response.usage.completion_tokens

View File

@ -100,11 +100,11 @@ class QWenEmbed(Base):
input=texts[i:i+batch_size],
text_type="document"
)
embds = [[]] * len(resp["output"]["embeddings"])
embds = [[] for _ in range(len(resp["output"]["embeddings"]))]
for e in resp["output"]["embeddings"]:
embds[e["text_index"]] = e["embedding"]
res.extend(embds)
token_count += resp["usage"]["input_tokens"]
token_count += resp["usage"]["total_tokens"]
return np.array(res), token_count
def encode_queries(self, text):
@ -113,7 +113,7 @@ class QWenEmbed(Base):
input=text[:2048],
text_type="query"
)
return np.array(resp["output"]["embeddings"][0]["embedding"]), resp["usage"]["input_tokens"]
return np.array(resp["output"]["embeddings"][0]["embedding"]), resp["usage"]["total_tokens"]
from zhipuai import ZhipuAI
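For context on the [[]] * n change above: multiplying a list of one empty list produces n references to the same inner list, so the comprehension form is the safe way to pre-allocate independent slots. A quick illustration of the pitfall being avoided:
shared = [[]] * 3
shared[0].append("x")
print(shared)        # [['x'], ['x'], ['x']] -- every slot aliases one list

independent = [[] for _ in range(3)]
independent[0].append("x")
print(independent)   # [['x'], [], []] -- slots are independent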

View File

@ -92,7 +92,7 @@ class Dealer:
assert emb_mdl, "No embedding model selected"
s["knn"] = self._vector(
qst, emb_mdl, req.get(
"similarity", 0.4), ps)
"similarity", 0.1), ps)
s["knn"]["filter"] = bqry.to_dict()
if "highlight" in s:
del s["highlight"]
@ -106,7 +106,7 @@ class Dealer:
bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
s["query"] = bqry.to_dict()
s["knn"]["filter"] = bqry.to_dict()
s["knn"]["similarity"] = 0.7
s["knn"]["similarity"] = 0.17
res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src)
kwds = set([])
@ -171,7 +171,7 @@ class Dealer:
continue
if not isinstance(v, type("")):
m[n] = str(m[n])
m[n] = rmSpace(m[n])
if n.find("tks")>0: m[n] = rmSpace(m[n])
if m:
res[d["id"]] = m
@ -303,21 +303,22 @@ class Dealer:
return ranks
def sql_retrieval(self, sql, fetch_size=128):
def sql_retrieval(self, sql, fetch_size=128, format="json"):
sql = re.sub(r"[ ]+", " ", sql)
sql = sql.replace("%", "")
es_logger.info(f"Get es sql: {sql}")
replaces = []
for r in re.finditer(r" ([a-z_]+_l?tks like |[a-z_]+_l?tks ?= ?)'([^']+)'", sql):
fld, v = r.group(1), r.group(2)
fld = re.sub(r" ?(like|=)$", "", fld).lower()
if v[0] == "%%": v = v[1:-1]
match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(fld, huqie.qie(v))
replaces.append((r.group(1)+r.group(2), match))
for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
fld, v = r.group(1), r.group(3)
match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(fld, huqie.qieqie(huqie.qie(v)))
replaces.append(("{}{}'{}'".format(r.group(1), r.group(2), r.group(3)), match))
for p, r in replaces: sql.replace(p, r)
for p, r in replaces: sql = sql.replace(p, r, 1)
es_logger.info(f"To es: {sql}")
try:
tbl = self.es.sql(sql, fetch_size)
tbl = self.es.sql(sql, fetch_size, format)
return tbl
except Exception as e:
es_logger(f"SQL failure: {sql} =>" + str(e))
es_logger.error(f"SQL failure: {sql} =>" + str(e))

View File

@ -53,9 +53,10 @@ class HuParser:
def __remote_call(self, species, images, thr=0.7):
url = os.environ.get("INFINIFLOW_SERVER")
if not url:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_SERVER'")
token = os.environ.get("INFINIFLOW_TOKEN")
if not token:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_TOKEN'")
if not url or not token:
logging.warning("INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
return []
def convert_image_to_bytes(PILimage):
image = BytesIO()

View File

@ -47,7 +47,7 @@ from api.utils.file_utils import get_project_base_directory
BATCH_SIZE = 64
FACTORY = {
ParserType.GENERAL.value: laws,
ParserType.GENERAL.value: manual,
ParserType.PAPER.value: paper,
ParserType.BOOK.value: book,
ParserType.PRESENTATION.value: presentation,
@ -119,8 +119,8 @@ def build(row, cvmdl):
chunker = FACTORY[row["parser_id"].lower()]
try:
cron_logger.info("Chunkking {}/{}".format(row["location"], row["name"]))
cks = chunker.chunk(row["name"], MINIO.get(row["kb_id"], row["location"]), row["from_page"], row["to_page"],
callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
cks = chunker.chunk(row["name"], binary = MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"], to_page=row["to_page"],
callback = callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
except Exception as e:
if re.search("(No such file|not found)", str(e)):
callback(-1, "Can not find file <%s>" % row["doc_name"])
@ -129,7 +129,7 @@ def build(row, cvmdl):
cron_logger.warn("Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
return []
return
callback(msg="Finished slicing files. Start to embedding the content.")
@ -211,6 +211,7 @@ def main(comm, mod):
st_tm = timer()
cks = build(r, cv_mdl)
if cks is None:continue
if not cks:
tmf.write(str(r["update_time"]) + "\n")
callback(1., "No chunk! Done!")

View File

@ -241,7 +241,7 @@ class HuEs:
es_logger.error("ES search timeout for 3 times!")
raise Exception("ES search timeout.")
def sql(self, sql, fetch_size=128, format="json", timeout=2):
def sql(self, sql, fetch_size=128, format="json", timeout="2s"):
for i in range(3):
try:
res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format, request_timeout=timeout)