Refine resume parts and fix bugs in retrieval using SQL (#66)
@@ -39,6 +39,11 @@ class Pdf(HuParser):

def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
please set up the page ranges for every book in order to eliminate negative effects and save computing time.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
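The page-range parameters described in the docstring above are passed straight through the chunk() call. A minimal usage sketch, assuming this module is importable as rag.app.book, that default parser settings are acceptable, and that a simple progress callback is enough (the real pipeline wires in a richer one):

from rag.app import book  # assumed import path

def progress(prog=None, msg=""):
    # minimal stand-in for the pipeline's progress callback
    print(prog, msg)

# Parse only pages 10-199 of a long PDF to skip covers, TOC and appendices.
chunks = book.chunk("my_book.pdf", from_page=10, to_page=200, callback=progress)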
@@ -2,7 +2,6 @@ import copy
import re
from io import BytesIO
from docx import Document
import numpy as np
from rag.parser import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
make_colon_as_title
from rag.nlp import huqie

@@ -59,6 +58,9 @@ class Pdf(HuParser):

def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

@@ -58,8 +58,10 @@ class Pdf(HuParser):

def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Only pdf is supported.
"""
pdf_parser = None
paper = {}

if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()

@@ -6,6 +6,7 @@ from rag.nlp import huqie
from rag.parser.pdf_parser import HuParser
from rag.settings import cron_logger


class Pdf(HuParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
@@ -20,12 +21,18 @@ class Pdf(HuParser):
start = timer()
self._layouts_paddle(zoomin)
callback(0.77, "Layout analysis finished")
cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
self._naive_vertical_merge()
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]


def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
This method applies the naive way to chunk files.
Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))

@@ -41,24 +48,26 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()
sections = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
from_page=from_page, to_page=to_page, callback=callback)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = ""
if binary:txt = binary.decode("utf-8")
if binary:
txt = binary.decode("utf-8")
else:
with open(filename, "r") as f:
while True:
l = f.readline()
if not l:break
if not l: break
txt += l
sections = txt.split("\n")
sections = [(l,"") for l in sections if l]
sections = [(l, "") for l in sections if l]
callback(0.8, "Finish parsing.")
else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
else:
raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")

parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimer": "\n。;!?"})
cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimer"])
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
cks = naive_merge(sections, parser_config["chunk_token_num"], parser_config["delimiter"])
eng = is_english(cks)
res = []
# wrap up to es documents
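The docstring above describes the naive strategy: slice the text on the configured delimiter characters, then greedily merge successive pieces until a chunk would exceed the token budget. A minimal sketch of that idea, with a whitespace token count standing in for the project's tokenizer (naive_merge_sketch is a hypothetical helper, not the naive_merge used above):

def naive_merge_sketch(sections, chunk_token_num=128, delimiter="\n!?。;!?"):
    # Split every section on any of the delimiter characters.
    pieces = []
    for text, _tag in sections:
        buf = ""
        for ch in text:
            if ch in delimiter:
                if buf.strip():
                    pieces.append(buf.strip())
                buf = ""
            else:
                buf += ch
        if buf.strip():
            pieces.append(buf.strip())

    # Greedily merge successive pieces while staying under the token budget.
    # Whitespace splitting stands in for the real tokenizer here.
    chunks, current = [], ""
    for p in pieces:
        if current and len((current + " " + p).split()) > chunk_token_num:
            chunks.append(current)
            current = p
        else:
            current = (current + " " + p).strip()
    if current:
        chunks.append(current)
    return chunks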
@@ -75,6 +84,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):

if __name__ == "__main__":
import sys


def dummy(a, b):
pass


chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

@@ -129,6 +129,10 @@ class Pdf(HuParser):

def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
Only pdf is supported.
The abstract of the paper will be sliced as an entire chunk and will not be split further.
"""
pdf_parser = None
if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf()

@@ -94,6 +94,11 @@ class Pdf(HuParser):

def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
"""
The supported file formats are pdf, pptx.
Every page will be treated as a chunk, and the thumbnail of every page will be stored.
PPT files are parsed by this method automatically; setting it up for every PPT file is not necessary.
"""
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -70,7 +70,17 @@ def beAdoc(d, q, a, eng):

def chunk(filename, binary=None, callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
If the file is in excel format, there should be 2 columns, question and answer, without a header.
The question column comes before the answer column.
Multiple sheets are fine as long as the columns are composed correctly.

If it's in csv format, it should be UTF-8 encoded. Use TAB as the delimiter to separate question and answer.

All malformed lines will be ignored.
Every pair of Q&A will be treated as a chunk.
"""
res = []
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
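For the csv/txt branch of the Q&A format above, a rough sketch of the expected input handling, assuming UTF-8 text with one TAB-separated question/answer pair per line; this illustrates the format only and is not the module's actual parser:

def read_qa_pairs(path):
    pairs = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            if len(fields) != 2 or not fields[0].strip() or not fields[1].strip():
                continue  # malformed lines are ignored
            pairs.append((fields[0].strip(), fields[1].strip()))  # each pair becomes one chunk
    return pairs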
@@ -4,24 +4,34 @@ import os
import re
import requests
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.settings import stat_logger
from rag.nlp import huqie

from rag.settings import cron_logger
from rag.utils import rmSpace

forbidden_select_fields4resume = [
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
]


def chunk(filename, binary=None, callback=None, **kwargs):
"""
The supported file formats are pdf, docx and txt.
To maximize effectiveness and parse the resume correctly,
please visit https://github.com/infiniflow/ragflow and sign in to our demo web site
to get a token. It's FREE!
Set INFINIFLOW_SERVER and INFINIFLOW_TOKEN in the '.env' file, or
use 'export' to set both environment variables INFINIFLOW_SERVER and INFINIFLOW_TOKEN in the docker container.
"""
if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE):
raise NotImplementedError("file type not supported yet(pdf supported)")

url = os.environ.get("INFINIFLOW_SERVER")
if not url:
raise EnvironmentError(
"Please set environment variable: 'INFINIFLOW_SERVER'")
token = os.environ.get("INFINIFLOW_TOKEN")
if not token:
raise EnvironmentError(
"Please set environment variable: 'INFINIFLOW_TOKEN'")
if not url or not token:
stat_logger.warning(
"INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
return []

if not binary:
with open(filename, "rb") as f:

@@ -44,22 +54,28 @@ def chunk(filename, binary=None, callback=None, **kwargs):

callback(0.2, "Resume parsing is going on...")
resume = remote_call()
if len(resume.keys()) < 7:
callback(-1, "Resume is not successfully parsed.")
return []
callback(0.6, "Done parsing. Chunking...")
print(json.dumps(resume, ensure_ascii=False, indent=2))

field_map = {
"name_kwd": "姓名/名字",
"name_pinyin_kwd": "姓名拼音/名字拼音",
"gender_kwd": "性别(男,女)",
"age_int": "年龄/岁/年纪",
"phone_kwd": "电话/手机/微信",
"email_tks": "email/e-mail/邮箱",
"position_name_tks": "职位/职能/岗位/职责",
"expect_position_name_tks": "期望职位/期望职能/期望岗位",
"expect_city_names_tks": "期望城市",
"work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
"corporation_name_tks": "最近就职(上班)的公司/上一家公司",

"hightest_degree_kwd": "最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)",
"first_degree_kwd": "第一学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)",
"first_major_tks": "第一学历专业",
"first_school_name_tks": "第一学历毕业学校",
"first_degree_kwd": "第一学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)",
"highest_degree_kwd": "最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)",
"first_major_tks": "第一学历专业",
"edu_first_fea_kwd": "第一学历标签(211,留学,双一流,985,海外知名,重点大学,中专,专升本,专科,本科,大专)",

"degree_kwd": "过往学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)",

@@ -68,14 +84,14 @@ def chunk(filename, binary=None, callback=None, **kwargs):
"sch_rank_kwd": "学校标签(顶尖学校,精英学校,优质学校,一般学校)",
"edu_fea_kwd": "教育标签(211,留学,双一流,985,海外知名,重点大学,中专,专升本,专科,本科,大专)",

"work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
"birth_dt": "生日/出生年份",
"corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
"corporation_name_tks": "最近就职(上班)的公司/上一家公司",
"edu_end_int": "毕业年份",
"expect_city_names_tks": "期望城市",
"industry_name_tks": "所在行业"
"industry_name_tks": "所在行业",

"birth_dt": "生日/出生年份",
"expect_position_name_tks": "期望职位/期望职能/期望岗位",
}

titles = []
for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]:
v = resume.get(n, "")

@@ -105,6 +121,10 @@ def chunk(filename, binary=None, callback=None, **kwargs):
doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
for n, _ in field_map.items():
if n not in resume:continue
if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
resume[n] = resume[n][0]
if n.find("_tks")>0: resume[n] = huqie.qieqie(resume[n])
doc[n] = resume[n]

print(doc)
@@ -100,7 +100,20 @@ def column_data_type(arr):

def chunk(filename, binary=None, callback=None, **kwargs):
dfs = []
"""
Excel and csv(txt) format files are supported.
For csv or txt files, the delimiter between columns is TAB.
The first line must be column headers.
Column headers must be meaningful terms in order for our NLP model to understand them.
It's good to enumerate some synonyms, separated by a slash '/', and even better to
enumerate values using brackets like 'gender/sex(male, female)'.
Here are some examples for headers:
1. supplier/vendor\tcolor(yellow, red, brown)\tgender/sex(male, female)\tsize(M,L,XL,XXL)
2. 姓名/名字\t电话/手机/微信\t最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)

Every row in the table will be treated as a chunk.
"""

if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()

@@ -155,7 +168,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
del df[n]
clmns = df.columns.values
txts = list(copy.deepcopy(clmns))
py_clmns = [PY.get_pinyins(n)[0].replace("-", "_") for n in clmns]
py_clmns = [PY.get_pinyins(re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", n), '_')[0] for n in clmns]
clmn_tys = []
for j in range(len(clmns)):
cln, ty = column_data_type(df[clmns[j]])
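The new py_clmns line above strips the '/synonym' part and any bracketed value list from a header before converting it to a pinyin-based column name. A rough illustration of that cleanup with a simplified regex and a hypothetical clean_header helper (the real code then feeds the result to PY.get_pinyins):

import re

def clean_header(name):
    # Drop "/synonym..." tails and "(value, list)" enumerations, keeping the leading term.
    return re.sub(r"(/.*|\([^()]*\)|（[^（）]*）)", "", name).strip()

print(clean_header("gender/sex(male, female)"))   # -> "gender"
print(clean_header("color(yellow, red, brown)"))  # -> "color"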
@@ -21,7 +21,7 @@ from .cv_model import *
EmbeddingModel = {
"Infiniflow": HuEmbedding,
"OpenAI": OpenAIEmbed,
"通义千问": QWenEmbed,
"通义千问": HuEmbedding, #QWenEmbed,
}

@@ -32,7 +32,7 @@ class GptTurbo(Base):
self.model_name = model_name

def chat(self, system, history, gen_conf):
history.insert(0, {"role": "system", "content": system})
if system: history.insert(0, {"role": "system", "content": system})
res = self.client.chat.completions.create(
model=self.model_name,
messages=history,

@@ -49,11 +49,12 @@ class QWenChat(Base):

def chat(self, system, history, gen_conf):
from http import HTTPStatus
history.insert(0, {"role": "system", "content": system})
if system: history.insert(0, {"role": "system", "content": system})
response = Generation.call(
self.model_name,
messages=history,
result_format='message'
result_format='message',
**gen_conf
)
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content'], response.usage.output_tokens

@@ -68,10 +69,11 @@ class ZhipuChat(Base):

def chat(self, system, history, gen_conf):
from http import HTTPStatus
history.insert(0, {"role": "system", "content": system})
if system: history.insert(0, {"role": "system", "content": system})
response = self.client.chat.completions.create(
self.model_name,
messages=history
messages=history,
**gen_conf
)
if response.status_code == HTTPStatus.OK:
return response.output.choices[0]['message']['content'], response.usage.completion_tokens
@@ -100,11 +100,11 @@ class QWenEmbed(Base):
input=texts[i:i+batch_size],
text_type="document"
)
embds = [[]] * len(resp["output"]["embeddings"])
embds = [[] for _ in range(len(resp["output"]["embeddings"]))]
for e in resp["output"]["embeddings"]:
embds[e["text_index"]] = e["embedding"]
res.extend(embds)
token_count += resp["usage"]["input_tokens"]
token_count += resp["usage"]["total_tokens"]
return np.array(res), token_count

def encode_queries(self, text):
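The embds change above swaps '[[]] * n' for a list comprehension. With '[[]] * n' every slot is the same list object, so mutating one slot in place mutates all of them; index assignment as used here is unaffected, but the comprehension removes the trap. A small demonstration of the pitfall:

shared = [[]] * 3
shared[0].append("x")        # mutates the single shared inner list
print(shared)                # [['x'], ['x'], ['x']]

independent = [[] for _ in range(3)]
independent[0].append("x")   # only the first sub-list changes
print(independent)           # [['x'], [], []]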
@@ -113,7 +113,7 @@ class QWenEmbed(Base):
input=text[:2048],
text_type="query"
)
return np.array(resp["output"]["embeddings"][0]["embedding"]), resp["usage"]["input_tokens"]
return np.array(resp["output"]["embeddings"][0]["embedding"]), resp["usage"]["total_tokens"]


from zhipuai import ZhipuAI

@@ -92,7 +92,7 @@ class Dealer:
assert emb_mdl, "No embedding model selected"
s["knn"] = self._vector(
qst, emb_mdl, req.get(
"similarity", 0.4), ps)
"similarity", 0.1), ps)
s["knn"]["filter"] = bqry.to_dict()
if "highlight" in s:
del s["highlight"]

@@ -106,7 +106,7 @@ class Dealer:
bqry.filter.append(Q("terms", kb_id=req["kb_ids"]))
s["query"] = bqry.to_dict()
s["knn"]["filter"] = bqry.to_dict()
s["knn"]["similarity"] = 0.7
s["knn"]["similarity"] = 0.17
res = self.es.search(s, idxnm=idxnm, timeout="600s", src=src)

kwds = set([])

@@ -171,7 +171,7 @@ class Dealer:
continue
if not isinstance(v, type("")):
m[n] = str(m[n])
m[n] = rmSpace(m[n])
if n.find("tks")>0: m[n] = rmSpace(m[n])

if m:
res[d["id"]] = m
@@ -303,21 +303,22 @@ class Dealer:

return ranks

def sql_retrieval(self, sql, fetch_size=128):
def sql_retrieval(self, sql, fetch_size=128, format="json"):
sql = re.sub(r"[ ]+", " ", sql)
sql = sql.replace("%", "")
es_logger.info(f"Get es sql: {sql}")
replaces = []
for r in re.finditer(r" ([a-z_]+_l?tks like |[a-z_]+_l?tks ?= ?)'([^']+)'", sql):
fld, v = r.group(1), r.group(2)
fld = re.sub(r" ?(like|=)$", "", fld).lower()
if v[0] == "%%": v = v[1:-1]
match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(fld, huqie.qie(v))
replaces.append((r.group(1)+r.group(2), match))
for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
fld, v = r.group(1), r.group(3)
match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(fld, huqie.qieqie(huqie.qie(v)))
replaces.append(("{}{}'{}'".format(r.group(1), r.group(2), r.group(3)), match))

for p, r in replaces: sql.replace(p, r)
for p, r in replaces: sql = sql.replace(p, r, 1)
es_logger.info(f"To es: {sql}")

try:
tbl = self.es.sql(sql, fetch_size)
tbl = self.es.sql(sql, fetch_size, format)
return tbl
except Exception as e:
es_logger(f"SQL failure: {sql} =>" + str(e))
es_logger.error(f"SQL failure: {sql} =>" + str(e))
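The reworked sql_retrieval above rewrites equality/LIKE predicates on *_tks and *_ltks columns into Elasticsearch MATCH() clauses, so tokenized fields are queried with full-text matching instead of exact string comparison, and each predicate is now replaced exactly once. A rough standalone illustration, with a trivial tokenizer standing in for huqie.qieqie(huqie.qie(...)):

import re

def rewrite_tks_predicates(sql, tokenize=lambda v: " ".join(v.split())):
    replaces = []
    for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
        fld, v = r.group(1), r.group(3)
        match = " MATCH({}, '{}', 'operator=OR;fuzziness=AUTO:1,3;minimum_should_match=30%') ".format(
            fld, tokenize(v))
        replaces.append(("{}{}'{}'".format(r.group(1), r.group(2), r.group(3)), match))
    for old, new in replaces:
        sql = sql.replace(old, new, 1)
    return sql

print(rewrite_tks_predicates("select name_kwd from resume where position_name_tks = 'software engineer'"))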
@@ -53,9 +53,10 @@ class HuParser:

def __remote_call(self, species, images, thr=0.7):
url = os.environ.get("INFINIFLOW_SERVER")
if not url:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_SERVER'")
token = os.environ.get("INFINIFLOW_TOKEN")
if not token:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_TOKEN'")
if not url or not token:
logging.warning("INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
return []

def convert_image_to_bytes(PILimage):
image = BytesIO()

@@ -47,7 +47,7 @@ from api.utils.file_utils import get_project_base_directory
BATCH_SIZE = 64

FACTORY = {
ParserType.GENERAL.value: laws,
ParserType.GENERAL.value: manual,
ParserType.PAPER.value: paper,
ParserType.BOOK.value: book,
ParserType.PRESENTATION.value: presentation,

@@ -119,8 +119,8 @@ def build(row, cvmdl):
chunker = FACTORY[row["parser_id"].lower()]
try:
cron_logger.info("Chunkking {}/{}".format(row["location"], row["name"]))
cks = chunker.chunk(row["name"], MINIO.get(row["kb_id"], row["location"]), row["from_page"], row["to_page"],
callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
cks = chunker.chunk(row["name"], binary = MINIO.get(row["kb_id"], row["location"]), from_page=row["from_page"], to_page=row["to_page"],
callback = callback, kb_id=row["kb_id"], parser_config=row["parser_config"])
except Exception as e:
if re.search("(No such file|not found)", str(e)):
callback(-1, "Can not find file <%s>" % row["doc_name"])

@@ -129,7 +129,7 @@ def build(row, cvmdl):

cron_logger.warn("Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))

return []
return

callback(msg="Finished slicing files. Start to embedding the content.")

@@ -211,6 +211,7 @@ def main(comm, mod):

st_tm = timer()
cks = build(r, cv_mdl)
if cks is None:continue
if not cks:
tmf.write(str(r["update_time"]) + "\n")
callback(1., "No chunk! Done!")

@@ -241,7 +241,7 @@ class HuEs:
es_logger.error("ES search timeout for 3 times!")
raise Exception("ES search timeout.")

def sql(self, sql, fetch_size=128, format="json", timeout=2):
def sql(self, sql, fetch_size=128, format="json", timeout="2s"):
for i in range(3):
try:
res = self.es.sql.query(body={"query": sql, "fetch_size": fetch_size}, format=format, request_timeout=timeout)