mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-03 19:15:30 +08:00
apply pep8 formalize (#155)
This commit is contained in:
@ -48,10 +48,12 @@ class Pdf(PdfParser):
|
||||
|
||||
callback(0.8, "Text extraction finished")
|
||||
|
||||
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
|
||||
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
|
||||
for b in self.boxes], tbls
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Supported file formats are docx, pdf, txt.
|
||||
Since a book is long and not all the parts are useful, if it's a PDF,
|
||||
@ -63,48 +65,63 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
}
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
pdf_parser = None
|
||||
sections,tbls = [], []
|
||||
sections, tbls = [], []
|
||||
if re.search(r"\.docx?$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
doc_parser = DocxParser()
|
||||
# TODO: the table of contents needs to be removed
|
||||
sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
|
||||
remove_contents_table(sections, eng=is_english(random_choices([t for t,_ in sections], k=200)))
|
||||
sections, tbls = doc_parser(
|
||||
binary if binary else filename, from_page=from_page, to_page=to_page)
|
||||
remove_contents_table(sections, eng=is_english(
|
||||
random_choices([t for t, _ in sections], k=200)))
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
|
||||
pdf_parser = Pdf() if kwargs.get(
|
||||
"parser_config", {}).get(
|
||||
"layout_recognize", True) else PlainParser()
|
||||
sections, tbls = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
|
||||
elif re.search(r"\.txt$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
txt = ""
|
||||
if binary:txt = binary.decode("utf-8")
|
||||
if binary:
|
||||
txt = binary.decode("utf-8")
|
||||
else:
|
||||
with open(filename, "r") as f:
|
||||
while True:
|
||||
l = f.readline()
|
||||
if not l:break
|
||||
if not l:
|
||||
break
|
||||
txt += l
|
||||
sections = txt.split("\n")
|
||||
sections = [(l,"") for l in sections if l]
|
||||
remove_contents_table(sections, eng = is_english(random_choices([t for t,_ in sections], k=200)))
|
||||
sections = [(l, "") for l in sections if l]
|
||||
remove_contents_table(sections, eng=is_english(
|
||||
random_choices([t for t, _ in sections], k=200)))
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(docx, pdf, txt supported)")
|
||||
|
||||
make_colon_as_title(sections)
|
||||
bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
|
||||
bull = bullets_category(
|
||||
[t for t in random_choices([t for t, _ in sections], k=100)])
|
||||
if bull >= 0:
|
||||
chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 3)]
|
||||
chunks = ["\n".join(ck)
|
||||
for ck in hierarchical_merge(bull, sections, 3)]
|
||||
else:
|
||||
sections = [s.split("@") for s,_ in sections]
|
||||
sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
|
||||
chunks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
|
||||
sections = [s.split("@") for s, _ in sections]
|
||||
sections = [(pr[0], "@" + pr[1]) for pr in sections if len(pr) == 2]
|
||||
chunks = naive_merge(
|
||||
sections, kwargs.get(
|
||||
"chunk_token_num", 256), kwargs.get(
|
||||
"delimer", "\n。;!?"))
|
||||
|
||||
# is it English
|
||||
eng = lang.lower() == "english"#is_english(random_choices([t for t, _ in sections], k=218))
|
||||
# is_english(random_choices([t for t, _ in sections], k=218))
|
||||
eng = lang.lower() == "english"
|
||||
|
||||
res = tokenize_table(tbls, doc, eng)
|
||||
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
||||
@ -114,6 +131,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
def dummy(prog=None, msg=""):
|
||||
pass
|
||||
chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
|
||||
|
||||
@ -35,8 +35,10 @@ class Docx(DocxParser):
|
||||
pn = 0
|
||||
lines = []
|
||||
for p in self.doc.paragraphs:
|
||||
if pn > to_page:break
|
||||
if from_page <= pn < to_page and p.text.strip(): lines.append(self.__clean(p.text))
|
||||
if pn > to_page:
|
||||
break
|
||||
if from_page <= pn < to_page and p.text.strip():
|
||||
lines.append(self.__clean(p.text))
|
||||
for run in p.runs:
|
||||
if 'lastRenderedPageBreak' in run._element.xml:
|
||||
pn += 1
|
||||
@ -63,15 +65,18 @@ class Pdf(PdfParser):
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.67, "Layout analysis finished")
|
||||
cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
|
||||
cron_logger.info("paddle layouts:".format(
|
||||
(timer() - start) / (self.total_page + 0.1)))
|
||||
self._naive_vertical_merge()
|
||||
|
||||
callback(0.8, "Text extraction finished")
|
||||
|
||||
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
|
||||
return [(b["text"], self._line_tag(b, zoomin))
|
||||
for b in self.boxes], None
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Supported file formats are docx, pdf, txt.
|
||||
"""
|
||||
@ -89,41 +94,50 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
|
||||
for txt, poss in pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)[0]:
|
||||
sections.append(txt + poss)
|
||||
pdf_parser = Pdf() if kwargs.get(
|
||||
"parser_config", {}).get(
|
||||
"layout_recognize", True) else PlainParser()
|
||||
for txt, poss in pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)[0]:
|
||||
sections.append(txt + poss)
|
||||
|
||||
elif re.search(r"\.txt$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
txt = ""
|
||||
if binary:txt = binary.decode("utf-8")
|
||||
if binary:
|
||||
txt = binary.decode("utf-8")
|
||||
else:
|
||||
with open(filename, "r") as f:
|
||||
while True:
|
||||
l = f.readline()
|
||||
if not l:break
|
||||
if not l:
|
||||
break
|
||||
txt += l
|
||||
sections = txt.split("\n")
|
||||
sections = [l for l in sections if l]
|
||||
callback(0.8, "Finish parsing.")
|
||||
else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(docx, pdf, txt supported)")
|
||||
|
||||
# is it English
|
||||
eng = lang.lower() == "english"#is_english(sections)
|
||||
eng = lang.lower() == "english" # is_english(sections)
|
||||
# Remove 'Contents' part
|
||||
remove_contents_table(sections, eng)
|
||||
|
||||
make_colon_as_title(sections)
|
||||
bull = bullets_category(sections)
|
||||
chunks = hierarchical_merge(bull, sections, 3)
|
||||
if not chunks: callback(0.99, "No chunk parsed out.")
|
||||
if not chunks:
|
||||
callback(0.99, "No chunk parsed out.")
|
||||
|
||||
return tokenize_chunks(["\n".join(ck) for ck in chunks], doc, eng, pdf_parser)
|
||||
return tokenize_chunks(["\n".join(ck)
|
||||
for ck in chunks], doc, eng, pdf_parser)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
def dummy(prog=None, msg=""):
|
||||
pass
|
||||
chunk(sys.argv[1], callback=dummy)
|
||||
|
||||
@ -25,10 +25,10 @@ class Pdf(PdfParser):
|
||||
callback
|
||||
)
|
||||
callback(msg="OCR finished.")
|
||||
#for bb in self.boxes:
|
||||
# for bb in self.boxes:
|
||||
# for b in bb:
|
||||
# print(b)
|
||||
print("OCR:", timer()-start)
|
||||
print("OCR:", timer() - start)
|
||||
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.65, "Layout analysis finished.")
|
||||
@ -45,30 +45,35 @@ class Pdf(PdfParser):
|
||||
for b in self.boxes:
|
||||
b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
|
||||
|
||||
return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)], tbls
|
||||
return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin))
|
||||
for i, b in enumerate(self.boxes)], tbls
|
||||
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Only pdf is supported.
|
||||
"""
|
||||
pdf_parser = None
|
||||
|
||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
|
||||
pdf_parser = Pdf() if kwargs.get(
|
||||
"parser_config", {}).get(
|
||||
"layout_recognize", True) else PlainParser()
|
||||
sections, tbls = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
if sections and len(sections[0])<3: sections = [(t, l, [[0]*5]) for t, l in sections]
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
if sections and len(sections[0]) < 3:
|
||||
sections = [(t, l, [[0] * 5]) for t, l in sections]
|
||||
|
||||
else: raise NotImplementedError("file type not supported yet(pdf supported)")
|
||||
else:
|
||||
raise NotImplementedError("file type not supported yet(pdf supported)")
|
||||
doc = {
|
||||
"docnm_kwd": filename
|
||||
}
|
||||
doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
# is it English
|
||||
eng = lang.lower() == "english"#pdf_parser.is_english
|
||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||
|
||||
# set pivot using the most frequent type of title,
|
||||
# then merge between 2 pivot
|
||||
@ -79,7 +84,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
for txt, _, _ in sections:
|
||||
for t, lvl in pdf_parser.outlines:
|
||||
tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
|
||||
tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
|
||||
tks_ = set([txt[i] + txt[i + 1]
|
||||
for i in range(min(len(t), len(txt) - 1))])
|
||||
if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
|
||||
levels.append(lvl)
|
||||
break
|
||||
@ -87,24 +93,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
levels.append(max_lvl + 1)
|
||||
|
||||
else:
|
||||
bull = bullets_category([txt for txt,_,_ in sections])
|
||||
most_level, levels = title_frequency(bull, [(txt, l) for txt, l, poss in sections])
|
||||
bull = bullets_category([txt for txt, _, _ in sections])
|
||||
most_level, levels = title_frequency(
|
||||
bull, [(txt, l) for txt, l, poss in sections])
|
||||
|
||||
assert len(sections) == len(levels)
|
||||
sec_ids = []
|
||||
sid = 0
|
||||
for i, lvl in enumerate(levels):
|
||||
if lvl <= most_level and i > 0 and lvl != levels[i - 1]: sid += 1
|
||||
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
|
||||
sid += 1
|
||||
sec_ids.append(sid)
|
||||
# print(lvl, self.boxes[i]["text"], most_level, sid)
|
||||
|
||||
sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)]
|
||||
sections = [(txt, sec_ids[i], poss)
|
||||
for i, (txt, _, poss) in enumerate(sections)]
|
||||
for (img, rows), poss in tbls:
|
||||
sections.append((rows if isinstance(rows, str) else rows[0], -1,
|
||||
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
|
||||
|
||||
def tag(pn, left, right, top, bottom):
|
||||
if pn+left+right+top+bottom == 0:
|
||||
if pn + left + right + top + bottom == 0:
|
||||
return ""
|
||||
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
|
||||
.format(pn, left, right, top, bottom)
|
||||
@ -112,7 +121,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
chunks = []
|
||||
last_sid = -2
|
||||
tk_cnt = 0
|
||||
for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
|
||||
for txt, sec_id, poss in sorted(sections, key=lambda x: (
|
||||
x[-1][0][0], x[-1][0][3], x[-1][0][1])):
|
||||
poss = "\t".join([tag(*pos) for pos in poss])
|
||||
if tk_cnt < 2048 and (sec_id == last_sid or sec_id == -1):
|
||||
if chunks:
|
||||
@ -121,16 +131,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
continue
|
||||
chunks.append(txt + poss)
|
||||
tk_cnt = num_tokens_from_string(txt)
|
||||
if sec_id > -1: last_sid = sec_id
|
||||
if sec_id > -1:
|
||||
last_sid = sec_id
|
||||
|
||||
res = tokenize_table(tbls, doc, eng)
|
||||
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
||||
return res
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
def dummy(prog=None, msg=""):
|
||||
pass
|
||||
chunk(sys.argv[1], callback=dummy)
|
||||
|
||||
@ -44,11 +44,14 @@ class Pdf(PdfParser):
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
self._naive_vertical_merge()
|
||||
|
||||
cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
|
||||
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
|
||||
cron_logger.info("paddle layouts:".format(
|
||||
(timer() - start) / (self.total_page + 0.1)))
|
||||
return [(b["text"], self._line_tag(b, zoomin))
|
||||
for b in self.boxes], tbls
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Supported file formats are docx, pdf, excel, txt.
|
||||
This method applies a naive approach to chunking files.
|
||||
@ -56,8 +59,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
Next, these successive pieces are merged into chunks whose token count is no more than 'Max token number'.
|
||||
"""
|
||||
|
||||
eng = lang.lower() == "english"#is_english(cks)
|
||||
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
|
||||
eng = lang.lower() == "english" # is_english(cks)
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
@ -73,9 +78,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if parser_config["layout_recognize"] else PlainParser()
|
||||
pdf_parser = Pdf(
|
||||
) if parser_config["layout_recognize"] else PlainParser()
|
||||
sections, tbls = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
res = tokenize_table(tbls, doc, eng)
|
||||
|
||||
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||
@ -92,16 +98,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
with open(filename, "r") as f:
|
||||
while True:
|
||||
l = f.readline()
|
||||
if not l: break
|
||||
if not l:
|
||||
break
|
||||
txt += l
|
||||
sections = txt.split("\n")
|
||||
sections = [(l, "") for l in sections if l]
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
else:
|
||||
raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(docx, pdf, txt supported)")
|
||||
|
||||
chunks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。;!?"))
|
||||
chunks = naive_merge(
|
||||
sections, parser_config.get(
|
||||
"chunk_token_num", 128), parser_config.get(
|
||||
"delimiter", "\n!?。;!?"))
|
||||
|
||||
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
||||
return res
|
||||
@ -110,9 +121,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
|
||||
def dummy(prog=None, msg=""):
|
||||
pass
|
||||
|
||||
|
||||
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|
||||
|
||||
@ -41,20 +41,23 @@ class Pdf(PdfParser):
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
self._concat_downward()
|
||||
|
||||
sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
|
||||
sections = [(b["text"], self.get_position(b, zoomin))
|
||||
for i, b in enumerate(self.boxes)]
|
||||
for (img, rows), poss in tbls:
|
||||
sections.append((rows if isinstance(rows, str) else rows[0],
|
||||
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
|
||||
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
|
||||
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
|
||||
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Supported file formats are docx, pdf, excel, txt.
|
||||
One file forms a chunk which maintains original text order.
|
||||
"""
|
||||
|
||||
eng = lang.lower() == "english"#is_english(cks)
|
||||
eng = lang.lower() == "english" # is_english(cks)
|
||||
|
||||
if re.search(r"\.docx?$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
@ -62,8 +65,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
|
||||
sections, _ = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback)
|
||||
pdf_parser = Pdf() if kwargs.get(
|
||||
"parser_config", {}).get(
|
||||
"layout_recognize", True) else PlainParser()
|
||||
sections, _ = pdf_parser(
|
||||
filename if not binary else binary, to_page=to_page, callback=callback)
|
||||
sections = [s for s, _ in sections if s]
|
||||
|
||||
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||
@ -80,14 +86,16 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
with open(filename, "r") as f:
|
||||
while True:
|
||||
l = f.readline()
|
||||
if not l: break
|
||||
if not l:
|
||||
break
|
||||
txt += l
|
||||
sections = txt.split("\n")
|
||||
sections = [s for s in sections if s]
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
else:
|
||||
raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(docx, pdf, txt supported)")
|
||||
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
@ -101,9 +109,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
|
||||
def dummy(prog=None, msg=""):
|
||||
pass
|
||||
|
||||
|
||||
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|
||||
|
||||
@ -67,11 +67,11 @@ class Pdf(PdfParser):
|
||||
|
||||
if from_page > 0:
|
||||
return {
|
||||
"title":"",
|
||||
"title": "",
|
||||
"authors": "",
|
||||
"abstract": "",
|
||||
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
|
||||
re.match(r"(text|title)", b.get("layoutno", "text"))],
|
||||
re.match(r"(text|title)", b.get("layoutno", "text"))],
|
||||
"tables": tbls
|
||||
}
|
||||
# get title and authors
|
||||
@ -87,7 +87,8 @@ class Pdf(PdfParser):
|
||||
title = ""
|
||||
break
|
||||
for j in range(3):
|
||||
if _begin(self.boxes[i + j]["text"]): break
|
||||
if _begin(self.boxes[i + j]["text"]):
|
||||
break
|
||||
authors.append(self.boxes[i + j]["text"])
|
||||
break
|
||||
break
|
||||
@ -107,10 +108,15 @@ class Pdf(PdfParser):
|
||||
abstr = txt + self._line_tag(self.boxes[i], zoomin)
|
||||
i += 1
|
||||
break
|
||||
if not abstr: i = 0
|
||||
if not abstr:
|
||||
i = 0
|
||||
|
||||
callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
|
||||
for b in self.boxes: print(b["text"], b.get("layoutno"))
|
||||
callback(
|
||||
0.8, "Page {}~{}: Text merging finished".format(
|
||||
from_page, min(
|
||||
to_page, self.total_page)))
|
||||
for b in self.boxes:
|
||||
print(b["text"], b.get("layoutno"))
|
||||
print(tbls)
|
||||
|
||||
return {
|
||||
@ -118,19 +124,20 @@ class Pdf(PdfParser):
|
||||
"authors": " ".join(authors),
|
||||
"abstract": abstr,
|
||||
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
|
||||
re.match(r"(text|title)", b.get("layoutno", "text"))],
|
||||
re.match(r"(text|title)", b.get("layoutno", "text"))],
|
||||
"tables": tbls
|
||||
}
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Only pdf is supported.
|
||||
The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
|
||||
"""
|
||||
pdf_parser = None
|
||||
if re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
if not kwargs.get("parser_config",{}).get("layout_recognize", True):
|
||||
if not kwargs.get("parser_config", {}).get("layout_recognize", True):
|
||||
pdf_parser = PlainParser()
|
||||
paper = {
|
||||
"title": filename,
|
||||
@ -143,14 +150,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
pdf_parser = Pdf()
|
||||
paper = pdf_parser(filename if not binary else binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)
|
||||
else: raise NotImplementedError("file type not supported yet(pdf supported)")
|
||||
else:
|
||||
raise NotImplementedError("file type not supported yet(pdf supported)")
|
||||
|
||||
doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
|
||||
"title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
|
||||
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
|
||||
doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
|
||||
# is it English
|
||||
eng = lang.lower() == "english"#pdf_parser.is_english
|
||||
eng = lang.lower() == "english" # pdf_parser.is_english
|
||||
print("It's English.....", eng)
|
||||
|
||||
res = tokenize_table(paper["tables"], doc, eng)
|
||||
@ -160,7 +168,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
txt = pdf_parser.remove_tag(paper["abstract"])
|
||||
d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
|
||||
d["important_tks"] = " ".join(d["important_kwd"])
|
||||
d["image"], poss = pdf_parser.crop(paper["abstract"], need_position=True)
|
||||
d["image"], poss = pdf_parser.crop(
|
||||
paper["abstract"], need_position=True)
|
||||
add_positions(d, poss)
|
||||
tokenize(d, txt, eng)
|
||||
res.append(d)
|
||||
@ -174,7 +183,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
sec_ids = []
|
||||
sid = 0
|
||||
for i, lvl in enumerate(levels):
|
||||
if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1
|
||||
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
|
||||
sid += 1
|
||||
sec_ids.append(sid)
|
||||
print(lvl, sorted_sections[i][0], most_level, sid)
|
||||
|
||||
@ -190,6 +200,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
||||
return res
|
||||
|
||||
|
||||
"""
|
||||
readed = [0] * len(paper["lines"])
|
||||
# find colon firstly
|
||||
@ -212,7 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
for k in range(j, i): readed[k] = True
|
||||
txt = txt[::-1]
|
||||
if eng:
|
||||
r = re.search(r"(.*?) ([\.;?!]|$)", txt)
|
||||
r = re.search(r"(.*?) ([\\.;?!]|$)", txt)
|
||||
txt = r.group(1)[::-1] if r else txt[::-1]
|
||||
else:
|
||||
r = re.search(r"(.*?) ([。?;!]|$)", txt)
|
||||
@ -270,6 +281,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
def dummy(prog=None, msg=""):
|
||||
pass
|
||||
chunk(sys.argv[1], callback=dummy)
|
||||
|
||||
@ -33,9 +33,12 @@ class Ppt(PptParser):
|
||||
with slides.Presentation(BytesIO(fnm)) as presentation:
|
||||
for i, slide in enumerate(presentation.slides[from_page: to_page]):
|
||||
buffered = BytesIO()
|
||||
slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
|
||||
slide.get_thumbnail(
|
||||
0.5, 0.5).save(
|
||||
buffered, drawing.imaging.ImageFormat.jpeg)
|
||||
imgs.append(Image.open(buffered))
|
||||
assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
|
||||
assert len(imgs) == len(
|
||||
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
|
||||
callback(0.9, "Image extraction finished")
|
||||
self.is_english = is_english(txts)
|
||||
return [(txts[i], imgs[i]) for i in range(len(txts))]
|
||||
@ -47,25 +50,34 @@ class Pdf(PdfParser):
|
||||
|
||||
def __garbage(self, txt):
|
||||
txt = txt.lower().strip()
|
||||
if re.match(r"[0-9\.,%/-]+$", txt): return True
|
||||
if len(txt) < 3:return True
|
||||
if re.match(r"[0-9\.,%/-]+$", txt):
|
||||
return True
|
||||
if len(txt) < 3:
|
||||
return True
|
||||
return False
|
||||
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, zoomin=3, callback=None):
|
||||
callback(msg="OCR is running...")
|
||||
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
|
||||
callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
|
||||
assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
|
||||
self.__images__(filename if not binary else binary,
|
||||
zoomin, from_page, to_page, callback)
|
||||
callback(0.8, "Page {}~{}: OCR finished".format(
|
||||
from_page, min(to_page, self.total_page)))
|
||||
assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
|
||||
len(self.boxes), len(self.page_images))
|
||||
res = []
|
||||
for i in range(len(self.boxes)):
|
||||
lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
|
||||
lines = "\n".join([b["text"] for b in self.boxes[i]
|
||||
if not self.__garbage(b["text"])])
|
||||
res.append((lines, self.page_images[i]))
|
||||
callback(0.9, "Page {}~{}: Parsing finished".format(from_page, min(to_page, self.total_page)))
|
||||
callback(0.9, "Page {}~{}: Parsing finished".format(
|
||||
from_page, min(to_page, self.total_page)))
|
||||
return res
|
||||
|
||||
|
||||
class PlainPdf(PlainParser):
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, callback=None, **kwargs):
|
||||
self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
|
||||
page_txt = []
|
||||
for page in self.pdf.pages[from_page: to_page]:
|
||||
@ -74,7 +86,8 @@ class PlainPdf(PlainParser):
|
||||
return [(txt, None) for txt in page_txt]
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
The supported file formats are pdf, pptx.
|
||||
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
|
||||
@ -89,35 +102,42 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
res = []
|
||||
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
||||
ppt_parser = Ppt()
|
||||
for pn, (txt,img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
|
||||
for pn, (txt, img) in enumerate(ppt_parser(
|
||||
filename if not binary else binary, from_page, 1000000, callback)):
|
||||
d = copy.deepcopy(doc)
|
||||
pn += from_page
|
||||
d["image"] = img
|
||||
d["page_num_int"] = [pn+1]
|
||||
d["page_num_int"] = [pn + 1]
|
||||
d["top_int"] = [0]
|
||||
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
|
||||
tokenize(d, txt, eng)
|
||||
res.append(d)
|
||||
return res
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainPdf()
|
||||
for pn, (txt,img) in enumerate(pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)):
|
||||
pdf_parser = Pdf() if kwargs.get(
|
||||
"parser_config", {}).get(
|
||||
"layout_recognize", True) else PlainPdf()
|
||||
for pn, (txt, img) in enumerate(pdf_parser(filename, binary,
|
||||
from_page=from_page, to_page=to_page, callback=callback)):
|
||||
d = copy.deepcopy(doc)
|
||||
pn += from_page
|
||||
if img: d["image"] = img
|
||||
d["page_num_int"] = [pn+1]
|
||||
if img:
|
||||
d["image"] = img
|
||||
d["page_num_int"] = [pn + 1]
|
||||
d["top_int"] = [0]
|
||||
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
|
||||
d["position_int"] = [
|
||||
(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
|
||||
tokenize(d, txt, eng)
|
||||
res.append(d)
|
||||
return res
|
||||
|
||||
raise NotImplementedError("file type not supported yet(pptx, pdf supported)")
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(pptx, pdf supported)")
|
||||
|
||||
|
||||
if __name__== "__main__":
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
def dummy(a, b):
|
||||
pass
|
||||
chunk(sys.argv[1], callback=dummy)
|
||||
|
||||
|
||||
@ -27,6 +27,8 @@ from rag.utils import rmSpace
|
||||
forbidden_select_fields4resume = [
|
||||
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
|
||||
]
|
||||
|
||||
|
||||
def remote_call(filename, binary):
|
||||
q = {
|
||||
"header": {
|
||||
@ -48,18 +50,22 @@ def remote_call(filename, binary):
|
||||
}
|
||||
for _ in range(3):
|
||||
try:
|
||||
resume = requests.post("http://127.0.0.1:61670/tog", data=json.dumps(q))
|
||||
resume = requests.post(
|
||||
"http://127.0.0.1:61670/tog",
|
||||
data=json.dumps(q))
|
||||
resume = resume.json()["response"]["results"]
|
||||
resume = refactor(resume)
|
||||
for k in ["education", "work", "project", "training", "skill", "certificate", "language"]:
|
||||
if not resume.get(k) and k in resume: del resume[k]
|
||||
for k in ["education", "work", "project",
|
||||
"training", "skill", "certificate", "language"]:
|
||||
if not resume.get(k) and k in resume:
|
||||
del resume[k]
|
||||
|
||||
resume = step_one.refactor(pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x",
|
||||
"updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
|
||||
"updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
|
||||
resume = step_two.parse(resume)
|
||||
return resume
|
||||
except Exception as e:
|
||||
cron_logger.error("Resume parser error: "+str(e))
|
||||
cron_logger.error("Resume parser error: " + str(e))
|
||||
return {}
|
||||
|
||||
|
||||
@ -144,10 +150,13 @@ def chunk(filename, binary=None, callback=None, **kwargs):
|
||||
doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
|
||||
doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
|
||||
for n, _ in field_map.items():
|
||||
if n not in resume:continue
|
||||
if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
|
||||
if n not in resume:
|
||||
continue
|
||||
if isinstance(resume[n], list) and (
|
||||
len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
|
||||
resume[n] = resume[n][0]
|
||||
if n.find("_tks")>0: resume[n] = huqie.qieqie(resume[n])
|
||||
if n.find("_tks") > 0:
|
||||
resume[n] = huqie.qieqie(resume[n])
|
||||
doc[n] = resume[n]
|
||||
|
||||
print(doc)
|
||||
|
||||
@ -25,7 +25,8 @@ from deepdoc.parser import ExcelParser
|
||||
|
||||
|
||||
class Excel(ExcelParser):
|
||||
def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None):
|
||||
def __call__(self, fnm, binary=None, from_page=0,
|
||||
to_page=10000000000, callback=None):
|
||||
if not binary:
|
||||
wb = load_workbook(fnm)
|
||||
else:
|
||||
@ -48,8 +49,10 @@ class Excel(ExcelParser):
|
||||
data = []
|
||||
for i, r in enumerate(rows[1:]):
|
||||
rn += 1
|
||||
if rn-1 < from_page:continue
|
||||
if rn -1>=to_page: break
|
||||
if rn - 1 < from_page:
|
||||
continue
|
||||
if rn - 1 >= to_page:
|
||||
break
|
||||
row = [
|
||||
cell.value for ii,
|
||||
cell in enumerate(r) if ii not in missed]
|
||||
@ -60,7 +63,7 @@ class Excel(ExcelParser):
|
||||
done += 1
|
||||
res.append(pd.DataFrame(np.array(data), columns=headers))
|
||||
|
||||
callback(0.3, ("Extract records: {}~{}".format(from_page+1, min(to_page, from_page+rn)) + (
|
||||
callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
|
||||
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
|
||||
return res
|
||||
|
||||
@ -73,7 +76,8 @@ def trans_datatime(s):
|
||||
|
||||
|
||||
def trans_bool(s):
|
||||
if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
|
||||
if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$",
|
||||
str(s).strip(), flags=re.IGNORECASE):
|
||||
return "yes"
|
||||
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
|
||||
return "no"
|
||||
@ -107,13 +111,14 @@ def column_data_type(arr):
|
||||
arr[i] = trans[ty](str(arr[i]))
|
||||
except Exception as e:
|
||||
arr[i] = None
|
||||
#if ty == "text":
|
||||
# if ty == "text":
|
||||
# if len(arr) > 128 and uni / len(arr) < 0.1:
|
||||
# ty = "keyword"
|
||||
return arr, ty
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese", callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=10000000000,
|
||||
lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Excel and csv(txt) format files are supported.
|
||||
For csv or txt file, the delimiter between columns is TAB.
|
||||
@ -131,7 +136,12 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
|
||||
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
excel_parser = Excel()
|
||||
dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
|
||||
dfs = excel_parser(
|
||||
filename,
|
||||
binary,
|
||||
from_page=from_page,
|
||||
to_page=to_page,
|
||||
callback=callback)
|
||||
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
txt = ""
|
||||
@ -149,8 +159,10 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
|
||||
headers = lines[0].split(kwargs.get("delimiter", "\t"))
|
||||
rows = []
|
||||
for i, line in enumerate(lines[1:]):
|
||||
if i < from_page:continue
|
||||
if i >= to_page: break
|
||||
if i < from_page:
|
||||
continue
|
||||
if i >= to_page:
|
||||
break
|
||||
row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
|
||||
if len(row) != len(headers):
|
||||
fails.append(str(i))
|
||||
@ -181,7 +193,13 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
|
||||
del df[n]
|
||||
clmns = df.columns.values
|
||||
txts = list(copy.deepcopy(clmns))
|
||||
py_clmns = [PY.get_pinyins(re.sub(r"(/.*|([^()]+?)|\([^()]+?\))", "", n), '_')[0] for n in clmns]
|
||||
py_clmns = [
|
||||
PY.get_pinyins(
|
||||
re.sub(
|
||||
r"(/.*|([^()]+?)|\([^()]+?\))",
|
||||
"",
|
||||
n),
|
||||
'_')[0] for n in clmns]
|
||||
clmn_tys = []
|
||||
for j in range(len(clmns)):
|
||||
cln, ty = column_data_type(df[clmns[j]])
|
||||
@ -192,7 +210,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
|
||||
clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], clmns[i].replace("_", " "))
|
||||
for i in range(len(clmns))]
|
||||
|
||||
eng = lang.lower() == "english"#is_english(txts)
|
||||
eng = lang.lower() == "english" # is_english(txts)
|
||||
for ii, row in df.iterrows():
|
||||
d = {
|
||||
"docnm_kwd": filename,
|
||||
|
||||
@ -13,6 +13,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from zhipuai import ZhipuAI
|
||||
from dashscope import Generation
|
||||
from abc import ABC
|
||||
from openai import OpenAI
|
||||
import openai
|
||||
@ -34,7 +36,8 @@ class GptTurbo(Base):
|
||||
self.model_name = model_name
|
||||
|
||||
def chat(self, system, history, gen_conf):
|
||||
if system: history.insert(0, {"role": "system", "content": system})
|
||||
if system:
|
||||
history.insert(0, {"role": "system", "content": system})
|
||||
try:
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model_name,
|
||||
@ -46,16 +49,18 @@ class GptTurbo(Base):
|
||||
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
|
||||
return ans, response.usage.completion_tokens
|
||||
except openai.APIError as e:
|
||||
return "**ERROR**: "+str(e), 0
|
||||
return "**ERROR**: " + str(e), 0
|
||||
|
||||
|
||||
class MoonshotChat(GptTurbo):
|
||||
def __init__(self, key, model_name="moonshot-v1-8k"):
    """Create an OpenAI client that targets the Moonshot endpoint.

    Args:
        key: API key, forwarded to the client as ``api_key``.
        model_name: Model identifier kept on the instance as
            ``self.model_name``.
    """
    self.client = OpenAI(
        api_key=key, base_url="https://api.moonshot.cn/v1")
    self.model_name = model_name
|
||||
|
||||
def chat(self, system, history, gen_conf):
|
||||
if system: history.insert(0, {"role": "system", "content": system})
|
||||
if system:
|
||||
history.insert(0, {"role": "system", "content": system})
|
||||
try:
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model_name,
|
||||
@ -67,10 +72,9 @@ class MoonshotChat(GptTurbo):
|
||||
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
|
||||
return ans, response.usage.completion_tokens
|
||||
except openai.APIError as e:
|
||||
return "**ERROR**: "+str(e), 0
|
||||
return "**ERROR**: " + str(e), 0
|
||||
|
||||
|
||||
from dashscope import Generation
|
||||
class QWenChat(Base):
|
||||
def __init__(self, key, model_name=Generation.Models.qwen_turbo):
|
||||
import dashscope
|
||||
@ -79,7 +83,8 @@ class QWenChat(Base):
|
||||
|
||||
def chat(self, system, history, gen_conf):
|
||||
from http import HTTPStatus
|
||||
if system: history.insert(0, {"role": "system", "content": system})
|
||||
if system:
|
||||
history.insert(0, {"role": "system", "content": system})
|
||||
response = Generation.call(
|
||||
self.model_name,
|
||||
messages=history,
|
||||
@ -92,20 +97,21 @@ class QWenChat(Base):
|
||||
ans += response.output.choices[0]['message']['content']
|
||||
tk_count += response.usage.output_tokens
|
||||
if response.output.choices[0].get("finish_reason", "") == "length":
|
||||
ans += "...\nFor the content length reason, it stopped, continue?" if is_english([ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
|
||||
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
|
||||
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
|
||||
return ans, tk_count
|
||||
|
||||
return "**ERROR**: " + response.message, tk_count
|
||||
|
||||
|
||||
from zhipuai import ZhipuAI
|
||||
class ZhipuChat(Base):
|
||||
def __init__(self, key, model_name="glm-3-turbo"):
    """Create the ZhipuAI client used by this chat model.

    Args:
        key: API key, forwarded to ``ZhipuAI`` as ``api_key``.
        model_name: Model identifier kept on the instance as
            ``self.model_name``.
    """
    self.client = ZhipuAI(api_key=key)
    self.model_name = model_name
|
||||
|
||||
def chat(self, system, history, gen_conf):
|
||||
if system: history.insert(0, {"role": "system", "content": system})
|
||||
if system:
|
||||
history.insert(0, {"role": "system", "content": system})
|
||||
try:
|
||||
response = self.client.chat.completions.create(
|
||||
self.model_name,
|
||||
@ -120,6 +126,7 @@ class ZhipuChat(Base):
|
||||
except Exception as e:
|
||||
return "**ERROR**: " + str(e), 0
|
||||
|
||||
|
||||
class LocalLLM(Base):
|
||||
class RPCProxy:
|
||||
def __init__(self, host, port):
|
||||
@ -129,14 +136,17 @@ class LocalLLM(Base):
|
||||
|
||||
def __conn(self):
    """Open a connection to ``(self.host, self.port)``.

    The new connection object is stored in ``self._connection``,
    replacing whatever was stored there before.
    """
    from multiprocessing.connection import Client
    # NOTE(review): the authkey is a hard-coded shared secret; consider
    # loading it from configuration instead of source code.
    self._connection = Client(
        (self.host, self.port), authkey=b'infiniflow-token4kevinhu')
|
||||
|
||||
def __getattr__(self, name):
|
||||
import pickle
|
||||
|
||||
def do_rpc(*args, **kwargs):
|
||||
for _ in range(3):
|
||||
try:
|
||||
self._connection.send(pickle.dumps((name, args, kwargs)))
|
||||
self._connection.send(
|
||||
pickle.dumps((name, args, kwargs)))
|
||||
return pickle.loads(self._connection.recv())
|
||||
except Exception as e:
|
||||
self.__conn()
|
||||
@ -148,7 +158,8 @@ class LocalLLM(Base):
|
||||
self.client = LocalLLM.RPCProxy("127.0.0.1", 7860)
|
||||
|
||||
def chat(self, system, history, gen_conf):
|
||||
if system: history.insert(0, {"role": "system", "content": system})
|
||||
if system:
|
||||
history.insert(0, {"role": "system", "content": system})
|
||||
try:
|
||||
ans = self.client.chat(
|
||||
history,
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from zhipuai import ZhipuAI
|
||||
import io
|
||||
from abc import ABC
|
||||
|
||||
@ -57,8 +58,8 @@ class Base(ABC):
|
||||
},
|
||||
},
|
||||
{
|
||||
"text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else \
|
||||
"Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
|
||||
"text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else
|
||||
"Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
|
||||
},
|
||||
],
|
||||
}
|
||||
@ -92,8 +93,9 @@ class QWenCV(Base):
|
||||
def prompt(self, binary):
|
||||
# stupid as hell
|
||||
tmp_dir = get_project_base_directory("tmp")
|
||||
if not os.path.exists(tmp_dir): os.mkdir(tmp_dir)
|
||||
path = os.path.join(tmp_dir, "%s.jpg"%get_uuid())
|
||||
if not os.path.exists(tmp_dir):
|
||||
os.mkdir(tmp_dir)
|
||||
path = os.path.join(tmp_dir, "%s.jpg" % get_uuid())
|
||||
Image.open(io.BytesIO(binary)).save(path)
|
||||
return [
|
||||
{
|
||||
@ -103,8 +105,8 @@ class QWenCV(Base):
|
||||
"image": f"file://{path}"
|
||||
},
|
||||
{
|
||||
"text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else \
|
||||
"Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
|
||||
"text": "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。" if self.lang.lower() == "chinese" else
|
||||
"Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
|
||||
},
|
||||
],
|
||||
}
|
||||
@ -120,9 +122,6 @@ class QWenCV(Base):
|
||||
return response.message, 0
|
||||
|
||||
|
||||
from zhipuai import ZhipuAI
|
||||
|
||||
|
||||
class Zhipu4V(Base):
|
||||
def __init__(self, key, model_name="glm-4v", lang="Chinese"):
|
||||
self.client = ZhipuAI(api_key=key)
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from zhipuai import ZhipuAI
|
||||
import os
|
||||
from abc import ABC
|
||||
|
||||
@ -40,11 +41,11 @@ flag_model = FlagModel(model_dir,
|
||||
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:",
|
||||
use_fp16=torch.cuda.is_available())
|
||||
|
||||
|
||||
class Base(ABC):
|
||||
def __init__(self, key, model_name):
    """Accept the common constructor arguments; this base stores nothing.

    Args:
        key: Unused here.
        model_name: Unused here.
    """
    pass
|
||||
|
||||
|
||||
def encode(self, texts: list, batch_size=32):
    """Embed a list of texts; this base implementation only raises.

    Args:
        texts: Texts to embed.
        batch_size: Batch size hint for implementations.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError("Please implement encode method!")
|
||||
|
||||
@ -67,11 +68,11 @@ class HuEmbedding(Base):
|
||||
"""
|
||||
self.model = flag_model
|
||||
|
||||
|
||||
def encode(self, texts: list, batch_size=32):
|
||||
texts = [t[:2000] for t in texts]
|
||||
token_count = 0
|
||||
for t in texts: token_count += num_tokens_from_string(t)
|
||||
for t in texts:
|
||||
token_count += num_tokens_from_string(t)
|
||||
res = []
|
||||
for i in range(0, len(texts), batch_size):
|
||||
res.extend(self.model.encode(texts[i:i + batch_size]).tolist())
|
||||
@ -90,7 +91,8 @@ class OpenAIEmbed(Base):
|
||||
def encode(self, texts: list, batch_size=32):
    """Embed ``texts`` with a single embeddings request.

    Args:
        texts: Texts to embed; passed to the client unchanged as ``input``.
        batch_size: Accepted for signature compatibility; not used here,
            all texts go out in one request.

    Returns:
        A tuple ``(embeddings, total_tokens)``: a numpy array built from
        each item's ``embedding`` in ``res.data``, and
        ``res.usage.total_tokens``.
    """
    res = self.client.embeddings.create(input=texts,
                                        model=self.model_name)
    vectors = np.array([d.embedding for d in res.data])
    return vectors, res.usage.total_tokens
|
||||
|
||||
def encode_queries(self, text):
|
||||
res = self.client.embeddings.create(input=[text],
|
||||
@ -111,7 +113,7 @@ class QWenEmbed(Base):
|
||||
for i in range(0, len(texts), batch_size):
|
||||
resp = dashscope.TextEmbedding.call(
|
||||
model=self.model_name,
|
||||
input=texts[i:i+batch_size],
|
||||
input=texts[i:i + batch_size],
|
||||
text_type="document"
|
||||
)
|
||||
embds = [[] for _ in range(len(resp["output"]["embeddings"]))]
|
||||
@ -123,14 +125,14 @@ class QWenEmbed(Base):
|
||||
|
||||
def encode_queries(self, text):
    """Embed a single query string via ``dashscope.TextEmbedding``.

    Args:
        text: Query text; only its first 2048 characters are sent.

    Returns:
        A tuple ``(embedding, total_tokens)``: a numpy array of the first
        returned embedding, and the ``total_tokens`` usage figure.
    """
    resp = dashscope.TextEmbedding.call(
        model=self.model_name,
        input=text[:2048],
        text_type="query"
    )
    first = resp["output"]["embeddings"][0]
    return np.array(first["embedding"]), resp["usage"]["total_tokens"]
|
||||
|
||||
|
||||
from zhipuai import ZhipuAI
|
||||
class ZhipuEmbed(Base):
|
||||
def __init__(self, key, model_name="embedding-2"):
|
||||
self.client = ZhipuAI(api_key=key)
|
||||
@ -139,9 +141,10 @@ class ZhipuEmbed(Base):
|
||||
def encode(self, texts: list, batch_size=32):
    """Embed ``texts`` with a single embeddings request.

    Args:
        texts: Texts to embed; passed to the client unchanged as ``input``.
        batch_size: Accepted for signature compatibility; not used here.

    Returns:
        A tuple ``(embeddings, total_tokens)``: a numpy array built from
        each item's ``embedding`` in ``res.data``, and
        ``res.usage.total_tokens``.
    """
    res = self.client.embeddings.create(input=texts,
                                        model=self.model_name)
    vectors = np.array([d.embedding for d in res.data])
    return vectors, res.usage.total_tokens
|
||||
|
||||
def encode_queries(self, text):
    """Embed a single query string.

    Args:
        text: Query text; passed to the client unchanged as ``input``.

    Returns:
        A tuple ``(embedding, total_tokens)``: a numpy array of the first
        returned embedding, and ``res.usage.total_tokens``.
    """
    res = self.client.embeddings.create(input=text,
                                        model=self.model_name)
    # Use attribute access, as encode() does: the response exposes
    # `.data` / `.embedding` / `.usage` as attributes, so subscripting it
    # with res["data"] is inconsistent and fails on such objects.
    return np.array(res.data[0].embedding), res.usage.total_tokens
|
||||
|
||||
@ -9,7 +9,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
class RPCHandler:
|
||||
def __init__(self):
    """Start with an empty registry mapping function names to callables."""
    self._functions = {}
|
||||
|
||||
def register_function(self, func):
|
||||
self._functions[func.__name__] = func
|
||||
@ -21,12 +21,12 @@ class RPCHandler:
|
||||
func_name, args, kwargs = pickle.loads(connection.recv())
|
||||
# Run the RPC and send a response
|
||||
try:
|
||||
r = self._functions[func_name](*args,**kwargs)
|
||||
r = self._functions[func_name](*args, **kwargs)
|
||||
connection.send(pickle.dumps(r))
|
||||
except Exception as e:
|
||||
connection.send(pickle.dumps(e))
|
||||
except EOFError:
|
||||
pass
|
||||
pass
|
||||
|
||||
|
||||
def rpc_server(hdlr, address, authkey):
|
||||
@ -44,11 +44,17 @@ def rpc_server(hdlr, address, authkey):
|
||||
models = []
|
||||
tokenizer = None
|
||||
|
||||
|
||||
def chat(messages, gen_conf):
|
||||
global tokenizer
|
||||
model = Model()
|
||||
try:
|
||||
conf = {"max_new_tokens": int(gen_conf.get("max_tokens", 256)), "temperature": float(gen_conf.get("temperature", 0.1))}
|
||||
conf = {
|
||||
"max_new_tokens": int(
|
||||
gen_conf.get(
|
||||
"max_tokens", 256)), "temperature": float(
|
||||
gen_conf.get(
|
||||
"temperature", 0.1))}
|
||||
print(messages, conf)
|
||||
text = tokenizer.apply_chat_template(
|
||||
messages,
|
||||
@ -65,7 +71,8 @@ def chat(messages, gen_conf):
|
||||
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
|
||||
]
|
||||
|
||||
return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
||||
return tokenizer.batch_decode(
|
||||
generated_ids, skip_special_tokens=True)[0]
|
||||
except Exception as e:
|
||||
return str(e)
|
||||
|
||||
@ -75,10 +82,15 @@ def Model():
|
||||
random.seed(time.time())
|
||||
return random.choice(models)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--model_name", type=str, help="Model name")
|
||||
parser.add_argument("--port", default=7860, type=int, help="RPC serving port")
|
||||
parser.add_argument(
|
||||
"--port",
|
||||
default=7860,
|
||||
type=int,
|
||||
help="RPC serving port")
|
||||
args = parser.parse_args()
|
||||
|
||||
handler = RPCHandler()
|
||||
@ -93,4 +105,5 @@ if __name__ == "__main__":
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
|
||||
|
||||
# Run the server
|
||||
rpc_server(handler, ('0.0.0.0', args.port), authkey=b'infiniflow-token4kevinhu')
|
||||
rpc_server(handler, ('0.0.0.0', args.port),
|
||||
authkey=b'infiniflow-token4kevinhu')
|
||||
|
||||
@ -372,7 +372,8 @@ class PptChunker(HuChunker):
|
||||
tb = shape.table
|
||||
rows = []
|
||||
for i in range(1, len(tb.rows)):
|
||||
rows.append("; ".join([tb.cell(0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
||||
rows.append("; ".join([tb.cell(
|
||||
0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
|
||||
return "\n".join(rows)
|
||||
|
||||
if shape.has_text_frame:
|
||||
@ -382,7 +383,8 @@ class PptChunker(HuChunker):
|
||||
texts = []
|
||||
for p in shape.shapes:
|
||||
t = self.__extract(p)
|
||||
if t: texts.append(t)
|
||||
if t:
|
||||
texts.append(t)
|
||||
return "\n".join(texts)
|
||||
|
||||
def __call__(self, fnm):
|
||||
@ -395,7 +397,8 @@ class PptChunker(HuChunker):
|
||||
texts = []
|
||||
for shape in slide.shapes:
|
||||
txt = self.__extract(shape)
|
||||
if txt: texts.append(txt)
|
||||
if txt:
|
||||
texts.append(txt)
|
||||
txts.append("\n".join(texts))
|
||||
|
||||
import aspose.slides as slides
|
||||
@ -404,9 +407,12 @@ class PptChunker(HuChunker):
|
||||
with slides.Presentation(BytesIO(fnm)) as presentation:
|
||||
for slide in presentation.slides:
|
||||
buffered = BytesIO()
|
||||
slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
|
||||
slide.get_thumbnail(
|
||||
0.5, 0.5).save(
|
||||
buffered, drawing.imaging.ImageFormat.jpeg)
|
||||
imgs.append(buffered.getvalue())
|
||||
assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
|
||||
assert len(imgs) == len(
|
||||
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
|
||||
|
||||
flds = self.Fields()
|
||||
flds.text_chunks = [(txts[i], imgs[i]) for i in range(len(txts))]
|
||||
@ -445,7 +451,8 @@ class TextChunker(HuChunker):
|
||||
if isinstance(fnm, str):
|
||||
with open(fnm, "r") as f:
|
||||
txt = f.read()
|
||||
else: txt = fnm.decode("utf-8")
|
||||
else:
|
||||
txt = fnm.decode("utf-8")
|
||||
flds.text_chunks = [(c, None) for c in self.naive_text_chunk(txt)]
|
||||
flds.table_chunks = []
|
||||
return flds
|
||||
|
||||
@ -149,7 +149,8 @@ class EsQueryer:
|
||||
atks = toDict(atks)
|
||||
btkss = [toDict(tks) for tks in btkss]
|
||||
tksim = [self.similarity(atks, btks) for btks in btkss]
|
||||
return np.array(sims[0]) * vtweight + np.array(tksim) * tkweight, tksim, sims[0]
|
||||
return np.array(sims[0]) * vtweight + \
|
||||
np.array(tksim) * tkweight, tksim, sims[0]
|
||||
|
||||
def similarity(self, qtwt, dtwt):
|
||||
if isinstance(dtwt, type("")):
|
||||
@ -159,11 +160,11 @@ class EsQueryer:
|
||||
s = 1e-9
|
||||
for k, v in qtwt.items():
|
||||
if k in dtwt:
|
||||
s += v# * dtwt[k]
|
||||
s += v # * dtwt[k]
|
||||
q = 1e-9
|
||||
for k, v in qtwt.items():
|
||||
q += v #* v
|
||||
q += v # * v
|
||||
#d = 1e-9
|
||||
#for k, v in dtwt.items():
|
||||
# for k, v in dtwt.items():
|
||||
# d += v * v
|
||||
return s / q #math.sqrt(q) / math.sqrt(d)
|
||||
return s / q # math.sqrt(q) / math.sqrt(d)
|
||||
|
||||
@ -80,14 +80,18 @@ class Dealer:
|
||||
if not req.get("sort"):
|
||||
s = s.sort(
|
||||
{"create_time": {"order": "desc", "unmapped_type": "date"}},
|
||||
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
|
||||
{"create_timestamp_flt": {
|
||||
"order": "desc", "unmapped_type": "float"}}
|
||||
)
|
||||
else:
|
||||
s = s.sort(
|
||||
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
|
||||
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
|
||||
{"page_num_int": {"order": "asc", "unmapped_type": "float",
|
||||
"mode": "avg", "numeric_type": "double"}},
|
||||
{"top_int": {"order": "asc", "unmapped_type": "float",
|
||||
"mode": "avg", "numeric_type": "double"}},
|
||||
{"create_time": {"order": "desc", "unmapped_type": "date"}},
|
||||
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
|
||||
{"create_timestamp_flt": {
|
||||
"order": "desc", "unmapped_type": "float"}}
|
||||
)
|
||||
|
||||
if qst:
|
||||
@ -180,11 +184,13 @@ class Dealer:
|
||||
m = {n: d.get(n) for n in flds if d.get(n) is not None}
|
||||
for n, v in m.items():
|
||||
if isinstance(v, type([])):
|
||||
m[n] = "\t".join([str(vv) if not isinstance(vv, list) else "\t".join([str(vvv) for vvv in vv]) for vv in v])
|
||||
m[n] = "\t".join([str(vv) if not isinstance(
|
||||
vv, list) else "\t".join([str(vvv) for vvv in vv]) for vv in v])
|
||||
continue
|
||||
if not isinstance(v, type("")):
|
||||
m[n] = str(m[n])
|
||||
if n.find("tks")>0: m[n] = rmSpace(m[n])
|
||||
if n.find("tks") > 0:
|
||||
m[n] = rmSpace(m[n])
|
||||
|
||||
if m:
|
||||
res[d["id"]] = m
|
||||
@ -205,12 +211,16 @@ class Dealer:
|
||||
if pieces[i] == "```":
|
||||
st = i
|
||||
i += 1
|
||||
while i<len(pieces) and pieces[i] != "```":
|
||||
while i < len(pieces) and pieces[i] != "```":
|
||||
i += 1
|
||||
if i < len(pieces): i += 1
|
||||
pieces_.append("".join(pieces[st: i])+"\n")
|
||||
if i < len(pieces):
|
||||
i += 1
|
||||
pieces_.append("".join(pieces[st: i]) + "\n")
|
||||
else:
|
||||
pieces_.extend(re.split(r"([^\|][;。?!!\n]|[a-z][.?;!][ \n])", pieces[i]))
|
||||
pieces_.extend(
|
||||
re.split(
|
||||
r"([^\|][;。?!!\n]|[a-z][.?;!][ \n])",
|
||||
pieces[i]))
|
||||
i += 1
|
||||
pieces = pieces_
|
||||
else:
|
||||
@ -234,7 +244,8 @@ class Dealer:
|
||||
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
|
||||
len(ans_v[0]), len(chunk_v[0]))
|
||||
|
||||
chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") for ck in chunks]
|
||||
chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ")
|
||||
for ck in chunks]
|
||||
cites = {}
|
||||
for i, a in enumerate(pieces_):
|
||||
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
|
||||
@ -258,9 +269,11 @@ class Dealer:
|
||||
continue
|
||||
if i not in cites:
|
||||
continue
|
||||
for c in cites[i]: assert int(c) < len(chunk_v)
|
||||
for c in cites[i]:
|
||||
if c in seted:continue
|
||||
assert int(c) < len(chunk_v)
|
||||
for c in cites[i]:
|
||||
if c in seted:
|
||||
continue
|
||||
res += f" ##{c}$$"
|
||||
seted.add(c)
|
||||
|
||||
@ -343,7 +356,11 @@ class Dealer:
|
||||
if dnm not in ranks["doc_aggs"]:
|
||||
ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
|
||||
ranks["doc_aggs"][dnm]["count"] += 1
|
||||
ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
|
||||
ranks["doc_aggs"] = [{"doc_name": k,
|
||||
"doc_id": v["doc_id"],
|
||||
"count": v["count"]} for k,
|
||||
v in sorted(ranks["doc_aggs"].items(),
|
||||
key=lambda x:x[1]["count"] * -1)]
|
||||
|
||||
return ranks
|
||||
|
||||
@ -354,10 +371,17 @@ class Dealer:
|
||||
replaces = []
|
||||
for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
|
||||
fld, v = r.group(1), r.group(3)
|
||||
match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(fld, huqie.qieqie(huqie.qie(v)))
|
||||
replaces.append(("{}{}'{}'".format(r.group(1), r.group(2), r.group(3)), match))
|
||||
match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
|
||||
fld, huqie.qieqie(huqie.qie(v)))
|
||||
replaces.append(
|
||||
("{}{}'{}'".format(
|
||||
r.group(1),
|
||||
r.group(2),
|
||||
r.group(3)),
|
||||
match))
|
||||
|
||||
for p, r in replaces: sql = sql.replace(p, r, 1)
|
||||
for p, r in replaces:
|
||||
sql = sql.replace(p, r, 1)
|
||||
chat_logger.info(f"To es: {sql}")
|
||||
|
||||
try:
|
||||
@ -366,4 +390,3 @@ class Dealer:
|
||||
except Exception as e:
|
||||
chat_logger.error(f"SQL failure: {sql} =>" + str(e))
|
||||
return {"error": str(e)}
|
||||
|
||||
|
||||
@ -150,8 +150,10 @@ class Dealer:
|
||||
return 6
|
||||
|
||||
def ner(t):
|
||||
if re.match(r"[0-9,.]{2,}$", t): return 2
|
||||
if re.match(r"[a-z]{1,2}$", t): return 0.01
|
||||
if re.match(r"[0-9,.]{2,}$", t):
|
||||
return 2
|
||||
if re.match(r"[a-z]{1,2}$", t):
|
||||
return 0.01
|
||||
if not self.ne or t not in self.ne:
|
||||
return 1
|
||||
m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
import os
|
||||
from api.utils import get_base_config,decrypt_database_config
|
||||
from api.utils import get_base_config, decrypt_database_config
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from api.utils.log_utils import LoggerFactory, getLogger
|
||||
|
||||
@ -28,7 +28,11 @@ MINIO = decrypt_database_config(name="minio")
|
||||
DOC_MAXIMUM_SIZE = 128 * 1024 * 1024
|
||||
|
||||
# Logger
|
||||
LoggerFactory.set_directory(os.path.join(get_project_base_directory(), "logs", "rag"))
|
||||
LoggerFactory.set_directory(
|
||||
os.path.join(
|
||||
get_project_base_directory(),
|
||||
"logs",
|
||||
"rag"))
|
||||
# {CRITICAL: 50, FATAL:50, ERROR:40, WARNING:30, WARN:30, INFO:20, DEBUG:10, NOTSET:0}
|
||||
LoggerFactory.LEVEL = 10
|
||||
|
||||
@ -37,4 +41,3 @@ minio_logger = getLogger("minio")
|
||||
cron_logger = getLogger("cron_logger")
|
||||
chunk_logger = getLogger("chunk_logger")
|
||||
database_logger = getLogger("database")
|
||||
|
||||
|
||||
@ -47,7 +47,7 @@ def collect(tm):
|
||||
def set_dispatching(docid):
|
||||
try:
|
||||
DocumentService.update_by_id(
|
||||
docid, {"progress": random.random()*1 / 100.,
|
||||
docid, {"progress": random.random() * 1 / 100.,
|
||||
"progress_msg": "Task dispatched...",
|
||||
"process_begin_at": get_format_time()
|
||||
})
|
||||
@ -56,7 +56,10 @@ def set_dispatching(docid):
|
||||
|
||||
|
||||
def dispatch():
|
||||
tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"broker.tm")
|
||||
tm_fnm = os.path.join(
|
||||
get_project_base_directory(),
|
||||
"rag/res",
|
||||
f"broker.tm")
|
||||
tm = findMaxTm(tm_fnm)
|
||||
rows = collect(tm)
|
||||
if len(rows) == 0:
|
||||
@ -82,17 +85,22 @@ def dispatch():
|
||||
tsks = []
|
||||
if r["type"] == FileType.PDF.value:
|
||||
do_layout = r["parser_config"].get("layout_recognize", True)
|
||||
pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
|
||||
pages = PdfParser.total_page_number(
|
||||
r["name"], MINIO.get(r["kb_id"], r["location"]))
|
||||
page_size = r["parser_config"].get("task_page_size", 12)
|
||||
if r["parser_id"] == "paper": page_size = r["parser_config"].get("task_page_size", 22)
|
||||
if r["parser_id"] == "one": page_size = 1000000000
|
||||
if not do_layout: page_size = 1000000000
|
||||
if r["parser_id"] == "paper":
|
||||
page_size = r["parser_config"].get("task_page_size", 22)
|
||||
if r["parser_id"] == "one":
|
||||
page_size = 1000000000
|
||||
if not do_layout:
|
||||
page_size = 1000000000
|
||||
page_ranges = r["parser_config"].get("pages")
|
||||
if not page_ranges: page_ranges = [(1, 100000)]
|
||||
for s,e in page_ranges:
|
||||
if not page_ranges:
|
||||
page_ranges = [(1, 100000)]
|
||||
for s, e in page_ranges:
|
||||
s -= 1
|
||||
s = max(0, s)
|
||||
e = min(e-1, pages)
|
||||
e = min(e - 1, pages)
|
||||
for p in range(s, e, page_size):
|
||||
task = new_task()
|
||||
task["from_page"] = p
|
||||
@ -100,12 +108,14 @@ def dispatch():
|
||||
tsks.append(task)
|
||||
|
||||
elif r["parser_id"] == "table":
|
||||
rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
|
||||
for i in range(0, rn, 3000):
|
||||
task = new_task()
|
||||
task["from_page"] = i
|
||||
task["to_page"] = min(i + 3000, rn)
|
||||
tsks.append(task)
|
||||
rn = HuExcelParser.row_number(
|
||||
r["name"], MINIO.get(
|
||||
r["kb_id"], r["location"]))
|
||||
for i in range(0, rn, 3000):
|
||||
task = new_task()
|
||||
task["from_page"] = i
|
||||
task["to_page"] = min(i + 3000, rn)
|
||||
tsks.append(task)
|
||||
else:
|
||||
tsks.append(new_task())
|
||||
|
||||
@ -120,27 +130,37 @@ def update_progress():
|
||||
for d in docs:
|
||||
try:
|
||||
tsks = TaskService.query(doc_id=d["id"], order_by=Task.create_time)
|
||||
if not tsks:continue
|
||||
if not tsks:
|
||||
continue
|
||||
msg = []
|
||||
prg = 0
|
||||
finished = True
|
||||
bad = 0
|
||||
status = TaskStatus.RUNNING.value
|
||||
for t in tsks:
|
||||
if 0 <= t.progress < 1: finished = False
|
||||
if 0 <= t.progress < 1:
|
||||
finished = False
|
||||
prg += t.progress if t.progress >= 0 else 0
|
||||
msg.append(t.progress_msg)
|
||||
if t.progress == -1: bad += 1
|
||||
if t.progress == -1:
|
||||
bad += 1
|
||||
prg /= len(tsks)
|
||||
if finished and bad:
|
||||
prg = -1
|
||||
status = TaskStatus.FAIL.value
|
||||
elif finished: status = TaskStatus.DONE.value
|
||||
elif finished:
|
||||
status = TaskStatus.DONE.value
|
||||
|
||||
msg = "\n".join(msg)
|
||||
info = {"process_duation": datetime.timestamp(datetime.now())-d["process_begin_at"].timestamp(), "run": status}
|
||||
if prg !=0 : info["progress"] = prg
|
||||
if msg: info["progress_msg"] = msg
|
||||
info = {
|
||||
"process_duation": datetime.timestamp(
|
||||
datetime.now()) -
|
||||
d["process_begin_at"].timestamp(),
|
||||
"run": status}
|
||||
if prg != 0:
|
||||
info["progress"] = prg
|
||||
if msg:
|
||||
info["progress_msg"] = msg
|
||||
DocumentService.update_by_id(d["id"], info)
|
||||
except Exception as e:
|
||||
cron_logger.error("fetch task exception:" + str(e))
|
||||
|
||||
@ -67,7 +67,7 @@ FACTORY = {
|
||||
def set_progress(task_id, from_page=0, to_page=-1,
|
||||
prog=None, msg="Processing..."):
|
||||
if prog is not None and prog < 0:
|
||||
msg = "[ERROR]"+msg
|
||||
msg = "[ERROR]" + msg
|
||||
cancel = TaskService.do_cancel(task_id)
|
||||
if cancel:
|
||||
msg += " [Canceled]"
|
||||
@ -188,11 +188,13 @@ def embedding(docs, mdl, parser_config={}, callback=None):
|
||||
|
||||
cnts_ = np.array([])
|
||||
for i in range(0, len(cnts), batch_size):
|
||||
vts, c = mdl.encode(cnts[i: i+batch_size])
|
||||
if len(cnts_) == 0: cnts_ = vts
|
||||
else: cnts_ = np.concatenate((cnts_, vts), axis=0)
|
||||
vts, c = mdl.encode(cnts[i: i + batch_size])
|
||||
if len(cnts_) == 0:
|
||||
cnts_ = vts
|
||||
else:
|
||||
cnts_ = np.concatenate((cnts_, vts), axis=0)
|
||||
tk_count += c
|
||||
callback(prog=0.7+0.2*(i+1)/len(cnts), msg="")
|
||||
callback(prog=0.7 + 0.2 * (i + 1) / len(cnts), msg="")
|
||||
cnts = cnts_
|
||||
|
||||
title_w = float(parser_config.get("filename_embd_weight", 0.1))
|
||||
@ -234,7 +236,9 @@ def main(comm, mod):
|
||||
continue
|
||||
# TODO: exception handler
|
||||
## set_progress(r["did"], -1, "ERROR: ")
|
||||
callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks))
|
||||
callback(
|
||||
msg="Finished slicing files(%d). Start to embedding the content." %
|
||||
len(cks))
|
||||
try:
|
||||
tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
|
||||
except Exception as e:
|
||||
@ -249,7 +253,7 @@ def main(comm, mod):
|
||||
if es_r:
|
||||
callback(-1, "Index failure!")
|
||||
ELASTICSEARCH.deleteByQuery(
|
||||
Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
|
||||
Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
|
||||
cron_logger.error(str(es_r))
|
||||
else:
|
||||
if TaskService.do_cancel(r["id"]):
|
||||
|
||||
Reference in New Issue
Block a user