mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
change callback strategy, add timezone to docker (#96)
This commit is contained in:
@ -26,26 +26,27 @@ class Pdf(PdfParser):
|
||||
filename if not binary else binary,
|
||||
zoomin,
|
||||
from_page,
|
||||
to_page)
|
||||
callback(0.1, "OCR finished")
|
||||
to_page,
|
||||
callback)
|
||||
callback("OCR finished")
|
||||
|
||||
from timeit import default_timer as timer
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.47, "Layout analysis finished")
|
||||
callback(0.67, "Layout analysis finished")
|
||||
print("paddle layouts:", timer() - start)
|
||||
self._table_transformer_job(zoomin)
|
||||
callback(0.68, "Table analysis finished")
|
||||
self._text_merge()
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
self._naive_vertical_merge()
|
||||
self._filter_forpages()
|
||||
self._merge_with_same_bullet()
|
||||
callback(0.75, "Text merging finished.")
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
callback(0.8, "Text extraction finished")
|
||||
|
||||
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls, tbl_poss
|
||||
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
@ -92,7 +93,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
|
||||
if bull >= 0: cks = hierarchical_merge(bull, sections, 3)
|
||||
else:
|
||||
sections = [s.split("@") for s in sections]
|
||||
sections = [s.split("@") for s,_ in sections]
|
||||
sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
|
||||
cks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
|
||||
|
||||
@ -116,6 +117,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
def dummy(a, b):
|
||||
def dummy(prog=None, msg=""):
|
||||
pass
|
||||
chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
|
||||
|
||||
@ -54,13 +54,15 @@ class Pdf(PdfParser):
|
||||
filename if not binary else binary,
|
||||
zoomin,
|
||||
from_page,
|
||||
to_page)
|
||||
callback(0.1, "OCR finished")
|
||||
to_page,
|
||||
callback
|
||||
)
|
||||
callback("OCR finished")
|
||||
|
||||
from timeit import default_timer as timer
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.77, "Layout analysis finished")
|
||||
callback(0.67, "Layout analysis finished")
|
||||
cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
|
||||
self._naive_vertical_merge()
|
||||
|
||||
|
||||
@ -19,20 +19,22 @@ class Pdf(PdfParser):
|
||||
filename if not binary else binary,
|
||||
zoomin,
|
||||
from_page,
|
||||
to_page)
|
||||
callback(0.2, "OCR finished.")
|
||||
to_page,
|
||||
callback
|
||||
)
|
||||
callback("OCR finished.")
|
||||
|
||||
from timeit import default_timer as timer
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.5, "Layout analysis finished.")
|
||||
callback(0.65, "Layout analysis finished.")
|
||||
print("paddle layouts:", timer() - start)
|
||||
self._table_transformer_job(zoomin)
|
||||
callback(0.7, "Table analysis finished.")
|
||||
callback(0.67, "Table analysis finished.")
|
||||
self._text_merge()
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
self._filter_forpages()
|
||||
callback(0.77, "Text merging finished")
|
||||
callback(0.68, "Text merging finished")
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
# clean mess
|
||||
|
||||
@ -26,24 +26,24 @@ class Pdf(PdfParser):
|
||||
filename if not binary else binary,
|
||||
zoomin,
|
||||
from_page,
|
||||
to_page)
|
||||
callback(0.1, "OCR finished")
|
||||
to_page,
|
||||
callback
|
||||
)
|
||||
callback("OCR finished")
|
||||
|
||||
from timeit import default_timer as timer
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.5, "Layout analysis finished.")
|
||||
callback(0.63, "Layout analysis finished.")
|
||||
print("paddle layouts:", timer() - start)
|
||||
self._table_transformer_job(zoomin)
|
||||
callback(0.7, "Table analysis finished.")
|
||||
callback(0.65, "Table analysis finished.")
|
||||
self._text_merge()
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
self._filter_forpages()
|
||||
callback(0.77, "Text merging finished")
|
||||
callback(0.67, "Text merging finished")
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
self._naive_vertical_merge()
|
||||
|
||||
cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
|
||||
#self._naive_vertical_merge()
|
||||
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
|
||||
|
||||
|
||||
|
||||
@ -33,13 +33,15 @@ class Pdf(PdfParser):
|
||||
filename if not binary else binary,
|
||||
zoomin,
|
||||
from_page,
|
||||
to_page)
|
||||
callback(0.2, "OCR finished.")
|
||||
to_page,
|
||||
callback
|
||||
)
|
||||
callback("OCR finished.")
|
||||
|
||||
from timeit import default_timer as timer
|
||||
start = timer()
|
||||
self._layouts_rec(zoomin)
|
||||
callback(0.47, "Layout analysis finished")
|
||||
callback(0.63, "Layout analysis finished")
|
||||
print("paddle layouts:", timer() - start)
|
||||
self._table_transformer_job(zoomin)
|
||||
callback(0.68, "Table analysis finished")
|
||||
|
||||
@ -49,7 +49,7 @@ class Pdf(PdfParser):
|
||||
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
|
||||
callback(msg="OCR is running...")
|
||||
self.__images__(filename if not binary else binary, zoomin, from_page, to_page)
|
||||
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
|
||||
callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
|
||||
assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
|
||||
res = []
|
||||
|
||||
@ -56,6 +56,7 @@ class HuEmbedding(Base):
|
||||
|
||||
|
||||
def encode(self, texts: list, batch_size=32):
|
||||
texts = [t[:2000] for t in texts]
|
||||
token_count = 0
|
||||
for t in texts: token_count += num_tokens_from_string(t)
|
||||
res = []
|
||||
|
||||
@ -114,6 +114,7 @@ def add_positions(d, poss):
|
||||
d["page_num_int"].append(pn+1)
|
||||
d["top_int"].append(top)
|
||||
d["position_int"].append((pn+1, left, right, top, bottom))
|
||||
d["top_int"] = d["top_int"][:1]
|
||||
|
||||
|
||||
def remove_contents_table(sections, eng=False):
|
||||
@ -172,7 +173,7 @@ def hierarchical_merge(bull, sections, depth):
|
||||
|
||||
def not_title(txt):
|
||||
if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt): return False
|
||||
if len(txt) >= 128: return True
|
||||
if len(txt.split(" "))>12 or (txt.find(" ")<0 and len(txt)) >= 32: return True
|
||||
return re.search(r"[,;,。;!!]", txt)
|
||||
|
||||
for i, (txt, layout) in enumerate(sections):
|
||||
@ -181,12 +182,12 @@ def hierarchical_merge(bull, sections, depth):
|
||||
levels[j].append(i)
|
||||
break
|
||||
else:
|
||||
if re.search(r"(title|head)", layout):
|
||||
if re.search(r"(title|head)", layout) and not not_title(txt):
|
||||
levels[bullets_size].append(i)
|
||||
else:
|
||||
levels[bullets_size + 1].append(i)
|
||||
sections = [t for t, _ in sections]
|
||||
for s in sections: print("--", s)
|
||||
#for s in sections: print("--", s)
|
||||
|
||||
def binary_search(arr, target):
|
||||
if not arr: return -1
|
||||
@ -220,11 +221,29 @@ def hierarchical_merge(bull, sections, depth):
|
||||
if jj > cks[-1][-1]: cks[-1].pop(-1)
|
||||
cks[-1].append(levels[ii][jj])
|
||||
for ii in cks[-1]: readed[ii] = True
|
||||
|
||||
if not cks:return cks
|
||||
|
||||
for i in range(len(cks)):
|
||||
cks[i] = [sections[j] for j in cks[i][::-1]]
|
||||
print("--------------\n", "\n* ".join(cks[i]))
|
||||
|
||||
return cks
|
||||
res = [[]]
|
||||
num = [0]
|
||||
for ck in cks:
|
||||
if len(ck) == 1:
|
||||
n = num_tokens_from_string(re.sub(r"@@[0-9]+.*", "", ck[0]))
|
||||
if n + num[-1] < 218:
|
||||
res[-1].append(ck[0])
|
||||
num[-1] += n
|
||||
continue
|
||||
res.append(ck)
|
||||
num.append(n)
|
||||
continue
|
||||
res.append(ck)
|
||||
num.append(218)
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
||||
|
||||
@ -46,7 +46,7 @@ def collect(tm):
|
||||
def set_dispatching(docid):
|
||||
try:
|
||||
DocumentService.update_by_id(
|
||||
docid, {"progress": random.randint(0, 3) / 100.,
|
||||
docid, {"progress": random.random()*1 / 100.,
|
||||
"progress_msg": "Task dispatched...",
|
||||
"process_begin_at": get_format_time()
|
||||
})
|
||||
|
||||
@ -72,7 +72,8 @@ def set_progress(task_id, from_page=0, to_page=-1,
|
||||
prog = -1
|
||||
|
||||
if to_page > 0:
|
||||
msg = f"Page({from_page}~{to_page}): " + msg
|
||||
if msg:
|
||||
msg = f"Page({from_page}~{to_page}): " + msg
|
||||
d = {"progress_msg": msg}
|
||||
if prog is not None:
|
||||
d["progress"] = prog
|
||||
@ -168,7 +169,7 @@ def init_kb(row):
|
||||
open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
|
||||
|
||||
|
||||
def embedding(docs, mdl, parser_config={}):
|
||||
def embedding(docs, mdl, parser_config={}, callback=None):
|
||||
tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
|
||||
d["content_with_weight"] for d in docs]
|
||||
tk_count = 0
|
||||
@ -176,8 +177,14 @@ def embedding(docs, mdl, parser_config={}):
|
||||
tts, c = mdl.encode(tts)
|
||||
tk_count += c
|
||||
|
||||
cnts, c = mdl.encode(cnts)
|
||||
tk_count += c
|
||||
cnts_ = []
|
||||
for i in range(0, len(cnts), 32):
|
||||
vts, c = mdl.encode(cnts[i: i+32])
|
||||
cnts_.extend(vts)
|
||||
tk_count += c
|
||||
callback(msg="")
|
||||
cnts = cnts_
|
||||
|
||||
title_w = float(parser_config.get("filename_embd_weight", 0.1))
|
||||
vects = (title_w * tts + (1 - title_w) *
|
||||
cnts) if len(tts) == len(cnts) else cnts
|
||||
@ -218,10 +225,11 @@ def main(comm, mod):
|
||||
# TODO: exception handler
|
||||
## set_progress(r["did"], -1, "ERROR: ")
|
||||
try:
|
||||
tk_count = embedding(cks, embd_mdl, r["parser_config"])
|
||||
tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
|
||||
except Exception as e:
|
||||
callback(-1, "Embedding error:{}".format(str(e)))
|
||||
cron_logger.error(str(e))
|
||||
tk_count = 0
|
||||
|
||||
callback(msg="Finished embedding! Start to build index!")
|
||||
init_kb(r)
|
||||
|
||||
Reference in New Issue
Block a user