Apply PEP 8 formatting (#155)
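Editor's summary: the diff below reformats the chunker modules to satisfy PEP 8. The recurring rewrites are: long def chunk(...) signatures and call chains wrapped across continuation lines, one-line "if cond: stmt" bodies expanded onto their own lines, spaces added around operators, and inline comments given standard spacing. A condensed before/after sketch of the pattern, built from lines that appear in the hunks below:

    # Before: flagged by pycodestyle as E501 (line too long), E701
    # (statement on the same line as its colon), and E261/E262
    # (inline comment spacing).
    def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
        eng = lang.lower() == "english"#is_english(sections)
        if binary:txt = binary.decode("utf-8")

    # After: the style this commit applies throughout.
    def chunk(filename, binary=None, from_page=0, to_page=100000,
              lang="Chinese", callback=None, **kwargs):
        eng = lang.lower() == "english"  # is_english(sections)
        if binary:
            txt = binary.decode("utf-8")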
@@ -48,10 +48,12 @@ class Pdf(PdfParser):
 
         callback(0.8, "Text extraction finished")
 
-        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
+                for b in self.boxes], tbls
 
 
-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
         Since a book is long and not all the parts are useful, if it's a PDF,
@@ -63,48 +65,63 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     }
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     pdf_parser = None
-    sections,tbls = [], []
+    sections, tbls = [], []
     if re.search(r"\.docx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         doc_parser = DocxParser()
         # TODO: table of contents need to be removed
-        sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
-        remove_contents_table(sections, eng=is_english(random_choices([t for t,_ in sections], k=200)))
+        sections, tbls = doc_parser(
+            binary if binary else filename, from_page=from_page, to_page=to_page)
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
+        pdf_parser = Pdf() if kwargs.get(
+            "parser_config", {}).get(
+            "layout_recognize", True) else PlainParser()
         sections, tbls = pdf_parser(filename if not binary else binary,
-                        from_page=from_page, to_page=to_page, callback=callback)
+                                    from_page=from_page, to_page=to_page, callback=callback)
 
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
-        if binary:txt = binary.decode("utf-8")
+        if binary:
+            txt = binary.decode("utf-8")
         else:
             with open(filename, "r") as f:
                 while True:
                     l = f.readline()
-                    if not l:break
+                    if not l:
+                        break
                     txt += l
         sections = txt.split("\n")
-        sections = [(l,"") for l in sections if l]
-        remove_contents_table(sections, eng = is_english(random_choices([t for t,_ in sections], k=200)))
+        sections = [(l, "") for l in sections if l]
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
         callback(0.8, "Finish parsing.")
 
-    else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(docx, pdf, txt supported)")
 
     make_colon_as_title(sections)
-    bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
+    bull = bullets_category(
+        [t for t in random_choices([t for t, _ in sections], k=100)])
     if bull >= 0:
-        chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 3)]
+        chunks = ["\n".join(ck)
+                  for ck in hierarchical_merge(bull, sections, 3)]
     else:
-        sections = [s.split("@") for s,_ in sections]
-        sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
-        chunks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
+        sections = [s.split("@") for s, _ in sections]
+        sections = [(pr[0], "@" + pr[1]) for pr in sections if len(pr) == 2]
+        chunks = naive_merge(
+            sections, kwargs.get(
+                "chunk_token_num", 256), kwargs.get(
+                "delimer", "\n。;!?"))
 
     # is it English
-    eng = lang.lower() == "english"#is_english(random_choices([t for t, _ in sections], k=218))
+    # is_english(random_choices([t for t, _ in sections], k=218))
+    eng = lang.lower() == "english"
 
     res = tokenize_table(tbls, doc, eng)
     res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
@@ -114,6 +131,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 
 if __name__ == "__main__":
     import sys
+
     def dummy(prog=None, msg=""):
         pass
     chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
@@ -35,8 +35,10 @@ class Docx(DocxParser):
         pn = 0
         lines = []
         for p in self.doc.paragraphs:
-            if pn > to_page:break
-            if from_page <= pn < to_page and p.text.strip(): lines.append(self.__clean(p.text))
+            if pn > to_page:
+                break
+            if from_page <= pn < to_page and p.text.strip():
+                lines.append(self.__clean(p.text))
             for run in p.runs:
                 if 'lastRenderedPageBreak' in run._element.xml:
                     pn += 1
@@ -63,15 +65,18 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.67, "Layout analysis finished")
-        cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
+        cron_logger.info("paddle layouts:".format(
+            (timer() - start) / (self.total_page + 0.1)))
         self._naive_vertical_merge()
 
         callback(0.8, "Text extraction finished")
 
-        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
+        return [(b["text"], self._line_tag(b, zoomin))
+                for b in self.boxes], None
 
 
-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
     """
@@ -89,41 +94,50 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
-        for txt, poss in pdf_parser(filename if not binary else binary,
-                        from_page=from_page, to_page=to_page, callback=callback)[0]:
-            sections.append(txt + poss)
+        pdf_parser = Pdf() if kwargs.get(
+            "parser_config", {}).get(
+            "layout_recognize", True) else PlainParser()
+        for txt, poss in pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback)[0]:
+            sections.append(txt + poss)
 
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
-        if binary:txt = binary.decode("utf-8")
+        if binary:
+            txt = binary.decode("utf-8")
         else:
             with open(filename, "r") as f:
                 while True:
                     l = f.readline()
-                    if not l:break
+                    if not l:
+                        break
                     txt += l
         sections = txt.split("\n")
         sections = [l for l in sections if l]
         callback(0.8, "Finish parsing.")
-    else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(docx, pdf, txt supported)")
 
     # is it English
-    eng = lang.lower() == "english"#is_english(sections)
+    eng = lang.lower() == "english"  # is_english(sections)
     # Remove 'Contents' part
     remove_contents_table(sections, eng)
 
     make_colon_as_title(sections)
     bull = bullets_category(sections)
     chunks = hierarchical_merge(bull, sections, 3)
-    if not chunks: callback(0.99, "No chunk parsed out.")
+    if not chunks:
+        callback(0.99, "No chunk parsed out.")
 
-    return tokenize_chunks(["\n".join(ck) for ck in chunks], doc, eng, pdf_parser)
+    return tokenize_chunks(["\n".join(ck)
+                            for ck in chunks], doc, eng, pdf_parser)
 
 
 if __name__ == "__main__":
     import sys
+
     def dummy(prog=None, msg=""):
         pass
     chunk(sys.argv[1], callback=dummy)
@@ -25,10 +25,10 @@ class Pdf(PdfParser):
             callback
         )
         callback(msg="OCR finished.")
-        #for bb in self.boxes:
+        # for bb in self.boxes:
         #    for b in bb:
         #        print(b)
-        print("OCR:", timer()-start)
+        print("OCR:", timer() - start)
 
         self._layouts_rec(zoomin)
         callback(0.65, "Layout analysis finished.")
@@ -45,30 +45,35 @@ class Pdf(PdfParser):
         for b in self.boxes:
             b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())
 
-        return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)], tbls
+        return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin))
+                for i, b in enumerate(self.boxes)], tbls
 
 
-
-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
     """
         Only pdf is supported.
     """
     pdf_parser = None
 
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
+        pdf_parser = Pdf() if kwargs.get(
+            "parser_config", {}).get(
+            "layout_recognize", True) else PlainParser()
         sections, tbls = pdf_parser(filename if not binary else binary,
-                        from_page=from_page, to_page=to_page, callback=callback)
-        if sections and len(sections[0])<3: sections = [(t, l, [[0]*5]) for t, l in sections]
+                                    from_page=from_page, to_page=to_page, callback=callback)
+        if sections and len(sections[0]) < 3:
+            sections = [(t, l, [[0] * 5]) for t, l in sections]
 
-    else: raise NotImplementedError("file type not supported yet(pdf supported)")
+    else:
+        raise NotImplementedError("file type not supported yet(pdf supported)")
     doc = {
         "docnm_kwd": filename
     }
     doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
     doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
     # is it English
-    eng = lang.lower() == "english"#pdf_parser.is_english
+    eng = lang.lower() == "english"  # pdf_parser.is_english
 
     # set pivot using the most frequent type of title,
     # then merge between 2 pivot
@@ -79,7 +84,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     for txt, _, _ in sections:
         for t, lvl in pdf_parser.outlines:
             tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
-            tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
+            tks_ = set([txt[i] + txt[i + 1]
+                        for i in range(min(len(t), len(txt) - 1))])
             if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
                 levels.append(lvl)
                 break
@@ -87,24 +93,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             levels.append(max_lvl + 1)
 
     else:
-        bull = bullets_category([txt for txt,_,_ in sections])
-        most_level, levels = title_frequency(bull, [(txt, l) for txt, l, poss in sections])
+        bull = bullets_category([txt for txt, _, _ in sections])
+        most_level, levels = title_frequency(
+            bull, [(txt, l) for txt, l, poss in sections])
 
     assert len(sections) == len(levels)
     sec_ids = []
     sid = 0
     for i, lvl in enumerate(levels):
-        if lvl <= most_level and i > 0 and lvl != levels[i - 1]: sid += 1
+        if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
+            sid += 1
         sec_ids.append(sid)
         # print(lvl, self.boxes[i]["text"], most_level, sid)
 
-    sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)]
+    sections = [(txt, sec_ids[i], poss)
+                for i, (txt, _, poss) in enumerate(sections)]
     for (img, rows), poss in tbls:
         sections.append((rows if isinstance(rows, str) else rows[0], -1,
                          [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
 
     def tag(pn, left, right, top, bottom):
-        if pn+left+right+top+bottom == 0:
+        if pn + left + right + top + bottom == 0:
             return ""
         return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
             .format(pn, left, right, top, bottom)
@@ -112,7 +121,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     chunks = []
     last_sid = -2
     tk_cnt = 0
-    for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
+    for txt, sec_id, poss in sorted(sections, key=lambda x: (
+            x[-1][0][0], x[-1][0][3], x[-1][0][1])):
         poss = "\t".join([tag(*pos) for pos in poss])
         if tk_cnt < 2048 and (sec_id == last_sid or sec_id == -1):
             if chunks:
@@ -121,16 +131,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             continue
         chunks.append(txt + poss)
         tk_cnt = num_tokens_from_string(txt)
-        if sec_id > -1: last_sid = sec_id
+        if sec_id > -1:
+            last_sid = sec_id
 
     res = tokenize_table(tbls, doc, eng)
     res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
     return res
 
 
-
 if __name__ == "__main__":
     import sys
+
     def dummy(prog=None, msg=""):
         pass
     chunk(sys.argv[1], callback=dummy)
@@ -44,11 +44,14 @@ class Pdf(PdfParser):
         tbls = self._extract_table_figure(True, zoomin, True, True)
         self._naive_vertical_merge()
 
-        cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
-        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
+        cron_logger.info("paddle layouts:".format(
+            (timer() - start) / (self.total_page + 0.1)))
+        return [(b["text"], self._line_tag(b, zoomin))
+                for b in self.boxes], tbls
 
 
-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, excel, txt.
         This method apply the naive ways to chunk files.
@@ -56,8 +59,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
     """
 
-    eng = lang.lower() == "english"#is_english(cks)
-    parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
+    eng = lang.lower() == "english"  # is_english(cks)
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
     doc = {
         "docnm_kwd": filename,
         "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -73,9 +78,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if parser_config["layout_recognize"] else PlainParser()
+        pdf_parser = Pdf(
+        ) if parser_config["layout_recognize"] else PlainParser()
         sections, tbls = pdf_parser(filename if not binary else binary,
-                        from_page=from_page, to_page=to_page, callback=callback)
+                                    from_page=from_page, to_page=to_page, callback=callback)
         res = tokenize_table(tbls, doc, eng)
 
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
@@ -92,16 +98,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             with open(filename, "r") as f:
                 while True:
                     l = f.readline()
-                    if not l: break
+                    if not l:
+                        break
                     txt += l
         sections = txt.split("\n")
         sections = [(l, "") for l in sections if l]
         callback(0.8, "Finish parsing.")
 
     else:
-        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+        raise NotImplementedError(
+            "file type not supported yet(docx, pdf, txt supported)")
 
-    chunks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。;!?"))
+    chunks = naive_merge(
+        sections, parser_config.get(
+            "chunk_token_num", 128), parser_config.get(
+            "delimiter", "\n!?。;!?"))
 
     res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
     return res
@@ -110,9 +121,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 if __name__ == "__main__":
     import sys
 
-
     def dummy(prog=None, msg=""):
         pass
 
-
     chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
@@ -41,20 +41,23 @@ class Pdf(PdfParser):
         tbls = self._extract_table_figure(True, zoomin, True, True)
         self._concat_downward()
 
-        sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
+        sections = [(b["text"], self.get_position(b, zoomin))
+                    for i, b in enumerate(self.boxes)]
         for (img, rows), poss in tbls:
             sections.append((rows if isinstance(rows, str) else rows[0],
                              [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
-        return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
+        return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
+            x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
 
 
-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, excel, txt.
         One file forms a chunk which maintains original text order.
     """
 
-    eng = lang.lower() == "english"#is_english(cks)
+    eng = lang.lower() == "english"  # is_english(cks)
 
     if re.search(r"\.docx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
@@ -62,8 +65,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
-        sections, _ = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback)
+        pdf_parser = Pdf() if kwargs.get(
+            "parser_config", {}).get(
+            "layout_recognize", True) else PlainParser()
+        sections, _ = pdf_parser(
+            filename if not binary else binary, to_page=to_page, callback=callback)
         sections = [s for s, _ in sections if s]
 
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
@@ -80,14 +86,16 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
             with open(filename, "r") as f:
                 while True:
                     l = f.readline()
-                    if not l: break
+                    if not l:
+                        break
                     txt += l
         sections = txt.split("\n")
         sections = [s for s in sections if s]
         callback(0.8, "Finish parsing.")
 
     else:
-        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+        raise NotImplementedError(
+            "file type not supported yet(docx, pdf, txt supported)")
 
     doc = {
         "docnm_kwd": filename,
@@ -101,9 +109,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 if __name__ == "__main__":
     import sys
 
-
     def dummy(prog=None, msg=""):
         pass
 
-
     chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
@@ -67,11 +67,11 @@ class Pdf(PdfParser):
 
         if from_page > 0:
             return {
-                "title":"",
+                "title": "",
                 "authors": "",
                 "abstract": "",
                 "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
-                    re.match(r"(text|title)", b.get("layoutno", "text"))],
+                             re.match(r"(text|title)", b.get("layoutno", "text"))],
                 "tables": tbls
             }
         # get title and authors
@@ -87,7 +87,8 @@ class Pdf(PdfParser):
                 title = ""
                 break
             for j in range(3):
-                if _begin(self.boxes[i + j]["text"]): break
+                if _begin(self.boxes[i + j]["text"]):
+                    break
                 authors.append(self.boxes[i + j]["text"])
                 break
             break
@@ -107,10 +108,15 @@ class Pdf(PdfParser):
                 abstr = txt + self._line_tag(self.boxes[i], zoomin)
                 i += 1
                 break
-        if not abstr: i = 0
+        if not abstr:
+            i = 0
 
-        callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
-        for b in self.boxes: print(b["text"], b.get("layoutno"))
+        callback(
+            0.8, "Page {}~{}: Text merging finished".format(
+                from_page, min(
+                    to_page, self.total_page)))
+        for b in self.boxes:
+            print(b["text"], b.get("layoutno"))
         print(tbls)
 
         return {
@@ -118,19 +124,20 @@ class Pdf(PdfParser):
             "authors": " ".join(authors),
             "abstract": abstr,
             "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
-                re.match(r"(text|title)", b.get("layoutno", "text"))],
+                         re.match(r"(text|title)", b.get("layoutno", "text"))],
             "tables": tbls
         }
 
 
-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
     """
         Only pdf is supported.
        The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
    """
    pdf_parser = None
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        if not kwargs.get("parser_config",{}).get("layout_recognize", True):
+        if not kwargs.get("parser_config", {}).get("layout_recognize", True):
            pdf_parser = PlainParser()
            paper = {
                "title": filename,
@@ -143,14 +150,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
            pdf_parser = Pdf()
            paper = pdf_parser(filename if not binary else binary,
                               from_page=from_page, to_page=to_page, callback=callback)
-    else: raise NotImplementedError("file type not supported yet(pdf supported)")
+    else:
+        raise NotImplementedError("file type not supported yet(pdf supported)")
 
    doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
           "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
    # is it English
-    eng = lang.lower() == "english"#pdf_parser.is_english
+    eng = lang.lower() == "english"  # pdf_parser.is_english
    print("It's English.....", eng)
 
    res = tokenize_table(paper["tables"], doc, eng)
@@ -160,7 +168,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        txt = pdf_parser.remove_tag(paper["abstract"])
        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
        d["important_tks"] = " ".join(d["important_kwd"])
-        d["image"], poss = pdf_parser.crop(paper["abstract"], need_position=True)
+        d["image"], poss = pdf_parser.crop(
+            paper["abstract"], need_position=True)
        add_positions(d, poss)
        tokenize(d, txt, eng)
        res.append(d)
@@ -174,7 +183,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    sec_ids = []
    sid = 0
    for i, lvl in enumerate(levels):
-        if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1
+        if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
+            sid += 1
        sec_ids.append(sid)
        print(lvl, sorted_sections[i][0], most_level, sid)
 
@@ -190,6 +200,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    return res
 
+
 """
 readed = [0] * len(paper["lines"])
 # find colon firstly
@@ -212,7 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    for k in range(j, i): readed[k] = True
    txt = txt[::-1]
    if eng:
-        r = re.search(r"(.*?) ([\.;?!]|$)", txt)
+        r = re.search(r"(.*?) ([\\.;?!]|$)", txt)
        txt = r.group(1)[::-1] if r else txt[::-1]
    else:
        r = re.search(r"(.*?) ([。?;!]|$)", txt)
@@ -270,6 +281,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 
 if __name__ == "__main__":
    import sys
+
    def dummy(prog=None, msg=""):
        pass
    chunk(sys.argv[1], callback=dummy)
@@ -33,9 +33,12 @@ class Ppt(PptParser):
         with slides.Presentation(BytesIO(fnm)) as presentation:
             for i, slide in enumerate(presentation.slides[from_page: to_page]):
                 buffered = BytesIO()
-                slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
+                slide.get_thumbnail(
+                    0.5, 0.5).save(
+                    buffered, drawing.imaging.ImageFormat.jpeg)
                 imgs.append(Image.open(buffered))
-        assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
+        assert len(imgs) == len(
+            txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
         callback(0.9, "Image extraction finished")
         self.is_english = is_english(txts)
         return [(txts[i], imgs[i]) for i in range(len(txts))]
@@ -47,25 +50,34 @@ class Pdf(PdfParser):
 
     def __garbage(self, txt):
         txt = txt.lower().strip()
-        if re.match(r"[0-9\.,%/-]+$", txt): return True
-        if len(txt) < 3:return True
+        if re.match(r"[0-9\.,%/-]+$", txt):
+            return True
+        if len(txt) < 3:
+            return True
         return False
 
-    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
         callback(msg="OCR is running...")
-        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
-        callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
-        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
+        self.__images__(filename if not binary else binary,
+                        zoomin, from_page, to_page, callback)
+        callback(0.8, "Page {}~{}: OCR finished".format(
+            from_page, min(to_page, self.total_page)))
+        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
+            len(self.boxes), len(self.page_images))
         res = []
         for i in range(len(self.boxes)):
-            lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
+            lines = "\n".join([b["text"] for b in self.boxes[i]
+                               if not self.__garbage(b["text"])])
            res.append((lines, self.page_images[i]))
-        callback(0.9, "Page {}~{}: Parsing finished".format(from_page, min(to_page, self.total_page)))
+        callback(0.9, "Page {}~{}: Parsing finished".format(
+            from_page, min(to_page, self.total_page)))
        return res
 
 
 class PlainPdf(PlainParser):
-    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, callback=None, **kwargs):
        self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
        page_txt = []
        for page in self.pdf.pages[from_page: to_page]:
@@ -74,7 +86,8 @@ class PlainPdf(PlainParser):
        return [(txt, None) for txt in page_txt]
 
 
-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
    """
    The supported file formats are pdf, pptx.
    Every page will be treated as a chunk. And the thumbnail of every page will be stored.
@@ -89,35 +102,42 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    res = []
    if re.search(r"\.pptx?$", filename, re.IGNORECASE):
        ppt_parser = Ppt()
-        for pn, (txt,img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
+        for pn, (txt, img) in enumerate(ppt_parser(
+                filename if not binary else binary, from_page, 1000000, callback)):
            d = copy.deepcopy(doc)
            pn += from_page
            d["image"] = img
-            d["page_num_int"] = [pn+1]
+            d["page_num_int"] = [pn + 1]
            d["top_int"] = [0]
            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
            tokenize(d, txt, eng)
            res.append(d)
        return res
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainPdf()
-        for pn, (txt,img) in enumerate(pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)):
+        pdf_parser = Pdf() if kwargs.get(
+            "parser_config", {}).get(
+            "layout_recognize", True) else PlainPdf()
+        for pn, (txt, img) in enumerate(pdf_parser(filename, binary,
+                                                   from_page=from_page, to_page=to_page, callback=callback)):
            d = copy.deepcopy(doc)
            pn += from_page
-            if img: d["image"] = img
-            d["page_num_int"] = [pn+1]
+            if img:
+                d["image"] = img
+            d["page_num_int"] = [pn + 1]
            d["top_int"] = [0]
-            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
+            d["position_int"] = [
+                (pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
            tokenize(d, txt, eng)
            res.append(d)
        return res
 
-    raise NotImplementedError("file type not supported yet(pptx, pdf supported)")
+    raise NotImplementedError(
+        "file type not supported yet(pptx, pdf supported)")
 
 
-if __name__== "__main__":
+if __name__ == "__main__":
    import sys
 
    def dummy(a, b):
        pass
    chunk(sys.argv[1], callback=dummy)
@@ -27,6 +27,8 @@ from rag.utils import rmSpace
 forbidden_select_fields4resume = [
     "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
 ]
+
+
 def remote_call(filename, binary):
     q = {
         "header": {
@@ -48,18 +50,22 @@ def remote_call(filename, binary):
     }
     for _ in range(3):
         try:
-            resume = requests.post("http://127.0.0.1:61670/tog", data=json.dumps(q))
+            resume = requests.post(
+                "http://127.0.0.1:61670/tog",
+                data=json.dumps(q))
             resume = resume.json()["response"]["results"]
             resume = refactor(resume)
-            for k in ["education", "work", "project", "training", "skill", "certificate", "language"]:
-                if not resume.get(k) and k in resume: del resume[k]
+            for k in ["education", "work", "project",
+                      "training", "skill", "certificate", "language"]:
+                if not resume.get(k) and k in resume:
+                    del resume[k]
 
             resume = step_one.refactor(pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x",
-                "updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
+                                                      "updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
             resume = step_two.parse(resume)
             return resume
         except Exception as e:
-            cron_logger.error("Resume parser error: "+str(e))
+            cron_logger.error("Resume parser error: " + str(e))
     return {}
 
 
@@ -144,10 +150,13 @@ def chunk(filename, binary=None, callback=None, **kwargs):
     doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
     doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
     for n, _ in field_map.items():
-        if n not in resume:continue
-        if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
+        if n not in resume:
+            continue
+        if isinstance(resume[n], list) and (
+                len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
             resume[n] = resume[n][0]
-        if n.find("_tks")>0: resume[n] = huqie.qieqie(resume[n])
+        if n.find("_tks") > 0:
+            resume[n] = huqie.qieqie(resume[n])
         doc[n] = resume[n]
 
     print(doc)
@@ -25,7 +25,8 @@ from deepdoc.parser import ExcelParser
 
 
 class Excel(ExcelParser):
-    def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None):
+    def __call__(self, fnm, binary=None, from_page=0,
+                 to_page=10000000000, callback=None):
         if not binary:
             wb = load_workbook(fnm)
         else:
@@ -48,8 +49,10 @@ class Excel(ExcelParser):
             data = []
             for i, r in enumerate(rows[1:]):
                 rn += 1
-                if rn-1 < from_page:continue
-                if rn -1>=to_page: break
+                if rn - 1 < from_page:
+                    continue
+                if rn - 1 >= to_page:
+                    break
                 row = [
                     cell.value for ii,
                     cell in enumerate(r) if ii not in missed]
@@ -60,7 +63,7 @@ class Excel(ExcelParser):
             done += 1
             res.append(pd.DataFrame(np.array(data), columns=headers))
 
-        callback(0.3, ("Extract records: {}~{}".format(from_page+1, min(to_page, from_page+rn)) + (
+        callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
             f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
         return res
 
@@ -73,7 +76,8 @@ def trans_datatime(s):
 
 
 def trans_bool(s):
-    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
+    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$",
+                str(s).strip(), flags=re.IGNORECASE):
         return "yes"
     if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
         return "no"
@@ -107,13 +111,14 @@ def column_data_type(arr):
                 arr[i] = trans[ty](str(arr[i]))
             except Exception as e:
                 arr[i] = None
-    #if ty == "text":
+    # if ty == "text":
     #    if len(arr) > 128 and uni / len(arr) < 0.1:
     #        ty = "keyword"
     return arr, ty
 
 
-def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=10000000000,
+          lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.
         For csv or txt file, the delimiter between columns is TAB.
@@ -131,7 +136,12 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = Excel()
-        dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
+        dfs = excel_parser(
+            filename,
+            binary,
+            from_page=from_page,
+            to_page=to_page,
+            callback=callback)
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
@@ -149,8 +159,10 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
         headers = lines[0].split(kwargs.get("delimiter", "\t"))
         rows = []
         for i, line in enumerate(lines[1:]):
-            if i < from_page:continue
-            if i >= to_page: break
+            if i < from_page:
+                continue
+            if i >= to_page:
+                break
             row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
             if len(row) != len(headers):
                 fails.append(str(i))
@@ -181,7 +193,13 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
             del df[n]
     clmns = df.columns.values
     txts = list(copy.deepcopy(clmns))
-    py_clmns = [PY.get_pinyins(re.sub(r"(/.*|([^()]+?)|\([^()]+?\))", "", n), '_')[0] for n in clmns]
+    py_clmns = [
+        PY.get_pinyins(
+            re.sub(
+                r"(/.*|([^()]+?)|\([^()]+?\))",
+                "",
+                n),
+            '_')[0] for n in clmns]
     clmn_tys = []
     for j in range(len(clmns)):
         cln, ty = column_data_type(df[clmns[j]])
@@ -192,7 +210,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
     clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], clmns[i].replace("_", " "))
                  for i in range(len(clmns))]
 
-    eng = lang.lower() == "english"#is_english(txts)
+    eng = lang.lower() == "english"  # is_english(txts)
     for ii, row in df.iterrows():
         d = {
             "docnm_kwd": filename,
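Editor's note: the wrapping style above, with chained .get(...) calls broken after each opening parenthesis and continuation lines aligned under the opening bracket, is characteristic of automated formatting rather than hand editing. The commit message says only "apply pep8 formalize", so the exact tool is an assumption; a sketch of reproducing one of these rewrites with autopep8's aggressive mode:

    # Hedged sketch: that autopep8 (with these options) produced this
    # commit is an assumption; the commit message does not name a tool.
    import autopep8

    before = 'if binary:txt = binary.decode("utf-8")\n'
    print(autopep8.fix_code(before, options={"aggressive": 2}))
    # Expected shape of the output (the E701 fix seen throughout):
    # if binary:
    #     txt = binary.decode("utf-8")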