apply pep8 formalize (#155)

This commit is contained in:
KevinHuSh
2024-03-27 11:33:46 +08:00
committed by GitHub
parent a02e836790
commit fd7fcb5baf
55 changed files with 1568 additions and 753 deletions

View File

@ -48,10 +48,12 @@ class Pdf(PdfParser):
callback(0.8, "Text extraction finished")
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
for b in self.boxes], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
@ -63,48 +65,63 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
}
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
pdf_parser = None
sections,tbls = [], []
sections, tbls = [], []
if re.search(r"\.docx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
doc_parser = DocxParser()
# TODO: table of contents need to be removed
sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
remove_contents_table(sections, eng=is_english(random_choices([t for t,_ in sections], k=200)))
sections, tbls = doc_parser(
binary if binary else filename, from_page=from_page, to_page=to_page)
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
pdf_parser = Pdf() if kwargs.get(
"parser_config", {}).get(
"layout_recognize", True) else PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
from_page=from_page, to_page=to_page, callback=callback)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = ""
if binary:txt = binary.decode("utf-8")
if binary:
txt = binary.decode("utf-8")
else:
with open(filename, "r") as f:
while True:
l = f.readline()
if not l:break
if not l:
break
txt += l
sections = txt.split("\n")
sections = [(l,"") for l in sections if l]
remove_contents_table(sections, eng = is_english(random_choices([t for t,_ in sections], k=200)))
sections = [(l, "") for l in sections if l]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
else:
raise NotImplementedError(
"file type not supported yet(docx, pdf, txt supported)")
make_colon_as_title(sections)
bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
bull = bullets_category(
[t for t in random_choices([t for t, _ in sections], k=100)])
if bull >= 0:
chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 3)]
chunks = ["\n".join(ck)
for ck in hierarchical_merge(bull, sections, 3)]
else:
sections = [s.split("@") for s,_ in sections]
sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
chunks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。;!?"))
sections = [s.split("@") for s, _ in sections]
sections = [(pr[0], "@" + pr[1]) for pr in sections if len(pr) == 2]
chunks = naive_merge(
sections, kwargs.get(
"chunk_token_num", 256), kwargs.get(
"delimer", "\n。;!?"))
# is it English
eng = lang.lower() == "english"#is_english(random_choices([t for t, _ in sections], k=218))
# is_english(random_choices([t for t, _ in sections], k=218))
eng = lang.lower() == "english"
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
@ -114,6 +131,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
# Manual run: chunk the file named by the first command-line argument,
# limited to the page window from_page=1 .. to_page=10.
if __name__ == "__main__":
import sys
# No-op stand-in for the progress callback; it accepts the (prog, msg)
# arguments positionally or by keyword and discards them.
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)

View File

@ -35,8 +35,10 @@ class Docx(DocxParser):
pn = 0
lines = []
for p in self.doc.paragraphs:
if pn > to_page:break
if from_page <= pn < to_page and p.text.strip(): lines.append(self.__clean(p.text))
if pn > to_page:
break
if from_page <= pn < to_page and p.text.strip():
lines.append(self.__clean(p.text))
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
pn += 1
@ -63,15 +65,18 @@ class Pdf(PdfParser):
start = timer()
self._layouts_rec(zoomin)
callback(0.67, "Layout analysis finished")
cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
cron_logger.info("paddle layouts:".format(
(timer() - start) / (self.total_page + 0.1)))
self._naive_vertical_merge()
callback(0.8, "Text extraction finished")
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], None
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
"""
@ -89,41 +94,50 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
for txt, poss in pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)[0]:
sections.append(txt + poss)
pdf_parser = Pdf() if kwargs.get(
"parser_config", {}).get(
"layout_recognize", True) else PlainParser()
for txt, poss in pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)[0]:
sections.append(txt + poss)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = ""
if binary:txt = binary.decode("utf-8")
if binary:
txt = binary.decode("utf-8")
else:
with open(filename, "r") as f:
while True:
l = f.readline()
if not l:break
if not l:
break
txt += l
sections = txt.split("\n")
sections = [l for l in sections if l]
callback(0.8, "Finish parsing.")
else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
else:
raise NotImplementedError(
"file type not supported yet(docx, pdf, txt supported)")
# is it English
eng = lang.lower() == "english"#is_english(sections)
eng = lang.lower() == "english" # is_english(sections)
# Remove 'Contents' part
remove_contents_table(sections, eng)
make_colon_as_title(sections)
bull = bullets_category(sections)
chunks = hierarchical_merge(bull, sections, 3)
if not chunks: callback(0.99, "No chunk parsed out.")
if not chunks:
callback(0.99, "No chunk parsed out.")
return tokenize_chunks(["\n".join(ck) for ck in chunks], doc, eng, pdf_parser)
return tokenize_chunks(["\n".join(ck)
for ck in chunks], doc, eng, pdf_parser)
# Manual run: chunk the file named by the first command-line argument
# using the default page window of chunk().
if __name__ == "__main__":
import sys
# No-op stand-in for the progress callback; it accepts the (prog, msg)
# arguments positionally or by keyword and discards them.
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)

View File

@ -25,10 +25,10 @@ class Pdf(PdfParser):
callback
)
callback(msg="OCR finished.")
#for bb in self.boxes:
# for bb in self.boxes:
# for b in bb:
# print(b)
print("OCR:", timer()-start)
print("OCR:", timer() - start)
self._layouts_rec(zoomin)
callback(0.65, "Layout analysis finished.")
@ -45,30 +45,35 @@ class Pdf(PdfParser):
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)], tbls
return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
"""
pdf_parser = None
if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
pdf_parser = Pdf() if kwargs.get(
"parser_config", {}).get(
"layout_recognize", True) else PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
if sections and len(sections[0])<3: sections = [(t, l, [[0]*5]) for t, l in sections]
from_page=from_page, to_page=to_page, callback=callback)
if sections and len(sections[0]) < 3:
sections = [(t, l, [[0] * 5]) for t, l in sections]
else: raise NotImplementedError("file type not supported yet(pdf supported)")
else:
raise NotImplementedError("file type not supported yet(pdf supported)")
doc = {
"docnm_kwd": filename
}
doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
# is it English
eng = lang.lower() == "english"#pdf_parser.is_english
eng = lang.lower() == "english" # pdf_parser.is_english
# set pivot using the most frequent type of title,
# then merge between 2 pivot
@ -79,7 +84,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
for txt, _, _ in sections:
for t, lvl in pdf_parser.outlines:
tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
tks_ = set([txt[i] + txt[i + 1]
for i in range(min(len(t), len(txt) - 1))])
if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
levels.append(lvl)
break
@ -87,24 +93,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
levels.append(max_lvl + 1)
else:
bull = bullets_category([txt for txt,_,_ in sections])
most_level, levels = title_frequency(bull, [(txt, l) for txt, l, poss in sections])
bull = bullets_category([txt for txt, _, _ in sections])
most_level, levels = title_frequency(
bull, [(txt, l) for txt, l, poss in sections])
assert len(sections) == len(levels)
sec_ids = []
sid = 0
for i, lvl in enumerate(levels):
if lvl <= most_level and i > 0 and lvl != levels[i - 1]: sid += 1
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
sid += 1
sec_ids.append(sid)
# print(lvl, self.boxes[i]["text"], most_level, sid)
sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)]
sections = [(txt, sec_ids[i], poss)
for i, (txt, _, poss) in enumerate(sections)]
for (img, rows), poss in tbls:
sections.append((rows if isinstance(rows, str) else rows[0], -1,
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
def tag(pn, left, right, top, bottom):
    """Render one position tuple as an inline ``@@...##`` marker.

    Args:
        pn: Page number, written as-is.
        left, right, top, bottom: Box coordinates, written with one
            decimal place each.

    Returns:
        An empty string when the five values sum to zero (this is how an
        all-zero placeholder position is recognised); otherwise the marker
        ``@@<pn>`` followed by the four tab-separated coordinates and ``##``.
    """
    # An all-zero position carries no location information, so no tag is
    # emitted for it.
    if pn + left + right + top + bottom == 0:
        return ""
    return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
        .format(pn, left, right, top, bottom)
@ -112,7 +121,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
chunks = []
last_sid = -2
tk_cnt = 0
for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
for txt, sec_id, poss in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1])):
poss = "\t".join([tag(*pos) for pos in poss])
if tk_cnt < 2048 and (sec_id == last_sid or sec_id == -1):
if chunks:
@ -121,16 +131,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
continue
chunks.append(txt + poss)
tk_cnt = num_tokens_from_string(txt)
if sec_id > -1: last_sid = sec_id
if sec_id > -1:
last_sid = sec_id
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
# Manual run: chunk the file named by the first command-line argument
# using the default page window of chunk().
if __name__ == "__main__":
import sys
# No-op stand-in for the progress callback; it accepts the (prog, msg)
# arguments positionally or by keyword and discards them.
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)

View File

@ -44,11 +44,14 @@ class Pdf(PdfParser):
tbls = self._extract_table_figure(True, zoomin, True, True)
self._naive_vertical_merge()
cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
cron_logger.info("paddle layouts:".format(
(timer() - start) / (self.total_page + 0.1)))
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt.
This method apply the naive ways to chunk files.
@ -56,8 +59,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
"""
eng = lang.lower() == "english"#is_english(cks)
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
eng = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
doc = {
"docnm_kwd": filename,
"title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@ -73,9 +78,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() if parser_config["layout_recognize"] else PlainParser()
pdf_parser = Pdf(
) if parser_config["layout_recognize"] else PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
from_page=from_page, to_page=to_page, callback=callback)
res = tokenize_table(tbls, doc, eng)
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
@ -92,16 +98,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
with open(filename, "r") as f:
while True:
l = f.readline()
if not l: break
if not l:
break
txt += l
sections = txt.split("\n")
sections = [(l, "") for l in sections if l]
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
raise NotImplementedError(
"file type not supported yet(docx, pdf, txt supported)")
chunks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。;!?"))
chunks = naive_merge(
sections, parser_config.get(
"chunk_token_num", 128), parser_config.get(
"delimiter", "\n!?。;!?"))
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
@ -110,9 +121,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
# Manual run: chunk the file named by the first command-line argument,
# limited to the page window from_page=0 .. to_page=10.
if __name__ == "__main__":
import sys
# No-op stand-in for the progress callback; it accepts the (prog, msg)
# arguments positionally or by keyword and discards them.
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

View File

@ -41,20 +41,23 @@ class Pdf(PdfParser):
tbls = self._extract_table_figure(True, zoomin, True, True)
self._concat_downward()
sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
sections = [(b["text"], self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)]
for (img, rows), poss in tbls:
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt.
One file forms a chunk which maintains original text order.
"""
eng = lang.lower() == "english"#is_english(cks)
eng = lang.lower() == "english" # is_english(cks)
if re.search(r"\.docx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
@ -62,8 +65,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
sections, _ = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback)
pdf_parser = Pdf() if kwargs.get(
"parser_config", {}).get(
"layout_recognize", True) else PlainParser()
sections, _ = pdf_parser(
filename if not binary else binary, to_page=to_page, callback=callback)
sections = [s for s, _ in sections if s]
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
@ -80,14 +86,16 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
with open(filename, "r") as f:
while True:
l = f.readline()
if not l: break
if not l:
break
txt += l
sections = txt.split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
raise NotImplementedError(
"file type not supported yet(docx, pdf, txt supported)")
doc = {
"docnm_kwd": filename,
@ -101,9 +109,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
# Manual run: chunk the file named by the first command-line argument,
# limited to the page window from_page=0 .. to_page=10.
if __name__ == "__main__":
import sys
# No-op stand-in for the progress callback; it accepts the (prog, msg)
# arguments positionally or by keyword and discards them.
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

View File

@ -67,11 +67,11 @@ class Pdf(PdfParser):
if from_page > 0:
return {
"title":"",
"title": "",
"authors": "",
"abstract": "",
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
re.match(r"(text|title)", b.get("layoutno", "text"))],
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
# get title and authors
@ -87,7 +87,8 @@ class Pdf(PdfParser):
title = ""
break
for j in range(3):
if _begin(self.boxes[i + j]["text"]): break
if _begin(self.boxes[i + j]["text"]):
break
authors.append(self.boxes[i + j]["text"])
break
break
@ -107,10 +108,15 @@ class Pdf(PdfParser):
abstr = txt + self._line_tag(self.boxes[i], zoomin)
i += 1
break
if not abstr: i = 0
if not abstr:
i = 0
callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
for b in self.boxes: print(b["text"], b.get("layoutno"))
callback(
0.8, "Page {}~{}: Text merging finished".format(
from_page, min(
to_page, self.total_page)))
for b in self.boxes:
print(b["text"], b.get("layoutno"))
print(tbls)
return {
@ -118,19 +124,20 @@ class Pdf(PdfParser):
"authors": " ".join(authors),
"abstract": abstr,
"sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
re.match(r"(text|title)", b.get("layoutno", "text"))],
re.match(r"(text|title)", b.get("layoutno", "text"))],
"tables": tbls
}
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
"""
pdf_parser = None
if re.search(r"\.pdf$", filename, re.IGNORECASE):
if not kwargs.get("parser_config",{}).get("layout_recognize", True):
if not kwargs.get("parser_config", {}).get("layout_recognize", True):
pdf_parser = PlainParser()
paper = {
"title": filename,
@ -143,14 +150,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
else: raise NotImplementedError("file type not supported yet(pdf supported)")
else:
raise NotImplementedError("file type not supported yet(pdf supported)")
doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
"title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
# is it English
eng = lang.lower() == "english"#pdf_parser.is_english
eng = lang.lower() == "english" # pdf_parser.is_english
print("It's English.....", eng)
res = tokenize_table(paper["tables"], doc, eng)
@ -160,7 +168,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
txt = pdf_parser.remove_tag(paper["abstract"])
d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
d["important_tks"] = " ".join(d["important_kwd"])
d["image"], poss = pdf_parser.crop(paper["abstract"], need_position=True)
d["image"], poss = pdf_parser.crop(
paper["abstract"], need_position=True)
add_positions(d, poss)
tokenize(d, txt, eng)
res.append(d)
@ -174,7 +183,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
sec_ids = []
sid = 0
for i, lvl in enumerate(levels):
if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1
if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
sid += 1
sec_ids.append(sid)
print(lvl, sorted_sections[i][0], most_level, sid)
@ -190,6 +200,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
return res
"""
readed = [0] * len(paper["lines"])
# find colon firstly
@ -212,7 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
for k in range(j, i): readed[k] = True
txt = txt[::-1]
if eng:
r = re.search(r"(.*?) ([\.;?!]|$)", txt)
r = re.search(r"(.*?) ([\\.;?!]|$)", txt)
txt = r.group(1)[::-1] if r else txt[::-1]
else:
r = re.search(r"(.*?) ([。?;!]|$)", txt)
@ -270,6 +281,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
# Manual run: chunk the file named by the first command-line argument
# using the default page window of chunk().
if __name__ == "__main__":
import sys
# No-op stand-in for the progress callback; it accepts the (prog, msg)
# arguments positionally or by keyword and discards them.
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)

View File

@ -33,9 +33,12 @@ class Ppt(PptParser):
with slides.Presentation(BytesIO(fnm)) as presentation:
for i, slide in enumerate(presentation.slides[from_page: to_page]):
buffered = BytesIO()
slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
slide.get_thumbnail(
0.5, 0.5).save(
buffered, drawing.imaging.ImageFormat.jpeg)
imgs.append(Image.open(buffered))
assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
assert len(imgs) == len(
txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
callback(0.9, "Image extraction finished")
self.is_english = is_english(txts)
return [(txts[i], imgs[i]) for i in range(len(txts))]
@ -47,25 +50,34 @@ class Pdf(PdfParser):
def __garbage(self, txt):
    """Tell whether a text line should be dropped as noise.

    Args:
        txt: Raw text of one box.

    Returns:
        True when the lower-cased, stripped text consists only of digits
        and the characters ``. , % / -``, or is shorter than three
        characters; False otherwise.
    """
    txt = txt.lower().strip()
    # Purely numeric/punctuation tokens carry no content worth indexing.
    if re.match(r"[0-9\.,%/-]+$", txt):
        return True
    # Anything left that is under three characters is treated as noise too.
    return len(txt) < 3
def __call__(self, filename, binary=None, from_page=0,
             to_page=100000, zoomin=3, callback=None):
    """Run OCR over the requested pages and return one entry per page.

    Args:
        filename: Path of the document; used when ``binary`` is falsy.
        binary: Optional in-memory content; takes precedence over
            ``filename`` when truthy.
        from_page: First page passed to ``__images__``.
        to_page: Page limit passed to ``__images__``.
        zoomin: Zoom factor passed to ``__images__``.
        callback: Progress reporter, called both as ``callback(msg=...)``
            and as ``callback(progress, message)``. It is invoked
            unconditionally, so callers must supply a callable.

    Returns:
        A list of ``(text, image)`` tuples, one per page, where ``text`` is
        the newline-joined text of the page's boxes that are not rejected
        by ``__garbage`` and ``image`` is the matching page image.

    Raises:
        AssertionError: If the number of box groups differs from the
            number of page images after OCR.
    """
    callback(msg="OCR is running...")
    self.__images__(filename if not binary else binary,
                    zoomin, from_page, to_page, callback)

    # Both progress messages report the same page window; build it once.
    page_range = "Page {}~{}".format(
        from_page, min(to_page, self.total_page))
    callback(0.8, page_range + ": OCR finished")

    assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
        len(self.boxes), len(self.page_images))

    res = []
    # self.boxes holds one list of boxes per page, aligned with page_images.
    for page_boxes, page_image in zip(self.boxes, self.page_images):
        lines = "\n".join([b["text"] for b in page_boxes
                           if not self.__garbage(b["text"])])
        res.append((lines, page_image))

    callback(0.9, page_range + ": Parsing finished")
    return res
class PlainPdf(PlainParser):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, callback=None, **kwargs):
self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
page_txt = []
for page in self.pdf.pages[from_page: to_page]:
@ -74,7 +86,8 @@ class PlainPdf(PlainParser):
return [(txt, None) for txt in page_txt]
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
"""
The supported file formats are pdf, pptx.
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
@ -89,35 +102,42 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
res = []
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
ppt_parser = Ppt()
for pn, (txt,img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
for pn, (txt, img) in enumerate(ppt_parser(
filename if not binary else binary, from_page, 1000000, callback)):
d = copy.deepcopy(doc)
pn += from_page
d["image"] = img
d["page_num_int"] = [pn+1]
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
tokenize(d, txt, eng)
res.append(d)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainPdf()
for pn, (txt,img) in enumerate(pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)):
pdf_parser = Pdf() if kwargs.get(
"parser_config", {}).get(
"layout_recognize", True) else PlainPdf()
for pn, (txt, img) in enumerate(pdf_parser(filename, binary,
from_page=from_page, to_page=to_page, callback=callback)):
d = copy.deepcopy(doc)
pn += from_page
if img: d["image"] = img
d["page_num_int"] = [pn+1]
if img:
d["image"] = img
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
d["position_int"] = [
(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
tokenize(d, txt, eng)
res.append(d)
return res
raise NotImplementedError("file type not supported yet(pptx, pdf supported)")
raise NotImplementedError(
"file type not supported yet(pptx, pdf supported)")
if __name__== "__main__":
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
    """No-op progress callback for command-line runs.

    The parsers report progress in two forms: positionally, as
    ``callback(0.9, "...")``, and by keyword only, as
    ``callback(msg="OCR is running...")``. Both parameters are therefore
    optional and named ``prog`` / ``msg`` so that either form is accepted.

    Args:
        prog: Progress value; ignored.
        msg: Progress message; ignored.
    """
    pass
chunk(sys.argv[1], callback=dummy)

View File

@ -27,6 +27,8 @@ from rag.utils import rmSpace
forbidden_select_fields4resume = [
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
]
def remote_call(filename, binary):
q = {
"header": {
@ -48,18 +50,22 @@ def remote_call(filename, binary):
}
for _ in range(3):
try:
resume = requests.post("http://127.0.0.1:61670/tog", data=json.dumps(q))
resume = requests.post(
"http://127.0.0.1:61670/tog",
data=json.dumps(q))
resume = resume.json()["response"]["results"]
resume = refactor(resume)
for k in ["education", "work", "project", "training", "skill", "certificate", "language"]:
if not resume.get(k) and k in resume: del resume[k]
for k in ["education", "work", "project",
"training", "skill", "certificate", "language"]:
if not resume.get(k) and k in resume:
del resume[k]
resume = step_one.refactor(pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x",
"updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
"updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
resume = step_two.parse(resume)
return resume
except Exception as e:
cron_logger.error("Resume parser error: "+str(e))
cron_logger.error("Resume parser error: " + str(e))
return {}
@ -144,10 +150,13 @@ def chunk(filename, binary=None, callback=None, **kwargs):
doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
for n, _ in field_map.items():
if n not in resume:continue
if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
if n not in resume:
continue
if isinstance(resume[n], list) and (
len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
resume[n] = resume[n][0]
if n.find("_tks")>0: resume[n] = huqie.qieqie(resume[n])
if n.find("_tks") > 0:
resume[n] = huqie.qieqie(resume[n])
doc[n] = resume[n]
print(doc)

View File

@ -25,7 +25,8 @@ from deepdoc.parser import ExcelParser
class Excel(ExcelParser):
def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None):
def __call__(self, fnm, binary=None, from_page=0,
to_page=10000000000, callback=None):
if not binary:
wb = load_workbook(fnm)
else:
@ -48,8 +49,10 @@ class Excel(ExcelParser):
data = []
for i, r in enumerate(rows[1:]):
rn += 1
if rn-1 < from_page:continue
if rn -1>=to_page: break
if rn - 1 < from_page:
continue
if rn - 1 >= to_page:
break
row = [
cell.value for ii,
cell in enumerate(r) if ii not in missed]
@ -60,7 +63,7 @@ class Excel(ExcelParser):
done += 1
res.append(pd.DataFrame(np.array(data), columns=headers))
callback(0.3, ("Extract records: {}~{}".format(from_page+1, min(to_page, from_page+rn)) + (
callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res
@ -73,7 +76,8 @@ def trans_datatime(s):
def trans_bool(s):
if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$",
str(s).strip(), flags=re.IGNORECASE):
return "yes"
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
return "no"
@ -107,13 +111,14 @@ def column_data_type(arr):
arr[i] = trans[ty](str(arr[i]))
except Exception as e:
arr[i] = None
#if ty == "text":
# if ty == "text":
# if len(arr) > 128 and uni / len(arr) < 0.1:
# ty = "keyword"
return arr, ty
def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=10000000000,
lang="Chinese", callback=None, **kwargs):
"""
Excel and csv(txt) format files are supported.
For csv or txt file, the delimiter between columns is TAB.
@ -131,7 +136,12 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
dfs = excel_parser(
filename,
binary,
from_page=from_page,
to_page=to_page,
callback=callback)
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = ""
@ -149,8 +159,10 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
headers = lines[0].split(kwargs.get("delimiter", "\t"))
rows = []
for i, line in enumerate(lines[1:]):
if i < from_page:continue
if i >= to_page: break
if i < from_page:
continue
if i >= to_page:
break
row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
if len(row) != len(headers):
fails.append(str(i))
@ -181,7 +193,13 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
del df[n]
clmns = df.columns.values
txts = list(copy.deepcopy(clmns))
py_clmns = [PY.get_pinyins(re.sub(r"(/.*|[^]+?|\([^()]+?\))", "", n), '_')[0] for n in clmns]
py_clmns = [
PY.get_pinyins(
re.sub(
r"(/.*|[^]+?|\([^()]+?\))",
"",
n),
'_')[0] for n in clmns]
clmn_tys = []
for j in range(len(clmns)):
cln, ty = column_data_type(df[clmns[j]])
@ -192,7 +210,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], clmns[i].replace("_", " "))
for i in range(len(clmns))]
eng = lang.lower() == "english"#is_english(txts)
eng = lang.lower() == "english" # is_english(txts)
for ii, row in df.iterrows():
d = {
"docnm_kwd": filename,