Apply PEP 8 formatting (#155)

2026-02-01 16:15:07 +08:00 · 2024-03-27 11:33:46 +08:00
parent a02e836790
commit fd7fcb5baf
55 changed files with 1568 additions and 753 deletions
--- a/rag/app/book.py
+++ b/rag/app/book.py
@ -48,10 +48,12 @@ class Pdf(PdfParser):

        callback(0.8, "Text extraction finished")

-        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno","")) for b in self.boxes], tbls
+        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
+                for b in self.boxes], tbls


-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
    """
        Supported file formats are docx, pdf, txt.
        Since a book is long and not all the parts are useful, if it's a PDF,
@ -63,48 +65,63 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    pdf_parser = None
-    sections,tbls = [], []
+    sections, tbls = [], []
    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        doc_parser = DocxParser()
        # TODO: table of contents need to be removed
-        sections, tbls = doc_parser(binary if binary else filename, from_page=from_page, to_page=to_page)
-        remove_contents_table(sections, eng=is_english(random_choices([t for t,_ in sections], k=200)))
+        sections, tbls = doc_parser(
+            binary if binary else filename, from_page=from_page, to_page=to_page)
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
+        pdf_parser = Pdf() if kwargs.get(
+            "parser_config", {}).get(
+            "layout_recognize", True) else PlainParser()
        sections, tbls = pdf_parser(filename if not binary else binary,
-                         from_page=from_page, to_page=to_page, callback=callback)
+                                    from_page=from_page, to_page=to_page, callback=callback)

    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
-        if binary:txt = binary.decode("utf-8")
+        if binary:
+            txt = binary.decode("utf-8")
        else:
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
-                    if not l:break
+                    if not l:
+                        break
                    txt += l
        sections = txt.split("\n")
-        sections = [(l,"") for l in sections if l]
-        remove_contents_table(sections, eng = is_english(random_choices([t for t,_ in sections], k=200)))
+        sections = [(l, "") for l in sections if l]
+        remove_contents_table(sections, eng=is_english(
+            random_choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")

-    else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(docx, pdf, txt supported)")

    make_colon_as_title(sections)
-    bull = bullets_category([t for t in random_choices([t for t,_ in sections], k=100)])
+    bull = bullets_category(
+        [t for t in random_choices([t for t, _ in sections], k=100)])
    if bull >= 0:
-        chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 3)]
+        chunks = ["\n".join(ck)
+                  for ck in hierarchical_merge(bull, sections, 3)]
    else:
-        sections = [s.split("@") for s,_ in sections]
-        sections = [(pr[0], "@"+pr[1]) for pr in sections if len(pr)==2]
-        chunks = naive_merge(sections, kwargs.get("chunk_token_num", 256), kwargs.get("delimer", "\n。；！？"))
+        sections = [s.split("@") for s, _ in sections]
+        sections = [(pr[0], "@" + pr[1]) for pr in sections if len(pr) == 2]
+        chunks = naive_merge(
+            sections, kwargs.get(
+                "chunk_token_num", 256), kwargs.get(
+                "delimer", "\n。；！？"))

    # is it English
-    eng = lang.lower() == "english"#is_english(random_choices([t for t, _ in sections], k=218))
+    # is_english(random_choices([t for t, _ in sections], k=218))
+    eng = lang.lower() == "english"

    res = tokenize_table(tbls, doc, eng)
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
@ -114,6 +131,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca

 if __name__ == "__main__":
    import sys
+
    def dummy(prog=None, msg=""):
        pass
    chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@ -35,8 +35,10 @@ class Docx(DocxParser):
        pn = 0
        lines = []
        for p in self.doc.paragraphs:
-            if pn > to_page:break
-            if from_page <= pn < to_page and p.text.strip(): lines.append(self.__clean(p.text))
+            if pn > to_page:
+                break
+            if from_page <= pn < to_page and p.text.strip():
+                lines.append(self.__clean(p.text))
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
@ -63,15 +65,18 @@ class Pdf(PdfParser):
        start = timer()
        self._layouts_rec(zoomin)
        callback(0.67, "Layout analysis finished")
-        cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
+        cron_logger.info("paddle layouts:".format(
+            (timer() - start) / (self.total_page + 0.1)))
        self._naive_vertical_merge()

        callback(0.8, "Text extraction finished")

-        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
+        return [(b["text"], self._line_tag(b, zoomin))
+                for b in self.boxes], None


-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
    """
        Supported file formats are docx, pdf, txt.
    """
@ -89,41 +94,50 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-            pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
-            for txt, poss in pdf_parser(filename if not binary else binary,
-                             from_page=from_page, to_page=to_page, callback=callback)[0]:
-                sections.append(txt + poss)
+        pdf_parser = Pdf() if kwargs.get(
+            "parser_config", {}).get(
+            "layout_recognize", True) else PlainParser()
+        for txt, poss in pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback)[0]:
+            sections.append(txt + poss)

    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
-        if binary:txt = binary.decode("utf-8")
+        if binary:
+            txt = binary.decode("utf-8")
        else:
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
-                    if not l:break
+                    if not l:
+                        break
                    txt += l
        sections = txt.split("\n")
        sections = [l for l in sections if l]
        callback(0.8, "Finish parsing.")
-    else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(docx, pdf, txt supported)")

    # is it English
-    eng = lang.lower() == "english"#is_english(sections)
+    eng = lang.lower() == "english"  # is_english(sections)
    # Remove 'Contents' part
    remove_contents_table(sections, eng)

    make_colon_as_title(sections)
    bull = bullets_category(sections)
    chunks = hierarchical_merge(bull, sections, 3)
-    if not chunks: callback(0.99, "No chunk parsed out.")
+    if not chunks:
+        callback(0.99, "No chunk parsed out.")

-    return tokenize_chunks(["\n".join(ck) for ck in chunks], doc, eng, pdf_parser)
+    return tokenize_chunks(["\n".join(ck)
+                           for ck in chunks], doc, eng, pdf_parser)


 if __name__ == "__main__":
    import sys
+
    def dummy(prog=None, msg=""):
        pass
    chunk(sys.argv[1], callback=dummy)
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@ -25,10 +25,10 @@ class Pdf(PdfParser):
            callback
        )
        callback(msg="OCR finished.")
-        #for bb in self.boxes:
+        # for bb in self.boxes:
        #    for b in bb:
        #        print(b)
-        print("OCR:", timer()-start)
+        print("OCR:", timer() - start)

        self._layouts_rec(zoomin)
        callback(0.65, "Layout analysis finished.")
@ -45,30 +45,35 @@ class Pdf(PdfParser):
        for b in self.boxes:
            b["text"] = re.sub(r"([\t 　]|\u3000){2,}", " ", b["text"].strip())

-        return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)], tbls
+        return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin))
+                for i, b in enumerate(self.boxes)], tbls


-
-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
    """
        Only pdf is supported.
    """
    pdf_parser = None

    if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
+        pdf_parser = Pdf() if kwargs.get(
+            "parser_config", {}).get(
+            "layout_recognize", True) else PlainParser()
        sections, tbls = pdf_parser(filename if not binary else binary,
-                               from_page=from_page, to_page=to_page, callback=callback)
-        if sections and len(sections[0])<3: sections = [(t, l, [[0]*5]) for t, l in sections]
+                                    from_page=from_page, to_page=to_page, callback=callback)
+        if sections and len(sections[0]) < 3:
+            sections = [(t, l, [[0] * 5]) for t, l in sections]

-    else: raise NotImplementedError("file type not supported yet(pdf supported)")
+    else:
+        raise NotImplementedError("file type not supported yet(pdf supported)")
    doc = {
        "docnm_kwd": filename
    }
    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    # is it English
-    eng = lang.lower() == "english"#pdf_parser.is_english
+    eng = lang.lower() == "english"  # pdf_parser.is_english

    # set pivot using the most frequent type of title,
    # then merge between 2 pivot
@ -79,7 +84,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        for txt, _, _ in sections:
            for t, lvl in pdf_parser.outlines:
                tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
-                tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
+                tks_ = set([txt[i] + txt[i + 1]
+                           for i in range(min(len(t), len(txt) - 1))])
                if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
                    levels.append(lvl)
                    break
@ -87,24 +93,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
                levels.append(max_lvl + 1)

    else:
-        bull = bullets_category([txt for txt,_,_ in sections])
-        most_level, levels = title_frequency(bull, [(txt, l) for txt, l, poss in sections])
+        bull = bullets_category([txt for txt, _, _ in sections])
+        most_level, levels = title_frequency(
+            bull, [(txt, l) for txt, l, poss in sections])

    assert len(sections) == len(levels)
    sec_ids = []
    sid = 0
    for i, lvl in enumerate(levels):
-        if lvl <= most_level and i > 0 and lvl != levels[i - 1]: sid += 1
+        if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
+            sid += 1
        sec_ids.append(sid)
        # print(lvl, self.boxes[i]["text"], most_level, sid)

-    sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)]
+    sections = [(txt, sec_ids[i], poss)
+                for i, (txt, _, poss) in enumerate(sections)]
    for (img, rows), poss in tbls:
        sections.append((rows if isinstance(rows, str) else rows[0], -1,
                         [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))

    def tag(pn, left, right, top, bottom):
-        if pn+left+right+top+bottom == 0:
+        if pn + left + right + top + bottom == 0:
            return ""
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format(pn, left, right, top, bottom)
@ -112,7 +121,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    chunks = []
    last_sid = -2
    tk_cnt = 0
-    for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
+    for txt, sec_id, poss in sorted(sections, key=lambda x: (
+            x[-1][0][0], x[-1][0][3], x[-1][0][1])):
        poss = "\t".join([tag(*pos) for pos in poss])
        if tk_cnt < 2048 and (sec_id == last_sid or sec_id == -1):
            if chunks:
@ -121,16 +131,17 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
                continue
        chunks.append(txt + poss)
        tk_cnt = num_tokens_from_string(txt)
-        if sec_id > -1: last_sid = sec_id
+        if sec_id > -1:
+            last_sid = sec_id

    res = tokenize_table(tbls, doc, eng)
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    return res


-
 if __name__ == "__main__":
    import sys
+
    def dummy(prog=None, msg=""):
        pass
    chunk(sys.argv[1], callback=dummy)
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -44,11 +44,14 @@ class Pdf(PdfParser):
        tbls = self._extract_table_figure(True, zoomin, True, True)
        self._naive_vertical_merge()

-        cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
-        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
+        cron_logger.info("paddle layouts:".format(
+            (timer() - start) / (self.total_page + 0.1)))
+        return [(b["text"], self._line_tag(b, zoomin))
+                for b in self.boxes], tbls


-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
    """
        Supported file formats are docx, pdf, excel, txt.
        This method apply the naive ways to chunk files.
@ -56,8 +59,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
    """

-    eng = lang.lower() == "english"#is_english(cks)
-    parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。；！？", "layout_recognize": True})
+    eng = lang.lower() == "english"  # is_english(cks)
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 128, "delimiter": "\n!?。；！？", "layout_recognize": True})
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
@ -73,9 +78,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if parser_config["layout_recognize"] else PlainParser()
+        pdf_parser = Pdf(
+        ) if parser_config["layout_recognize"] else PlainParser()
        sections, tbls = pdf_parser(filename if not binary else binary,
-                              from_page=from_page, to_page=to_page, callback=callback)
+                                    from_page=from_page, to_page=to_page, callback=callback)
        res = tokenize_table(tbls, doc, eng)

    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
@ -92,16 +98,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
-                    if not l: break
+                    if not l:
+                        break
                    txt += l
        sections = txt.split("\n")
        sections = [(l, "") for l in sections if l]
        callback(0.8, "Finish parsing.")

    else:
-        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+        raise NotImplementedError(
+            "file type not supported yet(docx, pdf, txt supported)")

-    chunks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。；！？"))
+    chunks = naive_merge(
+        sections, parser_config.get(
+            "chunk_token_num", 128), parser_config.get(
+            "delimiter", "\n!?。；！？"))

    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    return res
@ -110,9 +121,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 if __name__ == "__main__":
    import sys

-
    def dummy(prog=None, msg=""):
        pass

-
    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
--- a/rag/app/one.py
+++ b/rag/app/one.py
@ -41,20 +41,23 @@ class Pdf(PdfParser):
        tbls = self._extract_table_figure(True, zoomin, True, True)
        self._concat_downward()

-        sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
+        sections = [(b["text"], self.get_position(b, zoomin))
+                    for i, b in enumerate(self.boxes)]
        for (img, rows), poss in tbls:
            sections.append((rows if isinstance(rows, str) else rows[0],
                             [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
-        return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None
+        return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
+            x[-1][0][0], x[-1][0][3], x[-1][0][1]))], None


-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
    """
        Supported file formats are docx, pdf, excel, txt.
        One file forms a chunk which maintains original text order.
    """

-    eng = lang.lower() == "english"#is_english(cks)
+    eng = lang.lower() == "english"  # is_english(cks)

    if re.search(r"\.docx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
@ -62,8 +65,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
-        sections, _ = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback)
+        pdf_parser = Pdf() if kwargs.get(
+            "parser_config", {}).get(
+            "layout_recognize", True) else PlainParser()
+        sections, _ = pdf_parser(
+            filename if not binary else binary, to_page=to_page, callback=callback)
        sections = [s for s, _ in sections if s]

    elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
@ -80,14 +86,16 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
            with open(filename, "r") as f:
                while True:
                    l = f.readline()
-                    if not l: break
+                    if not l:
+                        break
                    txt += l
        sections = txt.split("\n")
        sections = [s for s in sections if s]
        callback(0.8, "Finish parsing.")

    else:
-        raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+        raise NotImplementedError(
+            "file type not supported yet(docx, pdf, txt supported)")

    doc = {
        "docnm_kwd": filename,
@ -101,9 +109,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 if __name__ == "__main__":
    import sys

-
    def dummy(prog=None, msg=""):
        pass

-
    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@ -67,11 +67,11 @@ class Pdf(PdfParser):

        if from_page > 0:
            return {
-                "title":"",
+                "title": "",
                "authors": "",
                "abstract": "",
                "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
-                          re.match(r"(text|title)", b.get("layoutno", "text"))],
+                             re.match(r"(text|title)", b.get("layoutno", "text"))],
                "tables": tbls
            }
        # get title and authors
@ -87,7 +87,8 @@ class Pdf(PdfParser):
                    title = ""
                    break
                for j in range(3):
-                    if _begin(self.boxes[i + j]["text"]): break
+                    if _begin(self.boxes[i + j]["text"]):
+                        break
                    authors.append(self.boxes[i + j]["text"])
                    break
                break
@ -107,10 +108,15 @@ class Pdf(PdfParser):
                    abstr = txt + self._line_tag(self.boxes[i], zoomin)
                i += 1
                break
-        if not abstr: i = 0
+        if not abstr:
+            i = 0

-        callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
-        for b in self.boxes: print(b["text"], b.get("layoutno"))
+        callback(
+            0.8, "Page {}~{}: Text merging finished".format(
+                from_page, min(
+                    to_page, self.total_page)))
+        for b in self.boxes:
+            print(b["text"], b.get("layoutno"))
        print(tbls)

        return {
@ -118,19 +124,20 @@ class Pdf(PdfParser):
            "authors": " ".join(authors),
            "abstract": abstr,
            "sections": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
-                      re.match(r"(text|title)", b.get("layoutno", "text"))],
+                         re.match(r"(text|title)", b.get("layoutno", "text"))],
            "tables": tbls
        }


-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
    """
        Only pdf is supported.
        The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
    """
    pdf_parser = None
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
-        if not kwargs.get("parser_config",{}).get("layout_recognize", True):
+        if not kwargs.get("parser_config", {}).get("layout_recognize", True):
            pdf_parser = PlainParser()
            paper = {
                "title": filename,
@ -143,14 +150,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
            pdf_parser = Pdf()
            paper = pdf_parser(filename if not binary else binary,
                               from_page=from_page, to_page=to_page, callback=callback)
-    else: raise NotImplementedError("file type not supported yet(pdf supported)")
+    else:
+        raise NotImplementedError("file type not supported yet(pdf supported)")

    doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
           "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
    # is it English
-    eng = lang.lower() == "english"#pdf_parser.is_english
+    eng = lang.lower() == "english"  # pdf_parser.is_english
    print("It's English.....", eng)

    res = tokenize_table(paper["tables"], doc, eng)
@ -160,7 +168,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        txt = pdf_parser.remove_tag(paper["abstract"])
        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
        d["important_tks"] = " ".join(d["important_kwd"])
-        d["image"], poss = pdf_parser.crop(paper["abstract"], need_position=True)
+        d["image"], poss = pdf_parser.crop(
+            paper["abstract"], need_position=True)
        add_positions(d, poss)
        tokenize(d, txt, eng)
        res.append(d)
@ -174,7 +183,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    sec_ids = []
    sid = 0
    for i, lvl in enumerate(levels):
-        if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1
+        if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
+            sid += 1
        sec_ids.append(sid)
        print(lvl, sorted_sections[i][0], most_level, sid)

@ -190,6 +200,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    return res

+
 """
    readed = [0] * len(paper["lines"])
    # find colon firstly
@ -212,7 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        for k in range(j, i): readed[k] = True
        txt = txt[::-1]
        if eng:
-            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
+            r = re.search(r"(.*?) ([\\.;?!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        else:
            r = re.search(r"(.*?) ([。？；！]|$)", txt)
@ -270,6 +281,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca

 if __name__ == "__main__":
    import sys
+
    def dummy(prog=None, msg=""):
        pass
    chunk(sys.argv[1], callback=dummy)
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@ -33,9 +33,12 @@ class Ppt(PptParser):
        with slides.Presentation(BytesIO(fnm)) as presentation:
            for i, slide in enumerate(presentation.slides[from_page: to_page]):
                buffered = BytesIO()
-                slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg)
+                slide.get_thumbnail(
+                    0.5, 0.5).save(
+                    buffered, drawing.imaging.ImageFormat.jpeg)
                imgs.append(Image.open(buffered))
-        assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
+        assert len(imgs) == len(
+            txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
        callback(0.9, "Image extraction finished")
        self.is_english = is_english(txts)
        return [(txts[i], imgs[i]) for i in range(len(txts))]
@ -47,25 +50,34 @@ class Pdf(PdfParser):

    def __garbage(self, txt):
        txt = txt.lower().strip()
-        if re.match(r"[0-9\.,%/-]+$", txt): return True
-        if len(txt) < 3:return True
+        if re.match(r"[0-9\.,%/-]+$", txt):
+            return True
+        if len(txt) < 3:
+            return True
        return False

-    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, zoomin=3, callback=None):
        callback(msg="OCR is  running...")
-        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
-        callback(0.8, "Page {}~{}: OCR finished".format(from_page, min(to_page, self.total_page)))
-        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(len(self.boxes), len(self.page_images))
+        self.__images__(filename if not binary else binary,
+                        zoomin, from_page, to_page, callback)
+        callback(0.8, "Page {}~{}: OCR finished".format(
+            from_page, min(to_page, self.total_page)))
+        assert len(self.boxes) == len(self.page_images), "{} vs. {}".format(
+            len(self.boxes), len(self.page_images))
        res = []
        for i in range(len(self.boxes)):
-            lines = "\n".join([b["text"] for b in self.boxes[i] if not self.__garbage(b["text"])])
+            lines = "\n".join([b["text"] for b in self.boxes[i]
+                              if not self.__garbage(b["text"])])
            res.append((lines, self.page_images[i]))
-        callback(0.9, "Page {}~{}: Parsing finished".format(from_page, min(to_page, self.total_page)))
+        callback(0.9, "Page {}~{}: Parsing finished".format(
+            from_page, min(to_page, self.total_page)))
        return res


 class PlainPdf(PlainParser):
-    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
+    def __call__(self, filename, binary=None, from_page=0,
+                 to_page=100000, callback=None, **kwargs):
        self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
        page_txt = []
        for page in self.pdf.pages[from_page: to_page]:
@ -74,7 +86,8 @@ class PlainPdf(PlainParser):
        return [(txt, None) for txt in page_txt]


-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
    """
    The supported file formats are pdf, pptx.
    Every page will be treated as a chunk. And the thumbnail of every page will be stored.
@ -89,35 +102,42 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
    res = []
    if re.search(r"\.pptx?$", filename, re.IGNORECASE):
        ppt_parser = Ppt()
-        for pn, (txt,img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
+        for pn, (txt, img) in enumerate(ppt_parser(
+                filename if not binary else binary, from_page, 1000000, callback)):
            d = copy.deepcopy(doc)
            pn += from_page
            d["image"] = img
-            d["page_num_int"] = [pn+1]
+            d["page_num_int"] = [pn + 1]
            d["top_int"] = [0]
            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
            tokenize(d, txt, eng)
            res.append(d)
        return res
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainPdf()
-        for pn, (txt,img) in enumerate(pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)):
+        pdf_parser = Pdf() if kwargs.get(
+            "parser_config", {}).get(
+            "layout_recognize", True) else PlainPdf()
+        for pn, (txt, img) in enumerate(pdf_parser(filename, binary,
+                                                   from_page=from_page, to_page=to_page, callback=callback)):
            d = copy.deepcopy(doc)
            pn += from_page
-            if img: d["image"] = img
-            d["page_num_int"] = [pn+1]
+            if img:
+                d["image"] = img
+            d["page_num_int"] = [pn + 1]
            d["top_int"] = [0]
-            d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
+            d["position_int"] = [
+                (pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
            tokenize(d, txt, eng)
            res.append(d)
        return res

-    raise NotImplementedError("file type not supported yet(pptx, pdf supported)")
+    raise NotImplementedError(
+        "file type not supported yet(pptx, pdf supported)")


-if __name__== "__main__":
+if __name__ == "__main__":
    import sys
+
    def dummy(a, b):
        pass
    chunk(sys.argv[1], callback=dummy)
-
--- a/rag/app/resume.py
+++ b/rag/app/resume.py
@ -27,6 +27,8 @@ from rag.utils import rmSpace
 forbidden_select_fields4resume = [
    "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
 ]
+
+
 def remote_call(filename, binary):
    q = {
        "header": {
@ -48,18 +50,22 @@ def remote_call(filename, binary):
    }
    for _ in range(3):
        try:
-            resume = requests.post("http://127.0.0.1:61670/tog", data=json.dumps(q))
+            resume = requests.post(
+                "http://127.0.0.1:61670/tog",
+                data=json.dumps(q))
            resume = resume.json()["response"]["results"]
            resume = refactor(resume)
-            for k in ["education", "work", "project", "training", "skill", "certificate", "language"]:
-                if not resume.get(k) and k in resume: del resume[k]
+            for k in ["education", "work", "project",
+                      "training", "skill", "certificate", "language"]:
+                if not resume.get(k) and k in resume:
+                    del resume[k]

            resume = step_one.refactor(pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x",
-                                                "updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
+                                                      "updated_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}]))
            resume = step_two.parse(resume)
            return resume
        except Exception as e:
-            cron_logger.error("Resume parser error: "+str(e))
+            cron_logger.error("Resume parser error: " + str(e))
    return {}


@ -144,10 +150,13 @@ def chunk(filename, binary=None, callback=None, **kwargs):
    doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
    doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
    for n, _ in field_map.items():
-        if n not in resume:continue
-        if isinstance(resume[n], list) and (len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
+        if n not in resume:
+            continue
+        if isinstance(resume[n], list) and (
+                len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
            resume[n] = resume[n][0]
-        if n.find("_tks")>0: resume[n] = huqie.qieqie(resume[n])
+        if n.find("_tks") > 0:
+            resume[n] = huqie.qieqie(resume[n])
        doc[n] = resume[n]

    print(doc)
--- a/rag/app/table.py
+++ b/rag/app/table.py
@ -25,7 +25,8 @@ from deepdoc.parser import ExcelParser


 class Excel(ExcelParser):
-    def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None):
+    def __call__(self, fnm, binary=None, from_page=0,
+                 to_page=10000000000, callback=None):
        if not binary:
            wb = load_workbook(fnm)
        else:
@ -48,8 +49,10 @@ class Excel(ExcelParser):
            data = []
            for i, r in enumerate(rows[1:]):
                rn += 1
-                if rn-1 < from_page:continue
-                if rn -1>=to_page: break
+                if rn - 1 < from_page:
+                    continue
+                if rn - 1 >= to_page:
+                    break
                row = [
                    cell.value for ii,
                    cell in enumerate(r) if ii not in missed]
@ -60,7 +63,7 @@ class Excel(ExcelParser):
                done += 1
            res.append(pd.DataFrame(np.array(data), columns=headers))

-        callback(0.3, ("Extract records: {}~{}".format(from_page+1, min(to_page, from_page+rn)) + (
+        callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
        return res

@ -73,7 +76,8 @@ def trans_datatime(s):


 def trans_bool(s):
-    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$", str(s).strip(), flags=re.IGNORECASE):
+    if re.match(r"(true|yes|是|\*|✓|✔|☑|✅|√)$",
+                str(s).strip(), flags=re.IGNORECASE):
        return "yes"
    if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
        return "no"
@ -107,13 +111,14 @@ def column_data_type(arr):
            arr[i] = trans[ty](str(arr[i]))
        except Exception as e:
            arr[i] = None
-    #if ty == "text":
+    # if ty == "text":
    #    if len(arr) > 128 and uni / len(arr) < 0.1:
    #        ty = "keyword"
    return arr, ty


-def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=10000000000,
+          lang="Chinese", callback=None, **kwargs):
    """
        Excel and csv(txt) format files are supported.
        For csv or txt file, the delimiter between columns is TAB.
@ -131,7 +136,12 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = Excel()
-        dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
+        dfs = excel_parser(
+            filename,
+            binary,
+            from_page=from_page,
+            to_page=to_page,
+            callback=callback)
    elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
@ -149,8 +159,10 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
        headers = lines[0].split(kwargs.get("delimiter", "\t"))
        rows = []
        for i, line in enumerate(lines[1:]):
-            if i < from_page:continue
-            if i >= to_page: break
+            if i < from_page:
+                continue
+            if i >= to_page:
+                break
            row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
            if len(row) != len(headers):
                fails.append(str(i))
@ -181,7 +193,13 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
                del df[n]
        clmns = df.columns.values
        txts = list(copy.deepcopy(clmns))
-        py_clmns = [PY.get_pinyins(re.sub(r"(/.*|（[^（）]+?）|\([^()]+?\))", "", n), '_')[0] for n in clmns]
+        py_clmns = [
+            PY.get_pinyins(
+                re.sub(
+                    r"(/.*|（[^（）]+?）|\([^()]+?\))",
+                    "",
+                    n),
+                '_')[0] for n in clmns]
        clmn_tys = []
        for j in range(len(clmns)):
            cln, ty = column_data_type(df[clmns[j]])
@ -192,7 +210,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
        clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], clmns[i].replace("_", " "))
                     for i in range(len(clmns))]

-        eng = lang.lower() == "english"#is_english(txts)
+        eng = lang.lower() == "english"  # is_english(txts)
        for ii, row in df.iterrows():
            d = {
                "docnm_kwd": filename,