apply pep8 formalize (#155)

KevinHuSh
2024-03-27 11:33:46 +08:00
committed by GitHub
parent a02e836790
commit fd7fcb5baf
55 changed files with 1568 additions and 753 deletions


@@ -35,8 +35,10 @@ class Docx(DocxParser):
         pn = 0
         lines = []
         for p in self.doc.paragraphs:
-            if pn > to_page:break
-            if from_page <= pn < to_page and p.text.strip(): lines.append(self.__clean(p.text))
+            if pn > to_page:
+                break
+            if from_page <= pn < to_page and p.text.strip():
+                lines.append(self.__clean(p.text))
             for run in p.runs:
                 if 'lastRenderedPageBreak' in run._element.xml:
                     pn += 1
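The splits above (e.g. the one-line `if pn > to_page:break` becoming a two-line block) are the standard E701 fix for compound statements. As a rough illustration of how such a pass can be reproduced with autopep8's documented `fix_code` helper (this snippet is not part of the commit, and the sample source string is invented):

    # Illustration only: reproduce the E701-style split shown in the Docx hunk.
    # Requires `pip install autopep8`; the sample code string below is made up.
    import autopep8

    sample = "def f(items, limit):\n    for i in items:\n        if i > limit: break\n"
    fixed = autopep8.fix_code(sample, options={"aggressive": 1})
    print(fixed)
    # The compound `if i > limit: break` comes back as two separate lines,
    # matching the change applied to Docx.__call__ above.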
@@ -63,15 +65,18 @@ class Pdf(PdfParser):
         start = timer()
         self._layouts_rec(zoomin)
         callback(0.67, "Layout analysis finished")
-        cron_logger.info("paddle layouts:".format((timer()-start)/(self.total_page+0.1)))
+        cron_logger.info("paddle layouts:".format(
+            (timer() - start) / (self.total_page + 0.1)))
         self._naive_vertical_merge()
         callback(0.8, "Text extraction finished")
-        return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
+        return [(b["text"], self._line_tag(b, zoomin))
+                for b in self.boxes], None
 
-def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
+def chunk(filename, binary=None, from_page=0, to_page=100000,
+          lang="Chinese", callback=None, **kwargs):
     """
         Supported file formats are docx, pdf, txt.
     """
@@ -89,41 +94,50 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
         callback(0.8, "Finish parsing.")
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
-        pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
-        for txt, poss in pdf_parser(filename if not binary else binary,
-                        from_page=from_page, to_page=to_page, callback=callback)[0]:
-            sections.append(txt + poss)
+        pdf_parser = Pdf() if kwargs.get(
+            "parser_config", {}).get(
+            "layout_recognize", True) else PlainParser()
+        for txt, poss in pdf_parser(filename if not binary else binary,
+                                    from_page=from_page, to_page=to_page, callback=callback)[0]:
+            sections.append(txt + poss)
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         txt = ""
-        if binary:txt = binary.decode("utf-8")
+        if binary:
+            txt = binary.decode("utf-8")
         else:
             with open(filename, "r") as f:
                 while True:
                     l = f.readline()
-                    if not l:break
+                    if not l:
+                        break
                     txt += l
         sections = txt.split("\n")
         sections = [l for l in sections if l]
         callback(0.8, "Finish parsing.")
-    else: raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
+    else:
+        raise NotImplementedError(
+            "file type not supported yet(docx, pdf, txt supported)")
     # is it English
-    eng = lang.lower() == "english"#is_english(sections)
+    eng = lang.lower() == "english"  # is_english(sections)
     # Remove 'Contents' part
     remove_contents_table(sections, eng)
     make_colon_as_title(sections)
     bull = bullets_category(sections)
     chunks = hierarchical_merge(bull, sections, 3)
-    if not chunks: callback(0.99, "No chunk parsed out.")
+    if not chunks:
+        callback(0.99, "No chunk parsed out.")
-    return tokenize_chunks(["\n".join(ck) for ck in chunks], doc, eng, pdf_parser)
+    return tokenize_chunks(["\n".join(ck)
+                           for ck in chunks], doc, eng, pdf_parser)
 
 if __name__ == "__main__":
     import sys
     def dummy(prog=None, msg=""):
         pass
     chunk(sys.argv[1], callback=dummy)
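The `dummy` callback in the `__main__` block discards all progress updates. Assuming the `(prog, msg)` signature implied by `dummy` and by calls such as `callback(0.8, "Finish parsing.")` inside `chunk`, a sketch of a callback that prints progress instead (illustrative only, not part of the commit):

    # Illustrative progress callback; assumes chunk() reports (prog, msg)
    # exactly as the dummy() signature above suggests.
    def print_progress(prog=None, msg=""):
        if prog is not None:
            print("[{:>3.0f}%] {}".format(prog * 100, msg))
        else:
            print(msg)

    # Usage, following the same pattern as the __main__ block above:
    # chunk(sys.argv[1], callback=print_progress)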