Fix errors detected by Ruff (#3918)

### What problem does this PR solve?

Fix errors detected by Ruff

### Type of change

- [x] Refactoring
2025-12-08 20:42:30 +08:00 · 2024-12-08 14:21:12 +08:00
parent e267a026f3
commit 0d68a6cd1b
97 changed files with 2558 additions and 1976 deletions
--- a/rag/app/book.py
+++ b/rag/app/book.py
@ -94,7 +94,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        callback(0.1, "Start to parse.")
        txt = get_text(filename, binary)
        sections = txt.split("\n")
-        sections = [(l, "") for l in sections if l]
+        sections = [(line, "") for line in sections if line]
        remove_contents_table(sections, eng=is_english(
            random_choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")
@ -102,7 +102,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = HtmlParser()(filename, binary)
-        sections = [(l, "") for l in sections if l]
+        sections = [(line, "") for line in sections if line]
        remove_contents_table(sections, eng=is_english(
            random_choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")
@ -112,7 +112,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        binary = BytesIO(binary)
        doc_parsed = parser.from_buffer(binary)
        sections = doc_parsed['content'].split('\n')
-        sections = [(l, "") for l in sections if l]
+        sections = [(line, "") for line in sections if line]
        remove_contents_table(sections, eng=is_english(
            random_choices([t for t, _ in sections], k=200)))
        callback(0.8, "Finish parsing.")
--- a/rag/app/email.py
+++ b/rag/app/email.py
@ -75,7 +75,7 @@ def chunk(
    _add_content(msg, msg.get_content_type())

    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
-        (l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
+        (line, "") for line in HtmlParser.parser_txt("\n".join(html_txt)) if line
    ]

    st = timer()
--- a/rag/app/knowledge_graph.py
+++ b/rag/app/knowledge_graph.py
@ -18,7 +18,8 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
    chunks = build_knowledge_graph_chunks(tenant_id, sections, callback,
                                          parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
                                          )
-    for c in chunks: c["docnm_kwd"] = filename
+    for c in chunks:
+        c["docnm_kwd"] = filename

    doc = {
        "docnm_kwd": filename,
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@ -48,7 +48,7 @@ class Docx(DocxParser):
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1
-        return [l for l in lines if l]
+        return [line for line in lines if line]

    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        self.doc = Document(
@ -60,7 +60,8 @@ class Docx(DocxParser):
            if pn > to_page:
                break
            question_level, p_text = docx_question_level(p, bull)
-            if not p_text.strip("\n"):continue
+            if not p_text.strip("\n"):
+                continue
            lines.append((question_level, p_text))

            for run in p.runs:
@ -78,19 +79,21 @@ class Docx(DocxParser):
                if lines[e][0] <= lines[s][0]:
                    break
                e += 1
-            if e - s == 1 and visit[s]: continue
+            if e - s == 1 and visit[s]:
+                continue
            sec = []
            next_level = lines[s][0] + 1
            while not sec and next_level < 22:
                for i in range(s+1, e):
-                    if lines[i][0] != next_level: continue
+                    if lines[i][0] != next_level:
+                        continue
                    sec.append(lines[i][1])
                    visit[i] = True
                next_level += 1
            sec.insert(0, lines[s][1])

            sections.append("\n".join(sec))
-        return [l for l in sections if l]
+        return [s for s in sections if s]

    def __str__(self) -> str:
        return f'''
@ -168,13 +171,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        callback(0.1, "Start to parse.")
        txt = get_text(filename, binary)
        sections = txt.split("\n")
-        sections = [l for l in sections if l]
+        sections = [s for s in sections if s]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = HtmlParser()(filename, binary)
-        sections = [l for l in sections if l]
+        sections = [s for s in sections if s]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.doc$", filename, re.IGNORECASE):
@ -182,7 +185,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        binary = BytesIO(binary)
        doc_parsed = parser.from_buffer(binary)
        sections = doc_parsed['content'].split('\n')
-        sections = [l for l in sections if l]
+        sections = [s for s in sections if s]
        callback(0.8, "Finish parsing.")

    else:
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@ -190,7 +190,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)
        if sections and len(sections[0]) < 3:
-            sections = [(t, l, [[0] * 5]) for t, l in sections]
+            sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
        # set pivot using the most frequent type of title,
        # then merge between 2 pivot
        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
@ -211,7 +211,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        else:
            bull = bullets_category([txt for txt, _, _ in sections])
            most_level, levels = title_frequency(
-                bull, [(txt, l) for txt, l, poss in sections])
+                bull, [(txt, lvl) for txt, lvl, _ in sections])

        assert len(sections) == len(levels)
        sec_ids = []
@ -225,7 +225,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        sections = [(txt, sec_ids[i], poss)
                    for i, (txt, _, poss) in enumerate(sections)]
        for (img, rows), poss in tbls:
-            if not rows: continue
+            if not rows:
+                continue
            sections.append((rows if isinstance(rows, str) else rows[0], -1,
                            [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))

--- a/rag/app/one.py
+++ b/rag/app/one.py
@ -54,7 +54,8 @@ class Pdf(PdfParser):
        sections = [(b["text"], self.get_position(b, zoomin))
                    for i, b in enumerate(self.boxes)]
        for (img, rows), poss in tbls:
-            if not rows:continue
+            if not rows:
+                continue
            sections.append((rows if isinstance(rows, str) else rows[0],
                             [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
        return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
@ -109,7 +110,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        binary = BytesIO(binary)
        doc_parsed = parser.from_buffer(binary)
        sections = doc_parsed['content'].split('\n')
-        sections = [l for l in sections if l]
+        sections = [s for s in sections if s]
        callback(0.8, "Finish parsing.")

    else:
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@ -171,7 +171,7 @@ class Pdf(PdfParser):
        tbl_bottom = tbls[tbl_index][1][0][4]
        tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
-        tbl_text = ''.join(tbls[tbl_index][0][1])
+        _tbl_text = ''.join(tbls[tbl_index][0][1])
        return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag,


@ -325,9 +325,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
        txt = get_text(filename, binary)
        lines = txt.split("\n")
        comma, tab = 0, 0
-        for l in lines:
-            if len(l.split(",")) == 2: comma += 1
-            if len(l.split("\t")) == 2: tab += 1
+        for line in lines:
+            if len(line.split(",")) == 2:
+                comma += 1
+            if len(line.split("\t")) == 2:
+                tab += 1
        delimiter = "\t" if tab >= comma else ","

        fails = []
@ -336,18 +338,21 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
        while i < len(lines):
            arr = lines[i].split(delimiter)
            if len(arr) != 2:
-                if question: answer += "\n" + lines[i]
+                if question:
+                    answer += "\n" + lines[i]
                else:
                    fails.append(str(i+1))
            elif len(arr) == 2:
-                if question and answer: res.append(beAdoc(deepcopy(doc), question, answer, eng))
+                if question and answer:
+                    res.append(beAdoc(deepcopy(doc), question, answer, eng))
                question, answer = arr
            i += 1
            if len(res) % 999 == 0:
                callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
                    f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))

-        if question: res.append(beAdoc(deepcopy(doc), question, answer, eng))
+        if question:
+            res.append(beAdoc(deepcopy(doc), question, answer, eng))

        callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
            f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
@ -367,19 +372,18 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
        callback(0.1, "Start to parse.")
        txt = get_text(filename, binary)
        lines = txt.split("\n")
-        last_question, last_answer = "", ""
+        _last_question, last_answer = "", ""
        question_stack, level_stack = [], []
        code_block = False
-        level_index = [-1] * 7
-        for index, l in enumerate(lines):
-            if l.strip().startswith('```'):
+        for index, line in enumerate(lines):
+            if line.strip().startswith('```'):
                code_block = not code_block
            question_level, question = 0, ''
            if not code_block:
-                question_level, question = mdQuestionLevel(l)
+                question_level, question = mdQuestionLevel(line)

            if not question_level or question_level > 6: # not a question
-                last_answer = f'{last_answer}\n{l}'
+                last_answer = f'{last_answer}\n{line}'
            else:   # is a question
                if last_answer.strip():
                    sum_question = '\n'.join(question_stack)
--- a/rag/app/table.py
+++ b/rag/app/table.py
@ -41,14 +41,16 @@ class Excel(ExcelParser):
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
-            if not rows:continue
+            if not rows:
+                continue
            headers = [cell.value for cell in rows[0]]
            missed = set([i for i, h in enumerate(headers) if h is None])
            headers = [
                cell.value for i,
                cell in enumerate(
                    rows[0]) if i not in missed]
-            if not headers:continue
+            if not headers:
+                continue
            data = []
            for i, r in enumerate(rows[1:]):
                rn += 1
@ -88,7 +90,6 @@ def trans_bool(s):

 def column_data_type(arr):
    arr = list(arr)
-    uni = len(set([a for a in arr if a is not None]))
    counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
    trans = {t: f for f, t in
             [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
@ -157,7 +158,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                continue
            if i >= to_page:
                break
-            row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
+            row = [field for field in line.split(kwargs.get("delimiter", "\t"))]
            if len(row) != len(headers):
                fails.append(str(i))
                continue