Fix errors detected by Ruff (#3918)

### What problem does this PR solve?

Fix errors detected by Ruff

### Type of change

- [x] Refactoring
This commit is contained in:
Zhichang Yu
2024-12-08 14:21:12 +08:00
committed by GitHub
parent e267a026f3
commit 0d68a6cd1b
97 changed files with 2558 additions and 1976 deletions

View File

@ -94,7 +94,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [(l, "") for l in sections if l]
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
@ -102,7 +102,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [(l, "") for l in sections if l]
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
@ -112,7 +112,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [(l, "") for l in sections if l]
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")

View File

@ -75,7 +75,7 @@ def chunk(
_add_content(msg, msg.get_content_type())
sections = TxtParser.parser_txt("\n".join(text_txt)) + [
(l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
(line, "") for line in HtmlParser.parser_txt("\n".join(html_txt)) if line
]
st = timer()

View File

@ -18,7 +18,8 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
chunks = build_knowledge_graph_chunks(tenant_id, sections, callback,
parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
)
for c in chunks: c["docnm_kwd"] = filename
for c in chunks:
c["docnm_kwd"] = filename
doc = {
"docnm_kwd": filename,

View File

@ -48,7 +48,7 @@ class Docx(DocxParser):
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
return [l for l in lines if l]
return [line for line in lines if line]
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
@ -60,7 +60,8 @@ class Docx(DocxParser):
if pn > to_page:
break
question_level, p_text = docx_question_level(p, bull)
if not p_text.strip("\n"):continue
if not p_text.strip("\n"):
continue
lines.append((question_level, p_text))
for run in p.runs:
@ -78,19 +79,21 @@ class Docx(DocxParser):
if lines[e][0] <= lines[s][0]:
break
e += 1
if e - s == 1 and visit[s]: continue
if e - s == 1 and visit[s]:
continue
sec = []
next_level = lines[s][0] + 1
while not sec and next_level < 22:
for i in range(s+1, e):
if lines[i][0] != next_level: continue
if lines[i][0] != next_level:
continue
sec.append(lines[i][1])
visit[i] = True
next_level += 1
sec.insert(0, lines[s][1])
sections.append("\n".join(sec))
return [l for l in sections if l]
return [s for s in sections if s]
def __str__(self) -> str:
return f'''
@ -168,13 +171,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [l for l in sections if l]
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [l for l in sections if l]
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
@ -182,7 +185,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [l for l in sections if l]
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:

View File

@ -190,7 +190,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
if sections and len(sections[0]) < 3:
sections = [(t, l, [[0] * 5]) for t, l in sections]
sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
# set pivot using the most frequent type of title,
# then merge between 2 pivot
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
@ -211,7 +211,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
else:
bull = bullets_category([txt for txt, _, _ in sections])
most_level, levels = title_frequency(
bull, [(txt, l) for txt, l, poss in sections])
bull, [(txt, lvl) for txt, lvl, _ in sections])
assert len(sections) == len(levels)
sec_ids = []
@ -225,7 +225,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sections = [(txt, sec_ids[i], poss)
for i, (txt, _, poss) in enumerate(sections)]
for (img, rows), poss in tbls:
if not rows: continue
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0], -1,
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))

View File

@ -54,7 +54,8 @@ class Pdf(PdfParser):
sections = [(b["text"], self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)]
for (img, rows), poss in tbls:
if not rows:continue
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
@ -109,7 +110,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [l for l in sections if l]
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:

View File

@ -171,7 +171,7 @@ class Pdf(PdfParser):
tbl_bottom = tbls[tbl_index][1][0][4]
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
tbl_text = ''.join(tbls[tbl_index][0][1])
_tbl_text = ''.join(tbls[tbl_index][0][1])
return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag,
@ -325,9 +325,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
txt = get_text(filename, binary)
lines = txt.split("\n")
comma, tab = 0, 0
for l in lines:
if len(l.split(",")) == 2: comma += 1
if len(l.split("\t")) == 2: tab += 1
for line in lines:
if len(line.split(",")) == 2:
comma += 1
if len(line.split("\t")) == 2:
tab += 1
delimiter = "\t" if tab >= comma else ","
fails = []
@ -336,18 +338,21 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
while i < len(lines):
arr = lines[i].split(delimiter)
if len(arr) != 2:
if question: answer += "\n" + lines[i]
if question:
answer += "\n" + lines[i]
else:
fails.append(str(i+1))
elif len(arr) == 2:
if question and answer: res.append(beAdoc(deepcopy(doc), question, answer, eng))
if question and answer:
res.append(beAdoc(deepcopy(doc), question, answer, eng))
question, answer = arr
i += 1
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
if question: res.append(beAdoc(deepcopy(doc), question, answer, eng))
if question:
res.append(beAdoc(deepcopy(doc), question, answer, eng))
callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
@ -367,19 +372,18 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
last_question, last_answer = "", ""
_last_question, last_answer = "", ""
question_stack, level_stack = [], []
code_block = False
level_index = [-1] * 7
for index, l in enumerate(lines):
if l.strip().startswith('```'):
for index, line in enumerate(lines):
if line.strip().startswith('```'):
code_block = not code_block
question_level, question = 0, ''
if not code_block:
question_level, question = mdQuestionLevel(l)
question_level, question = mdQuestionLevel(line)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{l}'
last_answer = f'{last_answer}\n{line}'
else: # is a question
if last_answer.strip():
sum_question = '\n'.join(question_stack)

View File

@ -41,14 +41,16 @@ class Excel(ExcelParser):
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:continue
if not rows:
continue
headers = [cell.value for cell in rows[0]]
missed = set([i for i, h in enumerate(headers) if h is None])
headers = [
cell.value for i,
cell in enumerate(
rows[0]) if i not in missed]
if not headers:continue
if not headers:
continue
data = []
for i, r in enumerate(rows[1:]):
rn += 1
@ -88,7 +90,6 @@ def trans_bool(s):
def column_data_type(arr):
arr = list(arr)
uni = len(set([a for a in arr if a is not None]))
counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
trans = {t: f for f, t in
[(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
@ -157,7 +158,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
continue
if i >= to_page:
break
row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
row = [field for field in line.split(kwargs.get("delimiter", "\t"))]
if len(row) != len(headers):
fails.append(str(i))
continue