Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2025-12-29 12:01:18 +08:00
committed by GitHub
parent 647fb115a0
commit 01f0ced1e6
43 changed files with 817 additions and 637 deletions

View File

@ -51,14 +51,15 @@ class Excel(ExcelParser):
tables = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
images = Excel._extract_images_from_worksheet(ws,sheetname=sheetname)
images = Excel._extract_images_from_worksheet(ws, sheetname=sheetname)
if images:
image_descriptions = vision_figure_parser_figure_xlsx_wrapper(images=images, callback=callback, **kwargs)
image_descriptions = vision_figure_parser_figure_xlsx_wrapper(images=images, callback=callback,
**kwargs)
if image_descriptions and len(image_descriptions) == len(images):
for i, bf in enumerate(image_descriptions):
images[i]["image_description"] = "\n".join(bf[0][1])
for img in images:
if (img["span_type"] == "single_cell"and img.get("image_description")):
if (img["span_type"] == "single_cell" and img.get("image_description")):
pending_cell_images.append(img)
else:
flow_images.append(img)
@ -113,16 +114,17 @@ class Excel(ExcelParser):
tables.append(
(
(
img["image"], # Image.Image
[img["image_description"]] # description list (must be list)
img["image"], # Image.Image
[img["image_description"]] # description list (must be list)
),
[
(0, 0, 0, 0, 0) # dummy position
(0, 0, 0, 0, 0) # dummy position
]
)
)
callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res,tables
callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
return res, tables
def _parse_headers(self, ws, rows):
if len(rows) == 0:
@ -315,14 +317,15 @@ def trans_bool(s):
def column_data_type(arr):
arr = list(arr)
counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
trans = {t: f for f, t in [(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
trans = {t: f for f, t in
[(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
float_flag = False
for a in arr:
if a is None:
continue
if re.match(r"[+-]?[0-9]+$", str(a).replace("%%", "")) and not str(a).replace("%%", "").startswith("0"):
counts["int"] += 1
if int(str(a)) > 2**63 - 1:
if int(str(a)) > 2 ** 63 - 1:
float_flag = True
break
elif re.match(r"[+-]?[0-9.]{,19}$", str(a).replace("%%", "")) and not str(a).replace("%%", "").startswith("0"):
@ -370,7 +373,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = Excel()
dfs,tbls = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback, **kwargs)
dfs, tbls = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback, **kwargs)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
@ -389,7 +392,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
continue
rows.append(row)
callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(0.3, ("Extract records: {}~{}".format(from_page, min(len(lines), to_page)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
dfs = [pd.DataFrame(np.array(rows), columns=headers)]
elif re.search(r"\.csv$", filename, re.IGNORECASE):
@ -406,7 +410,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
fails = []
rows = []
for i, row in enumerate(all_rows[1 + from_page : 1 + to_page]):
for i, row in enumerate(all_rows[1 + from_page: 1 + to_page]):
if len(row) != len(headers):
fails.append(str(i + from_page))
continue
@ -415,7 +419,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
callback(
0.3,
(f"Extract records: {from_page}~{from_page + len(rows)}" +
(f"{len(fails)} failure, line: {','.join(fails[:3])}..." if fails else ""))
(f"{len(fails)} failure, line: {','.join(fails[:3])}..." if fails else ""))
)
dfs = [pd.DataFrame(rows, columns=headers)]
@ -445,7 +449,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
df[clmns[j]] = cln
if ty == "text":
txts.extend([str(c) for c in cln if c])
clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) for i in range(len(clmns))]
clmns_map = [(py_clmns[i].lower() + fieds_map[clmn_tys[i]], str(clmns[i]).replace("_", " ")) for i in
range(len(clmns))]
eng = lang.lower() == "english" # is_english(txts)
for ii, row in df.iterrows():
@ -477,7 +482,9 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], callback=dummy)