Fix IDE warnings (#12315)

### What problem does this PR solve?

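As the title says, this PR fixes IDE warnings: exceptions that were previously swallowed silently are now logged, the `sheetname` loop variable is renamed to `sheet_name`, and `trans_bool` gets an explicit `return None`.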

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2025-12-30 15:04:09 +08:00
committed by GitHub
parent 4037788e0c
commit f0392e7501
2 changed files with 26 additions and 17 deletions

View File

@ -210,8 +210,8 @@ class Docx(DocxParser):
except UnicodeDecodeError: except UnicodeDecodeError:
logging.info("The recognized image stream appears to be corrupted. Skipping image.") logging.info("The recognized image stream appears to be corrupted. Skipping image.")
continue continue
except Exception: except Exception as e:
logging.info("The recognized image stream appears to be corrupted. Skipping image.") logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}")
continue continue
try: try:
image = Image.open(BytesIO(image_blob)).convert('RGB') image = Image.open(BytesIO(image_blob)).convert('RGB')
@ -219,7 +219,8 @@ class Docx(DocxParser):
res_img = image res_img = image
else: else:
res_img = concat_img(res_img, image) res_img = concat_img(res_img, image)
except Exception: except Exception as e:
logging.warning(f"Fail to open or concat images, exception: {e}")
continue continue
return res_img return res_img
@ -553,7 +554,8 @@ class Markdown(MarkdownParser):
if (src, line_no) not in seen: if (src, line_no) not in seen:
urls.append({"url": src, "line": line_no}) urls.append({"url": src, "line": line_no})
seen.add((src, line_no)) seen.add((src, line_no))
except Exception: except Exception as e:
logging.error("Failed to extract image urls: {}".format(e))
pass pass
return urls return urls
@ -698,8 +700,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
**kwargs) or [] **kwargs) or []
embed_res.extend(sub_res) embed_res.extend(sub_res)
except Exception as e: except Exception as e:
error_msg = f"Failed to chunk embed {embed_filename}: {e}"
logging.error(error_msg)
if callback: if callback:
callback(0.05, f"Failed to chunk embed {embed_filename}: {e}") callback(0.05, error_msg)
continue continue
if re.search(r"\.docx$", filename, re.IGNORECASE): if re.search(r"\.docx$", filename, re.IGNORECASE):
@ -839,7 +843,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
try: try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.2, "Visual model detected. Attempting to enhance figure extraction...") callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
except Exception: except Exception as e:
logging.warning(f"Failed to detect figure extraction: {e}")
vision_model = None vision_model = None
if vision_model: if vision_model:
@ -905,8 +910,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
sections = [(_, "") for _ in sections if _] sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
else: else:
callback(0.8, f"tika.parser got empty content from {filename}.") error_msg = f"tika.parser got empty content from {filename}."
logging.warning(f"tika.parser got empty content from {filename}.") callback(0.8, error_msg)
logging.warning(error_msg)
return [] return []
else: else:
raise NotImplementedError( raise NotImplementedError(

View File

@ -42,16 +42,16 @@ class Excel(ExcelParser):
else: else:
wb = Excel._load_excel_to_workbook(BytesIO(binary)) wb = Excel._load_excel_to_workbook(BytesIO(binary))
total = 0 total = 0
for sheetname in wb.sheetnames: for sheet_name in wb.sheetnames:
total += len(list(wb[sheetname].rows)) total += len(list(wb[sheet_name].rows))
res, fails, done = [], [], 0 res, fails, done = [], [], 0
rn = 0 rn = 0
flow_images = [] flow_images = []
pending_cell_images = [] pending_cell_images = []
tables = [] tables = []
for sheetname in wb.sheetnames: for sheet_name in wb.sheetnames:
ws = wb[sheetname] ws = wb[sheet_name]
images = Excel._extract_images_from_worksheet(ws, sheetname=sheetname) images = Excel._extract_images_from_worksheet(ws, sheetname=sheet_name)
if images: if images:
image_descriptions = vision_figure_parser_figure_xlsx_wrapper(images=images, callback=callback, image_descriptions = vision_figure_parser_figure_xlsx_wrapper(images=images, callback=callback,
**kwargs) **kwargs)
@ -59,7 +59,7 @@ class Excel(ExcelParser):
for i, bf in enumerate(image_descriptions): for i, bf in enumerate(image_descriptions):
images[i]["image_description"] = "\n".join(bf[0][1]) images[i]["image_description"] = "\n".join(bf[0][1])
for img in images: for img in images:
if (img["span_type"] == "single_cell" and img.get("image_description")): if img["span_type"] == "single_cell" and img.get("image_description"):
pending_cell_images.append(img) pending_cell_images.append(img)
else: else:
flow_images.append(img) flow_images.append(img)
@ -67,7 +67,7 @@ class Excel(ExcelParser):
try: try:
rows = list(ws.rows) rows = list(ws.rows)
except Exception as e: except Exception as e:
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}") logging.warning(f"Skip sheet '{sheet_name}' due to rows access error: {e}")
continue continue
if not rows: if not rows:
continue continue
@ -303,7 +303,8 @@ class Excel(ExcelParser):
def trans_datatime(s): def trans_datatime(s):
try: try:
return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S") return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S")
except Exception: except Exception as e:
logging.warning(f"Failed to parse date from {s}, error: {e}")
pass pass
@ -312,6 +313,7 @@ def trans_bool(s):
return "yes" return "yes"
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE): if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
return "no" return "no"
return None
def column_data_type(arr): def column_data_type(arr):
@ -346,8 +348,9 @@ def column_data_type(arr):
continue continue
try: try:
arr[i] = trans[ty](str(arr[i])) arr[i] = trans[ty](str(arr[i]))
except Exception: except Exception as e:
arr[i] = None arr[i] = None
logging.warning(f"Column {i}: {e}")
# if ty == "text": # if ty == "text":
# if len(arr) > 128 and uni / len(arr) < 0.1: # if len(arr) > 128 and uni / len(arr) < 0.1:
# ty = "keyword" # ty = "keyword"