Fix IDE warnings (#12315)

### What problem does this PR solve?

As title.

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
2025-12-31 01:01:30 +08:00 · 2025-12-30 15:04:09 +08:00
parent 4037788e0c
commit f0392e7501
2 changed files with 26 additions and 17 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -210,8 +210,8 @@ class Docx(DocxParser):
            except UnicodeDecodeError:
                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
                continue
-            except Exception:
-                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
+            except Exception as e:
+                logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}")
                continue
            try:
                image = Image.open(BytesIO(image_blob)).convert('RGB')
@ -219,7 +219,8 @@ class Docx(DocxParser):
                    res_img = image
                else:
                    res_img = concat_img(res_img, image)
-            except Exception:
+            except Exception as e:
+                logging.warning(f"Fail to open or concat images, exception: {e}")
                continue

        return res_img
@ -553,7 +554,8 @@ class Markdown(MarkdownParser):
                if (src, line_no) not in seen:
                    urls.append({"url": src, "line": line_no})
                    seen.add((src, line_no))
-        except Exception:
+        except Exception as e:
+            logging.error("Failed to extract image urls: {}".format(e))
            pass

        return urls
@ -698,8 +700,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
                                **kwargs) or []
                embed_res.extend(sub_res)
            except Exception as e:
+                error_msg = f"Failed to chunk embed {embed_filename}: {e}"
+                logging.error(error_msg)
                if callback:
-                    callback(0.05, f"Failed to chunk embed {embed_filename}: {e}")
+                    callback(0.05, error_msg)
                continue

    if re.search(r"\.docx$", filename, re.IGNORECASE):
@ -839,7 +843,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        try:
            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
            callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
-        except Exception:
+        except Exception as e:
+            logging.warning(f"Failed to detect figure extraction: {e}")
            vision_model = None

        if vision_model:
@ -905,8 +910,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
            sections = [(_, "") for _ in sections if _]
            callback(0.8, "Finish parsing.")
        else:
-            callback(0.8, f"tika.parser got empty content from {filename}.")
-            logging.warning(f"tika.parser got empty content from {filename}.")
+            error_msg = f"tika.parser got empty content from {filename}."
+            callback(0.8, error_msg)
+            logging.warning(error_msg)
            return []
    else:
        raise NotImplementedError(
--- a/rag/app/table.py
+++ b/rag/app/table.py
@ -42,16 +42,16 @@ class Excel(ExcelParser):
        else:
            wb = Excel._load_excel_to_workbook(BytesIO(binary))
        total = 0
-        for sheetname in wb.sheetnames:
-            total += len(list(wb[sheetname].rows))
+        for sheet_name in wb.sheetnames:
+            total += len(list(wb[sheet_name].rows))
        res, fails, done = [], [], 0
        rn = 0
        flow_images = []
        pending_cell_images = []
        tables = []
-        for sheetname in wb.sheetnames:
-            ws = wb[sheetname]
-            images = Excel._extract_images_from_worksheet(ws, sheetname=sheetname)
+        for sheet_name in wb.sheetnames:
+            ws = wb[sheet_name]
+            images = Excel._extract_images_from_worksheet(ws, sheetname=sheet_name)
            if images:
                image_descriptions = vision_figure_parser_figure_xlsx_wrapper(images=images, callback=callback,
                                                                              **kwargs)
@ -59,7 +59,7 @@ class Excel(ExcelParser):
                    for i, bf in enumerate(image_descriptions):
                        images[i]["image_description"] = "\n".join(bf[0][1])
                    for img in images:
-                        if (img["span_type"] == "single_cell" and img.get("image_description")):
+                        if img["span_type"] == "single_cell" and img.get("image_description"):
                            pending_cell_images.append(img)
                        else:
                            flow_images.append(img)
@ -67,7 +67,7 @@ class Excel(ExcelParser):
            try:
                rows = list(ws.rows)
            except Exception as e:
-                logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
+                logging.warning(f"Skip sheet '{sheet_name}' due to rows access error: {e}")
                continue
            if not rows:
                continue
@ -303,7 +303,8 @@ class Excel(ExcelParser):
 def trans_datatime(s):
    try:
        return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S")
-    except Exception:
+    except Exception as e:
+        logging.warning(f"Failed to parse date from {s}, error: {e}")
        pass


@ -312,6 +313,7 @@ def trans_bool(s):
        return "yes"
    if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
        return "no"
+    return None


 def column_data_type(arr):
@ -346,8 +348,9 @@ def column_data_type(arr):
            continue
        try:
            arr[i] = trans[ty](str(arr[i]))
-        except Exception:
+        except Exception as e:
            arr[i] = None
+            logging.warning(f"Column {i}: {e}")
    # if ty == "text":
    #    if len(arr) > 128 and uni / len(arr) < 0.1:
    #        ty = "keyword"