diff --git a/rag/app/naive.py b/rag/app/naive.py index 8811c6b70..200059707 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -210,8 +210,8 @@ class Docx(DocxParser): except UnicodeDecodeError: logging.info("The recognized image stream appears to be corrupted. Skipping image.") continue - except Exception: - logging.info("The recognized image stream appears to be corrupted. Skipping image.") + except Exception as e: + logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}") continue try: image = Image.open(BytesIO(image_blob)).convert('RGB') @@ -219,7 +219,8 @@ class Docx(DocxParser): res_img = image else: res_img = concat_img(res_img, image) - except Exception: + except Exception as e: + logging.warning(f"Fail to open or concat images, exception: {e}") continue return res_img @@ -553,7 +554,8 @@ class Markdown(MarkdownParser): if (src, line_no) not in seen: urls.append({"url": src, "line": line_no}) seen.add((src, line_no)) - except Exception: + except Exception as e: + logging.error("Failed to extract image urls: {}".format(e)) pass return urls @@ -698,8 +700,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca **kwargs) or [] embed_res.extend(sub_res) except Exception as e: + error_msg = f"Failed to chunk embed {embed_filename}: {e}" + logging.error(error_msg) if callback: - callback(0.05, f"Failed to chunk embed {embed_filename}: {e}") + callback(0.05, error_msg) continue if re.search(r"\.docx$", filename, re.IGNORECASE): @@ -839,7 +843,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca try: vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) callback(0.2, "Visual model detected. Attempting to enhance figure extraction...") - except Exception: + except Exception as e: + logging.warning(f"Failed to detect figure extraction: {e}") vision_model = None if vision_model: @@ -905,8 +910,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca sections = [(_, "") for _ in sections if _] callback(0.8, "Finish parsing.") else: - callback(0.8, f"tika.parser got empty content from {filename}.") - logging.warning(f"tika.parser got empty content from {filename}.") + error_msg = f"tika.parser got empty content from {filename}." + callback(0.8, error_msg) + logging.warning(error_msg) return [] else: raise NotImplementedError( diff --git a/rag/app/table.py b/rag/app/table.py index 4ffbee367..f931d2849 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -42,16 +42,16 @@ class Excel(ExcelParser): else: wb = Excel._load_excel_to_workbook(BytesIO(binary)) total = 0 - for sheetname in wb.sheetnames: - total += len(list(wb[sheetname].rows)) + for sheet_name in wb.sheetnames: + total += len(list(wb[sheet_name].rows)) res, fails, done = [], [], 0 rn = 0 flow_images = [] pending_cell_images = [] tables = [] - for sheetname in wb.sheetnames: - ws = wb[sheetname] - images = Excel._extract_images_from_worksheet(ws, sheetname=sheetname) + for sheet_name in wb.sheetnames: + ws = wb[sheet_name] + images = Excel._extract_images_from_worksheet(ws, sheetname=sheet_name) if images: image_descriptions = vision_figure_parser_figure_xlsx_wrapper(images=images, callback=callback, **kwargs) @@ -59,7 +59,7 @@ class Excel(ExcelParser): for i, bf in enumerate(image_descriptions): images[i]["image_description"] = "\n".join(bf[0][1]) for img in images: - if (img["span_type"] == "single_cell" and img.get("image_description")): + if img["span_type"] == "single_cell" and img.get("image_description"): pending_cell_images.append(img) else: flow_images.append(img) @@ -67,7 +67,7 @@ class Excel(ExcelParser): try: rows = list(ws.rows) except Exception as e: - logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}") + logging.warning(f"Skip sheet '{sheet_name}' due to rows access error: {e}") continue if not rows: continue @@ -303,7 +303,8 @@ class Excel(ExcelParser): def trans_datatime(s): try: return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S") - except Exception: + except Exception as e: + logging.warning(f"Failed to parse date from {s}, error: {e}") pass @@ -312,6 +313,7 @@ def trans_bool(s): return "yes" if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE): return "no" + return None def column_data_type(arr): @@ -346,8 +348,9 @@ def column_data_type(arr): continue try: arr[i] = trans[ty](str(arr[i])) - except Exception: + except Exception as e: arr[i] = None + logging.warning(f"Column {i}: {e}") # if ty == "text": # if len(arr) > 128 and uni / len(arr) < 0.1: # ty = "keyword"