mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-31 09:05:30 +08:00
Fix IDE warnings (#12315)
### What problem does this PR solve?

As title.

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
@ -210,8 +210,8 @@ class Docx(DocxParser):
|
|||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
||||||
continue
|
continue
|
||||||
except Exception:
|
except Exception as e:
|
||||||
logging.info("The recognized image stream appears to be corrupted. Skipping image.")
|
logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}")
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
image = Image.open(BytesIO(image_blob)).convert('RGB')
|
image = Image.open(BytesIO(image_blob)).convert('RGB')
|
||||||
@ -219,7 +219,8 @@ class Docx(DocxParser):
|
|||||||
res_img = image
|
res_img = image
|
||||||
else:
|
else:
|
||||||
res_img = concat_img(res_img, image)
|
res_img = concat_img(res_img, image)
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
logging.warning(f"Fail to open or concat images, exception: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return res_img
|
return res_img
|
||||||
@ -553,7 +554,8 @@ class Markdown(MarkdownParser):
|
|||||||
if (src, line_no) not in seen:
|
if (src, line_no) not in seen:
|
||||||
urls.append({"url": src, "line": line_no})
|
urls.append({"url": src, "line": line_no})
|
||||||
seen.add((src, line_no))
|
seen.add((src, line_no))
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
logging.error("Failed to extract image urls: {}".format(e))
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return urls
|
return urls
|
||||||
@ -698,8 +700,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|||||||
**kwargs) or []
|
**kwargs) or []
|
||||||
embed_res.extend(sub_res)
|
embed_res.extend(sub_res)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
error_msg = f"Failed to chunk embed {embed_filename}: {e}"
|
||||||
|
logging.error(error_msg)
|
||||||
if callback:
|
if callback:
|
||||||
callback(0.05, f"Failed to chunk embed {embed_filename}: {e}")
|
callback(0.05, error_msg)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||||
@ -839,7 +843,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|||||||
try:
|
try:
|
||||||
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
|
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
|
||||||
callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
|
callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
logging.warning(f"Failed to detect figure extraction: {e}")
|
||||||
vision_model = None
|
vision_model = None
|
||||||
|
|
||||||
if vision_model:
|
if vision_model:
|
||||||
@ -905,8 +910,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|||||||
sections = [(_, "") for _ in sections if _]
|
sections = [(_, "") for _ in sections if _]
|
||||||
callback(0.8, "Finish parsing.")
|
callback(0.8, "Finish parsing.")
|
||||||
else:
|
else:
|
||||||
callback(0.8, f"tika.parser got empty content from {filename}.")
|
error_msg = f"tika.parser got empty content from {filename}."
|
||||||
logging.warning(f"tika.parser got empty content from {filename}.")
|
callback(0.8, error_msg)
|
||||||
|
logging.warning(error_msg)
|
||||||
return []
|
return []
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
|
|||||||
@ -42,16 +42,16 @@ class Excel(ExcelParser):
|
|||||||
else:
|
else:
|
||||||
wb = Excel._load_excel_to_workbook(BytesIO(binary))
|
wb = Excel._load_excel_to_workbook(BytesIO(binary))
|
||||||
total = 0
|
total = 0
|
||||||
for sheetname in wb.sheetnames:
|
for sheet_name in wb.sheetnames:
|
||||||
total += len(list(wb[sheetname].rows))
|
total += len(list(wb[sheet_name].rows))
|
||||||
res, fails, done = [], [], 0
|
res, fails, done = [], [], 0
|
||||||
rn = 0
|
rn = 0
|
||||||
flow_images = []
|
flow_images = []
|
||||||
pending_cell_images = []
|
pending_cell_images = []
|
||||||
tables = []
|
tables = []
|
||||||
for sheetname in wb.sheetnames:
|
for sheet_name in wb.sheetnames:
|
||||||
ws = wb[sheetname]
|
ws = wb[sheet_name]
|
||||||
images = Excel._extract_images_from_worksheet(ws, sheetname=sheetname)
|
images = Excel._extract_images_from_worksheet(ws, sheetname=sheet_name)
|
||||||
if images:
|
if images:
|
||||||
image_descriptions = vision_figure_parser_figure_xlsx_wrapper(images=images, callback=callback,
|
image_descriptions = vision_figure_parser_figure_xlsx_wrapper(images=images, callback=callback,
|
||||||
**kwargs)
|
**kwargs)
|
||||||
@ -59,7 +59,7 @@ class Excel(ExcelParser):
|
|||||||
for i, bf in enumerate(image_descriptions):
|
for i, bf in enumerate(image_descriptions):
|
||||||
images[i]["image_description"] = "\n".join(bf[0][1])
|
images[i]["image_description"] = "\n".join(bf[0][1])
|
||||||
for img in images:
|
for img in images:
|
||||||
if (img["span_type"] == "single_cell" and img.get("image_description")):
|
if img["span_type"] == "single_cell" and img.get("image_description"):
|
||||||
pending_cell_images.append(img)
|
pending_cell_images.append(img)
|
||||||
else:
|
else:
|
||||||
flow_images.append(img)
|
flow_images.append(img)
|
||||||
@ -67,7 +67,7 @@ class Excel(ExcelParser):
|
|||||||
try:
|
try:
|
||||||
rows = list(ws.rows)
|
rows = list(ws.rows)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f"Skip sheet '{sheetname}' due to rows access error: {e}")
|
logging.warning(f"Skip sheet '{sheet_name}' due to rows access error: {e}")
|
||||||
continue
|
continue
|
||||||
if not rows:
|
if not rows:
|
||||||
continue
|
continue
|
||||||
@ -303,7 +303,8 @@ class Excel(ExcelParser):
|
|||||||
def trans_datatime(s):
|
def trans_datatime(s):
|
||||||
try:
|
try:
|
||||||
return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S")
|
return datetime_parse(s.strip()).strftime("%Y-%m-%d %H:%M:%S")
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
logging.warning(f"Failed to parse date from {s}, error: {e}")
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@ -312,6 +313,7 @@ def trans_bool(s):
|
|||||||
return "yes"
|
return "yes"
|
||||||
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
|
if re.match(r"(false|no|否|⍻|×)$", str(s).strip(), flags=re.IGNORECASE):
|
||||||
return "no"
|
return "no"
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def column_data_type(arr):
|
def column_data_type(arr):
|
||||||
@ -346,8 +348,9 @@ def column_data_type(arr):
|
|||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
arr[i] = trans[ty](str(arr[i]))
|
arr[i] = trans[ty](str(arr[i]))
|
||||||
except Exception:
|
except Exception as e:
|
||||||
arr[i] = None
|
arr[i] = None
|
||||||
|
logging.warning(f"Column {i}: {e}")
|
||||||
# if ty == "text":
|
# if ty == "text":
|
||||||
# if len(arr) > 128 and uni / len(arr) < 0.1:
|
# if len(arr) > 128 and uni / len(arr) < 0.1:
|
||||||
# ty = "keyword"
|
# ty = "keyword"
|
||||||
|
|||||||
Reference in New Issue
Block a user