diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py
index 906de8135..a8087ef81 100644
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -18,6 +18,7 @@
 from io import BytesIO
 
 import pandas as pd
 from openpyxl import Workbook, load_workbook
+from PIL import Image
 
 from rag.nlp import find_codec
@@ -109,6 +110,52 @@ class RAGFlowExcelParser:
                     ws.cell(row=row_num, column=col_num, value=value)
 
         return wb
 
+    @staticmethod
+    def _extract_images_from_worksheet(ws, sheetname=None):
+        """
+        Extract embedded images from a worksheet together with their anchor positions.
+
+        Returns: List[dict] with keys sheet, image, image_description, row_from, col_from, row_to, col_to, span_type.
+        """
+        images = getattr(ws, "_images", [])
+        if not images:
+            return []
+
+        raw_items = []
+
+        for img in images:
+            try:
+                img_bytes = img._data()  # openpyxl's private accessor for the raw image bytes
+                pil_img = Image.open(BytesIO(img_bytes)).convert("RGB")
+
+                anchor = img.anchor
+                if getattr(anchor, "to", None) is not None:  # TwoCellAnchor: image spans a cell range
+                    r1, c1 = anchor._from.row + 1, anchor._from.col + 1
+                    r2, c2 = anchor.to.row + 1, anchor.to.col + 1
+                    if r1 == r2 and c1 == c2:
+                        span = "single_cell"
+                    else:
+                        span = "multi_cell"
+                else:  # OneCellAnchor: only the top-left corner is recorded
+                    r1, c1 = anchor._from.row + 1, anchor._from.col + 1
+                    r2, c2 = r1, c1
+                    span = "single_cell"
+
+                item = {
+                    "sheet": sheetname or ws.title,
+                    "image": pil_img,
+                    "image_description": "",
+                    "row_from": r1,
+                    "col_from": c1,
+                    "row_to": r2,
+                    "col_to": c2,
+                    "span_type": span,
+                }
+                raw_items.append(item)
+            except Exception:
+                continue  # skip images that cannot be decoded or anchored
+        return raw_items
+
     def html(self, fnm, chunk_rows=256):
         from html import escape
diff --git a/deepdoc/parser/figure_parser.py b/deepdoc/parser/figure_parser.py
index 0cb47b601..bbdb52abf 100644
--- a/deepdoc/parser/figure_parser.py
+++ b/deepdoc/parser/figure_parser.py
@@ -55,6 +55,33 @@ def vision_figure_parser_docx_wrapper(sections, tbls, callback=None,**kwargs):
         callback(0.8, f"Visual model error: {e}. Skipping figure parsing enhancement.")
     return tbls
+
+
+def vision_figure_parser_figure_xlsx_wrapper(images, callback=None, **kwargs):
+    tbls = []
+    if not images:
+        return tbls
+    try:
+        vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
+    except Exception:
+        vision_model = None
+    if vision_model:
+        callback(0.2, "Visual model detected. Attempting to enhance Excel image extraction...")
+        figures_data = [
+            (
+                (img["image"], [img["image_description"]]),  # (PIL image, list of descriptions)
+                [(0, 0, 0, 0, 0)],  # dummy position, unused for Excel images
+            )
+            for img in images
+        ]
+        try:
+            parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
+            callback(0.22, "Parsing images...")
+            boosted_figures = parser(callback=callback)
+            tbls.extend(boosted_figures)
+        except Exception as e:
+            callback(0.25, f"Excel visual model error: {e}. Skipping vision enhancement.")
+    return tbls
 
 
 def vision_figure_parser_pdf_wrapper(tbls, callback=None, **kwargs):
     if not tbls:
diff --git a/rag/app/table.py b/rag/app/table.py
index a87a858bf..bb6a4d007 100644
--- a/rag/app/table.py
+++ b/rag/app/table.py
@@ -29,13 +29,14 @@ from collections import Counter
 from dateutil.parser import parse as datetime_parse
 
 from api.db.services.knowledgebase_service import KnowledgebaseService
+from deepdoc.parser.figure_parser import vision_figure_parser_figure_xlsx_wrapper
 from deepdoc.parser.utils import get_text
-from rag.nlp import rag_tokenizer, tokenize
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table
 from deepdoc.parser import ExcelParser
 
 
 class Excel(ExcelParser):
-    def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None):
+    def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None, **kwargs):
         if not binary:
             wb = Excel._load_excel_to_workbook(fnm)
         else:
@@ -45,8 +46,23 @@ class Excel(ExcelParser):
             total += len(list(wb[sheetname].rows))
         res, fails, done = [], [], 0
         rn = 0
+        flow_images = []
+        tables = []
         for sheetname in wb.sheetnames:
             ws = wb[sheetname]
+            pending_cell_images = []  # per sheet, so placed images are not re-matched against later sheets
+            images = Excel._extract_images_from_worksheet(ws, sheetname=sheetname)
+            if images:
+                image_descriptions = vision_figure_parser_figure_xlsx_wrapper(images=images, callback=callback, **kwargs)
+                if image_descriptions and len(image_descriptions) == len(images):
+                    for i, bf in enumerate(image_descriptions):
+                        images[i]["image_description"] = "\n".join(bf[0][1])  # bf == ((image, descriptions), positions)
+                for img in images:
+                    if img["span_type"] == "single_cell" and img.get("image_description"):
+                        pending_cell_images.append(img)
+                    else:
+                        flow_images.append(img)
+
             try:
                 rows = list(ws.rows)
             except Exception as e:
@@ -75,9 +91,38 @@ class Excel(ExcelParser):
             if len(data) == 0:
                 continue
             df = pd.DataFrame(data, columns=headers)
+            for img in pending_cell_images:
+                excel_row = img["row_from"] - 1
+                excel_col = img["col_from"] - 1
+
+                df_row_idx = excel_row - header_rows  # translate sheet row to DataFrame row
+                if df_row_idx < 0 or df_row_idx >= len(df):
+                    flow_images.append(img)
+                    continue
+
+                if excel_col < 0 or excel_col >= len(df.columns):
+                    flow_images.append(img)
+                    continue
+
+                col_name = df.columns[excel_col]
+
+                if not df.iloc[df_row_idx][col_name]:  # only fill cells that are empty
+                    df.iat[df_row_idx, excel_col] = img["image_description"]
             res.append(df)
+        for img in flow_images:
+            tables.append(
+                (
+                    (
+                        img["image"],  # PIL.Image.Image
+                        [img["image_description"]],  # descriptions must be a list
+                    ),
+                    [
+                        (0, 0, 0, 0, 0)  # dummy position
+                    ],
+                )
+            )
         callback(0.3, ("Extract records: {}~{}".format(from_page + 1, min(to_page, from_page + rn)) + (f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
-        return res
+        return res, tables
 
     def _parse_headers(self, ws, rows):
         if len(rows) == 0:
@@ -320,11 +365,11 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese
         Excel and csv(txt) format files are supported.
         Every row in table will be treated as a chunk.
""" - + tbls = [] + is_english = lang.lower() == "english" if re.search(r"\.xlsx?$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") excel_parser = Excel() - dfs = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback) + dfs,tbls = excel_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback, **kwargs) elif re.search(r"\.txt$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = get_text(filename, binary) @@ -419,7 +465,9 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese continue tokenize(d, "; ".join(row_txt), eng) res.append(d) - + if tbls: + doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))} + res.extend(tokenize_table(tbls, doc, is_english)) KnowledgebaseService.update_parser_config(kwargs["kb_id"], {"field_map": {k: v for k, v in clmns_map}}) callback(0.35, "")