diff --git a/deepdoc/parser/figure_parser.py b/deepdoc/parser/figure_parser.py
index 8dfcd02d2..86b05690c 100644
--- a/deepdoc/parser/figure_parser.py
+++ b/deepdoc/parser/figure_parser.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 #
 from concurrent.futures import ThreadPoolExecutor, as_completed
+import logging

 from PIL import Image

@@ -21,7 +22,8 @@ from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
 from common.connection_utils import timeout
 from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
-from rag.prompts.generator import vision_llm_figure_describe_prompt
+from rag.prompts.generator import vision_llm_figure_describe_prompt, vision_llm_figure_describe_prompt_with_context
+from rag.nlp import append_context2table_image4pdf


 def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
@@ -84,20 +86,36 @@ def vision_figure_parser_figure_xlsx_wrapper(images,callback=None, **kwargs):
 def vision_figure_parser_pdf_wrapper(tbls, callback=None, **kwargs):
     if not tbls:
         return []
+    sections = kwargs.get("sections")
+    parser_config = kwargs.get("parser_config", {})
+    context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
     try:
         vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
         callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
     except Exception:
         vision_model = None
     if vision_model:
+
         def is_figure_item(item):
-            return (
-                isinstance(item[0][0], Image.Image) and
-                isinstance(item[0][1], list)
-            )
+            return isinstance(item[0][0], Image.Image) and isinstance(item[0][1], list)
+
         figures_data = [item for item in tbls if is_figure_item(item)]
+        figure_contexts = []
+        if sections and figures_data and context_size > 0:
+            figure_contexts = append_context2table_image4pdf(
+                sections,
+                figures_data,
+                context_size,
+                return_context=True,
+            )
         try:
-            docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
+            docx_vision_parser = VisionFigureParser(
+                vision_model=vision_model,
+                figures_data=figures_data,
+                figure_contexts=figure_contexts,
+                context_size=context_size,
+                **kwargs,
+            )
             boosted_figures = docx_vision_parser(callback=callback)
             tbls = [item for item in tbls if not is_figure_item(item)]
             tbls.extend(boosted_figures)
@@ -112,6 +130,8 @@ shared_executor = ThreadPoolExecutor(max_workers=10)
 class VisionFigureParser:
     def __init__(self, vision_model, figures_data, *args, **kwargs):
         self.vision_model = vision_model
+        self.figure_contexts = kwargs.get("figure_contexts") or []
+        self.context_size = max(0, int(kwargs.get("context_size", 0) or 0))
         self._extract_figures_info(figures_data)
         assert len(self.figures) == len(self.descriptions)
         assert not self.positions or (len(self.figures) == len(self.positions))
@@ -156,10 +176,25 @@ class VisionFigureParser:

         @timeout(30, 3)
         def process(figure_idx, figure_binary):
+            context_above = ""
+            context_below = ""
+            if figure_idx < len(self.figure_contexts):
+                context_above, context_below = self.figure_contexts[figure_idx]
+            if context_above or context_below:
+                prompt = vision_llm_figure_describe_prompt_with_context(
+                    context_above=context_above,
+                    context_below=context_below,
+                )
+                logging.info(f"[VisionFigureParser] figure={figure_idx} context_size={self.context_size} context_above_len={len(context_above)} context_below_len={len(context_below)} prompt=with_context")
+                logging.info(f"[VisionFigureParser] figure={figure_idx} context_above_snippet={context_above[:512]}")
+                logging.info(f"[VisionFigureParser] figure={figure_idx} context_below_snippet={context_below[:512]}")
+            else:
+                prompt = vision_llm_figure_describe_prompt()
+                logging.info(f"[VisionFigureParser] figure={figure_idx} context_size={self.context_size} context_len=0 prompt=default")
             description_text = picture_vision_llm_chunk(
                 binary=figure_binary,
                 vision_model=self.vision_model,
-                prompt=vision_llm_figure_describe_prompt(),
+                prompt=prompt,
                 callback=callback,
             )
             return figure_idx, description_text
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 0c85e8949..8a39bffec 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -314,7 +314,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                 tk_cnt = num_tokens_from_string(txt)
         if sec_id > -1:
             last_sid = sec_id
-    tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
+    tbls = vision_figure_parser_pdf_wrapper(
+        tbls=tbls,
+        sections=sections,
+        callback=callback,
+        **kwargs,
+    )
     res = tokenize_table(tbls, doc, eng)
     res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
     table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 7aa8c8c76..c2e028b34 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -55,9 +55,12 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
         callback=callback
     )

-    tables = vision_figure_parser_pdf_wrapper(tbls=tables,
-                                              callback=callback,
-                                              **kwargs)
+    tables = vision_figure_parser_pdf_wrapper(
+        tbls=tables,
+        sections=sections,
+        callback=callback,
+        **kwargs,
+    )

     return sections, tables, pdf_parser

diff --git a/rag/app/paper.py b/rag/app/paper.py
index 4317c7a1d..b34e7d95e 100644
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@@ -166,6 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         pdf_parser = Pdf()
         paper = pdf_parser(filename if not binary else binary,
                            from_page=from_page, to_page=to_page, callback=callback)
+        sections = paper.get("sections", [])
     else:
         kwargs.pop("parse_method", None)
         kwargs.pop("mineru_llm_name", None)
@@ -192,7 +193,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         }

         tbls = paper["tables"]
-        tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
+        tbls = vision_figure_parser_pdf_wrapper(
+            tbls=tbls,
+            sections=sections,
+            callback=callback,
+            **kwargs,
+        )
         paper["tables"] = tbls
     else:
         raise NotImplementedError("file type not supported yet(pdf supported)")
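Reviewer note (not part of the patch): a minimal sketch of the caller-side wiring the three chunkers above rely on. The wrapper pulls `sections`, `tenant_id`, and `parser_config["image_context_size"]` out of its kwargs, so enabling the feature is a matter of passing them through; the helper name, the context size of 2, and the no-op callback below are illustrative assumptions, and a tenant with an IMAGE2TEXT model is assumed to exist.

```python
# Illustrative sketch only; mirrors the call shape used by manual.py / naive.py / paper.py.
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper


def describe_pdf_figures(tbls, sections, tenant_id, callback=lambda prog, msg="": None):
    """`tbls` and `sections` come from a prior pdf_parser(...) run, as in the chunkers above."""
    return vision_figure_parser_pdf_wrapper(
        tbls=tbls,
        sections=sections,                        # page text used to build above/below context
        callback=callback,
        tenant_id=tenant_id,                      # resolves the IMAGE2TEXT model via LLMBundle
        parser_config={"image_context_size": 2},  # >0 enables the context-aware figure prompt
    )
```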
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 9c613e8ce..b41bf7ead 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -667,17 +667,42 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
     return chunks


-def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0):
+def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0, return_context=False):
     from deepdoc.parser import PdfParser
     if table_context_size <=0:
-        return tabls
+        return [] if return_context else tabls

     page_bucket = defaultdict(list)
-    for i, (txt, poss) in enumerate(sections):
-        poss = PdfParser.extract_positions(poss)
+    for i, item in enumerate(sections):
+        if isinstance(item, (tuple, list)):
+            if len(item) > 2:
+                txt, _sec_id, poss = item[0], item[1], item[2]
+            else:
+                txt = item[0] if item else ""
+                poss = item[1] if len(item) > 1 else ""
+        else:
+            txt = item
+            poss = ""
+        # Normal: (text, "@@...##") from naive parser -> poss is a position tag string.
+        # Manual: (text, sec_id, poss_list) -> poss is a list of (page, left, right, top, bottom).
+        # Paper: (text_with_@@tag, layoutno) -> poss is layoutno; parse from txt when it contains @@ tags.
+        if isinstance(poss, list):
+            poss = poss
+        elif isinstance(poss, str):
+            if "@@" not in poss and isinstance(txt, str) and "@@" in txt:
+                poss = txt
+            poss = PdfParser.extract_positions(poss)
+        else:
+            if isinstance(txt, str) and "@@" in txt:
+                poss = PdfParser.extract_positions(txt)
+            else:
+                poss = []
+        if isinstance(txt, str) and "@@" in txt:
+            txt = re.sub(r"@@[0-9-]+\t[0-9.\t]+##", "", txt).strip()
         for page, left, right, top, bottom in poss:
-            page = page[0]
-            page_bucket[page].append(((left, top, right, bottom), txt))
+            if isinstance(page, list):
+                page = page[0] if page else 0
+            page_bucket[page].append(((left, right, top, bottom), txt))

     def upper_context(page, i):
         txt = ""
@@ -720,9 +745,10 @@ def append_context2table_image4pdf(sections: list, tabls: list, table_context_si
         return txt

     res = []
+    contexts = []
     for (img, tb), poss in tabls:
-        page, left, top, right, bott = poss[0]
-        _page, _left, _top, _right, _bott = poss[-1]
+        page, left, right, top, bott = poss[0]
+        _page, _left, _right, _top, _bott = poss[-1]

         if isinstance(tb, list):
             tb = "\n".join(tb)
@@ -736,23 +762,34 @@ def append_context2table_image4pdf(sections: list, tabls: list, table_context_si
                 i = 0
                 blks = page_bucket.get(page, [])
                 continue
-            tb = upper_context(page, i) + tb + lower_context(page+1, 0)
+            upper = upper_context(page, i)
+            lower = lower_context(page + 1, 0)
+            tb = upper + tb + lower
+            contexts.append((upper.strip(), lower.strip()))
             break
-        (_, t, r, b), txt = blks[i]
+        (_, _, t, b), txt = blks[i]
         if b > top:
             break
-        (_, _t, _r, _b), _txt = blks[i+1]
+        (_, _, _t, _b), _txt = blks[i+1]
         if _t < _bott:
             i += 1
             continue
-        tb = upper_context(page, i) + tb + lower_context(page, i)
+        upper = upper_context(page, i)
+        lower = lower_context(page, i)
+        tb = upper + tb + lower
+        contexts.append((upper.strip(), lower.strip()))
         break

     if _tb == tb:
-        tb = upper_context(page, -1) + tb + lower_context(page+1, 0)
+        upper = upper_context(page, -1)
+        lower = lower_context(page + 1, 0)
+        tb = upper + tb + lower
+        contexts.append((upper.strip(), lower.strip()))
+    if len(contexts) < len(res) + 1:
+        contexts.append(("", ""))

     res.append(((img, tb), poss))

-    return res
+    return contexts if return_context else res
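Reviewer note (not part of the patch): a sketch of the two return modes `append_context2table_image4pdf` now has, as I read the hunks above; the wrapper function name is hypothetical and `sections`/`tbls` are whatever the PDF chunkers already produce.

```python
# Sketch of the two return modes (contract inferred from the hunks above, not a test).
from rag.nlp import append_context2table_image4pdf


def split_tables_and_contexts(sections, tbls, size=3):
    # Default mode (unchanged): surrounding text is merged into each table/figure's text.
    enriched = append_context2table_image4pdf(sections, tbls, table_context_size=size)
    # New mode: only the context comes back, one (context_above, context_below) string
    # pair per input item, padded with ("", "") so it stays index-aligned with `tbls`.
    contexts = append_context2table_image4pdf(sections, tbls, size, return_context=True)
    return enriched, contexts
```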
diff --git a/rag/prompts/generator.py b/rag/prompts/generator.py
index b429960eb..7c0714663 100644
--- a/rag/prompts/generator.py
+++ b/rag/prompts/generator.py
@@ -158,6 +158,7 @@ KEYWORD_PROMPT_TEMPLATE = load_prompt("keyword_prompt")
 QUESTION_PROMPT_TEMPLATE = load_prompt("question_prompt")
 VISION_LLM_DESCRIBE_PROMPT = load_prompt("vision_llm_describe_prompt")
 VISION_LLM_FIGURE_DESCRIBE_PROMPT = load_prompt("vision_llm_figure_describe_prompt")
+VISION_LLM_FIGURE_DESCRIBE_PROMPT_WITH_CONTEXT = load_prompt("vision_llm_figure_describe_prompt_with_context")
 STRUCTURED_OUTPUT_PROMPT = load_prompt("structured_output_prompt")

 ANALYZE_TASK_SYSTEM = load_prompt("analyze_task_system")
@@ -321,6 +322,11 @@ def vision_llm_figure_describe_prompt() -> str:
     return template.render()


+def vision_llm_figure_describe_prompt_with_context(context_above: str, context_below: str) -> str:
+    template = PROMPT_JINJA_ENV.from_string(VISION_LLM_FIGURE_DESCRIBE_PROMPT_WITH_CONTEXT)
+    return template.render(context_above=context_above, context_below=context_below)
+
+
 def tool_schema(tools_description: list[dict], complete_task=False):
     if not tools_description:
         return ""
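Reviewer note (not part of the patch): a self-contained sketch of the new prompt helper next to the existing one, mirroring the selection logic in `VisionFigureParser.process`; the two context strings are made-up sample text.

```python
from rag.prompts.generator import (
    vision_llm_figure_describe_prompt,
    vision_llm_figure_describe_prompt_with_context,
)

context_above = "Figure 3. Quarterly revenue by region, FY2024."    # sample text, not from the repo
context_below = "APAC grew fastest in Q4, driven by new accounts."  # sample text, not from the repo

# Same fallback as VisionFigureParser.process: use the plain prompt when no context was found.
if context_above or context_below:
    prompt = vision_llm_figure_describe_prompt_with_context(
        context_above=context_above,
        context_below=context_below,
    )
else:
    prompt = vision_llm_figure_describe_prompt()
```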
diff --git a/rag/prompts/vision_llm_figure_describe_prompt.md b/rag/prompts/vision_llm_figure_describe_prompt.md
index 7e5285641..db17b44ef 100644
--- a/rag/prompts/vision_llm_figure_describe_prompt.md
+++ b/rag/prompts/vision_llm_figure_describe_prompt.md
@@ -1,24 +1,72 @@
 ## ROLE
+
 You are an expert visual data analyst.

 ## GOAL
-Analyze the image and provide a comprehensive description of its content. Focus on identifying the type of visual data representation (e.g., bar chart, pie chart, line graph, table, flowchart), its structure, and any text captions or labels included in the image.
+
+Analyze the image and produce a textual representation strictly based on what is visible in the image.
+
+## DECISION RULE (CRITICAL)
+
+First, determine whether the image contains an explicit visual data representation with enumerable data units forming a coherent dataset.
+
+Enumerable data units are clearly separable, repeatable elements intended for comparison, measurement, or aggregation, such as:
+
+- rows or columns in a table
+- individual bars in a bar chart
+- identifiable data points or series in a line graph
+- labeled segments in a pie chart
+
+The mere presence of numbers, icons, UI elements, or labels does NOT qualify unless they together form such a dataset.

 ## TASKS
-1. Describe the overall structure of the visual representation. Specify if it is a chart, graph, table, or diagram.
-2. Identify and extract any axes, legends, titles, or labels present in the image. Provide the exact text where available.
-3. Extract the data points from the visual elements (e.g., bar heights, line graph coordinates, pie chart segments, table rows and columns).
-4. Analyze and explain any trends, comparisons, or patterns shown in the data.
-5. Capture any annotations, captions, or footnotes, and explain their relevance to the image.
-6. Only include details that are explicitly present in the image. If an element (e.g., axis, legend, or caption) does not exist or is not visible, do not mention it.
-
-## OUTPUT FORMAT (Include only sections relevant to the image content)
-- Visual Type: [Type]
-- Title: [Title text, if available]
-- Axes / Legends / Labels: [Details, if available]
-- Data Points: [Extracted data]
-- Trends / Insights: [Analysis and interpretation]
-- Captions / Annotations: [Text and relevance, if available]
+
+1. Inspect the image and determine which output mode applies based on the decision rule.
+2. Follow the output rules strictly.
+3. Include only content that is explicitly visible in the image.
+4. Do not infer intent, functionality, process logic, or meaning beyond what is visually or textually shown.

-> Ensure high accuracy, clarity, and completeness in your analysis, and include only the information present in the image. Avoid unnecessary statements about missing elements.
-
+## OUTPUT RULES (STRICT)
+
+- Produce output in **exactly one** of the two modes defined below.
+- Do NOT mention, label, or reference the modes in the output.
+- Do NOT combine content from both modes.
+- Do NOT explain or justify the choice of mode.
+- Do NOT add any headings, titles, or commentary beyond what the mode requires.
+
+---
+
+## MODE 1: STRUCTURED VISUAL DATA OUTPUT
+
+(Use only if the image contains enumerable data units forming a coherent dataset.)
+
+Output **only** the following fields, in list form.
+Do NOT add free-form paragraphs or additional sections.
+
+- Visual Type:
+- Title:
+- Axes / Legends / Labels:
+- Data Points:
+- Captions / Annotations:
+
+---
+
+## MODE 2: GENERAL FIGURE CONTENT
+
+(Use only if the image does NOT contain enumerable data units.)
+
+Write the content directly, starting from the first sentence.
+Do NOT add any introductory labels, titles, headings, or prefixes.
+
+Requirements:
+
+- Describe visible regions and components in a stable order (e.g., top-to-bottom, left-to-right).
+- Explicitly name interface elements or visual objects exactly as they appear (e.g., tabs, panels, buttons, icons, input fields).
+- Transcribe all visible text verbatim; do not paraphrase, summarize, or reinterpret labels.
+- Describe spatial grouping, containment, and alignment of elements.
+- Do NOT interpret intent, behavior, workflows, gameplay rules, or processes.
+- Do NOT describe the figure as a chart, diagram, process, phase, or sequence unless such words explicitly appear in the image text.
+- Avoid narrative or stylistic language unless it is a dominant and functional visual element.
+
+Use concise, information-dense sentences.
+Do not use bullet lists or structured fields in this mode.
diff --git a/rag/prompts/vision_llm_figure_describe_prompt_with_context.md b/rag/prompts/vision_llm_figure_describe_prompt_with_context.md
new file mode 100644
index 000000000..6843f7e7e
--- /dev/null
+++ b/rag/prompts/vision_llm_figure_describe_prompt_with_context.md
@@ -0,0 +1,82 @@
+## ROLE
+
+You are an expert visual data analyst.
+
+## GOAL
+
+Analyze the image and produce a textual representation strictly based on what is visible in the image.
+Surrounding context may be used only for minimal clarification or disambiguation of terms that appear in the image, not as a source of new information.
+
+## CONTEXT (ABOVE)
+
+{{ context_above }}
+
+## CONTEXT (BELOW)
+
+{{ context_below }}
+
+## DECISION RULE (CRITICAL)
+
+First, determine whether the image contains an explicit visual data representation with enumerable data units forming a coherent dataset.
+
+Enumerable data units are clearly separable, repeatable elements intended for comparison, measurement, or aggregation, such as:
+
+- rows or columns in a table
+- individual bars in a bar chart
+- identifiable data points or series in a line graph
+- labeled segments in a pie chart
+
+The mere presence of numbers, icons, UI elements, or labels does NOT qualify unless they together form such a dataset.
+
+## TASKS
+
+1. Inspect the image and determine which output mode applies based on the decision rule.
+2. Use surrounding context only to disambiguate terms that appear in the image.
+3. Follow the output rules strictly.
+4. Include only content that is explicitly visible in the image.
+5. Do not infer intent, functionality, process logic, or meaning beyond what is visually or textually shown.
+
+## OUTPUT RULES (STRICT)
+
+- Produce output in **exactly one** of the two modes defined below.
+- Do NOT mention, label, or reference the modes in the output.
+- Do NOT combine content from both modes.
+- Do NOT explain or justify the choice of mode.
+- Do NOT add any headings, titles, or commentary beyond what the mode requires.
+
+---
+
+## MODE 1: STRUCTURED VISUAL DATA OUTPUT
+
+(Use only if the image contains enumerable data units forming a coherent dataset.)
+
+Output **only** the following fields, in list form.
+Do NOT add free-form paragraphs or additional sections.
+
+- Visual Type:
+- Title:
+- Axes / Legends / Labels:
+- Data Points:
+- Captions / Annotations:
+
+---
+
+## MODE 2: GENERAL FIGURE CONTENT
+
+(Use only if the image does NOT contain enumerable data units.)
+
+Write the content directly, starting from the first sentence.
+Do NOT add any introductory labels, titles, headings, or prefixes.
+
+Requirements:
+
+- Describe visible regions and components in a stable order (e.g., top-to-bottom, left-to-right).
+- Explicitly name interface elements or visual objects exactly as they appear (e.g., tabs, panels, buttons, icons, input fields).
+- Transcribe all visible text verbatim; do not paraphrase, summarize, or reinterpret labels.
+- Describe spatial grouping, containment, and alignment of elements.
+- Do NOT interpret intent, behavior, workflows, gameplay rules, or processes.
+- Do NOT describe the figure as a chart, diagram, process, phase, or sequence unless such words explicitly appear in the image text.
+- Avoid narrative or stylistic language unless it is a dominant and functional visual element.
+
+Use concise, information-dense sentences.
+Do not use bullet lists or structured fields in this mode.