From 9b52ba8061fe772fbf7868186649f5d2bfdc1d5f Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Wed, 24 Dec 2025 16:58:14 +0800 Subject: [PATCH] Feat: add image table context to pipeline splitter (#12167) ### What problem does this PR solve? Add image table context to pipeline splitter. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- rag/flow/parser/parser.py | 40 ------------------- rag/flow/splitter/splitter.py | 18 ++++++++- rag/utils/base64_image.py | 2 +- web/src/pages/agent/constant/pipeline.tsx | 1 + .../pages/agent/form/splitter-form/index.tsx | 8 ++++ web/src/pages/agent/utils.ts | 6 ++- 6 files changed, 31 insertions(+), 44 deletions(-) diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index b4398f626..1c7154424 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -37,7 +37,6 @@ from rag.app.naive import Docx from rag.flow.base import ProcessBase, ProcessParamBase from rag.flow.parser.schema import ParserFromUpstream from rag.llm.cv_model import Base as VLM -from rag.nlp import attach_media_context from rag.utils.base64_image import image2id @@ -86,8 +85,6 @@ class ParserParam(ProcessParamBase): "pdf", ], "output_format": "json", - "table_context_size": 0, - "image_context_size": 0, }, "spreadsheet": { "parse_method": "deepdoc", # deepdoc/tcadp_parser @@ -97,8 +94,6 @@ class ParserParam(ProcessParamBase): "xlsx", "csv", ], - "table_context_size": 0, - "image_context_size": 0, }, "word": { "suffix": [ @@ -106,14 +101,10 @@ class ParserParam(ProcessParamBase): "docx", ], "output_format": "json", - "table_context_size": 0, - "image_context_size": 0, }, "text&markdown": { "suffix": ["md", "markdown", "mdx", "txt"], "output_format": "json", - "table_context_size": 0, - "image_context_size": 0, }, "slides": { "parse_method": "deepdoc", # deepdoc/tcadp_parser @@ -122,8 +113,6 @@ class ParserParam(ProcessParamBase): "ppt", ], "output_format": "json", - "table_context_size": 0, - "image_context_size": 0, }, "image": { "parse_method": "ocr", @@ -357,11 +346,6 @@ class Parser(ProcessBase): elif layout == "table": b["doc_type_kwd"] = "table" - table_ctx = conf.get("table_context_size", 0) or 0 - image_ctx = conf.get("image_context_size", 0) or 0 - if table_ctx or image_ctx: - bboxes = attach_media_context(bboxes, table_ctx, image_ctx) - if conf.get("output_format") == "json": self.set_output("json", bboxes) if conf.get("output_format") == "markdown": @@ -436,11 +420,6 @@ class Parser(ProcessBase): if table: result.append({"text": table, "doc_type_kwd": "table"}) - table_ctx = conf.get("table_context_size", 0) or 0 - image_ctx = conf.get("image_context_size", 0) or 0 - if table_ctx or image_ctx: - result = attach_media_context(result, table_ctx, image_ctx) - self.set_output("json", result) elif output_format == "markdown": @@ -476,11 +455,6 @@ class Parser(ProcessBase): sections = [{"text": section[0], "image": section[1]} for section in sections if section] sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls]) - table_ctx = conf.get("table_context_size", 0) or 0 - image_ctx = conf.get("image_context_size", 0) or 0 - if table_ctx or image_ctx: - sections = attach_media_context(sections, table_ctx, image_ctx) - self.set_output("json", sections) elif conf.get("output_format") == "markdown": markdown_text = docx_parser.to_markdown(name, binary=blob) @@ -536,11 +510,6 @@ class Parser(ProcessBase): if table: result.append({"text": table, "doc_type_kwd": "table"}) - table_ctx = conf.get("table_context_size", 0) or 0 - image_ctx = conf.get("image_context_size", 0) or 0 - if table_ctx or image_ctx: - result = attach_media_context(result, table_ctx, image_ctx) - self.set_output("json", result) else: # Default DeepDOC parser (supports .pptx format) @@ -554,10 +523,6 @@ class Parser(ProcessBase): # json assert conf.get("output_format") == "json", "have to be json for ppt" if conf.get("output_format") == "json": - table_ctx = conf.get("table_context_size", 0) or 0 - image_ctx = conf.get("image_context_size", 0) or 0 - if table_ctx or image_ctx: - sections = attach_media_context(sections, table_ctx, image_ctx) self.set_output("json", sections) def _markdown(self, name, blob): @@ -597,11 +562,6 @@ class Parser(ProcessBase): json_results.append(json_result) - table_ctx = conf.get("table_context_size", 0) or 0 - image_ctx = conf.get("image_context_size", 0) or 0 - if table_ctx or image_ctx: - json_results = attach_media_context(json_results, table_ctx, image_ctx) - self.set_output("json", json_results) else: self.set_output("text", "\n".join([section_text for section_text, _ in sections])) diff --git a/rag/flow/splitter/splitter.py b/rag/flow/splitter/splitter.py index 45abb547a..0aec023d1 100644 --- a/rag/flow/splitter/splitter.py +++ b/rag/flow/splitter/splitter.py @@ -23,7 +23,7 @@ from rag.utils.base64_image import id2image, image2id from deepdoc.parser.pdf_parser import RAGFlowPdfParser from rag.flow.base import ProcessBase, ProcessParamBase from rag.flow.splitter.schema import SplitterFromUpstream -from rag.nlp import naive_merge, naive_merge_with_images +from rag.nlp import attach_media_context, naive_merge, naive_merge_with_images from common import settings @@ -34,11 +34,15 @@ class SplitterParam(ProcessParamBase): self.delimiters = ["\n"] self.overlapped_percent = 0 self.children_delimiters = [] + self.table_context_size = 0 + self.image_context_size = 0 def check(self): self.check_empty(self.delimiters, "Delimiters.") self.check_positive_integer(self.chunk_token_size, "Chunk token size.") self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)") + self.check_nonnegative_number(self.table_context_size, "Table context size.") + self.check_nonnegative_number(self.image_context_size, "Image context size.") def get_input_form(self) -> dict[str, dict]: return {} @@ -103,8 +107,18 @@ class Splitter(ProcessBase): return # json + json_result = from_upstream.json_result or [] + if self._param.table_context_size or self._param.image_context_size: + for ck in json_result: + if "image" not in ck and ck.get("img_id") and not (isinstance(ck.get("text"), str) and ck.get("text").strip()): + ck["image"] = True + attach_media_context(json_result, self._param.table_context_size, self._param.image_context_size) + for ck in json_result: + if ck.get("image") is True: + del ck["image"] + sections, section_images = [], [] - for o in from_upstream.json_result or []: + for o in json_result: sections.append((o.get("text", ""), o.get("position_tag", ""))) section_images.append(id2image(o.get("img_id"), partial(settings.STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id))) diff --git a/rag/utils/base64_image.py b/rag/utils/base64_image.py index 66c90dfa5..935979710 100644 --- a/rag/utils/base64_image.py +++ b/rag/utils/base64_image.py @@ -82,7 +82,7 @@ def id2image(image_id:str|None, storage_get_func: partial): return bkt, nm = image_id.split("-") try: - blob = storage_get_func(bucket=bkt, filename=nm) + blob = storage_get_func(bucket=bkt, fnm=nm) if not blob: return return Image.open(BytesIO(blob)) diff --git a/web/src/pages/agent/constant/pipeline.tsx b/web/src/pages/agent/constant/pipeline.tsx index c799cf959..d2ba07502 100644 --- a/web/src/pages/agent/constant/pipeline.tsx +++ b/web/src/pages/agent/constant/pipeline.tsx @@ -206,6 +206,7 @@ export const initialSplitterValues = { chunk_token_size: 512, overlapped_percent: 0, delimiters: [{ value: '\n' }], + image_table_context_window: 0, }; export enum Hierarchy { diff --git a/web/src/pages/agent/form/splitter-form/index.tsx b/web/src/pages/agent/form/splitter-form/index.tsx index c6cb8962a..f4dcb7418 100644 --- a/web/src/pages/agent/form/splitter-form/index.tsx +++ b/web/src/pages/agent/form/splitter-form/index.tsx @@ -22,6 +22,7 @@ const outputList = buildOutputList(initialSplitterValues.outputs); export const FormSchema = z.object({ chunk_token_size: z.number(), + image_table_context_window: z.number(), delimiters: z.array( z.object({ value: z.string().optional(), @@ -74,6 +75,13 @@ const SplitterForm = ({ node }: INextOperatorForm) => { min={0} label={t('flow.overlappedPercent')} > +
{t('flow.delimiters')}
diff --git a/web/src/pages/agent/utils.ts b/web/src/pages/agent/utils.ts index cea553a9d..e54f0864b 100644 --- a/web/src/pages/agent/utils.ts +++ b/web/src/pages/agent/utils.ts @@ -289,10 +289,14 @@ function transformParserParams(params: ParserFormSchemaType) { } function transformSplitterParams(params: SplitterFormSchemaType) { + const { image_table_context_window, ...rest } = params; + const imageTableContextWindow = Number(image_table_context_window || 0); return { - ...params, + ...rest, overlapped_percent: Number(params.overlapped_percent) / 100, delimiters: transformObjectArrayToPureArray(params.delimiters, 'value'), + table_context_size: imageTableContextWindow, + image_context_size: imageTableContextWindow, // Unset children delimiters if this option is not enabled children_delimiters: params.enable_children