Feat: add image table context to pipeline splitter (#12167)

### What problem does this PR solve?

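Add image and table context to the pipeline splitter. The `table_context_size` and `image_context_size` options move from the parser component to the splitter component, and the splitter form gains a single image/table context window slider that sets both values.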

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-12-24 16:58:14 +08:00
committed by GitHub
parent 44671ea413
commit 9b52ba8061
6 changed files with 31 additions and 44 deletions

View File

@ -37,7 +37,6 @@ from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream from rag.flow.parser.schema import ParserFromUpstream
from rag.llm.cv_model import Base as VLM from rag.llm.cv_model import Base as VLM
from rag.nlp import attach_media_context
from rag.utils.base64_image import image2id from rag.utils.base64_image import image2id
@ -86,8 +85,6 @@ class ParserParam(ProcessParamBase):
"pdf", "pdf",
], ],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"spreadsheet": { "spreadsheet": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser "parse_method": "deepdoc", # deepdoc/tcadp_parser
@ -97,8 +94,6 @@ class ParserParam(ProcessParamBase):
"xlsx", "xlsx",
"csv", "csv",
], ],
"table_context_size": 0,
"image_context_size": 0,
}, },
"word": { "word": {
"suffix": [ "suffix": [
@ -106,14 +101,10 @@ class ParserParam(ProcessParamBase):
"docx", "docx",
], ],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"text&markdown": { "text&markdown": {
"suffix": ["md", "markdown", "mdx", "txt"], "suffix": ["md", "markdown", "mdx", "txt"],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"slides": { "slides": {
"parse_method": "deepdoc", # deepdoc/tcadp_parser "parse_method": "deepdoc", # deepdoc/tcadp_parser
@ -122,8 +113,6 @@ class ParserParam(ProcessParamBase):
"ppt", "ppt",
], ],
"output_format": "json", "output_format": "json",
"table_context_size": 0,
"image_context_size": 0,
}, },
"image": { "image": {
"parse_method": "ocr", "parse_method": "ocr",
@ -357,11 +346,6 @@ class Parser(ProcessBase):
elif layout == "table": elif layout == "table":
b["doc_type_kwd"] = "table" b["doc_type_kwd"] = "table"
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
bboxes = attach_media_context(bboxes, table_ctx, image_ctx)
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
self.set_output("json", bboxes) self.set_output("json", bboxes)
if conf.get("output_format") == "markdown": if conf.get("output_format") == "markdown":
@ -436,11 +420,6 @@ class Parser(ProcessBase):
if table: if table:
result.append({"text": table, "doc_type_kwd": "table"}) result.append({"text": table, "doc_type_kwd": "table"})
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result) self.set_output("json", result)
elif output_format == "markdown": elif output_format == "markdown":
@ -476,11 +455,6 @@ class Parser(ProcessBase):
sections = [{"text": section[0], "image": section[1]} for section in sections if section] sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls]) sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
sections = attach_media_context(sections, table_ctx, image_ctx)
self.set_output("json", sections) self.set_output("json", sections)
elif conf.get("output_format") == "markdown": elif conf.get("output_format") == "markdown":
markdown_text = docx_parser.to_markdown(name, binary=blob) markdown_text = docx_parser.to_markdown(name, binary=blob)
@ -536,11 +510,6 @@ class Parser(ProcessBase):
if table: if table:
result.append({"text": table, "doc_type_kwd": "table"}) result.append({"text": table, "doc_type_kwd": "table"})
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
result = attach_media_context(result, table_ctx, image_ctx)
self.set_output("json", result) self.set_output("json", result)
else: else:
# Default DeepDOC parser (supports .pptx format) # Default DeepDOC parser (supports .pptx format)
@ -554,10 +523,6 @@ class Parser(ProcessBase):
# json # json
assert conf.get("output_format") == "json", "have to be json for ppt" assert conf.get("output_format") == "json", "have to be json for ppt"
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
sections = attach_media_context(sections, table_ctx, image_ctx)
self.set_output("json", sections) self.set_output("json", sections)
def _markdown(self, name, blob): def _markdown(self, name, blob):
@ -597,11 +562,6 @@ class Parser(ProcessBase):
json_results.append(json_result) json_results.append(json_result)
table_ctx = conf.get("table_context_size", 0) or 0
image_ctx = conf.get("image_context_size", 0) or 0
if table_ctx or image_ctx:
json_results = attach_media_context(json_results, table_ctx, image_ctx)
self.set_output("json", json_results) self.set_output("json", json_results)
else: else:
self.set_output("text", "\n".join([section_text for section_text, _ in sections])) self.set_output("text", "\n".join([section_text for section_text, _ in sections]))

View File

@ -23,7 +23,7 @@ from rag.utils.base64_image import id2image, image2id
from deepdoc.parser.pdf_parser import RAGFlowPdfParser from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from rag.flow.base import ProcessBase, ProcessParamBase from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.splitter.schema import SplitterFromUpstream from rag.flow.splitter.schema import SplitterFromUpstream
from rag.nlp import naive_merge, naive_merge_with_images from rag.nlp import attach_media_context, naive_merge, naive_merge_with_images
from common import settings from common import settings
@ -34,11 +34,15 @@ class SplitterParam(ProcessParamBase):
self.delimiters = ["\n"] self.delimiters = ["\n"]
self.overlapped_percent = 0 self.overlapped_percent = 0
self.children_delimiters = [] self.children_delimiters = []
self.table_context_size = 0
self.image_context_size = 0
def check(self): def check(self):
self.check_empty(self.delimiters, "Delimiters.") self.check_empty(self.delimiters, "Delimiters.")
self.check_positive_integer(self.chunk_token_size, "Chunk token size.") self.check_positive_integer(self.chunk_token_size, "Chunk token size.")
self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)") self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)")
self.check_nonnegative_number(self.table_context_size, "Table context size.")
self.check_nonnegative_number(self.image_context_size, "Image context size.")
def get_input_form(self) -> dict[str, dict]: def get_input_form(self) -> dict[str, dict]:
return {} return {}
@ -103,8 +107,18 @@ class Splitter(ProcessBase):
return return
# json # json
json_result = from_upstream.json_result or []
if self._param.table_context_size or self._param.image_context_size:
for ck in json_result:
if "image" not in ck and ck.get("img_id") and not (isinstance(ck.get("text"), str) and ck.get("text").strip()):
ck["image"] = True
attach_media_context(json_result, self._param.table_context_size, self._param.image_context_size)
for ck in json_result:
if ck.get("image") is True:
del ck["image"]
sections, section_images = [], [] sections, section_images = [], []
for o in from_upstream.json_result or []: for o in json_result:
sections.append((o.get("text", ""), o.get("position_tag", ""))) sections.append((o.get("text", ""), o.get("position_tag", "")))
section_images.append(id2image(o.get("img_id"), partial(settings.STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id))) section_images.append(id2image(o.get("img_id"), partial(settings.STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id)))

View File

@ -82,7 +82,7 @@ def id2image(image_id:str|None, storage_get_func: partial):
return return
bkt, nm = image_id.split("-") bkt, nm = image_id.split("-")
try: try:
blob = storage_get_func(bucket=bkt, filename=nm) blob = storage_get_func(bucket=bkt, fnm=nm)
if not blob: if not blob:
return return
return Image.open(BytesIO(blob)) return Image.open(BytesIO(blob))

View File

@ -206,6 +206,7 @@ export const initialSplitterValues = {
chunk_token_size: 512, chunk_token_size: 512,
overlapped_percent: 0, overlapped_percent: 0,
delimiters: [{ value: '\n' }], delimiters: [{ value: '\n' }],
image_table_context_window: 0,
}; };
export enum Hierarchy { export enum Hierarchy {

View File

@ -22,6 +22,7 @@ const outputList = buildOutputList(initialSplitterValues.outputs);
export const FormSchema = z.object({ export const FormSchema = z.object({
chunk_token_size: z.number(), chunk_token_size: z.number(),
image_table_context_window: z.number(),
delimiters: z.array( delimiters: z.array(
z.object({ z.object({
value: z.string().optional(), value: z.string().optional(),
@ -74,6 +75,13 @@ const SplitterForm = ({ node }: INextOperatorForm) => {
min={0} min={0}
label={t('flow.overlappedPercent')} label={t('flow.overlappedPercent')}
></SliderInputFormField> ></SliderInputFormField>
<SliderInputFormField
name="image_table_context_window"
max={256}
min={0}
label={t('knowledgeConfiguration.imageTableContextWindow')}
tooltip={t('knowledgeConfiguration.imageTableContextWindowTip')}
></SliderInputFormField>
<section> <section>
<span className="mb-2 inline-block">{t('flow.delimiters')}</span> <span className="mb-2 inline-block">{t('flow.delimiters')}</span>
<div className="space-y-4"> <div className="space-y-4">

View File

@ -289,10 +289,14 @@ function transformParserParams(params: ParserFormSchemaType) {
} }
function transformSplitterParams(params: SplitterFormSchemaType) { function transformSplitterParams(params: SplitterFormSchemaType) {
const { image_table_context_window, ...rest } = params;
const imageTableContextWindow = Number(image_table_context_window || 0);
return { return {
...params, ...rest,
overlapped_percent: Number(params.overlapped_percent) / 100, overlapped_percent: Number(params.overlapped_percent) / 100,
delimiters: transformObjectArrayToPureArray(params.delimiters, 'value'), delimiters: transformObjectArrayToPureArray(params.delimiters, 'value'),
table_context_size: imageTableContextWindow,
image_context_size: imageTableContextWindow,
// Unset children delimiters if this option is not enabled // Unset children delimiters if this option is not enabled
children_delimiters: params.enable_children children_delimiters: params.enable_children