Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-29 16:05:35 +08:00)
Feat: add image table context to pipeline splitter (#12167)
### What problem does this PR solve?

Add image table context to pipeline splitter.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -37,7 +37,6 @@ from rag.app.naive import Docx
|
|||||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||||
from rag.flow.parser.schema import ParserFromUpstream
|
from rag.flow.parser.schema import ParserFromUpstream
|
||||||
from rag.llm.cv_model import Base as VLM
|
from rag.llm.cv_model import Base as VLM
|
||||||
from rag.nlp import attach_media_context
|
|
||||||
from rag.utils.base64_image import image2id
|
from rag.utils.base64_image import image2id
|
||||||
|
|
||||||
|
|
||||||
@ -86,8 +85,6 @@ class ParserParam(ProcessParamBase):
|
|||||||
"pdf",
|
"pdf",
|
||||||
],
|
],
|
||||||
"output_format": "json",
|
"output_format": "json",
|
||||||
"table_context_size": 0,
|
|
||||||
"image_context_size": 0,
|
|
||||||
},
|
},
|
||||||
"spreadsheet": {
|
"spreadsheet": {
|
||||||
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
||||||
@ -97,8 +94,6 @@ class ParserParam(ProcessParamBase):
|
|||||||
"xlsx",
|
"xlsx",
|
||||||
"csv",
|
"csv",
|
||||||
],
|
],
|
||||||
"table_context_size": 0,
|
|
||||||
"image_context_size": 0,
|
|
||||||
},
|
},
|
||||||
"word": {
|
"word": {
|
||||||
"suffix": [
|
"suffix": [
|
||||||
@ -106,14 +101,10 @@ class ParserParam(ProcessParamBase):
|
|||||||
"docx",
|
"docx",
|
||||||
],
|
],
|
||||||
"output_format": "json",
|
"output_format": "json",
|
||||||
"table_context_size": 0,
|
|
||||||
"image_context_size": 0,
|
|
||||||
},
|
},
|
||||||
"text&markdown": {
|
"text&markdown": {
|
||||||
"suffix": ["md", "markdown", "mdx", "txt"],
|
"suffix": ["md", "markdown", "mdx", "txt"],
|
||||||
"output_format": "json",
|
"output_format": "json",
|
||||||
"table_context_size": 0,
|
|
||||||
"image_context_size": 0,
|
|
||||||
},
|
},
|
||||||
"slides": {
|
"slides": {
|
||||||
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
"parse_method": "deepdoc", # deepdoc/tcadp_parser
|
||||||
@ -122,8 +113,6 @@ class ParserParam(ProcessParamBase):
|
|||||||
"ppt",
|
"ppt",
|
||||||
],
|
],
|
||||||
"output_format": "json",
|
"output_format": "json",
|
||||||
"table_context_size": 0,
|
|
||||||
"image_context_size": 0,
|
|
||||||
},
|
},
|
||||||
"image": {
|
"image": {
|
||||||
"parse_method": "ocr",
|
"parse_method": "ocr",
|
||||||
@ -357,11 +346,6 @@ class Parser(ProcessBase):
|
|||||||
elif layout == "table":
|
elif layout == "table":
|
||||||
b["doc_type_kwd"] = "table"
|
b["doc_type_kwd"] = "table"
|
||||||
|
|
||||||
table_ctx = conf.get("table_context_size", 0) or 0
|
|
||||||
image_ctx = conf.get("image_context_size", 0) or 0
|
|
||||||
if table_ctx or image_ctx:
|
|
||||||
bboxes = attach_media_context(bboxes, table_ctx, image_ctx)
|
|
||||||
|
|
||||||
if conf.get("output_format") == "json":
|
if conf.get("output_format") == "json":
|
||||||
self.set_output("json", bboxes)
|
self.set_output("json", bboxes)
|
||||||
if conf.get("output_format") == "markdown":
|
if conf.get("output_format") == "markdown":
|
||||||
@ -436,11 +420,6 @@ class Parser(ProcessBase):
|
|||||||
if table:
|
if table:
|
||||||
result.append({"text": table, "doc_type_kwd": "table"})
|
result.append({"text": table, "doc_type_kwd": "table"})
|
||||||
|
|
||||||
table_ctx = conf.get("table_context_size", 0) or 0
|
|
||||||
image_ctx = conf.get("image_context_size", 0) or 0
|
|
||||||
if table_ctx or image_ctx:
|
|
||||||
result = attach_media_context(result, table_ctx, image_ctx)
|
|
||||||
|
|
||||||
self.set_output("json", result)
|
self.set_output("json", result)
|
||||||
|
|
||||||
elif output_format == "markdown":
|
elif output_format == "markdown":
|
||||||
@ -476,11 +455,6 @@ class Parser(ProcessBase):
|
|||||||
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
|
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
|
||||||
sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
|
sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
|
||||||
|
|
||||||
table_ctx = conf.get("table_context_size", 0) or 0
|
|
||||||
image_ctx = conf.get("image_context_size", 0) or 0
|
|
||||||
if table_ctx or image_ctx:
|
|
||||||
sections = attach_media_context(sections, table_ctx, image_ctx)
|
|
||||||
|
|
||||||
self.set_output("json", sections)
|
self.set_output("json", sections)
|
||||||
elif conf.get("output_format") == "markdown":
|
elif conf.get("output_format") == "markdown":
|
||||||
markdown_text = docx_parser.to_markdown(name, binary=blob)
|
markdown_text = docx_parser.to_markdown(name, binary=blob)
|
||||||
@ -536,11 +510,6 @@ class Parser(ProcessBase):
|
|||||||
if table:
|
if table:
|
||||||
result.append({"text": table, "doc_type_kwd": "table"})
|
result.append({"text": table, "doc_type_kwd": "table"})
|
||||||
|
|
||||||
table_ctx = conf.get("table_context_size", 0) or 0
|
|
||||||
image_ctx = conf.get("image_context_size", 0) or 0
|
|
||||||
if table_ctx or image_ctx:
|
|
||||||
result = attach_media_context(result, table_ctx, image_ctx)
|
|
||||||
|
|
||||||
self.set_output("json", result)
|
self.set_output("json", result)
|
||||||
else:
|
else:
|
||||||
# Default DeepDOC parser (supports .pptx format)
|
# Default DeepDOC parser (supports .pptx format)
|
||||||
@ -554,10 +523,6 @@ class Parser(ProcessBase):
|
|||||||
# json
|
# json
|
||||||
assert conf.get("output_format") == "json", "have to be json for ppt"
|
assert conf.get("output_format") == "json", "have to be json for ppt"
|
||||||
if conf.get("output_format") == "json":
|
if conf.get("output_format") == "json":
|
||||||
table_ctx = conf.get("table_context_size", 0) or 0
|
|
||||||
image_ctx = conf.get("image_context_size", 0) or 0
|
|
||||||
if table_ctx or image_ctx:
|
|
||||||
sections = attach_media_context(sections, table_ctx, image_ctx)
|
|
||||||
self.set_output("json", sections)
|
self.set_output("json", sections)
|
||||||
|
|
||||||
def _markdown(self, name, blob):
|
def _markdown(self, name, blob):
|
||||||
@ -597,11 +562,6 @@ class Parser(ProcessBase):
|
|||||||
|
|
||||||
json_results.append(json_result)
|
json_results.append(json_result)
|
||||||
|
|
||||||
table_ctx = conf.get("table_context_size", 0) or 0
|
|
||||||
image_ctx = conf.get("image_context_size", 0) or 0
|
|
||||||
if table_ctx or image_ctx:
|
|
||||||
json_results = attach_media_context(json_results, table_ctx, image_ctx)
|
|
||||||
|
|
||||||
self.set_output("json", json_results)
|
self.set_output("json", json_results)
|
||||||
else:
|
else:
|
||||||
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
|
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
|
||||||
|
|||||||
@ -23,7 +23,7 @@ from rag.utils.base64_image import id2image, image2id
|
|||||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||||
from rag.flow.splitter.schema import SplitterFromUpstream
|
from rag.flow.splitter.schema import SplitterFromUpstream
|
||||||
from rag.nlp import naive_merge, naive_merge_with_images
|
from rag.nlp import attach_media_context, naive_merge, naive_merge_with_images
|
||||||
from common import settings
|
from common import settings
|
||||||
|
|
||||||
|
|
||||||
@ -34,11 +34,15 @@ class SplitterParam(ProcessParamBase):
|
|||||||
self.delimiters = ["\n"]
|
self.delimiters = ["\n"]
|
||||||
self.overlapped_percent = 0
|
self.overlapped_percent = 0
|
||||||
self.children_delimiters = []
|
self.children_delimiters = []
|
||||||
|
self.table_context_size = 0
|
||||||
|
self.image_context_size = 0
|
||||||
|
|
||||||
def check(self):
|
def check(self):
|
||||||
self.check_empty(self.delimiters, "Delimiters.")
|
self.check_empty(self.delimiters, "Delimiters.")
|
||||||
self.check_positive_integer(self.chunk_token_size, "Chunk token size.")
|
self.check_positive_integer(self.chunk_token_size, "Chunk token size.")
|
||||||
self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)")
|
self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)")
|
||||||
|
self.check_nonnegative_number(self.table_context_size, "Table context size.")
|
||||||
|
self.check_nonnegative_number(self.image_context_size, "Image context size.")
|
||||||
|
|
||||||
def get_input_form(self) -> dict[str, dict]:
|
def get_input_form(self) -> dict[str, dict]:
|
||||||
return {}
|
return {}
|
||||||
@ -103,8 +107,18 @@ class Splitter(ProcessBase):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# json
|
# json
|
||||||
|
json_result = from_upstream.json_result or []
|
||||||
|
if self._param.table_context_size or self._param.image_context_size:
|
||||||
|
for ck in json_result:
|
||||||
|
if "image" not in ck and ck.get("img_id") and not (isinstance(ck.get("text"), str) and ck.get("text").strip()):
|
||||||
|
ck["image"] = True
|
||||||
|
attach_media_context(json_result, self._param.table_context_size, self._param.image_context_size)
|
||||||
|
for ck in json_result:
|
||||||
|
if ck.get("image") is True:
|
||||||
|
del ck["image"]
|
||||||
|
|
||||||
sections, section_images = [], []
|
sections, section_images = [], []
|
||||||
for o in from_upstream.json_result or []:
|
for o in json_result:
|
||||||
sections.append((o.get("text", ""), o.get("position_tag", "")))
|
sections.append((o.get("text", ""), o.get("position_tag", "")))
|
||||||
section_images.append(id2image(o.get("img_id"), partial(settings.STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id)))
|
section_images.append(id2image(o.get("img_id"), partial(settings.STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id)))
|
||||||
|
|
||||||
|
|||||||
@ -82,7 +82,7 @@ def id2image(image_id:str|None, storage_get_func: partial):
|
|||||||
return
|
return
|
||||||
bkt, nm = image_id.split("-")
|
bkt, nm = image_id.split("-")
|
||||||
try:
|
try:
|
||||||
blob = storage_get_func(bucket=bkt, filename=nm)
|
blob = storage_get_func(bucket=bkt, fnm=nm)
|
||||||
if not blob:
|
if not blob:
|
||||||
return
|
return
|
||||||
return Image.open(BytesIO(blob))
|
return Image.open(BytesIO(blob))
|
||||||
|
|||||||
@ -206,6 +206,7 @@ export const initialSplitterValues = {
|
|||||||
chunk_token_size: 512,
|
chunk_token_size: 512,
|
||||||
overlapped_percent: 0,
|
overlapped_percent: 0,
|
||||||
delimiters: [{ value: '\n' }],
|
delimiters: [{ value: '\n' }],
|
||||||
|
image_table_context_window: 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
export enum Hierarchy {
|
export enum Hierarchy {
|
||||||
|
|||||||
@ -22,6 +22,7 @@ const outputList = buildOutputList(initialSplitterValues.outputs);
|
|||||||
|
|
||||||
export const FormSchema = z.object({
|
export const FormSchema = z.object({
|
||||||
chunk_token_size: z.number(),
|
chunk_token_size: z.number(),
|
||||||
|
image_table_context_window: z.number(),
|
||||||
delimiters: z.array(
|
delimiters: z.array(
|
||||||
z.object({
|
z.object({
|
||||||
value: z.string().optional(),
|
value: z.string().optional(),
|
||||||
@ -74,6 +75,13 @@ const SplitterForm = ({ node }: INextOperatorForm) => {
|
|||||||
min={0}
|
min={0}
|
||||||
label={t('flow.overlappedPercent')}
|
label={t('flow.overlappedPercent')}
|
||||||
></SliderInputFormField>
|
></SliderInputFormField>
|
||||||
|
<SliderInputFormField
|
||||||
|
name="image_table_context_window"
|
||||||
|
max={256}
|
||||||
|
min={0}
|
||||||
|
label={t('knowledgeConfiguration.imageTableContextWindow')}
|
||||||
|
tooltip={t('knowledgeConfiguration.imageTableContextWindowTip')}
|
||||||
|
></SliderInputFormField>
|
||||||
<section>
|
<section>
|
||||||
<span className="mb-2 inline-block">{t('flow.delimiters')}</span>
|
<span className="mb-2 inline-block">{t('flow.delimiters')}</span>
|
||||||
<div className="space-y-4">
|
<div className="space-y-4">
|
||||||
|
|||||||
@ -289,10 +289,14 @@ function transformParserParams(params: ParserFormSchemaType) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function transformSplitterParams(params: SplitterFormSchemaType) {
|
function transformSplitterParams(params: SplitterFormSchemaType) {
|
||||||
|
const { image_table_context_window, ...rest } = params;
|
||||||
|
const imageTableContextWindow = Number(image_table_context_window || 0);
|
||||||
return {
|
return {
|
||||||
...params,
|
...rest,
|
||||||
overlapped_percent: Number(params.overlapped_percent) / 100,
|
overlapped_percent: Number(params.overlapped_percent) / 100,
|
||||||
delimiters: transformObjectArrayToPureArray(params.delimiters, 'value'),
|
delimiters: transformObjectArrayToPureArray(params.delimiters, 'value'),
|
||||||
|
table_context_size: imageTableContextWindow,
|
||||||
|
image_context_size: imageTableContextWindow,
|
||||||
|
|
||||||
// Unset children delimiters if this option is not enabled
|
// Unset children delimiters if this option is not enabled
|
||||||
children_delimiters: params.enable_children
|
children_delimiters: params.enable_children
|
||||||
|
|||||||
Reference in New Issue
Block a user