mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-30 00:32:30 +08:00
Refa: improve image table context (#12244)
### What problem does this PR solve?

Improve image table context.

Current strategy in `attach_media_context`:

- Order by position when possible: if any chunk has page/position info, sort by (page, top, left); otherwise keep the original order.
- Apply only to media chunks: images use `image_context_size`, tables use `table_context_size`.
- Primary matching: on the same page, choose a text chunk whose vertical span overlaps the media, then pick the one with the closest vertical midpoint.
- Fallback matching: if no overlap on that page, choose the nearest text chunk on the same page (page-head uses the next text; page-tail uses the previous text).
- Context extraction: inside the chosen text chunk, find a mid-sentence boundary near the text midpoint, then take `context_size` tokens split before/after (total budget).
- No multi-chunk stitching: context comes from a single text chunk to avoid mixing unrelated segments.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
@ -220,15 +220,19 @@ export function ChunkMethodDialog({
|
||||
|
||||
async function onSubmit(data: z.infer<typeof FormSchema>) {
|
||||
console.log('🚀 ~ onSubmit ~ data:', data);
|
||||
const { image_table_context_window, ...parserConfig } = data.parser_config;
|
||||
const imageTableContextWindow = Number(image_table_context_window || 0);
|
||||
const nextData = {
|
||||
...data,
|
||||
parser_config: {
|
||||
...data.parser_config,
|
||||
...parserConfig,
|
||||
image_context_size: imageTableContextWindow,
|
||||
table_context_size: imageTableContextWindow,
|
||||
// Unset children delimiter if this option is not enabled
|
||||
children_delimiter: data.parser_config.enable_children
|
||||
? data.parser_config.children_delimiter
|
||||
children_delimiter: parserConfig.enable_children
|
||||
? parserConfig.children_delimiter
|
||||
: '',
|
||||
pages: data.parser_config?.pages?.map((x: any) => [x.from, x.to]) ?? [],
|
||||
pages: parserConfig?.pages?.map((x: any) => [x.from, x.to]) ?? [],
|
||||
},
|
||||
};
|
||||
console.log('🚀 ~ onSubmit ~ nextData:', nextData);
|
||||
@ -249,6 +253,10 @@ export function ChunkMethodDialog({
|
||||
parser_config: fillDefaultParserValue({
|
||||
pages: pages.length > 0 ? pages : [{ from: 1, to: 1024 }],
|
||||
...omit(parserConfig, 'pages'),
|
||||
image_table_context_window:
|
||||
parserConfig?.image_table_context_window ??
|
||||
parserConfig?.image_context_size ??
|
||||
parserConfig?.table_context_size,
|
||||
// graphrag: {
|
||||
// use_graphrag: get(
|
||||
// parserConfig,
|
||||
|
||||
@ -44,6 +44,9 @@ export interface IParserConfig {
|
||||
raptor?: Raptor;
|
||||
graphrag?: GraphRag;
|
||||
image_context_window?: number;
|
||||
image_table_context_window?: number;
|
||||
image_context_size?: number;
|
||||
table_context_size?: number;
|
||||
mineru_parse_method?: 'auto' | 'txt' | 'ocr';
|
||||
mineru_formula_enable?: boolean;
|
||||
mineru_table_enable?: boolean;
|
||||
|
||||
@ -8,6 +8,9 @@ export interface IChangeParserConfigRequestBody {
|
||||
auto_questions?: number;
|
||||
html4excel?: boolean;
|
||||
toc_extraction?: boolean;
|
||||
image_table_context_window?: number;
|
||||
image_context_size?: number;
|
||||
table_context_size?: number;
|
||||
}
|
||||
|
||||
export interface IChangeParserRequestBody {
|
||||
|
||||
Reference in New Issue
Block a user