Refa: improve image table context (#12244)

### What problem does this PR solve?

Improve image table context.

Current strategy in attach_media_context:

- Order by position when possible: if any chunk has page/position info,
sort by (page, top, left); otherwise, keep the original order.
- Apply only to media chunks: images use image_context_size, tables use
table_context_size.
- Primary matching: on the same page, choose a text chunk whose vertical
span overlaps the media, then pick the one with the closest vertical
midpoint.
- Fallback matching: if no text chunk overlaps the media on that page,
choose the nearest text chunk on the same page (media at the head of the
page uses the next text chunk; media at the tail uses the previous one).
- Context extraction: inside the chosen text chunk, find a sentence
boundary near the text midpoint, then take context_size tokens in total
(the full budget), split between the text before and after that boundary.
- No multi-chunk stitching: context comes from a single text chunk to
avoid mixing unrelated segments.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
Yongteng Lei
2025-12-26 17:55:32 +08:00
committed by GitHub
parent 9de3ecc4a8
commit 51bc41b2e8
4 changed files with 165 additions and 43 deletions

View File

@ -220,15 +220,19 @@ export function ChunkMethodDialog({
async function onSubmit(data: z.infer<typeof FormSchema>) {
console.log('🚀 ~ onSubmit ~ data:', data);
const { image_table_context_window, ...parserConfig } = data.parser_config;
const imageTableContextWindow = Number(image_table_context_window || 0);
const nextData = {
...data,
parser_config: {
...data.parser_config,
...parserConfig,
image_context_size: imageTableContextWindow,
table_context_size: imageTableContextWindow,
// Unset children delimiter if this option is not enabled
children_delimiter: data.parser_config.enable_children
? data.parser_config.children_delimiter
children_delimiter: parserConfig.enable_children
? parserConfig.children_delimiter
: '',
pages: data.parser_config?.pages?.map((x: any) => [x.from, x.to]) ?? [],
pages: parserConfig?.pages?.map((x: any) => [x.from, x.to]) ?? [],
},
};
console.log('🚀 ~ onSubmit ~ nextData:', nextData);
@ -249,6 +253,10 @@ export function ChunkMethodDialog({
parser_config: fillDefaultParserValue({
pages: pages.length > 0 ? pages : [{ from: 1, to: 1024 }],
...omit(parserConfig, 'pages'),
image_table_context_window:
parserConfig?.image_table_context_window ??
parserConfig?.image_context_size ??
parserConfig?.table_context_size,
// graphrag: {
// use_graphrag: get(
// parserConfig,

View File

@ -44,6 +44,9 @@ export interface IParserConfig {
raptor?: Raptor;
graphrag?: GraphRag;
image_context_window?: number;
image_table_context_window?: number;
image_context_size?: number;
table_context_size?: number;
mineru_parse_method?: 'auto' | 'txt' | 'ocr';
mineru_formula_enable?: boolean;
mineru_table_enable?: boolean;

View File

@ -8,6 +8,9 @@ export interface IChangeParserConfigRequestBody {
auto_questions?: number;
html4excel?: boolean;
toc_extraction?: boolean;
image_table_context_window?: number;
image_context_size?: number;
table_context_size?: number;
}
export interface IChangeParserRequestBody {