Refine: image/table context. (#12336)

### What problem does this PR solve? #12303 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
2025-12-31 17:15:32 +08:00 · 2025-12-30 20:24:27 +08:00
parent 348265afc1
commit 52f91c2388
5 changed files with 116 additions and 11 deletions
--- a/rag/nlp/init.py
+++ b/rag/nlp/init.py
@ -16,7 +16,7 @@

 import logging
 import random
-from collections import Counter
+from collections import Counter, defaultdict

 from common.token_utils import num_tokens_from_string
 import re
@ -667,6 +667,94 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
    return chunks


+def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0):
+    from deepdoc.parser import PdfParser
+    if table_context_size <=0:
+        return tabls
+
+    page_bucket = defaultdict(list)
+    for i, (txt, poss) in enumerate(sections):
+        poss = PdfParser.extract_positions(poss)
+        for page, left, right, top, bottom in poss:
+            page = page[0]
+            page_bucket[page].append(((left, top, right, bottom), txt))
+
+    def upper_context(page, i):
+        txt = ""
+        if page not in page_bucket:
+            i = -1
+        while num_tokens_from_string(txt) < table_context_size:
+            if i < 0:
+                page -= 1
+                if page < 0 or page not in page_bucket:
+                    break
+                i = len(page_bucket[page]) -1
+            blks = page_bucket[page]
+            (_, _, _, _), cnt = blks[i]
+            txts = re.split(r"([。!?？；！\n]|\. )", cnt, flags=re.DOTALL)[::-1]
+            for j in range(0, len(txts), 2):
+                txt = (txts[j+1] if j+1<len(txts) else "") + txts[j] + txt
+                if num_tokens_from_string(txt) > table_context_size:
+                    break
+            i -= 1
+        return txt
+
+    def lower_context(page, i):
+        txt = ""
+        if page not in page_bucket:
+            return txt
+        while num_tokens_from_string(txt) < table_context_size:
+            if i >= len(page_bucket[page]):
+                page += 1
+                if page not in page_bucket:
+                    break
+                i = 0
+            blks = page_bucket[page]
+            (_, _, _, _), cnt = blks[i]
+            txts = re.split(r"([。!?？；！\n]|\. )", cnt, flags=re.DOTALL)
+            for j in range(0, len(txts), 2):
+                txt += txts[j] + (txts[j+1] if j+1<len(txts) else "")
+                if num_tokens_from_string(txt) > table_context_size:
+                    break
+            i += 1
+        return txt
+
+    res = []
+    for (img, tb), poss in tabls:
+        page, left, top, right, bott = poss[0]
+        _page, _left, _top, _right, _bott = poss[-1]
+        if isinstance(tb, list):
+            tb = "\n".join(tb)
+
+        i = 0
+        blks = page_bucket.get(page, [])
+        _tb = tb
+        while i < len(blks):
+            if i + 1 >= len(blks):
+                if _page > page:
+                    page += 1
+                    i = 0
+                    blks = page_bucket.get(page, [])
+                    continue
+                tb = upper_context(page, i) + tb + lower_context(page+1, 0)
+                break
+            (_, t, r, b), txt = blks[i]
+            if b > top:
+                break
+            (_, _t, _r, _b), _txt = blks[i+1]
+            if _t < _bott:
+                i += 1
+                continue
+
+            tb = upper_context(page, i) + tb + lower_context(page, i)
+            break
+
+        if _tb == tb:
+            tb = upper_context(page, -1) + tb + lower_context(page+1, 0)
+        res.append(((img, tb), poss))
+    return res
+
+
 def add_positions(d, poss):
    if not poss:
        return