Import rag_tokenizer from Infinity (#11647)

### What problem does this PR solve?

- The original rag/nlp/rag_tokenizer.py has been moved into Infinity and infinity-sdk
via https://github.com/infiniflow/infinity/pull/3117.
The new rag/nlp/rag_tokenizer.py now imports rag_tokenizer from infinity and
inherits from rag_tokenizer.RagTokenizer (see the sketch after this list).

- Bump infinity to 0.6.8
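
A minimal sketch of what the new rag/nlp/rag_tokenizer.py could look like under this change. The exact import path inside infinity/infinity-sdk and the base-class constructor are assumptions rather than facts from this diff; only the module-level `tokenize` entry point mirrors how the rest of rag/nlp calls it (`rag_tokenizer.tokenize(...)`).

```python
# rag/nlp/rag_tokenizer.py -- illustrative sketch only, not the actual file.
# Assumption: infinity (or infinity-sdk) exposes the moved module as
# `rag_tokenizer`; the real import path may differ.
from infinity import rag_tokenizer as infinity_rag_tokenizer  # hypothetical path


class RagTokenizer(infinity_rag_tokenizer.RagTokenizer):
    """Local subclass so project-specific behaviour can still be overridden here."""
    pass


# Module-level instance so existing callers can keep writing
# `from . import rag_tokenizer` and `rag_tokenizer.tokenize(text)`.
_tokenizer = RagTokenizer()  # assumes a no-argument constructor


def tokenize(text):
    # Delegate to the implementation inherited from infinity's RagTokenizer.
    return _tokenizer.tokenize(text)
```

Keeping a thin local subclass lets the project override or extend tokenizer behaviour later without patching infinity-sdk itself.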

### Type of change
- [x] Refactoring
Author: qinling0210
Date: 2025-12-02 14:59:37 +08:00
Committed by: GitHub
Parent: e3987e21b9
Commit: 2ffe6f7439
7 changed files with 3712 additions and 4082 deletions


@@ -19,7 +19,6 @@ import random
 from collections import Counter
 from common.token_utils import num_tokens_from_string
-from . import rag_tokenizer
 import re
 import copy
 import roman_numbers as r
@@ -29,6 +28,8 @@ from PIL import Image
 import chardet
+__all__ = ['rag_tokenizer']
 all_codecs = [
     'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
     'cp037', 'cp273', 'cp424', 'cp437',
@@ -265,6 +266,7 @@ def is_chinese(text):
 def tokenize(d, txt, eng):
+    from . import rag_tokenizer
     d["content_with_weight"] = txt
     t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt)
     d["content_ltks"] = rag_tokenizer.tokenize(t)
@@ -362,6 +364,7 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
     Best-effort ordering: if positional info exists on any chunk, use it to
     order chunks before collecting context; otherwise keep original order.
     """
+    from . import rag_tokenizer
     if not chunks or (table_context_size <= 0 and image_context_size <= 0):
         return chunks
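
The hunks above drop the module-level `from . import rag_tokenizer` and import it inside the functions that use it instead. Below is a short illustration of that function-local (deferred) import pattern with a hypothetical caller; the presumed motivation is to postpone loading the now infinity-backed tokenizer, and to sidestep import-order issues, until it is actually needed.

```python
# Hypothetical caller demonstrating the deferred-import pattern used above.
def tokenize_chunk(chunk_text):
    # Resolved on first call and cached in sys.modules, so repeated calls are cheap.
    from . import rag_tokenizer
    return rag_tokenizer.tokenize(chunk_text)
```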