Import rag_tokenizer from Infinity (#11647)

### What problem does this PR solve?

- The original rag/nlp/rag_tokenizer.py has been moved into Infinity and infinity-sdk
via https://github.com/infiniflow/infinity/pull/3117.
The new rag/nlp/rag_tokenizer.py now imports rag_tokenizer from infinity and
inherits from rag_tokenizer.RagTokenizer (see the sketch after this list).

- Bump infinity to 0.6.8
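
A minimal sketch of what the new rag/nlp/rag_tokenizer.py could look like under this change. The exact import path inside infinity/infinity-sdk and the base-class constructor are assumptions rather than facts from this diff; only the module-level `tokenize` entry point mirrors how the rest of rag/nlp calls it (`rag_tokenizer.tokenize(...)`).

```python
# rag/nlp/rag_tokenizer.py -- illustrative sketch only, not the actual file.
# Assumption: infinity (or infinity-sdk) exposes the moved module as
# `rag_tokenizer`; the real import path may differ.
from infinity import rag_tokenizer as infinity_rag_tokenizer  # hypothetical path


class RagTokenizer(infinity_rag_tokenizer.RagTokenizer):
    """Local subclass so project-specific behaviour can still be overridden here."""
    pass


# Module-level instance so existing callers can keep writing
# `from . import rag_tokenizer` and `rag_tokenizer.tokenize(text)`.
_tokenizer = RagTokenizer()  # assumes a no-argument constructor


def tokenize(text):
    # Delegate to the implementation inherited from infinity's RagTokenizer.
    return _tokenizer.tokenize(text)
```

Keeping a thin local subclass lets the project override or extend tokenizer behaviour later without patching infinity-sdk itself.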

### Type of change
- [x] Refactoring
Author: qinling0210
Date: 2025-12-02 14:59:37 +08:00
Committed by: GitHub
Parent: e3987e21b9
Commit: 2ffe6f7439
7 changed files with 3712 additions and 4082 deletions


@@ -19,7 +19,6 @@ import random
 from collections import Counter
 from common.token_utils import num_tokens_from_string
-from . import rag_tokenizer
 import re
 import copy
 import roman_numbers as r
@@ -29,6 +28,8 @@ from PIL import Image
 import chardet
+__all__ = ['rag_tokenizer']
 all_codecs = [
     'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
     'cp037', 'cp273', 'cp424', 'cp437',
@@ -265,6 +266,7 @@ def is_chinese(text):
 def tokenize(d, txt, eng):
+    from . import rag_tokenizer
     d["content_with_weight"] = txt
     t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt)
     d["content_ltks"] = rag_tokenizer.tokenize(t)
@@ -362,6 +364,7 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
     Best-effort ordering: if positional info exists on any chunk, use it to
     order chunks before collecting context; otherwise keep original order.
     """
+    from . import rag_tokenizer
     if not chunks or (table_context_size <= 0 and image_context_size <= 0):
         return chunks
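
The hunks above drop the module-level `from . import rag_tokenizer` and import it inside the functions that use it instead. Below is a short illustration of that function-local (deferred) import pattern with a hypothetical caller; the presumed motivation is to postpone loading the now infinity-backed tokenizer, and to sidestep import-order issues, until it is actually needed.

```python
# Hypothetical caller demonstrating the deferred-import pattern used above.
def tokenize_chunk(chunk_text):
    # Resolved on first call and cached in sys.modules, so repeated calls are cheap.
    from . import rag_tokenizer
    return rag_tokenizer.tokenize(chunk_text)
```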