refine code (#595)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
This commit is contained in:
KevinHuSh
2024-04-28 19:13:33 +08:00
committed by GitHub
parent aee8b48d2f
commit 8c07992b6c
24 changed files with 538 additions and 116 deletions

View File

@ -2,7 +2,7 @@ import random
from collections import Counter
from rag.utils import num_tokens_from_string
from . import huqie
from . import rag_tokenizer
import re
import copy
@ -109,8 +109,8 @@ def is_english(texts):
def tokenize(d, t, eng):
d["content_with_weight"] = t
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
d["content_ltks"] = huqie.qie(t)
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
d["content_ltks"] = rag_tokenizer.tokenize(t)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
def tokenize_chunks(chunks, doc, eng, pdf_parser):