Refactor: rename rmSpace to remove_redundant_spaces (#10796)

### What problem does this PR solve?

- rename rmSpace to remove_redundant_spaces
- move clean_markdown_block to common module
- add unit tests for remove_redundant_spaces and clean_markdown_block

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2025-10-28 09:46:32 +08:00
committed by GitHub
parent e59458c36b
commit 766d900a41
17 changed files with 604 additions and 75 deletions

View File

@ -15,7 +15,6 @@
#
import os
import re
import tiktoken
@ -33,48 +32,6 @@ def singleton(cls, *args, **kw):
return _singleton
def rmSpace(txt):
    """Remove runs of spaces that are not needed to separate ASCII-style words.

    Two case-insensitive passes are applied in order:

    1. A run of spaces is dropped when the character before it is not one of
       ``a-z``, ``0-9``, ``.``, ``,``, ``)`` or ``>`` and a non-space
       character follows the run.
    2. A run of spaces is dropped when a non-space character precedes it and
       the character after it is not one of ``a-z``, ``0-9``, ``.``, ``,``,
       ``(`` or ``<``.

    Returns the resulting string.
    """
    after_non_word = re.compile(r"([^a-z0-9.,\)>]) +([^ ])", re.IGNORECASE)
    before_non_word = re.compile(r"([^ ]) +([^a-z0-9.,\(<])", re.IGNORECASE)
    # The second pass works on the output of the first one.
    first_pass = after_non_word.sub(r"\1\2", txt)
    return before_non_word.sub(r"\1\2", first_pass)
def findMaxDt(fnm):
    """Return the greatest line of file ``fnm`` under plain string comparison.

    Each line has its newline characters stripped; lines equal to ``'nan'``
    are ignored. The result starts at ``"1970-01-01 00:00:00"``, so that value
    is returned when no line compares greater than it.

    The scan is best-effort: any exception raised while opening or reading
    the file is swallowed, and whatever maximum was found so far is returned.
    """
    latest = "1970-01-01 00:00:00"
    try:
        with open(fnm, "r") as handle:
            for raw in handle:
                entry = raw.strip("\n")
                # 'nan' would otherwise win every string comparison against digits.
                if entry != 'nan' and entry > latest:
                    latest = entry
    except Exception:
        pass
    return latest
def findMaxTm(fnm):
    """Return the largest integer found on any line of file ``fnm``.

    Each line has its newline characters stripped and is parsed with
    ``int()``. Lines equal to ``'nan'`` and lines that are not valid integers
    (including blank lines) are skipped individually, so one malformed line
    no longer hides larger values that appear after it.

    The result starts at ``0``, so ``0`` is returned when the file has no
    positive integer. Opening and reading are best-effort: any exception
    raised there is swallowed and the maximum found so far is returned.
    """
    m = 0
    try:
        with open(fnm, "r") as f:
            for line in f:
                line = line.strip("\n")
                if line == 'nan':
                    continue
                try:
                    # Parse once; the value is reused for compare and assign.
                    value = int(line)
                except ValueError:
                    # Skip only this line instead of aborting the whole scan.
                    continue
                if value > m:
                    m = value
    except Exception:
        # Deliberate best-effort: a missing or unreadable file yields the
        # maximum seen so far (0 if nothing was read).
        pass
    return m
tiktoken_cache_dir = get_project_base_directory()
os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
# encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
@ -113,18 +70,4 @@ def truncate(string: str, max_len: int) -> str:
"""Returns truncated text if the length of text exceed max_len."""
return encoder.decode(encoder.encode(string)[:max_len])
def clean_markdown_block(text):
    """Strip a surrounding Markdown code fence from ``text``.

    An opening fence tagged ``markdown`` at the very start of the text
    (optionally preceded by whitespace) and a closing fence at the very end
    (optionally followed by whitespace) are removed, then the remaining text
    is stripped of leading and trailing whitespace.
    """
    opening_fence = re.compile(r'^\s*```markdown\s*\n?')
    closing_fence = re.compile(r'\n?\s*```\s*$')
    without_opening = opening_fence.sub('', text)
    return closing_fence.sub('', without_opening).strip()
def get_float(v):
    """Convert ``v`` to ``float``, falling back to negative infinity.

    Returns ``float('-inf')`` when ``v`` is ``None`` or when ``float(v)``
    raises any exception; otherwise returns ``float(v)``.
    """
    fallback = float('-inf')
    if v is not None:
        try:
            return float(v)
        except Exception:
            # Unconvertible values are treated the same as a missing value.
            return fallback
    return fallback

View File

@ -26,12 +26,13 @@ from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
from elastic_transport import ConnectionTimeout
from rag import settings
from rag.settings import TAG_FLD, PAGERANK_FLD
from rag.utils import singleton, get_float
from rag.utils import singleton
from api.utils.file_utils import get_project_base_directory
from api.utils.common import convert_bytes
from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
FusionExpr
from rag.nlp import is_english, rag_tokenizer
from common.float_utils import get_float
ATTEMPT_TIME = 2
@ -503,7 +504,7 @@ class ESConnection(DocStoreConnection):
if not isinstance(v, str):
m[n] = str(m[n])
# if n.find("tks") > 0:
# m[n] = rmSpace(m[n])
# m[n] = remove_redundant_spaces(m[n])
if m:
res_fields[d["id"]] = m

View File

@ -484,7 +484,7 @@ class OSConnection(DocStoreConnection):
if not isinstance(v, str):
m[n] = str(m[n])
# if n.find("tks") > 0:
# m[n] = rmSpace(m[n])
# m[n] = remove_redundant_spaces(m[n])
if m:
res_fields[d["id"]] = m