Refactor: rename rmSpace to remove_redundant_spaces (#10796)

### What problem does this PR solve?

- Rename `rmSpace` to `remove_redundant_spaces`.
- Move `clean_markdown_block` to the common module.
- Add unit tests for `remove_redundant_spaces` and `clean_markdown_block`.

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2025-10-28 09:46:32 +08:00
committed by GitHub
parent e59458c36b
commit 766d900a41
17 changed files with 604 additions and 75 deletions

View File

@@ -26,12 +26,13 @@ from elasticsearch_dsl import UpdateByQuery, Q, Search, Index
from elastic_transport import ConnectionTimeout
from rag import settings
from rag.settings import TAG_FLD, PAGERANK_FLD
from rag.utils import singleton, get_float
from rag.utils import singleton
from api.utils.file_utils import get_project_base_directory
from api.utils.common import convert_bytes
from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
FusionExpr
from rag.nlp import is_english, rag_tokenizer
from common.float_utils import get_float
ATTEMPT_TIME = 2
@@ -503,7 +504,7 @@ class ESConnection(DocStoreConnection):
if not isinstance(v, str):
m[n] = str(m[n])
# if n.find("tks") > 0:
# m[n] = rmSpace(m[n])
# m[n] = remove_redundant_spaces(m[n])
if m:
res_fields[d["id"]] = m