Refactor: rename rmSpace to remove_redundant_spaces (#10796)

### What problem does this PR solve?

- Rename `rmSpace` to `remove_redundant_spaces`; callers now import it from `common.string_utils` instead of `rag.utils`.
- Move `clean_markdown_block` to the common module.
- Add unit tests for `remove_redundant_spaces` and `clean_markdown_block`.

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2025-10-28 09:46:32 +08:00
committed by GitHub
parent e59458c36b
commit 766d900a41
17 changed files with 604 additions and 75 deletions

View File

@@ -25,7 +25,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
from rag.nlp import rag_tokenizer
from deepdoc.parser.resume import refactor
from deepdoc.parser.resume import step_one, step_two
from rag.utils import rmSpace
from common.string_utils import remove_redundant_spaces
forbidden_select_fields4resume = [
"name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
@@ -130,7 +130,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
if isinstance(v, list):
v = v[0]
if n.find("tks") > 0:
v = rmSpace(v)
v = remove_redundant_spaces(v)
titles.append(str(v))
doc = {
"docnm_kwd": filename,
@@ -145,7 +145,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
if isinstance(v, list):
v = " ".join(v)
if n.find("tks") > 0:
v = rmSpace(v)
v = remove_redundant_spaces(v)
pairs.append((m, str(v)))
doc["content_with_weight"] = "\n".join(